diff --git a/.github/workflows/array_api.yml b/.github/workflows/array_api.yml index 4a562b2bd876..5c517a336db6 100644 --- a/.github/workflows/array_api.yml +++ b/.github/workflows/array_api.yml @@ -15,24 +15,8 @@ permissions: env: CCACHE_DIR: "${{ github.workspace }}/.ccache" INSTALLDIR: "build-install" - XP_TESTS: >- - -t scipy.cluster - -t scipy.constants - -t scipy.fft - -t scipy.special.tests.test_logsumexp - -t scipy.special.tests.test_support_alternative_backends - -t scipy._lib.tests.test_array_api - -t scipy._lib.tests.test__util - -t scipy.differentiate.tests.test_differentiate - -t scipy.integrate.tests.test_tanhsinh - -t scipy.integrate.tests.test_cubature - -t scipy.optimize.tests.test_bracket - -t scipy.optimize.tests.test_chandrupatla - -t scipy.optimize.tests.test_optimize - -t scipy.stats - -t scipy.ndimage - -t scipy.integrate.tests.test_quadrature - -t scipy.signal.tests.test_signaltools + # Only run tests that use the `xp` fixture + XP_TESTS: -m array_api_backends concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -43,8 +27,8 @@ jobs: name: Get commit message uses: ./.github/workflows/commit_message.yml - pytorch_cpu: - name: Linux PyTorch/JAX/xp-strict CPU + xp_cpu: + name: Linux PyTorch/JAX/Dask/xp-strict CPU needs: get_commit_message if: > needs.get_commit_message.outputs.message == 1 @@ -86,6 +70,10 @@ jobs: run: | python -m pip install "jax[cpu]" + - name: Install Dask + run: | + python -m pip install git+https://github.com/dask/dask.git + - name: Prepare compiler cache id: prep-ccache shell: bash @@ -111,6 +99,4 @@ jobs: - name: Test SciPy run: | export OMP_NUM_THREADS=2 - # expand as more modules are supported by adding to `XP_TESTS` above python dev.py --no-build test -b all $XP_TESTS -- --durations 3 --timeout=60 - diff --git a/.github/workflows/commit_message.yml b/.github/workflows/commit_message.yml index 5a6666b84b2b..05a3ad9dc02f 100644 --- a/.github/workflows/commit_message.yml +++ b/.github/workflows/commit_message.yml @@ -18,6 +18,7 @@ jobs: message: ${{ steps.skip_check.outputs.message }} steps: - name: Checkout scipy + if: ${{ !contains(github.actor, 'nektos/act') }} uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # Gets the correct commit message for pull request with: @@ -29,10 +30,14 @@ jobs: # and changing the output here based on the combination of tags (if desired). 
run: | set -xe - COMMIT_MSG=$(git log --no-merges -1) RUN="1" - if [[ "$COMMIT_MSG" == *"[lint only]"* || "$COMMIT_MSG" == *"[docs only]"* ]]; then - RUN="0" + # For running a job locally with Act (https://github.com/nektos/act), + # always run the job rather than skip based on commit message + if [[ $ACT != true ]]; then + COMMIT_MSG=$(git log --no-merges -1) + if [[ "$COMMIT_MSG" == *"[lint only]"* || "$COMMIT_MSG" == *"[docs only]"* ]]; then + RUN="0" + fi fi echo "message=$RUN" >> $GITHUB_OUTPUT echo github.ref ${{ github.ref }} diff --git a/.gitignore b/.gitignore index 4db8aada330d..0baf18f88c2f 100644 --- a/.gitignore +++ b/.gitignore @@ -247,7 +247,7 @@ scipy/signal/_bspline_util.c scipy/sparse/_csparsetools.c scipy/sparse/_csparsetools.pyx scipy/sparse/csgraph/_min_spanning_tree.c -scipy/sparse/csgraph/_shortest_path.c +scipy/sparse/csgraph/_shortest_path.cxx scipy/sparse/csgraph/_tools.c scipy/sparse/csgraph/_traversal.c scipy/sparse/csgraph/_flow.c diff --git a/LICENSE.txt b/LICENSE.txt index 117117616e80..4ccc94ba7860 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2001-2002 Enthought, Inc. 2003-2024, SciPy Developers. +Copyright (c) 2001-2002 Enthought, Inc. 2003-2025, SciPy Developers. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/benchmarks/benchmarks/sparse_csgraph_dijkstra.py b/benchmarks/benchmarks/sparse_csgraph_dijkstra.py index 02a04852c75d..150d30e89640 100755 --- a/benchmarks/benchmarks/sparse_csgraph_dijkstra.py +++ b/benchmarks/benchmarks/sparse_csgraph_dijkstra.py @@ -20,7 +20,7 @@ def setup(self, n, min_only, format): rng = np.random.default_rng(1234) if format == 'random': # make a random connectivity matrix - data = scipy.sparse.rand(n, n, density=0.2, format='csc', + data = scipy.sparse.rand(n, n, density=0.2, format='lil', random_state=42, dtype=np.bool_) data.setdiag(np.zeros(n, dtype=np.bool_)) self.data = data diff --git a/dev.py b/dev.py index ef42bdbb2ca6..c4bc096e2137 100644 --- a/dev.py +++ b/dev.py @@ -710,7 +710,8 @@ class Test(Task): multiple=True, help=( "Array API backend " - "('all', 'numpy', 'torch', 'cupy', 'array_api_strict', 'jax.numpy')." + "('all', 'numpy', 'torch', 'cupy', 'array_api_strict'," + " 'jax.numpy', 'dask.array')." 
) ) # Argument can't have `help=`; used to consume all of `-- arg1 arg2 arg3` diff --git a/doc/Makefile b/doc/Makefile index ff7950101df6..64a55bc028fa 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -109,14 +109,11 @@ upload: # Basic Sphinx generation rules for different formats #------------------------------------------------------------------------------ -html: version-check convert-notebooks html-build +html: version-check html-build html-build: mkdir -p build/html build/doctrees $(SPHINXBUILD) -WT --keep-going $(VERSIONWARNING) -b html $(ALLSPHINXOPTS) build/html $(FILES) -convert-notebooks: - $(PYTHON) convert_notebooks.py - coverage: build version-check mkdir -p build/coverage build/doctrees $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) build/coverage $(FILES) diff --git a/doc/convert_notebooks.py b/doc/convert_notebooks.py deleted file mode 100644 index 09532adab972..000000000000 --- a/doc/convert_notebooks.py +++ /dev/null @@ -1,52 +0,0 @@ -import glob -import os -import shutil -import jupytext - -def call_jupytext(md_files, _contents_path, _contents_cache_path): - is_cached = os.path.exists(_contents_cache_path) - if not is_cached: - for md_file in md_files: - basename = os.path.splitext(os.path.basename(md_file))[0] - output_name = os.path.join(_contents_path, f"{basename}.ipynb") - nb = jupytext.read(md_file) - jupytext.write(jupytext.read(md_file), output_name, fmt="ipynb", version=4) - return True - - is_dirty = False - - for md_file in md_files: - basename = os.path.splitext(os.path.basename(md_file))[0] - output_path = os.path.join(_contents_path, f"{basename}.ipynb") - cached_output_path = os.path.join(_contents_cache_path, f"{basename}.ipynb") - cmd_execution_time = os.stat(cached_output_path).st_mtime - md_file_modification_time = os.stat(md_file).st_mtime - if cmd_execution_time < md_file_modification_time: - nb = jupytext.read(md_file) - jupytext.write(nb, output_path, fmt="ipynb", version=4) - is_dirty = True - else: - shutil.copyfile( - cached_output_path, - os.path.join(_contents_path, f"{basename}.ipynb") - ) - - return is_dirty - -if __name__ == '__main__': - _contents_cache_path = os.path.join("build", "_contents") - _contents_path = os.path.join("source", "_contents") - - os.makedirs(os.path.expanduser(_contents_path), exist_ok=True) - - md_files = glob.glob("source/tutorial/stats/*.md") - md_files += glob.glob("source/tutorial/*.md") - is_dirty = call_jupytext( - md_files, - _contents_path, - _contents_cache_path - ) - - if is_dirty: - os.makedirs(os.path.expanduser(_contents_cache_path), exist_ok=True) - shutil.copytree(_contents_path, _contents_cache_path, dirs_exist_ok=True) diff --git a/doc/source/_static/version_switcher.json b/doc/source/_static/version_switcher.json index 9dbaf31b4dff..eefedc9d0111 100644 --- a/doc/source/_static/version_switcher.json +++ b/doc/source/_static/version_switcher.json @@ -5,9 +5,14 @@ "url": "https://scipy.github.io/devdocs/" }, { - "name": "1.14.1 (stable)", - "version":"1.14.1", + "name": "1.15.0 (stable)", + "version":"1.15.0", "preferred": true, + "url": "https://docs.scipy.org/doc/scipy-1.15.0/" + }, + { + "name": "1.14.1", + "version":"1.14.1", "url": "https://docs.scipy.org/doc/scipy-1.14.1/" }, { diff --git a/doc/source/building/index.rst b/doc/source/building/index.rst index 1814dd726c65..5453efe2c9c8 100644 --- a/doc/source/building/index.rst +++ b/doc/source/building/index.rst @@ -419,6 +419,16 @@ interface is self-documenting, so please see ``python dev.py --help`` and on how things work under the hood. 
+Installing static type stubs +---------------------------- + +If you would like to install static type stubs to aid your development of SciPy, +you can include the ``scipy-stubs`` package in your development environment. +It is available on PyPI and conda-forge - see the scipy-stubs_ installation guide. + +.. _scipy-stubs: https://github.com/jorenham/scipy-stubs?tab=readme-ov-file#installation + + Customizing builds ------------------ diff --git a/doc/source/conf.py b/doc/source/conf.py index b0727107bd13..f20d1bc71894 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -411,6 +411,9 @@ # interactive renditions of the notebooks strip_tagged_cells = True +# Enable overrides for JupyterLite settings at runtime +jupyterlite_overrides = "overrides.json" + #------------------------------------------------------------------------------ # Interactive examples with jupyterlite-sphinx #------------------------------------------------------------------------------ diff --git a/doc/source/dev/api-dev/array_api.rst b/doc/source/dev/api-dev/array_api.rst index 9095e86df976..f5a41586cd4f 100644 --- a/doc/source/dev/api-dev/array_api.rst +++ b/doc/source/dev/api-dev/array_api.rst @@ -236,10 +236,11 @@ large and hard-to-detect performance bottleneck. Adding tests ------------ +To run a test on multiple array backends, you should add the ``xp`` fixture to it, +whose value is the array namespace currently being tested. + The following pytest markers are available: -* ``array_api_compatible -> xp``: use a parametrisation to run a test on - multiple array backends. * ``skip_xp_backends(backend=None, reason=None, np_only=False, cpu_only=False, exceptions=None)``: skip certain backends or categories of backends. ``@pytest.mark.usefixtures("skip_xp_backends")`` must be used alongside this @@ -262,18 +263,33 @@ The following pytest markers are available: causing the test to fail. When ``SCIPY_ARRAY_API=1`` behavior becomes the default and only behavior, these tests (and the decorator itself) will be removed. +* ``array_api_backends``: this marker is automatically added by the ``xp`` fixture to + all tests that use it. This is useful, e.g., to select all and only such tests:: + + python dev.py test -b all -- -m array_api_backends + + Note that this includes tests that use the ``xp`` fixture indirectly through another + array API fixture, such as ``@pytest.mark.usefixtures("skip_xp_backends")``, even if + they don't explicitly consume ``xp`` themselves. + +* OBSOLETE: ``array_api_compatible`` (does nothing; pending removal) ``scipy._lib._array_api`` contains array-agnostic assertions such as ``xp_assert_close`` which can be used to replace assertions from `numpy.testing`. +When these assertions are executed within a test that uses the ``xp`` fixture, they +enforce that the namespaces of both the actual and desired arrays match the namespace +which was set by the fixture. Tests without the ``xp`` fixture infer the namespace from +the desired array. This machinery can be overridden by explicitly passing the ``xp=`` +parameter to the assertion functions. + The following examples demonstrate how to use the markers:: - from scipy.conftest import array_api_compatible, skip_xp_invalid_arg + from scipy.conftest import skip_xp_invalid_arg from scipy._lib._array_api import xp_assert_close ...
@pytest.mark.skip_xp_backends(np_only=True, reason='skip reason') @pytest.mark.usefixtures("skip_xp_backends") - @array_api_compatible def test_toto1(self, xp): a = xp.asarray([1, 2, 3]) b = xp.asarray([0, 2, 5]) @@ -284,7 +300,6 @@ The following examples demonstrate how to use the markers:: @pytest.mark.skip_xp_backends('cupy', reason='skip reason 2') @pytest.mark.usefixtures("skip_xp_backends") - @array_api_compatible def test_toto2(self, xp): ... ... @@ -310,7 +325,6 @@ for compiled code:: @pytest.mark.skip_xp_backends('array_api_strict', reason='skip reason 1') @pytest.mark.skip_xp_backends('cupy', reason='skip reason 2') @pytest.mark.usefixtures("skip_xp_backends") - @array_api_compatible def test_toto(self, xp): ... @@ -318,9 +332,7 @@ When every test function in a file has been updated for array API compatibility, one can reduce verbosity by telling ``pytest`` to apply the markers to every test function using ``pytestmark``:: - from scipy.conftest import array_api_compatible - - pytestmark = [array_api_compatible, pytest.mark.usefixtures("skip_xp_backends")] + pytestmark = [pytest.mark.usefixtures("skip_xp_backends")] skip_xp_backends = pytest.mark.skip_xp_backends ... @skip_xp_backends(np_only=True, reason='skip reason') diff --git a/doc/source/dev/roadmap-detailed.rst b/doc/source/dev/roadmap-detailed.rst index 10ee95f0732a..792db55e33b4 100644 --- a/doc/source/dev/roadmap-detailed.rst +++ b/doc/source/dev/roadmap-detailed.rst @@ -78,13 +78,17 @@ Pythran is still an optional build dependency, and can be disabled with happen it must be clear that the maintenance burden is low enough. -Use of venerable Fortran libraries -`````````````````````````````````` +Use of Fortran libraries +```````````````````````` SciPy owes a lot of its success to relying on wrapping well established -Fortran libraries (QUADPACK, FITPACK, ODRPACK, ODEPACK etc). Some of these -libraries are aging well, others less so. We should audit our use of these -libraries with respect to the maintenance effort, the functionality, and the -existence of (possibly partial) alternatives, *including those inside SciPy*. +Fortran libraries (QUADPACK, FITPACK, ODRPACK, ODEPACK etc). The Fortran 77 +that these libraries are written in is quite hard to maintain, and the use +of Fortran is problematic for many reasons; e.g., it makes our wheel builds +much harder to maintain, it has repeatedly been problematic for supporting +new platforms like macOS arm64 and Windows on Arm, and it is highly problematic +for Pyodide's SciPy support. Our goal is to remove all Fortran code from SciPy +by replacing the functionality with code written in other languages. Progress +towards this goal is tracked in `gh-18566 `__. Continuous integration @@ -108,7 +112,7 @@ checking. Stripping of debug symbols in ``multibuild`` can perhaps be improved (see `this issue `__). An effort should be made to slim down where possible, and not add new large files. In the future, things that are being considered (very tentatively) and -may help are separating out the bundled` ``libopenblas`` and removing support +may help are separating out the bundled ``libopenblas`` and removing support for ``long double``. @@ -123,8 +127,26 @@ feature requests. constants ````````` -This module is basically done, low-maintenance and without open issues. +This module needs only periodic updates to the numerical values. +differentiate +````````````` +This module was added with the understanding that its scope would be limited. 
+The goal is to provide basic differentiation of black-box functions, not +replicate all features of existing differentiation tools. With that in mind, +items for future work include: + +- Improve support for callables that accept additional arguments (e.g. add + ``*args`` to ``jacobian`` and ``hessian``). Note that this is not trivial + due to the way elements of the arrays are eliminated when their corresponding + calculations have converged. +- Improve implementation of `scipy.differentiate.hessian`: rather than chaining + first-order differentiation, use a second-order differentiation stencil. +- Consider the addition of an option to use relative step sizes. +- Consider generalizing the approach to use "polynomial extrapolation"; i.e., + rather than estimating derivatives of a given order from the minimal number + of function evaluations, use a least-squares approach to improve robustness + to numerical noise. fft ```` @@ -133,15 +155,13 @@ This module is in good shape. integrate ````````` -Needed for ODE solvers: - -- Documentation is pretty bad, needs fixing -- A new ODE solver interface (``solve_ivp``) was added in SciPy 1.0.0. - In the future we can consider (soft-)deprecating the older API. - -The numerical integration functions are in good shape. Support for integrating -complex-valued functions and integrating multiple intervals (see `gh-3325 -`__) could be added. +- Complete the feature set of the new ``cubature`` function, and add an interface + tailored to one-dimensional integrals. +- Migrate functions for generating quadrature rule points and weights from `special`, + improve their reliability, and add support for other important rules. +- Complete the feature set of ``solve_ivp``, including all functionality of the + ``odeint`` interface. +- Gracefully sunset superseded functions and classes. interpolate @@ -239,13 +259,8 @@ is in good shape, however we can make a number of improvements: misc ```` -``scipy.misc`` will be removed as a public module. Most functions in it have -been moved to another submodule or deprecated. The few that are left: - -- ``derivative``, ``central_diff_weight`` : remove, possibly replacing them - with more extensive functionality for numerical differentiation. -- ``ascent``, ``face``, ``electrocardiogram`` : remove or move to the - appropriate subpackages (e.g. ``scipy.ndimage``, ``scipy.signal``). +All features have been removed from ``scipy.misc``, and the namespace itself +will eventually be removed. ndimage @@ -290,23 +305,14 @@ maintenance. No major plans or wishes here. optimize ```````` -Overall this module is in good shape. Two good global optimizers were added in -1.2.0; large-scale optimizers is still a gap that could be filled. Other -things that are needed: - -- Many ideas for additional functionality (e.g. integer constraints) in - ``linprog``, see `gh-9269 `__. -- Add functionality to the benchmark suite to compare results more easily - (e.g. with summary plots). -- deprecate the ``fmin_*`` functions in the documentation, ``minimize`` is - preferred. -- ``scipy.optimize`` has an extensive set of benchmarks for accuracy and speed of - the global optimizers. That has allowed adding new optimizers (``shgo`` and - ``dual_annealing``) with significantly better performance than the existing - ones. The ``optimize`` benchmark system itself is slow and hard to use - however; we need to make it faster and make it easier to compare performance of - optimizers via plotting performance profiles. 
+We aim to continuously improve the set of optimizers provided by this module. +For large scale problems, the state of the art continues to advance and we aim +to keep up by leveraging implementations from domain-specific libraries like +HiGHS and PRIMA. Other areas for future work include the following. +- Improve the interfaces of existing optimizers (e.g. ``shgo``). +- Improve usability of the benchmark system, and add features for comparing + results more easily (e.g. summary plots). signal `````` @@ -390,7 +396,6 @@ This module is in good shape. sparse.linalg ````````````` There are a significant number of open issues for ``_arpack`` and ``lobpcg``. -``_propack`` is new in 1.8.0, TBD how robust it will turn out to be. ``_isolve``: @@ -455,50 +460,18 @@ may also be included in SciPy, especially if no other widely used and well-supported package covers the topic. Also note that *some* duplication with downstream projects is inevitable and not necessarily a bad thing.) -In addition to the items described in the :ref:`scipy-roadmap`, the following -improvements will help SciPy better serve this role. - -- Add fundamental and widely used hypothesis tests, such as: - - - post hoc tests (e.g. Dunnett's test) - - the various types of analysis of variance (ANOVA): - - - two-way ANOVA (single replicate, uniform number of replicates, variable - number of replicates) - - multiway ANOVA (i.e. generalize two-way ANOVA) - - nested ANOVA - - analysis of covariance (ANCOVA) - - Also, provide an infrastructure for implementing hypothesis tests. -- Add additional tools for meta-analysis -- Add tools for survival analysis -- Speed up random variate sampling (method ``rvs``) of distributions, - leveraging ``scipy.stats.sampling`` where appropriate -- Expand QMC capabilities and performance -- Enhance the `fit` method of the continuous probability distributions: - - - Expand the options for fitting to include: - - - maximal product spacings - - method of L-moments / probability weighted moments - - - Include measures of goodness-of-fit in the results - - Handle censored data (e.g. merge `gh-13699 `__) - -- Implement additional widely used continuous and discrete probability - distributions, e.g. mixture distributions. - -- Improve the core calculations provided by SciPy's probability distributions - so they can robustly handle wide ranges of parameter values. Specifically, - replace many of the PDF and CDF methods from the Fortran library CDFLIB - used in ``scipy.special`` with Boost implementations as in - `gh-13328 `__. - -In addition, we should: - -- Continue work on making the function signatures of ``stats`` and - ``stats.mstats`` more consistent, and add tests to ensure that that - remains the case. -- Improve statistical tests: return confidence intervals for the test - statistic, and implement exact p-value calculations - considering the - possibility of ties - where computationally feasible. +The following improvements will help SciPy better serve this role. + +- Improve statistical tests: include methods for generating confidence + intervals, and implement exact and randomized p-value calculations - + considering the possibility of ties - where computationally feasible. +- Extend the new univariate distribution infrastructure, adding support + for discrete distributions and circular continuous distributions. + Add a selection of the most widely used statistical distributions + under the new infrastructure, performing rigorous accuracy testing + and documenting the results. 
Enable users to create custom distributions + that leverage the new infrastructure. +- Improve the multivariate distribution infrastructure to ensure a + consistent API, thorough testing, and complete documentation. +- Continue to make the APIs of functions more consistent, with standard + support for batched calculations, NaN policies, and dtype preservation. diff --git a/doc/source/dev/roadmap.rst b/doc/source/dev/roadmap.rst index 054df79fb8bb..abf18fce3b5f 100644 --- a/doc/source/dev/roadmap.rst +++ b/doc/source/dev/roadmap.rst @@ -36,8 +36,7 @@ algorithms are beneficial to most science domains and use cases. We have established an API design pattern for multiprocessing - using the ``workers`` keyword - that can be adopted in many more functions. -Enabling the use of an accelerator like Pythran, possibly via Transonic, and -making it easier for users to use Numba's ``@njit`` in their code that relies +Making it easier for users to use Numba's ``@njit`` in their code that relies on SciPy functionality would unlock a lot of performance gain. That needs a strategy though, all solutions are still maturing (see for example `this overview `__). @@ -47,19 +46,6 @@ Finally, many individual functions can be optimized for performance. requested in this respect. -Statistics enhancements ------------------------ - -The following `scipy.stats` enhancements and those listed in the -:ref:`scipy-roadmap-detailed` are of particularly high importance to the -project. - -- Overhaul the univariate distribution infrastructure to address longstanding - issues (e.g. see `gh-15928 `_.) -- Consistently handle ``nan_policy``, ``axis`` arguments, and masked - arrays in ``stats`` functions (where appropriate). - - Support for more hardware platforms ----------------------------------- diff --git a/doc/source/dev/toolchain.rst b/doc/source/dev/toolchain.rst index 436f7f7b153a..7d4e7ce05cf9 100644 --- a/doc/source/dev/toolchain.rst +++ b/doc/source/dev/toolchain.rst @@ -476,7 +476,7 @@ Sphinx-Design Whatever recent versions work. >= 0.4.0. numpydoc Whatever recent versions work. >= 1.5.0. matplotlib Generally suggest >= 3.5. MyST-NB Whatever recent versions work. >= 0.17.1 -jupyterlite-sphinx Whatever recent versions work. >= 0.13.1 +jupyterlite-sphinx Whatever recent versions work. >= 0.17.1 jupyterlite-pyodide-kernel Whatever recent versions work. >= 0.1.0 ============================ ================================================= diff --git a/doc/source/overrides.json b/doc/source/overrides.json new file mode 100644 index 000000000000..6c67db51743c --- /dev/null +++ b/doc/source/overrides.json @@ -0,0 +1,14 @@ +{ + "@jupyterlab/notebook-extension:panel": { + "toolbar": [ + { + "name": "download", + "label": "Download", + "args": {}, + "command": "docmanager:download", + "icon": "ui-components:download", + "rank": 50 + } + ] + } +} diff --git a/doc/source/release/1.15.0-notes.rst b/doc/source/release/1.15.0-notes.rst index f77053da6f72..f9e74fcc3df5 100644 --- a/doc/source/release/1.15.0-notes.rst +++ b/doc/source/release/1.15.0-notes.rst @@ -2,8 +2,6 @@ SciPy 1.15.0 Release Notes ========================== -.. note:: SciPy 1.15.0 is not released yet! - .. contents:: SciPy 1.15.0 is the culmination of 6 months of hard work. It contains @@ -52,6 +50,8 @@ Highlights of this release - `scipy.interpolate.AAA` adds the AAA algorithm for barycentric rational approximation of real or complex functions. 
+- `scipy.special` adds new functions offering improved Legendre function + implementations with a more consistent interface. ************ @@ -211,6 +211,14 @@ and support several Array API compatible array libraries in addition to NumPy ``scipy.special`` improvements ============================== +- New functions offering improved Legendre function implementations with a + more consistent interface. See respective docstrings for more information. + + - `scipy.special.legendre_p`, `scipy.special.legendre_p_all` + - `scipy.special.assoc_legendre_p`, `scipy.special.assoc_legendre_p_all` + - `scipy.special.sph_harm_y`, `scipy.special.sph_harm_y_all` + - `scipy.special.sph_legendre_p`, `scipy.special.sph_legendre_p_all`, + - The factorial functions ``special.{factorial,factorial2,factorialk}`` now offer an extension to the complex domain by passing the kwarg ``extend='complex'``. This is opt-in because it changes the values for @@ -227,17 +235,15 @@ and support several Array API compatible array libraries in addition to NumPy sum has magnitude much bigger than the rest. - The accuracy of several functions has been improved: - - `scipy.special.ncfdtr` and `scipy.special.nctdtr` have been improved - throughout the domain. + - `scipy.special.ncfdtr`, `scipy.special.nctdtr`, and + `scipy.special.gdtrib` have been improved throughout the domain. - `scipy.special.hyperu` is improved for the case of ``b=1``, small ``x``, and small ``a``. - `scipy.special.logit` is improved near the argument ``p=0.5``. - `scipy.special.rel_entr` is improved when ``x/y`` overflows, underflows, or is close to ``1``. -- `scipy.special.gdtrib` may now be used in a CuPy ``ElementwiseKernel`` on - GPUs. -- `scipy.special.ndtr` is now more efficient. +- `scipy.special.ndtr` is now more efficient for ``sqrt(2)/2 < |x| < 1``. ``scipy.stats`` improvements ============================ @@ -368,9 +374,9 @@ with support added for SciPy ``1.15.0`` include: and for other backends will transit via NumPy arrays on the host. -******************* -Deprecated features -******************* +************************************** +Deprecated features and future changes +************************************** - Functions `scipy.linalg.interpolative.rand` and `scipy.linalg.interpolative.seed` have been deprecated and will be removed in SciPy ``1.17.0``. @@ -380,7 +386,7 @@ Deprecated features - `scipy.spatial.distance.kulczynski1` and `scipy.spatial.distance.sokalmichener` were deprecated and will be removed in SciPy ``1.17.0``. -- `scipy.stats.find_repeats` is deprecated as of SciPy ``1.15.0`` and will be +- `scipy.stats.find_repeats` is deprecated and will be removed in SciPy ``1.17.0``. Please use ``numpy.unique``/``numpy.unique_counts`` instead. - `scipy.linalg.kron` is deprecated in favour of ``numpy.kron``. @@ -388,16 +394,18 @@ Deprecated features convolution/correlation functions (`scipy.signal.correlate`, `scipy.signal.convolve` and `scipy.signal.choose_conv_method`) and filtering functions (`scipy.signal.lfilter`, `scipy.signal.sosfilt`) has - been deprecated as of SciPy ``1.15.0`` and will be removed in SciPy - ``1.17.0``. + been deprecated and will be removed in SciPy ``1.17.0``. - `scipy.stats.linregress` has deprecated one-argument use; the two variables must be specified as separate arguments. - ``scipy.stats.trapz`` is deprecated in favor of `scipy.stats.trapezoid`. - `scipy.special.lpn` is deprecated in favor of `scipy.special.legendre_p_all`. 
- `scipy.special.lpmn` and `scipy.special.clpmn` are deprecated in favor of `scipy.special.assoc_legendre_p_all`. -- The raveling of multi-dimensional input by `scipy.linalg.toeplitz` has - been deprecated. It will support batching in SciPy ``1.17.0``. +- `scipy.special.sph_harm` has been deprecated in favor of + `scipy.special.sph_harm_y`. +- Multi-dimensional ``r`` and ``c`` arrays passed to `scipy.linalg.toeplitz`, + `scipy.linalg.matmul_toeplitz`, or `scipy.linalg.solve_toeplitz` will be + treated as batches of 1-D coefficients beginning in SciPy ``1.17.0``. - The ``random_state`` and ``permutations`` arguments of `scipy.stats.ttest_ind` are deprecated. Use ``method`` to perform a permutation test, instead. @@ -500,12 +508,12 @@ Other changes `scipy.interpolate`. -******* -Authors -******* +***************** +Authors (commits) +***************** * endolith (4) -* h-vetinari (61) +* h-vetinari (62) * a-drenaline (1) + * Afleloup (1) + * Ahmad Alkadri (1) + @@ -520,14 +528,14 @@ Authors * Christoph Baumgarten (1) * Nickolai Belakovski (3) * Krishan Bhasin (1) + -* Jake Bowhay (85) +* Jake Bowhay (89) * Michael Bratsch (2) + * Matthew Brett (1) * Keith Briggs (1) + * Olly Britton (145) + -* Dietrich Brunn (10) +* Dietrich Brunn (11) * Clemens Brunner (1) -* Evgeni Burovski (181) +* Evgeni Burovski (185) * Matthias Bussonnier (7) * CJ Carey (32) * Cesar Carrasco (4) + @@ -547,13 +555,13 @@ Authors * Sahil Garje (1) + * Gabriel Gerlero (2) * Yotam Gingold (1) + -* Ralf Gommers (105) +* Ralf Gommers (111) * Rohit Goswami (62) * Anil Gurses (1) + * Oscar Gustafsson (1) + -* Matt Haberland (362) +* Matt Haberland (392) * Matt Hall (1) + -* Joren Hammudoglu (2) + +* Joren Hammudoglu (6) + * CY Han (1) + * Daniel Isaac (4) + * Maxim Ivanov (1) @@ -566,7 +574,7 @@ Authors * Guus Kamphuis (1) + * Aditya Karumanchi (2) + * Robert Kern (5) -* Agriya Khetarpal (10) +* Agriya Khetarpal (11) * Andrew Knyazev (7) * Gideon Genadi Kogan (1) + * Damien LaRocque (1) + @@ -576,6 +584,7 @@ Authors * Boyu Liu (1) + * Drew Allan Loney (1) + * Christian Lorentzen (1) +* Loïc Estève (2) * Smit Lunagariya (1) * Henry Lunn (1) + * Marco Maggi (4) @@ -605,18 +614,19 @@ Authors * Tom M. Ragonneau (2) * Peter Ralph (1) + * Stephan Rave (1) + -* Tyler Reddy (126) +* Tyler Reddy (192) * redha2404 (2) + * Ritvik1sharma (1) + +* Érico Nogueira Rolim (1) + * Heshy Roskes (1) * Pamphile Roy (34) * Mikhail Ryazanov (1) + * Sina Saber (1) + * Atsushi Sakai (1) * Clemens Schmid (1) + -* Daniel Schmitz (15) +* Daniel Schmitz (17) * Moritz Schreiber (1) + -* Dan Schult (87) +* Dan Schult (91) * Searchingdays (1) + * Matias Senger (1) + * Scott Shambaugh (1) @@ -624,7 +634,7 @@ Authors * Sheila-nk (4) * Romain Simon (2) + * Gagandeep Singh (31) -* Albert Steppi (35) +* Albert Steppi (40) * Kai Striega (1) * Anushka Suyal (143) + * Alex Szatmary (1) @@ -652,7 +662,7 @@ Authors * Gang Zhao (1) * ਗਗਨਦੀਪ ਸਿੰਘ (Gagandeep Singh) (10) -A total of 147 people contributed to this release. +A total of 149 people contributed to this release. People with a "+" by their names contributed a patch for the first time. This list of names is automatically generated, and may not be fully complete. @@ -875,6 +885,7 @@ Issues closed for 1.15.0 * `#21661 `__: BUG: fft.fht: should set ``u.imag[-1] = 0`` only when ``n`` is... * `#21670 `__: BUG: ndimage: ``_normalize_sequence`` fails on 0d array * `#21671 `__: BUG: signal.ShortTimeFFT: inverse tranform error with multichannel... 
+* `#21675 `__: BUG: Errors at compiling through pip for python 3.13 with option... * `#21677 `__: BLD: build warnings from quadpack * `#21696 `__: MAINT: lombscargle numerical backward-compat * `#21704 `__: DOC: stats.bootstrap: clarify meaning of ``paired`` argument @@ -903,6 +914,7 @@ Issues closed for 1.15.0 * `#21837 `__: BUG: linalg.svd: Segmentation Fault, Integer overflow in LAPACK... * `#21838 `__: ENH: sparse: revisit default index dtype selection in sparray... * `#21855 `__: TST, MAINT: torch + GPU failures for test_create_diagonal +* `#21862 `__: BUG: large number of fails with macOS 15.1 using Accelerate * `#21885 `__: BUG: ``interpolate/tests/test_interpnd.py::TestLinearNDInterpolation::test_threa``... * `#21900 `__: BUG: stats: New XSLOW test failure in test_sampling.py * `#21908 `__: BUG: integrate.trapezoid: broadcasting failure after #21524 @@ -916,6 +928,20 @@ Issues closed for 1.15.0 * `#21963 `__: DOC: Deprecation warning in ``sphinx`` when used with Python... * `#21988 `__: refguide_check currently failing * `#22005 `__: TST: ``TestJacobian::test_attrs`` tol bump? +* `#22022 `__: TST: tolerance violation in ``test_x0_working[tfqmr]`` on windows +* `#22029 `__: ``Test_SVDS_LOBPCG.test_svd_rng_3`` test failure in wheel build... +* `#22031 `__: BUG: mypy failure in main +* `#22077 `__: DOC, REL: a few release notes/process issues +* `#22094 `__: API: unannounced breaking change: ``scipy.integrate.AccuracyWarning``... +* `#22095 `__: DOC: sparse: ``sparse.eye_array`` does not accept ``tuple[int,``... +* `#22097 `__: DEP: ``interpolate.interpnd.GradientEstimationWarning`` still... +* `#22112 `__: BUG/DOC: sparse: ND COO unexpected behaviour 1.15.0rc1 +* `#22123 `__: DOC: stats: random variable transition guide launches wrong notebook +* `#22128 `__: BUG/DOC: it's not clear how to use ``differentiate.jacobian``... +* `#22137 `__: BUG: ``stats._distribution_infrastructure._Domain.symbols`` class... +* `#22143 `__: BUG: Fail to call ``BSpline`` after unpickling with ``mmap_mode="r"`` +* `#22146 `__: BUG:``stats.ContinuousDistribution.llf``\ : should not be public +* `#22204 `__: BUG: signal.ShortTimeFFT: ``istft`` with ``mfft > len(win)``... ************************ Pull requests for 1.15.0 @@ -1018,6 +1044,7 @@ Pull requests for 1.15.0 * `#20900 `__: ENH: stats: add array API support to combine_pvalues * `#20906 `__: DOC: linalg.schur: update doc for the argument ``sort`` * `#20907 `__: CI: Make sure nightly free-threaded wheels are tested with GIL... +* `#20908 `__: DOC: signal.dbode: improve docstring * `#20912 `__: DOC: Add more information about how to use Accelerate * `#20913 `__: BUG: sparse.csgraph.dijkstra: fix dtype and shape bugs * `#20915 `__: DOC: ``integrate.quad_vec``\ : Add example when using ``workers`` @@ -1341,6 +1368,7 @@ Pull requests for 1.15.0 * `#21668 `__: BUG: fft.fht: set ``u.imag[-1] = 0`` only when ``n`` is even * `#21672 `__: BUG: ndimage: fix 0d arrays in ``_normalize_sequence`` * `#21673 `__: BUG: signal.ShortTimeFFT: fix multichannel roundtrip with ``mfft``... 
+* `#21678 `__: BUG: fix ``nan`` output of ``special.betaincinv`` * `#21680 `__: MAINT:integrate: Silence a few QUADPACK compiler warnings * `#21682 `__: DOC: Reduce duplication in user guide * `#21686 `__: BUG: signal: int handling for ``resample_poly`` @@ -1500,6 +1528,7 @@ Pull requests for 1.15.0 * `#21977 `__: ENH: integrate.tanhsinh: make ``_tanhsinh`` public * `#21979 `__: API: integrate.simpson: allow ``x`` to be passed positionally * `#21981 `__: MAINT: purge ``from __future__ import annotations`` +* `#21982 `__: DOC: SciPy 1.15.0 relnotes * `#21983 `__: BUG: linalg: fix cython import order * `#21984 `__: BUG: signal: actually reject objects in correlate/convolve * `#21985 `__: DOC: optimize.root: fix docs for ``inner_\*`` parameters @@ -1513,4 +1542,47 @@ Pull requests for 1.15.0 * `#22002 `__: TST: Run complex zeta avoid underflow tests only on platforms... * `#22003 `__: DEV: unified git submodule exclusion for tools * `#22009 `__: TST: differentiate.jacobian: tolerance bump for float32 +* `#22024 `__: MAINT: version pins/prep for 1.15.0rc1 +* `#22025 `__: DOC: stats: probability tutorial/transition guide +* `#22026 `__: MAINT: stats.Mixture: fix default ``weights`` +* `#22027 `__: MAINT: stats.ContinuousDistribution: improve doc generation;... +* `#22030 `__: MAINT: ``stats.FoldedDistribution``\ : accommodate for private... +* `#22032 `__: MAINT: fix mypy complaints +* `#22033 `__: TST: fix sparse.linalg failures for tfmqr and svds +* `#22036 `__: DOC: adapt to NumPy 2.2 changes for abbreviation of large arrays +* `#22037 `__: MAINT: stats: Add custom reprs for transformed distributions +* `#22040 `__: MAINT: ``stats.make_distribution``\ : support more existing distributions +* `#22043 `__: ENH: sparse: make two sputils public for easier index array casting +* `#22048 `__: TST: integrate.tanhsinh: fix abscissae/weight "compression" bug +* `#22050 `__: MAINT: ``stats.order_statistic``\ : override ``support`` +* `#22058 `__: DOC: ``stats.order_statistic``\ : add 'Returns' section +* `#22059 `__: TST: temp skip of extending tests +* `#22067 `__: MAINT: 1.15.0rc1 backports +* `#22078 `__: REL: set 1.15.0rc2 unreleased +* `#22081 `__: MAINT: Add ``__str__`` overrides for distributions in new infra... +* `#22082 `__: BUG, DOC: fixup md5 hash reporting +* `#22085 `__: DOC: sparse: explicit dtypes for nonzero() +* `#22091 `__: DOC: Update special release notes +* `#22098 `__: DOC: mention the removal of AccuracyWarning +* `#22099 `__: DEP: update version in deprecation warning of interpnd +* `#22104 `__: DOC: 1.15.0 release note updates +* `#22106 `__: DOC: sparse: correct ``eye_array`` docs for first shape input... +* `#22107 `__: MAINT/DOC: Doctests fix scpdt 1.6 +* `#22113 `__: ENH: sparse: enhance dtype checking in constructors +* `#22124 `__: DOC: Fix incorrect reference to "Random Variable Transition Guide"... +* `#22129 `__: ENH: sparse: nD cleanup and docs +* `#22135 `__: MAINT: _lib: add missing f string to _deprecate_positional_args +* `#22139 `__: MAINT: stats._SimpleDomain: ensure that instances do not share... 
+* `#22149 `__: MAINT: stats.ContinuousDistribution.llf: remove method +* `#22150 `__: MAINT: SciPy 1.15.0rc2 backports +* `#22156 `__: DEP: deprecation warnings for ``special.lpn`` and ``[c]lpmn`` +* `#22158 `__: MAINT: accept ndarray subclasses in interpolate._dierckx +* `#22162 `__: TYP: temporarily ignore the ``numpy==2.2.1`` mypy errors +* `#22167 `__: DEP: special: deprecation warning for ``sph_harm`` + comments +* `#22168 `__: BUG: fix incorrect values in factorial for 0 with uint dtypes +* `#22175 `__: MAINT: stats: fix thread-safety issues under free-threaded CPython +* `#22177 `__: MAINT: fix extension module not declaring free-threading support,... +* `#22181 `__: REL: set 1.15.0rc3 unreleased +* `#22193 `__: DEP: linalg.solve_toeplitz/matmul_toeplitz: warn on n-D ``c``\... +* `#22225 `__: DOC: differentiate.jacobian: correct/improve documentation about... diff --git a/doc/source/tutorial/linalg_batch.md b/doc/source/tutorial/linalg_batch.md index 62f42b4de33c..8130166a7643 100644 --- a/doc/source/tutorial/linalg_batch.md +++ b/doc/source/tutorial/linalg_batch.md @@ -12,15 +12,20 @@ kernelspec: orphan: true --- ++++ {"tags": ["jupyterlite_sphinx_strip"]} + ```{eval-rst} -.. jupyterlite:: ../_contents/linalg_batch.ipynb +.. notebooklite:: linalg_batch.md :new_tab: True ``` (linalg_batch)= + ++++ + # Batched Linear Operations -Some of SciPy's linear algebra functions support N-dimensional array input. These operations have not been mathematically generalized to higher-order tensors; rather, the indicated operation is performed on a *batch* (or "stack") of input scalars, vectors, and/or matrices. +Almost all of SciPy's linear algebra functions now support N-dimensional array input. These operations have not been mathematically generalized to higher-order tensors; rather, the indicated operation is performed on a *batch* (or "stack") of input scalars, vectors, and/or matrices. Consider the `linalg.det` function, which maps a matrix to a scalar. @@ -151,3 +156,36 @@ input_b = rng.random(batch_shape_b + core_shape_b) evals, evecs = linalg.eig(input_a, b=input_b) evals.shape, evecs.shape ``` + +There are a few functions for which the core dimensionality (i.e., the length of the core shape) of an argument or output can be either 1 or 2. In these cases, the core dimensionality is taken to be 1 if the array has only one dimension and 2 if the array has two or more dimensions. For instance, consider the following calls to {func}`scipy.linalg.solve`. The simplest case is a single square matrix `A` and a single vector `b`: + +```{code-cell} ipython3 +A = np.eye(5) +b = np.arange(5) +linalg.solve(A, b) +``` + +In this case, the core dimensionality of `A` is 2 (shape `(5, 5)`), the core dimensionality of `b` is 1 (shape `(5,)`), and the core dimensionality of the output is 1 (shape `(5,)`). + +However, `b` can also be a two-dimensional array in which the *columns* are taken to be one-dimensional vectors. + +```{code-cell} ipython3 +b = np.empty((5, 2)) +b[:, 0] = np.arange(5) +b[:, 1] = np.arange(5, 10) +linalg.solve(A, b) +``` + +```{code-cell} ipython3 +b.shape +``` + +At first glance, it might seem that the core shape of `b` is still `(5,)`, and we have simply performed the operation with a batch shape of `(2,)`. However, if this were the case, the batch shape of `b` would be *prepended* to the core shape, resulting in `b` and the output having shape `(2, 5)`. 
Thinking more carefully, it is correct to consider the core dimensionality of both inputs and the output to be 2; the batch shape is `()`. + +Likewise, whenever `b` has more than two dimensions, the core dimensionality of `b` and the output is considered to be 2. For example, to solve a batch of three entirely separate linear systems, each with only one right-hand side, `b` must be provided as a three-dimensional array: one dimension for the batch shape (`(3,)`) and two for the core shape (`(5, 1)`). + +```{code-cell} ipython3 +A = rng.random((3, 5, 5)) +b = rng.random((3, 5, 1)) # batch shape (3,), core shape (5, 1) +linalg.solve(A, b).shape +``` diff --git a/doc/source/tutorial/stats/hypothesis_bartlett.md b/doc/source/tutorial/stats/hypothesis_bartlett.md index 7516b7375c90..41ac57adbe71 100644 --- a/doc/source/tutorial/stats/hypothesis_bartlett.md +++ b/doc/source/tutorial/stats/hypothesis_bartlett.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_bartlett.ipynb +.. notebooklite:: hypothesis_bartlett.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_chi2_contingency.md b/doc/source/tutorial/stats/hypothesis_chi2_contingency.md index 54474e5bf843..e040c93f598e 100644 --- a/doc/source/tutorial/stats/hypothesis_chi2_contingency.md +++ b/doc/source/tutorial/stats/hypothesis_chi2_contingency.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_chi2_contingency.ipynb +.. notebooklite:: hypothesis_chi2_contingency.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_chisquare.md b/doc/source/tutorial/stats/hypothesis_chisquare.md index df55e3cab25b..86addd094e1c 100644 --- a/doc/source/tutorial/stats/hypothesis_chisquare.md +++ b/doc/source/tutorial/stats/hypothesis_chisquare.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_chisquare.ipynb +.. notebooklite:: hypothesis_chisquare.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_dunnett.md b/doc/source/tutorial/stats/hypothesis_dunnett.md index 080ed771dd06..7fdf00815ccd 100644 --- a/doc/source/tutorial/stats/hypothesis_dunnett.md +++ b/doc/source/tutorial/stats/hypothesis_dunnett.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_dunnett.ipynb +.. notebooklite:: hypothesis_dunnett.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_fisher_exact.md b/doc/source/tutorial/stats/hypothesis_fisher_exact.md index ad122446e4f2..0c19b5a72c91 100644 --- a/doc/source/tutorial/stats/hypothesis_fisher_exact.md +++ b/doc/source/tutorial/stats/hypothesis_fisher_exact.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_fisher_exact.ipynb +.. notebooklite:: hypothesis_fisher_exact.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_fligner.md b/doc/source/tutorial/stats/hypothesis_fligner.md index ab877425f7ae..6bc118cdf4e5 100644 --- a/doc/source/tutorial/stats/hypothesis_fligner.md +++ b/doc/source/tutorial/stats/hypothesis_fligner.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_fligner.ipynb +..
notebooklite:: hypothesis_fligner.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_friedmanchisquare.md b/doc/source/tutorial/stats/hypothesis_friedmanchisquare.md index 4cbabaa0562a..3a1e00a645f1 100644 --- a/doc/source/tutorial/stats/hypothesis_friedmanchisquare.md +++ b/doc/source/tutorial/stats/hypothesis_friedmanchisquare.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_friedmanchisquare.ipynb +.. notebooklite:: hypothesis_friedmanchisquare.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_jarque_bera.md b/doc/source/tutorial/stats/hypothesis_jarque_bera.md index 74d31d349d13..f91733489a1e 100644 --- a/doc/source/tutorial/stats/hypothesis_jarque_bera.md +++ b/doc/source/tutorial/stats/hypothesis_jarque_bera.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_jarque_bera.ipynb +.. notebooklite:: hypothesis_jarque_bera.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_kendalltau.md b/doc/source/tutorial/stats/hypothesis_kendalltau.md index c0cea74766e7..01907444582e 100644 --- a/doc/source/tutorial/stats/hypothesis_kendalltau.md +++ b/doc/source/tutorial/stats/hypothesis_kendalltau.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_kendalltau.ipynb +.. notebooklite:: hypothesis_kendalltau.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_kurtosistest.md b/doc/source/tutorial/stats/hypothesis_kurtosistest.md index 89671e75a6ed..68f23345b2cd 100644 --- a/doc/source/tutorial/stats/hypothesis_kurtosistest.md +++ b/doc/source/tutorial/stats/hypothesis_kurtosistest.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_kurtosistest.ipynb +.. notebooklite:: hypothesis_kurtosistest.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_levene.md b/doc/source/tutorial/stats/hypothesis_levene.md index 12dd439a5404..418b9048ba34 100644 --- a/doc/source/tutorial/stats/hypothesis_levene.md +++ b/doc/source/tutorial/stats/hypothesis_levene.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_levene.ipynb +.. notebooklite:: hypothesis_levene.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_normaltest.md b/doc/source/tutorial/stats/hypothesis_normaltest.md index 827e33e8be73..5785ec62703e 100644 --- a/doc/source/tutorial/stats/hypothesis_normaltest.md +++ b/doc/source/tutorial/stats/hypothesis_normaltest.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_normaltest.ipynb +.. notebooklite:: hypothesis_normaltest.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_odds_ratio.md b/doc/source/tutorial/stats/hypothesis_odds_ratio.md index d9d0816d5777..32f56a08b720 100644 --- a/doc/source/tutorial/stats/hypothesis_odds_ratio.md +++ b/doc/source/tutorial/stats/hypothesis_odds_ratio.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_odds_ratio.ipynb +.. 
notebooklite:: hypothesis_odds_ratio.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_shapiro.md b/doc/source/tutorial/stats/hypothesis_shapiro.md index ab8d081fdfc6..0bf7cdf3de10 100644 --- a/doc/source/tutorial/stats/hypothesis_shapiro.md +++ b/doc/source/tutorial/stats/hypothesis_shapiro.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_shapiro.ipynb +.. notebooklite:: hypothesis_shapiro.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_skewtest.md b/doc/source/tutorial/stats/hypothesis_skewtest.md index 525a482e9f74..f99bd41809ae 100644 --- a/doc/source/tutorial/stats/hypothesis_skewtest.md +++ b/doc/source/tutorial/stats/hypothesis_skewtest.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_skewtest.ipynb +.. notebooklite:: hypothesis_skewtest.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/hypothesis_spearmanr.md b/doc/source/tutorial/stats/hypothesis_spearmanr.md index b8c944c44e58..cdd6a90ca492 100644 --- a/doc/source/tutorial/stats/hypothesis_spearmanr.md +++ b/doc/source/tutorial/stats/hypothesis_spearmanr.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/hypothesis_spearmanr.ipynb +.. notebooklite:: hypothesis_spearmanr.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/rv_infrastructure.md b/doc/source/tutorial/stats/rv_infrastructure.md index 1713e4f36eee..1195bf676764 100644 --- a/doc/source/tutorial/stats/rv_infrastructure.md +++ b/doc/source/tutorial/stats/rv_infrastructure.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/rv_infrastructure.ipynb +.. notebooklite:: rv_infrastructure.md :new_tab: True ``` diff --git a/doc/source/tutorial/stats/sampling.md b/doc/source/tutorial/stats/sampling.md index 4e2297b5ee83..50d4f94f0d22 100644 --- a/doc/source/tutorial/stats/sampling.md +++ b/doc/source/tutorial/stats/sampling.md @@ -14,7 +14,7 @@ kernelspec: +++ {"tags": ["jupyterlite_sphinx_strip"]} ```{eval-rst} -.. jupyterlite:: ../../_contents/sampling.ipynb +.. 
notebooklite:: sampling.md :new_tab: True ``` diff --git a/environment.yml b/environment.yml index d0f923980a1a..33011b0d3377 100644 --- a/environment.yml +++ b/environment.yml @@ -47,7 +47,7 @@ dependencies: - sphinx-design - jupytext - myst-nb - - jupyterlite-sphinx>=0.16.5 + - jupyterlite-sphinx>=0.17.1 - jupyterlite-pyodide-kernel # Some optional test dependencies - mpmath diff --git a/mypy.ini b/mypy.ini index 0f9606329823..95deab150a79 100644 --- a/mypy.ini +++ b/mypy.ini @@ -261,6 +261,54 @@ ignore_errors = True [mypy-scipy.signal._ltisys] ignore_errors = True +[mypy-scipy.signal._support_alternative_backends] +ignore_errors = True + +[mypy-scipy.signal._signal_api] +ignore_errors = True + +[mypy-scipy.signal.tests.test_upfirdn] +ignore_errors = True + +[mypy-scipy.signal.tests.test_splines] +ignore_errors = True + +[mypy-scipy.signal.tests.test_short_time_fft] +ignore_errors = True + +[mypy-scipy.signal.tests.test_savitzky_golay] +ignore_errors = True + +[mypy-scipy.signal.tests.test_result_type] +ignore_errors = True + +[mypy-scipy.signal.tests.test_max_len_seq] +ignore_errors = True + +[mypy-scipy.signal.tests.test_ltisys] +ignore_errors = True + +[mypy-scipy.signal.tests.test_dltisys] +ignore_errors = True + +[mypy-scipy.signal.tests.test_fir_filter_design] +ignore_errors = True + +[mypy-scipy.signal.tests.test_filter_design] +ignore_errors = True + +[mypy-scipy.signal.tests.test_spectral] +ignore_errors = True + +[mypy-scipy.signal.tests._scipy_spectral_test_shim] +ignore_errors = True + +[mypy-scipy.signal.tests.test_cont2discrete] +ignore_errors = True + +[mypy-scipy.signal.tests.test_czt] +ignore_errors = True + [mypy-scipy.integrate._ode] ignore_errors = True diff --git a/pyproject.toml b/pyproject.toml index 8d83d6c8caff..13ae3d7d015b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,7 +99,7 @@ doc = [ "jupytext", "myst-nb", "pooch", - "jupyterlite-sphinx>=0.16.5", + "jupyterlite-sphinx>=0.17.1", "jupyterlite-pyodide-kernel", ] dev = [ diff --git a/pytest.ini b/pytest.ini index c7ef634e840c..35241adf0362 100644 --- a/pytest.ini +++ b/pytest.ini @@ -22,3 +22,12 @@ filterwarnings = ignore:Using the slower implementation::cupy ignore:Jitify is performing a one-time only warm-up::cupy ignore:.*scipy.misc.*:DeprecationWarning + +markers = + slow: Tests that are very slow + xslow: mark test as extremely slow (not run unless explicitly requested) + xfail_on_32bit: mark test as failing on 32-bit platforms + array_api_backends: test iterates on all array API backends + array_api_compatible: test is compatible with array API + skip_xp_backends(backends, reason=None, np_only=False, cpu_only=False, exceptions=None): mark the desired skip configuration for the `skip_xp_backends` fixture + xfail_xp_backends(backends, reason=None, np_only=False, cpu_only=False, exceptions=None): mark the desired xfail configuration for the `xfail_xp_backends` fixture diff --git a/requirements/doc.txt b/requirements/doc.txt index cd11aa0db0e2..9f13a656e2ac 100644 --- a/requirements/doc.txt +++ b/requirements/doc.txt @@ -10,5 +10,5 @@ numpydoc jupytext myst-nb pooch -jupyterlite-sphinx>=0.16.5 +jupyterlite-sphinx>=0.17.1 jupyterlite-pyodide-kernel diff --git a/scipy/__init__.py b/scipy/__init__.py index 2e001cfc1359..ca6bb93fcc73 100644 --- a/scipy/__init__.py +++ b/scipy/__init__.py @@ -3,13 +3,10 @@ ================================================ Documentation is available in the docstrings and -online at https://docs.scipy.org. 
+online at https://docs.scipy.org/doc/scipy/ Subpackages ----------- -Using any of these subpackages requires an explicit import. For example, -``import scipy.cluster``. - :: cluster --- Vector Quantization / Kmeans diff --git a/scipy/_lib/_array_api.py b/scipy/_lib/_array_api.py index 68744a12ca35..7b90aa12acb3 100644 --- a/scipy/_lib/_array_api.py +++ b/scipy/_lib/_array_api.py @@ -8,6 +8,9 @@ """ import os +from collections.abc import Generator +from contextlib import contextmanager +from contextvars import ContextVar from types import ModuleType from typing import Any, Literal, TypeAlias @@ -24,12 +27,13 @@ is_cupy_namespace as is_cupy, is_torch_namespace as is_torch, is_jax_namespace as is_jax, + is_dask_namespace as is_dask, is_array_api_strict_namespace as is_array_api_strict ) __all__ = [ '_asarray', 'array_namespace', 'assert_almost_equal', 'assert_array_almost_equal', - 'get_xp_devices', + 'get_xp_devices', 'default_xp', 'is_array_api_strict', 'is_complex', 'is_cupy', 'is_jax', 'is_numpy', 'is_torch', 'SCIPY_ARRAY_API', 'SCIPY_DEVICE', 'scipy_namespace_for', 'xp_assert_close', 'xp_assert_equal', 'xp_assert_less', @@ -103,11 +107,8 @@ def _compliance_scipy(arrays): def _check_finite(array: Array, xp: ModuleType) -> None: """Check for NaNs or Infs.""" - msg = "array must not contain infs or NaNs" - try: - if not xp.all(xp.isfinite(array)): - raise ValueError(msg) - except TypeError: + if not xp.all(xp.isfinite(array)): + msg = "array must not contain infs or NaNs" raise ValueError(msg) @@ -223,12 +224,41 @@ def xp_copy(x: Array, *, xp: ModuleType | None = None) -> Array: return _asarray(x, copy=True, xp=xp) +_default_xp_ctxvar: ContextVar[ModuleType] = ContextVar("_default_xp") + +@contextmanager +def default_xp(xp: ModuleType) -> Generator[None, None, None]: + """In all ``xp_assert_*`` and ``assert_*`` function calls executed within this + context manager, test by default that the namespace of all arrays is + the provided one, unless one explicitly passes the ``xp=`` + parameter or ``check_namespace=False``. + + Without this context manager, the default value for `xp` is the namespace + of the desired array (the second parameter of the assertions). + """ + token = _default_xp_ctxvar.set(xp) + try: + yield + finally: + _default_xp_ctxvar.reset(token) + + def _strict_check(actual, desired, xp, *, check_namespace=True, check_dtype=True, check_shape=True, check_0d=True): __tracebackhide__ = True # Hide traceback for py.test + + if xp is None: + try: + xp = _default_xp_ctxvar.get() + except LookupError: + xp = array_namespace(desired) + else: + # Wrap namespace if needed + xp = array_namespace(xp.asarray(0)) + if check_namespace: - _assert_matching_namespace(actual, desired) + _assert_matching_namespace(actual, desired, xp) # only NumPy distinguishes between scalars and arrays; we do if check_0d=True. # do this first so we can then cast to array (and thus use the array API) below.
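A minimal usage sketch of the new ``default_xp`` context manager added above (assuming ``array_api_strict`` is installed; the array values are illustrative):

```python
# Inside `default_xp`, the xp_assert_* helpers check that both the actual
# and desired arrays belong to the given namespace, rather than inferring
# the namespace from the desired array.
import array_api_strict as xps
from scipy._lib._array_api import default_xp, xp_assert_close

with default_xp(xps):
    actual = xps.asarray([1.0, 2.0, 3.0])
    desired = xps.asarray([1.0, 2.0, 3.0])
    xp_assert_close(actual, desired)  # passes: namespaces match the context

# Outside the context, the expected namespace is again inferred from `desired`.
xp_assert_close(actual, desired)
```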
@@ -246,32 +276,39 @@ def _strict_check(actual, desired, xp, *, assert actual.dtype == desired.dtype, _msg if check_shape: + if is_dask(xp): + actual.compute_chunk_sizes() + desired.compute_chunk_sizes() _msg = f"Shapes do not match.\nActual: {actual.shape}\nDesired: {desired.shape}" assert actual.shape == desired.shape, _msg desired = xp.broadcast_to(desired, actual.shape) - return actual, desired + return actual, desired, xp -def _assert_matching_namespace(actual, desired): +def _assert_matching_namespace(actual, desired, xp): __tracebackhide__ = True # Hide traceback for py.test - actual = actual if isinstance(actual, tuple) else (actual,) - desired_space = array_namespace(desired) - for arr in actual: - arr_space = array_namespace(arr) - _msg = (f"Namespaces do not match.\n" - f"Actual: {arr_space.__name__}\n" - f"Desired: {desired_space.__name__}") - assert arr_space == desired_space, _msg + + desired_arr_space = array_namespace(desired) + _msg = ("Namespace of desired array does not match expectations " + "set by the `default_xp` context manager or by the `xp` " + "pytest fixture.\n" + f"Desired array's space: {desired_arr_space.__name__}\n" + f"Expected namespace: {xp.__name__}") + assert desired_arr_space == xp, _msg + + actual_arr_space = array_namespace(actual) + _msg = ("Namespace of actual and desired arrays do not match.\n" + f"Actual: {actual_arr_space.__name__}\n" + f"Desired: {xp.__name__}") + assert actual_arr_space == xp, _msg def xp_assert_equal(actual, desired, *, check_namespace=True, check_dtype=True, check_shape=True, check_0d=True, err_msg='', xp=None): __tracebackhide__ = True # Hide traceback for py.test - if xp is None: - xp = array_namespace(actual) - actual, desired = _strict_check( + actual, desired, xp = _strict_check( actual, desired, xp, check_namespace=check_namespace, check_dtype=check_dtype, check_shape=check_shape, check_0d=check_0d @@ -293,10 +330,8 @@ def xp_assert_close(actual, desired, *, rtol=None, atol=0, check_namespace=True, check_dtype=True, check_shape=True, check_0d=True, err_msg='', xp=None): __tracebackhide__ = True # Hide traceback for py.test - if xp is None: - xp = array_namespace(actual) - actual, desired = _strict_check( + actual, desired, xp = _strict_check( actual, desired, xp, check_namespace=check_namespace, check_dtype=check_dtype, check_shape=check_shape, check_0d=check_0d @@ -326,10 +361,8 @@ def xp_assert_close(actual, desired, *, rtol=None, atol=0, check_namespace=True, def xp_assert_less(actual, desired, *, check_namespace=True, check_dtype=True, check_shape=True, check_0d=True, err_msg='', verbose=True, xp=None): __tracebackhide__ = True # Hide traceback for py.test - if xp is None: - xp = array_namespace(actual) - actual, desired = _strict_check( + actual, desired, xp = _strict_check( actual, desired, xp, check_namespace=check_namespace, check_dtype=check_dtype, check_shape=check_shape, check_0d=check_0d diff --git a/scipy/_lib/_util.py b/scipy/_lib/_util.py index 90060bc65468..3377a63f2b39 100644 --- a/scipy/_lib/_util.py +++ b/scipy/_lib/_util.py @@ -12,6 +12,7 @@ import numpy as np from scipy._lib._array_api import array_namespace, is_numpy, xp_size from scipy._lib._docscrape import FunctionDoc, Parameter +import scipy._lib.array_api_extra as xpx AxisError: type[Exception] @@ -113,7 +114,7 @@ def _lazywhere(cond, arrays, f, fillvalue=None, f2=None): ----- ``xp.where(cond, x, fillvalue)`` requires explicitly forming `x` even where `cond` is False.
This function evaluates ``f(arr1[cond], arr2[cond], ...)`` - onle where `cond` ``is True. + only where `cond` is True. Examples -------- @@ -153,9 +154,9 @@ def _lazywhere(cond, arrays, f, fillvalue=None, f2=None): temp2 = xp.asarray(f2(*(arr[ncond] for arr in arrays))) dtype = xp.result_type(temp1, temp2) out = xp.empty(cond.shape, dtype=dtype) - out[ncond] = temp2 + out = xpx.at(out, ncond).set(temp2) - out[cond] = temp1 + out = xpx.at(out, cond).set(temp1) return out @@ -1235,9 +1236,13 @@ def wrapper(*args, **kwargs): for i, (array, ndim) in enumerate(zip(arrays, ndims)): array = None if array is None else np.asarray(array) shape = () if array is None else array.shape + + if ndim == "1|2": # special case for `solve`, etc. + ndim = 2 if array.ndim >= 2 else 1 + arrays[i] = array batch_shapes.append(shape[:-ndim] if ndim > 0 else shape) - core_shapes.append(shape[-ndim:] if ndim > 0 else shape) + core_shapes.append(shape[-ndim:] if ndim > 0 else ()) # Early exit if call is not batched if not any(batch_shapes): diff --git a/scipy/_lib/array_api_compat b/scipy/_lib/array_api_compat index 498f08656836..6d5366be3eb0 160000 --- a/scipy/_lib/array_api_compat +++ b/scipy/_lib/array_api_compat @@ -1 +1 @@ -Subproject commit 498f086568362002185b005a0a7f38ad136ca8bb +Subproject commit 6d5366be3eb050150321314933606cbba137d9d0 diff --git a/scipy/_lib/array_api_extra b/scipy/_lib/array_api_extra index ccb82a634511..c7b47f1ce772 160000 --- a/scipy/_lib/array_api_extra +++ b/scipy/_lib/array_api_extra @@ -1 +1 @@ -Subproject commit ccb82a63451182d050ddadb9dec6f3e369787c9a +Subproject commit c7b47f1ce772f6ee4cbbb32c45be445567fb16d0 diff --git a/scipy/_lib/meson.build b/scipy/_lib/meson.build index f6804dc19361..58f7f949fbbd 100644 --- a/scipy/_lib/meson.build +++ b/scipy/_lib/meson.build @@ -165,6 +165,7 @@ py3.install_sources( [ 'array_api_compat/array_api_compat/cupy/__init__.py', 'array_api_compat/array_api_compat/cupy/_aliases.py', + 'array_api_compat/array_api_compat/cupy/_info.py', 'array_api_compat/array_api_compat/cupy/_typing.py', 'array_api_compat/array_api_compat/cupy/fft.py', @@ -195,6 +197,7 @@ py3.install_sources( [ 'array_api_compat/array_api_compat/numpy/__init__.py', 'array_api_compat/array_api_compat/numpy/_aliases.py', + 'array_api_compat/array_api_compat/numpy/_info.py', 'array_api_compat/array_api_compat/numpy/_typing.py', 'array_api_compat/array_api_compat/numpy/fft.py', diff --git a/scipy/_lib/tests/test__util.py b/scipy/_lib/tests/test__util.py index 2a4d22ce468e..7222b6a0ae82 100644 --- a/scipy/_lib/tests/test__util.py +++ b/scipy/_lib/tests/test__util.py @@ -13,15 +13,14 @@ from scipy.conftest import array_api_compatible, skip_xp_invalid_arg from scipy._lib._array_api import (xp_assert_equal, xp_assert_close, is_numpy, - xp_copy, is_array_api_strict) + is_array_api_strict) from scipy._lib._util import (_aligned_zeros, check_random_state, MapWrapper, getfullargspec_no_self, FullArgSpec, rng_integers, _validate_int, _rename_parameter, _contains_nan, _rng_html_rewrite, _lazywhere) +import scipy._lib.array_api_extra as xpx from scipy import cluster, interpolate,
linalg, optimize, sparse, spatial, stats -skip_xp_backends = pytest.mark.skip_xp_backends - @pytest.mark.slow def test__aligned_zeros(): @@ -347,17 +346,13 @@ def test_contains_nan_with_strings(self): data4 = np.array([["1", 2], [3, np.nan]], dtype='object') assert _contains_nan(data4)[0] - @skip_xp_backends('jax.numpy', - reason="JAX arrays do not support item assignment") - @pytest.mark.usefixtures("skip_xp_backends") @array_api_compatible @pytest.mark.parametrize("nan_policy", ['propagate', 'omit', 'raise']) def test_array_api(self, xp, nan_policy): rng = np.random.default_rng(932347235892482) x0 = rng.random(size=(2, 3, 4)) x = xp.asarray(x0) - x_nan = xp_copy(x, xp=xp) - x_nan[1, 2, 1] = np.nan + x_nan = xpx.at(x)[1, 2, 1].set(np.nan, copy=True) contains_nan, nan_policy_out = _contains_nan(x, nan_policy=nan_policy) assert not contains_nan @@ -602,15 +597,11 @@ class TestLazywhere: @pytest.mark.fail_slow(10) @pytest.mark.filterwarnings('ignore::RuntimeWarning') # overflows, etc. - @skip_xp_backends('jax.numpy', - reason="JAX arrays do not support item assignment") - @pytest.mark.usefixtures("skip_xp_backends") @array_api_compatible @given(n_arrays=n_arrays, rng_seed=rng_seed, dtype=dtype, p=p, data=data) @pytest.mark.thread_unsafe def test_basic(self, n_arrays, rng_seed, dtype, p, data, xp): - mbs = npst.mutually_broadcastable_shapes(num_shapes=n_arrays+1, - min_side=0) + mbs = npst.mutually_broadcastable_shapes(num_shapes=n_arrays+1, min_side=0) input_shapes, result_shape = data.draw(mbs) cond_shape, *shapes = input_shapes elements = {'allow_subnormal': False} # cupy/cupy#8382 diff --git a/scipy/_lib/tests/test_array_api.py b/scipy/_lib/tests/test_array_api.py index f02be1702e98..ccb5de187869 100644 --- a/scipy/_lib/tests/test_array_api.py +++ b/scipy/_lib/tests/test_array_api.py @@ -4,13 +4,11 @@ from scipy.conftest import array_api_compatible from scipy._lib._array_api import ( _GLOBAL_CONFIG, array_namespace, _asarray, xp_copy, xp_assert_equal, is_numpy, - np_compat, + np_compat, is_dask ) from scipy._lib import array_api_extra as xpx from scipy._lib._array_api_no_0d import xp_assert_equal as xp_assert_equal_no_0d -skip_xp_backends = pytest.mark.skip_xp_backends - @pytest.mark.skipif(not _GLOBAL_CONFIG["SCIPY_ARRAY_API"], reason="Array API test; set environment variable SCIPY_ARRAY_API=1 to run it") @@ -63,9 +61,6 @@ def test_array_api_extra_hook(self): with pytest.raises(TypeError, match=msg): xpx.atleast_nd("abc", ndim=0) - @skip_xp_backends('jax.numpy', - reason="JAX arrays do not support item assignment") - @pytest.mark.usefixtures("skip_xp_backends") @array_api_compatible def test_copy(self, xp): for _xp in [xp, None]: @@ -73,15 +68,14 @@ def test_copy(self, xp): y = xp_copy(x, xp=_xp) # with numpy we'd want to use np.shared_memory, but that's not specified # in the array-api - x[0] = 10 - x[1] = 11 - x[2] = 12 - - assert x[0] != y[0] - assert x[1] != y[1] - assert x[2] != y[2] assert id(x) != id(y) - + try: + y[0] = 10 + except (TypeError, ValueError): + pass + else: + assert x[0] != y[0] + @array_api_compatible @pytest.mark.parametrize('dtype', ['int32', 'int64', 'float32', 'float64']) @pytest.mark.parametrize('shape', [(), (3,)]) @@ -97,8 +91,16 @@ def test_strict_checks(self, xp, dtype, shape): if xp == np: xp_assert_equal(x, y, **options) else: - with pytest.raises(AssertionError, match="Namespaces do not match."): + with pytest.raises( + AssertionError, + match="Namespace of desired array does not match", + ): xp_assert_equal(x, y, **options) + with pytest.raises( + 
AssertionError, + match="Namespace of actual and desired arrays do not match", + ): + xp_assert_equal(y, x, **options) options = dict(zip(kwarg_names, [False, True, False, False])) if y.dtype.name in str(x.dtype): diff --git a/scipy/_lib/tests/test_public_api.py b/scipy/_lib/tests/test_public_api.py index 5ad2dd9fdb63..dc2efa1bec93 100644 --- a/scipy/_lib/tests/test_public_api.py +++ b/scipy/_lib/tests/test_public_api.py @@ -357,6 +357,7 @@ def check_importable(module_name): ('scipy.integrate.odepack', None), ('scipy.integrate.quadpack', None), ('scipy.integrate.vode', None), + ('scipy.interpolate.dfitpack', None), ('scipy.interpolate.fitpack', None), ('scipy.interpolate.fitpack2', None), ('scipy.interpolate.interpolate', None), diff --git a/scipy/cluster/hierarchy.py b/scipy/cluster/hierarchy.py index de15e9db6682..e180ae0e9e2c 100644 --- a/scipy/cluster/hierarchy.py +++ b/scipy/cluster/hierarchy.py @@ -4157,7 +4157,7 @@ def leaders(Z, T): if T.shape[0] != Z.shape[0] + 1: raise ValueError('Mismatch: len(T)!=Z.shape[0] + 1.') - n_clusters = int(xp.unique_values(T).shape[0]) + n_clusters = int(np.asarray(xp.unique_values(T)).shape[0]) n_obs = int(Z.shape[0] + 1) L = np.zeros(n_clusters, dtype=np.int32) M = np.zeros(n_clusters, dtype=np.int32) diff --git a/scipy/cluster/tests/test_hierarchy.py b/scipy/cluster/tests/test_hierarchy.py index b8f568625980..ded0150d7d39 100644 --- a/scipy/cluster/tests/test_hierarchy.py +++ b/scipy/cluster/tests/test_hierarchy.py @@ -970,6 +970,9 @@ def test_valid_label_size(self, xp): reason='MPL 3.9.2 & torch DeprecationWarning from __array_wrap__' ' and NumPy 2.0' ) + @skip_xp_backends('dask.array', + reason='dask.array has bad interaction with matplotlib' + ) @pytest.mark.skipif(not have_matplotlib, reason="no matplotlib") def test_dendrogram_plot(self, xp): for orientation in ['top', 'bottom', 'left', 'right']: @@ -1041,6 +1044,9 @@ def check_dendrogram_plot(self, orientation, xp): reason='MPL 3.9.2 & torch DeprecationWarning from __array_wrap__' ' and NumPy 2.0' ) + @skip_xp_backends('dask.array', + reason='dask.array has bad interaction with matplotlib' + ) @pytest.mark.skipif(not have_matplotlib, reason="no matplotlib") def test_dendrogram_truncate_mode(self, xp): Z = linkage(xp.asarray(hierarchy_test_data.ytdist), 'single') diff --git a/scipy/cluster/tests/test_vq.py b/scipy/cluster/tests/test_vq.py index d0321e7d81d7..0697691da924 100644 --- a/scipy/cluster/tests/test_vq.py +++ b/scipy/cluster/tests/test_vq.py @@ -357,6 +357,7 @@ def test_kmeans2_init(self, xp): def krand_lock(self): return Lock() + @skip_xp_backends('dask.array', reason="Wrong answer") @pytest.mark.skipif(sys.platform == 'win32', reason='Fails with MemoryError in Wine.') def test_krandinit(self, xp, krand_lock): diff --git a/scipy/cluster/vq.py b/scipy/cluster/vq.py index a791e2956070..34045c1357fe 100644 --- a/scipy/cluster/vq.py +++ b/scipy/cluster/vq.py @@ -310,20 +310,18 @@ def _kmeans(obs, guess, thresh=1e-5, xp=None): code_book = guess diff = xp.inf prev_avg_dists = deque([diff], maxlen=2) + + np_obs = np.asarray(obs) while diff > thresh: # compute membership and distances between obs and code_book obs_code, distort = vq(obs, code_book, check_finite=False) prev_avg_dists.append(xp.mean(distort, axis=-1)) # recalc code_book as centroids of associated obs - obs = np.asarray(obs) obs_code = np.asarray(obs_code) - code_book, has_members = _vq.update_cluster_means(obs, obs_code, + code_book, has_members = _vq.update_cluster_means(np_obs, obs_code, code_book.shape[0]) - obs = 
xp.asarray(obs) - obs_code = xp.asarray(obs_code) - code_book = xp.asarray(code_book) - has_members = xp.asarray(has_members) code_book = code_book[has_members] + code_book = xp.asarray(code_book) diff = xp.abs(prev_avg_dists[0] - prev_avg_dists[1]) return code_book, prev_avg_dists[1] @@ -814,7 +812,7 @@ def kmeans2(data, k, iter=10, thresh=1e-5, minit='random', data = np.asarray(data) code_book = np.asarray(code_book) - for i in range(iter): + for _ in range(iter): # Compute the nearest neighbor for each obs using the current code book label = vq(data, code_book, check_finite=check_finite)[0] # Update the code book by computing centroids diff --git a/scipy/conftest.py b/scipy/conftest.py index ca2cf0b3187e..dccfb967c4db 100644 --- a/scipy/conftest.py +++ b/scipy/conftest.py @@ -12,7 +12,7 @@ from scipy._lib._fpumode import get_fpu_mode from scipy._lib._testutils import FPUModeChangeWarning -from scipy._lib._array_api import SCIPY_ARRAY_API, SCIPY_DEVICE +from scipy._lib._array_api import SCIPY_ARRAY_API, SCIPY_DEVICE, xp_device from scipy._lib import _pep440 try: @@ -29,12 +29,6 @@ def pytest_configure(config): - config.addinivalue_line("markers", - "slow: Tests that are very slow.") - config.addinivalue_line("markers", - "xslow: mark test as extremely slow (not run unless explicitly requested)") - config.addinivalue_line("markers", - "xfail_on_32bit: mark test as failing on 32-bit platforms") try: import pytest_timeout # noqa:F401 except Exception: @@ -47,14 +41,7 @@ def pytest_configure(config): except Exception: config.addinivalue_line( "markers", 'fail_slow: mark a test for a non-default timeout failure') - config.addinivalue_line("markers", - "skip_xp_backends(backends, reason=None, np_only=False, cpu_only=False, " - "exceptions=None): " - "mark the desired skip configuration for the `skip_xp_backends` fixture.") - config.addinivalue_line("markers", - "xfail_xp_backends(backends, reason=None, np_only=False, cpu_only=False, " - "exceptions=None): " - "mark the desired xfail configuration for the `xfail_xp_backends` fixture.") + if not PARALLEL_RUN_AVAILABLE: config.addinivalue_line( 'markers', @@ -178,6 +165,15 @@ def num_parallel_threads(): except ImportError: pass + try: + # Note: dask.array main namespace is not array API compatible + # (to address this, we will fix tests that use the broken dask behavior to + # use the array-api-compat wrapped version instead) + import dask.array # type: ignore[import-not-found] + xp_available_backends.update({'dask.array': dask.array}) + except ImportError: + pass + # by default, use all available backends if SCIPY_ARRAY_API.lower() not in ("1", "true"): SCIPY_ARRAY_API_ = json.loads(SCIPY_ARRAY_API) @@ -206,7 +202,32 @@ def num_parallel_threads(): from cupyx.scipy import signal del signal -array_api_compatible = pytest.mark.parametrize("xp", xp_available_backends.values()) + +@pytest.fixture(params=[ + pytest.param(v, id=k, marks=pytest.mark.array_api_backends) + for k, v in xp_available_backends.items() +]) +def xp(request): + """Run the test that uses this fixture on each available array API library. + + You can select all and only the tests that use the `xp` fixture by + passing `-m array_api_backends` to pytest. 
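# Hedged sketch of how a test consumes the new `xp` fixture together with the
# markers now registered in pytest.ini: the fixture parametrizes the test over
# every backend detected above, and `pytest -m array_api_backends` selects
# exactly these parametrizations. The test body is an illustrative assumption,
# not code from this PR.
import pytest

@pytest.mark.skip_xp_backends('jax.numpy', reason='immutable arrays')
@pytest.mark.usefixtures("skip_xp_backends")
def test_sum(xp):
    x = xp.asarray([1.0, 2.0, 3.0])
    assert float(xp.sum(x)) == 6.0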
+ + Please read: https://docs.scipy.org/doc/scipy/dev/api-dev/array_api.html + """ + if SCIPY_ARRAY_API: + from scipy._lib._array_api import default_xp + + # Throughout all calls to assert_almost_equal, assert_array_almost_equal, and + # xp_assert_* functions, test that the array namespace is xp in both the + # expected and actual arrays. This is to detect the case where both arrays are + # erroneously just plain numpy while xp is something else. + with default_xp(request.param): + yield request.param + else: + yield request.param + +array_api_compatible = pytest.mark.array_api_compatible skip_xp_invalid_arg = pytest.mark.skipif(SCIPY_ARRAY_API, reason = ('Test involves masked arrays, object arrays, or other types ' @@ -366,6 +387,9 @@ def skip_or_xfail_xp_backends(xp, backends, kwargs, skip_or_xfail='skip'): for d in xp.empty(0).devices(): if 'cpu' not in d.device_kind: skip_or_xfail(reason=reason) + elif xp.__name__ == 'dask.array': + if xp_device(xp.empty(0)) != 'cpu': + skip_or_xfail(reason=reason) # Following the approach of NumPy's conftest.py... diff --git a/scipy/differentiate/_differentiate.py b/scipy/differentiate/_differentiate.py index 2b50b1732dca..0e104a071055 100644 --- a/scipy/differentiate/_differentiate.py +++ b/scipy/differentiate/_differentiate.py @@ -802,27 +802,47 @@ def jacobian(f, x, *, tolerances=None, maxiter=10, order=8, initial_step=0.5, Notes ----- Suppose we wish to evaluate the Jacobian of a function - :math:`f: \mathbf{R}^m \rightarrow \mathbf{R}^n`, and assign to variables + :math:`f: \mathbf{R}^m \rightarrow \mathbf{R}^n`. Assign to variables ``m`` and ``n`` the positive integer values of :math:`m` and :math:`n`, - respectively. If we wish to evaluate the Jacobian at a single point, - then: + respectively, and let ``...`` represent an arbitrary tuple of integers. + If we wish to evaluate the Jacobian at a single point, then: - argument `x` must be an array of shape ``(m,)`` - - argument `f` must be vectorized to accept an array of shape ``(m, p)``. - The first axis represents the :math:`m` inputs of :math:`f`; the second - is for evaluating the function at multiple points in a single call. - - argument `f` must return an array of shape ``(n, p)``. The first - axis represents the :math:`n` outputs of :math:`f`; the second - is for the result of evaluating the function at multiple points. + - argument `f` must be vectorized to accept an array of shape ``(m, ...)``. + The first axis represents the :math:`m` inputs of :math:`f`; the remainder + are for evaluating the function at multiple points in a single call. + - argument `f` must return an array of shape ``(n, ...)``. The first + axis represents the :math:`n` outputs of :math:`f`; the remainder + are for the result of evaluating the function at multiple points. - attribute ``df`` of the result object will be an array of shape ``(n, m)``, the Jacobian. This function is also vectorized in the sense that the Jacobian can be evaluated at ``k`` points in a single call. In this case, `x` would be an array of shape ``(m, k)``, `f` would accept an array of shape - ``(m, k, p)`` and return an array of shape ``(n, k, p)``, and the ``df`` + ``(m, k, ...)`` and return an array of shape ``(n, k, ...)``, and the ``df`` attribute of the result would have shape ``(n, m, k)``. + Suppose the desired callable ``f_not_vectorized`` is not vectorized; it can + only accept an array of shape ``(m,)``. 
A simple solution to satisfy the required + interface is to wrap ``f_not_vectorized`` as follows:: + + def f(x): + return np.apply_along_axis(f_not_vectorized, axis=0, arr=x) + + Alternatively, suppose the desired callable ``f_vec_q`` is vectorized, but + only for 2-D arrays of shape ``(m, q)``. To satisfy the required interface, + consider:: + + def f(x): + m, batch = x.shape[0], x.shape[1:] # x.shape is (m, ...) + x = np.reshape(x, (m, -1)) # `-1` is short for q = prod(batch) + res = f_vec_q(x) # pass shape (m, q) to function + n = res.shape[0] + return np.reshape(res, (n,) + batch) # return shape (n, ...) + + Then pass the wrapped callable ``f`` as the first argument of `jacobian`. + References ---------- .. [1] Jacobian matrix and determinant, *Wikipedia*, diff --git a/scipy/differentiate/tests/test_differentiate.py b/scipy/differentiate/tests/test_differentiate.py index 64bc8193cc23..f9ea4e4478ee 100644 --- a/scipy/differentiate/tests/test_differentiate.py +++ b/scipy/differentiate/tests/test_differentiate.py @@ -21,6 +21,7 @@ @pytest.mark.skip_xp_backends('array_api_strict', reason=array_api_strict_skip_reason) @pytest.mark.skip_xp_backends('jax.numpy',reason=jax_skip_reason) +@pytest.mark.skip_xp_backends('dask.array', reason='boolean indexing assignment') class TestDerivative: def f(self, x): @@ -477,6 +478,7 @@ def test_iv(self, xp): @pytest.mark.skip_xp_backends('array_api_strict', reason=array_api_strict_skip_reason) @pytest.mark.skip_xp_backends('jax.numpy',reason=jax_skip_reason) +@pytest.mark.skip_xp_backends('dask.array', reason='boolean indexing assignment') class TestJacobian(JacobianHessianTest): jh_func = jacobian @@ -628,6 +630,7 @@ def f(x): @pytest.mark.skip_xp_backends('array_api_strict', reason=array_api_strict_skip_reason) @pytest.mark.skip_xp_backends('jax.numpy',reason=jax_skip_reason) +@pytest.mark.skip_xp_backends('dask.array', reason='boolean indexing assignment') class TestHessian(JacobianHessianTest): jh_func = hessian diff --git a/scipy/fft/tests/test_basic.py b/scipy/fft/tests/test_basic.py index 4ed32d54c893..38284e3a6711 100644 --- a/scipy/fft/tests/test_basic.py +++ b/scipy/fft/tests/test_basic.py @@ -333,8 +333,12 @@ def test_dtypes_real(self, dtype, xp): @pytest.mark.parametrize("dtype", ["complex64", "complex128"]) def test_dtypes_complex(self, dtype, xp): + # Trick to get the array-api-compat namespace for dask + # (otherwise the "naked" dask.array asarray does not respect + # the input dtype) + xp_test = array_namespace(xp.asarray(1)) rng = np.random.default_rng(1234) - x = xp.asarray(rng.random(30), dtype=getattr(xp, dtype)) + x = xp.asarray(rng.random(30), dtype=getattr(xp_test, dtype)) res_fft = fft.ifft(fft.fft(x)) # Check both numerical results and exact dtype matches diff --git a/scipy/fft/tests/test_helper.py b/scipy/fft/tests/test_helper.py index 4333886555ff..f87a9ae4dc92 100644 --- a/scipy/fft/tests/test_helper.py +++ b/scipy/fft/tests/test_helper.py @@ -520,23 +520,24 @@ def test_definition(self, xp): # default dtype varies across backends - y = 9 * fft.fftfreq(9, xp=xp) + wrapped_xp = array_namespace(x) + y = 9 * fft.fftfreq(9, xp=wrapped_xp) xp_assert_close(y, x, check_dtype=False, check_namespace=True) - y = 9 * xp.pi * fft.fftfreq(9, xp.pi, xp=xp) + y = 9 * xp.pi * fft.fftfreq(9, xp.pi, xp=wrapped_xp) xp_assert_close(y, x, check_dtype=False) - y = 10 * fft.fftfreq(10, xp=xp) + y = 10 * fft.fftfreq(10, xp=wrapped_xp) xp_assert_close(y, x2, check_dtype=False) - y = 10 * xp.pi * fft.fftfreq(10, xp.pi, xp=xp) + y = 10 * xp.pi * 
fft.fftfreq(10, xp.pi, xp=wrapped_xp) xp_assert_close(y, x2, check_dtype=False) def test_device(self, xp): xp_test = array_namespace(xp.empty(0)) devices = get_xp_devices(xp) for d in devices: - y = fft.fftfreq(9, xp=xp, device=d) + y = fft.fftfreq(9, xp=xp_test, device=d) x = xp_test.empty(0, device=d) assert xp_device(y) == xp_device(x) @@ -552,23 +553,23 @@ def test_definition(self, xp): x2 = xp.asarray([0, 1, 2, 3, 4, 5], dtype=xp.float64) # default dtype varies across backends - - y = 9 * fft.rfftfreq(9, xp=xp) + wrapped_xp = array_namespace(x) + y = 9 * fft.rfftfreq(9, xp=wrapped_xp) xp_assert_close(y, x, check_dtype=False, check_namespace=True) - y = 9 * xp.pi * fft.rfftfreq(9, xp.pi, xp=xp) + y = 9 * xp.pi * fft.rfftfreq(9, xp.pi, xp=wrapped_xp) xp_assert_close(y, x, check_dtype=False) - y = 10 * fft.rfftfreq(10, xp=xp) + y = 10 * fft.rfftfreq(10, xp=wrapped_xp) xp_assert_close(y, x2, check_dtype=False) - y = 10 * xp.pi * fft.rfftfreq(10, xp.pi, xp=xp) + y = 10 * xp.pi * fft.rfftfreq(10, xp.pi, xp=wrapped_xp) xp_assert_close(y, x2, check_dtype=False) def test_device(self, xp): xp_test = array_namespace(xp.empty(0)) devices = get_xp_devices(xp) for d in devices: - y = fft.rfftfreq(9, xp=xp, device=d) + y = fft.rfftfreq(9, xp=xp_test, device=d) x = xp_test.empty(0, device=d) assert xp_device(y) == xp_device(x) diff --git a/scipy/fft/tests/test_real_transforms.py b/scipy/fft/tests/test_real_transforms.py index 890dc79640af..9937ec4741e0 100644 --- a/scipy/fft/tests/test_real_transforms.py +++ b/scipy/fft/tests/test_real_transforms.py @@ -7,7 +7,7 @@ import scipy.fft as fft from scipy import fftpack from scipy.conftest import array_api_compatible -from scipy._lib._array_api import xp_copy, xp_assert_close +from scipy._lib._array_api import xp_copy, xp_assert_close, array_namespace pytestmark = [array_api_compatible, pytest.mark.usefixtures("skip_xp_backends")] skip_xp_backends = pytest.mark.skip_xp_backends @@ -197,9 +197,10 @@ def test_orthogonalize_noop(func, type, norm, xp): cpu_only=True) @pytest.mark.parametrize("norm", ["backward", "ortho", "forward"]) def test_orthogonalize_dct1(norm, xp): - x = xp.asarray(np.random.rand(100)) + x_np = np.random.rand(100) + x = xp.asarray(x_np) - x2 = xp_copy(x, xp=xp) + x2 = xp.asarray(x_np.copy()) x2[0] *= SQRT_2 x2[-1] *= SQRT_2 @@ -231,8 +232,9 @@ def test_orthogonalize_dcst2(func, norm, xp): @pytest.mark.parametrize("norm", ["backward", "ortho", "forward"]) @pytest.mark.parametrize("func", [dct, dst]) def test_orthogonalize_dcst3(func, norm, xp): - x = xp.asarray(np.random.rand(100)) - x2 = xp_copy(x, xp=xp) + x_np = np.random.rand(100) + x = xp.asarray(x_np) + x2 = xp.asarray(x_np.copy()) x2[0 if func == dct else -1] *= SQRT_2 y1 = func(x, type=3, norm=norm, orthogonalize=True) diff --git a/scipy/integrate/tests/test_cubature.py b/scipy/integrate/tests/test_cubature.py index 899655c7631f..310097dffd1a 100644 --- a/scipy/integrate/tests/test_cubature.py +++ b/scipy/integrate/tests/test_cubature.py @@ -540,6 +540,7 @@ class TestCubatureProblems: Tests that `cubature` gives the correct answer. 
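# Hedged sketch of the pattern used in the fft tests above: `array_namespace`
# returns the array-api-compat *wrapped* namespace for an array, which is what
# `fftfreq(..., xp=...)` needs, since the "naked" dask.array namespace is not
# array API compatible. numpy stands in for any backend here.
import numpy as np
import scipy.fft as fft
from scipy._lib._array_api import array_namespace

x = np.asarray([0., 1., 2., 3., 4., -4., -3., -2., -1.])
wrapped_xp = array_namespace(x)        # wrapped namespace for this array type
y = 9 * fft.fftfreq(9, xp=wrapped_xp)  # result lives in the wrapped namespace
assert np.allclose(y, x)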
""" + @skip_xp_backends("dask.array", reason="Dask hangs/takes a long time for some test cases") @pytest.mark.parametrize("problem", [ # -- f1 -- ( @@ -786,6 +787,7 @@ def test_scalar_output(self, problem, rule, rtol, atol, xp): err_msg=f"estimate_error={res.error}, subdivisions={res.subdivisions}", ) + @skip_xp_backends("dask.array", reason="Dask hangs/takes a long time for some test cases") @pytest.mark.parametrize("problem", [ ( # Function to integrate, like `f(x, *args)` @@ -977,6 +979,10 @@ def test_break_points(self, problem, rule, rtol, atol, xp): "jax.numpy", reasons=["transforms make use of indexing assignment"], ) + @skip_xp_backends( + "dask.array", + reasons=["transforms make use of boolean index assignment"], + ) @pytest.mark.parametrize("problem", [ ( # Function to integrate @@ -1127,6 +1133,10 @@ def test_infinite_limits(self, problem, rule, rtol, atol, xp): "jax.numpy", reasons=["transforms make use of indexing assignment"], ) + @skip_xp_backends( + "dask.array", + reasons=["transforms make use of boolean index assignment"], + ) @pytest.mark.parametrize("problem", [ ( # Function to integrate @@ -1338,6 +1348,10 @@ def test_genz_malik_1d_raises_error(self, xp): "jax.numpy", reasons=["transforms make use of indexing assignment"], ) +@skip_xp_backends( + "dask.array", + reasons=["transforms make use of boolean index assignment"], +) class TestTransformations: @pytest.mark.parametrize(("a", "b", "points"), [ ( diff --git a/scipy/integrate/tests/test_tanhsinh.py b/scipy/integrate/tests/test_tanhsinh.py index e3c05990457e..e96df24c8013 100644 --- a/scipy/integrate/tests/test_tanhsinh.py +++ b/scipy/integrate/tests/test_tanhsinh.py @@ -48,6 +48,9 @@ def wrapped(*arg_arrays): @pytest.mark.skip_xp_backends( 'array_api_strict', reason='Currently uses fancy indexing assignment.' ) +@pytest.mark.skip_xp_backends( + 'dask.array', reason='boolean indexing assignment' +) @pytest.mark.skip_xp_backends( 'jax.numpy', reason='JAX arrays do not support item assignment.' 
) @@ -758,6 +761,7 @@ def test_compress_nodes_weights_gh21496(self, xp): @pytest.mark.usefixtures("skip_xp_backends") @pytest.mark.skip_xp_backends('array_api_strict', reason='No fancy indexing.') @pytest.mark.skip_xp_backends('jax.numpy', reason='No mutation.') +@pytest.mark.skip_xp_backends('dask.array', reason='Data-dependent shapes in boolean index assignment') class TestNSum: rng = np.random.default_rng(5895448232066142650) p = rng.uniform(1, 10, size=10).tolist() diff --git a/scipy/interpolate/_bsplines.py b/scipy/interpolate/_bsplines.py index 3d68e8d53210..3fc0cb3ce8ab 100644 --- a/scipy/interpolate/_bsplines.py +++ b/scipy/interpolate/_bsplines.py @@ -1825,8 +1825,9 @@ def make_lsq_spline(x, y, t, k=3, w=None, axis=0, check_finite=True, *, method=" # have observation matrix & rhs, can solve the LSQ problem cho_decomp = cholesky_banded(ab, overwrite_ab=True, lower=lower, check_finite=check_finite) - c = cho_solve_banded((cho_decomp, lower), rhs, overwrite_b=True, - check_finite=check_finite) + m = rhs.shape[0] + c = cho_solve_banded((cho_decomp, lower), rhs.reshape(m, -1), overwrite_b=True, + check_finite=check_finite).reshape(rhs.shape) elif method == "qr": _, _, c = _lsq_solve_qr(x, yy, t, k, w) diff --git a/scipy/interpolate/_cubic.py b/scipy/interpolate/_cubic.py index 1a9ac67bffdb..fed25e254870 100644 --- a/scipy/interpolate/_cubic.py +++ b/scipy/interpolate/_cubic.py @@ -414,6 +414,7 @@ class Akima1DInterpolator(CubicHermiteSpline): __call__ derivative antiderivative + integrate roots See Also @@ -775,8 +776,9 @@ def __init__(self, x, y, axis=0, bc_type='not-a-knot', extrapolate=None): b[1] = 3 * (dxr[0] * slope[1] + dxr[1] * slope[0]) b[2] = 2 * slope[1] - s = solve(A, b, overwrite_a=True, overwrite_b=True, - check_finite=False) + m = b.shape[0] + s = solve(A, b.reshape(m, -1), overwrite_a=True, overwrite_b=True, + check_finite=False).reshape(b.shape) elif n == 3 and bc[0] == 'periodic': # In case when number of points is 3 we compute the derivatives # manually @@ -836,11 +838,15 @@ def __init__(self, x, y, axis=0, bc_type='not-a-knot', extrapolate=None): b2[-1] = -a_m2_m1 # s1 and s2 are the solutions of (n-2, n-2) system - s1 = solve_banded((1, 1), Ac, b1, overwrite_ab=False, + m = b1.shape[0] + s1 = solve_banded((1, 1), Ac, b1.reshape(m, -1), overwrite_ab=False, overwrite_b=False, check_finite=False) + s1 = s1.reshape(b1.shape) - s2 = solve_banded((1, 1), Ac, b2, overwrite_ab=False, + m = b2.shape[0] + s2 = solve_banded((1, 1), Ac, b2.reshape(m, -1), overwrite_ab=False, overwrite_b=False, check_finite=False) + s2 = s2.reshape(b2.shape) # computing the s[n-2] solution: s_m1 = ((b[-1] - a_m1_0 * s1[0] - a_m1_m2 * s1[-1]) / @@ -882,8 +888,10 @@ def __init__(self, x, y, axis=0, bc_type='not-a-knot', extrapolate=None): A[-1, -2] = dx[-1] b[-1] = 0.5 * bc_end[1] * dx[-1]**2 + 3 * (y[-1] - y[-2]) - s = solve_banded((1, 1), A, b, overwrite_ab=True, + m = b.shape[0] + s = solve_banded((1, 1), A, b.reshape(m, -1), overwrite_ab=True, overwrite_b=True, check_finite=False) + s = s.reshape(b.shape) super().__init__(x, y, s, axis=0, extrapolate=extrapolate) self.axis = axis diff --git a/scipy/interpolate/_rgi.py b/scipy/interpolate/_rgi.py index 8e20200568ed..bb57b14cc42c 100644 --- a/scipy/interpolate/_rgi.py +++ b/scipy/interpolate/_rgi.py @@ -154,6 +154,11 @@ class RegularGridInterpolator: "cubic_legacy" and "quintic_legacy". These methods allow faster construction but evaluations will be much slower. 
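# Hedged sketch of the reshape pattern introduced in _bsplines/_cubic above:
# the banded solvers only accept right-hand sides of shape (m,) or (m, k), so
# an N-D rhs is flattened to (m, -1) for the solve and the solution is
# restored to the original shape afterwards. The system below is illustrative.
import numpy as np
from scipy.linalg import solve_banded

ab = np.array([[0., 1., 1., 1.],   # tridiagonal system in LAPACK banded storage
               [2., 2., 2., 2.],
               [1., 1., 1., 0.]])
b = np.ones((4, 3, 2))             # N-D right-hand side
m = b.shape[0]
x = solve_banded((1, 1), ab, b.reshape(m, -1)).reshape(b.shape)
assert x.shape == b.shape          # exact round-trip back to (4, 3, 2)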
+ **Rounding rule at half points with `nearest` method** + + With the `nearest` method, a query coordinate exactly halfway between two grid points is rounded *down*. + + + Examples -------- **Evaluate a function on the points of a 3-D grid** diff --git a/scipy/interpolate/dfitpack.py b/scipy/interpolate/dfitpack.py index e10da3b3fd0c..71b4407257b4 100644 --- a/scipy/interpolate/dfitpack.py +++ b/scipy/interpolate/dfitpack.py @@ -6,31 +6,11 @@ __all__ = [ # noqa: F822 - 'bispeu', - 'bispev', - 'curfit', - 'dblint', - 'fpchec', - 'fpcurf0', - 'fpcurf1', - 'fpcurfm1', - 'parcur', - 'parder', - 'pardeu', - 'pardtc', - 'percur', - 'regrid_smth', - 'regrid_smth_spher', 'spalde', - 'spherfit_lsq', - 'spherfit_smth', 'splder', 'splev', 'splint', 'sproot', - 'surfit_lsq', - 'surfit_smth', - 'types', ] diff --git a/scipy/interpolate/interpnd.py b/scipy/interpolate/interpnd.py index 4288ac233fdd..2f4265377d20 100644 --- a/scipy/interpolate/interpnd.py +++ b/scipy/interpolate/interpnd.py @@ -21,5 +21,4 @@ def __dir__(): def __getattr__(name): return _sub_module_deprecation(sub_package="interpolate", module="interpnd", private_modules=["_interpnd"], all=__all__, - attribute=name) - + attribute=name, dep_version="1.17.0") diff --git a/scipy/io/_fast_matrix_market/__init__.py b/scipy/io/_fast_matrix_market/__init__.py index cfd6f8fb30ea..898538746551 100644 --- a/scipy/io/_fast_matrix_market/__init__.py +++ b/scipy/io/_fast_matrix_market/__init__.py @@ -194,6 +194,8 @@ def _get_read_cursor(source, parallelism=None): source = bz2.BZ2File(path, 'rb') ret_stream_to_close = source else: + if not os.path.exists(path): + raise FileNotFoundError(f"The source file does not exist: {path}") return _fmm_core.open_read_file(path, parallelism), ret_stream_to_close # Stream object. diff --git a/scipy/io/tests/test_mmio.py b/scipy/io/tests/test_mmio.py index 150371a8038a..016d0ccd43a9 100644 --- a/scipy/io/tests/test_mmio.py +++ b/scipy/io/tests/test_mmio.py @@ -823,3 +823,9 @@ def test_threadpoolctl(): with threadpoolctl.threadpool_limits(limits=2, user_api='scipy'): assert_equal(fmm.PARALLELISM, 2) + + +def test_gh21999_file_not_exist(): + tmpdir = mkdtemp(suffix=str(threading.get_native_id())) + wrong_fn = os.path.join(tmpdir, 'not_exist_test_file.mtx') + assert_raises(FileNotFoundError, mmread, wrong_fn) diff --git a/scipy/linalg/_basic.py b/scipy/linalg/_basic.py index d66f5bae1ff8..921a4231138e 100644 --- a/scipy/linalg/_basic.py +++ b/scipy/linalg/_basic.py @@ -4,6 +4,7 @@ # w/ additions by Travis Oliphant, March 2002 # and Jake Vanderplas, August 2012 +import warnings from warnings import warn from itertools import product import numpy as np @@ -72,6 +73,7 @@ def _find_matrix_structure(a): return kind, n_below, n_above +@_apply_over_batch(('a', 2), ('b', '1|2')) def solve(a, b, lower=False, overwrite_a=False, overwrite_b=False, check_finite=True, assume_a=None, transposed=False): @@ -408,6 +410,7 @@ def _ensure_dtype_cdsz(*arrays): return (array.astype(dtype, copy=False) for array in arrays) +@_apply_over_batch(('a', 2), ('b', '1|2')) def solve_triangular(a, b, trans=0, lower=False, unit_diagonal=False, overwrite_b=False, check_finite=True): """ @@ -591,7 +594,13 @@ def solve_banded(l_and_u, ab, b, overwrite_ab=False, overwrite_b=False, array([-2.37288136, 3.93220339, -4.
, 4.3559322 , -1.3559322 ]) """ + (nlower, nupper) = l_and_u + return _solve_banded(nlower, nupper, ab, b, overwrite_ab=overwrite_ab, + overwrite_b=overwrite_b, check_finite=check_finite) + +@_apply_over_batch(('nlower', 0), ('nupper', 0), ('ab', 2), ('b', '1|2')) +def _solve_banded(nlower, nupper, ab, b, overwrite_ab, overwrite_b, check_finite): a1 = _asarray_validated(ab, check_finite=check_finite, as_inexact=True) b1 = _asarray_validated(b, check_finite=check_finite, as_inexact=True) @@ -599,7 +608,6 @@ def solve_banded(l_and_u, ab, b, overwrite_ab=False, overwrite_b=False, if a1.shape[-1] != b1.shape[0]: raise ValueError("shapes of ab and b are not compatible.") - (nlower, nupper) = l_and_u if nlower + nupper + 1 != a1.shape[0]: raise ValueError("invalid values for the number of lower and upper " "diagonals: l+u+1 (%d) does not equal ab.shape[0] " @@ -642,6 +650,7 @@ def solve_banded(l_and_u, ab, b, overwrite_ab=False, overwrite_b=False, 'gbsv/gtsv' % -info) +@_apply_over_batch(('a', 2), ('b', '1|2')) def solveh_banded(ab, b, overwrite_ab=False, overwrite_b=False, lower=False, check_finite=True): """ @@ -783,21 +792,25 @@ def solveh_banded(ab, b, overwrite_ab=False, overwrite_b=False, lower=False, def solve_toeplitz(c_or_cr, b, check_finite=True): - """Solve a Toeplitz system using Levinson Recursion + r"""Solve a Toeplitz system using Levinson Recursion The Toeplitz matrix has constant diagonals, with c as its first column and r as its first row. If r is not given, ``r == conjugate(c)`` is assumed. + .. warning:: + + Beginning in SciPy 1.17, multidimensional input will be treated as a batch, + not ``ravel``\ ed. To preserve the existing behavior, ``ravel`` arguments + before passing them to `solve_toeplitz`. + Parameters ---------- c_or_cr : array_like or tuple of (array_like, array_like) - The vector ``c``, or a tuple of arrays (``c``, ``r``). Whatever the - actual shape of ``c``, it will be converted to a 1-D array. If not + The vector ``c``, or a tuple of arrays (``c``, ``r``). If not supplied, ``r = conjugate(c)`` is assumed; in this case, if c[0] is real, the Toeplitz matrix is Hermitian. r[0] is ignored; the first row - of the Toeplitz matrix is ``[c[0], r[1:]]``. Whatever the actual shape - of ``r``, it will be converted to a 1-D array. + of the Toeplitz matrix is ``[c[0], r[1:]]``. b : (M,) or (M, K) array_like Right-hand side in ``T x = b``. check_finite : bool, optional @@ -853,9 +866,15 @@ def solve_toeplitz(c_or_cr, b, check_finite=True): # If numerical stability of this algorithm is a problem, a future # developer might consider implementing other O(N^2) Toeplitz solvers, # such as GKO (https://www.jstor.org/stable/2153371) or Bareiss. 
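# Hedged sketch of the batching added via `_apply_over_batch(('a', 2),
# ('b', '1|2'))` above: leading dimensions of `a` are treated as a batch, and
# the '1|2' spec lets `b` be a single vector that broadcasts across the batch.
# Values are illustrative.
import numpy as np
from scipy.linalg import solve

A = np.stack([np.eye(2), 2 * np.eye(2)])  # batch of two (2, 2) systems
b = np.asarray([1.0, 1.0])                # one shared right-hand side vector
x = solve(A, b)                           # shape (2, 2): one solution per slice
assert np.allclose(x, [[1.0, 1.0], [0.5, 0.5]])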
+ c, r = c_or_cr if isinstance(c_or_cr, tuple) else (c_or_cr, np.conjugate(c_or_cr)) + return _solve_toeplitz(c, r, b, check_finite) + +# Can uncomment when `solve_toeplitz` deprecation is done (SciPy 1.17) +# @_apply_over_batch(('c', 1), ('r', 1), ('b', '1|2')) +def _solve_toeplitz(c, r, b, check_finite): r, c, b, dtype, b_shape = _validate_args_for_toeplitz_ops( - c_or_cr, b, check_finite, keep_b_shape=True) + (c, r), b, check_finite, keep_b_shape=True) # accommodate empty arrays if b.size == 0: @@ -1295,6 +1314,7 @@ def det(a, overwrite_a=False, check_finite=True): # Linear Least Squares +@_apply_over_batch(('a', 2), ('b', '1|2')) def lstsq(a, b, cond=None, overwrite_a=False, overwrite_b=False, check_finite=True, lapack_driver=None): """ @@ -1919,12 +1939,21 @@ def _validate_args_for_toeplitz_ops(c_or_cr, b, check_finite, keep_b_shape, if isinstance(c_or_cr, tuple): c, r = c_or_cr - c = _asarray_validated(c, check_finite=check_finite).ravel() - r = _asarray_validated(r, check_finite=check_finite).ravel() + c = _asarray_validated(c, check_finite=check_finite) + r = _asarray_validated(r, check_finite=check_finite) else: - c = _asarray_validated(c_or_cr, check_finite=check_finite).ravel() + c = _asarray_validated(c_or_cr, check_finite=check_finite) r = c.conjugate() + if c.ndim > 1 or r.ndim > 1: + msg = ("Beginning in SciPy 1.17, multidimensional input will be treated as a " + "batch, not `ravel`ed. To preserve the existing behavior and silence " + "this warning, `ravel` arguments before passing them to " + "`toeplitz`, `matmul_toeplitz`, and `solve_toeplitz`.") + warnings.warn(msg, FutureWarning, stacklevel=2) + c = c.ravel() + r = r.ravel() + if b is None: raise ValueError('`b` must be an array, not None.') @@ -1948,7 +1977,7 @@ def _validate_args_for_toeplitz_ops(c_or_cr, b, check_finite, keep_b_shape, def matmul_toeplitz(c_or_cr, x, check_finite=False, workers=None): - """Efficient Toeplitz Matrix-Matrix Multiplication using FFT + r"""Efficient Toeplitz Matrix-Matrix Multiplication using FFT This function returns the matrix multiplication between a Toeplitz matrix and a dense matrix. @@ -1957,15 +1986,19 @@ def matmul_toeplitz(c_or_cr, x, check_finite=False, workers=None): and r as its first row. If r is not given, ``r == conjugate(c)`` is assumed. + .. warning:: + + Beginning in SciPy 1.17, multidimensional input will be treated as a batch, + not ``ravel``\ ed. To preserve the existing behavior, ``ravel`` arguments + before passing them to `matmul_toeplitz`. + Parameters ---------- c_or_cr : array_like or tuple of (array_like, array_like) - The vector ``c``, or a tuple of arrays (``c``, ``r``). Whatever the - actual shape of ``c``, it will be converted to a 1-D array. If not + The vector ``c``, or a tuple of arrays (``c``, ``r``). If not supplied, ``r = conjugate(c)`` is assumed; in this case, if c[0] is real, the Toeplitz matrix is Hermitian. r[0] is ignored; the first row - of the Toeplitz matrix is ``[c[0], r[1:]]``. Whatever the actual shape - of ``r``, it will be converted to a 1-D array. + of the Toeplitz matrix is ``[c[0], r[1:]]``. x : (M,) or (M, K) array_like Matrix with which to multiply. 
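# Hedged sketch of the FutureWarning introduced above: passing multidimensional
# `c` (or `r`) to the Toeplitz routines now warns that SciPy 1.17 will treat
# the extra dimensions as a batch; ravel explicitly to keep the old behavior.
# Values are illustrative.
import warnings
import numpy as np
from scipy.linalg import solve_toeplitz

c = np.array([[2.0, 1.0], [0.5, 0.2]])   # 2-D first "column"
b = np.ones(4)
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter('always')
    x = solve_toeplitz(c, b)             # warns, then ravels `c` as before
assert any(issubclass(wi.category, FutureWarning) for wi in w)
assert np.allclose(x, solve_toeplitz(c.ravel(), b))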
check_finite : bool, optional diff --git a/scipy/linalg/_decomp_cholesky.py b/scipy/linalg/_decomp_cholesky.py index bd7e3824e392..31b3bf3d2c59 100644 --- a/scipy/linalg/_decomp_cholesky.py +++ b/scipy/linalg/_decomp_cholesky.py @@ -218,7 +218,12 @@ def cho_solve(c_and_lower, b, overwrite_b=False, check_finite=True): True """ - (c, lower) = c_and_lower + c, lower = c_and_lower + return _cho_solve(c, b, lower, overwrite_b=overwrite_b, check_finite=check_finite) + + +@_apply_over_batch(('c', 2), ('b', '1|2')) +def _cho_solve(c, b, lower, overwrite_b, check_finite): if check_finite: b1 = asarray_chkfinite(b) c = asarray_chkfinite(c) @@ -375,6 +380,12 @@ def cho_solve_banded(cb_and_lower, b, overwrite_b=False, check_finite=True): """ (cb, lower) = cb_and_lower + return _cho_solve_banded(cb, b, lower, overwrite_b=overwrite_b, + check_finite=check_finite) + + +@_apply_over_batch(('cb', 2), ('b', '1|2')) +def _cho_solve_banded(cb, b, lower, overwrite_b, check_finite): if check_finite: cb = asarray_chkfinite(cb) b = asarray_chkfinite(b) diff --git a/scipy/linalg/_decomp_cossin.py b/scipy/linalg/_decomp_cossin.py index e10c04fe5ebc..4cc413b6c19a 100644 --- a/scipy/linalg/_decomp_cossin.py +++ b/scipy/linalg/_decomp_cossin.py @@ -1,7 +1,7 @@ from collections.abc import Iterable import numpy as np -from scipy._lib._util import _asarray_validated +from scipy._lib._util import _asarray_validated, _apply_over_batch from scipy.linalg import block_diag, LinAlgError from .lapack import _compute_lwork, get_lapack_funcs @@ -80,6 +80,13 @@ def cossin(X, p=None, q=None, separate=False, (``m-q`` x ``m-q``) orthogonal/unitary matrices. If ``separate=True``, this contains the tuple of ``(V1H, V2H)``. + Notes + ----- + The documentation is written assuming array arguments are of specified + "core" shapes. However, array argument(s) of this function may have additional + "batch" dimensions prepended to the core shape. In this case, the array is treated + as a batch of lower-dimensional slices; see :ref:`linalg_batch` for details. + References ---------- .. [1] Brian D. Sutton. Computing the complete CS decomposition. Numer. @@ -111,16 +118,17 @@ def cossin(X, p=None, q=None, separate=False, p = 1 if p is None else int(p) q = 1 if q is None else int(q) X = _asarray_validated(X, check_finite=True) - if not np.equal(*X.shape): + if not np.equal(*X.shape[-2:]): raise ValueError("Cosine Sine decomposition only supports square" - f" matrices, got {X.shape}") - m = X.shape[0] + f" matrices, got {X.shape[-2:]}") + m = X.shape[-2] if p >= m or p <= 0: raise ValueError(f"invalid p={p}, 0<p<{m} must hold") if q >= m or q <= 0: raise ValueError(f"invalid q={q}, 0<q<{m} must hold") diff --git a/scipy/linalg/_sketches.py b/scipy/linalg/_sketches.py --- a/scipy/linalg/_sketches.py +++ b/scipy/linalg/_sketches.py @@ def clarkson_woodruff_transform(input_matrix, sketch_size, rng=None): + if sparse.issparse(input_matrix) and input_matrix.ndim > 2: + message = "Batch support for sparse arrays is not available." + raise NotImplementedError(message) + + S = cwt_matrix(sketch_size, input_matrix.shape[-2], rng=rng) + # Despite argument order (required by decorator), this is S @ input_matrix + # Can avoid _batch_dot when gh-22153 is resolved.
+ return S @ input_matrix if input_matrix.ndim <= 2 else _batch_dot(input_matrix, S) + + +@_apply_over_batch(('input_matrix', 2)) +def _batch_dot(input_matrix, S): + return S @ input_matrix diff --git a/scipy/linalg/_special_matrices.py b/scipy/linalg/_special_matrices.py index 512991fffadf..5ba669ec66cf 100644 --- a/scipy/linalg/_special_matrices.py +++ b/scipy/linalg/_special_matrices.py @@ -3,6 +3,7 @@ import numpy as np from numpy.lib.stride_tricks import as_strided +from scipy._lib._util import _apply_over_batch __all__ = ['toeplitz', 'circulant', 'hankel', @@ -27,19 +28,17 @@ def toeplitz(c, r=None): Parameters ---------- c : array_like - First column of the matrix. Whatever the actual shape of `c`, it - will be converted to a 1-D array. + First column of the matrix. r : array_like, optional First row of the matrix. If None, ``r = conjugate(c)`` is assumed; in this case, if c[0] is real, the result is a Hermitian matrix. r[0] is ignored; the first row of the returned matrix is - ``[c[0], r[1:]]``. Whatever the actual shape of `r`, it will be - converted to a 1-D array. + ``[c[0], r[1:]]``. .. warning:: Beginning in SciPy 1.17, multidimensional input will be treated as a batch, - not ``ravel``\ ed. To preserve the existing behavior, ``ravel`` aruments + not ``ravel``\ ed. To preserve the existing behavior, ``ravel`` arguments before passing them to `toeplitz`. Returns @@ -81,7 +80,7 @@ def toeplitz(c, r=None): if c.ndim > 1 or r.ndim > 1: msg = ("Beginning in SciPy 1.17, multidimensional input will be treated as a " "batch, not `ravel`ed. To preserve the existing behavior and silence " - "this warning, `ravel` aruments before passing them to `toeplitz`.") + "this warning, `ravel` arguments before passing them to `toeplitz`.") warnings.warn(msg, FutureWarning, stacklevel=2) c, r = c.ravel(), r.ravel() @@ -271,6 +270,7 @@ def hadamard(n, dtype=int): return H +@_apply_over_batch(("f", 1), ("s", 1)) def leslie(f, s): """ Create a Leslie matrix. @@ -281,24 +281,22 @@ def leslie(f, s): Parameters ---------- - f : (..., N,) array_like + f : (N,) array_like The "fecundity" coefficients. - s : (..., N-1,) array_like - The "survival" coefficients. The length of each slice of `s` (along the last - axis) must be one less than the length of `f`, and it must be at least 1. + s : (N-1,) array_like + The "survival" coefficients. The length of `s` must be one less + than the length of `f`, and it must be at least 1. Returns ------- - L : (..., N, N) ndarray + L : (N, N) ndarray The array is zero except for the first row, which is `f`, and the first sub-diagonal, which is `s`. - For 1-D input, the data-type of the array will be the data-type of + The data-type of the array will be the data-type of ``f[0]+s[0]``. Notes ----- - .. versionadded:: 0.8.0 - The Leslie matrix is used to model discrete-time, age-structured population growth [1]_ [2]_. In a population with `n` age classes, two sets of parameters define a Leslie matrix: the `n` "fecundity coefficients", @@ -306,11 +304,6 @@ def leslie(f, s): class, and the `n` - 1 "survival coefficients", which give the per-capita survival rate of each age class. - N-dimensional input are treated as a batches of coefficient arrays: each - slice along the last axis of the input arrays is a 1-D coefficient array, - and each slice along the last two dimensions of the output is the - corresponding Leslie matrix. - References ---------- .. [1] P. H. 
Leslie, On the use of matrices in certain population @@ -339,12 +332,6 @@ def leslie(f, s): raise ValueError("The length of s must be at least 1.") n = f.shape[-1] - - if f.ndim > 1 or s.ndim > 1: - from scipy.stats._resampling import _vectorize_statistic - _leslie_nd = _vectorize_statistic(leslie) - return np.moveaxis(_leslie_nd(f, s, axis=-1), [0, 1], [-2, -1]) - tmp = f[0] + s[0] a = np.zeros((n, n), dtype=tmp.dtype) a[0] = f @@ -408,9 +395,9 @@ def kron(a, b): def block_diag(*arrs): """ - Create a block diagonal matrix from provided arrays. + Create a block diagonal array from provided arrays. - Given the inputs `A`, `B` and `C`, the output will have these + For example, given 2-D inputs `A`, `B` and `C`, the output will have these arrays arranged on the diagonal:: [[A, 0, 0], @@ -419,15 +406,17 @@ def block_diag(*arrs): Parameters ---------- - A, B, C, ... : array_like, up to 2-D - Input arrays. A 1-D array or array_like sequence of length `n` is - treated as a 2-D array with shape ``(1,n)``. + A, B, C, ... : array_like + Input arrays. A 1-D array or array_like sequence of length ``n`` is + treated as a 2-D array with shape ``(1, n)``. Any dimensions before + the last two are treated as batch dimensions; see :ref:`linalg_batch`. Returns ------- D : ndarray - Array with `A`, `B`, `C`, ... on the diagonal. `D` has the - same dtype as `A`. + Array with `A`, `B`, `C`, ... on the diagonal of the last two + dimensions. `D` has the same dtype as the result type of the + inputs. Notes ----- @@ -435,7 +424,8 @@ def block_diag(*arrs): block diagonal matrix. Empty sequences (i.e., array-likes of zero size) will not be ignored. - Noteworthy, both [] and [[]] are treated as matrices with shape ``(1,0)``. + Noteworthy, both ``[]`` and ``[[]]`` are treated as matrices with shape + ``(1,0)``. Examples -------- @@ -472,18 +462,16 @@ def block_diag(*arrs): arrs = ([],) arrs = [np.atleast_2d(a) for a in arrs] - bad_args = [k for k in range(len(arrs)) if arrs[k].ndim > 2] - if bad_args: - raise ValueError("arguments in the following positions " - f"have dimension greater than 2: {bad_args}") - - shapes = np.array([a.shape for a in arrs]) + batch_shapes = [a.shape[:-2] for a in arrs] + batch_shape = np.broadcast_shapes(*batch_shapes) + arrs = [np.broadcast_to(a, batch_shape + a.shape[-2:]) for a in arrs] out_dtype = np.result_type(*[arr.dtype for arr in arrs]) - out = np.zeros(np.sum(shapes, axis=0), dtype=out_dtype) + block_shapes = np.array([a.shape[-2:] for a in arrs]) + out = np.zeros(batch_shape + tuple(np.sum(block_shapes, axis=0)), dtype=out_dtype) r, c = 0, 0 - for i, (rr, cc) in enumerate(shapes): - out[r:r + rr, c:c + cc] = arrs[i] + for i, (rr, cc) in enumerate(block_shapes): + out[..., r:r + rr, c:c + cc] = arrs[i] r += rr c += cc return out diff --git a/scipy/linalg/flapack_gen_banded.pyf.src b/scipy/linalg/flapack_gen_banded.pyf.src index e09a0edd9820..bd273195cb3e 100644 --- a/scipy/linalg/flapack_gen_banded.pyf.src +++ b/scipy/linalg/flapack_gen_banded.pyf.src @@ -77,3 +77,56 @@ subroutine <prefix>gbtrs(ab,kl,ku,b,ipiv,trans,n,nrhs,ldab,ldb,info) ! in :Band: integer intent(out):: info end subroutine <prefix>gbtrs + + +subroutine <prefix2>gbcon(norm,n,kl,ku,ab,ldab,ipiv,anorm,rcond,work,iwork,info) + ! ?GBCON estimates the reciprocal of the condition number of a real + ! general band matrix A, in either the 1-norm or the infinity-norm, + ! using the LU factorization computed by ?GBTRF. + ! An estimate is obtained for norm(inv(A)), and the reciprocal of the + ! condition number is computed as + !
RCOND = 1 / ( norm(A) * norm(inv(A)) ). + threadsafe + callstatement (*f2py_func)(norm,&n,&kl,&ku,ab,&ldab,ipiv,&anorm,&rcond,work,iwork,&info) + callprotoargument char*, F_INT*, F_INT*, F_INT*, <ctype2>*, F_INT*, F_INT*, <ctype2>*, <ctype2>*, <ctype2>*, F_INT*, F_INT* + + character optional, intent(in) :: norm = '1' + integer depend(ab),intent(hide) :: n=shape(ab,1) + integer intent(in),check(kl >= 0) :: kl + integer intent(in),check(ku >= 0) :: ku + <ftype2> dimension(ldab,n),intent(in) :: ab + integer optional,intent(in),check(ldab >= (2*kl+ku+1)),depend(ab,kl,ku) :: ldab=2*kl+ku+1 + integer intent(in), depend(n), dimension(n) :: ipiv + <ftype2> intent(in) :: anorm + <ftype2> intent(out) :: rcond + <ftype2> intent(hide, cache), dimension(3*n), depend(n) :: work + integer intent(hide, cache), dimension(n), depend(n) :: iwork + integer intent(out) :: info +end subroutine <prefix2>gbcon + + +subroutine <prefix2c>gbcon(norm,n,kl,ku,ab,ldab,ipiv,anorm,rcond,work,rwork,info) + ! ?GBCON estimates the reciprocal of the condition number of a complex + ! general band matrix A, in either the 1-norm or the infinity-norm, + ! using the LU factorization computed by ?GBTRF. + ! An estimate is obtained for norm(inv(A)), and the reciprocal of the + ! condition number is computed as + ! RCOND = 1 / ( norm(A) * norm(inv(A)) ). + threadsafe + callstatement (*f2py_func)(norm,&n,&kl,&ku,ab,&ldab,ipiv,&anorm,&rcond,work,rwork,&info) + callprotoargument char*, F_INT*, F_INT*, F_INT*, <ctype2c>*, F_INT*, F_INT*, <ctype2>*, <ctype2>*, <ctype2c>*, <ctype2>*, F_INT* + + character optional, intent(in) :: norm = '1' + integer depend(ab),intent(hide) :: n=shape(ab,1) + integer intent(in),check(kl>=0) :: kl + integer intent(in),check(ku>=0) :: ku + <ftype2c> dimension(ldab,n),intent(in) :: ab + integer optional,intent(in),check(ldab >= (2*kl+ku+1)),depend(ab,kl,ku) :: ldab=2*kl+ku+1 + + integer intent(in), depend(n), dimension(n) :: ipiv + <ftype2> intent(in) :: anorm + <ftype2> intent(out) :: rcond + <ftype2c> intent(hide, cache), dimension(2*n), depend(n) :: work + <ftype2> intent(hide, cache), dimension(n), depend(n) :: rwork + integer intent(out) :: info +end subroutine <prefix2c>gbcon diff --git a/scipy/linalg/lapack.py b/scipy/linalg/lapack.py index 2d15cf4d72d1..ffce33ffd105 100644 --- a/scipy/linalg/lapack.py +++ b/scipy/linalg/lapack.py @@ -44,6 +44,11 @@ ..
autosummary:: :toctree: generated/ + sgbcon + dgbcon + cgbcon + zgbcon + sgbsv dgbsv cgbsv diff --git a/scipy/linalg/tests/test_basic.py b/scipy/linalg/tests/test_basic.py index fe51dcc21824..782fd0865046 100644 --- a/scipy/linalg/tests/test_basic.py +++ b/scipy/linalg/tests/test_basic.py @@ -791,7 +791,7 @@ def test_ill_condition_warning(self, structure): def test_multiple_rhs(self): a = np.eye(2) - b = np.random.rand(2, 3, 4) + b = np.random.rand(2, 12) x = solve(a, b) assert_array_almost_equal(x, b) @@ -1894,7 +1894,7 @@ def test_basic3(self): c = np.array([1, 2, -3, -5]) b = np.arange(24).reshape(4, 3, 2) x = solve_circulant(c, b) - y = solve(circulant(c), b) + y = solve(circulant(c), b.reshape(4, -1)).reshape(b.shape) assert_allclose(x, y) def test_complex(self): diff --git a/scipy/linalg/tests/test_batch.py b/scipy/linalg/tests/test_batch.py index 227185d24ffd..772cd77c867a 100644 --- a/scipy/linalg/tests/test_batch.py +++ b/scipy/linalg/tests/test_batch.py @@ -2,7 +2,7 @@ import pytest import numpy as np from numpy.testing import assert_allclose -from scipy import linalg +from scipy import linalg, sparse real_floating = [np.float32, np.float64] @@ -30,7 +30,7 @@ class TestBatch: # Test batch support for most linalg functions def batch_test(self, fun, arrays, *, core_dim=2, n_out=1, kwargs=None, dtype=None, - broadcast=True): + broadcast=True, check_kwargs=True): # Check that all outputs of batched call `fun(A, **kwargs)` are the same # as if we loop over the separate vectors/matrices in `A`. Also check # that `fun` accepts `A` by position or keyword and that results are @@ -45,10 +45,11 @@ def batch_test(self, fun, arrays, *, core_dim=2, n_out=1, kwargs=None, dtype=Non arrays = (arrays,) if not isinstance(arrays, tuple) else arrays # Identical results when passing argument by keyword or position - res1 = fun(**dict(zip(parameters, arrays)), **kwargs) res2 = fun(*arrays, **kwargs) - for out1, out2 in zip(res1, res2): # even a single array output is iterable... - np.testing.assert_equal(out1, out2) + if check_kwargs: + res1 = fun(**dict(zip(parameters, arrays)), **kwargs) + for out1, out2 in zip(res1, res2): # even a single array is iterable... 
+ np.testing.assert_equal(out1, out2) # Check results vs looping over res = (res2,) if n_out == 1 else res2 @@ -71,7 +72,7 @@ def batch_test(self, fun, arrays, *, core_dim=2, n_out=1, kwargs=None, dtype=Non out_dtype = ref[k].dtype if dtype is None else dtype assert res[k].dtype == out_dtype - return res1 # return original, non-tuplized result + return res2 # return original, non-tuplized result @pytest.fixture def rng(self): @@ -184,6 +185,65 @@ def test_polar_qr_rq(self, fun, dtype, rng): A = get_random((5, 3, 2, 4), dtype=dtype, rng=rng) self.batch_test(fun, A, n_out=2) + @pytest.mark.parametrize('cdim', [(5,), (5, 4), (2, 3, 5, 4)]) + @pytest.mark.parametrize('dtype', floating) + def test_qr_multiply(self, cdim, dtype, rng): + A = get_random((2, 3, 5, 5), dtype=dtype, rng=rng) + c = get_random(cdim, dtype=dtype, rng=rng) + res = linalg.qr_multiply(A, c, mode='left') + q, r = linalg.qr(A) + ref = q @ c + atol = 1e-6 if dtype in {np.float32, np.complex64} else 1e-12 + assert_allclose(res[0], ref, atol=atol) + assert_allclose(res[1], r, atol=atol) + + @pytest.mark.parametrize('uvdim', [[(5,), (3,)], [(4, 5, 2), (4, 3, 2)]]) + @pytest.mark.parametrize('dtype', floating) + def test_qr_update(self, uvdim, dtype, rng): + udim, vdim = uvdim + A = get_random((4, 5, 3), dtype=dtype, rng=rng) + u = get_random(udim, dtype=dtype, rng=rng) + v = get_random(vdim, dtype=dtype, rng=rng) + q, r = linalg.qr(A) + res = linalg.qr_update(q, r, u, v) + for i in range(4): + qi, ri = q[i], r[i] + ui, vi = (u, v) if u.ndim == 1 else (u[i], v[i]) + ref_i = linalg.qr_update(qi, ri, ui, vi) + assert_allclose(res[0][i], ref_i[0]) + assert_allclose(res[1][i], ref_i[1]) + + @pytest.mark.parametrize('udim', [(5,), (4, 3, 5)]) + @pytest.mark.parametrize('kdim', [(), (4,)]) + @pytest.mark.parametrize('dtype', floating) + def test_qr_insert(self, udim, kdim, dtype, rng): + A = get_random((4, 5, 5), dtype=dtype, rng=rng) + u = get_random(udim, dtype=dtype, rng=rng) + k = rng.integers(0, 5, size=kdim) + q, r = linalg.qr(A) + res = linalg.qr_insert(q, r, u, k) + for i in range(4): + qi, ri = q[i], r[i] + ki = k if k.ndim == 0 else k[i] + ui = u if u.ndim == 1 else u[i] + ref_i = linalg.qr_insert(qi, ri, ui, ki) + assert_allclose(res[0][i], ref_i[0]) + assert_allclose(res[1][i], ref_i[1]) + + @pytest.mark.parametrize('kdim', [(), (4,)]) + @pytest.mark.parametrize('dtype', floating) + def test_qr_delete(self, kdim, dtype, rng): + A = get_random((4, 5, 5), dtype=dtype, rng=rng) + k = rng.integers(0, 4, size=kdim) + q, r = linalg.qr(A) + res = linalg.qr_delete(q, r, k) + for i in range(4): + qi, ri = q[i], r[i] + ki = k if k.ndim == 0 else k[i] + ref_i = linalg.qr_delete(qi, ri, ki) + assert_allclose(res[0][i], ref_i[0]) + assert_allclose(res[1][i], ref_i[1]) + @pytest.mark.parametrize('fun', [linalg.schur, linalg.lu_factor]) @pytest.mark.parametrize('dtype', floating) def test_schur_lu(self, fun, dtype, rng): @@ -262,6 +322,23 @@ def test_two_generic_matrix_inputs(self, fun_n_out, dtype, rng): B = get_random((2, 3, 4, 4), dtype=dtype, rng=rng) self.batch_test(fun, (A, B), n_out=n_out) + @pytest.mark.parametrize('dtype', floating) + def test_cossin(self, dtype, rng): + p, q = 3, 4 + X = get_random((2, 3, 10, 10), dtype=dtype, rng=rng) + x11, x12, x21, x22 = (X[..., :p, :q], X[..., :p, q:], + X[..., p:, :q], X[..., p:, q:]) + res = linalg.cossin(X, p, q) + ref = linalg.cossin((x11, x12, x21, x22)) + for res_i, ref_i in zip(res, ref): + np.testing.assert_equal(res_i, ref_i) + + for j in range(2): + for k in range(3): + ref_jk = 
linalg.cossin(X[j, k], p, q) + for res_i, ref_ijk in zip(res, ref_jk): + np.testing.assert_equal(res_i[j, k], ref_ijk) + @pytest.mark.parametrize('dtype', floating) def test_sylvester(self, dtype, rng): A = get_random((2, 3, 5, 5), dtype=dtype, rng=rng) @@ -296,6 +373,24 @@ def test_cholesky_banded(self, dtype, rng): ab[..., -1, :] = 10 # make diagonal dominant self.batch_test(linalg.cholesky_banded, ab) + @pytest.mark.parametrize('dtype', floating) + def test_block_diag(self, dtype, rng): + a = get_random((1, 3, 1, 3), dtype=dtype, rng=rng) + b = get_random((2, 1, 3, 6), dtype=dtype, rng=rng) + c = get_random((1, 1, 3, 2), dtype=dtype, rng=rng) + + # batch_test doesn't have the logic to broadcast just the batch shapes, + # so do it manually. + a2 = np.broadcast_to(a, (2, 3, 1, 3)) + b2 = np.broadcast_to(b, (2, 3, 3, 6)) + c2 = np.broadcast_to(c, (2, 3, 3, 2)) + ref = self.batch_test(linalg.block_diag, (a2, b2, c2), + check_kwargs=False, broadcast=False) + + # Check that `block_diag` broadcasts the batch shapes as expected. + res = linalg.block_diag(a, b, c) + assert_allclose(res, ref) + @pytest.mark.parametrize('fun_n_out', [(linalg.eigh_tridiagonal, 2), (linalg.eigvalsh_tridiagonal, 1)]) @pytest.mark.parametrize('dtype', real_floating) @@ -305,3 +400,144 @@ def test_eigh_tridiagonal(self, fun_n_out, dtype, rng): d = get_random((3, 4, 5), dtype=dtype, rng=rng) e = get_random((3, 4, 4), dtype=dtype, rng=rng) self.batch_test(fun, (d, e), core_dim=1, n_out=n_out, broadcast=False) + + @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)]) + @pytest.mark.parametrize('dtype', floating) + def test_solve(self, bdim, dtype, rng): + A = get_random((2, 3, 5, 5), dtype=dtype, rng=rng) + b = get_random(bdim, dtype=dtype, rng=rng) + x = linalg.solve(A, b) + if len(bdim) == 1: + x = x[..., np.newaxis] + b = b[..., np.newaxis] + assert_allclose(A @ x - b, 0, atol=1e-6) + assert_allclose(x, np.linalg.solve(A, b), atol=2e-6) + + @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)]) + @pytest.mark.parametrize('dtype', floating) + def test_lu_solve(self, bdim, dtype, rng): + A = get_random((2, 3, 5, 5), dtype=dtype, rng=rng) + b = get_random(bdim, dtype=dtype, rng=rng) + lu_and_piv = linalg.lu_factor(A) + x = linalg.lu_solve(lu_and_piv, b) + if len(bdim) == 1: + x = x[..., np.newaxis] + b = b[..., np.newaxis] + assert_allclose(A @ x - b, 0, atol=1e-6) + assert_allclose(x, np.linalg.solve(A, b), atol=2e-6) + + @pytest.mark.parametrize('l_and_u', [(1, 1), ([2, 1, 0], [0, 1 , 2])]) + @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)]) + @pytest.mark.parametrize('dtype', floating) + def test_solve_banded(self, l_and_u, bdim, dtype, rng): + l, u = l_and_u + ab = get_random((2, 3, 3, 5), dtype=dtype, rng=rng) + b = get_random(bdim, dtype=dtype, rng=rng) + x = linalg.solve_banded((l, u), ab, b) + for i in range(2): + for j in range(3): + bij = b if len(bdim) <= 2 else b[i, j] + lj = l if np.ndim(l) == 0 else l[j] + uj = u if np.ndim(u) == 0 else u[j] + xij = linalg.solve_banded((lj, uj), ab[i, j], bij) + assert_allclose(x[i, j], xij) + + # Can uncomment when `solve_toeplitz` deprecation is done (SciPy 1.17) + # @pytest.mark.parametrize('separate_r', [False, True]) + # @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)]) + # @pytest.mark.parametrize('dtype', floating) + # def test_solve_toeplitz(self, separate_r, bdim, dtype, rng): + # c = get_random((2, 3, 5), dtype=dtype, rng=rng) + # r = get_random((2, 3, 5), dtype=dtype, rng=rng) + # c_or_cr = (c, r) if separate_r else c + # 
b = get_random(bdim, dtype=dtype, rng=rng) + # x = linalg.solve_toeplitz(c_or_cr, b) + # for i in range(2): + # for j in range(3): + # bij = b if len(bdim) <= 2 else b[i, j] + # c_or_cr_ij = (c[i, j], r[i, j]) if separate_r else c[i, j] + # xij = linalg.solve_toeplitz(c_or_cr_ij, bij) + # assert_allclose(x[i, j], xij) + + @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)]) + @pytest.mark.parametrize('dtype', floating) + def test_cho_solve(self, bdim, dtype, rng): + A = get_nearly_hermitian((2, 3, 5, 5), dtype=dtype, atol=0, rng=rng) + A = A + 5*np.eye(5) + c_and_lower = linalg.cho_factor(A) + b = get_random(bdim, dtype=dtype, rng=rng) + x = linalg.cho_solve(c_and_lower, b) + if len(bdim) == 1: + x = x[..., np.newaxis] + b = b[..., np.newaxis] + assert_allclose(A @ x - b, 0, atol=1e-6) + assert_allclose(x, np.linalg.solve(A, b), atol=2e-6) + + @pytest.mark.parametrize('lower', [False, True]) + @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)]) + @pytest.mark.parametrize('dtype', floating) + def test_cho_solve_banded(self, lower, bdim, dtype, rng): + A = get_random((2, 3, 3, 5), dtype=dtype, rng=rng) + row_diag = 0 if lower else -1 + A[:, :, row_diag] = 10 + cb = linalg.cholesky_banded(A, lower=lower) + b = get_random(bdim, dtype=dtype, rng=rng) + x = linalg.cho_solve_banded((cb, lower), b) + for i in range(2): + for j in range(3): + bij = b if len(bdim) <= 2 else b[i, j] + xij = linalg.cho_solve_banded((cb[i, j], lower), bij) + assert_allclose(x[i, j], xij) + + @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)]) + @pytest.mark.parametrize('dtype', floating) + def test_solveh_banded(self, bdim, dtype, rng): + A = get_random((2, 3, 3, 5), dtype=dtype, rng=rng) + A[:, :, -1] = 10 + b = get_random(bdim, dtype=dtype, rng=rng) + x = linalg.solveh_banded(A, b) + for i in range(2): + for j in range(3): + bij = b if len(bdim) <= 2 else b[i, j] + xij = linalg.solveh_banded(A[i, j], bij) + assert_allclose(x[i, j], xij) + + @pytest.mark.parametrize('bdim', [(5,), (5, 4), (2, 3, 5, 4)]) + @pytest.mark.parametrize('dtype', floating) + def test_solve_triangular(self, bdim, dtype, rng): + A = get_random((2, 3, 5, 5), dtype=dtype, rng=rng) + A = np.tril(A) + b = get_random(bdim, dtype=dtype, rng=rng) + x = linalg.solve_triangular(A, b, lower=True) + if len(bdim) == 1: + x = x[..., np.newaxis] + b = b[..., np.newaxis] + atol = 1e-10 if dtype in (np.complex128, np.float64) else 2e-4 + assert_allclose(A @ x - b, 0, atol=atol) + assert_allclose(x, np.linalg.solve(A, b), atol=5*atol) + + @pytest.mark.parametrize('bdim', [(4,), (4, 3), (2, 3, 4, 3)]) + @pytest.mark.parametrize('dtype', floating) + def test_lstsq(self, bdim, dtype, rng): + A = get_random((2, 3, 4, 5), dtype=dtype, rng=rng) + b = get_random(bdim, dtype=dtype, rng=rng) + res = linalg.lstsq(A, b) + x = res[0] + if len(bdim) == 1: + x = x[..., np.newaxis] + b = b[..., np.newaxis] + assert_allclose(A @ x - b, 0, atol=2e-6) + assert len(res) == 4 + + @pytest.mark.parametrize('dtype', floating) + def test_clarkson_woodruff_transform(self, dtype, rng): + A = get_random((5, 3, 4, 6), dtype=dtype, rng=rng) + self.batch_test(linalg.clarkson_woodruff_transform, A, + kwargs=dict(sketch_size=3, rng=311224)) + + def test_clarkson_woodruff_transform_sparse(self, rng): + A = get_random((5, 3, 4, 6), dtype=np.float64, rng=rng) + A = sparse.coo_array(A) + message = "Batch support for sparse arrays is not available." 
+ with pytest.raises(NotImplementedError, match=message): + linalg.clarkson_woodruff_transform(A, sketch_size=3, rng=rng) diff --git a/scipy/linalg/tests/test_lapack.py b/scipy/linalg/tests/test_lapack.py index d8cde5bc0b9f..3e4294602f03 100644 --- a/scipy/linalg/tests/test_lapack.py +++ b/scipy/linalg/tests/test_lapack.py @@ -19,7 +19,7 @@ from scipy.linalg import (_flapack as flapack, lapack, inv, svd, cholesky, solve, ldl, norm, block_diag, qr, eigh, qz) - +from scipy.linalg._basic import _to_banded from scipy.linalg.lapack import _compute_lwork from scipy.stats import ortho_group, unitary_group @@ -3479,9 +3479,9 @@ def test_sy_hetrs(mtype, dtype, lower): names = f'{mtype}trf', f'{mtype}trf_lwork', f'{mtype}trs' trf, trf_lwork, trs = get_lapack_funcs(names, dtype=dtype) lwork = trf_lwork(n, lower=lower) - ldu, ipiv, info = trf(A, lwork=lwork) + ldu, ipiv, info = trf(A, lwork=lwork, lower=lower) assert info == 0 - x, info = trs(a=ldu, ipiv=ipiv, b=b) + x, info = trs(a=ldu, ipiv=ipiv, b=b, lower=lower) assert info == 0 eps = np.finfo(dtype).eps assert_allclose(A@x, b, atol=100*n*eps) @@ -3506,3 +3506,41 @@ def test_lantr(norm, uplo, m, n, diag, dtype): ref = lange(norm, A) assert_allclose(res, ref, rtol=2e-6) + + +@pytest.mark.parametrize('dtype', DTYPES) +@pytest.mark.parametrize('norm', ['1', 'I', 'O']) +def test_gbcon(dtype, norm): + rng = np.random.default_rng(17273783424) + + # A is of shape n x n with ku/kl super/sub-diagonals + n, ku, kl = 10, 2, 2 + A = rng.random((n, n)) + rng.random((n, n))*1j + # make the condition numbers more interesting + offset = rng.permuted(np.logspace(0, rng.integers(0, 10), n)) + A += offset + if np.issubdtype(dtype, np.floating): + A = A.real + A = A.astype(dtype) + A[np.triu_indices(n, ku + 1)] = 0 + A[np.tril_indices(n, -kl - 1)] = 0 + + # construct banded form + tmp = _to_banded(kl, ku, A) + # add rows required by ?gbtrf + LDAB = 2*kl + ku + 1 + ab = np.zeros((LDAB, n), dtype=dtype) + ab[kl:, :] = tmp + + anorm = np.linalg.norm(A, ord=np.inf if norm == 'I' else 1) + gbcon, gbtrf = get_lapack_funcs(("gbcon", "gbtrf"), (ab,)) + lu_band, ipiv, _ = gbtrf(ab, kl, ku) + res = gbcon(norm=norm, kl=kl, ku=ku, ab=lu_band, ipiv=ipiv, + anorm=anorm)[0] + + gecon, getrf = get_lapack_funcs(('gecon', 'getrf'), (A,)) + lu = getrf(A)[0] + ref = gecon(lu, anorm, norm=norm)[0] + # This is an estimate of reciprocal condition number; we just need order of + # magnitude. + assert_allclose(res, ref, rtol=1) diff --git a/scipy/linalg/tests/test_matmul_toeplitz.py b/scipy/linalg/tests/test_matmul_toeplitz.py index 6d663e7f3c64..22f8f94fd10a 100644 --- a/scipy/linalg/tests/test_matmul_toeplitz.py +++ b/scipy/linalg/tests/test_matmul_toeplitz.py @@ -125,11 +125,12 @@ def test_exceptions(self): # For toeplitz matrices, matmul_toeplitz() should be equivalent to @. 
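As a quick illustration of the equivalence this comment relies on (a minimal sketch, not part of the patch; sizes and seed are arbitrary): `matmul_toeplitz` computes the product via FFT without materializing the Toeplitz matrix, so it should agree with building `toeplitz(c, r)` explicitly and using `@`.

import numpy as np
from scipy.linalg import matmul_toeplitz, toeplitz

rng = np.random.default_rng(0)
c, r = rng.standard_normal(6), rng.standard_normal(4)
r[0] = c[0]  # both functions take the main diagonal from c[0]
x = rng.standard_normal((4, 3))

# FFT-based product vs. the explicitly formed Toeplitz matrix
np.testing.assert_allclose(matmul_toeplitz((c, r), x),
                           toeplitz(c, r) @ x, atol=1e-12)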
def do(self, x, c, r=None, check_finite=False, workers=None): + c = np.ravel(c) if r is None: actual = matmul_toeplitz(c, x, check_finite, workers) else: r = np.ravel(r) actual = matmul_toeplitz((c, r), x, check_finite) - desired = toeplitz(np.ravel(c), r) @ x + desired = toeplitz(c, r) @ x assert_allclose(actual, desired, rtol=self.tolerance, atol=self.tolerance) diff --git a/scipy/linalg/tests/test_solve_toeplitz.py b/scipy/linalg/tests/test_solve_toeplitz.py index c078235e927b..75358988f660 100644 --- a/scipy/linalg/tests/test_solve_toeplitz.py +++ b/scipy/linalg/tests/test_solve_toeplitz.py @@ -2,7 +2,7 @@ """ import numpy as np from scipy.linalg._solve_toeplitz import levinson -from scipy.linalg import solve, toeplitz, solve_toeplitz +from scipy.linalg import solve, toeplitz, solve_toeplitz, matmul_toeplitz from numpy.testing import assert_equal, assert_allclose import pytest @@ -39,7 +39,7 @@ def test_multiple_rhs(): c = random.randn(4) r = random.randn(4) for offset in [0, 1j]: - for yshape in ((4,), (4, 3), (4, 3, 2)): + for yshape in ((4,), (4, 3)): y = random.randn(*yshape) + offset actual = solve_toeplitz((c,r), b=y) desired = solve(toeplitz(c, r=r), y) @@ -134,3 +134,17 @@ def test_empty(dt_c, dt_b): x1 = solve_toeplitz(c, b) assert x1.shape == (0, 0) assert x1.dtype == x.dtype + + +@pytest.mark.parametrize('fun', [solve_toeplitz, matmul_toeplitz]) +def test_nd_FutureWarning(fun): + # Test future warnings with n-D `c`/`r` + rng = np.random.default_rng(283592436523456) + c = rng.random((2, 3, 4)) + r = rng.random((2, 3, 4)) + b_or_x = rng.random(24) + message = "Beginning in SciPy 1.17, multidimensional input will be..." + with pytest.warns(FutureWarning, match=message): + fun(c, b_or_x) + with pytest.warns(FutureWarning, match=message): + fun((c, r), b_or_x) diff --git a/scipy/linalg/tests/test_special_matrices.py b/scipy/linalg/tests/test_special_matrices.py index d32e7ed4b401..7c031c95f7b4 100644 --- a/scipy/linalg/tests/test_special_matrices.py +++ b/scipy/linalg/tests/test_special_matrices.py @@ -172,9 +172,6 @@ def test_scalar_and_1d_args(self): a = block_diag([2, 3], 4) assert_array_equal(a, [[2, 3, 0], [0, 0, 4]]) - def test_bad_arg(self): - assert_raises(ValueError, block_diag, [[[1]]]) - def test_no_args(self): a = block_diag() assert_equal(a.ndim, 2) diff --git a/scipy/ndimage/_morphology.py b/scipy/ndimage/_morphology.py index 12972c09a7cd..d06dc526c873 100644 --- a/scipy/ndimage/_morphology.py +++ b/scipy/ndimage/_morphology.py @@ -1803,6 +1803,7 @@ def morphological_laplace(input, size=None, footprint=None, structure=None, tmp2 = grey_erosion(input, size, footprint, structure, None, mode, cval, origin, axes=axes) np.add(tmp1, tmp2, tmp2) + input = np.asarray(input) np.subtract(tmp2, input, tmp2) np.subtract(tmp2, input, tmp2) return tmp2 diff --git a/scipy/ndimage/_support_alternative_backends.py b/scipy/ndimage/_support_alternative_backends.py index 38d3edb21c81..b863a16bfb85 100644 --- a/scipy/ndimage/_support_alternative_backends.py +++ b/scipy/ndimage/_support_alternative_backends.py @@ -46,7 +46,7 @@ def wrapper(*args, **kwds): # XXX: output arrays result = func(*args, **kwds) - if isinstance(result, (np.ndarray, np.generic)): + if isinstance(result, np.ndarray | np.generic): # XXX: np.int32->np.array_0D return xp.asarray(result) elif isinstance(result, int): diff --git a/scipy/ndimage/tests/test_filters.py b/scipy/ndimage/tests/test_filters.py index fec7e37e10b0..26c77e2b43bf 100644 --- a/scipy/ndimage/tests/test_filters.py +++ 
b/scipy/ndimage/tests/test_filters.py @@ -195,6 +195,7 @@ def test_correlate01(self, xp): @xfail_xp_backends('cupy', reason="Differs by a factor of two?") @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") def test_correlate01_overlap(self, xp): array = xp.reshape(xp.arange(256), (16, 16)) weights = xp.asarray([2]) @@ -537,6 +538,7 @@ def test_correlate22(self, dtype_array, dtype_output, xp): assert_array_almost_equal(output, expected) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype_array', types) @pytest.mark.parametrize('dtype_output', types) def test_correlate23(self, dtype_array, dtype_output, xp): @@ -556,6 +558,7 @@ def test_correlate23(self, dtype_array, dtype_output, xp): assert_array_almost_equal(output, expected) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype_array', types) @pytest.mark.parametrize('dtype_output', types) def test_correlate24(self, dtype_array, dtype_output, xp): @@ -576,6 +579,7 @@ def test_correlate24(self, dtype_array, dtype_output, xp): assert_array_almost_equal(output, tcov) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype_array', types) @pytest.mark.parametrize('dtype_output', types) def test_correlate25(self, dtype_array, dtype_output, xp): @@ -881,6 +885,7 @@ def test_gauss06(self, xp): assert_array_almost_equal(output1, output2) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") def test_gauss_memory_overlap(self, xp): input = xp.arange(100 * 100, dtype=xp.float32) input = xp.reshape(input, (100, 100)) @@ -1227,6 +1232,7 @@ def test_prewitt01(self, dtype, xp): assert_array_almost_equal(t, output) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype', types + complex_types) def test_prewitt02(self, dtype, xp): if is_torch(xp) and dtype in ("uint16", "uint32", "uint64"): @@ -1289,6 +1295,7 @@ def test_sobel01(self, dtype, xp): assert_array_almost_equal(t, output) @skip_xp_backends("jax.numpy", reason="output array is read-only.",) + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype', types + complex_types) def test_sobel02(self, dtype, xp): if is_torch(xp) and dtype in ("uint16", "uint32", "uint64"): @@ -1349,6 +1356,7 @@ def test_laplace01(self, dtype, xp): assert_array_almost_equal(tmp1 + tmp2, output) @skip_xp_backends("jax.numpy", reason="output array is read-only",) + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype', ["int32", "float32", "float64", "complex64", "complex128"]) @@ -1379,6 +1387,7 @@ def test_gaussian_laplace01(self, dtype, xp): assert_array_almost_equal(tmp1 + tmp2, output) @skip_xp_backends("jax.numpy", reason="output array is read-only") + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype', ["int32", "float32", "float64", "complex64", "complex128"]) @@ -1395,6 +1404,7 @@ def test_gaussian_laplace02(self, dtype, xp): 
assert_array_almost_equal(tmp1 + tmp2, output) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype', types + complex_types) def test_generic_laplace01(self, dtype, xp): if is_torch(xp) and dtype in ("uint16", "uint32", "uint64"): @@ -1420,6 +1430,7 @@ def derivative2(input, axis, output, mode, cval, a, b): assert_array_almost_equal(tmp, output) @skip_xp_backends("jax.numpy", reason="output array is read-only") + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype', ["int32", "float32", "float64", "complex64", "complex128"]) @@ -1441,6 +1452,7 @@ def test_gaussian_gradient_magnitude01(self, dtype, xp): xp_assert_close(output, expected, rtol=1e-6, atol=1e-6) @skip_xp_backends("jax.numpy", reason="output array is read-only") + @skip_xp_backends("dask.array", reason="output array is read-only.") @pytest.mark.parametrize('dtype', ["int32", "float32", "float64", "complex64", "complex128"]) @@ -1833,6 +1845,9 @@ def test_rank06(self, xp): @skip_xp_backends("jax.numpy", reason="assignment destination is read-only", ) + @skip_xp_backends("dask.array", + reason="wrong answer", + ) def test_rank06_overlap(self, xp): array = xp.asarray([[3, 2, 5, 1, 4], [5, 8, 3, 7, 1], @@ -2006,7 +2021,8 @@ def test_rank15(self, dtype, xp): origin=[-1, 0]) xp_assert_equal(expected, output) - def test_rank16(self, xp): + @skip_xp_backends(np_only=True, reason="test list input") + def test_rank16(self): # test that lists are accepted and interpreted as numpy arrays array = [3, 2, 5, 1, 4] # expected values are: median(3, 2, 5) = 3, median(2, 5, 1) = 2, etc @@ -2640,6 +2656,7 @@ def test_gaussian_radius_invalid(xp): @skip_xp_backends("jax.numpy", reason="output array is read-only") +@skip_xp_backends("dask.array", reason="output array is read-only.") class TestThreading: def check_func_thread(self, n, fun, args, out): from threading import Thread diff --git a/scipy/ndimage/tests/test_interpolation.py b/scipy/ndimage/tests/test_interpolation.py index 5d9f80db4707..b74721947a45 100644 --- a/scipy/ndimage/tests/test_interpolation.py +++ b/scipy/ndimage/tests/test_interpolation.py @@ -296,54 +296,58 @@ def mapping(x): assert_array_almost_equal(out, xp.asarray([1, 2, 3, 4], dtype=out.dtype)) def test_geometric_transform15(self, order, xp): - data = [1, 2, 3, 4] + data = xp.asarray([1, 2, 3, 4]) def mapping(x): return (x[0] / 2,) out = ndimage.geometric_transform(data, mapping, [8], order=order) - assert_array_almost_equal(out[::2], [1, 2, 3, 4]) + assert_array_almost_equal(out[::2], xp.asarray([1, 2, 3, 4])) def test_geometric_transform16(self, order, xp): data = [[1, 2, 3, 4], [5, 6, 7, 8], [9.0, 10, 11, 12]] + data = xp.asarray(data) def mapping(x): return (x[0], x[1] * 2) out = ndimage.geometric_transform(data, mapping, (3, 2), order=order) - assert_array_almost_equal(out, [[1, 3], [5, 7], [9, 11]]) + assert_array_almost_equal(out, xp.asarray([[1, 3], [5, 7], [9, 11]])) def test_geometric_transform17(self, order, xp): data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] + data = xp.asarray(data) def mapping(x): return (x[0] * 2, x[1]) out = ndimage.geometric_transform(data, mapping, (1, 4), order=order) - assert_array_almost_equal(out, [[1, 2, 3, 4]]) + assert_array_almost_equal(out, xp.asarray([[1, 2, 3, 4]])) def test_geometric_transform18(self, order, xp): data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] + data = xp.asarray(data) def mapping(x): 
return (x[0] * 2, x[1] * 2) out = ndimage.geometric_transform(data, mapping, (1, 2), order=order) - assert_array_almost_equal(out, [[1, 3]]) + assert_array_almost_equal(out, xp.asarray([[1, 3]])) def test_geometric_transform19(self, order, xp): data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] + data = xp.asarray(data) def mapping(x): return (x[0], x[1] / 2) @@ -356,6 +360,7 @@ def test_geometric_transform20(self, order, xp): data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] + data = xp.asarray(data) def mapping(x): return (x[0] / 2, x[1]) @@ -368,6 +373,7 @@ def test_geometric_transform21(self, order, xp): data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] + data = xp.asarray(data) def mapping(x): return (x[0] / 2, x[1] / 2) @@ -377,9 +383,10 @@ def mapping(x): assert_array_almost_equal(out[::2, ::2], data) def test_geometric_transform22(self, order, xp): - data = xp.asarray([[1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12]], dtype=xp.float64) + data = [[1, 2, 3, 4], + [5, 6, 7, 8], + [9, 10, 11, 12]] + data = xp.asarray(data, dtype=xp.float64) def mapping1(x): return (x[0] / 2, x[1] / 2) @@ -397,18 +404,19 @@ def test_geometric_transform23(self, order, xp): data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] + data = xp.asarray(data) def mapping(x): return (1, x[0] * 2) out = ndimage.geometric_transform(data, mapping, (2,), order=order) - out = out.astype(np.int32) - assert_array_almost_equal(out, [5, 7]) + assert_array_almost_equal(out, xp.asarray([5, 7])) def test_geometric_transform24(self, order, xp): data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]] + data = xp.asarray(data) def mapping(x, a, b): return (a, x[0] * b) @@ -416,7 +424,7 @@ def mapping(x, a, b): out = ndimage.geometric_transform( data, mapping, (2,), order=order, extra_arguments=(1,), extra_keywords={'b': 2}) - assert_array_almost_equal(out, [5, 7]) + assert_array_almost_equal(out, xp.asarray([5, 7])) @skip_xp_backends("cupy", reason="CuPy does not have geometric_transform") @@ -1474,6 +1482,6 @@ def test_rotate10(self, xp): @xfail_xp_backends("cupy", reason="https://github.com/cupy/cupy/issues/8400") def test_rotate_exact_180(self, xp): - a = np.tile(xp.arange(5), (5, 1)) + a = xp.asarray(np.tile(np.arange(5), (5, 1))) b = ndimage.rotate(ndimage.rotate(a, 180), -180) xp_assert_equal(a, b) diff --git a/scipy/ndimage/tests/test_measurements.py b/scipy/ndimage/tests/test_measurements.py index c8175ba309dd..69a4b2f603e7 100644 --- a/scipy/ndimage/tests/test_measurements.py +++ b/scipy/ndimage/tests/test_measurements.py @@ -5,7 +5,6 @@ from numpy.testing import suppress_warnings from scipy._lib._array_api import ( - is_jax, is_torch, array_namespace, xp_assert_equal, @@ -23,6 +22,7 @@ from scipy.conftest import array_api_compatible skip_xp_backends = pytest.mark.skip_xp_backends +xfail_xp_backends = pytest.mark.xfail_xp_backends pytestmark = [array_api_compatible, pytest.mark.usefixtures("skip_xp_backends"), skip_xp_backends(cpu_only=True, exceptions=['cupy', 'jax.numpy'],)] @@ -112,6 +112,7 @@ def test_nonint_labels(self, xp): xp_assert_equal(centers, np.asarray([0.5, 8.0])) +@skip_xp_backends(np_only=True, reason='test internal numpy-only helpers') class Test_measurements_select: """ndimage._measurements._select() is a utility used by other functions.""" @@ -365,10 +366,9 @@ def test_label_output_dtype(xp): assert output.dtype == t +@skip_xp_backends('dask.array', reason='Dask does not raise') +@xfail_xp_backends('jax.numpy', reason='JAX does not raise') def test_label_output_wrong_size(xp): - if is_jax(xp): - 
pytest.xfail("JAX does not raise") - data = xp.ones([5]) for t in types: dtype = getattr(xp, t) @@ -555,6 +555,7 @@ def test_value_indices02(xp): ndimage.value_indices(data) +@skip_xp_backends("dask.array", reason="len on data-dependent output shapes") def test_value_indices03(xp): "Test different input array shapes, from 1-D to 4-D" for shape in [(36,), (18, 2), (3, 3, 4), (3, 3, 2, 2)]: @@ -674,6 +675,7 @@ def test_sum11(xp): assert_almost_equal(output, xp.asarray(6.0), check_0d=False) +@skip_xp_backends("dask.array", reason="data-dependent output shapes") def test_sum12(xp): labels = xp.asarray([[1, 2], [2, 4]], dtype=xp.int8) for type in types: @@ -683,6 +685,7 @@ def test_sum12(xp): assert_array_almost_equal(output, xp.asarray([4.0, 0.0, 5.0])) +@skip_xp_backends("dask.array", reason="data-dependent output shapes") def test_sum_labels(xp): labels = xp.asarray([[1, 2], [2, 4]], dtype=xp.int8) for type in types: @@ -695,7 +698,7 @@ def test_sum_labels(xp): assert xp.all(output_sum == output_labels) assert_array_almost_equal(output_labels, xp.asarray([4.0, 0.0, 5.0])) - +@skip_xp_backends("dask.array", reason="dask outputs wrong results here") def test_mean01(xp): labels = np.asarray([1, 0], dtype=bool) labels = xp.asarray(labels) @@ -706,6 +709,7 @@ def test_mean01(xp): assert_almost_equal(output, xp.asarray(2.0), check_0d=False) +@skip_xp_backends("dask.array", reason="dask outputs wrong results here") def test_mean02(xp): labels = np.asarray([1, 0], dtype=bool) input = np.asarray([[1, 2], [3, 4]], dtype=bool) @@ -716,6 +720,7 @@ def test_mean02(xp): assert_almost_equal(output, xp.asarray(1.0), check_0d=False) +@skip_xp_backends("dask.array", reason="dask outputs wrong results here") def test_mean03(xp): labels = xp.asarray([1, 2]) for type in types: @@ -726,6 +731,7 @@ def test_mean03(xp): assert_almost_equal(output, xp.asarray(3.0), check_0d=False) +@skip_xp_backends("dask.array", reason="dask outputs wrong results here") def test_mean04(xp): labels = xp.asarray([[1, 2], [2, 4]], dtype=xp.int8) with np.errstate(all='ignore'): @@ -772,6 +778,7 @@ def test_minimum03(xp): assert_almost_equal(output, xp.asarray(2.0), check_0d=False) +@skip_xp_backends('dask.array', reason="no argsort in Dask") def test_minimum04(xp): labels = xp.asarray([[1, 2], [2, 3]]) for type in types: @@ -811,6 +818,7 @@ def test_maximum03(xp): assert_almost_equal(output, xp.asarray(4.0), check_0d=False) +@skip_xp_backends('dask.array', reason="no argsort in Dask") def test_maximum04(xp): labels = xp.asarray([[1, 2], [2, 3]]) for type in types: @@ -827,6 +835,7 @@ def test_maximum05(xp): assert ndimage.maximum(x) == -1 +@pytest.mark.filterwarnings("ignore::FutureWarning:dask") def test_median01(xp): a = xp.asarray([[1, 2, 0, 1], [5, 3, 0, 4], @@ -849,6 +858,7 @@ def test_median02(xp): assert_almost_equal(output, xp.asarray(1.0), check_0d=False) +@skip_xp_backends("dask.array", reason="dask.array.median only implemented for along an axis.") def test_median03(xp): a = xp.asarray([[1, 2, 0, 1], [5, 3, 0, 4], @@ -862,6 +872,7 @@ def test_median03(xp): assert_almost_equal(output, xp.asarray(3.0), check_0d=False) +@skip_xp_backends("dask.array", reason="Crash inside dask searchsorted") def test_median_gh12836_bool(xp): # test boolean addition fix on example from gh-12836 a = np.asarray([1, 1], dtype=bool) @@ -869,7 +880,7 @@ def test_median_gh12836_bool(xp): output = ndimage.median(a, labels=xp.ones((2,)), index=xp.asarray([1])) assert_array_almost_equal(output, xp.asarray([1.0])) - +@skip_xp_backends("dask.array", 
reason="Crash inside dask searchsorted") def test_median_no_int_overflow(xp): # test integer overflow fix on example from gh-12836 a = xp.asarray([65, 70], dtype=xp.int8) @@ -910,7 +921,9 @@ def test_variance04(xp): output = ndimage.variance(input) assert_almost_equal(output, xp.asarray(0.25), check_0d=False) - +# dask.array is maybe due to failed conversion to numpy? +# array-api-strict should've caught use of non array API functions I think +@skip_xp_backends("dask.array", reason="conjugate called on dask.array which doesn't exist") def test_variance05(xp): labels = xp.asarray([2, 2, 3]) for type in types: @@ -920,7 +933,7 @@ def test_variance05(xp): output = ndimage.variance(input, labels, 2) assert_almost_equal(output, xp.asarray(1.0), check_0d=False) - +@skip_xp_backends("dask.array", reason="Data-dependent output shapes") def test_variance06(xp): labels = xp.asarray([2, 2, 3, 3, 4]) with np.errstate(all='ignore'): @@ -965,6 +978,9 @@ def test_standard_deviation04(xp): assert_almost_equal(output, xp.asarray(0.5), check_0d=False) +# dask.array is maybe due to failed conversion to numpy? +# array-api-strict should've caught use of non array API functions I think +@skip_xp_backends("dask.array", reason="conjugate called on dask.array which doesn't exist") def test_standard_deviation05(xp): labels = xp.asarray([2, 2, 3]) for type in types: @@ -974,6 +990,7 @@ def test_standard_deviation05(xp): assert_almost_equal(output, xp.asarray(1.0), check_0d=False) +@skip_xp_backends("dask.array", reason="data-dependent output shapes") def test_standard_deviation06(xp): labels = xp.asarray([2, 2, 3, 3, 4]) with np.errstate(all='ignore'): @@ -986,6 +1003,7 @@ def test_standard_deviation06(xp): assert_array_almost_equal(output, xp.asarray([1.0, 1.0, 0.0])) +@skip_xp_backends("dask.array", reason="data-dependent output shapes") def test_standard_deviation07(xp): labels = xp.asarray([1]) with np.errstate(all='ignore'): @@ -1059,6 +1077,7 @@ def test_minimum_position06(xp): assert output == (0, 1) +@skip_xp_backends('dask.array', reason="no argsort in Dask") def test_minimum_position07(xp): labels = xp.asarray([1, 2, 3, 4]) for type in types: @@ -1124,6 +1143,7 @@ def test_maximum_position05(xp): assert output == (0, 0) +@skip_xp_backends('dask.array', reason="no argsort in Dask") def test_maximum_position06(xp): labels = xp.asarray([1, 2, 0, 4]) for type in types: @@ -1136,7 +1156,7 @@ def test_maximum_position06(xp): assert output[0] == (0, 0) assert output[1] == (1, 1) - +@skip_xp_backends("dask.array", reason="crash in dask.array searchsorted") def test_maximum_position07(xp): # Test float labels if is_torch(xp): @@ -1154,6 +1174,7 @@ def test_maximum_position07(xp): assert output[1] == (0, 3) +@skip_xp_backends("dask.array", reason="dask wrong answer") def test_extrema01(xp): labels = np.asarray([1, 0], dtype=bool) labels = xp.asarray(labels) @@ -1170,6 +1191,7 @@ def test_extrema01(xp): assert output1 == (output2, output3, output4, output5) +@skip_xp_backends("dask.array", reason="dask wrong answer") def test_extrema02(xp): labels = xp.asarray([1, 2]) for type in types: @@ -1188,6 +1210,7 @@ def test_extrema02(xp): assert output1 == (output2, output3, output4, output5) +@skip_xp_backends('dask.array', reason="no argsort in Dask") def test_extrema03(xp): labels = xp.asarray([[1, 2], [2, 3]]) for type in types: @@ -1216,6 +1239,7 @@ def test_extrema03(xp): assert output1[3] == output5 +@skip_xp_backends('dask.array', reason="no argsort in Dask") def test_extrema04(xp): labels = xp.asarray([1, 2, 0, 
4]) for type in types: @@ -1292,6 +1316,7 @@ def test_center_of_mass06(xp): assert output == expected +@skip_xp_backends("dask.array", reason="wrong output shape") def test_center_of_mass07(xp): labels = xp.asarray([1, 0]) expected = (0.5, 0.0) @@ -1301,6 +1326,7 @@ def test_center_of_mass07(xp): assert output == expected +@skip_xp_backends("dask.array", reason="wrong output shape") def test_center_of_mass08(xp): labels = xp.asarray([1, 2]) expected = (0.5, 1.0) @@ -1310,6 +1336,7 @@ def test_center_of_mass08(xp): assert output == expected +@skip_xp_backends("dask.array", reason="data-dependent output shapes") def test_center_of_mass09(xp): labels = xp.asarray((1, 2)) expected = xp.asarray([(0.5, 0.0), (0.5, 1.0)], dtype=xp.float64) @@ -1347,6 +1374,7 @@ def test_histogram03(xp): assert_array_almost_equal(output[1], expected2) +@skip_xp_backends("dask.array", reason="data-dependent output shapes") def test_stat_funcs_2d(xp): a = xp.asarray([[5, 6, 0, 0, 0], [8, 9, 0, 0, 0], [0, 0, 0, 3, 5]]) lbl = xp.asarray([[1, 1, 0, 0, 0], [1, 1, 0, 0, 0], [0, 0, 0, 2, 2]]) @@ -1589,7 +1617,8 @@ def test_watershed_ift08(self, xp): @skip_xp_backends("cupy", reason="no watershed_ift on CuPy" ) def test_watershed_ift09(self, xp): # Test large cost. See gh-19575 - data = xp.asarray([[xp.iinfo(xp.uint16).max, 0], + xp_test = array_namespace(xp.empty(0)) # dask.array needs iinfo + data = xp.asarray([[xp_test.iinfo(xp.uint16).max, 0], [0, 0]], dtype=xp.uint16) markers = xp.asarray([[1, 0], [0, 0]], dtype=xp.int8) diff --git a/scipy/ndimage/tests/test_morphology.py b/scipy/ndimage/tests/test_morphology.py index 2bdd133011e9..967ca10935c9 100644 --- a/scipy/ndimage/tests/test_morphology.py +++ b/scipy/ndimage/tests/test_morphology.py @@ -18,6 +18,11 @@ skip_xp_backends(cpu_only=True, exceptions=['cupy', 'jax.numpy'],)] +@skip_xp_backends('dask.array', + reason="Dask.array gets wrong results here. " + "Some tests can pass when creating the input array from a list of ones " + "instead of xp.ones, so maybe something is getting corrupted here."
+) class TestNdimageMorphology: @xfail_xp_backends('cupy', reason='CuPy does not have distance_transform_bf.') @@ -645,10 +650,13 @@ def test_distance_transform_edt4(self, dtype, xp): out = ndimage.distance_transform_edt(data, sampling=[2, 1]) assert_array_almost_equal(out, ref) + @xfail_xp_backends( + "cupy", reason="Only 2D and 3D distance transforms are supported in CuPy" + ) def test_distance_transform_edt5(self, xp): # Ticket #954 regression test - out = ndimage.distance_transform_edt(False) - assert_array_almost_equal(out, [0.]) + out = ndimage.distance_transform_edt(xp.asarray(False)) + assert_array_almost_equal(out, xp.asarray([0.])) @xfail_xp_backends( np_only=True, reason='XXX: does not raise unless indices is a numpy array' @@ -673,20 +681,28 @@ def test_distance_transform_edt6(self, xp): distances=distances_out ) + @skip_xp_backends(np_only=True, + reason="generate_binary_structure always generates numpy objects") def test_generate_structure01(self, xp): struct = ndimage.generate_binary_structure(0, 1) assert struct == 1 + @skip_xp_backends(np_only=True, + reason="generate_binary_structure always generates numpy objects") def test_generate_structure02(self, xp): struct = ndimage.generate_binary_structure(1, 1) assert_array_almost_equal(struct, [1, 1, 1]) + @skip_xp_backends(np_only=True, + reason="generate_binary_structure always generates numpy objects") def test_generate_structure03(self, xp): struct = ndimage.generate_binary_structure(2, 1) assert_array_almost_equal(struct, [[0, 1, 0], [1, 1, 1], [0, 1, 0]]) + @skip_xp_backends(np_only=True, + reason="generate_binary_structure always generates numpy objects") def test_generate_structure04(self, xp): struct = ndimage.generate_binary_structure(2, 2) assert_array_almost_equal(struct, [[1, 1, 1], @@ -1057,9 +1073,10 @@ def test_binary_erosion27(self, xp): iterations=2) assert_array_almost_equal(out, expected) - @skip_xp_backends( - np_only=True, reason='inplace out= arguments are numpy-specific' - ) + @skip_xp_backends(np_only=True, exceptions=["cupy"], + reason='inplace out= arguments are numpy-specific') + @xfail_xp_backends("cupy", + reason="NotImplementedError: only brute_force iteration") def test_binary_erosion28(self, xp): struct = [[0, 1, 0], [1, 1, 1], @@ -1116,8 +1133,10 @@ def test_binary_erosion29(self, xp): border_value=1, iterations=3) assert_array_almost_equal(out, expected) - @skip_xp_backends(np_only=True, + @skip_xp_backends(np_only=True, exceptions=["cupy"], reason='inplace out= arguments are numpy-specific') + @xfail_xp_backends("cupy", + reason="NotImplementedError: only brute_force iteration") def test_binary_erosion30(self, xp): struct = [[0, 1, 0], [1, 1, 1], @@ -1151,9 +1170,8 @@ def test_binary_erosion30(self, xp): iterations=3, output=data) assert_array_almost_equal(data, expected) - @skip_xp_backends( - np_only=True, reason='inplace out= arguments are numpy-specific' - ) + @skip_xp_backends(np_only=True, exceptions=["cupy"], + reason='inplace out= arguments are numpy-specific') def test_binary_erosion31(self, xp): struct = [[0, 1, 0], [1, 1, 1], @@ -1277,9 +1295,8 @@ def test_binary_erosion34(self, xp): border_value=1, mask=mask) assert_array_almost_equal(out, expected) - @skip_xp_backends( - np_only=True, reason='inplace out= arguments are numpy-specific' - ) + @skip_xp_backends(np_only=True, exceptions=["cupy"], + reason='inplace out= arguments are numpy-specific') def test_binary_erosion35(self, xp): struct = [[0, 1, 0], [1, 1, 1], @@ -1364,9 +1381,10 @@ def test_binary_erosion36(self, xp): 
border_value=1, origin=(-1, -1)) assert_array_almost_equal(out, expected) - @skip_xp_backends( - np_only=True, reason='inplace out= arguments are numpy-specific' - ) + @skip_xp_backends(np_only=True, exceptions=["cupy"], + reason='inplace out= arguments are numpy-specific') + @xfail_xp_backends("cupy", + reason="NotImplementedError: only brute_force iteration") def test_binary_erosion37(self, xp): a = np.asarray([[1, 0, 1], [0, 1, 0], @@ -1390,9 +1408,10 @@ def test_binary_erosion38(self, xp): with assert_raises(TypeError): _ = ndimage.binary_erosion(data, iterations=iterations) - @skip_xp_backends( - np_only=True, reason='inplace out= arguments are numpy-specific' - ) + @skip_xp_backends(np_only=True, exceptions=["cupy"], + reason='inplace out= arguments are numpy-specific') + @xfail_xp_backends("cupy", + reason="NotImplementedError: only brute_force iteration") def test_binary_erosion39(self, xp): iterations = np.int32(3) struct = [[0, 1, 0], @@ -1422,9 +1441,10 @@ def test_binary_erosion39(self, xp): iterations=iterations, output=out) assert_array_almost_equal(out, expected) - @skip_xp_backends( - np_only=True, reason='inplace out= arguments are numpy-specific' - ) + @skip_xp_backends(np_only=True, exceptions=["cupy"], + reason='inplace out= arguments are numpy-specific') + @xfail_xp_backends("cupy", + reason="NotImplementedError: only brute_force iteration") def test_binary_erosion40(self, xp): iterations = np.int64(3) struct = [[0, 1, 0], @@ -2303,6 +2323,7 @@ def test_grey_erosion01(self, xp): [5, 5, 3, 3, 1]])) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") @xfail_xp_backends("cupy", reason="https://github.com/cupy/cupy/issues/8398") def test_grey_erosion01_overlap(self, xp): @@ -2498,6 +2519,7 @@ def test_morphological_laplace02(self, xp): assert_array_almost_equal(output, expected) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") def test_white_tophat01(self, xp): array = xp.asarray([[3, 2, 5, 1, 4], [7, 6, 9, 3, 5], @@ -2551,6 +2573,7 @@ def test_white_tophat03(self, xp): xp_assert_equal(output, expected) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") def test_white_tophat04(self, xp): array = np.eye(5, dtype=bool) structure = np.ones((3, 3), dtype=bool) @@ -2563,6 +2586,7 @@ def test_white_tophat04(self, xp): ndimage.white_tophat(array, structure=structure, output=output) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") def test_black_tophat01(self, xp): array = xp.asarray([[3, 2, 5, 1, 4], [7, 6, 9, 3, 5], @@ -2616,6 +2640,7 @@ def test_black_tophat03(self, xp): xp_assert_equal(output, expected) @skip_xp_backends("jax.numpy", reason="output array is read-only.") + @skip_xp_backends("dask.array", reason="output array is read-only.") def test_black_tophat04(self, xp): array = xp.asarray(np.eye(5, dtype=bool)) structure = xp.asarray(np.ones((3, 3), dtype=bool)) @@ -2680,11 +2705,10 @@ def test_grey_axes(self, xp, func_name, expand_axis, origin, footprint_mode, out = func(data, axes=axes, **kwargs) xp_assert_close(out, expected) + @skip_xp_backends(np_only=True, exceptions=["cupy"], + reason="inplace output= is numpy-specific") @pytest.mark.parametrize('dtype', types) def test_hit_or_miss01(self, dtype, xp): - if not 
(is_numpy(xp) or is_cupy(xp)): - pytest.xfail("inplace output= is numpy-specific") - dtype = getattr(xp, dtype) struct = [[0, 1, 0], [1, 1, 1], @@ -2867,6 +2891,9 @@ def test_binary_closing_noninteger_brute_force_passes_when_true(xp): ) +@skip_xp_backends(np_only=True, exceptions=["cupy"], + reason="inplace output= is numpy-specific") +@xfail_xp_backends("cupy", reason="NotImplementedError: only brute_force iteration") @pytest.mark.parametrize( 'function', ['binary_erosion', 'binary_dilation', 'binary_opening', 'binary_closing'], @@ -2876,6 +2903,7 @@ def test_binary_closing_noninteger_brute_force_passes_when_true(xp): def test_binary_input_as_output(function, iterations, brute_force, xp): rstate = np.random.RandomState(123) data = rstate.randint(low=0, high=2, size=100).astype(bool) + data = xp.asarray(data) ndi_func = getattr(ndimage, function) # input data is not modified @@ -2893,6 +2921,7 @@ def test_binary_input_as_output(function, iterations, brute_force, xp): def test_binary_hit_or_miss_input_as_output(xp): rstate = np.random.RandomState(123) data = rstate.randint(low=0, high=2, size=100).astype(bool) + data = xp.asarray(data) # input data is not modified data_orig = data.copy() diff --git a/scipy/optimize/_basinhopping.py b/scipy/optimize/_basinhopping.py index 90498155887f..5d6ed30b18ab 100644 --- a/scipy/optimize/_basinhopping.py +++ b/scipy/optimize/_basinhopping.py @@ -451,7 +451,9 @@ def basinhopping(func, x0, niter=100, T=1.0, stepsize=0.5, cause of the termination. The ``OptimizeResult`` object returned by the selected minimizer at the lowest minimum is also contained within this object and can be accessed through the ``lowest_optimization_result`` - attribute. See `OptimizeResult` for a description of other attributes. + attribute. ``lowest_optimization_result`` will only be updated if a + local minimization was successful. + See `OptimizeResult` for a description of other attributes. 
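A hedged sketch of the caveat documented above (the objective is the standard 1-D example from the `basinhopping` docs; nothing here is part of the patch): since ``lowest_optimization_result`` is only refreshed by *successful* local minimizations, its ``success`` flag is worth checking before trusting the stored local result.

import numpy as np
from scipy.optimize import basinhopping

def func(x):
    return np.cos(14.5 * x - 0.3) + (x + 0.2) * x

res = basinhopping(func, x0=1.0, niter=100,
                   minimizer_kwargs={"method": "L-BFGS-B"})
# res.x/res.fun track the best point seen; the stored local result may be
# stale if the later local minimizations failed, so check its flag.
print(res.lowest_optimization_result.success, res.x, res.fun)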
See Also -------- diff --git a/scipy/optimize/_chandrupatla.py b/scipy/optimize/_chandrupatla.py index 5a4b70098919..17533cbe1e85 100644 --- a/scipy/optimize/_chandrupatla.py +++ b/scipy/optimize/_chandrupatla.py @@ -134,7 +134,7 @@ def _chandrupatla(func, a, b, *, args=(), xatol=None, xrtol=None, func, xs, fs, args, shape, dtype, xp = temp x1, x2 = xs f1, f2 = fs - status = xp.full_like(x1, xp.asarray(eim._EINPROGRESS), + status = xp.full_like(x1, eim._EINPROGRESS, dtype=xp.int32) # in progress nit, nfev = 0, 2 # two function evaluations performed above finfo = xp.finfo(dtype) @@ -219,7 +219,7 @@ def post_termination_check(work): j = ((1 - xp.sqrt(1 - xi1)) < phi1) & (phi1 < xp.sqrt(xi1)) f1j, f2j, f3j, alphaj = work.f1[j], work.f2[j], work.f3[j], alpha[j] - t = xp.full_like(alpha, xp.asarray(0.5)) + t = xp.full_like(alpha, 0.5) t[j] = (f1j / (f1j - f2j) * f3j / (f3j - f2j) - alphaj * f1j / (f3j - f1j) * f2j / (f2j - f3j)) @@ -401,8 +401,7 @@ def _chandrupatla_minimize(func, x1, x2, x3, *, args=(), xatol=None, x1, x2, x3 = xs f1, f2, f3 = fs phi = xp.asarray(0.5 + 0.5*5**0.5, dtype=dtype)[()] # golden ratio - status = xp.full_like(x1, xp.asarray(eim._EINPROGRESS), - dtype=xp.int32) # in progress + status = xp.full_like(x1, eim._EINPROGRESS, dtype=xp.int32) # in progress nit, nfev = 0, 3 # three function evaluations performed above fatol = xp.finfo(dtype).smallest_normal if fatol is None else fatol frtol = xp.finfo(dtype).smallest_normal if frtol is None else frtol diff --git a/scipy/optimize/_lsq/dogbox.py b/scipy/optimize/_lsq/dogbox.py index 6bb5abbe7902..b986929626f2 100644 --- a/scipy/optimize/_lsq/dogbox.py +++ b/scipy/optimize/_lsq/dogbox.py @@ -45,6 +45,8 @@ from scipy.sparse.linalg import LinearOperator, aslinearoperator, lsmr from scipy.optimize import OptimizeResult +from scipy._lib._util import _call_callback_maybe_halt + from .common import ( step_size_to_bound, in_bounds, update_tr_radius, evaluate_quadratic, @@ -147,7 +149,7 @@ def dogleg_step(x, newton_step, g, a, b, tr_bounds, lb, ub): def dogbox(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale, - loss_function, tr_solver, tr_options, verbose): + loss_function, tr_solver, tr_options, verbose, callback=None): f = f0 f_true = f.copy() nfev = 1 @@ -322,6 +324,18 @@ def dogbox(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale, actual_reduction = 0 iteration += 1 + + # Call callback function and possibly stop optimization + if callback is not None: + intermediate_result = OptimizeResult( + x=x, fun=f, nit=iteration, nfev=nfev) + intermediate_result["cost"] = cost_new + + if _call_callback_maybe_halt( + callback, intermediate_result + ): + termination_status = -2 + break if termination_status is None: termination_status = 0 diff --git a/scipy/optimize/_lsq/least_squares.py b/scipy/optimize/_lsq/least_squares.py index e01ce04a8992..71fbba41762e 100644 --- a/scipy/optimize/_lsq/least_squares.py +++ b/scipy/optimize/_lsq/least_squares.py @@ -14,8 +14,11 @@ from .dogbox import dogbox from .common import EPS, in_bounds, make_strictly_feasible + +from scipy.optimize._optimize import _wrap_callback TERMINATION_MESSAGES = { + -2: "Stopped because `callback` function raised `StopIteration` or returned `True`", -1: "Improper input parameters status returned from `leastsq`", 0: "The maximum number of function evaluations is exceeded.", 1: "`gtol` termination condition is satisfied.", @@ -242,7 +245,9 @@ def least_squares( fun, x0, jac='2-point', bounds=(-np.inf, np.inf), method='trf', ftol=1e-8, xtol=1e-8, 
gtol=1e-8, x_scale=1.0, loss='linear', f_scale=1.0, diff_step=None, tr_solver=None, tr_options=None, - jac_sparsity=None, max_nfev=None, verbose=0, args=(), kwargs=None): + jac_sparsity=None, max_nfev=None, verbose=0, args=(), kwargs=None, + callback=None +): """Solve a nonlinear least-squares problem with bounds on the variables. Given the residuals f(x) (an m-D real function of n real @@ -444,6 +449,26 @@ def least_squares( Additional arguments passed to `fun` and `jac`. Both empty by default. The calling signature is ``fun(x, *args, **kwargs)`` and the same for `jac`. + callback : None or callable, optional + Callback function that is called by the algorithm on each iteration. + This can be used to print or plot the optimization results at each + step, and to stop the optimization algorithm based on some user-defined + condition. Only implemented for the `trf` and `dogbox` methods. + + The signature is ``callback(intermediate_result: OptimizeResult)`` + + ``intermediate_result`` is a `scipy.optimize.OptimizeResult` + which contains the intermediate results of the optimization at the + current iteration. + + The callback also supports a signature like ``callback(x)``. + + Introspection is used to determine which of the signatures is invoked. + + If the `callback` function raises `StopIteration`, the optimization + algorithm will stop and return with status code -2. + + .. versionadded:: 1.16.0 Returns ------- @@ -487,6 +512,7 @@ def least_squares( status : int The reason for algorithm termination: + * -2 : terminated because callback raised StopIteration. * -1 : improper input parameters status returned from MINPACK. * 0 : the maximum number of function evaluations is exceeded. * 1 : `gtol` termination condition is satisfied. @@ -939,14 +965,20 @@ def jac_wrapped(x, f): else: tr_solver = 'lsmr' + # Wrap callback function.
If callback is None, callback_wrapped also is None + callback_wrapped = _wrap_callback(callback) + if method == 'lm': + if callback is not None: + warn("Callback function specified, but not supported with `lm` method.", + stacklevel=2) result = call_minpack(fun_wrapped, x0, jac_wrapped, ftol, xtol, gtol, max_nfev, x_scale, diff_step) elif method == 'trf': result = trf(fun_wrapped, jac_wrapped, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale, loss_function, tr_solver, - tr_options.copy(), verbose) + tr_options.copy(), verbose, callback=callback_wrapped) elif method == 'dogbox': if tr_solver == 'lsmr' and 'regularize' in tr_options: @@ -958,7 +990,7 @@ def jac_wrapped(x, f): result = dogbox(fun_wrapped, jac_wrapped, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale, loss_function, - tr_solver, tr_options, verbose) + tr_solver, tr_options, verbose, callback=callback_wrapped) result.message = TERMINATION_MESSAGES[result.status] result.success = result.status > 0 diff --git a/scipy/optimize/_lsq/trf.py b/scipy/optimize/_lsq/trf.py index 9154bdba5b2c..c72fbfae00f0 100644 --- a/scipy/optimize/_lsq/trf.py +++ b/scipy/optimize/_lsq/trf.py @@ -107,10 +107,11 @@ CL_scaling_vector, compute_grad, compute_jac_scale, check_termination, update_tr_radius, scale_for_robust_loss_function, print_header_nonlinear, print_iteration_nonlinear) +from scipy._lib._util import _call_callback_maybe_halt def trf(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale, - loss_function, tr_solver, tr_options, verbose): + loss_function, tr_solver, tr_options, verbose, callback=None): # For efficiency, it makes sense to run the simplified version of the # algorithm when no bounds are imposed. We decided to write the two # separate functions. It violates the DRY principle, but the individual @@ -118,11 +119,11 @@ def trf(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale, if np.all(lb == -np.inf) and np.all(ub == np.inf): return trf_no_bounds( fun, jac, x0, f0, J0, ftol, xtol, gtol, max_nfev, x_scale, - loss_function, tr_solver, tr_options, verbose) + loss_function, tr_solver, tr_options, verbose, callback=callback) else: return trf_bounds( fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale, - loss_function, tr_solver, tr_options, verbose) + loss_function, tr_solver, tr_options, verbose, callback=callback) def select_step(x, J_h, diag_h, g_h, p, p_h, d, Delta, lb, ub, theta): @@ -203,7 +204,8 @@ def select_step(x, J_h, diag_h, g_h, p, p_h, d, Delta, lb, ub, theta): def trf_bounds(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, - x_scale, loss_function, tr_solver, tr_options, verbose): + x_scale, loss_function, tr_solver, tr_options, verbose, + callback=None): x = x0.copy() f = f0 @@ -385,8 +387,20 @@ def trf_bounds(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, else: step_norm = 0 actual_reduction = 0 - + iteration += 1 + + # Call callback function and possibly stop optimization + if callback is not None: + intermediate_result = OptimizeResult( + x=x, fun=f_true, nit=iteration, nfev=nfev) + intermediate_result["cost"] = cost + + if _call_callback_maybe_halt( + callback, intermediate_result + ): + termination_status = -2 + break if termination_status is None: termination_status = 0 @@ -399,7 +413,8 @@ def trf_bounds(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, def trf_no_bounds(fun, jac, x0, f0, J0, ftol, xtol, gtol, max_nfev, - x_scale, loss_function, tr_solver, tr_options, verbose): + x_scale, loss_function, tr_solver, tr_options, 
verbose, + callback=None): x = x0.copy() f = f0 @@ -549,6 +564,18 @@ def trf_no_bounds(fun, jac, x0, f0, J0, ftol, xtol, gtol, max_nfev, actual_reduction = 0 iteration += 1 + + # Call callback function and possibly stop optimization + if callback is not None: + intermediate_result = OptimizeResult( + x=x, fun=f_true, nit=iteration, nfev=nfev) + intermediate_result["cost"] = cost + + if _call_callback_maybe_halt( + callback, intermediate_result + ): + termination_status = -2 + break if termination_status is None: termination_status = 0 diff --git a/scipy/optimize/tests/test_bracket.py b/scipy/optimize/tests/test_bracket.py index bf81c6f7fbc8..3c92fc63ab54 100644 --- a/scipy/optimize/tests/test_bracket.py +++ b/scipy/optimize/tests/test_bracket.py @@ -2,6 +2,7 @@ import numpy as np +from scipy.cluster.tests.test_vq import skip_xp_backends from scipy.optimize._bracket import _ELIMITS from scipy.optimize.elementwise import bracket_root, bracket_minimum import scipy._lib._elementwise_iterative_method as eim @@ -42,10 +43,11 @@ def _bracket_minimum(*args, **kwargs): array_api_strict_skip_reason = 'Array API does not support fancy indexing assignment.' -jax_skip_reason = 'JAX arrays do not support item assignment.' +boolean_index_skip_reason = 'JAX/Dask arrays do not support boolean assignment.' @pytest.mark.skip_xp_backends('array_api_strict', reason=array_api_strict_skip_reason) -@pytest.mark.skip_xp_backends('jax.numpy', reason=jax_skip_reason) +@pytest.mark.skip_xp_backends('jax.numpy', reason=boolean_index_skip_reason) +@pytest.mark.skip_xp_backends('dask.array', reason=boolean_index_skip_reason) @array_api_compatible @pytest.mark.usefixtures("skip_xp_backends") class TestBracketRoot: @@ -179,7 +181,7 @@ def f(*args, **kwargs): ref_attr = [xp.asarray(getattr(ref, attr)) for ref in refs] res_attr = getattr(res, attr) xp_assert_close(xp_ravel(res_attr, xp=xp), xp.stack(ref_attr)) - xp_assert_equal(res_attr.shape, shape) + assert res_attr.shape == shape xp_test = array_namespace(xp.asarray(1.)) assert res.success.dtype == xp_test.bool @@ -354,7 +356,8 @@ def f(x): @pytest.mark.skip_xp_backends('array_api_strict', reason=array_api_strict_skip_reason) -@pytest.mark.skip_xp_backends('jax.numpy', reason=jax_skip_reason) +@pytest.mark.skip_xp_backends('jax.numpy', reason=boolean_index_skip_reason) +@pytest.mark.skip_xp_backends('dask.array', reason=boolean_index_skip_reason) @array_api_compatible @pytest.mark.usefixtures("skip_xp_backends") class TestBracketMinimum: @@ -784,7 +787,7 @@ def bracket_minimum_single(xm0, xl0, xr0, xmin, xmax, factor, a): ref_attr = [xp.asarray(getattr(ref, attr)) for ref in refs] res_attr = getattr(res, attr) xp_assert_close(xp_ravel(res_attr, xp=xp), xp.stack(ref_attr)) - xp_assert_equal(res_attr.shape, shape) + assert res_attr.shape == shape xp_test = array_namespace(xp.asarray(1.)) assert res.success.dtype == xp_test.bool @@ -855,7 +858,7 @@ def f(x): result = _bracket_minimum(f, xp.asarray(0.5535723499480897), xmin=xmin, xmax=xmax) - assert xmin == result.xl + xp_assert_close(result.xl, xmin) def test_gh_20562_right(self, xp): # Regression test for https://github.com/scipy/scipy/issues/20562 @@ -868,4 +871,4 @@ def f(x): result = _bracket_minimum(f, xp.asarray(-0.5535723499480897), xmin=xmin, xmax=xmax) - assert xmax == result.xr + xp_assert_close(result.xr, xmax) diff --git a/scipy/optimize/tests/test_chandrupatla.py b/scipy/optimize/tests/test_chandrupatla.py index 1adf10d793c3..65d0deee0eca 100644 --- a/scipy/optimize/tests/test_chandrupatla.py +++ 
b/scipy/optimize/tests/test_chandrupatla.py @@ -187,6 +187,7 @@ def _bracket_minimum(func, x1, x2): @array_api_compatible @pytest.mark.usefixtures("skip_xp_backends") +@pytest.mark.skip_xp_backends('dask.array', reason='no argsort in Dask') @pytest.mark.skip_xp_backends('jax.numpy', reason='JAX arrays do not support item assignment.') @pytest.mark.skip_xp_backends('array_api_strict', @@ -553,6 +554,7 @@ def f(x): @array_api_compatible @pytest.mark.usefixtures("skip_xp_backends") +@pytest.mark.skip_xp_backends('dask.array', reason='boolean indexing assignment') @pytest.mark.skip_xp_backends('array_api_strict', reason='Currently uses fancy indexing assignment.') @pytest.mark.skip_xp_backends('jax.numpy', diff --git a/scipy/optimize/tests/test_least_squares.py b/scipy/optimize/tests/test_least_squares.py index 30401cd9dce2..14731ca13812 100644 --- a/scipy/optimize/tests/test_least_squares.py +++ b/scipy/optimize/tests/test_least_squares.py @@ -13,6 +13,7 @@ from scipy.optimize._lsq.least_squares import IMPLEMENTED_LOSSES from scipy.optimize._lsq.common import EPS, make_strictly_feasible, CL_scaling_vector +from scipy.optimize import OptimizeResult def fun_trivial(x, a=0): return (x - a)**2 + 5.0 @@ -787,6 +788,73 @@ def test_basic(): assert_allclose(res.x, 0, atol=1e-10) +def test_callback(): + # test that callback function works as expected + + results = [] + + def my_callback_optimresult(intermediate_result: OptimizeResult): + results.append(intermediate_result) + + def my_callback_x(x): + r = OptimizeResult() + r.nit = 1 + r.x = x + results.append(r) + return False + + def my_callback_optimresult_stop_exception( + intermediate_result: OptimizeResult): + results.append(intermediate_result) + raise StopIteration + + def my_callback_x_stop_exception(x): + r = OptimizeResult() + r.nit = 1 + r.x = x + results.append(r) + raise StopIteration + + # Try for different function signatures and stop methods + callbacks_nostop = [my_callback_optimresult, my_callback_x] + callbacks_stop = [my_callback_optimresult_stop_exception, + my_callback_x_stop_exception] + + # Try for all the implemented methods: trf, trf_bounds and dogbox + calls = [ + lambda callback: least_squares(fun_trivial, 5.0, method='trf', + callback=callback), + lambda callback: least_squares(fun_trivial, 5.0, method='trf', + bounds=(-8.0, 8.0), callback=callback), + lambda callback: least_squares(fun_trivial, 5.0, method='dogbox', + callback=callback) + ] + + for mycallback, call in product(callbacks_nostop, calls): + results.clear() + # Call the different implemented methods + res = call(mycallback) + # Check that callback was called + assert len(results) > 0 + # Check that results data makes sense + assert results[-1].nit > 0 + # Check that it didn't stop because of the callback + assert res.status != -2 + # final callback x should be same as final result + assert_allclose(results[-1].x, res.x) + + for mycallback, call in product(callbacks_stop, calls): + results.clear() + # Call the different implemented methods + res = call(mycallback) + # Check that callback was called + assert len(results) > 0 + # Check that only one iteration was run + assert results[-1].nit == 1 + # Check that it stopped because of the callback + assert res.status == -2 + + def test_small_tolerances_for_lm(): for ftol, xtol, gtol in [(None, 1e-13, 1e-13), (1e-13, None, 1e-13), diff --git a/scipy/signal/__init__.py b/scipy/signal/__init__.py index 01a495ff77fb..7b9a69a58990 100644 --- a/scipy/signal/__init__.py +++ b/scipy/signal/__init__.py @@ -289,28 +289,15 @@ 
use the classes to create a reusable function instead.
 """

+# bring in the public functionality from private namespaces

-from . import _sigtools, windows
-from ._waveforms import *
-from ._max_len_seq import max_len_seq
-from ._upfirdn import upfirdn
+# mypy: ignore-errors

-from ._spline import (
-    sepfir2d
-)
+from ._support_alternative_backends import *
+from . import _support_alternative_backends
+__all__ = _support_alternative_backends.__all__
+del _support_alternative_backends, _signal_api, _delegators  # noqa: F821

-from ._spline_filters import *
-from ._filter_design import *
-from ._fir_filter_design import *
-from ._ltisys import *
-from ._lti_conversion import *
-from ._signaltools import *
-from ._savitzky_golay import savgol_coeffs, savgol_filter
-from ._spectral_py import *
-from ._short_time_fft import *
-from ._peak_finding import *
-from ._czt import *
-from .windows import get_window # keep this one in signal namespace

 # Deprecated namespaces, to be removed in v2.0.0
 from . import (
@@ -318,12 +305,6 @@
     spectral, signaltools, waveforms, wavelets, spline
 )

-# overwrite supported names/objects star-imported above
-from ._support_alternative_backends import *
-
-__all__ = [
-    s for s in dir() if not s.startswith("_")
-]

 from scipy._lib._testutils import PytestTester
 test = PytestTester(__name__)
diff --git a/scipy/signal/_delegators.py b/scipy/signal/_delegators.py
new file mode 100644
index 000000000000..05119be499e4
--- /dev/null
+++ b/scipy/signal/_delegators.py
@@ -0,0 +1,560 @@
+"""Delegators for alternative backends in scipy.signal.
+
+The signature of `func_signature` must match the signature of `signal.func`.
+The job of a `func_signature` is to know which arguments of `signal.func`
+are arrays.
+
+* signatures were generated by
+
+--------------
+    import inspect
+    from scipy import signal
+
+    names = [x for x in dir(signal) if not x.startswith('_')]
+    objs = [getattr(signal, name) for name in names]
+    funcs = [obj for obj in objs if inspect.isroutine(obj)]
+
+    for func in funcs:
+        try:
+            sig = inspect.signature(func)
+        except ValueError:
+            sig = "( FIXME )"
+        print(f"def {func.__name__}_signature{sig}:\n\treturn array_namespace(...)\n\n")
+--------------
+
+* which arguments to delegate on: manually trawled the documentation for
+  array-like and array arguments
+
+"""
+import numpy as np
+from scipy._lib._array_api import array_namespace
+from scipy.ndimage._ni_support import _skip_if_int
+
+
+def _skip_if_lti(arg):
+    """Handle `system` arg overloads.
+
+    At the moment, only pass tuples through. Consider updating when the
+    cupyx.lti class is supported.
+    """
+    if isinstance(arg, tuple):
+        return arg
+    else:
+        return (None,)
+
+
+def _skip_if_str_or_tuple(window):
+    """Handle `window` being a str or a tuple or an array-like.
+ """ + if isinstance(window, str) or isinstance(window, tuple) or callable(window): + return None + else: + return window + + +def _skip_if_poly1d(arg): + return None if isinstance(arg, np.poly1d) else arg + +################### + +def abcd_normalize_signature(A=None, B=None, C=None, D=None): + return array_namespace(A, B, C, D) + + +def argrelextrema_signature(data, *args, **kwds): + return array_namespace(data) + +argrelmax_signature = argrelextrema_signature +argrelmin_signature = argrelextrema_signature + + +def band_stop_obj_signature(wp, ind, passb, stopb, gpass, gstop, type): + return array_namespace(passb, stopb) + + +def bessel_signature(N, Wn, *args, **kwds): + return array_namespace(Wn) + +butter_signature = bessel_signature + + +def cheby2_signature(N, rs, Wn, *args, **kwds): + return array_namespace(Wn) + + +def cheby1_signature(N, rp, Wn, *args, **kwds): + return array_namespace(Wn) + + +def ellip_signature(N, rp, rs, Wn, *args, **kwds): + return array_namespace(Wn) + + +########################## XXX: no arrays in, arrays out +def besselap_signature(N, norm='phase'): + return np + +def buttap_signature(N): + return np + +def cheb1ap_signature(N, rp): + return np + + +def cheb2ap_signature(N, rs): + return np + +def ellipap_signature(N, rp, rs): + return np + +def correlation_lags_signature(in1_len, in2_len, mode='full'): + return np + + +def czt_points_signature(m, w=None, a=(1+0j)): + return np + + +def gammatone_signature(freq, ftype, order=None, numtaps=None, fs=None): + return np + + +def iircomb_signature(w0, Q, ftype='notch', fs=2.0, *, pass_zero=False): + return np + + +def iirnotch_signature(w0, Q, fs=2.0): + return np + + +def iirpeak_signature(w0, Q, fs=2.0): + return np + + +def savgol_coeffs_signature( + window_length, polyorder, deriv=0, delta=1.0, pos=None, use='conv' +): + return np + + +def unit_impulse_signature(shape, idx=None, dtype=float): + return np +############################ + + +####################### XXX: no arrays, maybe arrays out +def buttord_signature(wp, ws, gpass, gstop, analog=False, fs=None): + return np + +def cheb1ord_signature(wp, ws, gpass, gstop, analog=False, fs=None): + return np + +def cheb2ord_signature(wp, ws, gpass, gstop, analog=False, fs=None): + return np + +def ellipord_signature(wp, ws, gpass, gstop, analog=False, fs=None): + return np +########################################### + + +########### NB: scalars in, scalars out +def kaiser_atten_signature(numtaps, width): + return np + +def kaiser_beta_signature(a): + return np + +def kaiserord_signature(ripple, width): + return np + +def get_window_signature(window, Nx, fftbins=True): + return np +################################# + + +def bode_signature(system, w=None, n=100): + return array_namespace(*_skip_if_lti(system), w) + +dbode_signature = bode_signature + + +def freqresp_signature(system, w=None, n=10000): + return array_namespace(*_skip_if_lti(system), w) + +dfreqresp_signature = freqresp_signature + + +def impulse_signature(system, X0=None, T=None, N=None): + return array_namespace(*_skip_if_lti(system), X0, T) + + +def dimpulse_signature(system, x0=None, t=None, n=None): + return array_namespace(*_skip_if_lti(system), x0, t) + + +def lsim_signature(system, U, T, X0=None, interp=True): + return array_namespace(*_skip_if_lti(system), U, T, X0) + + +def dlsim_signature(system, u, t=None, x0=None): + return array_namespace(*_skip_if_lti(system), u, t, x0) + + +def step_signature(system, X0=None, T=None, N=None): + return array_namespace(*_skip_if_lti(system), X0, T) 
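A note on how these `*_signature` delegators are consumed (see the decoration loop in `_support_alternative_backends.py` later in this patch): a delegator never computes anything itself, it only names the array namespace of the array-like arguments. A minimal self-contained sketch of the pattern follows; `delegate_xp_sketch` and `convolve_signature_sketch` are hypothetical names for illustration, and the real decorator additionally looks up backend-native implementations rather than always falling back.

    import functools
    import numpy as np

    def convolve_signature_sketch(in1, in2, *args, **kwds):
        # a real delegator calls scipy._lib._array_api.array_namespace here
        return np

    def delegate_xp_sketch(delegator, module_name):
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwds):
                xp = delegator(*args, **kwds)
                # SciPy's decorator would consult `xp` for a native
                # implementation (e.g. cupyx.scipy.signal.convolve);
                # this sketch always uses the NumPy/SciPy fallback.
                return func(*args, **kwds)
            return wrapper
        return decorator

    @delegate_xp_sketch(convolve_signature_sketch, 'signal')
    def convolve_sketch(in1, in2, mode='full'):
        return np.convolve(in1, in2, mode)

Keeping the delegators in their own module lets the decoration step be built generically from `_signal_api.__all__`, as the loop at the end of `_support_alternative_backends.py` shows.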
+
+def dstep_signature(system, x0=None, t=None, n=None):
+    return array_namespace(*_skip_if_lti(system), x0, t)
+
+
+def cont2discrete_signature(system, dt, method='zoh', alpha=None):
+    return array_namespace(*_skip_if_lti(system))
+
+
+def bilinear_signature(b, a, fs=1.0):
+    return array_namespace(b, a)
+
+
+def bilinear_zpk_signature(z, p, k, fs):
+    return array_namespace(z, p)
+
+
+def chirp_signature(t, *args, **kwds):
+    return array_namespace(t)
+
+
+############## XXX: array-likes in, str out
+def choose_conv_method_signature(in1, in2, *args, **kwds):
+    return array_namespace(in1, in2)
+############################################
+
+
+def convolve_signature(in1, in2, *args, **kwds):
+    return array_namespace(in1, in2)
+
+fftconvolve_signature = convolve_signature
+oaconvolve_signature = convolve_signature
+correlate_signature = convolve_signature
+convolve2d_signature = convolve_signature
+correlate2d_signature = convolve_signature
+
+
+def coherence_signature(x, y, fs=1.0, window='hann', *args, **kwds):
+    return array_namespace(x, y, _skip_if_str_or_tuple(window))
+
+
+def csd_signature(x, y, fs=1.0, window='hann', *args, **kwds):
+    return array_namespace(x, y, _skip_if_str_or_tuple(window))
+
+
+def periodogram_signature(x, fs=1.0, window='boxcar'):
+    return array_namespace(x, _skip_if_str_or_tuple(window))
+
+
+def welch_signature(x, fs=1.0, window='hann', *args, **kwds):
+    return array_namespace(x, _skip_if_str_or_tuple(window))
+
+
+def spectrogram_signature(x, fs=1.0, window=('tukey', 0.25), *args, **kwds):
+    return array_namespace(x, _skip_if_str_or_tuple(window))
+
+
+def stft_signature(x, fs=1.0, window='hann', *args, **kwds):
+    return array_namespace(x, _skip_if_str_or_tuple(window))
+
+
+def istft_signature(Zxx, fs=1.0, window='hann', *args, **kwds):
+    return array_namespace(Zxx, _skip_if_str_or_tuple(window))
+
+
+def resample_signature(x, num, t=None, axis=0, window=None, domain='time'):
+    return array_namespace(x, t, _skip_if_str_or_tuple(window))
+
+
+def resample_poly_signature(x, up, down, axis=0, window=('kaiser', 5.0), *args, **kwds):
+    return array_namespace(x, _skip_if_str_or_tuple(window))
+
+
+def check_COLA_signature(window, nperseg, noverlap, tol=1e-10):
+    return array_namespace(_skip_if_str_or_tuple(window))
+
+
+def check_NOLA_signature(window, nperseg, noverlap, tol=1e-10):
+    return array_namespace(_skip_if_str_or_tuple(window))
+
+
+def czt_signature(x, *args, **kwds):
+    return array_namespace(x)
+
+decimate_signature = czt_signature
+gauss_spline_signature = czt_signature
+
+
+def deconvolve_signature(signal, divisor):
+    return array_namespace(signal, divisor)
+
+
+def detrend_signature(data, axis=-1, type='linear', bp=0, *args, **kwds):
+    return array_namespace(data, _skip_if_int(bp))
+
+
+def filtfilt_signature(b, a, x, *args, **kwds):
+    return array_namespace(b, a, x)
+
+
+def lfilter_signature(b, a, x, axis=-1, zi=None):
+    return array_namespace(b, a, x, zi)
+
+
+def find_peaks_signature(
+    x, height=None, threshold=None, distance=None, prominence=None, width=None,
+    wlen=None, rel_height=0.5, plateau_size=None
+):
+    return array_namespace(x, height, threshold, prominence, width, plateau_size)
+
+
+def find_peaks_cwt_signature(
+    vector, widths, wavelet=None, max_distances=None, *args, **kwds
+):
+    return array_namespace(vector, widths, max_distances)
+
+
+def findfreqs_signature(num, den, N, kind='ba'):
+    return array_namespace(num, den)
+
+
+def firls_signature(numtaps, bands, desired, *, weight=None,
fs=None): + return array_namespace(bands, desired, weight) + + +def firwin_signature(numtaps, cutoff, *args, **kwds): + return array_namespace(cutoff) + + +def firwin2_signature(numtaps, freq, gain, *args, **kwds): + return array_namespace(freq, gain) + + +def freqs_zpk_signature(z, p, k, worN, *args, **kwds): + return array_namespace(z, p, _skip_if_int(worN)) + +freqz_zpk_signature = freqs_zpk_signature + + +def freqs_signature(b, a, worN=200, *args, **kwds): + return array_namespace(b, a, _skip_if_int(worN)) + +freqz_signature = freqs_signature + + +def freqz_sos_signature(sos, worN=512, *args, **kwds): + return array_namespace(sos, _skip_if_int(worN)) + +sosfreqz_signature = freqz_sos_signature + + +def gausspulse_signature(t, *args, **kwds): + arr_t = None if isinstance(t, str) else t + return array_namespace(arr_t) + + +def group_delay_signature(system, w=512, whole=False, fs=6.283185307179586): + return array_namespace(_skip_if_str_or_tuple(system), _skip_if_int(w)) + + +def hilbert_signature(x, N=None, axis=-1): + return array_namespace(x) + +hilbert2_signature = hilbert_signature + + +def iirdesign_signature(wp, ws, *args, **kwds): + return array_namespace(wp, ws) + + +def iirfilter_signature(N, Wn, *args, **kwds): + return array_namespace(Wn) + + +def invres_signature(r, p, k, tol=0.001, rtype='avg'): + return array_namespace(r, p, k) + +invresz_signature = invres_signature + + +############################### XXX: excluded, blacklisted on CuPy (mismatched API) +def lfilter_zi_signature(b, a): + return array_namespace(b, a) + +def sosfilt_zi_signature(sos): + return array_namespace(sos) + +# needs to be blacklisted on CuPy (is not implemented) +def remez_signature(numtaps, bands, desired, *, weight=None, **kwds): + return array_namespace(bands, desired, weight) +############################################# + +def lfiltic_signature(b, a, y, x=None): + return array_namespace(b, a, y, x) + + +def lombscargle_signature( + x, y, freqs, precenter=False, normalize=False, *, + weights=None, floating_mean=False +): + return array_namespace(x, y, freqs, weights) + + +def lp2bp_signature(b, a, *args, **kwds): + return array_namespace(b, a) + +lp2bs_signature = lp2bp_signature +lp2hp_signature = lp2bp_signature +lp2lp_signature = lp2bp_signature + +tf2zpk_signature = lp2bp_signature +tf2sos_signature = lp2bp_signature + +normalize_signature = lp2bp_signature +residue_signature = lp2bp_signature +residuez_signature = residue_signature + + +def lp2bp_zpk_signature(z, p, k, *args, **kwds): + return array_namespace(z, p) + +lp2bs_zpk_signature = lp2bp_zpk_signature +lp2hp_zpk_signature = lp2bs_zpk_signature +lp2lp_zpk_signature = lp2bs_zpk_signature + + +def zpk2sos_signature(z, p, k, *args, **kwds): + return array_namespace(z, p) + +zpk2ss_signature = zpk2sos_signature +zpk2tf_signature = zpk2sos_signature + + +def max_len_seq_signature(nbits, state=None, length=None, taps=None): + return array_namespace(state, taps) + + +def medfilt_signature(volume, kernel_size=None): + return array_namespace(volume) + + +def medfilt2d_signature(input, kernel_size=3): + return array_namespace(input) + + +def minimum_phase_signature(h, *args, **kwds): + return array_namespace(h) + + +def order_filter_signature(a, domain, rank): + return array_namespace(a, domain) + + +def peak_prominences_signature(x, peaks, *args, **kwds): + return array_namespace(x, peaks) + + +peak_widths_signature = peak_prominences_signature + + +def place_poles_signature(A, B, poles, method='YT', rtol=0.001, maxiter=30): + return 
array_namespace(A, B, poles) + + +def savgol_filter_signature(x, *args, **kwds): + return array_namespace(x) + + +def sawtooth_signature(t, width=1): + return array_namespace(t) + + +def sepfir2d_signature(input, hrow, hcol): + return array_namespace(input, hrow, hcol) + + +def sos2tf_signature(sos): + return array_namespace(sos) + + +sos2zpk_signature = sos2tf_signature + + +def sosfilt_signature(sos, x, axis=-1, zi=None): + return array_namespace(sos, x, zi) + + +def sosfiltfilt_signature(sos, x, *args, **kwds): + return array_namespace(sos, x) + + +def spline_filter_signature(Iin, lmbda=5.0): + return array_namespace(Iin) + + +def square_signature(t, duty=0.5): + return array_namespace(t) + + +def ss2tf_signature(A, B, C, D, input=0): + return array_namespace(A, B, C, D) + +ss2zpk_signature = ss2tf_signature + + +def sweep_poly_signature(t, poly, phi=0): + return array_namespace(t, _skip_if_poly1d(poly)) + + +def symiirorder1_signature(signal, c0, z1, precision=-1.0): + return array_namespace(signal) + + +def symiirorder2_signature(input, r, omega, precision=-1.0): + return array_namespace(input) + + +def cspline1d_signature(signal, *args, **kwds): + return array_namespace(signal) + +qspline1d_signature = cspline1d_signature +cspline2d_signature = cspline1d_signature +qspline2d_signature = qspline1d_signature + + +def cspline1d_eval_signature(cj, newx, *args, **kwds): + return array_namespace(cj, newx) + +qspline1d_eval_signature = cspline1d_eval_signature + + +def tf2ss_signature(num, den): + return array_namespace(num, den) + + +def unique_roots_signature(p, tol=0.001, rtype='min'): + return array_namespace(p) + + +def upfirdn_signature(h, x, up=1, down=1, axis=-1, mode='constant', cval=0): + return array_namespace(h, x) + + +def vectorstrength_signature(events, period): + return array_namespace(events, period) + + +def wiener_signature(im, mysize=None, noise=None): + return array_namespace(im) + + +def zoom_fft_signature(x, fn, m=None, *, fs=2, endpoint=False, axis=-1): + return array_namespace(x, fn) + diff --git a/scipy/signal/_filter_design.py b/scipy/signal/_filter_design.py index 0f177247c602..459d7087c786 100644 --- a/scipy/signal/_filter_design.py +++ b/scipy/signal/_filter_design.py @@ -1183,26 +1183,6 @@ def zpk2tf(z, p, k): b = k * poly(z) a = atleast_1d(poly(p)) - # Use real output if possible. Copied from np.poly, since - # we can't depend on a specific version of numpy. - if issubclass(b.dtype.type, np.complexfloating): - # if complex roots are all complex conjugates, the roots are real. - roots = np.asarray(z, complex) - pos_roots = np.compress(roots.imag > 0, roots) - neg_roots = np.conjugate(np.compress(roots.imag < 0, roots)) - if len(pos_roots) == len(neg_roots): - if np.all(np.sort_complex(neg_roots) == np.sort_complex(pos_roots)): - b = b.real.copy() - - if issubclass(a.dtype.type, np.complexfloating): - # if complex roots are all complex conjugates, the roots are real. 
- roots = np.asarray(p, complex) - pos_roots = np.compress(roots.imag > 0, roots) - neg_roots = np.conjugate(np.compress(roots.imag < 0, roots)) - if len(pos_roots) == len(neg_roots): - if np.all(np.sort_complex(neg_roots) == np.sort_complex(pos_roots)): - a = a.real.copy() - return b, a diff --git a/scipy/signal/_short_time_fft.py b/scipy/signal/_short_time_fft.py index bd52e67839bd..a0b49eb789f1 100644 --- a/scipy/signal/_short_time_fft.py +++ b/scipy/signal/_short_time_fft.py @@ -26,7 +26,7 @@ import numpy as np import scipy.fft as fft_lib -from scipy.signal import detrend +from scipy.signal._signaltools import detrend from scipy.signal.windows import get_window __all__ = ['ShortTimeFFT'] diff --git a/scipy/signal/_signal_api.py b/scipy/signal/_signal_api.py new file mode 100644 index 000000000000..8a099dd4ee72 --- /dev/null +++ b/scipy/signal/_signal_api.py @@ -0,0 +1,30 @@ +"""This is the 'bare' scipy.signal API. + +This --- private! --- module only collects implementations of public API +for _support_alternative_backends. +The latter --- also private! --- module adds delegation to CuPy etc and +re-exports decorated names to __init__.py +""" + +from . import _sigtools, windows # noqa: F401 +from ._waveforms import * # noqa: F403 +from ._max_len_seq import max_len_seq # noqa: F401 +from ._upfirdn import upfirdn # noqa: F401 + +from ._spline import sepfir2d # noqa: F401 + +from ._spline_filters import * # noqa: F403 +from ._filter_design import * # noqa: F403 +from ._fir_filter_design import * # noqa: F403 +from ._ltisys import * # noqa: F403 +from ._lti_conversion import * # noqa: F403 +from ._signaltools import * # noqa: F403 +from ._savitzky_golay import savgol_coeffs, savgol_filter # noqa: F401 +from ._spectral_py import * # noqa: F403 +from ._short_time_fft import * # noqa: F403 +from ._peak_finding import * # noqa: F403 +from ._czt import * # noqa: F403 +from .windows import get_window # keep this one in signal namespace # noqa: F401 + + +__all__ = [s for s in dir() if not s.startswith('_')] diff --git a/scipy/signal/_signaltools.py b/scipy/signal/_signaltools.py index 3f42ba3092be..dc6d3d3bde11 100644 --- a/scipy/signal/_signaltools.py +++ b/scipy/signal/_signaltools.py @@ -716,7 +716,7 @@ def fftconvolve(in1, in2, mode="full", axes=None): elif in1.ndim != in2.ndim: raise ValueError("in1 and in2 should have the same dimensionality") elif xp_size(in1) == 0 or xp_size(in2) == 0: # empty arrays - return xp.array([]) + return xp.asarray([]) in1, in2, axes = _init_freq_conv_axes(in1, in2, mode, axes, sorted_axes=False) @@ -933,7 +933,7 @@ def oaconvolve(in1, in2, mode="full", axes=None): elif in1.ndim != in2.ndim: raise ValueError("in1 and in2 should have the same dimensionality") elif in1.size == 0 or in2.size == 0: # empty arrays - return np.array([]) + return xp.asarray([]) elif in1.shape == in2.shape: # Equivalent to fftconvolve return fftconvolve(in1, in2, mode=mode, axes=axes) diff --git a/scipy/signal/_support_alternative_backends.py b/scipy/signal/_support_alternative_backends.py index 8ea6cd2d78b0..2cc8f6a8c950 100644 --- a/scipy/signal/_support_alternative_backends.py +++ b/scipy/signal/_support_alternative_backends.py @@ -1,13 +1,13 @@ -import sys import functools from scipy._lib._array_api import ( - array_namespace, is_cupy, is_jax, scipy_namespace_for, SCIPY_ARRAY_API + is_cupy, is_jax, scipy_namespace_for, SCIPY_ARRAY_API ) -from ._signaltools import (convolve, fftconvolve, convolve2d, oaconvolve, - correlate, correlate2d, order_filter, medfilt, medfilt2d, - wiener, 
detrend, hilbert, hilbert2, lfilter, deconvolve, - sosfilt, sosfiltfilt, sosfilt_zi, lfilter_zi, - filtfilt,) + +from ._signal_api import * # noqa: F403 +from . import _signal_api +from . import _delegators +__all__ = _signal_api.__all__ + MODULE_NAME = 'signal' @@ -53,111 +53,15 @@ def wrapper(*args, **kwds): -# X_signature signature must match the signature of X - -def convolve_signature(in1, in2, *args, **kwds): - xp = array_namespace(in1, in2) - return xp - -fftconvolve_signature = convolve_signature -oaconvolve_signature = convolve_signature -correlate_signature = convolve_signature -correlate_signature = convolve_signature -convolve2d_signature = convolve_signature -correlate2d_signature = convolve_signature - - -def medfilt_signature(volume, kernel_size=None): - xp = array_namespace(volume) - return xp - - -def medfilt2d_signature(input, kernel_size=3): - xp = array_namespace(input) - return xp - - -def order_filter_signature(a, domain, rank): - xp = array_namespace(a, domain) - return xp - - -def wiener_signature(im, mysize=None, noise=None): - xp = array_namespace(im) - return xp - - -def detrend_signature(data, axis=-1, type='linear', bp=0, overwrite_data=False): - xp = array_namespace(data, None if isinstance(bp, int) else bp) - return xp - - -def hilbert_signature(x, *args, **kwds): - xp = array_namespace(x) - return xp - -hilbert2_signature = hilbert_signature - - -def lfilter_signature(b, a, x, axis=-1, zi=None): - return array_namespace(b, a, x, zi) - - -def lfilter_zi_signature(b, a): - return array_namespace(b, a) - - -def filtfilt_signature(b, a, x, *args, **kwds): - return array_namespace(b, a, x) - - -def sosfilt_signature(sos, x, axis=-1, zi=None): - return array_namespace(sos, x, zi) - - -def sosfilt_zi_signature(sos): - return array_namespace(sos) - - -def sosfiltfilt_signature(sos, x, axis=-1, padtype='odd', padlen=None): - return array_namespace(sos, x) - - -def deconvolve_signature(signal, divisor): - return array_namespace(signal, divisor) - - -# functions we patch for dispatch -_FUNC_MAP = { - convolve: convolve_signature, - fftconvolve: fftconvolve_signature, - oaconvolve: oaconvolve_signature, - correlate: correlate_signature, - convolve2d: convolve2d_signature, - correlate2d: correlate2d_signature, - medfilt: medfilt_signature, - medfilt2d: medfilt2d_signature, - order_filter: order_filter_signature, - wiener: wiener_signature, - detrend: detrend_signature, - hilbert: hilbert_signature, - hilbert2: hilbert2_signature, - lfilter: lfilter_signature, - lfilter_zi: lfilter_zi_signature, - deconvolve: deconvolve_signature, - sosfilt: sosfilt_signature, - sosfiltfilt: sosfiltfilt_signature, - sosfilt_zi : sosfilt_zi_signature, - filtfilt: filtfilt_signature, -} - - # ### decorate ### -for func in _FUNC_MAP: - f = (delegate_xp(_FUNC_MAP[func], MODULE_NAME)(func) - if SCIPY_ARRAY_API - else func) - sys.modules[__name__].__dict__[func.__name__] = f +for obj_name in _signal_api.__all__: + bare_obj = getattr(_signal_api, obj_name) + delegator = getattr(_delegators, obj_name + "_signature", None) + if SCIPY_ARRAY_API and delegator is not None: + f = delegate_xp(delegator, MODULE_NAME)(bare_obj) + else: + f = bare_obj -__all__ = [f.__name__ for f in _FUNC_MAP] + # add the decorated function to the namespace, to be imported in __init__.py + vars()[obj_name] = f diff --git a/scipy/signal/_wavelets.py b/scipy/signal/_wavelets.py index 2b9f8fa32672..9da68aed88ed 100644 --- a/scipy/signal/_wavelets.py +++ b/scipy/signal/_wavelets.py @@ -1,5 +1,5 @@ import numpy as np -from 
scipy.signal import convolve
+from scipy.signal._signaltools import convolve


 def _ricker(points, a):
diff --git a/scipy/signal/meson.build b/scipy/signal/meson.build
index 2699f975d88e..655abb9aa683 100644
--- a/scipy/signal/meson.build
+++ b/scipy/signal/meson.build
@@ -62,6 +62,9 @@ py3.extension_module('_spline',

 py3.install_sources([
   '__init__.py',
+  '_support_alternative_backends.py',
+  '_signal_api.py',
+  '_delegators.py',
   '_arraytools.py',
   '_spline_filters.py',
   '_czt.py',
@@ -89,7 +92,6 @@ py3.install_sources([
   'spline.py',
   'waveforms.py',
   'wavelets.py',
-  '_support_alternative_backends.py',
 ],
   subdir: 'scipy/signal'
 )
diff --git a/scipy/signal/tests/test_filter_design.py b/scipy/signal/tests/test_filter_design.py
index 62613b5bb64e..863aa2bf2e50 100644
--- a/scipy/signal/tests/test_filter_design.py
+++ b/scipy/signal/tests/test_filter_design.py
@@ -215,6 +215,35 @@ def test_identity(self):
         xp_assert_equal(a, a_r)
         assert isinstance(a, np.ndarray)

+    def test_conj_pair(self):
+        # conjugate pairs give real-coeff num & den
+        z = np.array([1j, -1j, 2j, -2j])
+        # shouldn't need elements of pairs to be adjacent
+        p = np.array([1+1j, 3-100j, 3+100j, 1-1j])
+        k = 23
+
+        # np.poly should do the right thing, but be explicit about
+        # taking real part
+        b = k * np.poly(z).real
+        a = np.poly(p).real
+
+        bp, ap = zpk2tf(z, p, k)
+
+        xp_assert_close(b, bp)
+        xp_assert_close(a, ap)
+
+        assert np.isrealobj(bp)
+        assert np.isrealobj(ap)
+
+    def test_complexk(self):
+        # regression: z, p real but k complex used to give real b, a
+        b, a = np.array([1j, 1j]), np.array([1.0, 2])
+        z, p, k = tf2zpk(b, a)
+        xp_assert_close(k, 1j)
+        bp, ap = zpk2tf(z, p, k)
+        xp_assert_close(b, bp)
+        xp_assert_close(a, ap)
+

 class TestSos2Zpk:
diff --git a/scipy/signal/tests/test_signaltools.py b/scipy/signal/tests/test_signaltools.py
index 080b8a291bee..336ac203c78d 100644
--- a/scipy/signal/tests/test_signaltools.py
+++ b/scipy/signal/tests/test_signaltools.py
@@ -80,11 +80,19 @@ def test_complex(self, xp):
         z = convolve(x, y)
         xp_assert_equal(z, xp.asarray([2j, 2 + 6j, 5 + 8j, 5 + 5j]))

+    @xfail_xp_backends("jax.numpy", reason="wrong output dtype")
     def test_zero_rank(self, xp):
+        a = xp.asarray(1289)
+        b = xp.asarray(4567)
+        c = convolve(a, b)
+        xp_assert_equal(c, a * b)
+
+    @skip_xp_backends(np_only=True, reason="pure python")
+    def test_zero_rank_python_scalars(self, xp):
         a = 1289
         b = 4567
         c = convolve(a, b)
-        xp_assert_equal(c, a * b)
+        assert c == a * b

     def test_broadcastable(self, xp):
         a = xp.reshape(xp.arange(27), (3, 3, 3))
@@ -97,9 +105,10 @@ def test_broadcastable(self, xp):
             y = convolve(a, xp.reshape(b, b_shape), method='fft')
             xp_assert_close(x, y, atol=1e-14)

+    @xfail_xp_backends("jax.numpy", reason="wrong output dtype")
     def test_single_element(self, xp):
-        a = np.array([4967])
-        b = np.array([3920])
+        a = xp.asarray([4967])
+        b = xp.asarray([3920])
         c = convolve(a, b)
         xp_assert_equal(c, a * b)

@@ -289,7 +298,7 @@ def test_dtype_deprecation(self, xp):
             convolve(a, b)


-
+@pytest.mark.filterwarnings("ignore::FutureWarning:dask")
 @skip_xp_backends(cpu_only=True, exceptions=['cupy'])
 class TestConvolve2d:

@@ -506,7 +515,7 @@ def test_large_array(self, xp):
         assert fails[0].size == 0


-
+@pytest.mark.filterwarnings("ignore::FutureWarning:dask")
 @skip_xp_backends(cpu_only=True, exceptions=['cupy'])
 class TestFFTConvolve:

@@ -814,11 +823,15 @@ def test_valid_mode_ignore_nonaxes(self, xp):
         out = fftconvolve(a, b, 'valid', axes=1)
         xp_assert_close(out, expected, atol=1.5e-6)

-    def test_empty(self, xp):
+    @xfail_xp_backends("cupy",
reason="dtypes do not match") + @xfail_xp_backends("jax.numpy", reason="assorted error messages") + @pytest.mark.parametrize("a,b", [([], []), ([5, 6], []), ([], [7])]) + def test_empty(self, a, b, xp): # Regression test for #1745: crashes with 0-length input. - assert fftconvolve([], []).size == 0 - assert fftconvolve([5, 6], []).size == 0 - assert fftconvolve([], [7]).size == 0 + xp_assert_equal( + fftconvolve(xp.asarray(a), xp.asarray(b)), + xp.asarray([]), + ) @skip_xp_backends("jax.numpy", reason="jnp.pad: pad_width with nd=0") def test_zero_rank(self, xp): @@ -960,10 +973,14 @@ def gen_oa_shapes_eq(sizes): @skip_xp_backends(cpu_only=True, exceptions=['cupy']) @skip_xp_backends("jax.numpy", reason="fails all around") +@skip_xp_backends("dask.array", + reason="Gets converted to numpy at some point for some reason. " + "Probably also suffers from boolean indexing issues" +) class TestOAConvolve: @pytest.mark.slow() @pytest.mark.parametrize('shape_a_0, shape_b_0', - gen_oa_shapes_eq(list(range(100)) + + gen_oa_shapes_eq(list(range(1, 100, 1)) + list(range(100, 1000, 23))) ) def test_real_manylens(self, shape_a_0, shape_b_0, xp): @@ -1092,12 +1109,14 @@ def test_2d_axes(self, axes, shape_a_0, shape_b_0, assert_array_almost_equal(out, expected) - @skip_xp_backends(np_only=True) - def test_empty(self, xp): + @xfail_xp_backends("torch", reason="ValueError: Target length must be positive") + @pytest.mark.parametrize("a,b", [([], []), ([5, 6], []), ([], [7])]) + def test_empty(self, a, b, xp): # Regression test for #1745: crashes with 0-length input. - assert oaconvolve([], []).size == 0 - assert oaconvolve([5, 6], []).size == 0 - assert oaconvolve([], [7]).size == 0 + xp_assert_equal( + oaconvolve(xp.asarray(a), xp.asarray(b)), + xp.asarray([]), + ) def test_zero_rank(self, xp): a = xp.asarray(4967) @@ -1106,8 +1125,8 @@ def test_zero_rank(self, xp): xp_assert_equal(out, a * b) def test_single_element(self, xp): - a = np.asarray([4967]) - b = np.asarray([3920]) + a = xp.asarray([4967]) + b = xp.asarray([3920]) out = oaconvolve(a, b) xp_assert_equal(out, a * b) @@ -1615,8 +1634,9 @@ def test_complex(self, xp): class TestOrderFilt: def test_basic(self, xp): - xp_assert_equal(signal.order_filter([1, 2, 3], [1, 0, 1], 1), - [2, 3, 2]) + actual = signal.order_filter(xp.asarray([1, 2, 3]), xp.asarray([1, 0, 1]), 1) + expect = xp.asarray([2, 3, 2]) + xp_assert_equal(actual, expect) @skip_xp_backends(cpu_only=True, exceptions=['cupy']) @@ -2440,7 +2460,9 @@ def decimal(self, dt, xp): dt = np.cdouble # emulate np.finfo(dt).precision for complex64 and complex128 - prec = {64: 15, 32: 6}[xp.finfo(dt).bits] + # note: unwrapped dask has no finfo + xp_compat = array_namespace(xp.asarray(1)) + prec = {64: 15, 32: 6}[xp_compat.finfo(dt).bits] return int(2 * prec / 3) def _setup_rank1(self, dt, mode, xp): @@ -2559,13 +2581,13 @@ def test_consistency_correlate_funcs(self, xp): a_xp, b_xp = xp.asarray(a), xp.asarray(b) np_corr_result = np.correlate(a, b, mode=mode) assert_almost_equal(signal.correlate(a_xp, b_xp, mode=mode), - xp.asarray(np_corr_result), check_namespace=False) + xp.asarray(np_corr_result)) # See gh-5897 if mode == 'valid': np_corr_result = np.correlate(b, a, mode=mode) assert_almost_equal(signal.correlate(b_xp, a_xp, mode=mode), - xp.asarray(np_corr_result), check_namespace=False) + xp.asarray(np_corr_result)) @skip_xp_backends(np_only=True) # XXX def test_consistency_correlate_funcs_2(self, xp): @@ -2766,6 +2788,7 @@ def test_gust_scalars(self, xp): xp_assert_close(y, expected) 
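One workaround in the hunks above deserves a pointer: the `decimal` helper wraps a throwaway array in `array_namespace` because bare `dask.array` exposes no `finfo`, while the array-api-compat namespace returned by `array_namespace` does. A minimal illustration, assuming `dask` is installed and SciPy runs with the `SCIPY_ARRAY_API=1` environment these CI jobs use (`scipy._lib._array_api` is private; it appears here only because the tests themselves use it):

    import dask.array as da
    from scipy._lib._array_api import array_namespace

    x = da.ones(3, dtype='float32')
    xp_compat = array_namespace(x)        # array-api-compat wrapped dask.array
    print(xp_compat.finfo(x.dtype).bits)  # 32, while `da` itself has no `finfo`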
+@skip_xp_backends("dask.array", reason="sosfiltfilt directly sets shape attributes on arrays which dask doesn't like") @skip_xp_backends(cpu_only=True, exceptions=['cupy']) class TestSOSFiltFilt(TestFiltFilt): filtfilt_kind = 'sos' @@ -3998,10 +4021,12 @@ def test_nonnumeric_dtypes(func, xp): # (https://github.com/cupy/cupy/pull/8677) # 3. an issue with CuPy's __array__ not numpy-2.0 compatible @skip_xp_backends(cpu_only=True) +@skip_xp_backends("dask.array", reason="sosfilt doesn't convert dask array to numpy before cython") @pytest.mark.parametrize('dt', ['float32', 'float64', 'complex64', 'complex128']) class TestSOSFilt: # The test_rank* tests are pulled from _TestLinearFilter + @skip_xp_backends('jax.numpy', reason='buffer array is read-only') def test_rank1(self, dt, xp): dt = getattr(xp, dt) @@ -4210,6 +4235,7 @@ def test_bad_zi_shape(self, dt, xp): with pytest.raises(ValueError, match='Invalid zi shape'): sosfilt(sos, x, zi=zi, axis=1) + @skip_xp_backends('jax.numpy', reason='item assignment') def test_sosfilt_zi(self, dt, xp): dt = getattr(xp, dt) @@ -4317,6 +4343,7 @@ def test_bp(self, xp): with assert_raises(ValueError): detrend(data, type="linear", bp=3) + @pytest.mark.filterwarnings("ignore::FutureWarning:dask") @pytest.mark.parametrize('bp', [np.array([0, 2]), [0, 2]]) def test_detrend_array_bp(self, bp, xp): # regression test for https://github.com/scipy/scipy/issues/18675 diff --git a/scipy/sparse/csgraph/_shortest_path.pyx b/scipy/sparse/csgraph/_shortest_path.pyx index c12e160e7e1f..b67aa6ad2cf4 100644 --- a/scipy/sparse/csgraph/_shortest_path.pyx +++ b/scipy/sparse/csgraph/_shortest_path.pyx @@ -3,7 +3,7 @@ Routines for performing shortest-path graph searches The main interface is in the function :func:`shortest_path`. This calls cython routines that compute the shortest path using -the Floyd-Warshall algorithm, Dijkstra's algorithm with Fibonacci Heaps, +the Floyd-Warshall algorithm, Dijkstra's algorithm with priority queue, the Bellman-Ford algorithm, or Johnson's Algorithm. Yen's k-Shortest Path Algorithm is available for @@ -24,9 +24,11 @@ from scipy.sparse._sputils import (convert_pydata_sparse_to_scipy, cimport cython -from libc.stdlib cimport malloc, free from libc.math cimport INFINITY +from libcpp.queue cimport priority_queue +from libcpp.pair cimport pair + np.import_array() include 'parameters.pxi' @@ -69,9 +71,11 @@ def shortest_path(csgraph, method='auto', Computational cost is approximately ``O[N^3]``. The input csgraph will be converted to a dense representation. - 'D' -- Dijkstra's algorithm with Fibonacci heaps. - Computational cost is approximately ``O[N(N*k + N*log(N))]``, - where ``k`` is the average number of connected edges per node. + 'D' -- Dijkstra's algorithm with priority queue. + Computational cost is approximately ``O[I * (E + N) * log(N)]``, + where ``E`` is the number of edges in the graph, + and ``I = len(indices)`` if ``indices`` is passed. Otherwise, + ``I = N``. The input csgraph will be converted to a csr representation. 'BF' -- Bellman-Ford algorithm. @@ -363,7 +367,7 @@ cdef void _floyd_warshall( # dist_matrix should be a [N,N] matrix, such that dist_matrix[i, j] # is the distance from point i to point j. Zero-distances imply that # the points are not connected. 
-    cdef int N = dist_matrix.shape[0]
+    cdef unsigned int N = dist_matrix.shape[0]
     assert dist_matrix.shape[1] == N
     cdef unsigned int i, j, k

@@ -431,7 +435,7 @@ def dijkstra(csgraph, directed=True, indices=None,
     dijkstra(csgraph, directed=True, indices=None,
              return_predecessors=False, unweighted=False,
              limit=np.inf, min_only=False)

-    Dijkstra algorithm using Fibonacci Heaps
+    Dijkstra's algorithm using a priority queue

     .. versionadded:: 0.11.0

@@ -598,12 +602,14 @@ def dijkstra(csgraph, directed=True, indices=None,
         else:
             predecessor_matrix = np.empty((len(indices), N), dtype=ITYPE)
             predecessor_matrix.fill(NULL_IDX)
+            source_matrix = np.empty((len(indices), 0), dtype=ITYPE)  # unused
     else:
         if min_only:
             predecessor_matrix = np.empty(0, dtype=ITYPE)
-            source_matrix = np.empty(0, dtype=ITYPE)
+            source_matrix = np.empty(0, dtype=ITYPE)  # unused
         else:
-            predecessor_matrix = np.empty((0, N), dtype=ITYPE)
+            predecessor_matrix = np.empty((len(indices), 0), dtype=ITYPE)
+            source_matrix = np.empty((len(indices), 0), dtype=ITYPE)  # unused

     if unweighted:
         csr_data = np.ones(csgraph.data.shape)
@@ -612,15 +618,23 @@ def dijkstra(csgraph, directed=True, indices=None,
     csr_indices, csr_indptr = safely_cast_index_arrays(csgraph, ITYPE, msg="csgraph")

     if directed:
+        # for null transposed CSR
+        dummy_double_array = np.empty(0, dtype=DTYPE)
+        dummy_int_array = np.empty(0, dtype=ITYPE)
         if min_only:
-            _dijkstra_directed_multi(indices,
-                                     csr_data, csr_indices, csr_indptr,
-                                     dist_matrix, predecessor_matrix,
-                                     source_matrix, limitf)
+            _dijkstra(indices,
+                      csr_data, csr_indices, csr_indptr,
+                      dummy_double_array, dummy_int_array, dummy_int_array,
+                      dist_matrix, predecessor_matrix, source_matrix,
+                      limitf)
         else:
-            _dijkstra_directed(indices,
-                               csr_data, csr_indices, csr_indptr,
-                               dist_matrix, predecessor_matrix, limitf)
+            _dijkstra_multi_separate(
+                indices,
+                csr_data, csr_indices, csr_indptr,
+                dummy_double_array, dummy_int_array, dummy_int_array,
+                dist_matrix, predecessor_matrix, source_matrix,
+                limitf)
+
     else:
         csrT = csgraph.T.tocsr()
         csrT_indices, csrT_indptr = safely_cast_index_arrays(csrT, ITYPE, msg="csgraph")
@@ -629,16 +643,18 @@
         else:
             csrT_data = csrT.data
         if min_only:
-            _dijkstra_undirected_multi(indices,
-                                       csr_data, csr_indices, csr_indptr,
-                                       csrT_data, csrT_indices, csrT_indptr,
-                                       dist_matrix, predecessor_matrix,
-                                       source_matrix, limitf)
+            _dijkstra(indices,
+                      csr_data, csr_indices, csr_indptr,
+                      csrT_data, csrT_indices, csrT_indptr,
+                      dist_matrix, predecessor_matrix, source_matrix,
+                      limitf)
         else:
-            _dijkstra_undirected(indices,
+            _dijkstra_multi_separate(
+                indices,
                 csr_data, csr_indices, csr_indptr,
                 csrT_data, csrT_indices, csrT_indptr,
-                                 dist_matrix, predecessor_matrix, limitf)
+                dist_matrix, predecessor_matrix, source_matrix,
+                limitf)

     if return_predecessors:
         if min_only:
@@ -651,252 +667,114 @@
     else:
         return dist_matrix.reshape(return_shape)

-@cython.boundscheck(False)
-cdef _dijkstra_setup_heap_multi(FibonacciHeap *heap,
-                                FibonacciNode* nodes,
-                                const int[:] source_indices,
-                                int[:] sources,
-                                double[:] dist_matrix,
-                                int return_pred):
-    cdef:
-        unsigned int Nind = source_indices.shape[0]
-        unsigned int N = dist_matrix.shape[0]
-        unsigned int i, k, j_source
-        FibonacciNode *current_node
-
-    for k in range(N):
-        initialize_node(&nodes[k], k)
-
-    heap.min_node = NULL
-    for i in range(Nind):
-        j_source = source_indices[i]
-        current_node = &nodes[j_source]
-        if current_node.state == SCANNED:
-            continue
-        dist_matrix[j_source] = 0
-        if return_pred:
-            sources[j_source] = j_source
-        current_node.state = SCANNED
-        current_node.source = j_source
-        insert_node(heap, &nodes[j_source])
-
-@cython.boundscheck(False)
-cdef _dijkstra_scan_heap_multi(FibonacciHeap *heap,
-                               FibonacciNode *v,
-                               FibonacciNode* nodes,
-                               const double[:] csr_weights,
-                               const int[:] csr_indices,
-                               const int[:] csr_indptr,
-                               int[:] pred,
-                               int[:] sources,
-                               int return_pred,
-                               DTYPE_t limit):
-    cdef:
-        unsigned int j_current
-        ITYPE_t j
-        DTYPE_t next_val
-        FibonacciNode *current_node
-    for j in range(csr_indptr[v.index], csr_indptr[v.index + 1]):
-        j_current = csr_indices[j]
-        current_node = &nodes[j_current]
-        if current_node.state != SCANNED:
-            next_val = v.val + csr_weights[j]
-            if next_val <= limit:
-                if current_node.state == NOT_IN_HEAP:
-                    current_node.state = IN_HEAP
-                    current_node.val = next_val
-                    current_node.source = v.source
-                    insert_node(heap, current_node)
-                    if return_pred:
-                        pred[j_current] = v.index
-                        sources[j_current] = v.source
-                elif current_node.val > next_val:
-                    current_node.source = v.source
-                    decrease_val(heap, current_node,
-                                 next_val)
-                    if return_pred:
-                        pred[j_current] = v.index
-                        sources[j_current] = v.source

+ctypedef unsigned int uint_t
+ctypedef pair[DTYPE_t, uint_t] dist_index_pair_t
+ctypedef priority_queue[dist_index_pair_t] dijkstra_queue_t

 @cython.boundscheck(False)
-cdef _dijkstra_scan_heap(FibonacciHeap *heap,
-                         FibonacciNode *v,
-                         FibonacciNode* nodes,
+cdef void _dijkstra_scan_heap(dijkstra_queue_t &heap,
+                              dist_index_pair_t v,
                          const double[:] csr_weights,
                          const int[:] csr_indices,
                          const int[:] csr_indptr,
-                         int[:, :] pred,
+                         double[:] dist_matrix,
+                         int[:] pred,
                          int return_pred,
-                         DTYPE_t limit,
-                         int i):
+                         int[:] sources,
+                         int return_source,
+                         DTYPE_t limit) noexcept nogil:
     cdef:
-        unsigned int j_current
         ITYPE_t j
+        unsigned int j_current
         DTYPE_t next_val
-        FibonacciNode *current_node
-    for j in range(csr_indptr[v.index], csr_indptr[v.index + 1]):
+    # v is a dist_index_pair_t popped from the queue
+    # v.first: the distance of the vertex
+    # v.second: index of the vertex
+    for j in range(csr_indptr[v.second], csr_indptr[v.second + 1]):
         j_current = csr_indices[j]
-        current_node = &nodes[j_current]
-        if current_node.state != SCANNED:
-            next_val = v.val + csr_weights[j]
-            if next_val <= limit:
-                if current_node.state == NOT_IN_HEAP:
-                    current_node.state = IN_HEAP
-                    current_node.val = next_val
-                    insert_node(heap, current_node)
-                    if return_pred:
-                        pred[i, j_current] = v.index
-                elif current_node.val > next_val:
-                    decrease_val(heap, current_node,
-                                 next_val)
-                    if return_pred:
-                        pred[i, j_current] = v.index
-
-@cython.boundscheck(False)
-cdef int _dijkstra_directed(
-    const int[:] source_indices,
-    const double[:] csr_weights,
-    const int[:] csr_indices,
-    const int[:] csr_indptr,
-    double[:, :] dist_matrix,
-    int[:, :] pred,
-    DTYPE_t limit) except -1:
-    cdef:
-        unsigned int Nind = dist_matrix.shape[0]
-        unsigned int N = dist_matrix.shape[1]
-        unsigned int i, k, j_source
-        int return_pred = (pred.size > 0)
-        FibonacciHeap heap
-        FibonacciNode *v
-        FibonacciNode* nodes = <FibonacciNode*> malloc(N *
-                                                       sizeof(FibonacciNode))
-    if nodes == NULL:
-        raise MemoryError("Failed to allocate memory in _dijkstra_directed")
-
-    for i in range(Nind):
-        j_source = source_indices[i]
+        next_val = v.first + csr_weights[j]
+        if next_val <= limit:
+            if dist_matrix[j_current] > next_val:
+                dist_matrix[j_current] = next_val
+                # The same vertex may be pushed multiple times to the queue, but
+                # anything with a suboptimal distance is ignored when popped
+                heap.push(dist_index_pair_t(-next_val, j_current))
+                if return_pred:
+                    pred[j_current] = v.second
+                if return_source:
+                    sources[j_current] = sources[v.second]

-        for k in range(N):
-            initialize_node(&nodes[k], k)
-
-        dist_matrix[i, j_source] = 0
-        heap.min_node = NULL
-        insert_node(&heap, &nodes[j_source])
-
-        while heap.min_node:
-            v = remove_min(&heap)
-            v.state = SCANNED
-
-            _dijkstra_scan_heap(&heap, v, nodes,
-                                csr_weights, csr_indices, csr_indptr,
-                                pred, return_pred, limit, i)
-
-            # v has now been scanned: add the distance to the results
-            dist_matrix[i, v.index] = v.val
-
-    free(nodes)
-    return 0

 @cython.boundscheck(False)
-cdef int _dijkstra_directed_multi(
+cdef int _dijkstra(
     const int[:] source_indices,
     const double[:] csr_weights,
     const int[:] csr_indices,
     const int[:] csr_indptr,
+    const double[:] csrT_weights,
+    const int[:] csrT_indices,
+    const int[:] csrT_indptr,
     double[:] dist_matrix,
     int[:] pred,
     int[:] sources,
-    DTYPE_t limit) except -1:
+    DTYPE_t limit) except -1 nogil:
     cdef:
+        unsigned int Nind = source_indices.shape[0]
         unsigned int N = dist_matrix.shape[0]
-
-        int return_pred = (pred.size > 0)
-
-        FibonacciHeap heap
-        FibonacciNode *v
-        FibonacciNode* nodes = <FibonacciNode*> malloc(N *
-                                                       sizeof(FibonacciNode))
-    if nodes == NULL:
-        raise MemoryError("Failed to allocate memory in "
-                          "_dijkstra_directed_multi")
-
-    # initialize the heap with each of the starting
-    # nodes on the heap and in a scanned state with 0 values
-    # and their entry of the distance matrix = 0
-    # pred will lead back to one of the starting indices
-    _dijkstra_setup_heap_multi(&heap, nodes, source_indices,
-                               sources, dist_matrix, return_pred)
-
-    while heap.min_node:
-        v = remove_min(&heap)
-        v.state = SCANNED
-
-        _dijkstra_scan_heap_multi(&heap, v, nodes,
-                                  csr_weights, csr_indices, csr_indptr,
-                                  pred, sources, return_pred, limit)
-
-        # v has now been scanned: add the distance to the results
-        dist_matrix[v.index] = v.val
-
-    free(nodes)
-    return 0
-
-@cython.boundscheck(False)
-cdef int _dijkstra_undirected(
-    const int[:] source_indices,
-    const double[:] csr_weights,
-    const int[:] csr_indices,
-    const int[:] csr_indptr,
-    const double[:] csrT_weights,
-    const int[:] csrT_indices,
-    const int[:] csrT_indptr,
-    double[:, :] dist_matrix,
-    int[:, :] pred,
-    DTYPE_t limit) except -1:
-    cdef:
-        unsigned int Nind = dist_matrix.shape[0]
-        unsigned int N = dist_matrix.shape[1]
-        unsigned int i, k, j_source
-        int return_pred = (pred.size > 0)
-        FibonacciHeap heap
-        FibonacciNode *v
-        FibonacciNode* nodes = <FibonacciNode*> malloc(N *
-                                                       sizeof(FibonacciNode))
-    if nodes == NULL:
-        raise MemoryError("Failed to allocate memory in _dijkstra_undirected")
+        unsigned int i, j_source
+        bint return_pred = (pred.shape[0] > 0)
+        bint return_sources = (sources.shape[0] > 0)
+        bint directed = (csrT_weights.shape[0] == 0)
+
+        # pairs of {-distance, vertex index} will be pushed
+        # to treat it as a min-heap instead of a max-heap
+        dijkstra_queue_t heap = dijkstra_queue_t()
+        dist_index_pair_t v
+
+    if return_pred and pred.shape[0] != N:
+        raise RuntimeError(
+            f"Invalid predecessors array shape {pred.shape}. Expected {(N,)}."
+        )
+    if return_sources and sources.shape[0] != N:
+        raise RuntimeError(
+            f"Invalid sources array shape {sources.shape}. Expected {(N,)}."
+        )

     for i in range(Nind):
         j_source = source_indices[i]
+        dist_matrix[j_source] = 0
+        heap.push(dist_index_pair_t(-dist_matrix[j_source], j_source))
+        if return_sources:
+            sources[j_source] = j_source

-        for k in range(N):
-            initialize_node(&nodes[k], k)
-
-        dist_matrix[i, j_source] = 0
-        heap.min_node = NULL
-        insert_node(&heap, &nodes[j_source])
-
-        while heap.min_node:
-            v = remove_min(&heap)
-            v.state = SCANNED
-            _dijkstra_scan_heap(&heap, v, nodes,
-                                csr_weights, csr_indices, csr_indptr,
-                                pred, return_pred, limit, i)

+    while heap.size():
+        v = heap.top()
+        heap.pop()
+        v.first = -v.first
+        # Do not process v if its distance has been updated
+        # after v was pushed to the queue, in which case
+        # _dijkstra_scan_heap has already been called with
+        # the vertex v.second.
+        # This ensures _dijkstra_scan_heap is only called once per vertex
+        # and the total complexity is O(E log E) per source
+        if dist_matrix[v.second] < v.first:
+            continue

-            _dijkstra_scan_heap(&heap, v, nodes,
+        _dijkstra_scan_heap(heap, v,
                             csr_weights, csr_indices, csr_indptr,
+                            dist_matrix, pred, return_pred,
+                            sources, return_sources, limit)
+        if not directed:
+            _dijkstra_scan_heap(heap, v,
                                 csrT_weights, csrT_indices, csrT_indptr,
-                                pred, return_pred, limit, i)
-
-            # v has now been scanned: add the distance to the results
-            dist_matrix[i, v.index] = v.val
-
-    free(nodes)
+                                dist_matrix, pred, return_pred,
+                                sources, return_sources, limit)
     return 0


 @cython.boundscheck(False)
-cdef int _dijkstra_undirected_multi(
+cdef int _dijkstra_multi_separate(
     const int[:] source_indices,
     const double[:] csr_weights,
     const int[:] csr_indices,
@@ -904,41 +782,34 @@
     const double[:] csrT_weights,
     const int[:] csrT_indices,
     const int[:] csrT_indptr,
-    double[:] dist_matrix,
-    int[:] pred,
-    int[:] sources,
+    double[:, :] dist_matrix,
+    int[:, :] pred,
+    int[:, :] sources,
     DTYPE_t limit) except -1:
     cdef:
-        unsigned int N = dist_matrix.shape[0]
-        int return_pred = (pred.size > 0)
-        FibonacciHeap heap
-        FibonacciNode *v
-        FibonacciNode* nodes = <FibonacciNode*> malloc(N *
-                                                       sizeof(FibonacciNode))
-    if nodes == NULL:
-        raise MemoryError("Failed to allocate memory in "
-                          "_dijkstra_undirected_multi")
-
-    _dijkstra_setup_heap_multi(&heap, nodes, source_indices,
-                               sources, dist_matrix, return_pred)
-
-    while heap.min_node:
-        v = remove_min(&heap)
-        v.state = SCANNED
-
-        _dijkstra_scan_heap_multi(&heap, v, nodes,
-                                  csr_weights, csr_indices, csr_indptr,
-                                  pred, sources, return_pred, limit)
-
-        _dijkstra_scan_heap_multi(&heap, v, nodes,
-                                  csrT_weights, csrT_indices, csrT_indptr,
-                                  pred, sources, return_pred, limit)
+        unsigned int Nind = source_indices.shape[0]
+        unsigned int i
+        int source_list[1]

-        #v has now been scanned: add the distance to the results
-        dist_matrix[v.index] = v.val
+    if dist_matrix.shape[0] != Nind:
+        raise RuntimeError(
+            f"Not enough rows in distances matrix. Got {dist_matrix.shape[0]}, expected {Nind}."
+        )
+    if pred.shape[0] != Nind:
+        raise RuntimeError(
+            f"Not enough rows in predecessors matrix. Got {pred.shape[0]}, expected {Nind}."
+        )
+    if sources.shape[0] != Nind:
+        raise RuntimeError(
+            f"Not enough rows in sources matrix. Got {sources.shape[0]}, expected {Nind}."
+ ) - free(nodes) - return 0 + for i in range(Nind): + source_list[0] = source_indices[i] + _dijkstra(source_list, + csr_weights, csr_indices, csr_indptr, + csrT_weights, csrT_indices, csrT_indptr, + dist_matrix[i], pred[i], sources[i], limit) def bellman_ford(csgraph, directed=True, indices=None, @@ -1101,7 +972,8 @@ cdef int _bellman_ford_directed( cdef: unsigned int Nind = dist_matrix.shape[0] unsigned int N = dist_matrix.shape[1] - unsigned int i, j, k, j_source, count + unsigned int i, j, j_source, count + ITYPE_t k DTYPE_t d1, d2, w12 int return_pred = (pred.size > 0) @@ -1142,7 +1014,8 @@ cdef int _bellman_ford_undirected( cdef: unsigned int Nind = dist_matrix.shape[0] unsigned int N = dist_matrix.shape[1] - unsigned int i, j, k, j_source, ind_k, count + unsigned int i, j, j_source, ind_k, count + ITYPE_t k DTYPE_t d1, d2, w12 int return_pred = (pred.size > 0) @@ -1310,7 +1183,7 @@ def johnson(csgraph, directed=True, indices=None, predecessor_matrix = np.empty((len(indices), N), dtype=ITYPE) predecessor_matrix.fill(NULL_IDX) else: - predecessor_matrix = np.empty((0, N), dtype=ITYPE) + predecessor_matrix = np.empty((len(indices), 0), dtype=ITYPE) #------------------------------ # initialize distance array @@ -1334,17 +1207,27 @@ def johnson(csgraph, directed=True, indices=None, # add the bellman-ford weights to the data _johnson_add_weights(csr_data, csr_indices, csr_indptr, dist_array) + dummy_source_matrix = np.empty((len(indices), 0), dtype=ITYPE) if directed: - _dijkstra_directed(indices, - csr_data, csr_indices, csr_indptr, - dist_matrix, predecessor_matrix, np.inf) + # for null transposed CSR + dummy_double_array = np.empty(0, dtype=DTYPE) + dummy_int_array = np.empty(0, dtype=ITYPE) + _dijkstra_multi_separate( + indices, + csr_data, csr_indices, csr_indptr, + dummy_double_array, dummy_int_array, dummy_int_array, + dist_matrix, predecessor_matrix, dummy_source_matrix, np.inf) else: - csrT = csr_array((csr_data, csr_indices, csr_indptr), csgraph.shape).T.tocsr() - _johnson_add_weights(csrT.data, csrT.indices, csrT.indptr, dist_array) - _dijkstra_undirected(indices, - csr_data, csr_indices, csr_indptr, - csrT.data, csrT.indices, csrT.indptr, - dist_matrix, predecessor_matrix, np.inf) + csgraphT = csr_array((csr_data, csr_indices, csr_indptr), + csgraph.shape).T.tocsr() + _johnson_add_weights(csgraphT.data, csgraphT.indices, + csgraphT.indptr, dist_array) + _dijkstra_multi_separate( + indices, + csr_data,csr_indices, csr_indptr, + csgraphT.data, csgraphT.indices, csgraphT.indptr, + dist_matrix, predecessor_matrix, dummy_source_matrix, + np.inf) # ------------------------------ # correct the distance matrix for the bellman-ford weights @@ -1364,7 +1247,9 @@ cdef void _johnson_add_weights( const int[:] csr_indptr, const double[:] dist_array) noexcept: # let w(u, v) = w(u, v) + h(u) - h(v) - cdef unsigned int j, k, N = dist_array.shape[0] + cdef: + unsigned int j, N = dist_array.shape[0] + ITYPE_t k for j in range(N): for k in range(csr_indptr[j], csr_indptr[j + 1]): @@ -1380,7 +1265,8 @@ cdef int _johnson_directed( # Note: The contents of dist_array must be initialized to zero on entry cdef: unsigned int N = dist_array.shape[0] - unsigned int j, k, count + unsigned int j, count + ITYPE_t k DTYPE_t d1, d2, w12 # relax all edges (N+1) - 1 times @@ -1413,7 +1299,8 @@ cdef int _johnson_undirected( # Note: The contents of dist_array must be initialized to zero on entry cdef: unsigned int N = dist_array.shape[0] - unsigned int j, k, ind_k, count + unsigned int j, ind_k, count + ITYPE_t k 
DTYPE_t d1, d2, w12 # relax all edges (N+1) - 1 times @@ -1441,253 +1328,6 @@ cdef int _johnson_undirected( return -1 -###################################################################### -# FibonacciNode structure -# This structure and the operations on it are the nodes of the -# Fibonacci heap. -# -cdef enum FibonacciState: - SCANNED - NOT_IN_HEAP - IN_HEAP - - -cdef struct FibonacciNode: - unsigned int index - unsigned int rank - unsigned int source - FibonacciState state - DTYPE_t val - FibonacciNode* parent - FibonacciNode* left_sibling - FibonacciNode* right_sibling - FibonacciNode* children - - -cdef void initialize_node(FibonacciNode* node, - unsigned int index, - DTYPE_t val=0) noexcept: - # Assumptions: - node is a valid pointer - # - node is not currently part of a heap - node.index = index - node.source = -9999 - node.val = val - node.rank = 0 - node.state = NOT_IN_HEAP - - node.parent = NULL - node.left_sibling = NULL - node.right_sibling = NULL - node.children = NULL - - -cdef FibonacciNode* leftmost_sibling(FibonacciNode* node) noexcept: - # Assumptions: - node is a valid pointer - cdef FibonacciNode* temp = node - while(temp.left_sibling): - temp = temp.left_sibling - return temp - - -cdef void add_child(FibonacciNode* node, FibonacciNode* new_child) noexcept: - # Assumptions: - node is a valid pointer - # - new_child is a valid pointer - # - new_child is not the sibling or child of another node - new_child.parent = node - - if node.children: - add_sibling(node.children, new_child) - else: - - node.children = new_child - new_child.right_sibling = NULL - new_child.left_sibling = NULL - node.rank = 1 - - -cdef void add_sibling(FibonacciNode* node, FibonacciNode* new_sibling) noexcept: - # Assumptions: - node is a valid pointer - # - new_sibling is a valid pointer - # - new_sibling is not the child or sibling of another node - - # Insert new_sibling between node and node.right_sibling - if node.right_sibling: - node.right_sibling.left_sibling = new_sibling - new_sibling.right_sibling = node.right_sibling - new_sibling.left_sibling = node - node.right_sibling = new_sibling - - new_sibling.parent = node.parent - if new_sibling.parent: - new_sibling.parent.rank += 1 - - -cdef void remove(FibonacciNode* node) noexcept: - # Assumptions: - node is a valid pointer - if node.parent: - node.parent.rank -= 1 - if node.parent.children == node: # node is the leftmost sibling. - node.parent.children = node.right_sibling - - if node.left_sibling: - node.left_sibling.right_sibling = node.right_sibling - if node.right_sibling: - node.right_sibling.left_sibling = node.left_sibling - - node.left_sibling = NULL - node.right_sibling = NULL - node.parent = NULL - - -###################################################################### -# FibonacciHeap structure -# This structure and operations on it use the FibonacciNode -# routines to implement a Fibonacci heap - -ctypedef FibonacciNode* pFibonacciNode - - -cdef struct FibonacciHeap: - # In this representation, min_node is always at the leftmost end - # of the linked-list, hence min_node.left_sibling is always NULL. - FibonacciNode* min_node - pFibonacciNode[100] roots_by_rank # maximum number of nodes is ~2^100. 
- - -cdef void insert_node(FibonacciHeap* heap, - FibonacciNode* node) noexcept: - # Assumptions: - heap is a valid pointer - # - node is a valid pointer - # - node is not the child or sibling of another node - if heap.min_node: - if node.val < heap.min_node.val: - # Replace heap.min_node with node, which is always - # at the leftmost end of the roots' linked-list. - node.left_sibling = NULL - node.right_sibling = heap.min_node - heap.min_node.left_sibling = node - heap.min_node = node - else: - add_sibling(heap.min_node, node) - else: - heap.min_node = node - - -cdef void decrease_val(FibonacciHeap* heap, - FibonacciNode* node, - DTYPE_t newval) noexcept: - # Assumptions: - heap is a valid pointer - # - newval <= node.val - # - node is a valid pointer - # - node is not the child or sibling of another node - # - node is in the heap - node.val = newval - if node.parent and (node.parent.val >= newval): - remove(node) - insert_node(heap, node) - elif heap.min_node.val > node.val: - # Replace heap.min_node with node, which is always - # at the leftmost end of the roots' linked-list. - remove(node) - node.right_sibling = heap.min_node - heap.min_node.left_sibling = node - heap.min_node = node - - -cdef void link(FibonacciHeap* heap, FibonacciNode* node) noexcept: - # Assumptions: - heap is a valid pointer - # - node is a valid pointer - # - node is already within heap - - cdef FibonacciNode *linknode - - if heap.roots_by_rank[node.rank] == NULL: - heap.roots_by_rank[node.rank] = node - else: - linknode = heap.roots_by_rank[node.rank] - heap.roots_by_rank[node.rank] = NULL - - if node.val < linknode.val or node == heap.min_node: - remove(linknode) - add_child(node, linknode) - link(heap, node) - else: - remove(node) - add_child(linknode, node) - link(heap, linknode) - - -cdef FibonacciNode* remove_min(FibonacciHeap* heap) noexcept: - # Assumptions: - heap is a valid pointer - # - heap.min_node is a valid pointer - cdef: - FibonacciNode *temp - FibonacciNode *temp_right - FibonacciNode *out - unsigned int i - - # make all min_node children into root nodes - temp = heap.min_node.children - - while temp: - temp_right = temp.right_sibling - remove(temp) - add_sibling(heap.min_node, temp) - temp = temp_right - - # remove min_root and choose another root as a preliminary min_root - out = heap.min_node - temp = heap.min_node.right_sibling - remove(heap.min_node) - heap.min_node = temp - - if temp == NULL: - # There is a unique root in the tree, hence a unique node - # which is the minimum that we return here. 
- return out - - # re-link the heap - for i in range(100): - heap.roots_by_rank[i] = NULL - - while temp: - if temp.val < heap.min_node.val: - heap.min_node = temp - temp_right = temp.right_sibling - link(heap, temp) - temp = temp_right - - # move heap.min_node to the leftmost end of the linked-list of roots - temp = leftmost_sibling(heap.min_node) - if heap.min_node != temp: - remove(heap.min_node) - heap.min_node.right_sibling = temp - temp.left_sibling = heap.min_node - - return out - - -###################################################################### -# Debugging: Functions for printing the Fibonacci heap -# -#cdef void print_node(FibonacciNode* node, int level=0) noexcept: -# print('%s(%i,%i) %i' % (level*' ', node.index, node.val, node.rank)) -# if node.children: -# print_node(node.children, level+1) -# if node.right_sibling: -# print_node(node.right_sibling, level) -# -# -#cdef void print_heap(FibonacciHeap* heap) noexcept: -# print("---------------------------------") -# if heap.min_node: -# print("min node: (%i, %i)" % (heap.min_node.index, heap.min_node.val)) -# print_node(heap.min_node) -# else: -# print("[empty heap]") - -###################################################################### - # Author: Tomer Sery -- # License: BSD 3-clause ("New BSD License"), (C) 2024 @@ -1880,33 +1520,25 @@ cdef void _yen( # Dijkstra's operands and results arrays int[:] indice_node_arr = np.array([source], dtype=ITYPE) - int[:, :] predecessor_matrix = np.full((1, N), NULL_IDX, dtype=ITYPE) - double[:, :] dist_matrix = np.full((1, N), np.inf, dtype=DTYPE) - dist_matrix[0, source] = 0 + int[:] predecessor_matrix = np.full((N), NULL_IDX, dtype=ITYPE) + double[:] dist_matrix = np.full((N), np.inf, dtype=DTYPE) + int[:] dummy_source_matrix = np.empty((0), dtype=ITYPE) # unused + dist_matrix[source] = 0 # --------------------------------------------------- # Compute and store the shortest path - if directed: - _dijkstra_directed( - indice_node_arr, - original_weights, csr_indices, csr_indptr, - dist_matrix, predecessor_matrix, INFINITY, - ) - else: - _dijkstra_undirected( - indice_node_arr, - original_weights, csr_indices, csr_indptr, - originalT_weights, csrT_indices, csrT_indptr, - dist_matrix, predecessor_matrix, INFINITY, - ) + _dijkstra( + indice_node_arr, + original_weights, csr_indices, csr_indptr, + originalT_weights, csrT_indices, csrT_indptr, + dist_matrix, predecessor_matrix, dummy_source_matrix, + INFINITY, + ) - shortest_distances[0] = dist_matrix[0, sink] + shortest_distances[0] = dist_matrix[sink] if shortest_distances[0] == INFINITY: # No paths between source and sink return - if directed: - # Avoid copying a size 0 memory view - originalT_weights = original_weights cdef: # initialize candidate arrays @@ -1916,16 +1548,22 @@ cdef void _yen( int[:, :] candidate_predecessors = np.full((K, N), NULL_IDX, dtype=ITYPE) # Store the original graph weights for restoring the graph double[:] csr_weights = original_weights.copy() - double[:] csrT_weights = originalT_weights.copy() + double[:] csrT_weights int k, i, spur_node, node, short_path_idx, tmp_i double root_path_distance, total_distance, tmp_d + # Avoid copying a size 0 memory view + if directed: + csrT_weights = np.empty(0, dtype=DTYPE) + else: + csrT_weights = originalT_weights.copy() + # Copy shortest path to shortest_paths_predecessors node = sink while node != NULL_IDX: - shortest_paths_predecessors[0, node] = predecessor_matrix[0, node] - node = predecessor_matrix[0, node] + shortest_paths_predecessors[0, node] = 
predecessor_matrix[node] + node = predecessor_matrix[node] # --------------------------------------------------- @@ -2012,35 +1650,29 @@ cdef void _yen( # Search for the shortest path from spur_node to sink # Reset the distance and predecessor matrix - predecessor_matrix[0, :] = NULL_IDX - dist_matrix[0, :] = INFINITY - dist_matrix[0, source] = 0 + predecessor_matrix[:] = NULL_IDX + dist_matrix[:] = INFINITY + dist_matrix[source] = 0 # Search only for paths starting for spur_node indice_node_arr[0] = spur_node - if directed: - _dijkstra_directed( - indice_node_arr, - csr_weights, csr_indices, csr_indptr, - dist_matrix, predecessor_matrix, INFINITY, - ) - else: - _dijkstra_undirected( - indice_node_arr, - csr_weights, csr_indices, csr_indptr, - csrT_weights, csrT_indices, csrT_indptr, - dist_matrix, predecessor_matrix, INFINITY, - ) + _dijkstra( + indice_node_arr, + csr_weights, csr_indices, csr_indptr, + csrT_weights, csrT_indices, csrT_indptr, + dist_matrix, predecessor_matrix, dummy_source_matrix, + INFINITY, + ) # Compute the total distance of the found path - total_distance = dist_matrix[0, sink] + root_path_distance + total_distance = dist_matrix[sink] + root_path_distance # --------------------------------------------------- # Add the found path to arrays of candidates if ( total_distance != INFINITY and _yen_is_path_in_candidates(candidate_predecessors, - shortest_paths_predecessors[k-1], - predecessor_matrix[0], + shortest_paths_predecessors[k-1], + predecessor_matrix, spur_node, sink) == 0 ): # Find the index to insert the new path @@ -2072,9 +1704,9 @@ cdef void _yen( node = sink while node != spur_node: candidate_predecessors[short_path_idx, node] = ( - predecessor_matrix[0, node] + predecessor_matrix[node] ) - node = predecessor_matrix[0, node] + node = predecessor_matrix[node] # --------------------------------------------------- # Restore graph weights diff --git a/scipy/sparse/csgraph/meson.build b/scipy/sparse/csgraph/meson.build index 21d691b96de7..fbf707dd3402 100644 --- a/scipy/sparse/csgraph/meson.build +++ b/scipy/sparse/csgraph/meson.build @@ -1,27 +1,47 @@ -pyx_files = [ +pyx_files_for_c = [ ['_flow', '_flow.pyx'], ['_matching', '_matching.pyx'], ['_min_spanning_tree', '_min_spanning_tree.pyx'], ['_reordering', '_reordering.pyx'], - ['_shortest_path', '_shortest_path.pyx'], ['_tools', '_tools.pyx'], ['_traversal', '_traversal.pyx'] ] +pyx_files_for_cpp = [ + ['_shortest_path', '_shortest_path.pyx'], +] -cython_gen_csgraph = generator(cython, +cython_gen_csgraph_for_c = generator(cython, arguments : cython_args, output : '@BASENAME@.c', depends : [_cython_tree, fs.copyfile('parameters.pxi')], ) -foreach pyx_file: pyx_files +foreach pyx_file: pyx_files_for_c py3.extension_module(pyx_file[0], - cython_gen_csgraph.process(pyx_file[1]), + cython_gen_csgraph_for_c.process(pyx_file[1]), c_args: cython_c_args, dependencies: np_dep, link_args: version_link_args, install: true, - subdir: 'scipy/sparse/csgraph' + subdir: 'scipy/sparse/csgraph', + ) +endforeach + +cython_gen_csgraph_for_cpp = generator(cython, + arguments : cython_cplus_args, + output : '@BASENAME@.cpp', + depends : [_cython_tree], +) + +foreach pyx_file: pyx_files_for_cpp + py3.extension_module(pyx_file[0], + cython_gen_csgraph_for_cpp.process(pyx_file[1]), + cpp_args: cython_cpp_args, + include_directories: inc_np, + dependencies: np_dep, + link_args: version_link_args, + install: true, + subdir: 'scipy/sparse/csgraph', ) endforeach diff --git a/scipy/sparse/csgraph/tests/test_shortest_path.py 
b/scipy/sparse/csgraph/tests/test_shortest_path.py index ba50f760e750..45d811486846 100644 --- a/scipy/sparse/csgraph/tests/test_shortest_path.py +++ b/scipy/sparse/csgraph/tests/test_shortest_path.py @@ -16,6 +16,7 @@ [1, 0, 0, 0, 0], [2, 0, 0, 2, 0]], dtype=float) +# Undirected version of directed_G undirected_G = np.array([[0, 3, 3, 1, 2], [3, 0, 0, 2, 4], [3, 0, 0, 0, 0], @@ -24,6 +25,7 @@ unweighted_G = (directed_G > 0).astype(float) +# Correct shortest path lengths for directed_G and undirected_G directed_SP = [[0, 3, 3, 5, 7], [3, 0, 6, 2, 4], [np.inf, np.inf, 0, np.inf, np.inf], @@ -33,6 +35,35 @@ directed_2SP_0_to_3 = [[-9999, 0, -9999, 1, -9999], [-9999, 0, -9999, 4, 1]] +undirected_SP = np.array([[0, 3, 3, 1, 2], + [3, 0, 6, 2, 4], + [3, 6, 0, 4, 5], + [1, 2, 4, 0, 2], + [2, 4, 5, 2, 0]], dtype=float) + +undirected_SP_limit_2 = np.array([[0, np.inf, np.inf, 1, 2], + [np.inf, 0, np.inf, 2, np.inf], + [np.inf, np.inf, 0, np.inf, np.inf], + [1, 2, np.inf, 0, 2], + [2, np.inf, np.inf, 2, 0]], dtype=float) + +undirected_SP_limit_0 = np.ones((5, 5), dtype=float) - np.eye(5) +undirected_SP_limit_0[undirected_SP_limit_0 > 0] = np.inf + +# Correct predecessors for directed_G and undirected_G +directed_pred = np.array([[-9999, 0, 0, 1, 1], + [3, -9999, 0, 1, 1], + [-9999, -9999, -9999, -9999, -9999], + [3, 0, 0, -9999, 1], + [4, 0, 0, 4, -9999]], dtype=float) + +undirected_pred = np.array([[-9999, 0, 0, 0, 0], + [1, -9999, 0, 1, 1], + [2, 0, -9999, 0, 0], + [3, 3, 0, -9999, 3], + [4, 4, 0, 4, -9999]], dtype=float) + +# Other graphs directed_sparse_zero_G = scipy.sparse.csr_array( ( [0, 1, 2, 3, 1], @@ -61,33 +92,6 @@ [np.inf, np.inf, np.inf, 0, 1], [np.inf, np.inf, np.inf, 1, 0]] -directed_pred = np.array([[-9999, 0, 0, 1, 1], - [3, -9999, 0, 1, 1], - [-9999, -9999, -9999, -9999, -9999], - [3, 0, 0, -9999, 1], - [4, 0, 0, 4, -9999]], dtype=float) - -undirected_SP = np.array([[0, 3, 3, 1, 2], - [3, 0, 6, 2, 4], - [3, 6, 0, 4, 5], - [1, 2, 4, 0, 2], - [2, 4, 5, 2, 0]], dtype=float) - -undirected_SP_limit_2 = np.array([[0, np.inf, np.inf, 1, 2], - [np.inf, 0, np.inf, 2, np.inf], - [np.inf, np.inf, 0, np.inf, np.inf], - [1, 2, np.inf, 0, 2], - [2, np.inf, np.inf, 2, 0]], dtype=float) - -undirected_SP_limit_0 = np.ones((5, 5), dtype=float) - np.eye(5) -undirected_SP_limit_0[undirected_SP_limit_0 > 0] = np.inf - -undirected_pred = np.array([[-9999, 0, 0, 0, 0], - [1, -9999, 0, 1, 1], - [2, 0, -9999, 0, 0], - [3, 3, 0, -9999, 3], - [4, 4, 0, 4, -9999]], dtype=float) - directed_negative_weighted_G = np.array([[0, 0, 0], [-1, 0, 0], [0, -1, 0]], dtype=float) @@ -217,6 +221,28 @@ def test_dijkstra_min_only_random(n): p = pred[p] +@pytest.mark.parametrize('n', (10, 100)) +@pytest.mark.parametrize("method", ['FW', 'J', 'BF']) +@pytest.mark.parametrize('directed', (True, False)) +def test_star_graph(n, method, directed): + # Build the star graph + star_arr = np.zeros((n, n), dtype=float) + star_center_idx = 0 + star_arr[star_center_idx, :] = star_arr[:, star_center_idx] = range(n) + G = scipy.sparse.csr_matrix(star_arr, shape=(n, n)) + # Build the distances matrix + SP_solution = np.zeros((n, n), dtype=float) + SP_solution[:] = star_arr[star_center_idx] + for idx in range(1, n): + SP_solution[idx] += star_arr[idx, star_center_idx] + np.fill_diagonal(SP_solution, 0) + + SP = shortest_path(G, method=method, directed=directed) + assert_allclose( + SP_solution, SP + ) + + def test_dijkstra_random(): # reproduces the hang observed in gh-17782 n = 10 @@ -425,9 +451,11 @@ def test_yen_undirected(): source=0, 
sink=3, K=4, + directed=False, ) assert_allclose(distances, [1., 4., 5., 8.]) + def test_yen_unweighted(): # Ask for more paths than there are, verify only the available paths are returned distances, predecessors = yen( @@ -441,6 +469,7 @@ def test_yen_unweighted(): assert_allclose(distances, [2., 3.]) assert_allclose(predecessors, directed_2SP_0_to_3) + def test_yen_no_paths(): distances = yen( directed_G, @@ -450,6 +479,7 @@ def test_yen_no_paths(): ) assert distances.size == 0 + def test_yen_negative_weights(): distances = yen( directed_negative_weighted_G, diff --git a/scipy/special/Faddeeva.hh b/scipy/special/Faddeeva.hh deleted file mode 100644 index aefc76d56ac2..000000000000 --- a/scipy/special/Faddeeva.hh +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2012 Massachusetts Institute of Technology - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -/* Available at: http://ab-initio.mit.edu/Faddeeva_w - - Header file for Faddeeva_w.cc; see that file for more information. 
*/ - -#ifndef FADDEEVA_HH -#define FADDEEVA_HH 1 - -#include <complex> - -namespace Faddeeva { - -// compute w(z) = exp(-z^2) erfc(-iz) [ Faddeeva / scaled complex error func ] -extern std::complex<double> w(std::complex<double> z,double relerr=0); -extern double w_im(double x); // special-case code for Im[w(x)] of real x - -// Various functions that we can compute with the help of w(z) - -// compute erfcx(z) = exp(z^2) erfz(z) -extern std::complex<double> erfcx(std::complex<double> z, double relerr=0); -extern double erfcx(double x); // special case for real x - -// compute erf(z), the error function of complex arguments -extern std::complex<double> erf(std::complex<double> z, double relerr=0); -extern double erf(double x); // special case for real x - -// compute erfi(z) = -i erf(iz), the imaginary error function -extern std::complex<double> erfi(std::complex<double> z, double relerr=0); -extern double erfi(double x); // special case for real x - -// compute erfc(z) = 1 - erf(z), the complementary error function -extern std::complex<double> erfc(std::complex<double> z, double relerr=0); -extern double erfc(double x); // special case for real x - -// compute Dawson(z) = sqrt(pi)/2 * exp(-z^2) * erfi(z) -extern std::complex<double> Dawson(std::complex<double> z, double relerr=0); -extern double Dawson(double x); // special case for real x - -}; // namespace Faddeeva - -#endif // FADDEEVA_HH diff --git a/scipy/special/__init__.py b/scipy/special/__init__.py index 6f5e84eb6e91..797826e255db 100644 --- a/scipy/special/__init__.py +++ b/scipy/special/__init__.py @@ -820,7 +820,7 @@ def _load_libsf_error_state(): from ._support_alternative_backends import ( log_ndtr, ndtr, ndtri, erf, erfc, i0, i0e, i1, i1e, gammaln, gammainc, gammaincc, logit, expit, entr, rel_entr, xlogy, - chdtr, chdtrc, betainc, betaincc, stdtr) + chdtr, chdtrc, betainc, betaincc, stdtr, stdtrit) from . import _basic from ._basic import * diff --git a/scipy/special/_add_newdocs.py b/scipy/special/_add_newdocs.py index 134604c90128..93eca8eca60c 100644 --- a/scipy/special/_add_newdocs.py +++ b/scipy/special/_add_newdocs.py @@ -82,111 +82,6 @@ def add_newdoc(name, doc): Internal function, use `ellip_norm` instead. """) -add_newdoc("voigt_profile", - r""" - voigt_profile(x, sigma, gamma, out=None) - - Voigt profile. - - The Voigt profile is a convolution of a 1-D Normal distribution with - standard deviation ``sigma`` and a 1-D Cauchy distribution with half-width at - half-maximum ``gamma``. - - If ``sigma = 0``, PDF of Cauchy distribution is returned. - Conversely, if ``gamma = 0``, PDF of Normal distribution is returned. - If ``sigma = gamma = 0``, the return value is ``Inf`` for ``x = 0``, - and ``0`` for all other ``x``. - - Parameters - ---------- - x : array_like - Real argument - sigma : array_like - The standard deviation of the Normal distribution part - gamma : array_like - The half-width at half-maximum of the Cauchy distribution part - out : ndarray, optional - Optional output array for the function values - - Returns - ------- - scalar or ndarray - The Voigt profile at the given arguments - - See Also - -------- - wofz : Faddeeva function - - Notes - ----- - It can be expressed in terms of Faddeeva function - - .. math:: V(x; \sigma, \gamma) = \frac{Re[w(z)]}{\sigma\sqrt{2\pi}}, - .. math:: z = \frac{x + i\gamma}{\sqrt{2}\sigma} - - where :math:`w(z)` is the Faddeeva function. - - References - ---------- - .. [1] https://en.wikipedia.org/wiki/Voigt_profile - - Examples - -------- - Calculate the function at point 2 for ``sigma=1`` and ``gamma=1``.
- - >>> from scipy.special import voigt_profile - >>> import numpy as np - >>> import matplotlib.pyplot as plt - >>> voigt_profile(2, 1., 1.) - 0.09071519942627544 - - Calculate the function at several points by providing a NumPy array - for `x`. - - >>> values = np.array([-2., 0., 5]) - >>> voigt_profile(values, 1., 1.) - array([0.0907152 , 0.20870928, 0.01388492]) - - Plot the function for different parameter sets. - - >>> fig, ax = plt.subplots(figsize=(8, 8)) - >>> x = np.linspace(-10, 10, 500) - >>> parameters_list = [(1.5, 0., "solid"), (1.3, 0.5, "dashed"), - ... (0., 1.8, "dotted"), (1., 1., "dashdot")] - >>> for params in parameters_list: - ... sigma, gamma, linestyle = params - ... voigt = voigt_profile(x, sigma, gamma) - ... ax.plot(x, voigt, label=rf"$\sigma={sigma},\, \gamma={gamma}$", - ... ls=linestyle) - >>> ax.legend() - >>> plt.show() - - Verify visually that the Voigt profile indeed arises as the convolution - of a normal and a Cauchy distribution. - - >>> from scipy.signal import convolve - >>> x, dx = np.linspace(-10, 10, 500, retstep=True) - >>> def gaussian(x, sigma): - ... return np.exp(-0.5 * x**2/sigma**2)/(sigma * np.sqrt(2*np.pi)) - >>> def cauchy(x, gamma): - ... return gamma/(np.pi * (np.square(x)+gamma**2)) - >>> sigma = 2 - >>> gamma = 1 - >>> gauss_profile = gaussian(x, sigma) - >>> cauchy_profile = cauchy(x, gamma) - >>> convolved = dx * convolve(cauchy_profile, gauss_profile, mode="same") - >>> voigt = voigt_profile(x, sigma, gamma) - >>> fig, ax = plt.subplots(figsize=(8, 8)) - >>> ax.plot(x, gauss_profile, label="Gauss: $G$", c='b') - >>> ax.plot(x, cauchy_profile, label="Cauchy: $C$", c='y', ls="dashed") - >>> xx = 0.5*(x[1:] + x[:-1]) # midpoints - >>> ax.plot(xx, convolved[1:], label="Convolution: $G * C$", ls='dashdot', - ... c='k') - >>> ax.plot(x, voigt, label="Voigt", ls='dotted', c='r') - >>> ax.legend() - >>> plt.show() - """) - add_newdoc("wrightomega", r""" wrightomega(z, out=None) @@ -1654,50 +1549,6 @@ def add_newdoc(name, doc): """) -add_newdoc("dawsn", - """ - dawsn(x, out=None) - - Dawson's integral. - - Computes:: - - exp(-x**2) * integral(exp(t**2), t=0..x). - - Parameters - ---------- - x : array_like - Function parameter. - out : ndarray, optional - Optional output array for the function values - - Returns - ------- - y : scalar or ndarray - Value of the integral. - - See Also - -------- - wofz, erf, erfc, erfcx, erfi - - References - ---------- - .. [1] Steven G. Johnson, Faddeeva W function implementation. - http://ab-initio.mit.edu/Faddeeva - - Examples - -------- - >>> import numpy as np - >>> from scipy import special - >>> import matplotlib.pyplot as plt - >>> x = np.linspace(-15, 15, num=1000) - >>> plt.plot(x, special.dawsn(x)) - >>> plt.xlabel('$x$') - >>> plt.ylabel('$dawsn(x)$') - >>> plt.show() - - """) - add_newdoc( "elliprc", r""" @@ -2293,189 +2144,6 @@ def add_newdoc(name, doc): """) -add_newdoc("erf", - """ - erf(z, out=None) - - Returns the error function of complex argument. - - It is defined as ``2/sqrt(pi)*integral(exp(-t**2), t=0..z)``. - - Parameters - ---------- - x : ndarray - Input array. - out : ndarray, optional - Optional output array for the function values - - Returns - ------- - res : scalar or ndarray - The values of the error function at the given points `x`. - - See Also - -------- - erfc, erfinv, erfcinv, wofz, erfcx, erfi - - Notes - ----- - The cumulative of the unit normal distribution is given by - ``Phi(z) = 1/2[1 + erf(z/sqrt(2))]``. - - References - ---------- - .. 
[1] https://en.wikipedia.org/wiki/Error_function - .. [2] Milton Abramowitz and Irene A. Stegun, eds. - Handbook of Mathematical Functions with Formulas, - Graphs, and Mathematical Tables. New York: Dover, - 1972. http://www.math.sfu.ca/~cbm/aands/page_297.htm - .. [3] Steven G. Johnson, Faddeeva W function implementation. - http://ab-initio.mit.edu/Faddeeva - - Examples - -------- - >>> import numpy as np - >>> from scipy import special - >>> import matplotlib.pyplot as plt - >>> x = np.linspace(-3, 3) - >>> plt.plot(x, special.erf(x)) - >>> plt.xlabel('$x$') - >>> plt.ylabel('$erf(x)$') - >>> plt.show() - - """) - -add_newdoc("erfc", - """ - erfc(x, out=None) - - Complementary error function, ``1 - erf(x)``. - - Parameters - ---------- - x : array_like - Real or complex valued argument - out : ndarray, optional - Optional output array for the function results - - Returns - ------- - scalar or ndarray - Values of the complementary error function - - See Also - -------- - erf, erfi, erfcx, dawsn, wofz - - References - ---------- - .. [1] Steven G. Johnson, Faddeeva W function implementation. - http://ab-initio.mit.edu/Faddeeva - - Examples - -------- - >>> import numpy as np - >>> from scipy import special - >>> import matplotlib.pyplot as plt - >>> x = np.linspace(-3, 3) - >>> plt.plot(x, special.erfc(x)) - >>> plt.xlabel('$x$') - >>> plt.ylabel('$erfc(x)$') - >>> plt.show() - - """) - -add_newdoc("erfi", - """ - erfi(z, out=None) - - Imaginary error function, ``-i erf(i z)``. - - Parameters - ---------- - z : array_like - Real or complex valued argument - out : ndarray, optional - Optional output array for the function results - - Returns - ------- - scalar or ndarray - Values of the imaginary error function - - See Also - -------- - erf, erfc, erfcx, dawsn, wofz - - Notes - ----- - - .. versionadded:: 0.12.0 - - References - ---------- - .. [1] Steven G. Johnson, Faddeeva W function implementation. - http://ab-initio.mit.edu/Faddeeva - - Examples - -------- - >>> import numpy as np - >>> from scipy import special - >>> import matplotlib.pyplot as plt - >>> x = np.linspace(-3, 3) - >>> plt.plot(x, special.erfi(x)) - >>> plt.xlabel('$x$') - >>> plt.ylabel('$erfi(x)$') - >>> plt.show() - - """) - -add_newdoc("erfcx", - """ - erfcx(x, out=None) - - Scaled complementary error function, ``exp(x**2) * erfc(x)``. - - Parameters - ---------- - x : array_like - Real or complex valued argument - out : ndarray, optional - Optional output array for the function results - - Returns - ------- - scalar or ndarray - Values of the scaled complementary error function - - - See Also - -------- - erf, erfc, erfi, dawsn, wofz - - Notes - ----- - - .. versionadded:: 0.12.0 - - References - ---------- - .. [1] Steven G. Johnson, Faddeeva W function implementation. - http://ab-initio.mit.edu/Faddeeva - - Examples - -------- - >>> import numpy as np - >>> from scipy import special - >>> import matplotlib.pyplot as plt - >>> x = np.linspace(-3, 3) - >>> plt.plot(x, special.erfcx(x)) - >>> plt.xlabel('$x$') - >>> plt.ylabel('$erfcx(x)$') - >>> plt.show() - - """) - add_newdoc( "erfinv", """ @@ -3399,120 +3067,6 @@ def add_newdoc(name, doc): """) - -add_newdoc("exp10", - """ - exp10(x, out=None) - - Compute ``10**x`` element-wise. - - Parameters - ---------- - x : array_like - `x` must contain real numbers. - out : ndarray, optional - Optional output array for the function values - - Returns - ------- - scalar or ndarray - ``10**x``, computed element-wise. 
- - Examples - -------- - >>> import numpy as np - >>> from scipy.special import exp10 - - >>> exp10(3) - 1000.0 - >>> x = np.array([[-1, -0.5, 0], [0.5, 1, 1.5]]) - >>> exp10(x) - array([[ 0.1 , 0.31622777, 1. ], - [ 3.16227766, 10. , 31.6227766 ]]) - - """) - -add_newdoc("exp2", - """ - exp2(x, out=None) - - Compute ``2**x`` element-wise. - - Parameters - ---------- - x : array_like - `x` must contain real numbers. - out : ndarray, optional - Optional output array for the function values - - Returns - ------- - scalar or ndarray - ``2**x``, computed element-wise. - - Examples - -------- - >>> import numpy as np - >>> from scipy.special import exp2 - - >>> exp2(3) - 8.0 - >>> x = np.array([[-1, -0.5, 0], [0.5, 1, 1.5]]) - >>> exp2(x) - array([[ 0.5 , 0.70710678, 1. ], - [ 1.41421356, 2. , 2.82842712]]) - """) - -add_newdoc("expm1", - """ - expm1(x, out=None) - - Compute ``exp(x) - 1``. - - When `x` is near zero, ``exp(x)`` is near 1, so the numerical calculation - of ``exp(x) - 1`` can suffer from catastrophic loss of precision. - ``expm1(x)`` is implemented to avoid the loss of precision that occurs when - `x` is near zero. - - Parameters - ---------- - x : array_like - `x` must contain real numbers. - out : ndarray, optional - Optional output array for the function values - - Returns - ------- - scalar or ndarray - ``exp(x) - 1`` computed element-wise. - - Examples - -------- - >>> import numpy as np - >>> from scipy.special import expm1 - - >>> expm1(1.0) - 1.7182818284590451 - >>> expm1([-0.2, -0.1, 0, 0.1, 0.2]) - array([-0.18126925, -0.09516258, 0. , 0.10517092, 0.22140276]) - - The exact value of ``exp(7.5e-13) - 1`` is:: - - 7.5000000000028125000000007031250000001318...*10**-13. - - Here is what ``expm1(7.5e-13)`` gives: - - >>> expm1(7.5e-13) - 7.5000000000028135e-13 - - Compare that to ``exp(7.5e-13) - 1``, where the subtraction results in - a "catastrophic" loss of precision: - - >>> np.exp(7.5e-13) - 1 - 7.5006667543675576e-13 - - """) - add_newdoc("expn", r""" expn(n, x, out=None) @@ -6094,49 +5648,6 @@ def add_newdoc(name, doc): Internal function, do not use. """) -add_newdoc("log1p", - """ - log1p(x, out=None) - - Calculates log(1 + x) for use when `x` is near zero. - - Parameters - ---------- - x : array_like - Real or complex valued input. - out : ndarray, optional - Optional output array for the function results. - - Returns - ------- - scalar or ndarray - Values of ``log(1 + x)``. - - See Also - -------- - expm1, cosm1 - - Examples - -------- - >>> import numpy as np - >>> import scipy.special as sc - - It is more accurate than using ``log(1 + x)`` directly for ``x`` - near 0. Note that in the below example ``1 + 1e-17 == 1`` to - double precision. - - >>> sc.log1p(1e-17) - 1e-17 - >>> np.log(1 + 1e-17) - 0.0 - - """) - -add_newdoc("_log1pmx", - """ - Internal function, do not use. - """) - add_newdoc("lpmv", r""" lpmv(m, v, x, out=None) @@ -7275,65 +6786,6 @@ def add_newdoc(name, doc): """) -add_newdoc("ndtr", - r""" - ndtr(x, out=None) - - Cumulative distribution of the standard normal distribution. - - Returns the area under the standard Gaussian probability - density function, integrated from minus infinity to `x` - - .. 
math:: - - \frac{1}{\sqrt{2\pi}} \int_{-\infty}^x \exp(-t^2/2) dt - - Parameters - ---------- - x : array_like, real or complex - Argument - out : ndarray, optional - Optional output array for the function results - - Returns - ------- - scalar or ndarray - The value of the normal CDF evaluated at `x` - - See Also - -------- - log_ndtr : Logarithm of ndtr - ndtri : Inverse of ndtr, standard normal percentile function - erf : Error function - erfc : 1 - erf - scipy.stats.norm : Normal distribution - - Examples - -------- - Evaluate `ndtr` at one point. - - >>> import numpy as np - >>> from scipy.special import ndtr - >>> ndtr(0.5) - 0.6914624612740131 - - Evaluate the function at several points by providing a NumPy array - or list for `x`. - - >>> ndtr([0, 0.5, 2]) - array([0.5 , 0.69146246, 0.97724987]) - - Plot the function. - - >>> import matplotlib.pyplot as plt - >>> x = np.linspace(-5, 5, 100) - >>> fig, ax = plt.subplots() - >>> ax.plot(x, ndtr(x)) - >>> ax.set_title(r"Standard normal cumulative distribution function $\Phi$") - >>> plt.show() - """) - - add_newdoc("nrdtrimn", """ nrdtrimn(p, std, x, out=None) @@ -7441,59 +6893,6 @@ def add_newdoc(name, doc): """) -add_newdoc("log_ndtr", - """ - log_ndtr(x, out=None) - - Logarithm of Gaussian cumulative distribution function. - - Returns the log of the area under the standard Gaussian probability - density function, integrated from minus infinity to `x`:: - - log(1/sqrt(2*pi) * integral(exp(-t**2 / 2), t=-inf..x)) - - Parameters - ---------- - x : array_like, real or complex - Argument - out : ndarray, optional - Optional output array for the function results - - Returns - ------- - scalar or ndarray - The value of the log of the normal CDF evaluated at `x` - - See Also - -------- - erf - erfc - scipy.stats.norm - ndtr - - Examples - -------- - >>> import numpy as np - >>> from scipy.special import log_ndtr, ndtr - - The benefit of ``log_ndtr(x)`` over the naive implementation - ``np.log(ndtr(x))`` is most evident with moderate to large positive - values of ``x``: - - >>> x = np.array([6, 7, 9, 12, 15, 25]) - >>> log_ndtr(x) - array([-9.86587646e-010, -1.27981254e-012, -1.12858841e-019, - -1.77648211e-033, -3.67096620e-051, -3.05669671e-138]) - - The results of the naive calculation for the moderate ``x`` values - have only 5 or 6 correct significant digits. For values of ``x`` - greater than approximately 8.3, the naive expression returns 0: - - >>> np.log(ndtr(x)) - array([-9.86587701e-10, -1.27986510e-12, 0.00000000e+00, - 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]) - """) - add_newdoc("ndtri", """ ndtri(y, out=None) @@ -8972,176 +8371,6 @@ def add_newdoc(name, doc): significantly faster than ``tukeylambda.cdf``. """) -add_newdoc("wofz", - """ - wofz(z, out=None) - - Faddeeva function - - Returns the value of the Faddeeva function for complex argument:: - - exp(-z**2) * erfc(-i*z) - - Parameters - ---------- - z : array_like - complex argument - out : ndarray, optional - Optional output array for the function results - - Returns - ------- - scalar or ndarray - Value of the Faddeeva function - - See Also - -------- - dawsn, erf, erfc, erfcx, erfi - - References - ---------- - .. [1] Steven G. Johnson, Faddeeva W function implementation. 
- http://ab-initio.mit.edu/Faddeeva - - Examples - -------- - >>> import numpy as np - >>> from scipy import special - >>> import matplotlib.pyplot as plt - - >>> x = np.linspace(-3, 3) - >>> z = special.wofz(x) - - >>> plt.plot(x, z.real, label='wofz(x).real') - >>> plt.plot(x, z.imag, label='wofz(x).imag') - >>> plt.xlabel('$x$') - >>> plt.legend(framealpha=1, shadow=True) - >>> plt.grid(alpha=0.25) - >>> plt.show() - - """) - -add_newdoc("xlogy", - """ - xlogy(x, y, out=None) - - Compute ``x*log(y)`` so that the result is 0 if ``x = 0``. - - Parameters - ---------- - x : array_like - Multiplier - y : array_like - Argument - out : ndarray, optional - Optional output array for the function results - - Returns - ------- - z : scalar or ndarray - Computed x*log(y) - - Notes - ----- - The log function used in the computation is the natural log. - - .. versionadded:: 0.13.0 - - Examples - -------- - We can use this function to calculate the binary logistic loss also - known as the binary cross entropy. This loss function is used for - binary classification problems and is defined as: - - .. math:: - L = 1/n * \\sum_{i=0}^n -(y_i*log(y\\_pred_i) + (1-y_i)*log(1-y\\_pred_i)) - - We can define the parameters `x` and `y` as y and y_pred respectively. - y is the array of the actual labels which over here can be either 0 or 1. - y_pred is the array of the predicted probabilities with respect to - the positive class (1). - - >>> import numpy as np - >>> from scipy.special import xlogy - >>> y = np.array([0, 1, 0, 1, 1, 0]) - >>> y_pred = np.array([0.3, 0.8, 0.4, 0.7, 0.9, 0.2]) - >>> n = len(y) - >>> loss = -(xlogy(y, y_pred) + xlogy(1 - y, 1 - y_pred)).sum() - >>> loss /= n - >>> loss - 0.29597052165495025 - - A lower loss is usually better as it indicates that the predictions are - similar to the actual labels. In this example since our predicted - probabilities are close to the actual labels, we get an overall loss - that is reasonably low and appropriate. - - """) - -add_newdoc("xlog1py", - """ - xlog1py(x, y, out=None) - - Compute ``x*log1p(y)`` so that the result is 0 if ``x = 0``. - - Parameters - ---------- - x : array_like - Multiplier - y : array_like - Argument - out : ndarray, optional - Optional output array for the function results - - Returns - ------- - z : scalar or ndarray - Computed x*log1p(y) - - Notes - ----- - - .. versionadded:: 0.13.0 - - Examples - -------- - This example shows how the function can be used to calculate the log of - the probability mass function for a geometric discrete random variable. - The probability mass function of the geometric distribution is defined - as follows: - - .. math:: f(k) = (1-p)^{k-1} p - - where :math:`p` is the probability of a single success - and :math:`1-p` is the probability of a single failure - and :math:`k` is the number of trials to get the first success. - - >>> import numpy as np - >>> from scipy.special import xlog1py - >>> p = 0.5 - >>> k = 100 - >>> _pmf = np.power(1 - p, k - 1) * p - >>> _pmf - 7.888609052210118e-31 - - If we take k as a relatively large number the value of the probability - mass function can become very low. In such cases taking the log of the - pmf would be more suitable as the log function can change the values - to a scale that is more appropriate to work with. - - >>> _log_pmf = xlog1py(k - 1, -p) + np.log(p) - >>> _log_pmf - -69.31471805599453 - - We can confirm that we get a value close to the original pmf value by - taking the exponential of the log pmf. 
- - >>> _orig_pmf = np.exp(_log_pmf) - >>> np.isclose(_pmf, _orig_pmf) - True - - """) - add_newdoc("yn", r""" yn(n, x, out=None) diff --git a/scipy/special/_basic.py b/scipy/special/_basic.py index ea8945333a50..292569eeaca1 100644 --- a/scipy/special/_basic.py +++ b/scipy/special/_basic.py @@ -2985,14 +2985,20 @@ def _factorialx_approx_core(n, k, extend): # scalar case separately, unified handling would be inefficient for arrays; # don't use isscalar due to numpy/numpy#23574; 0-dim arrays treated below if not isinstance(n, np.ndarray): - return ( - np.power(k, (n - n_mod_k) / k) - * gamma(n / k + 1) / gamma(n_mod_k / k + 1) - * max(n_mod_k, 1) - ) + with warnings.catch_warnings(): + # large n cause overflow warnings, but infinity is fine + warnings.simplefilter("ignore", RuntimeWarning) + return ( + np.power(k, (n - n_mod_k) / k) + * gamma(n / k + 1) / gamma(n_mod_k / k + 1) + * max(n_mod_k, 1) + ) # factor that's independent of the residue class (see factorialk docstring) - result = np.power(k, n / k) * gamma(n / k + 1) + with warnings.catch_warnings(): + # large n cause overflow warnings, but infinity is fine + warnings.simplefilter("ignore", RuntimeWarning) + result = np.power(k, n / k) * gamma(n / k + 1) # factor dependent on residue r (for `r=0` it's 1, so we skip `r=0` # below and thus also avoid evaluating `max(r, 1)`) def corr(k, r): return np.power(k, -r / k) / gamma(r / k + 1) * r @@ -3099,8 +3105,8 @@ def _factorialx_wrapper(fname, n, k, exact, extend): elif n in {0, 1}: return 1 if exact else np.float64(1) elif exact and _is_subdtype(type(n), "i"): - # calculate with integers - return _range_prod(1, n, k=k) + # calculate with integers; cast away other int types (like unsigned) + return _range_prod(1, int(n), k=k) elif exact: # only relevant for factorial raise ValueError(msg_exact_not_possible.format(dtype=type(n))) diff --git a/scipy/special/_cunity.pxd b/scipy/special/_cunity.pxd deleted file mode 100644 index 06eecbd30cec..000000000000 --- a/scipy/special/_cunity.pxd +++ /dev/null @@ -1,118 +0,0 @@ -cimport numpy as np -from libc.math cimport fabs, sin, cos, exp, atan2 - -from ._complexstuff cimport ( - zisfinite, zabs, zpack, npy_cdouble_from_double_complex, - double_complex_from_npy_cdouble) - -cdef extern from "_complexstuff.h": - np.npy_cdouble npy_clog(np.npy_cdouble x) nogil - np.npy_cdouble npy_cexp(np.npy_cdouble x) nogil - - -cdef extern from "dd_real_wrappers.h": - ctypedef struct double2: - double hi - double lo - - double2 dd_create_d(double x) nogil - double2 dd_add(const double2* a, const double2* b) nogil - double2 dd_mul(const double2* a, const double2* b) nogil - double dd_to_double(const double2* a) nogil - -cdef extern from "xsf_wrappers.h" nogil: - double xsf_cosm1(double x) - double cephes_expm1_wrap(double x) - double cephes_log1p_wrap(double x) - -# log(z + 1) = log(x + 1 + 1j*y) -# = log(sqrt((x+1)**2 + y**2)) + 1j*atan2(y, x+1) -# -# Using atan2(y, x+1) for the imaginary part is always okay. The real part -# needs to be calculated more carefully. For |z| large, the naive formula -# log(z + 1) can be used. When |z| is small, rewrite as -# -# log(sqrt((x+1)**2 + y**2)) = 0.5*log(x**2 + 2*x +1 + y**2) -# = 0.5 * log1p(x**2 + y**2 + 2*x) -# = 0.5 * log1p(hypot(x,y) * (hypot(x, y) + 2*x/hypot(x,y))) -# -# This expression suffers from cancellation when x < 0 and -# y = +/-sqrt(2*fabs(x)). To get around this cancellation problem, we use -# double-double precision when necessary. 
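The cancellation described above is easy to demonstrate. Below is a minimal NumPy sketch (illustrative only, not SciPy code; the input is a hypothetical point near the problematic curve y = sqrt(2*|x|), where the true real part of log(1 + z) is about x**2/2):

    import numpy as np

    x = -1e-9
    y = np.sqrt(2 * abs(x))                  # close to the cancellation curve
    # naive real part of log(1 + z)
    naive = 0.5 * np.log((x + 1.0)**2 + y**2)
    # rewritten form 0.5*log1p(x**2 + y**2 + 2*x), evaluated in extended
    # precision (np.longdouble; note this is plain double on some platforms)
    xl, yl = np.longdouble(x), np.longdouble(y)
    ref = 0.5 * np.log1p(xl * xl + yl * yl + 2 * xl)
    print(naive)        # dominated by rounding error, often exactly 0.0
    print(float(ref))   # ~5e-19, the correct order of magnitude

This is the regime in which clog1p below falls back to double-double arithmetic via clog1p_ddouble.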
-cdef inline double complex clog1p(double complex z) noexcept nogil: - cdef double zr, zi, x, y, az, azi - cdef np.npy_cdouble ret - - if not zisfinite(z): - z = z + 1 - ret = npy_clog(npy_cdouble_from_double_complex(z)) - return double_complex_from_npy_cdouble(ret) - - zr = z.real - zi = z.imag - - if zi == 0.0 and zr >= -1.0: - return zpack(cephes_log1p_wrap(zr), 0.0) - - az = zabs(z) - if az < 0.707: - azi = fabs(zi) - if zr < 0 and fabs(-zr - azi*azi/2)/(-zr) < 0.5: - return clog1p_ddouble(zr, zi) - else: - x = 0.5 * cephes_log1p_wrap(az*(az + 2*zr/az)) - y = atan2(zi, zr + 1.0) - return zpack(x, y) - - z = z + 1.0 - ret = npy_clog(npy_cdouble_from_double_complex(z)) - return double_complex_from_npy_cdouble(ret) - -cdef inline double complex clog1p_ddouble(double zr, double zi) noexcept nogil: - cdef double x, y - cdef double2 r, i, two, rsqr, isqr, rtwo, absm1 - - r = dd_create_d(zr) - i = dd_create_d(zi) - two = dd_create_d(2.0) - - rsqr = dd_mul(&r,& r) - isqr = dd_mul(&i, &i) - rtwo = dd_mul(&two, &r) - absm1 = dd_add(&rsqr, &isqr) - absm1 = dd_add(&absm1, &rtwo) - - x = 0.5 * cephes_log1p_wrap(dd_to_double(&absm1)) - y = atan2(zi, zr+1.0) - return zpack(x, y) - -# cexpm1(z) = cexp(z) - 1 - -# The imaginary part of this is easily computed via exp(z.real)*sin(z.imag) -# The real part is difficult to compute when there is cancellation e.g. when -# z.real = -log(cos(z.imag)). There isn't a way around this problem that -# doesn't involve computing exp(z.real) and/or cos(z.imag) to higher -# precision. -cdef inline double complex cexpm1(double complex z) noexcept nogil: - cdef double zr, zi, ezr, x, y - cdef np.npy_cdouble ret - - if not zisfinite(z): - ret = npy_cexp(npy_cdouble_from_double_complex(z)) - return double_complex_from_npy_cdouble(ret) - 1.0 - - zr = z.real - zi = z.imag - - if zr <= -40: - x = -1.0 - else: - ezr = cephes_expm1_wrap(zr) - x = ezr*cos(zi) + xsf_cosm1(zi) - # don't compute exp(zr) too, unless necessary - if zr > -1.0: - y = (ezr + 1.0)*sin(zi) - else: - y = exp(zr)*sin(zi) - - return zpack(x, y) diff --git a/scipy/special/_faddeeva.cxx b/scipy/special/_faddeeva.cxx deleted file mode 100644 index 7fddc1125025..000000000000 --- a/scipy/special/_faddeeva.cxx +++ /dev/null @@ -1,190 +0,0 @@ -#include "_faddeeva.h" - -#include <complex> -#include <cmath> - -using namespace std; - -extern "C" { - -npy_cdouble faddeeva_w(npy_cdouble zp) -{ - complex<double> z(npy_creal(zp), npy_cimag(zp)); - std::complex<double> w = Faddeeva::w(z); - return npy_cpack(real(w), imag(w)); -} - -npy_cdouble faddeeva_erf(npy_cdouble zp) -{ - complex<double> z(npy_creal(zp), npy_cimag(zp)); - complex<double> w = Faddeeva::erf(z); - return npy_cpack(real(w), imag(w)); -} - -double faddeeva_erfc(double x) -{ - return Faddeeva::erfc(x); -} - -npy_cdouble faddeeva_erfc_complex(npy_cdouble zp) -{ - complex<double> z(npy_creal(zp), npy_cimag(zp)); - complex<double> w = Faddeeva::erfc(z); - return npy_cpack(real(w), imag(w)); -} - -double faddeeva_erfcx(double x) -{ - return Faddeeva::erfcx(x); -} - -npy_cdouble faddeeva_erfcx_complex(npy_cdouble zp) -{ - complex<double> z(npy_creal(zp), npy_cimag(zp)); - complex<double> w = Faddeeva::erfcx(z); - return npy_cpack(real(w), imag(w)); -} - -double faddeeva_erfi(double x) -{ - return Faddeeva::erfi(x); -} - -npy_cdouble faddeeva_erfi_complex(npy_cdouble zp) -{ - complex<double> z(npy_creal(zp), npy_cimag(zp)); - complex<double> w = Faddeeva::erfi(z); - return npy_cpack(real(w), imag(w)); -} - -double faddeeva_dawsn(double x) -{ - return Faddeeva::Dawson(x); -} - -npy_cdouble faddeeva_dawsn_complex(npy_cdouble zp) -{ - complex<double> z(npy_creal(zp),
npy_cimag(zp)); - complex<double> w = Faddeeva::Dawson(z); - return npy_cpack(real(w), imag(w)); -} - -/* - * A wrapper for a normal CDF for complex argument - */ - -npy_cdouble faddeeva_ndtr(npy_cdouble zp) -{ - complex<double> z(npy_creal(zp), npy_cimag(zp)); - z *= M_SQRT1_2; - complex<double> w = 0.5 * Faddeeva::erfc(-z); - return npy_cpack(real(w), imag(w)); -} - -/* - * Log of the CDF of the normal distribution for double x. - * - * Let F(x) be the CDF of the standard normal distribution. - * This implementation of log(F(x)) is based on the identities - * - * F(x) = erfc(-x/√2)/2 - * = 1 - erfc(x/√2)/2 - * - * We use the first formula for x < -1, with erfc(z) replaced - * by erfcx(z)*exp(-z**2) to ensure high precision for large - * negative values when we take the logarithm: - * - * log F(x) = log(erfc(-x/√2)/2) - * = log(erfcx(-x/√2)/2)*exp(-x**2/2)) - * = log(erfcx(-x/√2)/2) - x**2/2 - * - * For x >= -1, we use the second formula for F(x): - * - * log F(x) = log(1 - erfc(x/√2)/2) - * = log1p(-erfc(x/√2)/2) - */ -double faddeeva_log_ndtr(double x) -{ - double t = x*M_SQRT1_2; - if (x < -1.0) { - return log(faddeeva_erfcx(-t)/2) - t*t; - } - else { - return log1p(-faddeeva_erfc(t)/2); - } -} - -/* - * Log of the normal CDF for complex arguments. - * - * This is equivalent to log(ndtr(z)), but is more robust to overflow at $z\to\infty$. - * This implementation uses the Faddeva computation, $\erfc(z) = \exp(-z^2) w(iz)$, - * taking special care to select the principal branch of the log function - * log( exp(-z^2) w(i z) ) - */ -npy_cdouble faddeeva_log_ndtr_complex(npy_cdouble zp) -{ - complex<double> z(npy_creal(zp), npy_cimag(zp)); - if (npy_creal(zp) > 6) { - // Underflow. Close to the real axis, expand the log in log(1 - ndtr(-z)). - complex<double> w = -0.5 * Faddeeva::erfc(z*M_SQRT1_2); - if (abs(w) < 1e-8) { - return npy_cpack(real(w), imag(w)); - } - } - - z *= -M_SQRT1_2; - double x = real(z), y = imag(z); - - /* Compute the principal branch of $log(exp(-z^2))$, using the fact that - * $log(e^t) = log|e^t| + i Arg(e^t)$, and that if $t = r + is$, then - * $e^t = e^r (\cos(s) + i \sin(s))$. - */ - double mRe_z2 = (y - x) * (x + y); // Re(-z^2), being careful of overflow - double mIm_z2 = -2*x*y; // Im(-z^2) - - double im = fmod(mIm_z2, 2.0*M_PI); - if (im > M_PI) {im -= 2.0*M_PI;} - - complex<double> val1 = complex<double>(mRe_z2, im); - - complex<double> val2 = log(Faddeeva::w(complex<double>(-y, x))); - complex<double> result = val1 + val2 - NPY_LOGE2; - - /* Again, select the principal branch: log(z) = log|z| + i arg(z), thus - * the imaginary part of the result should belong to [-pi, pi].
- */ - im = imag(result); - if (im >= M_PI){ im -= 2*M_PI; } - if (im < -M_PI){ im += 2*M_PI; } - - return npy_cpack(real(result), im); -} - -double faddeeva_voigt_profile(double x, double sigma, double gamma) -{ - const double INV_SQRT_2 = 0.707106781186547524401; - const double SQRT_2PI = 2.5066282746310002416123552393401042; - - if(sigma == 0){ - if (gamma == 0){ - if (std::isnan(x)) - return x; - if (x == 0) - return INFINITY; - return 0; - } - return gamma / M_PI / (x*x + gamma*gamma); - } - if (gamma == 0){ - return 1 / SQRT_2PI / sigma * exp(-(x/sigma)*(x/sigma) / 2); - } - - double zreal = x / sigma * INV_SQRT_2; - double zimag = gamma / sigma * INV_SQRT_2; - std::complex<double> z(zreal, zimag); - std::complex<double> w = Faddeeva::w(z); - return real(w) / sigma / SQRT_2PI; -} - -} // extern "C" diff --git a/scipy/special/_faddeeva.h b/scipy/special/_faddeeva.h deleted file mode 100644 index 01fcb7ce1a35..000000000000 --- a/scipy/special/_faddeeva.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef FADDEEVA_H_ -#define FADDEEVA_H_ - -#ifdef __cplusplus -#define EXTERN_C_START extern "C" { -#define EXTERN_C_END } -#else -#define EXTERN_C_START -#define EXTERN_C_END -#endif - -#include <complex> -#include <cmath> - -#include "Faddeeva.hh" - -EXTERN_C_START - -#include <numpy/npy_math.h> - -npy_cdouble faddeeva_w(npy_cdouble zp); -npy_cdouble faddeeva_erf(npy_cdouble zp); - -double faddeeva_erfc(double x); -npy_cdouble faddeeva_erfc_complex(npy_cdouble zp); - -double faddeeva_erfcx(double x); -npy_cdouble faddeeva_erfcx_complex(npy_cdouble zp); - -double faddeeva_erfi(double zp); -npy_cdouble faddeeva_erfi_complex(npy_cdouble zp); - -double faddeeva_dawsn(double zp); -npy_cdouble faddeeva_dawsn_complex(npy_cdouble zp); - -npy_cdouble faddeeva_ndtr(npy_cdouble zp); - -double faddeeva_log_ndtr(double x); -npy_cdouble faddeeva_log_ndtr_complex(npy_cdouble zp); - -double faddeeva_voigt_profile(double x, double sigma, double gamma); - -EXTERN_C_END - -#endif diff --git a/scipy/special/_generate_pyx.py b/scipy/special/_generate_pyx.py index b61521e32105..42e4acfcf2e3 100644 --- a/scipy/special/_generate_pyx.py +++ b/scipy/special/_generate_pyx.py @@ -96,7 +96,9 @@ 'k0', 'k0e', 'k1', 'k1e', 'y0', 'y1', 'j0', 'j1', 'struve', 'modstruve', 'beta', 'betaln', 'besselpoly', 'gammaln', 'gammasgn', 'cbrt', 'radian', 'cosm1', 'gammainc', 'gammaincinv', 'gammaincc', 'gammainccinv', 'fresnel', 'ellipe', - 'ellipeinc', 'ellipk', 'ellipkinc', 'ellipkm1', 'ellipj', '_riemann_zeta' + 'ellipeinc', 'ellipk', 'ellipkinc', 'ellipkm1', 'ellipj', '_riemann_zeta', 'erf', + 'erfc', 'erfcx', 'erfi', 'voigt_profile', 'wofz', 'dawsn', 'ndtr', 'log_ndtr', + 'exp2', 'exp10', 'expm1', 'log1p', 'xlogy', 'xlog1py', '_log1pmx' ] # ----------------------------------------------------------------------------- diff --git a/scipy/special/_hyp0f1.pxd b/scipy/special/_hyp0f1.pxd index bb4eff873520..d68710c8bc83 100644 --- a/scipy/special/_hyp0f1.pxd +++ b/scipy/special/_hyp0f1.pxd @@ -1,7 +1,6 @@ from libc.math cimport pow, sqrt, floor, log, log1p, exp, M_PI, NAN, fabs, isinf cimport numpy as np -from ._xlogy cimport xlogy from ._complexstuff cimport ( zsqrt, zpow, zabs, npy_cdouble_from_double_complex, double_complex_from_npy_cdouble) @@ -21,6 +20,7 @@ cdef extern from "xsf_wrappers.h": np.npy_cdouble special_ccyl_bessel_i(double v, np.npy_cdouble z) nogil np.npy_cdouble special_ccyl_bessel_j(double v, np.npy_cdouble z) nogil double xsf_sinpi(double x) nogil + double xsf_xlogy(double x, double y) nogil cdef extern from "numpy/npy_math.h": double npy_creal(np.npy_cdouble z) nogil @@ -43,7 
+43,7 @@ cdef inline double _hyp0f1_real(double v, double z) noexcept nogil: if z > 0: arg = sqrt(z) - arg_exp = xlogy(1.0-v, arg) + xsf_gammaln(v) + arg_exp = xsf_xlogy(1.0-v, arg) + xsf_gammaln(v) bess_val = xsf_iv(v-1, 2.0*arg) if (arg_exp > log(DBL_MAX) or bess_val == 0 or # overflow @@ -91,11 +91,11 @@ cdef inline double _hyp0f1_asy(double v, double z) noexcept nogil: u3 = (30375.0 - 369603.0*p2 + 765765.0*p4 - 425425.0*p6) * pp * p2 / 414720.0 u_corr_i = 1.0 + u1/v1 + u2/(v1*v1) + u3/(v1*v1*v1) - result = exp(arg_exp_i - xlogy(v1, arg)) * gs * u_corr_i + result = exp(arg_exp_i - xsf_xlogy(v1, arg)) * gs * u_corr_i if v - 1 < 0: # DLMF 10.27.2: I_{-v} = I_{v} + (2/pi) sin(pi*v) K_v u_corr_k = 1.0 - u1/v1 + u2/(v1*v1) - u3/(v1*v1*v1) - result += exp(arg_exp_k + xlogy(v1, arg)) * gs * 2.0 * xsf_sinpi(v1) * u_corr_k + result += exp(arg_exp_k + xsf_xlogy(v1, arg)) * gs * 2.0 * xsf_sinpi(v1) * u_corr_k return result diff --git a/scipy/special/_logsumexp.py b/scipy/special/_logsumexp.py index 5f18e41ee73e..ef81907ce8d5 100644 --- a/scipy/special/_logsumexp.py +++ b/scipy/special/_logsumexp.py @@ -147,7 +147,9 @@ def _wrap_radians(x, xp=None): out = -((-x + math.pi) % (2 * math.pi) - math.pi) # preserve relative precision no_wrap = xp.abs(x) < xp.pi - out[no_wrap] = x[no_wrap] + # TODO: double-check that this xp.where matches the masked assignment below + # out[no_wrap] = x[no_wrap] + out = xp.where(no_wrap, x, out) return out @@ -202,7 +204,10 @@ def _logsumexp(a, b, axis, return_sign, xp): a_max, i_max = _elements_and_indices_with_max_real(a, axis=axis, xp=xp) # for precision, these terms are separated out of the main sum. - a[i_max] = -xp.inf + # TODO: we shouldn't be mutating `a` in place here unless we make a copy first; + # Dask arrays can reach this point without having been copied + # a[i_max] = -xp.inf + a = xp.where(i_max, -xp.asarray(xp.inf, dtype=a.dtype), a) i_max_dt = xp.astype(i_max, a.dtype) # This is an inefficient way of getting `m` because it is the sum of a sparse # array; however, this is the simplest way I can think of to get the right shape. 
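The two xp.where rewrites in _logsumexp.py above share one pattern: a boolean-mask assignment like ``a[mask] = v`` assumes a writable, materialized array, which lazy (Dask) and immutable (JAX) backends do not guarantee, so the result is rebuilt functionally instead. A minimal sketch of the pattern, with NumPy standing in for the array-API namespace ``xp`` (the helper name is illustrative, not SciPy API):

    import numpy as np

    def replace_where(a, mask, value, xp=np):
        # Functional equivalent of ``a[mask] = value``: builds a new array
        # instead of mutating `a`, so it also works for backends whose
        # arrays are lazy or immutable.
        value = xp.asarray(value, dtype=a.dtype)
        return xp.where(mask, value, a)

    a = np.array([1.0, 2.0, 3.0])
    out = replace_where(a, a == a.max(), -np.inf)
    print(out)   # [  1.   2. -inf]
    print(a)     # [1. 2. 3.] -- the input is untouched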
diff --git a/scipy/special/_special_ufuncs.cpp b/scipy/special/_special_ufuncs.cpp index fa5d20d4a935..85d200ee4984 100644 --- a/scipy/special/_special_ufuncs.cpp +++ b/scipy/special/_special_ufuncs.cpp @@ -12,6 +12,8 @@ #include "xsf/binom.h" #include "xsf/digamma.h" #include "xsf/ellip.h" +#include "xsf/erf.h" +#include "xsf/exp.h" #include "xsf/expint.h" #include "xsf/fresnel.h" #include "xsf/gamma.h" @@ -20,6 +22,7 @@ #include "xsf/kelvin.h" #include "xsf/lambertw.h" #include "xsf/legendre.h" +#include "xsf/log.h" #include "xsf/log_exp.h" #include "xsf/mathieu.h" #include "xsf/par_cyl.h" @@ -27,6 +30,7 @@ #include "xsf/sph_bessel.h" #include "xsf/sph_harm.h" #include "xsf/sphd_wave.h" +#include "xsf/stats.h" #include "xsf/struve.h" #include "xsf/trig.h" #include "xsf/wright_bessel.h" @@ -43,6 +47,7 @@ extern const char *_cospi_doc; extern const char *_sinpi_doc; +extern const char *_log1pmx_doc; extern const char *airy_doc; extern const char *airye_doc; extern const char *bei_doc; @@ -57,13 +62,21 @@ extern const char *cbrt_doc; extern const char *cosdg_doc; extern const char *cosm1_doc; extern const char *cotdg_doc; +extern const char *dawsn_doc; extern const char *ellipe_doc; extern const char *ellipeinc_doc; extern const char *ellipj_doc; extern const char *ellipk_doc; extern const char *ellipkm1_doc; extern const char *ellipkinc_doc; +extern const char *erf_doc; +extern const char *erfc_doc; +extern const char *erfcx_doc; +extern const char *erfi_doc; extern const char *exp1_doc; +extern const char *expm1_doc; +extern const char *exp2_doc; +extern const char *exp10_doc; extern const char *expi_doc; extern const char *expit_doc; extern const char *exprel_doc; @@ -112,9 +125,11 @@ extern const char *k1e_doc; extern const char *kv_doc; extern const char *kve_doc; extern const char *lambertw_doc; +extern const char *log1p_doc; extern const char *logit_doc; extern const char *loggamma_doc; extern const char *log_expit_doc; +extern const char *log_ndtr_doc; extern const char *log_wright_bessel_doc; extern const char *mathieu_a_doc; extern const char *mathieu_b_doc; @@ -126,6 +141,7 @@ extern const char *mathieu_modsem2_doc; extern const char *mathieu_sem_doc; extern const char *modfresnelm_doc; extern const char *modfresnelp_doc; +extern const char *ndtr_doc; extern const char *obl_ang1_doc; extern const char *obl_ang1_cv_doc; extern const char *obl_cv_doc; @@ -162,7 +178,11 @@ extern const char *sph_harm_doc; extern const char *struve_h_doc; extern const char *struve_l_doc; extern const char *tandg_doc; +extern const char *voigt_profile_doc; +extern const char *wofz_doc; extern const char *wright_bessel_doc; +extern const char *xlogy_doc; +extern const char *xlog1py_doc; extern const char *y0_doc; extern const char *y1_doc; extern const char *yv_doc; @@ -210,9 +230,10 @@ PyMODINIT_FUNC PyInit__special_ufuncs() { "_sinpi", _sinpi_doc); PyModule_AddObjectRef(_special_ufuncs, "_sinpi", _sinpi); - PyObject *_zeta = xsf::numpy::ufunc( - {static_cast(xsf::zeta), static_cast(xsf::zeta), - static_cast(xsf::zeta), static_cast(xsf::zeta)}, "_zeta", _zeta_doc); + PyObject *_zeta = + xsf::numpy::ufunc({static_cast(xsf::zeta), static_cast(xsf::zeta), + static_cast(xsf::zeta), static_cast(xsf::zeta)}, + "_zeta", _zeta_doc); PyModule_AddObjectRef(_special_ufuncs, "_zeta", _zeta); PyObject *airy = @@ -326,6 +347,70 @@ PyMODINIT_FUNC PyInit__special_ufuncs() { {static_cast(xsf::exprel), static_cast(xsf::exprel)}, "exprel", exprel_doc); PyModule_AddObjectRef(_special_ufuncs, "exprel", exprel); + PyObject 
*expm1 = + xsf::numpy::ufunc({static_cast(xsf::expm1), static_cast(xsf::expm1), + static_cast(xsf::expm1), static_cast(xsf::expm1)}, + "expm1", expm1_doc); + PyModule_AddObjectRef(_special_ufuncs, "expm1", expm1); + + PyObject *exp2 = xsf::numpy::ufunc( + {static_cast(xsf::exp2), static_cast(xsf::exp2)}, "exp2", exp2_doc); + PyModule_AddObjectRef(_special_ufuncs, "exp2", exp2); + + PyObject *exp10 = xsf::numpy::ufunc( + {static_cast(xsf::exp10), static_cast(xsf::exp10)}, "exp10", exp10_doc); + PyModule_AddObjectRef(_special_ufuncs, "exp10", exp10); + + PyObject *erf = xsf::numpy::ufunc({static_cast(xsf::erf), static_cast(xsf::erf), + static_cast(xsf::erf), static_cast(xsf::erf)}, + "erf", erf_doc); + PyModule_AddObjectRef(_special_ufuncs, "erf", erf); + + PyObject *erfc = + xsf::numpy::ufunc({static_cast(xsf::erfc), static_cast(xsf::erfc), + static_cast(xsf::erfc), static_cast(xsf::erfc)}, + "erfc", erfc_doc); + PyModule_AddObjectRef(_special_ufuncs, "erfc", erfc); + + PyObject *erfcx = + xsf::numpy::ufunc({static_cast(xsf::erfcx), static_cast(xsf::erfcx), + static_cast(xsf::erfcx), static_cast(xsf::erfcx)}, + "erfcx", erfcx_doc); + PyModule_AddObjectRef(_special_ufuncs, "erfcx", erfcx); + + PyObject *erfi = + xsf::numpy::ufunc({static_cast(xsf::erfi), static_cast(xsf::erfi), + static_cast(xsf::erfi), static_cast(xsf::erfi)}, + "erfi", erfi_doc); + PyModule_AddObjectRef(_special_ufuncs, "erfi", erfi); + + PyObject *voigt_profile = xsf::numpy::ufunc( + {static_cast(xsf::voigt_profile), static_cast(xsf::voigt_profile)}, + "voigt_profile", voigt_profile_doc); + PyModule_AddObjectRef(_special_ufuncs, "voigt_profile", voigt_profile); + + PyObject *wofz = xsf::numpy::ufunc( + {static_cast(xsf::wofz), static_cast(xsf::wofz)}, "wofz", wofz_doc); + PyModule_AddObjectRef(_special_ufuncs, "wofz", wofz); + + PyObject *dawsn = + xsf::numpy::ufunc({static_cast(xsf::dawsn), static_cast(xsf::dawsn), + static_cast(xsf::dawsn), static_cast(xsf::dawsn)}, + "dawsn", dawsn_doc); + PyModule_AddObjectRef(_special_ufuncs, "dawsn", dawsn); + + PyObject *ndtr = + xsf::numpy::ufunc({static_cast(xsf::ndtr), static_cast(xsf::ndtr), + static_cast(xsf::ndtr), static_cast(xsf::ndtr)}, + "ndtr", ndtr_doc); + PyModule_AddObjectRef(_special_ufuncs, "ndtr", ndtr); + + PyObject *log_ndtr = + xsf::numpy::ufunc({static_cast(xsf::log_ndtr), static_cast(xsf::log_ndtr), + static_cast(xsf::log_ndtr), static_cast(xsf::log_ndtr)}, + "log_ndtr", log_ndtr_doc); + PyModule_AddObjectRef(_special_ufuncs, "log_ndtr", log_ndtr); + PyObject *fresnel = xsf::numpy::ufunc({static_cast(xsf::fresnel), static_cast(xsf::fresnel), static_cast(xsf::fresnel), static_cast(xsf::fresnel)}, @@ -466,7 +551,8 @@ PyMODINIT_FUNC PyInit__special_ufuncs() { PyModule_AddObjectRef(_special_ufuncs, "_iv_ratio", iv_ratio); PyObject *iv_ratio_c = xsf::numpy::ufunc( - {static_cast(xsf::iv_ratio_c), static_cast(xsf::iv_ratio_c)}, "_iv_ratio_c", iv_ratio_c_doc); + {static_cast(xsf::iv_ratio_c), static_cast(xsf::iv_ratio_c)}, "_iv_ratio_c", + iv_ratio_c_doc); PyModule_AddObjectRef(_special_ufuncs, "_iv_ratio_c", iv_ratio_c); PyObject *ive = xsf::numpy::ufunc( @@ -550,6 +636,29 @@ PyMODINIT_FUNC PyInit__special_ufuncs() { "kve", kve_doc); PyModule_AddObjectRef(_special_ufuncs, "kve", kve); + PyObject *log1p = + xsf::numpy::ufunc({static_cast(xsf::log1p), static_cast(xsf::log1p), + static_cast(xsf::log1p), static_cast(xsf::log1p)}, + "log1p", log1p_doc); + PyModule_AddObjectRef(_special_ufuncs, "log1p", log1p); + + PyObject *_log1pmx = + 
xsf::numpy::ufunc({static_cast(xsf::log1pmx), static_cast(xsf::log1pmx)}, + "_log1pmx", _log1pmx_doc); + PyModule_AddObjectRef(_special_ufuncs, "_log1pmx", _log1pmx); + + PyObject *xlogy = + xsf::numpy::ufunc({static_cast(xsf::xlogy), static_cast(xsf::xlogy), + static_cast(xsf::xlogy), static_cast(xsf::xlogy)}, + "xlogy", xlogy_doc); + PyModule_AddObjectRef(_special_ufuncs, "xlogy", xlogy); + + PyObject *xlog1py = + xsf::numpy::ufunc({static_cast(xsf::xlog1py), static_cast(xsf::xlog1py), + static_cast(xsf::xlog1py), static_cast(xsf::xlog1py)}, + "xlog1py", xlog1py_doc); + PyModule_AddObjectRef(_special_ufuncs, "xlog1py", xlog1py); + PyObject *log_expit = xsf::numpy::ufunc({static_cast(xsf::log_expit), static_cast(xsf::log_expit), static_cast(xsf::log_expit)}, @@ -843,9 +952,8 @@ PyMODINIT_FUNC PyInit__special_ufuncs() { PyModule_AddObjectRef(_special_ufuncs, "rgamma", rgamma); PyObject *_riemann_zeta = xsf::numpy::ufunc( - {static_cast(xsf::riemann_zeta), static_cast(xsf::riemann_zeta), - static_cast(xsf::riemann_zeta), static_cast(xsf::riemann_zeta) - }, + {static_cast(xsf::riemann_zeta), static_cast(xsf::riemann_zeta), + static_cast(xsf::riemann_zeta), static_cast(xsf::riemann_zeta)}, "_riemann_zeta", _riemann_zeta_doc); PyModule_AddObjectRef(_special_ufuncs, "_riemann_zeta", _riemann_zeta); diff --git a/scipy/special/_special_ufuncs_docs.cpp b/scipy/special/_special_ufuncs_docs.cpp index 67b11e294445..24b9f28fea28 100644 --- a/scipy/special/_special_ufuncs_docs.cpp +++ b/scipy/special/_special_ufuncs_docs.cpp @@ -1199,6 +1199,271 @@ const char *ellipkinc_doc = R"( 2020-09-15. See Sec. 19.25(i) https://dlmf.nist.gov/19.25#i )"; +const char *xlogy_doc = R"( + xlogy(x, y, out=None) + + Compute ``x*log(y)`` so that the result is 0 if ``x = 0``. + + Parameters + ---------- + x : array_like + Multiplier + y : array_like + Argument + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + z : scalar or ndarray + Computed x*log(y) + + Notes + ----- + The log function used in the computation is the natural log. + + .. versionadded:: 0.13.0 + + Examples + -------- + We can use this function to calculate the binary logistic loss also + known as the binary cross entropy. This loss function is used for + binary classification problems and is defined as: + + .. math:: + L = 1/n * \\sum_{i=0}^n -(y_i*log(y\\_pred_i) + (1-y_i)*log(1-y\\_pred_i)) + + We can define the parameters `x` and `y` as y and y_pred respectively. + y is the array of the actual labels which over here can be either 0 or 1. + y_pred is the array of the predicted probabilities with respect to + the positive class (1). + + >>> import numpy as np + >>> from scipy.special import xlogy + >>> y = np.array([0, 1, 0, 1, 1, 0]) + >>> y_pred = np.array([0.3, 0.8, 0.4, 0.7, 0.9, 0.2]) + >>> n = len(y) + >>> loss = -(xlogy(y, y_pred) + xlogy(1 - y, 1 - y_pred)).sum() + >>> loss /= n + >>> loss + 0.29597052165495025 + + A lower loss is usually better as it indicates that the predictions are + similar to the actual labels. In this example since our predicted + probabilities are close to the actual labels, we get an overall loss + that is reasonably low and appropriate. + )"; + +const char *xlog1py_doc = R"( + xlog1py(x, y, out=None) + + Compute ``x*log1p(y)`` so that the result is 0 if ``x = 0``. 
+ + Parameters + ---------- + x : array_like + Multiplier + y : array_like + Argument + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + z : scalar or ndarray + Computed x*log1p(y) + + Notes + ----- + + .. versionadded:: 0.13.0 + + Examples + -------- + This example shows how the function can be used to calculate the log of + the probability mass function for a geometric discrete random variable. + The probability mass function of the geometric distribution is defined + as follows: + + .. math:: f(k) = (1-p)^{k-1} p + + where :math:`p` is the probability of a single success + and :math:`1-p` is the probability of a single failure + and :math:`k` is the number of trials to get the first success. + + >>> import numpy as np + >>> from scipy.special import xlog1py + >>> p = 0.5 + >>> k = 100 + >>> _pmf = np.power(1 - p, k - 1) * p + >>> _pmf + 7.888609052210118e-31 + + If we take k as a relatively large number the value of the probability + mass function can become very low. In such cases taking the log of the + pmf would be more suitable as the log function can change the values + to a scale that is more appropriate to work with. + + >>> _log_pmf = xlog1py(k - 1, -p) + np.log(p) + >>> _log_pmf + -69.31471805599453 + + We can confirm that we get a value close to the original pmf value by + taking the exponential of the log pmf. + + >>> _orig_pmf = np.exp(_log_pmf) + >>> np.isclose(_pmf, _orig_pmf) + True + )"; + +const char *_log1pmx_doc = R"( + Internal function, do not use. + )"; + +const char *log1p_doc = R"( + log1p(x, out=None) + + Calculates log(1 + x) for use when `x` is near zero. + + Parameters + ---------- + x : array_like + Real or complex valued input. + out : ndarray, optional + Optional output array for the function results. + + Returns + ------- + scalar or ndarray + Values of ``log(1 + x)``. + + See Also + -------- + expm1, cosm1 + + Examples + -------- + >>> import numpy as np + >>> import scipy.special as sc + + It is more accurate than using ``log(1 + x)`` directly for ``x`` + near 0. Note that in the below example ``1 + 1e-17 == 1`` to + double precision. + + >>> sc.log1p(1e-17) + 1e-17 + >>> np.log(1 + 1e-17) + 0.0 + )"; + +const char *expm1_doc = R"( + expm1(x, out=None) + + Compute ``exp(x) - 1``. + + When `x` is near zero, ``exp(x)`` is near 1, so the numerical calculation + of ``exp(x) - 1`` can suffer from catastrophic loss of precision. + ``expm1(x)`` is implemented to avoid the loss of precision that occurs when + `x` is near zero. + + Parameters + ---------- + x : array_like + `x` must contain real numbers. + out : ndarray, optional + Optional output array for the function values + + Returns + ------- + scalar or ndarray + ``exp(x) - 1`` computed element-wise. + + Examples + -------- + >>> import numpy as np + >>> from scipy.special import expm1 + + >>> expm1(1.0) + 1.7182818284590451 + >>> expm1([-0.2, -0.1, 0, 0.1, 0.2]) + array([-0.18126925, -0.09516258, 0. , 0.10517092, 0.22140276]) + + The exact value of ``exp(7.5e-13) - 1`` is:: + + 7.5000000000028125000000007031250000001318...*10**-13. + + Here is what ``expm1(7.5e-13)`` gives: + + >>> expm1(7.5e-13) + 7.5000000000028135e-13 + + Compare that to ``exp(7.5e-13) - 1``, where the subtraction results in + a "catastrophic" loss of precision: + + >>> np.exp(7.5e-13) - 1 + 7.5006667543675576e-13 + )"; + +const char *exp2_doc = R"( + exp2(x, out=None) + + Compute ``2**x`` element-wise. + + Parameters + ---------- + x : array_like + `x` must contain real numbers. 
+ out : ndarray, optional + Optional output array for the function values + + Returns + ------- + scalar or ndarray + ``2**x``, computed element-wise. + + Examples + -------- + >>> import numpy as np + >>> from scipy.special import exp2 + + >>> exp2(3) + 8.0 + >>> x = np.array([[-1, -0.5, 0], [0.5, 1, 1.5]]) + >>> exp2(x) + array([[ 0.5 , 0.70710678, 1. ], + [ 1.41421356, 2. , 2.82842712]]) + )"; + +const char *exp10_doc = R"( + exp10(x, out=None) + + Compute ``10**x`` element-wise. + + Parameters + ---------- + x : array_like + `x` must contain real numbers. + out : ndarray, optional + Optional output array for the function values + + Returns + ------- + scalar or ndarray + ``10**x``, computed element-wise. + + Examples + -------- + >>> import numpy as np + >>> from scipy.special import exp10 + + >>> exp10(3) + 1000.0 + >>> x = np.array([[-1, -0.5, 0], [0.5, 1, 1.5]]) + >>> exp10(x) + array([[ 0.1 , 0.31622777, 1. ], + [ 3.16227766, 10. , 31.6227766 ]]) + )"; + const char *exp1_doc = R"( exp1(z, out=None) @@ -1366,6 +1631,181 @@ const char *expi_doc = R"( )"; +const char *erf_doc = R"( + erf(z, out=None) + + Returns the error function of complex argument. + + It is defined as ``2/sqrt(pi)*integral(exp(-t**2), t=0..z)``. + + Parameters + ---------- + x : ndarray + Input array. + out : ndarray, optional + Optional output array for the function values + + Returns + ------- + res : scalar or ndarray + The values of the error function at the given points `x`. + + See Also + -------- + erfc, erfinv, erfcinv, wofz, erfcx, erfi + + Notes + ----- + The cumulative of the unit normal distribution is given by + ``Phi(z) = 1/2[1 + erf(z/sqrt(2))]``. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Error_function + .. [2] Milton Abramowitz and Irene A. Stegun, eds. + Handbook of Mathematical Functions with Formulas, + Graphs, and Mathematical Tables. New York: Dover, + 1972. http://www.math.sfu.ca/~cbm/aands/page_297.htm + .. [3] Steven G. Johnson, Faddeeva W function implementation. + http://ab-initio.mit.edu/Faddeeva + + Examples + -------- + >>> import numpy as np + >>> from scipy import special + >>> import matplotlib.pyplot as plt + >>> x = np.linspace(-3, 3) + >>> plt.plot(x, special.erf(x)) + >>> plt.xlabel('$x$') + >>> plt.ylabel('$erf(x)$') + >>> plt.show() + )"; + +const char *erfc_doc = R"( + erfc(x, out=None) + + Complementary error function, ``1 - erf(x)``. + + Parameters + ---------- + x : array_like + Real or complex valued argument + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + Values of the complementary error function + + See Also + -------- + erf, erfi, erfcx, dawsn, wofz + + References + ---------- + .. [1] Steven G. Johnson, Faddeeva W function implementation. + http://ab-initio.mit.edu/Faddeeva + + Examples + -------- + >>> import numpy as np + >>> from scipy import special + >>> import matplotlib.pyplot as plt + >>> x = np.linspace(-3, 3) + >>> plt.plot(x, special.erfc(x)) + >>> plt.xlabel('$x$') + >>> plt.ylabel('$erfc(x)$') + >>> plt.show() + )"; + +const char *erfi_doc = R"( + erfi(z, out=None) + + Imaginary error function, ``-i erf(i z)``. + + Parameters + ---------- + z : array_like + Real or complex valued argument + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + Values of the imaginary error function + + See Also + -------- + erf, erfc, erfcx, dawsn, wofz + + Notes + ----- + + .. 
versionadded:: 0.12.0 + + References + ---------- + .. [1] Steven G. Johnson, Faddeeva W function implementation. + http://ab-initio.mit.edu/Faddeeva + + Examples + -------- + >>> import numpy as np + >>> from scipy import special + >>> import matplotlib.pyplot as plt + >>> x = np.linspace(-3, 3) + >>> plt.plot(x, special.erfi(x)) + >>> plt.xlabel('$x$') + >>> plt.ylabel('$erfi(x)$') + >>> plt.show() + )"; + +const char *erfcx_doc = R"( + erfcx(x, out=None) + + Scaled complementary error function, ``exp(x**2) * erfc(x)``. + + Parameters + ---------- + x : array_like + Real or complex valued argument + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + Values of the scaled complementary error function + + + See Also + -------- + erf, erfc, erfi, dawsn, wofz + + Notes + ----- + + .. versionadded:: 0.12.0 + + References + ---------- + .. [1] Steven G. Johnson, Faddeeva W function implementation. + http://ab-initio.mit.edu/Faddeeva + + Examples + -------- + >>> import numpy as np + >>> from scipy import special + >>> import matplotlib.pyplot as plt + >>> x = np.linspace(-3, 3) + >>> plt.plot(x, special.erfcx(x)) + >>> plt.xlabel('$x$') + >>> plt.ylabel('$erfcx(x)$') + >>> plt.show() + )"; + const char *expit_doc = R"( expit(x, out=None) @@ -1476,6 +1916,48 @@ const char *exprel_doc = R"( 0.99999999392252903 )"; +const char *dawsn_doc = R"( + dawsn(x, out=None) + + Dawson's integral. + + Computes:: + + exp(-x**2) * integral(exp(t**2), t=0..x). + + Parameters + ---------- + x : array_like + Function parameter. + out : ndarray, optional + Optional output array for the function values + + Returns + ------- + y : scalar or ndarray + Value of the integral. + + See Also + -------- + wofz, erf, erfc, erfcx, erfi + + References + ---------- + .. [1] Steven G. Johnson, Faddeeva W function implementation. + http://ab-initio.mit.edu/Faddeeva + + Examples + -------- + >>> import numpy as np + >>> from scipy import special + >>> import matplotlib.pyplot as plt + >>> x = np.linspace(-15, 15, num=1000) + >>> plt.plot(x, special.dawsn(x)) + >>> plt.xlabel('$x$') + >>> plt.ylabel('$dawsn(x)$') + >>> plt.show() + )"; + const char *fresnel_doc = R"( fresnel(z, out=None) @@ -4667,6 +5149,58 @@ const char *log_expit_doc = R"( lose all precision and return 0. )"; +const char *log_ndtr_doc = R"( + log_ndtr(x, out=None) + + Logarithm of Gaussian cumulative distribution function. + + Returns the log of the area under the standard Gaussian probability + density function, integrated from minus infinity to `x`:: + + log(1/sqrt(2*pi) * integral(exp(-t**2 / 2), t=-inf..x)) + + Parameters + ---------- + x : array_like, real or complex + Argument + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + The value of the log of the normal CDF evaluated at `x` + + See Also + -------- + erf + erfc + scipy.stats.norm + ndtr + + Examples + -------- + >>> import numpy as np + >>> from scipy.special import log_ndtr, ndtr + + The benefit of ``log_ndtr(x)`` over the naive implementation + ``np.log(ndtr(x))`` is most evident with moderate to large positive + values of ``x``: + + >>> x = np.array([6, 7, 9, 12, 15, 25]) + >>> log_ndtr(x) + array([-9.86587646e-010, -1.27981254e-012, -1.12858841e-019, + -1.77648211e-033, -3.67096620e-051, -3.05669671e-138]) + + The results of the naive calculation for the moderate ``x`` values + have only 5 or 6 correct significant digits. 
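(The loss of precision can be made concrete with the identity
``log(ndtr(x)) = log1p(-ndtr(-x))``, which keeps the argument of the logarithm
away from 1. The sketch below illustrates this; the library routine is
implemented in C++ and need not use this exact identity.)

    import numpy as np
    from scipy.special import ndtr, log_ndtr

    x = np.array([6.0, 7.0, 9.0])
    # log1p(-ndtr(-x)) avoids evaluating log at values that round to 1.0,
    # recovering the tiny negative results quoted in the docstring.
    print(np.log1p(-ndtr(-x)))  # approx. [-9.866e-10, -1.280e-12, -1.129e-19]
    print(log_ndtr(x))          # agrees to high relative accuracy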
For values of ``x`` + greater than approximately 8.3, the naive expression returns 0: + + >>> np.log(ndtr(x)) + array([-9.86587701e-10, -1.27986510e-12, 0.00000000e+00, + 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]) + )"; + const char *log_wright_bessel_doc = R"( log_wright_bessel(a, b, x, out=None) @@ -5111,6 +5645,63 @@ const char *obl_ang1_doc = R"( )"; +const char *ndtr_doc = R"( + ndtr(x, out=None) + + Cumulative distribution of the standard normal distribution. + + Returns the area under the standard Gaussian probability + density function, integrated from minus infinity to `x` + + .. math:: + + \frac{1}{\sqrt{2\pi}} \int_{-\infty}^x \exp(-t^2/2) dt + + Parameters + ---------- + x : array_like, real or complex + Argument + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + The value of the normal CDF evaluated at `x` + + See Also + -------- + log_ndtr : Logarithm of ndtr + ndtri : Inverse of ndtr, standard normal percentile function + erf : Error function + erfc : 1 - erf + scipy.stats.norm : Normal distribution + + Examples + -------- + Evaluate `ndtr` at one point. + + >>> import numpy as np + >>> from scipy.special import ndtr + >>> ndtr(0.5) + 0.6914624612740131 + + Evaluate the function at several points by providing a NumPy array + or list for `x`. + + >>> ndtr([0, 0.5, 2]) + array([0.5 , 0.69146246, 0.97724987]) + + Plot the function. + + >>> import matplotlib.pyplot as plt + >>> x = np.linspace(-5, 5, 100) + >>> fig, ax = plt.subplots() + >>> ax.plot(x, ndtr(x)) + >>> ax.set_title(r"Standard normal cumulative distribution function $\Phi$") + >>> plt.show() + )"; + const char *obl_ang1_cv_doc = R"( obl_ang1_cv(m, n, c, cv, x, out=None) @@ -6169,6 +6760,157 @@ const char *struve_l_doc = R"( >>> plt.show() )"; +const char *voigt_profile_doc = R"( + voigt_profile(x, sigma, gamma, out=None) + + Voigt profile. + + The Voigt profile is a convolution of a 1-D Normal distribution with + standard deviation ``sigma`` and a 1-D Cauchy distribution with half-width at + half-maximum ``gamma``. + + If ``sigma = 0``, PDF of Cauchy distribution is returned. + Conversely, if ``gamma = 0``, PDF of Normal distribution is returned. + If ``sigma = gamma = 0``, the return value is ``Inf`` for ``x = 0``, + and ``0`` for all other ``x``. + + Parameters + ---------- + x : array_like + Real argument + sigma : array_like + The standard deviation of the Normal distribution part + gamma : array_like + The half-width at half-maximum of the Cauchy distribution part + out : ndarray, optional + Optional output array for the function values + + Returns + ------- + scalar or ndarray + The Voigt profile at the given arguments + + See Also + -------- + wofz : Faddeeva function + + Notes + ----- + It can be expressed in terms of Faddeeva function + + .. math:: V(x; \sigma, \gamma) = \frac{Re[w(z)]}{\sigma\sqrt{2\pi}}, + .. math:: z = \frac{x + i\gamma}{\sqrt{2}\sigma} + + where :math:`w(z)` is the Faddeeva function. + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/Voigt_profile + + Examples + -------- + Calculate the function at point 2 for ``sigma=1`` and ``gamma=1``. + + >>> from scipy.special import voigt_profile + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> voigt_profile(2, 1., 1.) + 0.09071519942627544 + + Calculate the function at several points by providing a NumPy array + for `x`. + + >>> values = np.array([-2., 0., 5]) + >>> voigt_profile(values, 1., 1.) 
+ array([0.0907152 , 0.20870928, 0.01388492]) + + Plot the function for different parameter sets. + + >>> fig, ax = plt.subplots(figsize=(8, 8)) + >>> x = np.linspace(-10, 10, 500) + >>> parameters_list = [(1.5, 0., "solid"), (1.3, 0.5, "dashed"), + ... (0., 1.8, "dotted"), (1., 1., "dashdot")] + >>> for params in parameters_list: + ... sigma, gamma, linestyle = params + ... voigt = voigt_profile(x, sigma, gamma) + ... ax.plot(x, voigt, label=rf"$\sigma={sigma},\, \gamma={gamma}$", + ... ls=linestyle) + >>> ax.legend() + >>> plt.show() + + Verify visually that the Voigt profile indeed arises as the convolution + of a normal and a Cauchy distribution. + + >>> from scipy.signal import convolve + >>> x, dx = np.linspace(-10, 10, 500, retstep=True) + >>> def gaussian(x, sigma): + ... return np.exp(-0.5 * x**2/sigma**2)/(sigma * np.sqrt(2*np.pi)) + >>> def cauchy(x, gamma): + ... return gamma/(np.pi * (np.square(x)+gamma**2)) + >>> sigma = 2 + >>> gamma = 1 + >>> gauss_profile = gaussian(x, sigma) + >>> cauchy_profile = cauchy(x, gamma) + >>> convolved = dx * convolve(cauchy_profile, gauss_profile, mode="same") + >>> voigt = voigt_profile(x, sigma, gamma) + >>> fig, ax = plt.subplots(figsize=(8, 8)) + >>> ax.plot(x, gauss_profile, label="Gauss: $G$", c='b') + >>> ax.plot(x, cauchy_profile, label="Cauchy: $C$", c='y', ls="dashed") + >>> xx = 0.5*(x[1:] + x[:-1]) # midpoints + >>> ax.plot(xx, convolved[1:], label="Convolution: $G * C$", ls='dashdot', + ... c='k') + >>> ax.plot(x, voigt, label="Voigt", ls='dotted', c='r') + >>> ax.legend() + >>> plt.show() + )"; + +const char *wofz_doc = R"( + wofz(z, out=None) + + Faddeeva function + + Returns the value of the Faddeeva function for complex argument:: + + exp(-z**2) * erfc(-i*z) + + Parameters + ---------- + z : array_like + complex argument + out : ndarray, optional + Optional output array for the function results + + Returns + ------- + scalar or ndarray + Value of the Faddeeva function + + See Also + -------- + dawsn, erf, erfc, erfcx, erfi + + References + ---------- + .. [1] Steven G. Johnson, Faddeeva W function implementation. 
+ http://ab-initio.mit.edu/Faddeeva + + Examples + -------- + >>> import numpy as np + >>> from scipy import special + >>> import matplotlib.pyplot as plt + + >>> x = np.linspace(-3, 3) + >>> z = special.wofz(x) + + >>> plt.plot(x, z.real, label='wofz(x).real') + >>> plt.plot(x, z.imag, label='wofz(x).imag') + >>> plt.xlabel('$x$') + >>> plt.legend(framealpha=1, shadow=True) + >>> plt.grid(alpha=0.25) + >>> plt.show() + )"; + const char *wright_bessel_doc = R"( wright_bessel(a, b, x, out=None) diff --git a/scipy/special/_support_alternative_backends.py b/scipy/special/_support_alternative_backends.py index 71d8ec549f2f..fb92e7e95707 100644 --- a/scipy/special/_support_alternative_backends.py +++ b/scipy/special/_support_alternative_backends.py @@ -11,7 +11,7 @@ from ._ufuncs import ( log_ndtr, ndtr, ndtri, erf, erfc, i0, i0e, i1, i1e, gammaln, # noqa: F401 gammainc, gammaincc, logit, expit, entr, rel_entr, xlogy, # noqa: F401 - chdtr, chdtrc, betainc, betaincc, stdtr # noqa: F401 + chdtr, chdtrc, betainc, betaincc, stdtr, stdtrit # noqa: F401 ) array_api_compat_prefix = "scipy._lib.array_api_compat" @@ -144,12 +144,35 @@ def __stdtr(df, t): return __stdtr +def _stdtrit(xp, spx): + betainc = getattr(spx.special, 'betainc', None) if spx else None # noqa: F811 + if betainc is None and hasattr(xp, 'special'): + betainc = getattr(xp.special, 'betainc', None) + + # If betainc is not defined, the root-finding would be done with `xp` + # despite `stdtr` being evaluated with SciPy/NumPy `stdtr`. Save the + # conversions: in this case, just evaluate `stdtrit` with SciPy/NumPy. + if betainc is None: + return None + + from scipy.optimize.elementwise import bracket_root, find_root + + def __stdtrit(df, p): + def fun(t, df, p): return stdtr(df, t) - p + res_bracket = bracket_root(fun, xp.zeros_like(p), args=(df, p)) + res_root = find_root(fun, res_bracket.bracket, args=(df, p)) + return res_root.x + + return __stdtrit + + _generic_implementations = {'rel_entr': _rel_entr, 'xlogy': _xlogy, 'chdtr': _chdtr, 'chdtrc': _chdtrc, 'betaincc': _betaincc, 'stdtr': _stdtr, + 'stdtrit': _stdtrit, } @@ -190,6 +213,7 @@ def wrapped(*args, **kwargs): 'betainc': 3, 'betaincc': 3, 'stdtr': 2, + 'stdtrit': 2, } for f_name, n_array_args in array_special_func_map.items(): diff --git a/scipy/special/_xlogy.pxd b/scipy/special/_xlogy.pxd deleted file mode 100644 index b9ddf300cbc1..000000000000 --- a/scipy/special/_xlogy.pxd +++ /dev/null @@ -1,19 +0,0 @@ -from libc.math cimport log1p - -from ._complexstuff cimport zlog, zisnan, number_t -from ._cunity cimport clog1p - -cdef inline number_t xlogy(number_t x, number_t y) noexcept nogil: - if x == 0 and not zisnan(y): - return 0 - else: - return x * zlog(y) - -cdef inline number_t xlog1py(number_t x, number_t y) noexcept nogil: - if x == 0 and not zisnan(y): - return 0 - else: - if number_t is double: - return x * log1p(y) - else: - return x * clog1p(y) diff --git a/scipy/special/cython_special.pyx b/scipy/special/cython_special.pyx index 782fd98c09f8..8d0f5ecbc270 100644 --- a/scipy/special/cython_special.pyx +++ b/scipy/special/cython_special.pyx @@ -1287,15 +1287,30 @@ cdef extern from r"xsf_wrappers.h": double cephes_igamci(double a, double p) nogil double cephes_igam_fac(double a, double x) nogil double cephes_lanczos_sum_expg_scaled(double x) nogil - double cephes_erf(double x) nogil - double cephes_erfc(double x) nogil + npy_cdouble xsf_cwofz(npy_cdouble x) nogil + double xsf_erf(double x) nogil + npy_cdouble xsf_cerf(npy_cdouble x) nogil + double xsf_erfc(double x) 
nogil + npy_cdouble xsf_cerfc(npy_cdouble x) nogil + double xsf_erfcx(double x) nogil + npy_cdouble xsf_cerfcx(npy_cdouble x) nogil + double xsf_erfi(double x) nogil + npy_cdouble xsf_cerfi(npy_cdouble x) nogil + double xsf_dawsn(double x) nogil + npy_cdouble xsf_cdawsn(npy_cdouble x) nogil + double xsf_voigt_profile(double x, double sigma, double gamma) nogil double cephes_poch(double x, double m) nogil double cephes_rgamma(double x) nogil double xsf_zetac(double x) nogil - double cephes_log1p(double x) nogil - double cephes_log1pmx(double x) nogil + double xsf_log1p(double x) nogil + npy_cdouble xsf_clog1p(npy_cdouble x) nogil + double xsf_xlogy(double x, double y) nogil + npy_cdouble xsf_cxlogy(npy_cdouble x, npy_cdouble y) nogil + double xsf_xlog1py(double x, double y) nogil + npy_cdouble xsf_cxlog1py(npy_cdouble x, npy_cdouble y) nogil double cephes_lgam1p(double x) nogil - double cephes_expm1(double x) nogil + double xsf_expm1(double x) nogil + npy_cdouble xsf_cexpm1(npy_cdouble z) nogil double xsf_cosm1(double x) nogil double cephes_expn(int n, double x) nogil double xsf_ellipe(double x) nogil @@ -1308,8 +1323,8 @@ cdef extern from r"xsf_wrappers.h": double xsf_cotdg(double x) nogil double xsf_radian(double d, double m, double s) nogil double cephes_erfcinv(double y) nogil - double cephes_exp10(double x) nogil - double cephes_exp2(double x) nogil + double xsf_exp10(double x) nogil + double xsf_exp2(double x) nogil npy_int xsf_csici(npy_cdouble, npy_cdouble *, npy_cdouble *) nogil npy_int xsf_cshichi(npy_cdouble, npy_cdouble *, npy_cdouble *) nogil npy_int xsf_sici(npy_double, npy_double *, npy_double *) nogil @@ -1343,6 +1358,9 @@ cdef extern from r"xsf_wrappers.h": double xsf_nbdtrc(int k, int n, double p) nogil double xsf_nbdtri(int k, int n, double p) nogil double xsf_ndtr(double x) nogil + npy_cdouble xsf_cndtr(npy_cdouble x) nogil + double xsf_log_ndtr(double x) nogil + npy_cdouble xsf_clog_ndtr(npy_cdouble x) nogil double xsf_ndtri(double x) nogil double xsf_owens_t(double h, double a) nogil double xsf_pdtr(double k, double m) nogil @@ -1593,10 +1611,6 @@ from .orthogonal_eval cimport eval_sh_legendre_l as _func_eval_sh_legendre_l ctypedef double _proto_eval_sh_legendre_l_t(Py_ssize_t, double) noexcept nogil cdef _proto_eval_sh_legendre_l_t *_proto_eval_sh_legendre_l_t_var = &_func_eval_sh_legendre_l -from ._cunity cimport cexpm1 as _func_cexpm1 -ctypedef double complex _proto_cexpm1_t(double complex) noexcept nogil -cdef _proto_cexpm1_t *_proto_cexpm1_t_var = &_func_cexpm1 - from ._legacy cimport expn_unsafe as _func_expn_unsafe ctypedef double _proto_expn_unsafe_t(double, double) noexcept nogil cdef _proto_expn_unsafe_t *_proto_expn_unsafe_t_var = &_func_expn_unsafe @@ -1674,10 +1688,6 @@ from ._legacy cimport kn_unsafe as _func_kn_unsafe ctypedef double _proto_kn_unsafe_t(double, double) noexcept nogil cdef _proto_kn_unsafe_t *_proto_kn_unsafe_t_var = &_func_kn_unsafe -from ._cunity cimport clog1p as _func_clog1p -ctypedef double complex _proto_clog1p_t(double complex) noexcept nogil -cdef _proto_clog1p_t *_proto_clog1p_t_var = &_func_clog1p - cdef extern from r"_ufuncs_defs.h": cdef npy_double _func_pmv_wrap "pmv_wrap"(npy_double, npy_double, npy_double)nogil @@ -1773,22 +1783,6 @@ from ._cdflib_wrappers cimport stdtrit as _func_stdtrit ctypedef double _proto_stdtrit_t(double, double) noexcept nogil cdef _proto_stdtrit_t *_proto_stdtrit_t_var = &_func_stdtrit -from ._xlogy cimport xlog1py as _func_xlog1py -ctypedef double _proto_xlog1py_double__t(double, double) noexcept 
nogil -cdef _proto_xlog1py_double__t *_proto_xlog1py_double__t_var = &_func_xlog1py[double] - -from ._xlogy cimport xlog1py as _func_xlog1py -ctypedef double complex _proto_xlog1py_double_complex__t(double complex, double complex) noexcept nogil -cdef _proto_xlog1py_double_complex__t *_proto_xlog1py_double_complex__t_var = &_func_xlog1py[double_complex] - -from ._xlogy cimport xlogy as _func_xlogy -ctypedef double _proto_xlogy_double__t(double, double) noexcept nogil -cdef _proto_xlogy_double__t *_proto_xlogy_double__t_var = &_func_xlogy[double] - -from ._xlogy cimport xlogy as _func_xlogy -ctypedef double complex _proto_xlogy_double_complex__t(double complex, double complex) noexcept nogil -cdef _proto_xlogy_double_complex__t *_proto_xlogy_double_complex__t_var = &_func_xlogy[double_complex] - from ._legacy cimport yn_unsafe as _func_yn_unsafe ctypedef double _proto_yn_unsafe_t(double, double) noexcept nogil cdef _proto_yn_unsafe_t *_proto_yn_unsafe_t_var = &_func_yn_unsafe @@ -1799,7 +1793,7 @@ cdef _proto_ndtri_exp_t *_proto_ndtri_exp_t_var = &_func_ndtri_exp cpdef double voigt_profile(double x0, double x1, double x2) noexcept nogil: """See the documentation for scipy.special.voigt_profile""" - return (scipy.special._ufuncs_cxx._export_faddeeva_voigt_profile)(x0, x1, x2) + return xsf_voigt_profile(x0, x1, x2) cpdef double agm(double x0, double x1) noexcept nogil: """See the documentation for scipy.special.agm""" @@ -2060,9 +2054,9 @@ cpdef double cotdg(double x0) noexcept nogil: cpdef Dd_number_t dawsn(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.dawsn""" if Dd_number_t is double: - return (scipy.special._ufuncs_cxx._export_faddeeva_dawsn)(x0) + return xsf_dawsn(x0) elif Dd_number_t is double_complex: - return (scipy.special._ufuncs_cxx._export_faddeeva_dawsn_complex)(x0) + return _complexstuff.double_complex_from_npy_cdouble(xsf_cdawsn(_complexstuff.npy_cdouble_from_double_complex(x0))) else: if Dd_number_t is double_complex: return NAN @@ -2168,9 +2162,9 @@ cpdef double entr(double x0) noexcept nogil: cpdef Dd_number_t erf(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.erf""" if Dd_number_t is double_complex: - return (scipy.special._ufuncs_cxx._export_faddeeva_erf)(x0) + return _complexstuff.double_complex_from_npy_cdouble(xsf_cerf(_complexstuff.npy_cdouble_from_double_complex(x0))) elif Dd_number_t is double: - return cephes_erf(x0) + return xsf_erf(x0) else: if Dd_number_t is double_complex: return NAN @@ -2180,9 +2174,9 @@ cpdef Dd_number_t erf(Dd_number_t x0) noexcept nogil: cpdef Dd_number_t erfc(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.erfc""" if Dd_number_t is double_complex: - return (scipy.special._ufuncs_cxx._export_faddeeva_erfc_complex)(x0) + return _complexstuff.double_complex_from_npy_cdouble(xsf_cerfc(_complexstuff.npy_cdouble_from_double_complex(x0))) elif Dd_number_t is double: - return cephes_erfc(x0) + return xsf_erfc(x0) else: if Dd_number_t is double_complex: return NAN @@ -2192,9 +2186,9 @@ cpdef Dd_number_t erfc(Dd_number_t x0) noexcept nogil: cpdef Dd_number_t erfcx(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.erfcx""" if Dd_number_t is double: - return (scipy.special._ufuncs_cxx._export_faddeeva_erfcx)(x0) + return xsf_erfcx(x0) elif Dd_number_t is double_complex: - return (scipy.special._ufuncs_cxx._export_faddeeva_erfcx_complex)(x0) + return 
_complexstuff.double_complex_from_npy_cdouble(xsf_cerfcx(_complexstuff.npy_cdouble_from_double_complex(x0))) else: if Dd_number_t is double_complex: return NAN @@ -2204,9 +2198,9 @@ cpdef Dd_number_t erfcx(Dd_number_t x0) noexcept nogil: cpdef Dd_number_t erfi(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.erfi""" if Dd_number_t is double: - return (scipy.special._ufuncs_cxx._export_faddeeva_erfi)(x0) + return xsf_erfi(x0) elif Dd_number_t is double_complex: - return (scipy.special._ufuncs_cxx._export_faddeeva_erfi_complex)(x0) + return _complexstuff.double_complex_from_npy_cdouble(xsf_cerfi(_complexstuff.npy_cdouble_from_double_complex(x0))) else: if Dd_number_t is double_complex: return NAN @@ -2459,11 +2453,11 @@ cpdef Dd_number_t exp1(Dd_number_t x0) noexcept nogil: cpdef double exp10(double x0) noexcept nogil: """See the documentation for scipy.special.exp10""" - return cephes_exp10(x0) + return xsf_exp10(x0) cpdef double exp2(double x0) noexcept nogil: """See the documentation for scipy.special.exp2""" - return cephes_exp2(x0) + return xsf_exp2(x0) cpdef Dd_number_t expi(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.expi""" @@ -2496,9 +2490,9 @@ cpdef dfg_number_t expit(dfg_number_t x0) noexcept nogil: cpdef Dd_number_t expm1(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.expm1""" if Dd_number_t is double_complex: - return _func_cexpm1(x0) + return _complexstuff.double_complex_from_npy_cdouble(xsf_cexpm1(_complexstuff.npy_cdouble_from_double_complex(x0))) elif Dd_number_t is double: - return cephes_expm1(x0) + return xsf_expm1(x0) else: if Dd_number_t is double_complex: return NAN @@ -2922,9 +2916,9 @@ cpdef Dd_number_t kve(double x0, Dd_number_t x1) noexcept nogil: cpdef Dd_number_t log1p(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.log1p""" if Dd_number_t is double_complex: - return _func_clog1p(x0) + return _complexstuff.double_complex_from_npy_cdouble(xsf_clog1p(_complexstuff.npy_cdouble_from_double_complex(x0))) elif Dd_number_t is double: - return cephes_log1p(x0) + return xsf_log1p(x0) else: if Dd_number_t is double_complex: return NAN @@ -2950,9 +2944,9 @@ cpdef dfg_number_t log_expit(dfg_number_t x0) noexcept nogil: cpdef Dd_number_t log_ndtr(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.log_ndtr""" if Dd_number_t is double: - return (scipy.special._ufuncs_cxx._export_faddeeva_log_ndtr)(x0) + return xsf_log_ndtr(x0) elif Dd_number_t is double_complex: - return (scipy.special._ufuncs_cxx._export_faddeeva_log_ndtr_complex)(x0) + return _complexstuff.double_complex_from_npy_cdouble(xsf_clog_ndtr(_complexstuff.npy_cdouble_from_double_complex(x0))) else: if Dd_number_t is double_complex: return NAN @@ -3192,7 +3186,7 @@ cpdef double nctdtrit(double x0, double x1, double x2) noexcept nogil: cpdef Dd_number_t ndtr(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.ndtr""" if Dd_number_t is double_complex: - return (scipy.special._ufuncs_cxx._export_faddeeva_ndtr)(x0) + return _complexstuff.double_complex_from_npy_cdouble(xsf_cndtr(_complexstuff.npy_cdouble_from_double_complex(x0))) elif Dd_number_t is double: return xsf_ndtr(x0) else: @@ -3577,7 +3571,7 @@ cpdef double tklmbda(double x0, double x1) noexcept nogil: cpdef double complex wofz(double complex x0) noexcept nogil: """See the documentation for scipy.special.wofz""" - return (scipy.special._ufuncs_cxx._export_faddeeva_w)(x0) + return 
_complexstuff.double_complex_from_npy_cdouble(xsf_cwofz(_complexstuff.npy_cdouble_from_double_complex(x0))) cpdef Dd_number_t wrightomega(Dd_number_t x0) noexcept nogil: """See the documentation for scipy.special.wrightomega""" @@ -3594,9 +3588,10 @@ cpdef Dd_number_t wrightomega(Dd_number_t x0) noexcept nogil: cpdef Dd_number_t xlog1py(Dd_number_t x0, Dd_number_t x1) noexcept nogil: """See the documentation for scipy.special.xlog1py""" if Dd_number_t is double: - return _func_xlog1py[double](x0, x1) + return xsf_xlog1py(x0, x1) elif Dd_number_t is double_complex: - return _func_xlog1py[double_complex](x0, x1) + return _complexstuff.double_complex_from_npy_cdouble(xsf_cxlog1py(_complexstuff.npy_cdouble_from_double_complex(x0), + _complexstuff.npy_cdouble_from_double_complex(x1))) else: if Dd_number_t is double_complex: return NAN @@ -3606,9 +3601,10 @@ cpdef Dd_number_t xlog1py(Dd_number_t x0, Dd_number_t x1) noexcept nogil: cpdef Dd_number_t xlogy(Dd_number_t x0, Dd_number_t x1) noexcept nogil: """See the documentation for scipy.special.xlogy""" if Dd_number_t is double: - return _func_xlogy[double](x0, x1) + return xsf_xlogy(x0, x1) elif Dd_number_t is double_complex: - return _func_xlogy[double_complex](x0, x1) + return _complexstuff.double_complex_from_npy_cdouble(xsf_cxlogy(_complexstuff.npy_cdouble_from_double_complex(x0), + _complexstuff.npy_cdouble_from_double_complex(x1))) else: if Dd_number_t is double_complex: return NAN diff --git a/scipy/special/functions.json b/scipy/special/functions.json index 879ca0d87715..4478aea9a86d 100644 --- a/scipy/special/functions.json +++ b/scipy/special/functions.json @@ -32,11 +32,6 @@ "cephes_lgam1p": "d->d" } }, - "_log1pmx": { - "xsf_wrappers.h": { - "cephes_log1pmx": "d->d" - } - }, "_sf_error_test_function": { "sf_error.pxd": { "_sf_error_test_function": "i->i" @@ -57,11 +52,6 @@ "cephes__struve_power_series": "ddp*d->d" } }, - "voigt_profile" : { - "_faddeeva.h++" : { - "faddeeva_voigt_profile": "ddd->d" - } - }, "agm": { "_agm.pxd": { "agm": "dd->d" @@ -179,12 +169,6 @@ "_cdflib_wrappers.pxd": { "chndtrix": "ddd->d" } }, - "dawsn": { - "_faddeeva.h++": { - "faddeeva_dawsn": "d->d", - "faddeeva_dawsn_complex": "D->D" - } - }, "_factorial": { "_factorial.pxd": { "_factorial": "d->d" @@ -225,34 +209,6 @@ "entr": "d->d" } }, - "erf": { - "_faddeeva.h++": { - "faddeeva_erf": "D->D" - }, - "xsf_wrappers.h": { - "cephes_erf": "d->d" - } - }, - "erfc": { - "_faddeeva.h++": { - "faddeeva_erfc_complex": "D->D" - }, - "xsf_wrappers.h": { - "cephes_erfc": "d->d" - } - }, - "erfcx": { - "_faddeeva.h++": { - "faddeeva_erfcx": "d->d", - "faddeeva_erfcx_complex": "D->D" - } - }, - "erfi": { - "_faddeeva.h++": { - "faddeeva_erfi": "d->d", - "faddeeva_erfi_complex": "D->D" - } - }, "erfinv": { "boost_special_functions.h++": { "erfinv_float": "f->f", @@ -365,24 +321,6 @@ "eval_sh_legendre_l": "pd->d" } }, - "exp10": { - "xsf_wrappers.h": { - "cephes_exp10": "d->d" - } - }, - "exp2": { - "xsf_wrappers.h": { - "cephes_exp2": "d->d" - } - }, - "expm1": { - "_cunity.pxd": { - "cexpm1": "D->D" - }, - "xsf_wrappers.h": { - "cephes_expm1": "d->d" - } - }, "expn": { "_legacy.pxd": { "expn_unsafe": "dd->d" @@ -537,20 +475,6 @@ "landau_isf_double": "ddd->d" } }, - "log1p": { - "_cunity.pxd": { - "clog1p": "D->D" - }, - "xsf_wrappers.h": { - "cephes_log1p": "d->d" - } - }, - "log_ndtr": { - "_faddeeva.h++": { - "faddeeva_log_ndtr": "d->d", - "faddeeva_log_ndtr_complex": "D->D" - } - }, "lpmv": { "xsf_wrappers.h": { "pmv_wrap": "ddd->d" @@ -634,14 +558,6 @@ "nctdtrit": 
"ddd->d" } }, - "ndtr": { - "_faddeeva.h++": { - "faddeeva_ndtr": "D->D" - }, - "xsf_wrappers.h": { - "xsf_ndtr": "d->d" - } - }, "ndtri": { "xsf_wrappers.h": { "xsf_ndtri": "d->d" @@ -793,29 +709,12 @@ "xsf_tukeylambdacdf": "dd->d" } }, - "wofz": { - "_faddeeva.h++": { - "faddeeva_w": "D->D" - } - }, "wrightomega": { "_wright.h++": { "wrightomega": "D->D", "wrightomega_real": "d->d" } }, - "xlog1py": { - "_xlogy.pxd": { - "xlog1py[double]": "dd->d", - "xlog1py[double_complex]": "DD->D" - } - }, - "xlogy": { - "_xlogy.pxd": { - "xlogy[double]": "dd->d", - "xlogy[double_complex]": "DD->D" - } - }, "yn": { "_legacy.pxd": { "yn_unsafe": "dd->d" diff --git a/scipy/special/meson.build b/scipy/special/meson.build index 3c42247b4a72..7ba7857031d7 100644 --- a/scipy/special/meson.build +++ b/scipy/special/meson.build @@ -7,7 +7,6 @@ _ufuncs_pxi_pxd_sources = [ fs.copyfile('_cdflib_wrappers.pxd'), fs.copyfile('_complexstuff.pxd'), fs.copyfile('_convex_analysis.pxd'), - fs.copyfile('_cunity.pxd'), fs.copyfile('_ellip_harm.pxd'), fs.copyfile('_ellip_harm_2.pxd'), fs.copyfile('_ellipk.pxd'), @@ -18,7 +17,6 @@ _ufuncs_pxi_pxd_sources = [ fs.copyfile('_ndtri_exp.pxd'), fs.copyfile('_sici.pxd'), fs.copyfile('_spence.pxd'), - fs.copyfile('_xlogy.pxd'), fs.copyfile('orthogonal_eval.pxd'), fs.copyfile('sf_error.pxd'), fs.copyfile('_ufuncs_extra_code.pxi'), @@ -54,10 +52,8 @@ ufuncs_sources = [ ] ufuncs_cxx_sources = [ - '_faddeeva.cxx', '_wright.cxx', 'ellint_carlson_wrap.cxx', - 'Faddeeva.cc', 'sf_error.cc', 'wright.cc' ] diff --git a/scipy/special/tests/test_basic.py b/scipy/special/tests/test_basic.py index 727e68c04d95..e6b0f27591a7 100644 --- a/scipy/special/tests/test_basic.py +++ b/scipy/special/tests/test_basic.py @@ -2351,6 +2351,7 @@ def _nest_me(x, k=1): assert_func(special.factorialk(n, 3, exact=exact), np.array(exp_nucleus[3], ndmin=level)) + @pytest.mark.fail_slow(5) @pytest.mark.parametrize("dtype", [np.uint8, np.uint16, np.uint32, np.uint64]) @pytest.mark.parametrize("exact,extend", [(True, "zero"), (False, "zero"), (False, "complex")]) @@ -2364,10 +2365,38 @@ def _check(n): assert_func(special.factorial2(n, **kw), special.factorial2(n_ref, **kw)) assert_func(special.factorialk(n, k=3, **kw), special.factorialk(n_ref, k=3, **kw)) + def _check_inf(n): + # produce inf of same type/dimension + with suppress_warnings() as sup: + sup.filter(RuntimeWarning) + shaped_inf = n / 0 + assert_func(special.factorial(n, **kw), shaped_inf) + assert_func(special.factorial2(n, **kw), shaped_inf) + assert_func(special.factorialk(n, k=3, **kw), shaped_inf) + _check(dtype(0)) _check(dtype(1)) _check(np.array(0, dtype=dtype)) _check(np.array([0, 1], dtype=dtype)) + # test that maximal uint values work as well + N = dtype(np.iinfo(dtype).max) + # TODO: cannot use N itself yet; factorial uses `gamma(N+1)` resp. 
`(hi+lo)//2` + if dtype == np.uint64: + if exact: + # avoid attempting huge calculation + pass + elif np.lib.NumpyVersion(np.__version__) >= "2.0.0": + # N does not fit into int64 --> cannot use _check + _check_inf(dtype(N-1)) + _check_inf(np.array(N-1, dtype=dtype)) + _check_inf(np.array([N-1], dtype=dtype)) + elif dtype in [np.uint8, np.uint16] or not exact: + # factorial(65535, exact=True) has 287189 digits and is calculated almost + # instantaneously on modern hardware; however, dtypes bigger than uint16 + # would blow up runtime and memory consumption for exact=True + _check(N-1) + _check(np.array(N-1, dtype=dtype)) + _check(np.array([N-2, N-1], dtype=dtype)) # note that n=170 is the last integer such that factorial(n) fits float64 @pytest.mark.parametrize('n', range(30, 180, 10)) diff --git a/scipy/special/tests/test_logsumexp.py b/scipy/special/tests/test_logsumexp.py index 2827aa000175..7fdbc9d13b80 100644 --- a/scipy/special/tests/test_logsumexp.py +++ b/scipy/special/tests/test_logsumexp.py @@ -18,9 +18,6 @@ @array_api_compatible -@pytest.mark.usefixtures("skip_xp_backends") -@pytest.mark.skip_xp_backends('jax.numpy', - reason="JAX arrays do not support item assignment") def test_wrap_radians(xp): x = xp.asarray([-math.pi-1, -math.pi, -1, -1e-300, 0, 1e-300, 1, math.pi, math.pi+1]) @@ -35,6 +32,10 @@ def test_wrap_radians(xp): @pytest.mark.skip_xp_backends('jax.numpy', reason="JAX arrays do not support item assignment") class TestLogSumExp: + # numpy warning filters don't work for dask + # (also we should not expect the numpy warning filter to work for any Array API + # library) + @pytest.mark.filterwarnings("ignore:divide by zero encountered in log") def test_logsumexp(self, xp): # Test with zero-size array a = xp.asarray([]) @@ -69,6 +70,7 @@ def test_logsumexp(self, xp): nan = xp.asarray([xp.nan]) xp_assert_equal(logsumexp(inf), inf[0]) xp_assert_equal(logsumexp(-inf), -inf[0]) + xp_assert_equal(logsumexp(nan), nan[0]) xp_assert_equal(logsumexp(xp.asarray([-xp.inf, -xp.inf])), -inf[0]) @@ -115,6 +117,7 @@ def test_logsumexp_sign(self, xp): xp_assert_close(r, xp.asarray(1.)) xp_assert_equal(s, xp.asarray(-1.)) + @pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_logsumexp_sign_zero(self, xp): a = xp.asarray([1, 1]) b = xp.asarray([1, -1]) @@ -219,6 +222,7 @@ def test_gh18295(self, xp): ref = xp.logaddexp(a[0], a[1]) xp_assert_close(res, ref) + @pytest.mark.filterwarnings("ignore::FutureWarning:dask") @pytest.mark.parametrize('dtype', ['complex64', 'complex128']) def test_gh21610(self, xp, dtype): # gh-21610 noted that `logsumexp` could return imaginary components diff --git a/scipy/special/tests/test_support_alternative_backends.py b/scipy/special/tests/test_support_alternative_backends.py index 48cfe5bfb64c..47ef974f17b5 100644 --- a/scipy/special/tests/test_support_alternative_backends.py +++ b/scipy/special/tests/test_support_alternative_backends.py @@ -5,7 +5,7 @@ from scipy.conftest import array_api_compatible from scipy import special from scipy._lib._array_api_no_0d import xp_assert_close -from scipy._lib._array_api import is_jax, is_torch, SCIPY_DEVICE +from scipy._lib._array_api import is_jax, is_torch, SCIPY_DEVICE, is_dask from scipy._lib.array_api_compat import numpy as np try: @@ -17,13 +17,13 @@ @pytest.mark.skipif(not HAVE_ARRAY_API_STRICT, reason="`array_api_strict` not installed") -def test_dispatch_to_unrecognize_library(): +def test_dispatch_to_unrecognized_library(): xp = array_api_strict f = get_array_special_func('ndtr', xp=xp, n_array_args=1) 
x = [1, 2, 3]
     res = f(xp.asarray(x))
     ref = xp.asarray(special.ndtr(np.asarray(x)))
-    xp_assert_close(res, ref, xp=xp)
+    xp_assert_close(res, ref)
 
 
 @pytest.mark.parametrize('dtype', ['float32', 'float64', 'int64'])
@@ -34,16 +34,20 @@ def test_rel_entr_generic(dtype):
     f = get_array_special_func('rel_entr', xp=xp, n_array_args=2)
     dtype_np = getattr(np, dtype)
     dtype_xp = getattr(xp, dtype)
-    x, y = [-1, 0, 0, 1], [1, 0, 2, 3]
+    x = [-1, 0, 0, 1]
+    y = [1, 0, 2, 3]
 
-    x_xp, y_xp = xp.asarray(x, dtype=dtype_xp), xp.asarray(y, dtype=dtype_xp)
+    x_xp = xp.asarray(x, dtype=dtype_xp)
+    y_xp = xp.asarray(y, dtype=dtype_xp)
     res = f(x_xp, y_xp)
 
-    x_np, y_np = np.asarray(x, dtype=dtype_np), np.asarray(y, dtype=dtype_np)
+    x_np = np.asarray(x, dtype=dtype_np)
+    y_np = np.asarray(y, dtype=dtype_np)
     ref = special.rel_entr(x_np[-1], y_np[-1])
     ref = np.asarray([np.inf, 0, 0, ref], dtype=ref.dtype)
+    ref = xp.asarray(ref)
 
-    xp_assert_close(res, xp.asarray(ref), xp=xp)
+    xp_assert_close(res, ref)
 
 
 @pytest.mark.fail_slow(5)
@@ -52,6 +56,8 @@ def test_rel_entr_generic(dtype):
 # @pytest.mark.usefixtures("skip_xp_backends")
 # `reversed` is for developer convenience: test new function first = less waiting
 @pytest.mark.parametrize('f_name_n_args', reversed(array_special_func_map.items()))
+# numpy warning filter doesn't work for dask
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
 @pytest.mark.parametrize('dtype', ['float32', 'float64'])
 @pytest.mark.parametrize('shapes', [[(0,)]*4, [tuple()]*4, [(10,)]*4,
                                     [(10,), (11, 1), (12, 1, 1), (13, 1, 1, 1)]])
@@ -60,10 +66,16 @@ def test_support_alternative_backends(xp, f_name_n_args, dtype, shapes):
 
     if (SCIPY_DEVICE != 'cpu'
             and is_torch(xp)
-            and f_name in {'stdtr', 'betaincc', 'betainc'}
+            and f_name in {'stdtr', 'stdtrit', 'betaincc', 'betainc'}
     ):
         pytest.skip(f"`{f_name}` does not have an array-agnostic implementation "
                     f"and cannot delegate to PyTorch.")
+
+    if is_dask(xp) and f_name == 'rel_entr':
+        pytest.skip("dask does not support boolean index assignment")
+
+    if is_jax(xp) and f_name in {'stdtrit'}:
+        pytest.skip(f"`{f_name}` generic implementation requires array mutation.")
 
     shapes = shapes[:n_args]
     f = getattr(special, f_name)
diff --git a/scipy/special/xsf/erf.h b/scipy/special/xsf/erf.h
new file mode 100644
index 000000000000..0ec390dde3d6
--- /dev/null
+++ b/scipy/special/xsf/erf.h
@@ -0,0 +1,94 @@
+#pragma once
+
+#include "faddeeva.h"
+#include "cephes/ndtr.h"
+#include "config.h"
+
+namespace xsf {
+
+inline double erf(double x) { return cephes::erf(x); }
+
+inline float erf(float x) { return erf(static_cast<double>(x)); }
+
+inline std::complex<double> erf(std::complex<double> z) { return Faddeeva::erf(z); }
+
+inline std::complex<float> erf(std::complex<float> x) {
+    return static_cast<std::complex<float>>(erf(static_cast<std::complex<double>>(x)));
+}
+
+inline double erfc(double x) { return cephes::erfc(x); }
+
+inline float erfc(float x) { return erfc(static_cast<double>(x)); }
+
+inline std::complex<double> erfc(std::complex<double> z) { return Faddeeva::erfc(z); }
+
+inline std::complex<float> erfc(std::complex<float> x) {
+    return static_cast<std::complex<float>>(erfc(static_cast<std::complex<double>>(x)));
+}
+
+inline double erfcx(double x) { return Faddeeva::erfcx(x); }
+
+inline float erfcx(float x) { return erfcx(static_cast<double>(x)); }
+
+inline std::complex<double> erfcx(std::complex<double> z) { return Faddeeva::erfcx(z); }
+
+inline std::complex<float> erfcx(std::complex<float> x) {
+    return static_cast<std::complex<float>>(erfcx(static_cast<std::complex<double>>(x)));
+}
+
+inline double erfi(double x) { return Faddeeva::erfi(x); }
+
+inline float erfi(float x) { return erfi(static_cast<double>(x)); }
+
+inline std::complex<double> erfi(std::complex<double> z) { return Faddeeva::erfi(z); }
+
+inline std::complex<float> erfi(std::complex<float> z) {
+    return static_cast<std::complex<float>>(erfi(static_cast<std::complex<double>>(z)));
+}
+
+inline double voigt_profile(double x, double sigma, double gamma) {
+    const double INV_SQRT_2 = 0.707106781186547524401;
+    const double SQRT_2PI = 2.5066282746310002416123552393401042;
+
+    if (sigma == 0) {
+        if (gamma == 0) {
+            if (std::isnan(x))
+                return x;
+            if (x == 0)
+                return INFINITY;
+            return 0;
+        }
+        return gamma / M_PI / (x * x + gamma * gamma);
+    }
+    if (gamma == 0) {
+        return 1 / SQRT_2PI / sigma * exp(-(x / sigma) * (x / sigma) / 2);
+    }
+
+    double zreal = x / sigma * INV_SQRT_2;
+    double zimag = gamma / sigma * INV_SQRT_2;
+    std::complex<double> z(zreal, zimag);
+    std::complex<double> w = Faddeeva::w(z);
+    return w.real() / sigma / SQRT_2PI;
+}
+
+inline float voigt_profile(float x, float sigma, float gamma) {
+    return voigt_profile(static_cast<double>(x), static_cast<double>(sigma), static_cast<double>(gamma));
+}
+
+inline std::complex<double> wofz(std::complex<double> z) { return Faddeeva::w(z); }
+
+inline std::complex<float> wofz(std::complex<float> x) {
+    return static_cast<std::complex<float>>(wofz(static_cast<std::complex<double>>(x)));
+}
+
+inline double dawsn(double x) { return Faddeeva::Dawson(x); }
+
+inline float dawsn(float x) { return dawsn(static_cast<double>(x)); }
+
+inline std::complex<double> dawsn(std::complex<double> z) { return Faddeeva::Dawson(z); }
+
+inline std::complex<float> dawsn(std::complex<float> x) {
+    return static_cast<std::complex<float>>(dawsn(static_cast<std::complex<double>>(x)));
+}
+
+} // namespace xsf
diff --git a/scipy/special/xsf/exp.h b/scipy/special/xsf/exp.h
new file mode 100644
index 000000000000..0d8551109084
--- /dev/null
+++ b/scipy/special/xsf/exp.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "xsf/cephes/exp10.h"
+#include "xsf/cephes/exp2.h"
+
+namespace xsf {
+
+inline double expm1(double x) { return cephes::expm1(x); }
+
+inline float expm1(float x) { return expm1(static_cast<double>(x)); }
+
+// cexpm1(z) = cexp(z) - 1
+//
+// The imaginary part of this is easily computed via exp(z.real)*sin(z.imag)
+// The real part is difficult to compute when there is cancellation e.g. when
+// z.real = -log(cos(z.imag)). There isn't a way around this problem that
+// doesn't involve computing exp(z.real) and/or cos(z.imag) to higher
+// precision.
+inline std::complex<double> expm1(std::complex<double> z) {
+    if (!std::isfinite(std::real(z)) || !std::isfinite(std::imag(z))) {
+        return std::exp(z) - 1.0;
+    }
+
+    double x;
+    double ezr = 0;
+    if (std::real(z) <= -40) {
+        x = -1.0;
+    } else {
+        ezr = expm1(std::real(z));
+        x = ezr * std::cos(std::imag(z)) + cosm1(std::imag(z));
+    }
+
+    // don't compute exp(zr) too, unless necessary
+    double y;
+    if (std::real(z) > -1.0) {
+        y = (ezr + 1.0) * sin(std::imag(z));
+    } else {
+        y = exp(std::real(z)) * sin(std::imag(z));
+    }
+
+    return std::complex<double>{x, y};
+}
+
+inline std::complex<float> expm1(std::complex<float> z) {
+    return static_cast<std::complex<float>>(expm1(static_cast<std::complex<double>>(z)));
+}
+
+inline double exp2(double x) { return cephes::exp2(x); }
+
+inline float exp2(float x) { return exp2(static_cast<double>(x)); }
+
+inline double exp10(double x) { return cephes::exp10(x); }
+
+inline float exp10(float x) { return exp10(static_cast<double>(x)); }
+
+} // namespace xsf
diff --git a/scipy/special/Faddeeva.cc b/scipy/special/xsf/faddeeva.h
similarity index 74%
rename from scipy/special/Faddeeva.cc
rename to scipy/special/xsf/faddeeva.h
index 8148fe60bf65..78b51dc84de0 100644
--- a/scipy/special/Faddeeva.cc
+++ b/scipy/special/xsf/faddeeva.h
@@ -20,9 +20,6 @@
  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ -#include -#include "Faddeeva.hh" - /* Available at: http://ab-initio.mit.edu/Faddeeva Computes various error functions (erf, erfc, erfi, erfcx), @@ -118,29 +115,47 @@ file Faddeeva.hh. */ +#pragma once + #include -#include +#include -#define complex std::complex -#define isinf std::isinf -#define isnan std::isnan +namespace Faddeeva { -///////////////////////////////////////////////////////////////////////// +// compute w(z) = exp(-z^2) erfc(-iz) [ Faddeeva / scaled complex error func ] +std::complex w(std::complex z,double relerr=0); +double w_im(double x); // special-case code for Im[w(x)] of real x -#define Inf INFINITY -#define NaN NAN +// Various functions that we can compute with the help of w(z) -///////////////////////////////////////////////////////////////////////// -// Auxiliary routines to compute other special functions based on w(z) +// compute erfcx(z) = exp(z^2) erfz(z) +std::complex erfcx(std::complex z, double relerr=0); +double erfcx(double x); // special case for real x + +// compute erf(z), the error function of complex arguments +std::complex erf(std::complex z, double relerr=0); +double erf(double x); // special case for real x + +// compute erfi(z) = -i erf(iz), the imaginary error function +std::complex erfi(std::complex z, double relerr=0); +double erfi(double x); // special case for real x + +// compute erfc(z) = 1 - erf(z), the complementary error function +std::complex erfc(std::complex z, double relerr=0); +double erfc(double x); // special case for real x + +// compute Dawson(z) = sqrt(pi)/2 * exp(-z^2) * erfi(z) +std::complex Dawson(std::complex z, double relerr=0); +double Dawson(double x); // special case for real x // compute erfcx(z) = exp(z^2) erfz(z) -complex Faddeeva::erfcx(complex z, double relerr) +std::complex erfcx(std::complex z, double relerr) { - return Faddeeva::w(complex(-imag(z), real(z))); + return w(std::complex(-imag(z), real(z))); } // compute the error function erf(x) -double Faddeeva::erf(double x) +double erf(double x) { double mx2 = -x*x; if (mx2 < -750) // underflow @@ -148,11 +163,11 @@ double Faddeeva::erf(double x) if (x >= 0) { if (x < 5e-3) goto taylor; - return 1.0 - exp(mx2) * Faddeeva::erfcx(x); + return 1.0 - exp(mx2) * erfcx(x); } else { // x < 0 if (x > -5e-3) goto taylor; - return exp(mx2) * Faddeeva::erfcx(-x) - 1.0; + return exp(mx2) * erfcx(-x) - 1.0; } // Use Taylor series for small |x|, to avoid cancellation inaccuracy @@ -163,18 +178,19 @@ double Faddeeva::erf(double x) + mx2 * 0.11283791670955125739)); } + // compute the error function erf(z) -complex Faddeeva::erf(complex z, double relerr) +std::complex erf(std::complex z, double relerr) { double x = real(z), y = imag(z); if (x == 0) // handle separately for speed & handling of y = Inf or NaN - return complex(x, // preserve sign of 0 + return std::complex(x, // preserve sign of 0 /* handle y -> Inf limit manually, since exp(y^2) -> Inf but Im[w(y)] -> 0, so IEEE will give us a NaN when it should be Inf */ - y*y > 720 ? (y > 0 ? Inf : -Inf) - : exp(y*y) * Faddeeva::w_im(y)); + y*y > 720 ? (y > 0 ? std::numeric_limits::infinity() : -std::numeric_limits::infinity()) + : exp(y*y) * w_im(y)); double mRe_z2 = (y - x) * (x + y); // Re(-z^2), being careful of overflow double mIm_z2 = -2*x*y; // Im(-z^2) @@ -194,8 +210,8 @@ complex Faddeeva::erf(complex z, double relerr) /* don't use complex exp function, since that will produce spurious NaN values when multiplying w in an overflow situation. 
*/ return 1.0 - exp(mRe_z2) * - (complex(cos(mIm_z2), sin(mIm_z2)) - * Faddeeva::w(complex(-y,x))); + (std::complex(cos(mIm_z2), sin(mIm_z2)) + * w(std::complex(-y,x))); } else { // x < 0 if (x > -5e-3) { // duplicate from above to avoid fabs(x) call @@ -204,20 +220,20 @@ complex Faddeeva::erf(complex z, double relerr) else if (fabs(mIm_z2) < 5e-3) goto taylor_erfi; } - else if (isnan(x)) - return complex(NaN, y == 0 ? 0 : NaN); + else if (std::isnan(x)) + return std::complex(std::numeric_limits::quiet_NaN(), y == 0 ? 0 : std::numeric_limits::quiet_NaN()); /* don't use complex exp function, since that will produce spurious NaN values when multiplying w in an overflow situation. */ return exp(mRe_z2) * - (complex(cos(mIm_z2), sin(mIm_z2)) - * Faddeeva::w(complex(y,-x))) - 1.0; + (std::complex(cos(mIm_z2), sin(mIm_z2)) + * w(std::complex(y,-x))) - 1.0; } // Use Taylor series for small |z|, to avoid cancellation inaccuracy // erf(z) = 2/sqrt(pi) * z * (1 - z^2/3 + z^4/10 - ...) taylor: { - complex mz2(mRe_z2, mIm_z2); // -z^2 + std::complex mz2(mRe_z2, mIm_z2); // -z^2 return z * (1.1283791670955125739 + mz2 * (0.37612638903183752464 + mz2 * 0.11283791670955125739)); @@ -236,60 +252,61 @@ complex Faddeeva::erf(complex z, double relerr) { double x2 = x*x, y2 = y*y; double expy2 = exp(y2); - return complex + return std::complex (expy2 * x * (1.1283791670955125739 - x2 * (0.37612638903183752464 + 0.75225277806367504925*y2) + x2*x2 * (0.11283791670955125739 + y2 * (0.45135166683820502956 + 0.15045055561273500986*y2))), - expy2 * (Faddeeva::w_im(y) + expy2 * (w_im(y) - x2*y * (1.1283791670955125739 - x2 * (0.56418958354775628695 + 0.37612638903183752464*y2)))); } } + // erfi(z) = -i erf(iz) -complex Faddeeva::erfi(complex z, double relerr) +std::complex erfi(std::complex z, double relerr) { - complex e = Faddeeva::erf(complex(-imag(z),real(z)), relerr); - return complex(imag(e), -real(e)); + std::complex e = erf(std::complex(-imag(z),real(z)), relerr); + return std::complex(imag(e), -real(e)); } // erfi(x) = -i erf(ix) -double Faddeeva::erfi(double x) +double erfi(double x) { - return x*x > 720 ? (x > 0 ? Inf : -Inf) - : exp(x*x) * Faddeeva::w_im(x); + return x*x > 720 ? (x > 0 ? std::numeric_limits::infinity() : -std::numeric_limits::infinity()) + : exp(x*x) * w_im(x); } // erfc(x) = 1 - erf(x) -double Faddeeva::erfc(double x) +double erfc(double x) { if (x*x > 750) // underflow return (x >= 0 ? 0.0 : 2.0); - return x >= 0 ? exp(-x*x) * Faddeeva::erfcx(x) - : 2. - exp(-x*x) * Faddeeva::erfcx(-x); + return x >= 0 ? exp(-x*x) * erfcx(x) + : 2. - exp(-x*x) * erfcx(-x); } // erfc(z) = 1 - erf(z) -complex Faddeeva::erfc(complex z, double relerr) +std::complex erfc(std::complex z, double relerr) { double x = real(z), y = imag(z); if (x == 0.) - return complex(1, + return std::complex(1, /* handle y -> Inf limit manually, since exp(y^2) -> Inf but Im[w(y)] -> 0, so IEEE will give us a NaN when it should be Inf */ - y*y > 720 ? (y > 0 ? -Inf : Inf) - : -exp(y*y) * Faddeeva::w_im(y)); + y*y > 720 ? (y > 0 ? -std::numeric_limits::infinity() : std::numeric_limits::infinity()) + : -exp(y*y) * w_im(y)); if (y == 0.) { if (x*x > 750) // underflow return (x >= 0 ? 0.0 : 2.0); - return x >= 0 ? exp(-x*x) * Faddeeva::erfcx(x) - : 2. - exp(-x*x) * Faddeeva::erfcx(-x); + return x >= 0 ? exp(-x*x) * erfcx(x) + : 2. - exp(-x*x) * erfcx(-x); } double mRe_z2 = (y - x) * (x + y); // Re(-z^2), being careful of overflow @@ -298,47 +315,47 @@ complex Faddeeva::erfc(complex z, double relerr) return (x >= 0 ? 
0.0 : 2.0); if (x >= 0) - return exp(complex(mRe_z2, mIm_z2)) - * Faddeeva::w(complex(-y,x), relerr); + return exp(std::complex(mRe_z2, mIm_z2)) + * w(std::complex(-y,x), relerr); else - return 2.0 - exp(complex(mRe_z2, mIm_z2)) - * Faddeeva::w(complex(y,-x), relerr); + return 2.0 - exp(std::complex(mRe_z2, mIm_z2)) + * w(std::complex(y,-x), relerr); } // compute Dawson(x) = sqrt(pi)/2 * exp(-x^2) * erfi(x) -double Faddeeva::Dawson(double x) +double Dawson(double x) { const double spi2 = 0.8862269254527580136490837416705725913990; // sqrt(pi)/2 - return spi2 * Faddeeva::w_im(x); + return spi2 * w_im(x); } // compute Dawson(z) = sqrt(pi)/2 * exp(-z^2) * erfi(z) -complex Faddeeva::Dawson(complex z, double relerr) +std::complex Dawson(std::complex z, double relerr) { const double spi2 = 0.8862269254527580136490837416705725913990; // sqrt(pi)/2 double x = real(z), y = imag(z); // handle axes separately for speed & proper handling of x or y = Inf or NaN if (y == 0) - return complex(spi2 * Faddeeva::w_im(x), + return std::complex(spi2 * w_im(x), -y); // preserve sign of 0 if (x == 0) { double y2 = y*y; if (y2 < 2.5e-5) { // Taylor expansion - return complex(x, // preserve sign of 0 + return std::complex(x, // preserve sign of 0 y * (1. + y2 * (0.6666666666666666666666666666666666666667 + y2 * 0.2666666666666666666666666666666666666667))); } - return complex(x, // preserve sign of 0 + return std::complex(x, // preserve sign of 0 spi2 * (y >= 0 - ? exp(y2) - Faddeeva::erfcx(y) - : Faddeeva::erfcx(-y) - exp(y2))); + ? exp(y2) - erfcx(y) + : erfcx(-y) - exp(y2))); } double mRe_z2 = (y - x) * (x + y); // Re(-z^2), being careful of overflow double mIm_z2 = -2*x*y; // Im(-z^2) - complex mz2(mRe_z2, mIm_z2); // -z^2 + std::complex mz2(mRe_z2, mIm_z2); // -z^2 /* Handle positive and negative x via different formulas, using the mirror symmetries of w, to avoid overflow/underflow @@ -350,8 +367,8 @@ complex Faddeeva::Dawson(complex z, double relerr) else if (fabs(mIm_z2) < 5e-3) goto taylor_realaxis; } - complex res = exp(mz2) - Faddeeva::w(z); - return spi2 * complex(-imag(res), real(res)); + std::complex res = exp(mz2) - w(z); + return spi2 * std::complex(-imag(res), real(res)); } else { // y < 0 if (y > -5e-3) { // duplicate from above to avoid fabs(x) call @@ -360,10 +377,10 @@ complex Faddeeva::Dawson(complex z, double relerr) else if (fabs(mIm_z2) < 5e-3) goto taylor_realaxis; } - else if (isnan(y)) - return complex(x == 0 ? 0 : NaN, NaN); - complex res = Faddeeva::w(-z) - exp(mz2); - return spi2 * complex(-imag(res), real(res)); + else if (std::isnan(y)) + return std::complex(x == 0 ? 0 : std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()); + std::complex res = w(-z) - exp(mz2); + return spi2 * std::complex(-imag(res), real(res)); } // Use Taylor series for small |z|, to avoid cancellation inaccuracy @@ -413,7 +430,7 @@ complex Faddeeva::Dawson(complex z, double relerr) double y2 = y*y; if (x2 > 25e14) {// |x| > 5e7 double xy2 = (x*y)*(x*y); - return complex((0.5 + y2 * (0.5 + 0.25*y2 + return std::complex((0.5 + y2 * (0.5 + 0.25*y2 - 0.16666666666666666667*xy2)) / x, y * (-1 + y2 * (-0.66666666666666666667 + 0.13333333333333333333*xy2 @@ -421,15 +438,15 @@ complex Faddeeva::Dawson(complex z, double relerr) / (2*x2 - 1)); } return (1. 
/ (-15 + x2*(90 + x2*(-60 + 8*x2)))) * - complex(x * (33 + x2 * (-28 + 4*x2) + std::complex(x * (33 + x2 * (-28 + 4*x2) + y2 * (18 - 4*x2 + 4*y2)), y * (-15 + x2 * (24 - 4*x2) + y2 * (4*x2 - 10 - 4*y2))); } else { - double D = spi2 * Faddeeva::w_im(x); + double D = spi2 * w_im(x); double x2 = x*x, y2 = y*y; - return complex + return std::complex (D + y2 * (D + x - 2*D*x2) + y2*y2 * (D * (0.5 - x2 * (2 - 0.66666666666666666667*x2)) + x * (0.83333333333333333333 @@ -444,21 +461,21 @@ complex Faddeeva::Dawson(complex z, double relerr) } } -///////////////////////////////////////////////////////////////////////// - // return sinc(x) = sin(x)/x, given both x and sin(x) // [since we only use this in cases where sin(x) has already been computed] -static inline double sinc(double x, double sinx) { +inline double sinc(double x, double sinx) { return fabs(x) < 1e-4 ? 1 - (0.1666666666666666666667)*x*x : sinx / x; } // sinh(x) via Taylor series, accurate to machine precision for |x| < 1e-2 -static inline double sinh_taylor(double x) { +inline double sinh_taylor(double x) { return x * (1 + (x*x) * (0.1666666666666666666667 + 0.00833333333333333333333 * (x*x))); } -static inline double sqr(double x) { return x*x; } +inline double sqr(double x) { return x*x; } + +///////////////////////////////////////////////////////////////////////// // precomputed table of expa2n2[n-1] = exp(-a2*n*n) // for double-precision a2 = 0.26865... in Faddeeva::w, below. @@ -517,16 +534,18 @@ static const double expa2n2[] = { 0.0 // underflow (also prevents reads past array end, below) }; + + ///////////////////////////////////////////////////////////////////////// -complex Faddeeva::w(complex z, double relerr) +std::complex w(std::complex z, double relerr) { if (real(z) == 0.0) - return complex(Faddeeva::erfcx(imag(z)), + return std::complex(erfcx(imag(z)), real(z)); // give correct sign of 0 in imag(w) else if (imag(z) == 0) - return complex(exp(-sqr(real(z))), - Faddeeva::w_im(real(z))); + return std::complex(exp(-sqr(real(z))), + w_im(real(z))); double a, a2, c; if (relerr <= DBL_EPSILON) { @@ -545,7 +564,7 @@ complex Faddeeva::w(complex z, double relerr) const double x = fabs(real(z)); const double y = imag(z), ya = fabs(y); - complex ret(0.,0.); // return value + std::complex ret(0.,0.); // return value double sum1 = 0, sum2 = 0, sum3 = 0, sum4 = 0, sum5 = 0; @@ -577,21 +596,21 @@ complex Faddeeva::w(complex z, double relerr) if (x > ya) { double yax = ya / xs; double denom = ispi / (xs + yax*ya); - ret = complex(denom*yax, denom); + ret = std::complex(denom*yax, denom); } - else if (isinf(ya)) - return ((isnan(x) || y < 0) - ? complex(NaN,NaN) : complex(0,0)); + else if (std::isinf(ya)) + return ((std::isnan(x) || y < 0) + ? 
std::complex(std::numeric_limits::quiet_NaN(),std::numeric_limits::quiet_NaN()) : std::complex(0,0)); else { double xya = xs / ya; double denom = ispi / (xya*xs + ya); - ret = complex(denom, denom*xya); + ret = std::complex(denom, denom*xya); } } else { // nu == 2, w(z) = i/sqrt(pi) * z / (z*z - 0.5) double dr = xs*xs - ya*ya - 0.5, di = 2*xs*ya; double denom = ispi / (dr*dr + di*di); - ret = complex(denom * (xs*di-ya*dr), denom * (xs*dr+ya*di)); + ret = std::complex(denom * (xs*di-ya*dr), denom * (xs*dr+ya*di)); } } else { // compute nu(z) estimate and do general continued fraction @@ -606,14 +625,14 @@ complex Faddeeva::w(complex z, double relerr) } { // w(z) = i/sqrt(pi) / w: double denom = ispi / (wr*wr + wi*wi); - ret = complex(denom*wi, denom*wr); + ret = std::complex(denom*wi, denom*wr); } } if (y < 0) { // use w(z) = 2.0*exp(-z*z) - w(-z), // but be careful of overflow in exp(-z*z) // = exp(-(xs*xs-ya*ya) -2*i*xs*ya) - return 2.0*exp(complex((ya-xs)*(xs+ya), 2*xs*y)) - ret; + return 2.0*exp(std::complex((ya-xs)*(xs+ya), 2*xs*y)) - ret; } else return ret; @@ -626,18 +645,18 @@ complex Faddeeva::w(complex z, double relerr) if (x > ya) { double yax = ya / xs; double denom = ispi / (xs + yax*ya); - ret = complex(denom*yax, denom); + ret = std::complex(denom*yax, denom); } else { double xya = xs / ya; double denom = ispi / (xya*xs + ya); - ret = complex(denom, denom*xya); + ret = std::complex(denom, denom*xya); } if (y < 0) { // use w(z) = 2.0*exp(-z*z) - w(-z), // but be careful of overflow in exp(-z*z) // = exp(-(xs*xs-ya*ya) -2*i*xs*ya) - return 2.0*exp(complex((ya-xs)*(xs+ya), 2*xs*y)) - ret; + return 2.0*exp(std::complex((ya-xs)*(xs+ya), 2*xs*y)) - ret; } else return ret; @@ -660,8 +679,8 @@ complex Faddeeva::w(complex z, double relerr) double prod2ax = 1, prodm2ax = 1; double expx2; - if (isnan(y)) - return complex(y,y); + if (std::isnan(y)) + return std::complex(y,y); /* Somewhat ugly copy-and-paste duplication here, but I see significant speedups from using the special-case code with the precomputed @@ -747,7 +766,7 @@ complex Faddeeva::w(complex z, double relerr) } const double expx2erfcxy = // avoid spurious overflow for large negative y y > -6 // for y < -6, erfcx(y) = 2*exp(y*y) to double precision - ? expx2*Faddeeva::erfcx(y) : 2*exp(y*y-x*x); + ? 
@@ -747,7 +766,7 @@ complex<double> Faddeeva::w(complex<double> z, double relerr)
     }
     const double expx2erfcxy = // avoid spurious overflow for large negative y
       y > -6 // for y < -6, erfcx(y) = 2*exp(y*y) to double precision
-      ? expx2*Faddeeva::erfcx(y) : 2*exp(y*y-x*x);
+      ? expx2*erfcx(y) : 2*exp(y*y-x*x);
     if (y > 5) { // imaginary terms cancel
       const double sinxy = sin(x*y);
       ret = (expx2erfcxy - c*y*sum1) * cos(2*x*y)
@@ -759,15 +778,15 @@ complex<double> Faddeeva::w(complex<double> z, double relerr)
       const double sin2xy = sin(2*xs*y), cos2xy = cos(2*xs*y);
       const double coef1 = expx2erfcxy - c*y*sum1;
       const double coef2 = c*xs*expx2;
-      ret = complex<double>(coef1 * cos2xy + coef2 * sinxy * sinc(xs*y, sinxy),
+      ret = std::complex<double>(coef1 * cos2xy + coef2 * sinxy * sinc(xs*y, sinxy),
                  coef2 * sinc(2*xs*y, sin2xy) - coef1 * sin2xy);
     }
   }
   else { // x large: only sum3 & sum5 contribute (see above note)
-    if (isnan(x))
-      return complex<double>(x,x);
-    if (isnan(y))
-      return complex<double>(y,y);
+    if (std::isnan(x))
+      return std::complex<double>(x,x);
+    if (std::isnan(y))
+      return std::complex<double>(y,y);
 #if USE_CONTINUED_FRACTION
     ret = exp(-x*x); // |y| < 1e-10, so we only need exp(-x*x) term
@@ -809,10 +828,11 @@ complex<double> Faddeeva::w(complex<double> z, double relerr)
     }
   }
  finish:
-  return ret + complex<double>((0.5*c)*y*(sum2+sum3),
+  return ret + std::complex<double>((0.5*c)*y*(sum2+sum3),
                     (0.5*c)*std::copysign(sum5-sum4, real(z)));
 }
 
+
 /////////////////////////////////////////////////////////////////////////
 
 /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by
@@ -852,7 +872,7 @@ complex<double> Faddeeva::w(complex<double> z, double relerr)
    with the help of Maple and a little shell script.  This allows
    the Chebyshev polynomials to be of significantly lower degree (about 1/4)
    compared to fitting the whole [0,1] interval with a single polynomial. */
-static double erfcx_y100(double y100)
+double erfcx_y100(double y100)
 {
   switch ((int) y100) {
   case 0: {
@@ -1261,7 +1281,7 @@ return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682
   return 1.0;
 }
 
-double Faddeeva::erfcx(double x)
+double erfcx(double x)
 {
   if (x >= 0) {
     if (x > 50) { // continued-fraction expansion is faster
@@ -1297,7 +1317,7 @@ double Faddeeva::erfcx(double x)
    with the help of Maple and a little shell script.  This allows
    the Chebyshev polynomials to be of significantly lower degree (about 1/30)
    compared to fitting the whole [0,1] interval with a single polynomial. */
-static double w_im_y100(double y100, double x) {
+double w_im_y100(double y100, double x) {
   switch ((int) y100) {
   case 0: {
     double t = 2*y100 - 1;
@@ -1706,10 +1726,10 @@ static double w_im_y100(double y100, double x) {
   }
   /* Since 0 <= y100 < 101, this is only reached if x is NaN,
      in which case we should return NaN. */
-  return NaN;
+  return std::numeric_limits<double>::quiet_NaN();
 }
 
-double Faddeeva::w_im(double x)
+double w_im(double x)
 {
   if (x >= 0) {
     if (x > 45) { // continued-fraction expansion is faster
@@ -1735,596 +1755,4 @@ double Faddeeva::w_im(double x)
     }
   }
 
-/////////////////////////////////////////////////////////////////////////
-
-// Compile with -DTEST_FADDEEVA to compile a little test program
-#ifdef TEST_FADDEEVA
-
-#include <cstdio>
-
-// compute relative error |b-a|/|a|, handling case of NaN and Inf,
-static double relerr(double a, double b) {
-  if (isnan(a) || isnan(b) || isinf(a) || isinf(b)) {
-    if ((isnan(a) && !isnan(b)) || (!isnan(a) && isnan(b)) ||
-        (isinf(a) && !isinf(b)) || (!isinf(a) && isinf(b)) ||
-        (isinf(a) && isinf(b) && a*b < 0))
-      return Inf; // "infinite" error
-    return 0; // matching infinity/nan results counted as zero error
-  }
-  if (a == 0)
-    return b == 0 ?
0 : Inf; - else - return fabs((b-a) / a); -} - -int main(void) { - double errmax_all = 0; - { - printf("############# w(z) tests #############\n"); - const int NTST = 57; - complex z[NTST] = { - complex(624.2,-0.26123), - complex(-0.4,3.), - complex(0.6,2.), - complex(-1.,1.), - complex(-1.,-9.), - complex(-1.,9.), - complex(-0.0000000234545,1.1234), - complex(-3.,5.1), - complex(-53,30.1), - complex(0.0,0.12345), - complex(11,1), - complex(-22,-2), - complex(9,-28), - complex(21,-33), - complex(1e5,1e5), - complex(1e14,1e14), - complex(-3001,-1000), - complex(1e160,-1e159), - complex(-6.01,0.01), - complex(-0.7,-0.7), - complex(2.611780000000000e+01, 4.540909610972489e+03), - complex(0.8e7,0.3e7), - complex(-20,-19.8081), - complex(1e-16,-1.1e-16), - complex(2.3e-8,1.3e-8), - complex(6.3,-1e-13), - complex(6.3,1e-20), - complex(1e-20,6.3), - complex(1e-20,16.3), - complex(9,1e-300), - complex(6.01,0.11), - complex(8.01,1.01e-10), - complex(28.01,1e-300), - complex(10.01,1e-200), - complex(10.01,-1e-200), - complex(10.01,0.99e-10), - complex(10.01,-0.99e-10), - complex(1e-20,7.01), - complex(-1,7.01), - complex(5.99,7.01), - complex(1,0), - complex(55,0), - complex(-0.1,0), - complex(1e-20,0), - complex(0,5e-14), - complex(0,51), - complex(Inf,0), - complex(-Inf,0), - complex(0,Inf), - complex(0,-Inf), - complex(Inf,Inf), - complex(Inf,-Inf), - complex(NaN,NaN), - complex(NaN,0), - complex(0,NaN), - complex(NaN,Inf), - complex(Inf,NaN) - }; - complex w[NTST] = { /* w(z), computed with WolframAlpha - ... note that WolframAlpha is problematic - some of the above inputs, so I had to - use the continued-fraction expansion - in WolframAlpha in some cases, or switch - to Maple */ - complex(-3.78270245518980507452677445620103199303131110e-7, - 0.000903861276433172057331093754199933411710053155), - complex(0.1764906227004816847297495349730234591778719532788, - -0.02146550539468457616788719893991501311573031095617), - complex(0.2410250715772692146133539023007113781272362309451, - 0.06087579663428089745895459735240964093522265589350), - complex(0.30474420525691259245713884106959496013413834051768, - -0.20821893820283162728743734725471561394145872072738), - complex(7.317131068972378096865595229600561710140617977e34, - 8.321873499714402777186848353320412813066170427e34), - complex(0.0615698507236323685519612934241429530190806818395, - -0.00676005783716575013073036218018565206070072304635), - complex(0.3960793007699874918961319170187598400134746631, - -5.593152259116644920546186222529802777409274656e-9), - complex(0.08217199226739447943295069917990417630675021771804, - -0.04701291087643609891018366143118110965272615832184), - complex(0.00457246000350281640952328010227885008541748668738, - -0.00804900791411691821818731763401840373998654987934), - complex(0.8746342859608052666092782112565360755791467973338452, - 0.), - complex(0.00468190164965444174367477874864366058339647648741, - 0.0510735563901306197993676329845149741675029197050), - complex(-0.0023193175200187620902125853834909543869428763219, - -0.025460054739731556004902057663500272721780776336), - complex(9.11463368405637174660562096516414499772662584e304, - 3.97101807145263333769664875189354358563218932e305), - complex(-4.4927207857715598976165541011143706155432296e281, - -2.8019591213423077494444700357168707775769028e281), - complex(2.820947917809305132678577516325951485807107151e-6, - 2.820947917668257736791638444590253942253354058e-6), - complex(2.82094791773878143474039725787438662716372268e-15, - 
2.82094791773878143474039725773333923127678361e-15), - complex(-0.0000563851289696244350147899376081488003110150498, - -0.000169211755126812174631861529808288295454992688), - complex(-5.586035480670854326218608431294778077663867e-162, - 5.586035480670854326218608431294778077663867e-161), - complex(0.00016318325137140451888255634399123461580248456, - -0.095232456573009287370728788146686162555021209999), - complex(0.69504753678406939989115375989939096800793577783885, - -1.8916411171103639136680830887017670616339912024317), - complex(0.0001242418269653279656612334210746733213167234822, - 7.145975826320186888508563111992099992116786763e-7), - complex(2.318587329648353318615800865959225429377529825e-8, - 6.182899545728857485721417893323317843200933380e-8), - complex(-0.0133426877243506022053521927604277115767311800303, - -0.0148087097143220769493341484176979826888871576145), - complex(1.00000000000000012412170838050638522857747934, - 1.12837916709551279389615890312156495593616433e-16), - complex(0.9999999853310704677583504063775310832036830015, - 2.595272024519678881897196435157270184030360773e-8), - complex(-1.4731421795638279504242963027196663601154624e-15, - 0.090727659684127365236479098488823462473074709), - complex(5.79246077884410284575834156425396800754409308e-18, - 0.0907276596841273652364790985059772809093822374), - complex(0.0884658993528521953466533278764830881245144368, - 1.37088352495749125283269718778582613192166760e-22), - complex(0.0345480845419190424370085249304184266813447878, - 2.11161102895179044968099038990446187626075258e-23), - complex(6.63967719958073440070225527042829242391918213e-36, - 0.0630820900592582863713653132559743161572639353), - complex(0.00179435233208702644891092397579091030658500743634, - 0.0951983814805270647939647438459699953990788064762), - complex(9.09760377102097999924241322094863528771095448e-13, - 0.0709979210725138550986782242355007611074966717), - complex(7.2049510279742166460047102593255688682910274423e-304, - 0.0201552956479526953866611812593266285000876784321), - complex(3.04543604652250734193622967873276113872279682e-44, - 0.0566481651760675042930042117726713294607499165), - complex(3.04543604652250734193622967873276113872279682e-44, - 0.0566481651760675042930042117726713294607499165), - complex(0.5659928732065273429286988428080855057102069081e-12, - 0.056648165176067504292998527162143030538756683302), - complex(-0.56599287320652734292869884280802459698927645e-12, - 0.0566481651760675042929985271621430305387566833029), - complex(0.0796884251721652215687859778119964009569455462, - 1.11474461817561675017794941973556302717225126e-22), - complex(0.07817195821247357458545539935996687005781943386550, - -0.01093913670103576690766705513142246633056714279654), - complex(0.04670032980990449912809326141164730850466208439937, - 0.03944038961933534137558064191650437353429669886545), - complex(0.36787944117144232159552377016146086744581113103176, - 0.60715770584139372911503823580074492116122092866515), - complex(0, - 0.010259688805536830986089913987516716056946786526145), - complex(0.99004983374916805357390597718003655777207908125383, - -0.11208866436449538036721343053869621153527769495574), - complex(0.99999999999999999999999999999999999999990000, - 1.12837916709551257389615890312154517168802603e-20), - complex(0.999999999999943581041645226871305192054749891144158, - 0), - complex(0.0110604154853277201542582159216317923453996211744250, - 0), - complex(0,0), - complex(0,0), - complex(0,0), - complex(Inf,0), - complex(0,0), - complex(NaN,NaN), - 
complex(NaN,NaN), - complex(NaN,NaN), - complex(NaN,0), - complex(NaN,NaN), - complex(NaN,NaN) - }; - double errmax = 0; - for (int i = 0; i < NTST; ++i) { - complex fw = Faddeeva::w(z[i],0.); - double re_err = relerr(real(w[i]), real(fw)); - double im_err = relerr(imag(w[i]), imag(fw)); - printf("w(%g%+gi) = %g%+gi (vs. %g%+gi), re/im rel. err. = %0.2g/%0.2g)\n", - real(z[i]),imag(z[i]), real(fw),imag(fw), real(w[i]),imag(w[i]), - re_err, im_err); - if (re_err > errmax) errmax = re_err; - if (im_err > errmax) errmax = im_err; - } - if (errmax > 1e-13) { - printf("FAILURE -- relative error %g too large!\n", errmax); - return 1; - } - printf("SUCCESS (max relative error = %g)\n", errmax); - if (errmax > errmax_all) errmax_all = errmax; - } - { - const int NTST = 33; - complex z[NTST] = { - complex(1,2), - complex(-1,2), - complex(1,-2), - complex(-1,-2), - complex(9,-28), - complex(21,-33), - complex(1e3,1e3), - complex(-3001,-1000), - complex(1e160,-1e159), - complex(5.1e-3, 1e-8), - complex(-4.9e-3, 4.95e-3), - complex(4.9e-3, 0.5), - complex(4.9e-4, -0.5e1), - complex(-4.9e-5, -0.5e2), - complex(5.1e-3, 0.5), - complex(5.1e-4, -0.5e1), - complex(-5.1e-5, -0.5e2), - complex(1e-6,2e-6), - complex(0,2e-6), - complex(0,2), - complex(0,20), - complex(0,200), - complex(Inf,0), - complex(-Inf,0), - complex(0,Inf), - complex(0,-Inf), - complex(Inf,Inf), - complex(Inf,-Inf), - complex(NaN,NaN), - complex(NaN,0), - complex(0,NaN), - complex(NaN,Inf), - complex(Inf,NaN) - }; - complex w[NTST] = { // erf(z[i]), evaluated with Maple - complex(-0.5366435657785650339917955593141927494421, - -5.049143703447034669543036958614140565553), - complex(0.5366435657785650339917955593141927494421, - -5.049143703447034669543036958614140565553), - complex(-0.5366435657785650339917955593141927494421, - 5.049143703447034669543036958614140565553), - complex(0.5366435657785650339917955593141927494421, - 5.049143703447034669543036958614140565553), - complex(0.3359473673830576996788000505817956637777e304, - -0.1999896139679880888755589794455069208455e304), - complex(0.3584459971462946066523939204836760283645e278, - 0.3818954885257184373734213077678011282505e280), - complex(0.9996020422657148639102150147542224526887, - 0.00002801044116908227889681753993542916894856), - complex(-1, 0), - complex(1, 0), - complex(0.005754683859034800134412990541076554934877, - 0.1128349818335058741511924929801267822634e-7), - complex(-0.005529149142341821193633460286828381876955, - 0.005585388387864706679609092447916333443570), - complex(0.007099365669981359632319829148438283865814, - 0.6149347012854211635026981277569074001219), - complex(0.3981176338702323417718189922039863062440e8, - -0.8298176341665249121085423917575122140650e10), - complex(-Inf, - -Inf), - complex(0.007389128308257135427153919483147229573895, - 0.6149332524601658796226417164791221815139), - complex(0.4143671923267934479245651547534414976991e8, - -0.8298168216818314211557046346850921446950e10), - complex(-Inf, - -Inf), - complex(0.1128379167099649964175513742247082845155e-5, - 0.2256758334191777400570377193451519478895e-5), - complex(0, - 0.2256758334194034158904576117253481476197e-5), - complex(0, - 18.56480241457555259870429191324101719886), - complex(0, - 0.1474797539628786202447733153131835124599e173), - complex(0, - Inf), - complex(1,0), - complex(-1,0), - complex(0,Inf), - complex(0,-Inf), - complex(NaN,NaN), - complex(NaN,NaN), - complex(NaN,NaN), - complex(NaN,0), - complex(0,NaN), - complex(NaN,NaN), - complex(NaN,NaN) - }; -#define TST(f) \ - 
printf("############# " #f "(z) tests #############\n"); \ - double errmax = 0; \ - for (int i = 0; i < NTST; ++i) { \ - complex fw = Faddeeva::f(z[i],0.); \ - double re_err = relerr(real(w[i]), real(fw)); \ - double im_err = relerr(imag(w[i]), imag(fw)); \ - printf(#f "(%g%+gi) = %g%+gi (vs. %g%+gi), re/im rel. err. = %0.2g/%0.2g)\n", \ - real(z[i]),imag(z[i]), real(fw),imag(fw), real(w[i]),imag(w[i]), \ - re_err, im_err); \ - if (re_err > errmax) errmax = re_err; \ - if (im_err > errmax) errmax = im_err; \ - } \ - if (errmax > 1e-13) { \ - printf("FAILURE -- relative error %g too large!\n", errmax); \ - return 1; \ - } \ - printf("Checking " #f "(x) special case...\n"); \ - for (int i = 0; i < 10000; ++i) { \ - double x = pow(10., -300. + i * 600. / (10000 - 1)); \ - double re_err = relerr(Faddeeva::f(x), \ - real(Faddeeva::f(complex(x,0.)))); \ - if (re_err > errmax) errmax = re_err; \ - re_err = relerr(Faddeeva::f(-x), \ - real(Faddeeva::f(complex(-x,0.)))); \ - if (re_err > errmax) errmax = re_err; \ - } \ - { \ - double re_err = relerr(Faddeeva::f(Inf), \ - real(Faddeeva::f(complex(Inf,0.)))); \ - if (re_err > errmax) errmax = re_err; \ - re_err = relerr(Faddeeva::f(-Inf), \ - real(Faddeeva::f(complex(-Inf,0.)))); \ - if (re_err > errmax) errmax = re_err; \ - re_err = relerr(Faddeeva::f(NaN), \ - real(Faddeeva::f(complex(NaN,0.)))); \ - if (re_err > errmax) errmax = re_err; \ - } \ - if (errmax > 1e-13) { \ - printf("FAILURE -- relative error %g too large!\n", errmax); \ - return 1; \ - } \ - printf("SUCCESS (max relative error = %g)\n", errmax); \ - if (errmax > errmax_all) errmax_all = errmax - - TST(erf); - } - { - // since erfi just calls through to erf, just one test should - // be sufficient to make sure I didn't screw up the signs or something - const int NTST = 1; - complex z[NTST] = { complex(1.234,0.5678) }; - complex w[NTST] = { // erfi(z[i]), computed with Maple - complex(1.081032284405373149432716643834106923212, - 1.926775520840916645838949402886591180834) - }; - TST(erfi); - } - { - // since erfcx just calls through to w, just one test should - // be sufficient to make sure I didn't screw up the signs or something - const int NTST = 1; - complex z[NTST] = { complex(1.234,0.5678) }; - complex w[NTST] = { // erfcx(z[i]), computed with Maple - complex(0.3382187479799972294747793561190487832579, - -0.1116077470811648467464927471872945833154) - }; - TST(erfcx); - } - { - const int NTST = 30; - complex z[NTST] = { - complex(1,2), - complex(-1,2), - complex(1,-2), - complex(-1,-2), - complex(9,-28), - complex(21,-33), - complex(1e3,1e3), - complex(-3001,-1000), - complex(1e160,-1e159), - complex(5.1e-3, 1e-8), - complex(0,2e-6), - complex(0,2), - complex(0,20), - complex(0,200), - complex(2e-6,0), - complex(2,0), - complex(20,0), - complex(200,0), - complex(Inf,0), - complex(-Inf,0), - complex(0,Inf), - complex(0,-Inf), - complex(Inf,Inf), - complex(Inf,-Inf), - complex(NaN,NaN), - complex(NaN,0), - complex(0,NaN), - complex(NaN,Inf), - complex(Inf,NaN), - complex(88,0) - }; - complex w[NTST] = { // erfc(z[i]), evaluated with Maple - complex(1.536643565778565033991795559314192749442, - 5.049143703447034669543036958614140565553), - complex(0.4633564342214349660082044406858072505579, - 5.049143703447034669543036958614140565553), - complex(1.536643565778565033991795559314192749442, - -5.049143703447034669543036958614140565553), - complex(0.4633564342214349660082044406858072505579, - -5.049143703447034669543036958614140565553), - 
complex(-0.3359473673830576996788000505817956637777e304, - 0.1999896139679880888755589794455069208455e304), - complex(-0.3584459971462946066523939204836760283645e278, - -0.3818954885257184373734213077678011282505e280), - complex(0.0003979577342851360897849852457775473112748, - -0.00002801044116908227889681753993542916894856), - complex(2, 0), - complex(0, 0), - complex(0.9942453161409651998655870094589234450651, - -0.1128349818335058741511924929801267822634e-7), - complex(1, - -0.2256758334194034158904576117253481476197e-5), - complex(1, - -18.56480241457555259870429191324101719886), - complex(1, - -0.1474797539628786202447733153131835124599e173), - complex(1, -Inf), - complex(0.9999977432416658119838633199332831406314, - 0), - complex(0.004677734981047265837930743632747071389108, - 0), - complex(0.5395865611607900928934999167905345604088e-175, - 0), - complex(0, 0), - complex(0, 0), - complex(2, 0), - complex(1, -Inf), - complex(1, Inf), - complex(NaN, NaN), - complex(NaN, NaN), - complex(NaN, NaN), - complex(NaN, 0), - complex(1, NaN), - complex(NaN, NaN), - complex(NaN, NaN), - complex(0,0) - }; - TST(erfc); - } - { - const int NTST = 47; - complex z[NTST] = { - complex(2,1), - complex(-2,1), - complex(2,-1), - complex(-2,-1), - complex(-28,9), - complex(33,-21), - complex(1e3,1e3), - complex(-1000,-3001), - complex(1e-8, 5.1e-3), - complex(4.95e-3, -4.9e-3), - complex(0.5, 4.9e-3), - complex(-0.5e1, 4.9e-4), - complex(-0.5e2, -4.9e-5), - complex(0.5e3, 4.9e-6), - complex(0.5, 5.1e-3), - complex(-0.5e1, 5.1e-4), - complex(-0.5e2, -5.1e-5), - complex(1e-6,2e-6), - complex(2e-6,0), - complex(2,0), - complex(20,0), - complex(200,0), - complex(0,4.9e-3), - complex(0,-5.1e-3), - complex(0,2e-6), - complex(0,-2), - complex(0,20), - complex(0,-200), - complex(Inf,0), - complex(-Inf,0), - complex(0,Inf), - complex(0,-Inf), - complex(Inf,Inf), - complex(Inf,-Inf), - complex(NaN,NaN), - complex(NaN,0), - complex(0,NaN), - complex(NaN,Inf), - complex(Inf,NaN), - complex(39, 6.4e-5), - complex(41, 6.09e-5), - complex(4.9e7, 5e-11), - complex(5.1e7, 4.8e-11), - complex(1e9, 2.4e-12), - complex(1e11, 2.4e-14), - complex(1e13, 2.4e-16), - complex(1e300, 2.4e-303) - }; - complex w[NTST] = { // dawson(z[i]), evaluated with Maple - complex(0.1635394094345355614904345232875688576839, - -0.1531245755371229803585918112683241066853), - complex(-0.1635394094345355614904345232875688576839, - -0.1531245755371229803585918112683241066853), - complex(0.1635394094345355614904345232875688576839, - 0.1531245755371229803585918112683241066853), - complex(-0.1635394094345355614904345232875688576839, - 0.1531245755371229803585918112683241066853), - complex(-0.01619082256681596362895875232699626384420, - -0.005210224203359059109181555401330902819419), - complex(0.01078377080978103125464543240346760257008, - 0.006866888783433775382193630944275682670599), - complex(-0.5808616819196736225612296471081337245459, - 0.6688593905505562263387760667171706325749), - complex(Inf, - -Inf), - complex(0.1000052020902036118082966385855563526705e-7, - 0.005100088434920073153418834680320146441685), - complex(0.004950156837581592745389973960217444687524, - -0.004899838305155226382584756154100963570500), - complex(0.4244534840871830045021143490355372016428, - 0.002820278933186814021399602648373095266538), - complex(-0.1021340733271046543881236523269967674156, - -0.00001045696456072005761498961861088944159916), - complex(-0.01000200120119206748855061636187197886859, - 0.9805885888237419500266621041508714123763e-8), - 
complex(0.001000002000012000023960527532953151819595,
-              -0.9800058800588007290937355024646722133204e-11),
-      complex(0.4244549085628511778373438768121222815752,
-              0.002935393851311701428647152230552122898291),
-      complex(-0.1021340732357117208743299813648493928105,
-              -0.00001088377943049851799938998805451564893540),
-      complex(-0.01000200120119126652710792390331206563616,
-              0.1020612612857282306892368985525393707486e-7),
-      complex(0.1000000000007333333333344266666666664457e-5,
-              0.2000000000001333333333323199999999978819e-5),
-      complex(0.1999999999994666666666675199999999990248e-5,
-              0),
-      complex(0.3013403889237919660346644392864226952119,
-              0),
-      complex(0.02503136792640367194699495234782353186858,
-              0),
-      complex(0.002500031251171948248596912483183760683918,
-              0),
-      complex(0,0.004900078433419939164774792850907128053308),
-      complex(0,-0.005100088434920074173454208832365950009419),
-      complex(0,0.2000000000005333333333341866666666676419e-5),
-      complex(0,-48.16001211429122974789822893525016528191),
-      complex(0,0.4627407029504443513654142715903005954668e174),
-      complex(0,-Inf),
-      complex(0,0),
-      complex(-0,0),
-      complex(0, Inf),
-      complex(0, -Inf),
-      complex(NaN, NaN),
-      complex(NaN, NaN),
-      complex(NaN, NaN),
-      complex(NaN, 0),
-      complex(0, NaN),
-      complex(NaN, NaN),
-      complex(NaN, NaN),
-      complex(0.01282473148489433743567240624939698290584,
-              -0.2105957276516618621447832572909153498104e-7),
-      complex(0.01219875253423634378984109995893708152885,
-              -0.1813040560401824664088425926165834355953e-7),
-      complex(0.1020408163265306334945473399689037886997e-7,
-              -0.1041232819658476285651490827866174985330e-25),
-      complex(0.9803921568627452865036825956835185367356e-8,
-              -0.9227220299884665067601095648451913375754e-26),
-      complex(0.5000000000000000002500000000000000003750e-9,
-              -0.1200000000000000001800000188712838420241e-29),
-      complex(5.00000000000000000000025000000000000000000003e-12,
-              -1.20000000000000000000018000000000000000000004e-36),
-      complex(5.00000000000000000000000002500000000000000000e-14,
-              -1.20000000000000000000000001800000000000000000e-42),
-      complex(5e-301, 0)
-    };
-    TST(Dawson);
-  }
-  printf("#####################################\n");
-  printf("SUCCESS (max relative error = %g)\n", errmax_all);
-}
-
-#endif
+}
\ No newline at end of file
diff --git a/scipy/special/xsf/log.h b/scipy/special/xsf/log.h
new file mode 100644
index 000000000000..23681cb2ccb0
--- /dev/null
+++ b/scipy/special/xsf/log.h
@@ -0,0 +1,119 @@
+#pragma once
+
+#include "cephes/dd_real.h"
+#include "trig.h"
+
+namespace xsf {
+
+inline double log1p(double x) { return cephes::log1p(x); }
+
+inline float log1p(float x) { return log1p(static_cast<double>(x)); }
+
+inline std::complex<double> clog1p_ddouble(double zr, double zi) {
+    double x, y;
+
+    cephes::detail::double_double r(zr);
+    cephes::detail::double_double i(zi);
+    cephes::detail::double_double two(2.0);
+
+    cephes::detail::double_double rsqr = r * r;
+    cephes::detail::double_double isqr = i * i;
+    cephes::detail::double_double rtwo = two * r;
+    cephes::detail::double_double absm1 = rsqr + isqr;
+    absm1 = absm1 + rtwo;
+
+    x = 0.5 * log1p(static_cast<double>(absm1));
+    y = atan2(zi, zr + 1.0);
+    return std::complex<double>{x, y};
+}
+
+// log(z + 1) = log(x + 1 + 1j*y)
+//            = log(sqrt((x+1)**2 + y**2)) + 1j*atan2(y, x+1)
+//
+// Using atan2(y, x+1) for the imaginary part is always okay. The real part
+// needs to be calculated more carefully. For |z| large, the naive formula
+// log(z + 1) can be used.
+// When |z| is small, rewrite as
+//
+//     log(sqrt((x+1)**2 + y**2)) = 0.5*log(x**2 + 2*x + 1 + y**2)
+//                                = 0.5 * log1p(x**2 + y**2 + 2*x)
+//                                = 0.5 * log1p(hypot(x,y) * (hypot(x, y) + 2*x/hypot(x,y)))
+//
+// This expression suffers from cancellation when x < 0 and
+// y = +/-sqrt(2*fabs(x)). To get around this cancellation problem, we use
+// double-double precision when necessary.
+inline std::complex<double> log1p(std::complex<double> z) {
+    double x, y, az, azi;
+
+    if (!std::isfinite(std::real(z)) || !std::isfinite(std::imag(z))) {
+        z = z + 1.0;
+        return std::log(z);
+    }
+
+    double zr = z.real();
+    double zi = z.imag();
+
+    if (zi == 0.0 && zr >= -1.0) {
+        return log1p(zr);
+    }
+
+    az = std::abs(z);
+    if (az < 0.707) {
+        azi = std::fabs(zi);
+        if (zr < 0 && std::abs(-zr - azi * azi / 2) / (-zr) < 0.5) {
+            return clog1p_ddouble(zr, zi);
+        } else {
+            x = 0.5 * log1p(az * (az + 2 * zr / az));
+            y = atan2(zi, zr + 1.0);
+            return std::complex<double>(x, y);
+        }
+    }
+
+    z = z + 1.0;
+    return std::log(z);
+}
+
+inline std::complex<float> log1p(std::complex<float> z) {
+    return static_cast<std::complex<float>>(log1p(static_cast<std::complex<double>>(z)));
+}
+
+inline double log1pmx(double x) { return cephes::log1pmx(x); }
+
+inline float log1pmx(float x) { return log1pmx(static_cast<double>(x)); }
+
+template <typename T>
+T xlogy(T x, T y) {
+    if (x == 0 && !std::isnan(y)) {
+        return 0;
+    }
+
+    return x * std::log(y);
+}
+
+template <typename T>
+std::complex<T> xlogy(std::complex<T> x, std::complex<T> y) {
+    if (x == T(0) && !std::isnan(std::real(y)) && !std::isnan(std::imag(y))) {
+        return 0;
+    }
+
+    return x * std::log(y);
+}
+
+template <typename T>
+T xlog1py(T x, T y) {
+    if (x == 0 && !std::isnan(y)) {
+        return 0;
+    }
+
+    return x * log1p(y);
+}
+
+template <typename T>
+std::complex<T> xlog1py(std::complex<T> x, std::complex<T> y) {
+    if (x == T(0) && !std::isnan(std::real(y)) && !std::isnan(std::imag(y))) {
+        return 0;
+    }
+
+    return x * log1p(y);
+}
+
+} // namespace xsf
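(Illustration, not part of the patch: the double-double branch above is what preserves the real part in exactly the cancellation case the comment calls out. The public wrapper `scipy.special.log1p`, which the new `xsf_clog1p` wrapper below is intended to back for complex input, makes the effect visible from Python; the exact real part at this point is 0.5*log1p(1e-16), about 5e-17.)

    import numpy as np
    from scipy import special

    x = -1e-8
    z = complex(x, np.sqrt(2 * abs(x)))  # y = sqrt(2|x|): worst case for cancellation

    print(np.log(1 + z).real)     # ~0.0: the true value is lost to rounding
    print(special.log1p(z).real)  # ~5e-17, via the double-double branch

(Similarly, the `xlogy`/`xlog1py` templates encode the convention that x*log(y) is 0 when x == 0 and y is not NaN, so `scipy.special.xlogy(0.0, 0.0)` returns 0.0 where a literal `0.0 * np.log(0.0)` would give NaN.)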
diff --git a/scipy/special/xsf/stats.h b/scipy/special/xsf/stats.h
index 6dde641cb014..267bf3c6d00d 100644
--- a/scipy/special/xsf/stats.h
+++ b/scipy/special/xsf/stats.h
@@ -13,6 +13,7 @@
 #include "xsf/cephes/owens_t.h"
 #include "xsf/cephes/pdtr.h"
 #include "xsf/cephes/tukey.h"
+#include "xsf/erf.h"
 
 namespace xsf {
 
@@ -50,6 +51,102 @@ inline double kolmogp(double x) { return cephes::kolmogp(x); }
 
 inline double ndtr(double x) { return cephes::ndtr(x); }
 
+inline float ndtr(float x) { return ndtr(static_cast<double>(x)); }
+
+inline std::complex<double> ndtr(std::complex<double> z) { return 0.5 * erfc(-z * M_SQRT1_2); }
+
+inline std::complex<float> ndtr(std::complex<float> z) {
+    return static_cast<std::complex<float>>(ndtr(static_cast<std::complex<double>>(z)));
+}
+
+/*
+ * Log of the CDF of the normal distribution for double x.
+ *
+ * Let F(x) be the CDF of the standard normal distribution.
+ * This implementation of log(F(x)) is based on the identities
+ *
+ *   F(x) = erfc(-x/√2)/2
+ *        = 1 - erfc(x/√2)/2
+ *
+ * We use the first formula for x < -1, with erfc(z) replaced
+ * by erfcx(z)*exp(-z**2) to ensure high precision for large
+ * negative values when we take the logarithm:
+ *
+ *   log F(x) = log(erfc(-x/√2)/2)
+ *            = log((erfcx(-x/√2)/2)*exp(-x**2/2))
+ *            = log(erfcx(-x/√2)/2) - x**2/2
+ *
+ * For x >= -1, we use the second formula for F(x):
+ *
+ *   log F(x) = log(1 - erfc(x/√2)/2)
+ *            = log1p(-erfc(x/√2)/2)
+ */
+inline double log_ndtr(double x) {
+    double t = x * M_SQRT1_2;
+    if (x < -1.0) {
+        return log(erfcx(-t) / 2) - t * t;
+    } else {
+        return log1p(-erfc(t) / 2);
+    }
+}
+
+inline float log_ndtr(float x) { return log_ndtr(static_cast<double>(x)); }
+
+/*
+ * Log of the normal CDF for complex arguments.
+ *
+ * This is equivalent to log(ndtr(z)), but is more robust to overflow as $z\to\infty$.
+ * This implementation uses $erfc(z) = \exp(-z^2) w(iz)$, taking special care to select
+ * the principal branch of the log function log( exp(-z^2) w(i z) ).
+ */
+inline std::complex<double> log_ndtr(std::complex<double> z) {
+    if (z.real() > 6) {
+        // Underflow. Close to the real axis, expand the log in log(1 - ndtr(-z)).
+        std::complex<double> w = -0.5 * erfc(z * M_SQRT1_2);
+        if (std::abs(w) < 1e-8) {
+            return w;
+        }
+    }
+
+    z *= -M_SQRT1_2;
+    double x = std::real(z);
+    double y = std::imag(z);
+
+    /* Compute the principal branch of $log(exp(-z^2))$, using the fact that
+     * $log(e^t) = log|e^t| + i Arg(e^t)$, and that if $t = r + is$, then
+     * $e^t = e^r (\cos(s) + i \sin(s))$.
+     */
+    double mRe_z2 = (y - x) * (x + y); // Re(-z^2), being careful of overflow
+    double mIm_z2 = -2 * x * y;        // Im(-z^2)
+
+    double im = fmod(mIm_z2, 2.0 * M_PI);
+    if (im > M_PI) {
+        im -= 2.0 * M_PI;
+    }
+
+    std::complex<double> val1 = std::complex<double>(mRe_z2, im);
+
+    std::complex<double> val2 = log(xsf::wofz(std::complex<double>(-y, x)));
+    std::complex<double> result = val1 + val2 - NPY_LOGE2;
+
+    /* Again, select the principal branch: log(z) = log|z| + i arg(z), thus
+     * the imaginary part of the result should belong to [-pi, pi].
+     */
+    im = imag(result);
+    if (im >= M_PI) {
+        im -= 2 * M_PI;
+    }
+    if (im < -M_PI) {
+        im += 2 * M_PI;
+    }
+
+    return {result.real(), im};
+}
+
+inline std::complex<float> log_ndtr(std::complex<float> z) {
+    return static_cast<std::complex<float>>(log_ndtr(static_cast<std::complex<double>>(z)));
+}
+
 inline double nbdtr(int k, int n, double p) { return cephes::nbdtr(k, n, p); }
 
 inline double nbdtrc(int k, int n, double p) { return cephes::nbdtrc(k, n, p); }
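(Illustration, not part of the patch: the point of the erfcx-based branch is that `log_ndtr` stays finite long after `ndtr` itself underflows. A minimal check through the existing Python wrappers:)

    import numpy as np
    from scipy import special

    x = -40.0
    print(np.log(special.ndtr(x)))  # -inf: ndtr(-40) underflows to 0
    print(special.log_ndtr(x))      # ~-804.61, i.e. log(erfcx(-x/sqrt(2))/2) - x**2/2

(The complex overload keeps the same contract and additionally reduces the imaginary part of the result to [-pi, pi], so e.g. `special.log_ndtr(-40 + 1j)` is finite as well.)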
diff --git a/scipy/special/xsf_wrappers.cpp b/scipy/special/xsf_wrappers.cpp
index dd126ee95489..9550f10cf6ba 100644
--- a/scipy/special/xsf_wrappers.cpp
+++ b/scipy/special/xsf_wrappers.cpp
@@ -8,12 +8,15 @@
 #include "xsf/cdflib.h"
 #include "xsf/digamma.h"
 #include "xsf/ellip.h"
+#include "xsf/erf.h"
+#include "xsf/exp.h"
 #include "xsf/expint.h"
 #include "xsf/fresnel.h"
 #include "xsf/gamma.h"
 #include "xsf/hyp2f1.h"
 #include "xsf/kelvin.h"
 #include "xsf/lambertw.h"
+#include "xsf/log.h"
 #include "xsf/log_exp.h"
 #include "xsf/loggamma.h"
 #include "xsf/mathieu.h"
@@ -32,8 +35,6 @@
 #include "xsf/cephes/cbrt.h"
 #include "xsf/cephes/erfinv.h"
-#include "xsf/cephes/exp10.h"
-#include "xsf/cephes/exp2.h"
 #include "xsf/cephes/expn.h"
 #include "xsf/cephes/fresnl.h"
 #include "xsf/cephes/hyperg.h"
@@ -314,15 +315,12 @@ int xsf_sici(double x, double *si, double *ci) { return xsf::sici(x, si, ci); }
 int xsf_shichi(double x, double *si, double *ci) { return xsf::shichi(x, si, ci); }
 
 int xsf_csici(npy_cdouble x, npy_cdouble *si, npy_cdouble *ci) {
-    return xsf::sici(to_complex(x),
-                     reinterpret_cast<std::complex<double> *>(si),
-                     reinterpret_cast<std::complex<double> *>(ci));
+    return xsf::sici(to_complex(x), reinterpret_cast<std::complex<double> *>(si), reinterpret_cast<std::complex<double> *>(ci));
 }
 
 int xsf_cshichi(npy_cdouble x, npy_cdouble *shi, npy_cdouble *chi) {
-    return xsf::shichi(to_complex(x),
-                       reinterpret_cast<std::complex<double> *>(shi),
-                       reinterpret_cast<std::complex<double> *>(chi));
+    return xsf::shichi(to_complex(x), reinterpret_cast<std::complex<double> *>(shi),
+                       reinterpret_cast<std::complex<double> *>(chi));
 }
 
 double cephes__struve_asymp_large_z(double v, double z, Py_ssize_t is_h, double *err) {
@@ -382,28 +380,42 @@ double cephes_igam_fac(double a, double x) { return xsf::cephes::detail::igam_fa
 
 double cephes_lanczos_sum_expg_scaled(double x) { return xsf::cephes::lanczos_sum_expg_scaled(x); }
 
-double cephes_erf(double x) { return xsf::cephes::erf(x); }
-
-double cephes_erfc(double x) { return xsf::cephes::erfc(x); }
-
 double cephes_poch(double x, double m) { return xsf::cephes::poch(x, m); }
 
 double cephes_rgamma(double x) { return xsf::cephes::rgamma(x); }
 
 double xsf_zetac(double x) { return xsf::zetac(x); }
 
-double cephes_log1p(double x) { return xsf::cephes::log1p(x); }
-
-double cephes_log1pmx(double x) { return xsf::cephes::log1pmx(x); }
-
 double cephes_lgam1p(double x) { return xsf::cephes::lgam1p(x); }
 
-double cephes_expm1(double x) { return xsf::cephes::expm1(x); }
-
 double cephes_expn(int n, double x) { return xsf::cephes::expn(n, x); }
 
 double xsf_ellipe(double x) { return xsf::ellipe(x); }
 
+double xsf_erf(double x) { return xsf::erf(x); }
+
+npy_cdouble xsf_cerf(npy_cdouble z) { return to_ccomplex(xsf::erf(to_complex(z))); }
+
+double xsf_erfc(double x) { return xsf::erfc(x); }
+
+npy_cdouble xsf_cerfc(npy_cdouble z) { return to_ccomplex(xsf::erfc(to_complex(z))); }
+
+double xsf_erfcx(double x) { return xsf::erfcx(x); }
+
+npy_cdouble xsf_cerfcx(npy_cdouble z) { return to_ccomplex(xsf::erfcx(to_complex(z))); }
+
+double xsf_dawsn(double x) { return xsf::dawsn(x); }
+
+npy_cdouble xsf_cdawsn(npy_cdouble z) { return to_ccomplex(xsf::dawsn(to_complex(z))); }
+
+double xsf_erfi(double x) { return xsf::erfi(x); }
+
+npy_cdouble xsf_cerfi(npy_cdouble z) { return to_ccomplex(xsf::erfi(to_complex(z))); }
+
+npy_cdouble xsf_cwofz(npy_cdouble z) { return to_ccomplex(xsf::wofz(to_complex(z))); }
+
+double xsf_voigt_profile(double x, double sigma, double gamma) { return xsf::voigt_profile(x, sigma, gamma); }
+
 double cephes_ellpk(double x) { return xsf::ellipkm1(x); }
 
 double cephes_ellie(double phi, double m) { return xsf::ellipeinc(phi, m); }
@@ -414,10 +426,6 @@ double cephes_poch_wrap(double x, double m) { return xsf::cephes::poch(x, m); }
 
 double cephes_erfcinv(double y) { return xsf::cephes::erfcinv(y); }
 
-double cephes_exp10(double x) { return xsf::cephes::exp10(x); }
-
-double cephes_exp2(double x) { return xsf::cephes::exp2(x); }
-
 double cephes_round(double x) { return xsf::cephes::round(x); }
 
 double cephes_spence(double x) { return xsf::cephes::spence(x); }
@@ -426,6 +434,32 @@ double xsf_struve_h(double v, double z) { return xsf::struve_h(v, z); }
 
 double xsf_struve_l(double v, double z) { return xsf::struve_l(v, z); }
 
+// Exp
+
+double xsf_expm1(double x) { return xsf::expm1(x); }
+
+npy_cdouble xsf_cexpm1(npy_cdouble z) { return to_ccomplex(xsf::expm1(to_complex(z))); }
+
+double xsf_exp2(double x) { return xsf::exp2(x); }
+
+double xsf_exp10(double x) { return xsf::exp10(x); }
+
+// Log
+
+double xsf_log1p(double x) { return xsf::log1p(x); }
+
+npy_cdouble xsf_clog1p(npy_cdouble z) { return to_ccomplex(xsf::log1p(to_complex(z))); }
+
+double xsf_xlogy(double x, double y) { return xsf::xlogy(x, y); }
+
+npy_cdouble xsf_cxlogy(npy_cdouble x, npy_cdouble y) { return to_ccomplex(xsf::xlogy(to_complex(x), to_complex(y))); }
+
+double xsf_xlog1py(double x, double y) { return xsf::xlog1py(x, y); }
+
+npy_cdouble xsf_cxlog1py(npy_cdouble x, npy_cdouble y) {
+    return to_ccomplex(xsf::xlog1py(to_complex(x), to_complex(y)));
+}
+
 // Cylindrical Bessel
 
 double xsf_i0(double x) { return xsf::cyl_bessel_i0(x); }
@@ -572,7 +606,7 @@ double xsf_gdtr(double a, double b, double x) { return xsf::gdtr(a, b, x); }
 
 double xsf_gdtrc(double a, double b, double x) { return xsf::gdtrc(a, b, x); }
 
-double xsf_gdtrib(double a, double p, double x) {return xsf::gdtrib(a, p, x); }
+double xsf_gdtrib(double a, double p, double x) { return xsf::gdtrib(a, p, x); }
 
 double xsf_kolmogorov(double x) {
return xsf::kolmogorov(x); } @@ -592,6 +626,12 @@ double xsf_nbdtri(int k, int n, double p) { return xsf::nbdtri(k, n, p); } double xsf_ndtr(double x) { return xsf::ndtr(x); } +npy_cdouble xsf_cndtr(npy_cdouble x) { return to_ccomplex(xsf::ndtr(to_complex(x))); } + +double xsf_log_ndtr(double x) { return xsf::log_ndtr(x); } + +npy_cdouble xsf_clog_ndtr(npy_cdouble x) { return to_ccomplex(xsf::log_ndtr(to_complex(x))); } + double xsf_ndtri(double x) { return xsf::ndtri(x); } double xsf_owens_t(double h, double a) { return xsf::owens_t(h, a); } diff --git a/scipy/special/xsf_wrappers.h b/scipy/special/xsf_wrappers.h index 4cd40e9ae410..619fdba2cf61 100644 --- a/scipy/special/xsf_wrappers.h +++ b/scipy/special/xsf_wrappers.h @@ -179,28 +179,26 @@ double cephes_igam_fac(double a, double x); double cephes_lanczos_sum_expg_scaled(double x); -double cephes_erf(double x); - -double cephes_erfc(double x); - double cephes_poch(double x, double m); double cephes_rgamma(double x); double xsf_zetac(double x); -double cephes_log1p(double x); - -double cephes_log1pmx(double x); - double cephes_lgam1p(double x); -double cephes_expm1(double x); - double cephes_expn(int n, double x); double xsf_ellipe(double x); +double xsf_dawsn(double x); + +npy_cdouble xsf_cdawsn(npy_cdouble z); + +double xsf_voigt_profile(double x, double sigma, double gamma); + +npy_cdouble xsf_cwofz(npy_cdouble z); + double cephes_ellpk(double x); double cephes_ellie(double phi, double m); @@ -209,10 +207,6 @@ double xsf_ellipkinc(double phi, double m); double cephes_erfcinv(double y); -double cephes_exp10(double x); - -double cephes_exp2(double x); - double cephes_round(double x); double cephes_spence(double x); @@ -221,6 +215,39 @@ double xsf_struve_h(double v, double z); double xsf_struve_l(double v, double z); +// Exp + +double xsf_expm1(double x); +npy_cdouble xsf_cexpm1(npy_cdouble z); + +double xsf_exp2(double x); + +double xsf_exp10(double x); + +// Erf + +double xsf_erf(double x); +npy_cdouble xsf_cerf(npy_cdouble z); + +double xsf_erfi(double x); +npy_cdouble xsf_cerfi(npy_cdouble z); + +double xsf_erfc(double x); +npy_cdouble xsf_cerfc(npy_cdouble z); + +double xsf_erfcx(double x); +npy_cdouble xsf_cerfcx(npy_cdouble z); + +// Log + +double xsf_log1p(double x); +npy_cdouble xsf_clog1p(npy_cdouble x); + +double xsf_xlogy(double x, double y); +npy_cdouble xsf_cxlogy(npy_cdouble x, npy_cdouble y); + +double xsf_xlog1py(double x, double y); +npy_cdouble xsf_cxlog1py(npy_cdouble x, npy_cdouble y); // Cylindrical Bessel @@ -325,6 +352,9 @@ double xsf_nbdtr(int k, int n, double p); double xsf_nbdtrc(int k, int n, double p); double xsf_nbdtri(int k, int n, double p); double xsf_ndtr(double x); +npy_cdouble xsf_cndtr(npy_cdouble x); +double xsf_log_ndtr(double x); +npy_cdouble xsf_clog_ndtr(npy_cdouble x); double xsf_ndtri(double x); double xsf_owens_t(double h, double a); double xsf_pdtr(double k, double m); diff --git a/scipy/stats/_continued_fraction.py b/scipy/stats/_continued_fraction.py index d71ce1009594..4b08c389f1c8 100644 --- a/scipy/stats/_continued_fraction.py +++ b/scipy/stats/_continued_fraction.py @@ -300,7 +300,7 @@ def func(n, *args): xp = array_namespace(fs_a[0], fs_b[0], *args) - shape = xp.broadcast_shapes(shape_a, shape_b) + shape = np.broadcast_shapes(shape_a, shape_b) # OK to use NumPy on tuples dtype = xp.result_type(dtype_a, dtype_b) an = xp.astype(xp_ravel(xp.broadcast_to(xp.reshape(fs_a[0], shape_a), shape)), dtype) # noqa: E501 bn = xp.astype(xp_ravel(xp.broadcast_to(xp.reshape(fs_b[0], shape_b), shape)), 
dtype) # noqa: E501 diff --git a/scipy/stats/_covariance.py b/scipy/stats/_covariance.py index 2dde85d0bac9..2ff905889bb9 100644 --- a/scipy/stats/_covariance.py +++ b/scipy/stats/_covariance.py @@ -488,7 +488,9 @@ def _covariance(self): if self._cov_matrix is None else self._cov_matrix) def _colorize(self, x): - return linalg.solve_triangular(self._chol_P.T, x.T, lower=False).T + m = x.T.shape[0] + res = linalg.solve_triangular(self._chol_P.T, x.T.reshape(m, -1), lower=False) + return res.reshape(x.T.shape).T def _dot_diag(x, d): @@ -549,8 +551,9 @@ def _covariance(self): return self._factor @ self._factor.T def _whiten(self, x): - res = linalg.solve_triangular(self._factor, x.T, lower=True).T - return res + m = x.T.shape[0] + res = linalg.solve_triangular(self._factor, x.T.reshape(m, -1), lower=True) + return res.reshape(x.T.shape).T def _colorize(self, x): return x @ self._factor.T diff --git a/scipy/stats/_stats_mstats_common.py b/scipy/stats/_stats_mstats_common.py index 6900eba1fa61..9a621016e157 100644 --- a/scipy/stats/_stats_mstats_common.py +++ b/scipy/stats/_stats_mstats_common.py @@ -2,6 +2,7 @@ import numpy as np from . import distributions from .._lib._bunch import _make_tuple_bunch +from ._axis_nan_policy import _axis_nan_policy_factory from ._stats_pythran import siegelslopes as siegelslopes_pythran __all__ = ['_find_repeats', 'theilslopes', 'siegelslopes'] @@ -14,6 +15,13 @@ ['slope', 'intercept']) +def _n_samples_optional_x(kwargs): + return 2 if kwargs.get('x', None) is not None else 1 + + +@_axis_nan_policy_factory(TheilslopesResult, default_axis=None, n_outputs=4, + n_samples=_n_samples_optional_x, + result_to_tuple=tuple, paired=True, too_small=1) def theilslopes(y, x=None, alpha=0.95, method='separate'): r""" Computes the Theil-Sen estimator for a set of points (x, y). @@ -131,7 +139,9 @@ def theilslopes(y, x=None, alpha=0.95, method='separate'): else: x = np.array(x, dtype=float, copy=True).ravel() if len(x) != len(y): - raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})") + raise ValueError("Array shapes are incompatible for broadcasting.") + if len(x) < 2: + raise ValueError("`x` and `y` must have length at least 2.") # Compute sorted slopes only when deltax > 0 deltax = x[:, np.newaxis] - x @@ -192,6 +202,9 @@ def _find_repeats(arr): return unique[atleast2], freq[atleast2] +@_axis_nan_policy_factory(SiegelslopesResult, default_axis=None, n_outputs=2, + n_samples=_n_samples_optional_x, + result_to_tuple=tuple, paired=True, too_small=1) def siegelslopes(y, x=None, method="hierarchical"): r""" Computes the Siegel estimator for a set of points (x, y). @@ -296,8 +309,12 @@ def siegelslopes(y, x=None, method="hierarchical"): else: x = np.asarray(x, dtype=float).ravel() if len(x) != len(y): - raise ValueError(f"Incompatible lengths ! 
({len(y)}<>{len(x)})")
+        raise ValueError("Array shapes are incompatible for broadcasting.")
+    if len(x) < 2:
+        raise ValueError("`x` and `y` must have length at least 2.")
+
     dtype = np.result_type(x, y, np.float32)  # use at least float32
     y, x = y.astype(dtype), x.astype(dtype)
     medslope, medinter = siegelslopes_pythran(y, x, method)
+    medslope, medinter = np.asarray(medslope)[()], np.asarray(medinter)[()]
     return SiegelslopesResult(slope=medslope, intercept=medinter)
diff --git a/scipy/stats/_stats_py.py b/scipy/stats/_stats_py.py
index 71ae19acabc2..24cb2e2f1f63 100644
--- a/scipy/stats/_stats_py.py
+++ b/scipy/stats/_stats_py.py
@@ -155,6 +155,18 @@ def _convert_common_float(*arrays, xp=None):
 SignificanceResult = _make_tuple_bunch('SignificanceResult',
                                        ['statistic', 'pvalue'], [])
 
+# Let's call a SignificanceResult with the legacy "correlation" attribute a
+# "CorrelationResult". Don't add it to `extra_field_names`; it shouldn't
+# appear in the repr.
+
+
+def _pack_CorrelationResult(statistic, pvalue, correlation):
+    res = SignificanceResult(statistic, pvalue)
+    res.correlation = correlation
+    return res
+
+
+def _unpack_CorrelationResult(res):
+    return res.statistic, res.pvalue, res.correlation
 
 
 # note that `weights` are paired with `x`
@@ -1308,10 +1320,8 @@ def skew(a, axis=0, bias=True, nan_policy='propagate'):
     if not bias:
         can_correct = ~zero & (n > 2)
         if xp.any(can_correct):
-            m2 = m2[can_correct]
-            m3 = m3[can_correct]
             nval = ((n - 1.0) * n)**0.5 / (n - 2.0) * m3 / m2**1.5
-            vals[can_correct] = nval
+            vals = xp.where(can_correct, nval, vals)
 
     return vals[()] if vals.ndim == 0 else vals
 
@@ -1418,10 +1428,8 @@ def kurtosis(a, axis=0, fisher=True, bias=True, nan_policy='propagate'):
     if not bias:
         can_correct = ~zero & (n > 3)
         if xp.any(can_correct):
-            m2 = m2[can_correct]
-            m4 = m4[can_correct]
             nval = 1.0/(n-2)/(n-3) * ((n**2-1.0)*m4/m2**2.0 - 3*(n-1)**2.0)
-            vals[can_correct] = nval + 3.0
+            vals = xp.where(can_correct, nval + 3.0, vals)
 
     vals = vals - 3 if fisher else vals
     return vals[()] if vals.ndim == 0 else vals
@@ -5313,6 +5321,9 @@ def spearmanr(a, b=None, axis=0, nan_policy='propagate',
     return res
 
 
+@_axis_nan_policy_factory(_pack_CorrelationResult, n_samples=2,
+                          result_to_tuple=_unpack_CorrelationResult, paired=True,
+                          too_small=1, n_outputs=3)
 def pointbiserialr(x, y):
     r"""Calculate a point biserial correlation coefficient and its p-value.
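(Illustration, not part of the patch: the practical effect of these `_axis_nan_policy_factory` decorators is that the correlation functions gain the standard `axis`, `nan_policy`, and `keepdims` handling. A sketch, assuming the decorator vectorizes paired samples along `axis` here as it does for other decorated functions:)

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(12345)
    x = rng.random((3, 10))
    y = rng.random((3, 10))
    x[0, 0] = np.nan

    # one statistic per row; with paired=True, the NaN pair in row 0 is dropped
    res = stats.kendalltau(x, y, axis=-1, nan_policy='omit')
    print(res.statistic.shape)  # (3,)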
@@ -5408,6 +5419,9 @@
     return res
 
 
+@_axis_nan_policy_factory(_pack_CorrelationResult, default_axis=None, n_samples=2,
+                          result_to_tuple=_unpack_CorrelationResult, paired=True,
+                          too_small=1, n_outputs=3)
 def kendalltau(x, y, *, nan_policy='propagate',
                method='auto', variant='b', alternative='two-sided'):
     r"""Calculate Kendall's tau, a correlation measure for ordinal data.
@@ -5525,37 +5539,14 @@ def kendalltau(x, y, *,
     y = np.asarray(y).ravel()
 
     if x.size != y.size:
-        raise ValueError("All inputs to `kendalltau` must be of the same "
-                         f"size, found x-size {x.size} and y-size {y.size}")
+        raise ValueError("Array shapes are incompatible for broadcasting.")
     elif not x.size or not y.size:
         # Return NaN if arrays are empty
-        res = SignificanceResult(np.nan, np.nan)
-        res.correlation = np.nan
-        return res
-
-    # check both x and y
-    cnx, npx = _contains_nan(x, nan_policy)
-    cny, npy = _contains_nan(y, nan_policy)
-    contains_nan = cnx or cny
-    if npx == 'omit' or npy == 'omit':
-        nan_policy = 'omit'
-
-    if contains_nan and nan_policy == 'propagate':
-        res = SignificanceResult(np.nan, np.nan)
-        res.correlation = np.nan
+        NaN = _get_nan(x, y)
+        res = SignificanceResult(NaN, NaN)
+        res.correlation = NaN
         return res
 
-    elif contains_nan and nan_policy == 'omit':
-        x = ma.masked_invalid(x)
-        y = ma.masked_invalid(y)
-        if variant == 'b':
-            return mstats_basic.kendalltau(x, y, method=method, use_ties=True,
-                                           alternative=alternative)
-        else:
-            message = ("nan_policy='omit' is currently compatible only with "
-                       "variant='b'.")
-            raise ValueError(message)
-
     def count_rank_tie(ranks):
         cnt = np.bincount(ranks).astype('int64', copy=False)
         cnt = cnt[cnt > 1]
@@ -5586,8 +5577,9 @@ def count_rank_tie(ranks):
     tot = (size * (size - 1)) // 2
 
     if xtie == tot or ytie == tot:
-        res = SignificanceResult(np.nan, np.nan)
-        res.correlation = np.nan
+        NaN = _get_nan(x, y)
+        res = SignificanceResult(NaN, NaN)
+        res.correlation = NaN
         return res
 
     # Note that tot = con + dis + (xtie - ntie) + (ytie - ntie) + ntie
@@ -5636,6 +5628,15 @@ def count_rank_tie(ranks):
     return res
 
 
+def _weightedtau_n_samples(kwargs):
+    rank = kwargs.get('rank', False)
+    return 2 if (isinstance(rank, bool) or rank is None) else 3
+
+
+@_axis_nan_policy_factory(_pack_CorrelationResult, default_axis=None,
+                          n_samples=_weightedtau_n_samples,
+                          result_to_tuple=_unpack_CorrelationResult, paired=True,
+                          too_small=1, n_outputs=3, override={'nan_propagation': False})
 def weightedtau(x, y, rank=True, weigher=None, additive=True):
     r"""Compute a weighted version of Kendall's :math:`\tau`.
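(Illustration, not part of the patch: `_weightedtau_n_samples` above is what lets a user-supplied `rank` array participate in the axis/NaN machinery as a third paired sample, while the boolean/None forms of `rank` keep the input treated as two samples. The first expected value below comes from this PR's own test updates:)

    import numpy as np
    from scipy import stats

    x = [12, 2, 1, 12, 2]
    y = [1, 4, 7, 1, 0]
    rank = [2, 3, 4, 1, 0]  # an array here counts as a third paired sample

    print(stats.weightedtau(x, y).statistic)        # ~-0.5669 (default ranking)
    print(stats.weightedtau(x, y, rank).statistic)  # tau for the supplied ranking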
@@ -5771,15 +5772,14 @@ def weightedtau(x, y, rank=True, weigher=None, additive=True): """ x = np.asarray(x).ravel() y = np.asarray(y).ravel() + NaN = _get_nan(x, y) if x.size != y.size: - raise ValueError("All inputs to `weightedtau` must be " - "of the same size, " - f"found x-size {x.size} and y-size {y.size}") + raise ValueError("Array shapes are incompatible for broadcasting.") if not x.size: # Return NaN if arrays are empty - res = SignificanceResult(np.nan, np.nan) - res.correlation = np.nan + res = SignificanceResult(NaN, NaN) + res.correlation = NaN return res # If there are NaNs we apply _toint64() @@ -5800,11 +5800,11 @@ def weightedtau(x, y, rank=True, weigher=None, additive=True): y = _toint64(y) if rank is True: - tau = ( + tau = np.asarray( _weightedrankedtau(x, y, None, weigher, additive) + _weightedrankedtau(y, x, None, weigher, additive) - ) / 2 - res = SignificanceResult(tau, np.nan) + )[()] / 2 + res = SignificanceResult(tau, NaN) res.correlation = tau return res @@ -5812,14 +5812,15 @@ def weightedtau(x, y, rank=True, weigher=None, additive=True): rank = np.arange(x.size, dtype=np.intp) elif rank is not None: rank = np.asarray(rank).ravel() + rank = _toint64(rank).astype(np.intp) if rank.size != x.size: raise ValueError( "All inputs to `weightedtau` must be of the same size, " f"found x-size {x.size} and rank-size {rank.size}" ) - tau = _weightedrankedtau(x, y, rank, weigher, additive) - res = SignificanceResult(tau, np.nan) + tau = np.asarray(_weightedrankedtau(x, y, rank, weigher, additive))[()] + res = SignificanceResult(tau, NaN) res.correlation = tau return res @@ -6117,27 +6118,29 @@ def _t_confidence_interval(df, t, confidence_level, alternative, dtype=None, xp= dtype = t.dtype if dtype is None else dtype xp = array_namespace(t) if xp is None else xp - # stdtrit not dispatched yet; use NumPy - df, t = np.asarray(df), np.asarray(t) - if confidence_level < 0 or confidence_level > 1: message = "`confidence_level` must be a number between 0 and 1." raise ValueError(message) + confidence_level = xp.asarray(confidence_level, dtype=dtype) + inf = xp.asarray(xp.inf, dtype=dtype) + if alternative < 0: # 'less' p = confidence_level - low, high = np.broadcast_arrays(-np.inf, special.stdtrit(df, p)) + low, high = xp.broadcast_arrays(-inf, special.stdtrit(df, p)) elif alternative > 0: # 'greater' p = 1 - confidence_level - low, high = np.broadcast_arrays(special.stdtrit(df, p), np.inf) + low, high = xp.broadcast_arrays(special.stdtrit(df, p), inf) elif alternative == 0: # 'two-sided' tail_probability = (1 - confidence_level)/2 - p = tail_probability, 1-tail_probability + p = xp.asarray([tail_probability, 1-tail_probability]) # axis of p must be the zeroth and orthogonal to all the rest - p = np.reshape(p, [2] + [1]*np.asarray(df).ndim) - low, high = special.stdtrit(df, p) + p = xp.reshape(p, tuple([2] + [1]*xp.asarray(df).ndim)) + ci = special.stdtrit(df, p) + low, high = ci[0, ...], ci[1, ...] 
else: # alternative is NaN when input is empty (see _axis_nan_policy) - p, nans = np.broadcast_arrays(t, np.nan) + nan = xp.asarray(xp.nan) + p, nans = xp.broadcast_arrays(t, nan) low, high = nans, nans low = xp.asarray(low, dtype=dtype) @@ -6154,10 +6157,9 @@ def _ttest_ind_from_stats(mean1, mean2, denom, df, alternative, xp=None): with np.errstate(divide='ignore', invalid='ignore'): t = xp.divide(d, denom) - t_np = np.asarray(t) - df_np = np.asarray(df) - prob = _get_pvalue(t_np, distributions.t(df_np), alternative, xp=np) - prob = xp.asarray(prob, dtype=t.dtype) + dist = _SimpleStudentT(xp.asarray(df, dtype=t.dtype)) + prob = _get_pvalue(t, dist, alternative, xp=xp) + prob = prob[()] if prob.ndim == 0 else prob t = t[()] if t.ndim == 0 else t prob = prob[()] if prob.ndim == 0 else prob @@ -10497,6 +10499,18 @@ def lmoment(sample, order=None, *, axis=0, sorted=False, standardize=True): extra_field_names=['intercept_stderr']) +def _pack_LinregressResult(slope, intercept, rvalue, pvalue, stderr, intercept_stderr): + return LinregressResult(slope, intercept, rvalue, pvalue, stderr, + intercept_stderr=intercept_stderr) + + +def _unpack_LinregressResult(res): + return tuple(res) + (res.intercept_stderr,) + + +@_axis_nan_policy_factory(_pack_LinregressResult, n_samples=2, + result_to_tuple=_unpack_LinregressResult, paired=True, + too_small=1, n_outputs=6) def linregress(x, y, alternative='two-sided'): """ Calculate a linear least-squares regression for two sets of measurements. @@ -10629,7 +10643,7 @@ def linregress(x, y, alternative='two-sided'): # r = ssxym / sqrt( ssxm * ssym ) if ssxm == 0.0 or ssym == 0.0: # If the denominator was going to be 0 - r = 0.0 + r = np.asarray(np.nan if ssxym == 0 else 0.0)[()] else: r = ssxym / np.sqrt(ssxm * ssym) # Test for numerical error propagation (make sure -1 < r < 1) diff --git a/scipy/stats/tests/test_axis_nan_policy.py b/scipy/stats/tests/test_axis_nan_policy.py index 162e62b81392..ecaabf6b9248 100644 --- a/scipy/stats/tests/test_axis_nan_policy.py +++ b/scipy/stats/tests/test_axis_nan_policy.py @@ -61,6 +61,13 @@ def combine_pvalues_weighted(*args, **kwargs): method='stouffer', **kwargs) +def weightedtau_weighted(x, y, rank, **kwargs): + axis = kwargs.get('axis', 0) + nan_policy = kwargs.get('nan_policy', 'propagate') + rank = stats.rankdata(rank, axis=axis, nan_policy=nan_policy) + return stats.weightedtau(x, y, rank, **kwargs) + + axis_nan_policy_cases = [ # function, args, kwds, number of samples, number of outputs, # ... 
paired, unpacker function @@ -143,6 +150,20 @@ def combine_pvalues_weighted(*args, **kwargs): (xp_var, tuple(), dict(), 1, 1, False, lambda x: (x,)), (stats.chatterjeexi, tuple(), dict(), 2, 2, True, lambda res: (res.statistic, res.pvalue)), + (stats.pointbiserialr, tuple(), dict(), 2, 3, True, + lambda res: (res.statistic, res.pvalue, res.correlation)), + (stats.kendalltau, tuple(), dict(), 2, 3, True, + lambda res: (res.statistic, res.pvalue, res.correlation)), + (stats.weightedtau, tuple(), dict(), 2, 3, True, + lambda res: (res.statistic, res.pvalue, res.correlation)), + (weightedtau_weighted, tuple(), dict(), 3, 3, True, + lambda res: (res.statistic, res.pvalue, res.correlation)), + (stats.linregress, tuple(), dict(), 2, 6, True, + lambda res: tuple(res) + (res.intercept_stderr,)), + (stats.theilslopes, tuple(), dict(), 2, 4, True, tuple), + (stats.theilslopes, tuple(), dict(), 1, 4, True, tuple), + (stats.siegelslopes, tuple(), dict(), 2, 2, True, tuple), + (stats.siegelslopes, tuple(), dict(), 1, 2, True, tuple), ] # If the message is one of those expected, put nans in @@ -175,6 +196,9 @@ def combine_pvalues_weighted(*args, **kwargs): "One or more sample arguments is too small", "invalid value encountered", "divide by zero encountered", + "`x` and `y` must have length at least 2.", + "Inputs must not be empty.", + "All `x` coordinates are identical.", } # If the message is one of these, results of the function may be inaccurate, @@ -183,7 +207,7 @@ def combine_pvalues_weighted(*args, **kwargs): "Sample size too small for normal approximation."} # For some functions, nan_policy='propagate' should not just return NaNs -override_propagate_funcs = {stats.mode} +override_propagate_funcs = {stats.mode, weightedtau_weighted, stats.weightedtau} # For some functions, empty arrays produce non-NaN results empty_special_case_funcs = {stats.entropy} @@ -594,6 +618,8 @@ def test_keepdims(hypotest, args, kwds, n_samples, n_outputs, paired, unpacker, stats.differential_entropy} if sample_shape == (2, 3, 3, 4) and hypotest in small_sample_raises: pytest.skip("Sample too small; test raises error.") + if hypotest in {weightedtau_weighted}: + pytest.skip("`rankdata` used in testing doesn't support axis tuple.") # test if keepdims parameter works correctly if not unpacker: def unpacker(res): diff --git a/scipy/stats/tests/test_continued_fraction.py b/scipy/stats/tests/test_continued_fraction.py index 12fd5272ebdb..9da883f337a0 100644 --- a/scipy/stats/tests/test_continued_fraction.py +++ b/scipy/stats/tests/test_continued_fraction.py @@ -13,6 +13,10 @@ @pytest.mark.usefixtures("skip_xp_backends") @pytest.mark.skip_xp_backends('array_api_strict', reason='No fancy indexing assignment') @pytest.mark.skip_xp_backends('jax.numpy', reason="Don't support mutation") +# dask doesn't like lines like this +# n = int(xp.real(xp_ravel(n))[0]) +# (at some point in here the shape becomes nan) +@pytest.mark.skip_xp_backends('dask.array', reason="dask has issues with the shapes") class TestContinuedFraction: rng = np.random.default_rng(5895448232066142650) p = rng.uniform(1, 10, size=10) diff --git a/scipy/stats/tests/test_entropy.py b/scipy/stats/tests/test_entropy.py index 4002b2c52703..1e7ef86abe9f 100644 --- a/scipy/stats/tests/test_entropy.py +++ b/scipy/stats/tests/test_entropy.py @@ -11,8 +11,11 @@ from scipy._lib._array_api_no_0d import (xp_assert_close, xp_assert_equal, xp_assert_less) +@pytest.mark.skip_xp_backends("dask.array", reason="boolean index assignment") +@pytest.mark.usefixtures("skip_xp_backends") 
+@array_api_compatible class TestEntropy: - @array_api_compatible + def test_entropy_positive(self, xp): # See ticket #497 pk = xp.asarray([0.5, 0.2, 0.3]) @@ -22,7 +25,6 @@ def test_entropy_positive(self, xp): xp_assert_equal(eself, xp.asarray(0.)) xp_assert_less(-edouble, xp.asarray(0.)) - @array_api_compatible def test_entropy_base(self, xp): pk = xp.ones(16) S = stats.entropy(pk, base=2.) @@ -34,21 +36,18 @@ def test_entropy_base(self, xp): S2 = stats.entropy(pk, qk, base=2.) xp_assert_less(xp.abs(S/S2 - math.log(2.)), xp.asarray(1.e-5)) - @array_api_compatible def test_entropy_zero(self, xp): # Test for PR-479 x = xp.asarray([0., 1., 2.]) xp_assert_close(stats.entropy(x), xp.asarray(0.63651416829481278)) - @array_api_compatible def test_entropy_2d(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) qk = xp.asarray([[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]]) xp_assert_close(stats.entropy(pk, qk), xp.asarray([0.1933259, 0.18609809])) - @array_api_compatible def test_entropy_2d_zero(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) qk = xp.asarray([[0.0, 0.1], [0.3, 0.6], [0.5, 0.3]]) @@ -59,20 +58,17 @@ def test_entropy_2d_zero(self, xp): xp_assert_close(stats.entropy(pk, qk), xp.asarray([0.17403988, 0.18609809])) - @array_api_compatible def test_entropy_base_2d_nondefault_axis(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) xp_assert_close(stats.entropy(pk, axis=1), xp.asarray([0.63651417, 0.63651417, 0.66156324])) - @array_api_compatible def test_entropy_2d_nondefault_axis(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) qk = xp.asarray([[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]]) xp_assert_close(stats.entropy(pk, qk, axis=1), xp.asarray([0.23104906, 0.23104906, 0.12770641])) - @array_api_compatible def test_entropy_raises_value_error(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) qk = xp.asarray([[0.1, 0.2], [0.6, 0.3]]) @@ -80,33 +76,28 @@ def test_entropy_raises_value_error(self, xp): with pytest.raises(ValueError, match=message): stats.entropy(pk, qk) - @array_api_compatible def test_base_entropy_with_axis_0_is_equal_to_default(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) xp_assert_close(stats.entropy(pk, axis=0), stats.entropy(pk)) - @array_api_compatible def test_entropy_with_axis_0_is_equal_to_default(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) qk = xp.asarray([[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]]) xp_assert_close(stats.entropy(pk, qk, axis=0), stats.entropy(pk, qk)) - @array_api_compatible def test_base_entropy_transposed(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) xp_assert_close(stats.entropy(pk.T), stats.entropy(pk, axis=1)) - @array_api_compatible def test_entropy_transposed(self, xp): pk = xp.asarray([[0.1, 0.2], [0.6, 0.3], [0.3, 0.5]]) qk = xp.asarray([[0.2, 0.1], [0.3, 0.6], [0.5, 0.3]]) xp_assert_close(stats.entropy(pk.T, qk.T), stats.entropy(pk, qk, axis=1)) - @array_api_compatible def test_entropy_broadcasting(self, xp): rng = np.random.default_rng(74187315492831452) x = xp.asarray(rng.random(3)) @@ -115,7 +106,6 @@ def test_entropy_broadcasting(self, xp): xp_assert_equal(res[0], stats.entropy(x, y[0, ...])) xp_assert_equal(res[1], stats.entropy(x, y[1, ...])) - @array_api_compatible def test_entropy_shape_mismatch(self, xp): x = xp.ones((10, 1, 12)) y = xp.ones((11, 2)) @@ -123,7 +113,6 @@ def test_entropy_shape_mismatch(self, xp): with pytest.raises(ValueError, match=message): stats.entropy(x, y) - @array_api_compatible def 
diff --git a/scipy/stats/tests/test_morestats.py b/scipy/stats/tests/test_morestats.py
index fb47a010fee0..bfc19854181c 100644
--- a/scipy/stats/tests/test_morestats.py
+++ b/scipy/stats/tests/test_morestats.py
@@ -764,6 +764,7 @@ def test_result_attributes(self, xp):
         "jax.numpy", cpu_only=True,
         reason='`var` incorrect when `correction > n` (google/jax#21330)')
     @pytest.mark.usefixtures("skip_xp_backends")
+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")
     def test_empty_arg(self, xp):
         args = (g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, [])
         args = [xp.asarray(arg) for arg in args]
@@ -1457,10 +1458,12 @@ def test_empty(self):
                              (np.nan, np.nan, 0.0)))

     def test_array_of_size_one(self):
-        with np.errstate(invalid='ignore'):
+        message = "One or more sample arguments is too small..."
+        with (np.errstate(invalid='ignore'),
+              pytest.warns(SmallSampleWarning, match=message)):
             assert_equal(stats.probplot([1], fit=True),
                          ((np.array([0.]), np.array([1])),
-                          (np.nan, np.nan, 0.0)))
+                          (np.nan, np.nan, np.nan)))


 class TestWilcoxon:
@@ -1815,6 +1818,7 @@ def test_moments_normal_distribution(self, xp):
         m3 = stats.moment(data, order=3)
         xp_assert_close(xp.asarray((m1, m2, m3)), expected[:-1], atol=0.02, rtol=1e-2)

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in scalar divide")
     def test_empty_input(self, xp):
         if is_numpy(xp):
             with pytest.warns(SmallSampleWarning, match=too_small_1d_not_omit):
@@ -1858,6 +1862,7 @@ def test_against_R(self, case, xp):

 @array_api_compatible
 class TestKstatVar:
+    @pytest.mark.filterwarnings("ignore:invalid value encountered in scalar divide")
     def test_empty_input(self, xp):
         x = xp.asarray([])
         if is_numpy(xp):
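The `filterwarnings` marks added throughout these tests use pytest's `action:message` filter syntax: the part after the colon is a regular expression matched against the start of the warning message, so `"ignore:invalid value encountered"` silences only that family of RuntimeWarnings while leaving everything else visible. A minimal sketch:

    import warnings
    import pytest

    @pytest.mark.filterwarnings("ignore:invalid value encountered")
    def test_quiet_division():
        # matched by the filter above, so pytest does not report it
        warnings.warn("invalid value encountered in divide", RuntimeWarning)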
diff --git a/scipy/stats/tests/test_stats.py b/scipy/stats/tests/test_stats.py
index 826f1d6513e4..194cef1ed25a 100644
--- a/scipy/stats/tests/test_stats.py
+++ b/scipy/stats/tests/test_stats.py
@@ -83,6 +83,7 @@ class TestTrimmedStats:
     # TODO: write these tests to handle missing values properly
     dprec = np.finfo(np.float64).precision

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")  # for dask
     def test_tmean(self, xp):
         x = xp.asarray(X.tolist())  # use default dtype of xp
@@ -129,6 +130,7 @@ def test_tmean(self, xp):
         y_true = [4.5, 10, 17, 21, xp.nan, xp.nan, xp.nan, xp.nan, xp.nan]
         xp_assert_close(y, xp.asarray(y_true))

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")  # for dask
     @skip_xp_backends('cupy', reason="cupy/cupy#8391")
     def test_tvar(self, xp):
@@ -188,7 +190,7 @@ def test_tmin(self, xp):
         # check that if a full slice is masked, the output returns a
         # nan instead of a garbage value.
-        x = xp.arange(16).reshape(4, 4)
+        x = xp.reshape(xp.arange(16), (4, 4))
         res = stats.tmin(x, lowerlimit=4, axis=1)
         xp_assert_equal(res, xp.asarray([np.nan, 4, 8, 12]))
@@ -529,6 +531,7 @@ def test_length_two_neg1(self, xp):
         xp_assert_equal(low, -one)
         xp_assert_equal(high, one)

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")
     @skip_xp_backends('jax.numpy', reason="JAX doesn't allow item assignment.")
     def test_length_two_constant_input(self, xp):
         # Zero variance input
@@ -722,6 +725,7 @@ def test_nd_input_validation(self, xp):
         with pytest.raises(ValueError, match=message):
             stats.pearsonr(x, x, method=stats.PermutationMethod())

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")
     @skip_xp_backends('jax.numpy',
                       reason='JAX arrays do not support item assignment')
     def test_nd_special_cases(self, xp):
@@ -1665,7 +1669,8 @@ def test_kendalltau():
                  (np.nan, np.nan))

     # empty arrays provided as input
-    assert_equal(stats.kendalltau([], []), (np.nan, np.nan))
+    with pytest.warns(SmallSampleWarning, match="One or more sample..."):
+        assert_equal(stats.kendalltau([], []), (np.nan, np.nan))

     # check with larger arrays
     np.random.seed(7546)
@@ -1702,10 +1707,8 @@ def test_kendalltau():
     assert_raises(ValueError, stats.kendalltau, x, y)

     # test all ties
-    tau, p_value = stats.kendalltau([], [])
-    assert_equal(np.nan, tau)
-    assert_equal(np.nan, p_value)
-    tau, p_value = stats.kendalltau([0], [0])
+    with pytest.warns(SmallSampleWarning, match="One or more sample..."):
+        tau, p_value = stats.kendalltau([0], [0])
     assert_equal(np.nan, tau)
     assert_equal(np.nan, p_value)
@@ -1714,12 +1717,12 @@ def test_kendalltau():
     x = np.ma.masked_greater(x, 1995)
     y = np.arange(2000, dtype=float)
     y = np.concatenate((y[1000:], y[:1000]))
-    assert_(np.isfinite(stats.kendalltau(x,y)[1]))
+    assert_(np.isfinite(stats.mstats.kendalltau(x,y)[1]))


 def test_kendalltau_vs_mstats_basic():
     np.random.seed(42)
-    for s in range(2,10):
+    for s in range(3, 10):
         a = []
         # Generate rankings with ties
         for i in range(s):
@@ -1840,7 +1843,8 @@ def exact_test(self, x, y, alternative, rev, stat_expected, p_expected):
     def test_against_R_n1(self, alternative, p_expected, rev):
         x, y = [1], [2]
         stat_expected = np.nan
-        self.exact_test(x, y, alternative, rev, stat_expected, p_expected)
+        with pytest.warns(SmallSampleWarning, match="One or more sample..."):
+            self.exact_test(x, y, alternative, rev, stat_expected, p_expected)

     case_R_n2 = (list(zip(alternatives, p_n2, [False]*3))
                  + list(zip(alternatives, reversed(p_n2), [True]*3)))
@@ -2027,15 +2031,17 @@ def test_weightedtau():
                                         np.asarray(y, dtype=np.float64))
     assert_approx_equal(tau, -0.56694968153682723)
     # All ties
-    tau, p_value = stats.weightedtau([], [])
+    with pytest.warns(SmallSampleWarning, match="One or more sample..."):
+        tau, p_value = stats.weightedtau([], [])
     assert_equal(np.nan, tau)
     assert_equal(np.nan, p_value)
-    tau, p_value = stats.weightedtau([0], [0])
+    with pytest.warns(SmallSampleWarning, match="One or more sample..."):
+        tau, p_value = stats.weightedtau([0], [0])
     assert_equal(np.nan, tau)
     assert_equal(np.nan, p_value)
     # Size mismatches
     assert_raises(ValueError, stats.weightedtau, [0, 1], [0, 1, 2])
-    assert_raises(ValueError, stats.weightedtau, [0, 1], [0, 1], [0])
+    assert_raises(ValueError, stats.weightedtau, [0, 1], [0, 1], [0, 1, 2])
     # NaNs
     x = [12, 2, 1, 12, 2]
     y = [1, 4, 7, 1, np.nan]
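These hunks track a behavior change: degenerate inputs now emit `SmallSampleWarning` and return NaN rather than raising, so the tests wrap each call in `pytest.warns(..., match=...)`, where `match` is a regex tested against the message. A self-contained sketch of the pattern (the warning class here is a stand-in, not the SciPy import):

    import math
    import warnings
    import pytest

    class SmallSampleWarning(RuntimeWarning):  # stand-in for scipy.stats' class
        pass

    def tiny_mean(x):
        if len(x) < 2:
            warnings.warn("One or more sample arguments is too small",
                          SmallSampleWarning, stacklevel=2)
            return math.nan
        return sum(x) / len(x)

    def test_small_sample():
        with pytest.warns(SmallSampleWarning, match="One or more sample"):
            assert math.isnan(tiny_mean([1.0]))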
@@ -2069,10 +2075,12 @@ def test_segfault_issue_9710():
     # https://github.com/scipy/scipy/issues/9710
     # This test was created to check segfault
     # In issue SEGFAULT only repros in optimized builds after calling the function twice
-    stats.weightedtau([1], [1.0])
-    stats.weightedtau([1], [1.0])
-    # The code below also caused SEGFAULT
-    stats.weightedtau([np.nan], [52])
+    message = "One or more sample arguments is too small"
+    with pytest.warns(SmallSampleWarning, match=message):
+        stats.weightedtau([1], [1.0])
+        stats.weightedtau([1], [1.0])
+        # The code below also caused SEGFAULT
+        stats.weightedtau([np.nan], [52])


 def test_kendall_tau_large():
@@ -2185,7 +2193,9 @@ def test_regressZEROX(self):
         # total sum of squares of exactly 0.
         result = stats.linregress(X, ZERO)
         assert_almost_equal(result.intercept, 0.0)
-        assert_almost_equal(result.rvalue, 0.0)
+        with pytest.warns(stats.ConstantInputWarning, match="An input array..."):
+            ref_rvalue = stats.pearsonr(X, ZERO).statistic
+        assert_almost_equal(result.rvalue, ref_rvalue)

     def test_regress_simple(self):
         # Regress a line with sinusoidal noise.
@@ -2360,7 +2370,9 @@ def test_compare_to_polyfit(self):
         assert_almost_equal(result.intercept, poly[1])

     def test_empty_input(self):
-        assert_raises(ValueError, stats.linregress, [], [])
+        with pytest.warns(SmallSampleWarning, match="One or more sample..."):
+            res = stats.linregress([], [])
+            assert np.all(np.isnan(res))

     def test_nan_input(self):
         x = np.arange(10.)
@@ -2804,6 +2816,7 @@ class TestSEM:
     testcase = [1., 2., 3., 4.]
     scalar_testcase = 4.

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")
     def test_sem_scalar(self, xp):
         # This is not in R, so used:
         # sqrt(var(testcase)*3/4)/sqrt(3)
@@ -2865,9 +2878,9 @@ def test_zmap(self, x, y, xp):

     def test_zmap_axis(self, xp):
         # Test use of 'axis' keyword in zmap.
-        x = np.array([[0.0, 0.0, 1.0, 1.0],
-                      [1.0, 1.0, 1.0, 2.0],
-                      [2.0, 0.0, 2.0, 0.0]])
+        x = xp.asarray([[0.0, 0.0, 1.0, 1.0],
+                        [1.0, 1.0, 1.0, 2.0],
+                        [2.0, 0.0, 2.0, 0.0]])

         t1 = 1.0/(2.0/3)**0.5
         t2 = 3.**0.5/3
@@ -2882,6 +2895,8 @@ def test_zmap_axis(self, xp):
         z1_expected = [[-1.0, -1.0, 1.0, 1.0],
                        [-t2, -t2, -t2, 3.**0.5],
                        [1.0, -1.0, 1.0, -1.0]]
+        z0_expected = xp.asarray(z0_expected)
+        z1_expected = xp.asarray(z1_expected)

         xp_assert_close(z0, z0_expected)
         xp_assert_close(z1, z1_expected)
@@ -2905,20 +2920,28 @@ def test_zmap_nan_policy_omit(self, ddof, xp):
         scores = xp.asarray([-3, -1, 2, np.nan])
         compare = xp.asarray([-8, -3, 2, 7, 12, np.nan])
         z = stats.zmap(scores, compare, ddof=ddof, nan_policy='omit')
-        ref = stats.zmap(scores, compare[~xp.isnan(compare)], ddof=ddof)
+        # exclude nans from compare, don't use isnan + mask since that messes up
+        # dask
+        ref = stats.zmap(scores, compare[:5], ddof=ddof)
         xp_assert_close(z, ref)

     @pytest.mark.parametrize('ddof', [0, 2])
     def test_zmap_nan_policy_omit_with_axis(self, ddof, xp):
         scores = xp.reshape(xp.arange(-5.0, 9.0), (2, -1))
-        compare = xp.reshape(xp.linspace(-8, 6, 24), (2, -1))
-        compare[0, 4] = xp.nan
-        compare[0, 6] = xp.nan
-        compare[1, 1] = xp.nan
+        compare = np.reshape(np.linspace(-8, 6, 24), (2, -1))
+        compare[0, 4] = np.nan
+        compare[0, 6] = np.nan
+        compare[1, 1] = np.nan
+        # convert from numpy since some libraries like dask
+        # can't handle the data-dependent shapes from the isnan masking
+        compare_0_notna = xp.asarray(compare[0, :][~np.isnan(compare[0, :])])
+        compare_1_notna = xp.asarray(compare[1, :][~np.isnan(compare[1, :])])
+        compare = xp.asarray(compare)
+
         z = stats.zmap(scores, compare, nan_policy='omit', axis=1, ddof=ddof)
-        res0 = stats.zmap(scores[0, :], compare[0, :][~xp.isnan(compare[0, :])],
+        res0 = stats.zmap(scores[0, :], compare_0_notna,
                           ddof=ddof)
-        res1 = stats.zmap(scores[1, :], compare[1, :][~xp.isnan(compare[1, :])],
+        res1 = stats.zmap(scores[1, :], compare_1_notna,
                           ddof=ddof)
         expected = xp.stack((res0, res1))
         xp_assert_close(z, expected)
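The rewritten zmap tests show the Dask-friendly way to drop NaNs: apply the boolean mask in NumPy, where the filtered length is concrete, and only then convert to the backend namespace. Sketch, with `xp` standing in for whatever namespace the fixture injects:

    import numpy as np

    compare = np.reshape(np.linspace(-8, 6, 24), (2, -1))
    compare[0, 4] = np.nan

    # mask in NumPy: the filtered row has a known, concrete length
    row0_notna = compare[0, :][~np.isnan(compare[0, :])]

    xp = np  # stand-in; the same mask on a dask.array would give an unknown shape
    row0_xp = xp.asarray(row0_notna)
    assert row0_xp.shape == (11,)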
@@ -3010,6 +3033,8 @@ def test_zscore_constant_input_1d(self, xp):
         z = stats.zscore(x)
         xp_assert_equal(z, xp.full(x.shape, xp.nan))

+    @pytest.mark.filterwarnings("ignore::FutureWarning")  # for dask
+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")  # for dask
     def test_zscore_constant_input_2d(self, xp):
         x = xp.asarray([[10.0, 10.0, 10.0, 10.0],
                         [10.0, 11.0, 12.0, 13.0]])
@@ -3031,6 +3056,7 @@ def test_zscore_constant_input_2d(self, xp):
         z = stats.zscore(y, axis=None)
         xp_assert_equal(z, xp.full(y.shape, xp.asarray(xp.nan)))

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")  # for dask
     def test_zscore_constant_input_2d_nan_policy_omit(self, xp):
         x = xp.asarray([[10.0, 10.0, 10.0, 10.0],
                         [10.0, 11.0, 12.0, xp.nan],
@@ -3050,6 +3076,7 @@ def test_zscore_constant_input_2d_nan_policy_omit(self, xp):
                                        [-s, 0, s, xp.nan],
                                        [-s2/2, s2, xp.nan, -s2/2]]))

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in divide")  # for dask
     def test_zscore_2d_all_nan_row(self, xp):
         # A row is all nan, and we use axis=1.
         x = xp.asarray([[np.nan, np.nan, np.nan, np.nan],
@@ -3058,6 +3085,7 @@ def test_zscore_2d_all_nan_row(self, xp):
         xp_assert_close(z, xp.asarray([[np.nan, np.nan, np.nan, np.nan],
                                        [-1.0, -1.0, 1.0, 1.0]]))

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
     @skip_xp_backends('cupy', reason="cupy/cupy#8391")
     def test_zscore_2d_all_nan(self, xp):
         # The entire 2d array is nan, and we use axis=None.
@@ -3114,6 +3142,7 @@ def test_zscore_masked_element_0_gh19039(self, xp):
         res = stats.zscore(y, axis=None)
         assert_equal(res[1:], np.nan)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
     def test_degenerate_input(self, xp):
         scores = xp.arange(3)
         compare = xp.ones(3)
@@ -3457,6 +3486,8 @@ def _assert_equal(self, actual, expect, *, shape=None, dtype=None):
     @array_api_compatible
     @pytest.mark.parametrize('size', [10, (10, 2)])
     @pytest.mark.parametrize('m, c', product((0, 1, 2, 3), (None, 0, 1)))
+    # ignore warning for dask
+    @pytest.mark.filterwarnings("ignore:divide by zero encountered in divide")
     def test_moment_center_scalar_moment(self, size, m, c, xp):
         rng = np.random.default_rng(6581432544381372042)
         x = xp.asarray(rng.random(size=size))
@@ -3468,6 +3499,8 @@ def test_moment_center_scalar_moment(self, size, m, c, xp):
     @array_api_compatible
     @pytest.mark.parametrize('size', [10, (10, 2)])
     @pytest.mark.parametrize('c', (None, 0, 1))
+    # ignore warning for dask
+    @pytest.mark.filterwarnings("ignore:divide by zero encountered in divide")
     def test_moment_center_array_moment(self, size, c, xp):
         rng = np.random.default_rng(1706828300224046506)
         x = xp.asarray(rng.random(size=size))
@@ -3604,6 +3637,8 @@ def test_moment_accuracy(self):
     @pytest.mark.parametrize('order', [0, 1, 2, 3])
     @pytest.mark.parametrize('axis', [-1, 0, 1])
     @pytest.mark.parametrize('center', [None, 0])
+    # ignore warning for dask
+    @pytest.mark.filterwarnings("ignore:divide by zero encountered in divide")
     def test_moment_array_api(self, xp, order, axis, center):
         rng = np.random.default_rng(34823589259425)
         x = rng.random(size=(5, 6, 7))
@@ -3634,10 +3669,9 @@ def test_empty_1d(self, stat_fun, xp):
             res = stat_fun(x)
         xp_assert_equal(res, xp.asarray(xp.nan))

-    @skip_xp_backends('jax.numpy',
-                      reason="JAX arrays do not support item assignment")
-    @pytest.mark.usefixtures("skip_xp_backends")
     @array_api_compatible
+    # ignore warning for dask
+    @pytest.mark.filterwarnings("ignore:invalid value encountered in scalar divide")
     def test_skewness(self, xp):
         # Scalar test case
         y = stats.skew(xp.asarray(self.scalar_testcase))
@@ -3709,8 +3743,8 @@ def test_precision_loss_gh15554(self, xp):
             a[:, 0] = 1.01
             stats.skew(a)

-    @skip_xp_backends('jax.numpy',
-                      reason="JAX arrays do not support item assignment")
+    @skip_xp_backends('jax.numpy', reason="JAX arrays do not support item assignment")
+    @skip_xp_backends('dask.array', reason='boolean index assignment')
     @pytest.mark.usefixtures("skip_xp_backends")
     @array_api_compatible
     @pytest.mark.parametrize('axis', [-1, 0, 2, None])
@@ -3746,6 +3780,7 @@ class TestKurtosis(SkewKurtosisTest):
     @skip_xp_backends('jax.numpy',
                       reason='JAX arrays do not support item assignment')
     @pytest.mark.usefixtures("skip_xp_backends")
+    @pytest.mark.filterwarnings("ignore:invalid value encountered in scalar divide")
     @array_api_compatible
     def test_kurtosis(self, xp):
         # Scalar test case
@@ -3809,8 +3844,8 @@ def test_kurtosis_constant_value(self, xp):
             assert xp.isnan(stats.kurtosis(a / float(2**50), fisher=False))
             assert xp.isnan(stats.kurtosis(a, fisher=False, bias=False))

-    @skip_xp_backends('jax.numpy',
-                      reason='JAX arrays do not support item assignment')
+    @skip_xp_backends('jax.numpy', reason='JAX arrays do not support item assignment')
+    @skip_xp_backends('dask.array', reason='boolean index assignment')
     @pytest.mark.usefixtures("skip_xp_backends")
     @array_api_compatible
     @pytest.mark.parametrize('axis', [-1, 0, 2, None])
@@ -3878,9 +3913,6 @@ def ttest_data_axis_strategy(draw):
     return data, axis


-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
-@pytest.mark.usefixtures("skip_xp_backends")
 @array_api_compatible
 class TestStudentTest:
     # Preserving original test cases.
@@ -3900,6 +3932,7 @@ class TestStudentTest:
     P1_1_l = P1_1 / 2
     P1_1_g = 1 - (P1_1 / 2)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
     def test_onesample(self, xp):
         with suppress_warnings() as sup, \
                 np.errstate(invalid="ignore", divide="ignore"):
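A pattern that recurs above is `skip_xp_backends('jax.numpy', ...)` and `skip_xp_backends('dask.array', ...)`: bail out when the backend under test cannot support the operation (JAX arrays are immutable; Dask rejects boolean index assignment). In spirit, such a mark reduces to comparing the namespace's name, e.g.:

    import numpy as np
    import pytest

    def skip_if_backend(xp, name, reason):
        # simplified stand-in for the real skip_xp_backends machinery
        if xp.__name__ == name:
            pytest.skip(reason)

    def test_item_assignment(xp=np):  # the real suite injects xp via a fixture
        skip_if_backend(xp, "jax.numpy",
                        reason="JAX arrays do not support item assignment")
        a = xp.zeros(3)
        a[0] = 1.0
        assert a[0] == 1.0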
@@ -3952,6 +3985,7 @@ def test_onesample_nan_policy(self, xp):
         assert_raises(ValueError, stats.ttest_1samp, x, 5.0,
                       nan_policy='foobar')

+    @pytest.mark.filterwarnings("ignore:divide by zero encountered in divide")
     def test_1samp_alternative(self, xp):
         message = "`alternative` must be 'less', 'greater', or 'two-sided'."
         with pytest.raises(ValueError, match=message):
@@ -3965,6 +3999,8 @@ def test_1samp_alternative(self, xp):
         xp_assert_close(p, xp.asarray(self.P1_1_g))
         xp_assert_close(t, xp.asarray(self.T1_1))

+    @pytest.mark.skip_xp_backends('jax.numpy', reason='Generic impl mutates array.')
+    @pytest.mark.usefixtures("skip_xp_backends")
     @pytest.mark.parametrize("alternative", ['two-sided', 'less', 'greater'])
     def test_1samp_ci_1d(self, xp, alternative):
         # test confidence interval method against reference values
@@ -3997,6 +4033,8 @@ def test_1samp_ci_iv(self, xp):
         with pytest.raises(ValueError, match=message):
             res.confidence_interval(confidence_level=10)

+    @pytest.mark.skip_xp_backends(np_only=True, reason='Too slow.')
+    @pytest.mark.usefixtures("skip_xp_backends")
     @pytest.mark.xslow
     @hypothesis.given(alpha=hypothesis.strategies.floats(1e-15, 1-1e-15),
                       data_axis=ttest_data_axis_strategy())
@@ -5162,9 +5200,6 @@ def _stats(x, axis=0):


 @array_api_compatible
-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
-@pytest.mark.usefixtures("skip_xp_backends")
 def test_ttest_ind(xp):
     # regression test
     tr = xp.asarray(1.0912746897927283)
@@ -5881,9 +5916,6 @@ def test_trim_bounds_error(self, trim):


 @array_api_compatible
-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
-@pytest.mark.usefixtures("skip_xp_backends")
 class Test_ttest_CI:
     # indices in order [alternative={two-sided, less, greater},
     #                   equal_var={False, True}, trim={0, 0.2}]
@@ -5930,6 +5962,8 @@ class Test_ttest_CI:
     @pytest.mark.parametrize('alternative', ['two-sided', 'less', 'greater'])
     @pytest.mark.parametrize('equal_var', [False, True])
     @pytest.mark.parametrize('trim', [0, 0.2])
+    @pytest.mark.skip_xp_backends('jax.numpy', reason='Generic impl mutates array.')
+    @pytest.mark.usefixtures("skip_xp_backends")
     def test_confidence_interval(self, alternative, equal_var, trim, xp):
         if equal_var and trim:
             pytest.xfail('Discrepancy in `main`; needs further investigation.')
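The 'Generic impl mutates array.' skips exist because in-place updates simply raise on JAX; the functional `.at[...]` API is the workaround. A minimal illustration, assuming jax is installed:

    import jax.numpy as jnp

    a = jnp.zeros(3)
    try:
        a[0] = 1.0               # TypeError: JAX arrays are immutable
    except TypeError:
        a = a.at[0].set(1.0)     # functional update returns a new array
    assert float(a[0]) == 1.0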
@@ -5978,9 +6012,6 @@ def test__broadcast_concatenate():


 @array_api_compatible
-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
-@pytest.mark.usefixtures("skip_xp_backends")
 def test_ttest_ind_with_uneq_var(xp):
     # check vs. R `t.test`, e.g.
     # options(digits=20)
@@ -6062,10 +6093,8 @@ def test_ttest_ind_with_uneq_var(xp):
     xp_assert_close(res.pvalue, pr_2D)


+@pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
 @array_api_compatible
-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
-@pytest.mark.usefixtures("skip_xp_backends")
 def test_ttest_ind_zero_division(xp):
     # test zero division problem
     x = xp.zeros(3)
@@ -6109,6 +6138,7 @@ def test_ttest_ind_nan_2nd_arg():
                     atol=1e-15)


+@pytest.mark.filterwarnings("ignore::FutureWarning")  # for dask
 @array_api_compatible
 def test_ttest_ind_empty_1d_returns_nan(xp):
     # Two empty inputs should return a TtestResult containing nan
@@ -6125,6 +6155,7 @@ def test_ttest_ind_empty_1d_returns_nan(xp):

 @skip_xp_backends('cupy', reason='cupy/cupy#8391')
+@pytest.mark.filterwarnings("ignore::FutureWarning")  # for dask
 @pytest.mark.usefixtures("skip_xp_backends")
 @array_api_compatible
 @pytest.mark.parametrize('b, expected_shape',
@@ -6187,10 +6218,8 @@ def test_gh5686(xp):
     stats.ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)


+@pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
 @array_api_compatible
-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
-@pytest.mark.usefixtures("skip_xp_backends")
 def test_ttest_ind_from_stats_inputs_zero(xp):
     # Regression test for gh-6409.
     zero = xp.asarray(0.)
@@ -6202,9 +6231,9 @@ def test_ttest_ind_from_stats_inputs_zero(xp):


 @array_api_compatible
-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
+@pytest.mark.skip_xp_backends(cpu_only=True, reason='Test uses ks_1samp')
 @pytest.mark.usefixtures("skip_xp_backends")
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
 def test_ttest_uniform_pvalues(xp):
     # test that p-values are uniformly distributed under the null hypothesis
     rng = np.random.default_rng(246834602926842)
@@ -6228,9 +6257,8 @@ def test_ttest_uniform_pvalues(xp):

     x, y = xp.asarray([2, 3, 5]), xp.asarray([1.5])
     res = stats.ttest_ind(x, y, equal_var=True)
-    rtol = 1e-6 if is_torch(xp) else 1e-10
-    xp_assert_close(res.statistic, xp.asarray(1.0394023007754), rtol=rtol)
-    xp_assert_close(res.pvalue, xp.asarray(0.407779907736), rtol=rtol)
+    xp_assert_close(res.statistic, xp.asarray(1.0394023007754))
+    xp_assert_close(res.pvalue, xp.asarray(0.407779907736))


 def _convert_pvalue_alternative(t, p, alt, xp):
@@ -6243,9 +6271,7 @@ def _convert_pvalue_alternative(t, p, alt, xp):


 @pytest.mark.slow
-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
-@pytest.mark.usefixtures("skip_xp_backends")
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
 @array_api_compatible
 def test_ttest_1samp_new(xp):
     n1, n2, n3 = (10, 15, 20)
@@ -6333,10 +6359,9 @@ def test_ttest_1samp_new_omit(xp):
     xp_assert_close(t, tr)


-@pytest.mark.skip_xp_backends(cpu_only=True,
-                              reason='Uses NumPy for pvalue, CI')
-@pytest.mark.usefixtures("skip_xp_backends")
 @array_api_compatible
+@pytest.mark.skip_xp_backends('jax.numpy', reason='Generic impl mutates array.')
+@pytest.mark.usefixtures("skip_xp_backends")
 def test_ttest_1samp_popmean_array(xp):
     # when popmean.shape[axis] != 1, raise an error
     # if the user wants to test multiple null hypotheses simultaneously,
@@ -6368,6 +6393,7 @@ def test_ttest_1samp_popmean_array(xp):

 class TestDescribe:
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask
     @array_api_compatible
     def test_describe_scalar(self, xp):
         with suppress_warnings() as sup, \
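Dropping the hand-tuned `rtol = 1e-6 if is_torch(xp) else 1e-10`, and the `4 * finfo(...).eps` factors in the next hunks, reflect the same convention: scale tolerances to the result's dtype rather than to a named backend, so float32-default backends automatically get a proportionally looser bound. Sketch of the idea:

    import numpy as np

    def dtype_rtol(dtype, factor=4):
        # tolerance proportional to the dtype's machine epsilon
        return factor * np.finfo(dtype).eps

    x = np.float64(1.0394023007754)
    np.testing.assert_allclose(x, 1.0394023007754, rtol=dtype_rtol(x.dtype))
    print(dtype_rtol(np.float32))  # ~4.8e-7, vs ~8.9e-16 for float64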
@@ -6398,8 +6424,8 @@ def test_describe_numbers(self, xp):
         assert n == nc
         xp_assert_equal(mm[0], mmc[0])
         xp_assert_equal(mm[1], mmc[1])
-        xp_assert_close(m, mc, rtol=4 * xp.finfo(m.dtype).eps)
-        xp_assert_close(v, vc, rtol=4 * xp.finfo(m.dtype).eps)
+        xp_assert_close(m, mc, rtol=4 * xp_test.finfo(m.dtype).eps)
+        xp_assert_close(v, vc, rtol=4 * xp_test.finfo(m.dtype).eps)
         xp_assert_close(sk, skc)
         xp_assert_close(kurt, kurtc)
@@ -6407,8 +6433,8 @@ def test_describe_numbers(self, xp):
         assert n == nc
         xp_assert_equal(mm[0], mmc[0])
         xp_assert_equal(mm[1], mmc[1])
-        xp_assert_close(m, mc, rtol=4 * xp.finfo(m.dtype).eps)
-        xp_assert_close(v, vc, rtol=4 * xp.finfo(m.dtype).eps)
+        xp_assert_close(m, mc, rtol=4 * xp_test.finfo(m.dtype).eps)
+        xp_assert_close(v, vc, rtol=4 * xp_test.finfo(m.dtype).eps)
         xp_assert_close(sk, skc)
         xp_assert_close(kurt, kurtc)
@@ -6968,6 +6994,7 @@ def check_equal_pmean(*args, **kwargs):

 @array_api_compatible
 class TestHMean:
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings happen
     def test_0(self, xp):
         a = [1, 0, 2]
         desired = 0
@@ -6983,11 +7010,13 @@ def test_1d(self, xp):
         desired = 4. / (1. / 1 + 1. / 2 + 1. / 3 + 1. / 4)
         check_equal_hmean(a, desired, xp=xp)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings happen
     def test_1d_with_zero(self, xp):
         a = np.array([1, 0])
         desired = 0.0
         check_equal_hmean(a, desired, xp=xp, rtol=0.0)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings happen
     @skip_xp_backends('array_api_strict',
                       reason=("`array_api_strict.where` `fillvalue` doesn't "
                               "accept Python scalars. See data-apis/array-api#807."))
@@ -7012,6 +7041,7 @@ def test_2d_axis0(self, xp):
         desired = np.array([22.88135593, 39.13043478, 52.90076336, 65.45454545])
         check_equal_hmean(a, desired, axis=0, xp=xp)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings happen
     def test_2d_axis0_with_zero(self, xp):
         a = [[10, 0, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]]
         desired = np.array([22.88135593, 0.0, 52.90076336, 65.45454545])
@@ -7023,6 +7053,7 @@ def test_2d_axis1(self, xp):
         desired = np.array([19.2, 63.03939962, 103.80078637])
         check_equal_hmean(a, desired, axis=1, xp=xp)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings happen
     def test_2d_axis1_with_zero(self, xp):
         a = [[10, 0, 30, 40], [50, 60, 70, 80], [90, 100, 110, 120]]
         desired = np.array([0.0, 63.03939962, 103.80078637])
@@ -7083,6 +7114,7 @@ def test_weights_masked_1d_array(self, xp):

 @array_api_compatible
 class TestGMean:
+    @pytest.mark.filterwarnings("ignore:divide by zero encountered in log")  # for dask
     def test_0(self, xp):
         a = [1, 0, 2]
         desired = 0
@@ -7135,6 +7167,7 @@ def test_large_values(self, xp):
         desired = 1e200
         check_equal_gmean(a, desired, rtol=1e-13, xp=xp)

+    @pytest.mark.filterwarnings("ignore:divide by zero encountered in log")  # for dask
     def test_1d_with_0(self, xp):
         # Test a 1d case with zero element
         a = [10, 20, 30, 40, 50, 60, 70, 80, 90, 0]
@@ -7142,6 +7175,7 @@ def test_1d_with_0(self, xp):
         with np.errstate(all='ignore'):
             check_equal_gmean(a, desired, xp=xp)

+    @pytest.mark.filterwarnings("ignore:invalid value encountered in log")  # for dask
     def test_1d_neg(self, xp):
         # Test a 1d case with negative element
         a = [10, 20, 30, 40, 50, 60, 70, 80, 90, -1]
@@ -7215,6 +7249,7 @@ def test_1d(self, xp):
         desired = np.sqrt((1**2 + 2**2 + 3**2 + 4**2) / 4)
         check_equal_pmean(a, p, desired, xp=xp)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
     def test_1d_with_zero(self, xp):
         a, p = np.array([1, 0]), -1
         desired = 0.0
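The `divide by zero encountered in log` filters on the geometric-mean tests arise because gmean is exp(mean(log(x))): a single zero element sends `log` to -inf, which most backends report as a RuntimeWarning even though the final result is an exact 0. Worked out:

    import numpy as np

    x = np.array([10., 20., 30., 0.])
    with np.errstate(divide="ignore"):        # log(0) -> -inf with a warning
        gmean = np.exp(np.mean(np.log(x)))
    assert gmean == 0.0                       # exp(-inf) == 0.0 exactly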
@@ -8231,6 +8266,8 @@ def test_weighted_stouffer(self, xp, weights, expected_statistic, expected_pvalu

     methods = ["fisher", "pearson", "tippett", "stouffer", "mudholkar_george"]

+    @skip_xp_backends('dask.array', reason='no sorting in Dask',
+                      cpu_only=True, exceptions=['cupy', 'jax.numpy'])
     @pytest.mark.parametrize("variant", ["single", "all", "random"])
     @pytest.mark.parametrize("method", methods)
     def test_monotonicity(self, variant, method, xp):
@@ -9459,6 +9496,7 @@ def test_non_broadcastable(self, xp):
         with pytest.raises(ValueError, match=message):
             _xp_mean(x, weights=w)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
     def test_special_cases(self, xp):
         # weights sum to zero
         weights = xp.asarray([-1., 0., 1.])
@@ -9472,6 +9510,7 @@ def test_special_cases(self, xp):
         res = _xp_mean(xp.asarray([1., 1., 2.]), weights=weights)
         xp_assert_close(res, xp.asarray(np.inf))

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask, multiple warnings
     def test_nan_policy(self, xp):
         x = xp.arange(10.)
         mask = (x == 3)
@@ -9525,8 +9564,10 @@ def test_empty(self, xp):
         ref = xp.asarray([])
         xp_assert_equal(res, ref)

+    @pytest.mark.filterwarnings("ignore:overflow encountered in reduce")  # for dask
     def test_dtype(self, xp):
-        max = xp.finfo(xp.float32).max
+        xp_test = array_namespace(xp.asarray(1))
+        max = xp_test.finfo(xp.float32).max
         x_np = np.asarray([max, max], dtype=np.float32)
         x_xp = xp.asarray(x_np)
@@ -9553,7 +9594,9 @@ def test_integer(self, xp):

 @array_api_compatible
 @pytest.mark.usefixtures("skip_xp_backends")
 @skip_xp_backends('jax.numpy', reason='JAX arrays do not support item assignment')
+@skip_xp_backends('dask.array', reason='boolean index assignment')
 class TestXP_Var:
+
     @pytest.mark.parametrize('axis', [None, 1, -1, (-2, 2)])
     @pytest.mark.parametrize('keepdims', [False, True])
     @pytest.mark.parametrize('correction', [0, 1])
@@ -9632,7 +9675,8 @@ def test_empty(self, xp):
         xp_assert_equal(res, ref)

     def test_dtype(self, xp):
-        max = xp.finfo(xp.float32).max
+        xp_test = array_namespace(xp.asarray(1))  # dask.array needs finfo
+        max = xp_test.finfo(xp.float32).max
         x_np = np.asarray([max, max/2], dtype=np.float32)
         x_xp = xp.asarray(x_np)
diff --git a/scipy/stats/tests/test_variation.py b/scipy/stats/tests/test_variation.py
index 789f393e28c3..4ba5cc3a8fa7 100644
--- a/scipy/stats/tests/test_variation.py
+++ b/scipy/stats/tests/test_variation.py
@@ -32,9 +32,10 @@ def test_sign(self, sgn, xp):
         expected = xp.asarray(sgn*math.sqrt(2)/3)
         xp_assert_close(v, expected, rtol=1e-10)

-    def test_scalar(self, xp):
+    @skip_xp_backends(np_only=True, reason="test plain python scalar input")
+    def test_scalar(self):
         # A scalar is treated like a 1-d sequence with length 1.
-        xp_assert_equal(variation(4.0), 0.0)
+        assert variation(4.0) == 0.0

     @pytest.mark.parametrize('nan_policy, expected',
                              [('propagate', np.nan),
@@ -113,6 +114,7 @@ def test_bad_axis(self, xp):
         with pytest.raises((AxisError, IndexError)):
             variation(x, axis=10)

+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")  # for dask
     def test_mean_zero(self, xp):
         # Check that `variation` returns inf for a sequence that is not
         # identically zero but whose mean is zero.
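Likewise for `variation`: the coefficient of variation is std(x)/mean(x), so a zero-mean sample divides by zero. The result is a legitimate inf, but NumPy and Dask both warn on the way, hence the filter above. For instance:

    import numpy as np

    x = np.array([-1.0, 1.0])                 # mean is exactly zero
    with np.errstate(divide="ignore"):
        cv = np.std(x) / np.mean(x)
    assert np.isinf(cv)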
@@ -124,6 +126,7 @@ def test_mean_zero(self, xp):
         y2 = variation(x2, axis=1)
         xp_assert_equal(y2, xp.asarray([xp.inf, xp.inf]))

+    @pytest.mark.filterwarnings("ignore:invalid value encountered")  # for dask
     @pytest.mark.parametrize('x', [[0.]*5, [1, 2, np.inf, 9]])
     def test_return_nan(self, x, xp):
         x = xp.asarray(x)
@@ -131,6 +134,7 @@ def test_return_nan(self, x, xp):
         y = variation(x)
         xp_assert_equal(y, xp.asarray(xp.nan, dtype=x.dtype))

+    @pytest.mark.filterwarnings("ignore::FutureWarning")  # for dask
     @pytest.mark.parametrize('axis, expected',
                              [(0, []), (1, [np.nan]*3), (None, np.nan)])
     def test_2d_size_zero_with_axis(self, axis, expected, xp):