diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 453925c3f..9c7935911 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1 +1,2 @@
-style = "sciml"
\ No newline at end of file
+style = "sciml"
+format_markdown = true
\ No newline at end of file
diff --git a/LICENSE.md b/LICENSE.md
index fe0aec9e5..0a2bc6bad 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -19,4 +19,3 @@ The SciMLSensitivity.jl package is licensed under the MIT "Expat" License:
 > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 > SOFTWARE.
-> 
diff --git a/README.md b/README.md
index e1a738f9b..8f05a433d 100644
--- a/README.md
+++ b/README.md
@@ -7,17 +7,16 @@
 [![Build Status](https://github.com/SciML/SciMLSensitivity.jl/workflows/CI/badge.svg)](https://github.com/SciML/SciMLSensitivity.jl/actions?query=workflow%3ACI)
 [![Build status](https://badge.buildkite.com/e0ee4d9d914eb44a43c291d78c53047eeff95e7edb7881b6f7.svg)](https://buildkite.com/julialang/scimlsensitivity-dot-jl)
 
-[![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor's%20Guide-blueviolet)](https://github.com/SciML/ColPrac)
+[![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor%27s%20Guide-blueviolet)](https://github.com/SciML/ColPrac)
 [![SciML Code Style](https://img.shields.io/static/v1?label=code%20style&message=SciML&color=9558b2&labelColor=389826)](https://github.com/SciML/SciMLStyle)
 
-SciMLSensitivity.jl is a component package in the [SciML Scientific Machine Learning ecosystem](https://sciml.ai/). 
+SciMLSensitivity.jl is a component package in the [SciML Scientific Machine Learning ecosystem](https://sciml.ai/).
 It holds the sensitivity analysis utilities. Users interested in using this
 functionality should check out [DifferentialEquations.jl](https://docs.sciml.ai/DiffEqDocs/stable/).
 
-
 ## Tutorials and Documentation
 
 For information on using the package,
 [see the stable documentation](https://docs.sciml.ai/SciMLSensitivity/stable/). Use the
 [in-development documentation](https://docs.sciml.ai/SciMLSensitivity/dev/) for the version of
-the documentation, which contains the unreleased features.
\ No newline at end of file
+the documentation, which contains the unreleased features.
diff --git a/docs/src/Benchmark.md b/docs/src/Benchmark.md
index a78b675b3..c7f17e58e 100644
--- a/docs/src/Benchmark.md
+++ b/docs/src/Benchmark.md
@@ -32,8 +32,8 @@ at this time.
 
 Quick summary:
 
-- `BacksolveAdjoint` can be the fastest (but use with caution!); about 25% faster
-- Using `ZygoteVJP` is faster than other vjp choices with FastDense due to the overloads
+  - `BacksolveAdjoint` can be the fastest (but use with caution!); about 25% faster
+  - Using `ZygoteVJP` is faster than other vjp choices with FastDense due to the overloads
 
 ```julia
 using DiffEqFlux, OrdinaryDiffEq, Flux, Optim, Plots, SciMLSensitivity,
@@ -46,13 +46,13 @@ tsteps = range(tspan[1], tspan[2], length = datasize)
 
 function trueODEfunc(du, u, p, t)
     true_A = [-0.1 2.0; -2.0 -0.1]
-    du .= ((u.^3)'true_A)'
+    du .= ((u .^ 3)'true_A)'
 end
 
 prob_trueode = ODEProblem(trueODEfunc, u0, tspan)
 ode_data = Array(solve(prob_trueode, Tsit5(), saveat = tsteps))
 
-dudt2 = FastChain((x, p) -> x.^3,
+dudt2 = FastChain((x, p) -> x .^ 3,
                   FastDense(2, 50, tanh),
                   FastDense(50, 2))
 Random.seed!(100)
@@ -66,10 +66,11 @@ function loss_neuralode(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode,p)
+@btime Zygote.gradient(loss_neuralode, p)
 # 2.709 ms (56506 allocations: 6.62 MiB)
 
-prob_neuralode_interpolating = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps, sensealg=InterpolatingAdjoint(autojacvec=ReverseDiffVJP(true)))
+prob_neuralode_interpolating = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps,
+                                         sensealg = InterpolatingAdjoint(autojacvec = ReverseDiffVJP(true)))
 
 function loss_neuralode_interpolating(p)
     pred = Array(prob_neuralode_interpolating(u0, p))
@@ -77,10 +78,11 @@ function loss_neuralode_interpolating(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode_interpolating,p)
+@btime Zygote.gradient(loss_neuralode_interpolating, p)
 # 5.501 ms (103835 allocations: 2.57 MiB)
 
-prob_neuralode_interpolating_zygote = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps, sensealg=InterpolatingAdjoint(autojacvec=ZygoteVJP()))
+prob_neuralode_interpolating_zygote = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps,
+                                                sensealg = InterpolatingAdjoint(autojacvec = ZygoteVJP()))
 
 function loss_neuralode_interpolating_zygote(p)
     pred = Array(prob_neuralode_interpolating_zygote(u0, p))
@@ -88,10 +90,11 @@ function loss_neuralode_interpolating_zygote(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode_interpolating_zygote,p)
+@btime Zygote.gradient(loss_neuralode_interpolating_zygote, p)
 # 2.899 ms (56150 allocations: 6.61 MiB)
 
-prob_neuralode_backsolve = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps, sensealg=BacksolveAdjoint(autojacvec=ReverseDiffVJP(true)))
+prob_neuralode_backsolve = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps,
+                                     sensealg = BacksolveAdjoint(autojacvec = ReverseDiffVJP(true)))
 
 function loss_neuralode_backsolve(p)
     pred = Array(prob_neuralode_backsolve(u0, p))
@@ -99,10 +102,11 @@ function loss_neuralode_backsolve(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode_backsolve,p)
+@btime Zygote.gradient(loss_neuralode_backsolve, p)
 # 4.871 ms (85855 allocations: 2.20 MiB)
 
-prob_neuralode_quad = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps, sensealg=QuadratureAdjoint(autojacvec=ReverseDiffVJP(true)))
+prob_neuralode_quad = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps,
+                                sensealg = QuadratureAdjoint(autojacvec = ReverseDiffVJP(true)))
 
 function loss_neuralode_quad(p)
     pred = Array(prob_neuralode_quad(u0, p))
@@ -110,10 +114,11 @@ function loss_neuralode_quad(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode_quad,p)
+@btime Zygote.gradient(loss_neuralode_quad, p)
 # 11.748 ms (79549 allocations: 3.87 MiB)
 
-prob_neuralode_backsolve_tracker = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps, sensealg=BacksolveAdjoint(autojacvec=TrackerVJP()))
+prob_neuralode_backsolve_tracker = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps,
+                                             sensealg = BacksolveAdjoint(autojacvec = TrackerVJP()))
 
 function loss_neuralode_backsolve_tracker(p)
     pred = Array(prob_neuralode_backsolve_tracker(u0, p))
@@ -121,10 +126,11 @@ function loss_neuralode_backsolve_tracker(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode_backsolve_tracker,p)
+@btime Zygote.gradient(loss_neuralode_backsolve_tracker, p)
 # 27.604 ms (186143 allocations: 12.22 MiB)
 
-prob_neuralode_backsolve_zygote = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps, sensealg=BacksolveAdjoint(autojacvec=ZygoteVJP()))
+prob_neuralode_backsolve_zygote = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps,
+                                            sensealg = BacksolveAdjoint(autojacvec = ZygoteVJP()))
 
 function loss_neuralode_backsolve_zygote(p)
     pred = Array(prob_neuralode_backsolve_zygote(u0, p))
@@ -132,10 +138,11 @@ function loss_neuralode_backsolve_zygote(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode_backsolve_zygote,p)
+@btime Zygote.gradient(loss_neuralode_backsolve_zygote, p)
 # 2.091 ms (49883 allocations: 6.28 MiB)
 
-prob_neuralode_backsolve_false = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps, sensealg=BacksolveAdjoint(autojacvec=ReverseDiffVJP(false)))
+prob_neuralode_backsolve_false = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps,
+                                           sensealg = BacksolveAdjoint(autojacvec = ReverseDiffVJP(false)))
 
 function loss_neuralode_backsolve_false(p)
     pred = Array(prob_neuralode_backsolve_false(u0, p))
@@ -143,10 +150,11 @@ function loss_neuralode_backsolve_false(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode_backsolve_false,p)
+@btime Zygote.gradient(loss_neuralode_backsolve_false, p)
 # 4.822 ms (9956 allocations: 1.03 MiB)
 
-prob_neuralode_tracker = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps, sensealg=TrackerAdjoint())
+prob_neuralode_tracker = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps,
+                                   sensealg = TrackerAdjoint())
 
 function loss_neuralode_tracker(p)
     pred = Array(prob_neuralode_tracker(u0, p))
@@ -154,6 +162,6 @@ function loss_neuralode_tracker(p)
     return loss
 end
 
-@btime Zygote.gradient(loss_neuralode_tracker,p)
+@btime Zygote.gradient(loss_neuralode_tracker, p)
 # 12.614 ms (76346 allocations: 3.12 MiB)
 ```
diff --git a/docs/src/examples/dae/physical_constraints.md b/docs/src/examples/dae/physical_constraints.md
index 48a19ce22..ecf20fe65 100644
--- a/docs/src/examples/dae/physical_constraints.md
+++ b/docs/src/examples/dae/physical_constraints.md
@@ -17,18 +17,18 @@ rng = Random.default_rng()
 function f!(du, u, p, t)
     y₁, y₂, y₃ = u
     k₁, k₂, k₃ = p
-    du[1] = -k₁*y₁ + k₃*y₂*y₃
-    du[2] =  k₁*y₁ - k₃*y₂*y₃ - k₂*y₂^2
-    du[3] =  y₁ + y₂ + y₃ - 1
+    du[1] = -k₁ * y₁ + k₃ * y₂ * y₃
+    du[2] = k₁ * y₁ - k₃ * y₂ * y₃ - k₂ * y₂^2
+    du[3] = y₁ + y₂ + y₃ - 1
     return nothing
 end
 
 u₀ = [1.0, 0, 0]
-M = [1. 0  0
-     0  1. 0
-     0  0  0]
+M = [1.0 0 0
+     0 1.0 0
+     0 0 0]
 
-tspan = (0.0,1.0)
+tspan = (0.0, 1.0)
 p = [0.04, 3e7, 1e4]
 
 stiff_func = ODEFunction(f!, mass_matrix = M)
@@ -36,12 +36,12 @@ prob_stiff = ODEProblem(stiff_func, u₀, tspan, p)
 sol_stiff = solve(prob_stiff, Rodas5(), saveat = 0.1)
 
 nn_dudt2 = Lux.Chain(Lux.Dense(3, 64, tanh),
-                 Lux.Dense(64, 2))
+                     Lux.Dense(64, 2))
 
 pinit, st = Lux.setup(rng, nn_dudt2)
 
 model_stiff_ndae = NeuralODEMM(nn_dudt2, (u, p, t) -> [u[1] + u[2] + u[3] - 1],
-                               tspan, M, Rodas5(autodiff=false), saveat = 0.1)
+                               tspan, M, Rodas5(autodiff = false), saveat = 0.1)
 model_stiff_ndae(u₀, Lux.ComponentArray(pinit), st)
 
 function predict_stiff_ndae(p)
@@ -62,12 +62,11 @@ end
 l1 = first(loss_stiff_ndae(Lux.ComponentArray(pinit)))
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p) -> loss_stiff_ndae(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_stiff_ndae(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, Lux.ComponentArray(pinit))
-result_stiff = Optimization.solve(optprob, NLopt.LD_LBFGS(), maxiters=100)
+result_stiff = Optimization.solve(optprob, NLopt.LD_LBFGS(), maxiters = 100)
 ```
 
-
 ## Step-by-Step Description
 
 ### Load Packages
@@ -88,9 +87,9 @@ fitting difficult.
 function f!(du, u, p, t)
     y₁, y₂, y₃ = u
     k₁, k₂, k₃ = p
-    du[1] = -k₁*y₁ + k₃*y₂*y₃
-    du[2] =  k₁*y₁ - k₃*y₂*y₃ - k₂*y₂^2
-    du[3] =  y₁ + y₂ + y₃ - 1
+    du[1] = -k₁ * y₁ + k₃ * y₂ * y₃
+    du[2] = k₁ * y₁ - k₃ * y₂ * y₃ - k₂ * y₂^2
+    du[3] = y₁ + y₂ + y₃ - 1
     return nothing
 end
 ```
@@ -100,21 +99,20 @@ end
 ```@example dae2
 u₀ = [1.0, 0, 0]
 
-M = [1. 0  0
-     0  1. 0
-     0  0  0]
+M = [1.0 0 0
+     0 1.0 0
+     0 0 0]
 
-tspan = (0.0,1.0)
+tspan = (0.0, 1.0)
 
 p = [0.04, 3e7, 1e4]
 ```
 
-- `u₀` = Initial Conditions
-- `M` = Semi-explicit Mass Matrix (last row is the constraint equation and are therefore
-all zeros)
-- `tspan` = Time span over which to evaluate
-- `p` = parameters `k1`, `k2` and `k3` of the differential equation above
-
+  - `u₀` = Initial Conditions
+  - `M` = Semi-explicit Mass Matrix (last row is the constraint equation and is therefore
+    all zeros)
+  - `tspan` = Time span over which to evaluate
+  - `p` = parameters `k1`, `k2` and `k3` of the differential equation above
 
 ### ODE Function, Problem and Solution
 
@@ -138,12 +136,12 @@ is more suited to SciML applications (similarly for
 
 ```@example dae2
 nn_dudt2 = Lux.Chain(Lux.Dense(3, 64, tanh),
-                 Lux.Dense(64, 2))
+                     Lux.Dense(64, 2))
 
 pinit, st = Lux.setup(rng, nn_dudt2)
 
 model_stiff_ndae = NeuralODEMM(nn_dudt2, (u, p, t) -> [u[1] + u[2] + u[3] - 1],
-                               tspan, M, Rodas5(autodiff=false), saveat = 0.1)
+                               tspan, M, Rodas5(autodiff = false), saveat = 0.1)
 model_stiff_ndae(u₀, Lux.ComponentArray(pinit), st)
 ```
 
@@ -195,8 +193,8 @@ The callback function displays the loss during training.
 
 ```@example dae2
 callback = function (p, l, pred) #callback function to observe training
-  display(l)
-  return false
+    display(l)
+    return false
 end
 ```
 
@@ -207,7 +205,7 @@ Finally, training with `Optimization.solve` by passing: *loss function*, *model
 
 ```@example dae2
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p) -> loss_stiff_ndae(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_stiff_ndae(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, Lux.ComponentArray(pinit))
-result_stiff = Optimization.solve(optprob, NLopt.LD_LBFGS(), maxiters=100)
+result_stiff = Optimization.solve(optprob, NLopt.LD_LBFGS(), maxiters = 100)
 ```
diff --git a/docs/src/examples/dde/delay_diffeq.md b/docs/src/examples/dde/delay_diffeq.md
index 7d1d4c345..ab45413fa 100644
--- a/docs/src/examples/dde/delay_diffeq.md
+++ b/docs/src/examples/dde/delay_diffeq.md
@@ -8,13 +8,12 @@ like:
 using DifferentialEquations, Optimization, SciMLSensitivity,
       OptimizationPolyalgorithms
 
-
 # Define the same LV equation, but including a delay parameter
 function delay_lotka_volterra!(du, u, h, p, t)
-  x, y = u
-  α, β, δ, γ = p
-  du[1] = dx = (α   - β*y) * h(p, t-0.1)[1]
-  du[2] = dy = (δ*x - γ)   * y
+    x, y = u
+    α, β, δ, γ = p
+    du[1] = dx = (α - β * y) * h(p, t - 0.1)[1]
+    du[2] = dy = (δ * x - γ) * y
 end
 
 # Initial parameters
@@ -32,26 +31,28 @@ prob_dde = DDEProblem(delay_lotka_volterra!, u0, h, (0.0, 10.0),
                       constant_lags = [0.1])
 
 function predict_dde(p)
-  return Array(solve(prob_dde, MethodOfSteps(Tsit5()),
-                              u0=u0, p=p, saveat = 0.1,
-                              sensealg = ReverseDiffAdjoint()))
+    return Array(solve(prob_dde, MethodOfSteps(Tsit5()),
+                       u0 = u0, p = p, saveat = 0.1,
+                       sensealg = ReverseDiffAdjoint()))
 end
 
-loss_dde(p) = sum(abs2, x-1 for x in predict_dde(p))
+loss_dde(p) = sum(abs2, x - 1 for x in predict_dde(p))
 
 using Plots
-callback = function (p,l...;doplot=false)
-  display(loss_dde(p))
-  doplot && display(plot(solve(remake(prob_dde,p=p),MethodOfSteps(Tsit5()),saveat=0.1),ylim=(0,6)))
-  return false
+callback = function (p, l...; doplot = false)
+    display(loss_dde(p))
+    doplot &&
+        display(plot(solve(remake(prob_dde, p = p), MethodOfSteps(Tsit5()), saveat = 0.1),
+                     ylim = (0, 6)))
+    return false
 end
 
-callback(p,loss_dde(p)...)
+callback(p, loss_dde(p)...)
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_dde(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_dde(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, p)
-result_dde = Optimization.solve(optprob, PolyOpt(), maxiters = 300, callback=callback)
+result_dde = Optimization.solve(optprob, PolyOpt(), maxiters = 300, callback = callback)
 ```
 
 Notice that we chose `sensealg = ReverseDiffAdjoint()` to utilize the ReverseDiff.jl
@@ -61,20 +62,22 @@ We define a callback to display the solution at the current parameters for each
 
 ```@example dde
 using Plots
-callback = function (p,l...;doplot=false)
-  display(loss_dde(p))
-  doplot && display(plot(solve(remake(prob_dde,p=p),MethodOfSteps(Tsit5()),saveat=0.1),ylim=(0,6)))
-  return false
+callback = function (p, l...; doplot = false)
+    display(loss_dde(p))
+    doplot &&
+        display(plot(solve(remake(prob_dde, p = p), MethodOfSteps(Tsit5()), saveat = 0.1),
+                     ylim = (0, 6)))
+    return false
 end
 
-callback(p,loss_dde(p)...)
+callback(p, loss_dde(p)...)
 ```
 
 We use `Optimization.solve` to optimize the parameters for our loss function:
 
 ```@example dde
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_dde(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_dde(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, p)
-result_dde = Optimization.solve(optprob, PolyOpt(), callback=callback)
+result_dde = Optimization.solve(optprob, PolyOpt(), callback = callback)
 ```
diff --git a/docs/src/examples/hybrid_jump/bouncing_ball.md b/docs/src/examples/hybrid_jump/bouncing_ball.md
index 61fe1141d..900fbbab5 100644
--- a/docs/src/examples/hybrid_jump/bouncing_ball.md
+++ b/docs/src/examples/hybrid_jump/bouncing_ball.md
@@ -10,25 +10,25 @@ first start by implementing the ODE:
 ```@example bouncing_ball
 using Optimization, OptimizationPolyalgorithms, SciMLSensitivity, DifferentialEquations
 
-function f(du,u,p,t)
-  du[1] = u[2]
-  du[2] = -p[1]
+function f(du, u, p, t)
+    du[1] = u[2]
+    du[2] = -p[1]
 end
 
-function condition(u,t,integrator) # Event when event_f(u,t) == 0
-  u[1]
+function condition(u, t, integrator) # Event when event_f(u,t) == 0
+    u[1]
 end
 
 function affect!(integrator)
-  integrator.u[2] = -integrator.p[2]*integrator.u[2]
+    integrator.u[2] = -integrator.p[2] * integrator.u[2]
 end
 
-callback = ContinuousCallback(condition,affect!)
-u0 = [50.0,0.0]
-tspan = (0.0,15.0)
+callback = ContinuousCallback(condition, affect!)
+u0 = [50.0, 0.0]
+tspan = (0.0, 15.0)
 p = [9.8, 0.8]
-prob = ODEProblem(f,u0,tspan,p)
-sol = solve(prob,Tsit5(),callback=callback)
+prob = ODEProblem(f, u0, tspan, p)
+sol = solve(prob, Tsit5(), callback = callback)
 ```
 
 Here we have a friction coefficient of `0.8`. We want to refine this
@@ -38,14 +38,14 @@ the value 20:
 
 ```@example bouncing_ball
 function loss(θ)
-  sol = solve(prob,Tsit5(),p=[9.8,θ[1]],callback=callback)
-  target = 20.0
-  abs2(sol[end][1] - target)
+    sol = solve(prob, Tsit5(), p = [9.8, θ[1]], callback = callback)
+    target = 20.0
+    abs2(sol[end][1] - target)
 end
 
 loss([0.8])
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, [0.8])
 @time res = Optimization.solve(optprob, PolyOpt(), maxiters = 300)
 @show res.u # [0.866554105436901]
diff --git a/docs/src/examples/hybrid_jump/hybrid_diffeq.md b/docs/src/examples/hybrid_jump/hybrid_diffeq.md
index 52e57a200..969631a52 100644
--- a/docs/src/examples/hybrid_jump/hybrid_diffeq.md
+++ b/docs/src/examples/hybrid_jump/hybrid_diffeq.md
@@ -1,7 +1,7 @@
 # Training Neural Networks in Hybrid Differential Equations
 
 Hybrid differential equations are differential equations with implicit or
-explicit discontinuities as specified by 
+explicit discontinuities as specified by
 [callbacks](https://docs.sciml.ai/DiffEqDocs/stable/features/callback_functions/).
 In the following example, explicit dosing times are given for a pharmacometric
 model and the universal differential equation is trained to uncover the missing
@@ -9,57 +9,58 @@ dynamical equations.
 
 ```@example
 using DiffEqFlux, Flux, DifferentialEquations, Plots
-u0 = Float32[2.; 0.]
+u0 = Float32[2.0; 0.0]
 datasize = 100
-tspan = (0.0f0,10.5f0)
-dosetimes = [1.0,2.0,4.0,8.0]
+tspan = (0.0f0, 10.5f0)
+dosetimes = [1.0, 2.0, 4.0, 8.0]
 
 function affect!(integrator)
-    integrator.u = integrator.u.+1
+    integrator.u = integrator.u .+ 1
 end
-cb_ = PresetTimeCallback(dosetimes,affect!,save_positions=(false,false))
-function trueODEfunc(du,u,p,t)
+cb_ = PresetTimeCallback(dosetimes, affect!, save_positions = (false, false))
+function trueODEfunc(du, u, p, t)
     du .= -u
 end
-t = range(tspan[1],tspan[2],length=datasize)
+t = range(tspan[1], tspan[2], length = datasize)
 
-prob = ODEProblem(trueODEfunc,u0,tspan)
-ode_data = Array(solve(prob,Tsit5(),callback=cb_,saveat=t))
-dudt2 = Flux.Chain(Flux.Dense(2,50,tanh),
-             Flux.Dense(50,2))
-p,re = Flux.destructure(dudt2) # use this p as the initial condition!
+prob = ODEProblem(trueODEfunc, u0, tspan)
+ode_data = Array(solve(prob, Tsit5(), callback = cb_, saveat = t))
+dudt2 = Flux.Chain(Flux.Dense(2, 50, tanh),
+                   Flux.Dense(50, 2))
+p, re = Flux.destructure(dudt2) # use this p as the initial condition!
 
-function dudt(du,u,p,t)
+function dudt(du, u, p, t)
     du[1:2] .= -u[1:2]
     du[3:end] .= re(p)(u[1:2]) #re(p)(u[3:end])
 end
-z0 = Float32[u0;u0]
-prob = ODEProblem(dudt,z0,tspan)
+z0 = Float32[u0; u0]
+prob = ODEProblem(dudt, z0, tspan)
 
 affect!(integrator) = integrator.u[1:2] .= integrator.u[3:end]
-callback = PresetTimeCallback(dosetimes,affect!,save_positions=(false,false))
+callback = PresetTimeCallback(dosetimes, affect!, save_positions = (false, false))
 
 function predict_n_ode()
-    _prob = remake(prob,p=p)
-    Array(solve(_prob,Tsit5(),u0=z0,p=p,callback=callback,saveat=t,sensealg=ReverseDiffAdjoint()))[1:2,:]
+    _prob = remake(prob, p = p)
+    Array(solve(_prob, Tsit5(), u0 = z0, p = p, callback = callback, saveat = t,
+                sensealg = ReverseDiffAdjoint()))[1:2, :]
     #Array(solve(prob,Tsit5(),u0=z0,p=p,saveat=t))[1:2,:]
 end
 
 function loss_n_ode()
     pred = predict_n_ode()
-    loss = sum(abs2,ode_data .- pred)
+    loss = sum(abs2, ode_data .- pred)
     loss
 end
 loss_n_ode() # n_ode.p stores the initial parameters of the neural ODE
 
-cba = function (;doplot=false) #callback function to observe training
-  pred = predict_n_ode()
-  display(sum(abs2,ode_data .- pred))
-  # plot current prediction against data
-  pl = scatter(t,ode_data[1,:],label="data")
-  scatter!(pl,t,pred[1,:],label="prediction")
-  display(plot(pl))
-  return false
+cba = function (; doplot = false) #callback function to observe training
+    pred = predict_n_ode()
+    display(sum(abs2, ode_data .- pred))
+    # plot current prediction against data
+    pl = scatter(t, ode_data[1, :], label = "data")
+    scatter!(pl, t, pred[1, :], label = "prediction")
+    display(plot(pl))
+    return false
 end
 cba()
 
diff --git a/docs/src/examples/neural_ode/minibatch.md b/docs/src/examples/neural_ode/minibatch.md
index 982583d7b..876462df9 100644
--- a/docs/src/examples/neural_ode/minibatch.md
+++ b/docs/src/examples/neural_ode/minibatch.md
@@ -2,32 +2,31 @@
 
 ```@example
 using DifferentialEquations, Flux, Random, Plots
-using IterTools: ncycle 
+using IterTools: ncycle
 
 rng = Random.default_rng()
 
 function newtons_cooling(du, u, p, t)
     temp = u[1]
     k, temp_m = p
-    du[1] = dT = -k*(temp-temp_m) 
-  end
+    du[1] = dT = -k * (temp - temp_m)
+end
 
 function true_sol(du, u, p, t)
-    true_p = [log(2)/8.0, 100.0]
+    true_p = [log(2) / 8.0, 100.0]
     newtons_cooling(du, u, true_p, t)
 end
 
-
-ann = Chain(Dense(1,8,tanh), Dense(8,1,tanh))
+ann = Chain(Dense(1, 8, tanh), Dense(8, 1, tanh))
 θ, re = Flux.destructure(ann)
 
-function dudt_(u,p,t)           
-    re(p)(u)[1].* u
+function dudt_(u, p, t)
+    re(p)(u)[1] .* u
 end
 
 function predict_adjoint(time_batch)
-    _prob = remake(prob,u0=u0,p=θ)
-    Array(solve(_prob, Tsit5(), saveat = time_batch)) 
+    _prob = remake(prob, u0 = u0, p = θ)
+    Array(solve(_prob, Tsit5(), saveat = time_batch))
 end
 
 function loss_adjoint(batch, time_batch)
@@ -35,14 +34,13 @@ function loss_adjoint(batch, time_batch)
     sum(abs2, batch - pred)#, pred
 end
 
-
 u0 = Float32[200.0]
 datasize = 30
 tspan = (0.0f0, 3.0f0)
 
-t = range(tspan[1], tspan[2], length=datasize)
+t = range(tspan[1], tspan[2], length = datasize)
 true_prob = ODEProblem(true_sol, u0, tspan)
-ode_data = Array(solve(true_prob, Tsit5(), saveat=t))
+ode_data = Array(solve(true_prob, Tsit5(), saveat = t))
 
 prob = ODEProblem{false}(dudt_, u0, tspan, θ)
 
@@ -55,77 +53,81 @@ for (x, y) in train_loader
 end
 
 numEpochs = 300
-losses=[]
-cb() = begin
-    l=loss_adjoint(ode_data, t)
-    push!(losses, l)
-    @show l
-    pred=predict_adjoint(t)
-    pl = scatter(t,ode_data[1,:],label="data", color=:black, ylim=(150,200))
-    scatter!(pl,t,pred[1,:],label="prediction", color=:darkgreen)
-    display(plot(pl))
-    false
-end 
-
-opt=ADAM(0.05)
-Flux.train!(loss_adjoint, Flux.params(θ), ncycle(train_loader,numEpochs), opt, cb=Flux.throttle(cb, 10))
+losses = []
+function cb()
+    begin
+        l = loss_adjoint(ode_data, t)
+        push!(losses, l)
+        @show l
+        pred = predict_adjoint(t)
+        pl = scatter(t, ode_data[1, :], label = "data", color = :black, ylim = (150, 200))
+        scatter!(pl, t, pred[1, :], label = "prediction", color = :darkgreen)
+        display(plot(pl))
+        false
+    end
+end
+
+opt = ADAM(0.05)
+Flux.train!(loss_adjoint, Flux.params(θ), ncycle(train_loader, numEpochs), opt,
+            cb = Flux.throttle(cb, 10))
 
 #Now lets see how well it generalizes to new initial conditions 
 
-starting_temp=collect(10:30:250)
-true_prob_func(u0)=ODEProblem(true_sol, [u0], tspan)
-color_cycle=palette(:tab10)
-pl=plot()
-for (j,temp) in enumerate(starting_temp)
-    ode_test_sol = solve(ODEProblem(true_sol, [temp], (0.0f0,10.0f0)), Tsit5(), saveat=0.0:0.5:10.0)
-    ode_nn_sol = solve(ODEProblem{false}(dudt_, [temp], (0.0f0,10.0f0), θ))
-    scatter!(pl, ode_test_sol, var=(0,1), label="", color=color_cycle[j])
-    plot!(pl, ode_nn_sol, var=(0,1), label="", color=color_cycle[j], lw=2.0)
-end
-display(pl) 
+starting_temp = collect(10:30:250)
+true_prob_func(u0) = ODEProblem(true_sol, [u0], tspan)
+color_cycle = palette(:tab10)
+pl = plot()
+for (j, temp) in enumerate(starting_temp)
+    ode_test_sol = solve(ODEProblem(true_sol, [temp], (0.0f0, 10.0f0)), Tsit5(),
+                         saveat = 0.0:0.5:10.0)
+    ode_nn_sol = solve(ODEProblem{false}(dudt_, [temp], (0.0f0, 10.0f0), θ))
+    scatter!(pl, ode_test_sol, var = (0, 1), label = "", color = color_cycle[j])
+    plot!(pl, ode_nn_sol, var = (0, 1), label = "", color = color_cycle[j], lw = 2.0)
+end
+display(pl)
 title!("Neural ODE for Newton's Law of Cooling: Test Data")
 xlabel!("Time")
-ylabel!("Temp") 
+ylabel!("Temp")
 ```
 
-When training a neural network, we need to find the gradient with respect to our data set. There are three main ways to partition our data when using a training algorithm like gradient descent: stochastic, batching and mini-batching. Stochastic gradient descent trains on a single random data point each epoch. This allows for the neural network to better converge to the global minimum even on noisy data, but is computationally inefficient. Batch gradient descent trains on the whole data set each epoch and while computationally efficient is prone to converging to local minima. Mini-batching combines both of these advantages and by training on a small random "mini-batch" of the data each epoch can converge to the global minimum while remaining more computationally efficient than stochastic descent. Typically, we do this by randomly selecting subsets of the data each epoch and use this subset to train on. We can also pre-batch the data by creating an iterator holding these randomly selected batches before beginning to train. The proper size for the batch can be determined experimentally. Let us see how to do this with Julia. 
+When training a neural network, we need to find the gradient with respect to our data set. There are three main ways to partition our data when using a training algorithm like gradient descent: stochastic, batching and mini-batching. Stochastic gradient descent trains on a single random data point each epoch. This allows for the neural network to better converge to the global minimum even on noisy data, but is computationally inefficient. Batch gradient descent trains on the whole data set each epoch and, while computationally efficient, is prone to converging to local minima. Mini-batching combines both of these advantages: by training on a small random "mini-batch" of the data each epoch, it can converge to the global minimum while remaining more computationally efficient than stochastic descent. Typically, we do this by randomly selecting subsets of the data each epoch and using each subset to train on. We can also pre-batch the data by creating an iterator holding these randomly selected batches before beginning to train. The proper size for the batch can be determined experimentally. Let us see how to do this with Julia.
 
-For this example, we will use a very simple ordinary differential equation, newtons law of cooling. We can represent this in Julia like so. 
+For this example, we will use a very simple ordinary differential equation, Newton's law of cooling. We can represent this in Julia like so.
 
 ```@example minibatch
 using DifferentialEquations, Flux, Random, Plots
-using IterTools: ncycle 
+using IterTools: ncycle
 
 rng = Random.default_rng()
 function newtons_cooling(du, u, p, t)
     temp = u[1]
     k, temp_m = p
-    du[1] = dT = -k*(temp-temp_m) 
-  end
+    du[1] = dT = -k * (temp - temp_m)
+end
 
 function true_sol(du, u, p, t)
-    true_p = [log(2)/8.0, 100.0]
+    true_p = [log(2) / 8.0, 100.0]
     newtons_cooling(du, u, true_p, t)
 end
 ```
 
-Now we define a neural-network using a linear approximation with 1 hidden layer of 8 neurons.  
+Now we define a neural-network using a linear approximation with 1 hidden layer of 8 neurons.
 
 ```@example minibatch
-ann = Chain(Dense(1,8,tanh), Dense(8,1,tanh))
+ann = Chain(Dense(1, 8, tanh), Dense(8, 1, tanh))
 θ, re = Flux.destructure(ann)
 
-function dudt_(u,p,t)           
-    re(p)(u)[1].* u
+function dudt_(u, p, t)
+    re(p)(u)[1] .* u
 end
 ```
 
-From here we build a loss function around it. 
+From here we build a loss function around it.
 
 ```@example minibatch
 function predict_adjoint(time_batch)
-    _prob = remake(prob, u0=u0, p=θ)
-    Array(solve(_prob, Tsit5(), saveat = time_batch)) 
+    _prob = remake(prob, u0 = u0, p = θ)
+    Array(solve(_prob, Tsit5(), saveat = time_batch))
 end
 
 function loss_adjoint(batch, time_batch)
@@ -134,16 +136,16 @@ function loss_adjoint(batch, time_batch)
 end
 ```
 
-To add support for batches of size `k` we use `Flux.Data.DataLoader`. To use this we pass in the `ode_data` and `t` as the 'x' and 'y' data to batch respectively. The parameter `batchsize` controls the size of our batches. We check our implementation by iterating over the batched data. 
+To add support for batches of size `k` we use `Flux.Data.DataLoader`. To use this we pass in the `ode_data` and `t` as the 'x' and 'y' data to batch respectively. The parameter `batchsize` controls the size of our batches. We check our implementation by iterating over the batched data.
 
 ```@example minibatch
 u0 = Float32[200.0]
 datasize = 30
 tspan = (0.0f0, 3.0f0)
 
-t = range(tspan[1], tspan[2], length=datasize)
+t = range(tspan[1], tspan[2], length = datasize)
 true_prob = ODEProblem(true_sol, u0, tspan)
-ode_data = Array(solve(true_prob, Tsit5(), saveat=t))
+ode_data = Array(solve(true_prob, Tsit5(), saveat = t))
 
 prob = ODEProblem{false}(dudt_, u0, tspan, θ)
 
@@ -156,41 +158,45 @@ for (x, y) in train_loader
 end
 ```
 
-Now we train the neural network with a user-defined call back function to display loss and the graphs with a maximum of 300 epochs. 
+Now we train the neural network with a user-defined callback function to display loss and the graphs with a maximum of 300 epochs.
 
 ```@example minibatch
 numEpochs = 300
-losses=[]
-cb() = begin
-    l=loss_adjoint(ode_data, t)
-    push!(losses, l)
-    @show l
-    pred=predict_adjoint(t)
-    pl = scatter(t,ode_data[1,:],label="data", color=:black, ylim=(150,200))
-    scatter!(pl,t,pred[1,:],label="prediction", color=:darkgreen)
-    display(plot(pl))
-    false
-end 
-
-opt=ADAM(0.05)
-Flux.train!(loss_adjoint, Flux.params(θ), ncycle(train_loader,numEpochs), opt, cb=Flux.throttle(cb, 10))
+losses = []
+# Training callback: record and print the loss on the full data set, then plot
+# the current prediction against the data.
+function cb()
+    l = loss_adjoint(ode_data, t)
+    push!(losses, l)
+    @show l
+    pred = predict_adjoint(t)
+    pl = scatter(t, ode_data[1, :], label = "data", color = :black, ylim = (150, 200))
+    scatter!(pl, t, pred[1, :], label = "prediction", color = :darkgreen)
+    display(plot(pl))
+    return false
+end
+
+opt = ADAM(0.05)
+Flux.train!(loss_adjoint, Flux.params(θ), ncycle(train_loader, numEpochs), opt,
+            cb = Flux.throttle(cb, 10))
 ```
 
-Finally, we can see how well our trained network will generalize to new initial conditions. 
+Finally, we can see how well our trained network will generalize to new initial conditions.
 
 ```@example minibatch
-starting_temp=collect(10:30:250)
-true_prob_func(u0)=ODEProblem(true_sol, [u0], tspan)
-color_cycle=palette(:tab10)
-pl=plot()
-for (j,temp) in enumerate(starting_temp)
-    ode_test_sol = solve(ODEProblem(true_sol, [temp], (0.0f0,10.0f0)), Tsit5(), saveat=0.0:0.5:10.0)
-    ode_nn_sol = solve(ODEProblem{false}(dudt_, [temp], (0.0f0,10.0f0), θ))
-    scatter!(pl, ode_test_sol, var=(0,1), label="", color=color_cycle[j])
-    plot!(pl, ode_nn_sol, var=(0,1), label="", color=color_cycle[j], lw=2.0)
-end
-display(pl) 
+starting_temp = collect(10:30:250)
+true_prob_func(u0) = ODEProblem(true_sol, [u0], tspan)
+color_cycle = palette(:tab10)
+pl = plot()
+for (j, temp) in enumerate(starting_temp)
+    ode_test_sol = solve(ODEProblem(true_sol, [temp], (0.0f0, 10.0f0)), Tsit5(),
+                         saveat = 0.0:0.5:10.0)
+    ode_nn_sol = solve(ODEProblem{false}(dudt_, [temp], (0.0f0, 10.0f0), θ))
+    scatter!(pl, ode_test_sol, var = (0, 1), label = "", color = color_cycle[j])
+    plot!(pl, ode_nn_sol, var = (0, 1), label = "", color = color_cycle[j], lw = 2.0)
+end
+display(pl)
 title!("Neural ODE for Newton's Law of Cooling: Test Data")
 xlabel!("Time")
-ylabel!("Temp") 
+ylabel!("Temp")
 ```
diff --git a/docs/src/examples/neural_ode/neural_gde.md b/docs/src/examples/neural_ode/neural_gde.md
index fccc086eb..9ada8111b 100644
--- a/docs/src/examples/neural_ode/neural_gde.md
+++ b/docs/src/examples/neural_ode/neural_gde.md
@@ -25,15 +25,15 @@ dataset = Cora();
 # Preprocess the data and compute adjacency matrix
 classes = dataset.metadata["classes"]
 g = mldataset2gnngraph(dataset) |> device
-onehotbatch(data,labels)= device(labels).==reshape(data, 1,size(data)...)
-onecold(y) =  map(argmax,eachcol(y))
+onehotbatch(data, labels) = device(labels) .== reshape(data, 1, size(data)...)
+onecold(y) = map(argmax, eachcol(y))
 X = g.ndata.features
 y = onehotbatch(g.ndata.targets, classes) # a dense matrix is not the optimal, but we don't want to use Flux here
 
-Ã = normalized_adjacency(g, add_self_loops=true) |> device
+Ã = normalized_adjacency(g, add_self_loops = true) |> device
 
 (; train_mask, val_mask, test_mask) = g.ndata
-ytrain = y[:,train_mask]
+ytrain = y[:, train_mask]
 
 # Model and Data Configuration
 nin = size(X, 1)
@@ -42,7 +42,7 @@ nout = length(classes)
 epochs = 20
 
 # Define the graph neural network
-struct ExplicitGCNConv{F1,F2,F3,F4} <: AbstractExplicitLayer
+struct ExplicitGCNConv{F1, F2, F3, F4} <: AbstractExplicitLayer
     in_chs::Int
     out_chs::Int
     activation::F1
@@ -58,18 +58,18 @@ function Base.show(io::IO, l::ExplicitGCNConv)
 end
 
 function initialparameters(rng::AbstractRNG, d::ExplicitGCNConv)
-        return (weight=d.init_weight(rng, d.out_chs, d.in_chs),
-                bias=d.init_bias(rng, d.out_chs, 1))
+    return (weight = d.init_weight(rng, d.out_chs, d.in_chs),
+            bias = d.init_bias(rng, d.out_chs, 1))
 end
 
 initialstates(rng::AbstractRNG, d::ExplicitGCNConv) = (Ã = d.init_Ã(),)
 
-
-function ExplicitGCNConv(Ã, ch::Pair{Int,Int}, activation = identity;
-                         init_weight=glorot_normal, init_bias=zeros32)
-    init_Ã = ()->copy(Ã)
-    return ExplicitGCNConv{typeof(activation), typeof(init_Ã), typeof(init_weight), typeof(init_bias)}(first(ch), last(ch), activation,
-                                                                                                       init_Ã, init_weight, init_bias)
+function ExplicitGCNConv(Ã, ch::Pair{Int, Int}, activation = identity;
+                         init_weight = glorot_normal, init_bias = zeros32)
+    init_Ã = () -> copy(Ã)
+    return ExplicitGCNConv{typeof(activation), typeof(init_Ã), typeof(init_weight),
+                           typeof(init_bias)}(first(ch), last(ch), activation,
+                                              init_Ã, init_weight, init_bias)
 end
 
 function (l::ExplicitGCNConv)(x::AbstractMatrix, ps, st::NamedTuple)
@@ -79,9 +79,9 @@ end
 
 # Define the Neural GDE
 function diffeqsol_to_array(x::ODESolution{T, N, <:AbstractVector{<:CuArray}}) where {T, N}
-    return dropdims(gpu(x); dims=3)
+    return dropdims(gpu(x); dims = 3)
 end
-diffeqsol_to_array(x::ODESolution) = dropdims(Array(x); dims=3)
+diffeqsol_to_array(x::ODESolution) = dropdims(Array(x); dims = 3)
 
 # make NeuralODE work with Lux.Chain
 # remove this once https://github.com/SciML/DiffEqFlux.jl/issues/727 is fixed
@@ -91,7 +91,7 @@ initialstates(rng::AbstractRNG, node::NeuralODE) = initialstates(rng, node.model
 gnn = Chain(ExplicitGCNConv(Ã, nhidden => nhidden, relu),
             ExplicitGCNConv(Ã, nhidden => nhidden, relu))
 
-node = NeuralODE(gnn, (0.f0, 1.f0), Tsit5(), save_everystep = false,
+node = NeuralODE(gnn, (0.0f0, 1.0f0), Tsit5(), save_everystep = false,
                  reltol = 1e-3, abstol = 1e-3, save_start = false)
 
 model = Chain(ExplicitGCNConv(Ã, nin => nhidden, relu),
@@ -100,18 +100,18 @@ model = Chain(ExplicitGCNConv(Ã, nin => nhidden, relu),
               Dense(nhidden, nout))
 
 # Loss
-logitcrossentropy(ŷ, y) = mean(-sum(y .* logsoftmax(ŷ); dims=1))
+logitcrossentropy(ŷ, y) = mean(-sum(y .* logsoftmax(ŷ); dims = 1))
 
 function loss(x, y, mask, model, ps, st)
     ŷ, st = model(x, ps, st)
-    return logitcrossentropy(ŷ[:,mask], y), st
+    return logitcrossentropy(ŷ[:, mask], y), st
 end
 
 function eval_loss_accuracy(X, y, mask, model, ps, st)
     ŷ, _ = model(X, ps, st)
-    l = logitcrossentropy(ŷ[:,mask], y[:,mask])
-    acc = mean(onecold(ŷ[:,mask]) .== onecold(y[:,mask]))
-    return (loss = round(l, digits=4), acc = round(acc*100, digits=2))
+    l = logitcrossentropy(ŷ[:, mask], y[:, mask])
+    acc = mean(onecold(ŷ[:, mask]) .== onecold(y[:, mask]))
+    return (loss = round(l, digits = 4), acc = round(acc * 100, digits = 2))
 end
 
 # Training
@@ -126,11 +126,11 @@ function train()
 
     ## Optimizer
     opt = Optimisers.ADAM(0.01f0)
-    st_opt = Optimisers.setup(opt,ps)
+    st_opt = Optimisers.setup(opt, ps)
 
     ## Training Loop
     for _ in 1:epochs
-        (l,st), back = pullback(p->loss(X, ytrain, train_mask, model, p, st), ps)
+        (l, st), back = pullback(p -> loss(X, ytrain, train_mask, model, p, st), ps)
         gs = back((one(l), nothing))[1]
         st_opt, ps = Optimisers.update(st_opt, ps, gs)
         @show eval_loss_accuracy(X, y, val_mask, model, ps, st)
@@ -175,19 +175,21 @@ Convert the data to `GNNGraph` and get the adjacency matrix from the graph `g`.
 ```julia
 classes = dataset.metadata["classes"]
 g = mldataset2gnngraph(dataset) |> device
-onehotbatch(data,labels)= device(labels).==reshape(data, 1,size(data)...)
-onecold(y) =  map(argmax,eachcol(y))
+onehotbatch(data, labels) = device(labels) .== reshape(data, 1, size(data)...)
+onecold(y) = map(argmax, eachcol(y))
 X = g.ndata.features
 y = onehotbatch(g.ndata.targets, classes) # a dense matrix is not the optimal, but we don't want to use Flux here
 
-Ã = normalized_adjacency(g, add_self_loops=true) |> device
+Ã = normalized_adjacency(g, add_self_loops = true) |> device
 ```
+
 ### Training Data
 
 GNNs operate on an entire graph, so we can't do any sort of minibatching here. We predict the entire dataset, but train the model in a semi-supervised learning fashion.
+
 ```julia
 (; train_mask, val_mask, test_mask) = g.ndata
-ytrain = y[:,train_mask]
+ytrain = y[:, train_mask]
 ```
 
 ## Model and Data Configuration
@@ -200,13 +202,13 @@ nhidden = 16
 nout = length(classes)
 epochs = 20
 ```
+
 ## Define the Graph Neural Network
 
 Here, we define a type of graph neural networks called `GCNConv`. We use the name `ExplicitGCNConv` to avoid naming conflicts with `GraphNeuralNetworks`. For more information on defining a layer with `Lux`, please consult to the [doc](http://lux.csail.mit.edu/dev/introduction/overview/#AbstractExplicitLayer-API).
 
-
 ```julia
-struct ExplicitGCNConv{F1,F2,F3} <: AbstractExplicitLayer
+struct ExplicitGCNConv{F1, F2, F3} <: AbstractExplicitLayer
     Ã::AbstractMatrix  # nomalized_adjacency matrix
     in_chs::Int
     out_chs::Int
@@ -222,14 +224,18 @@ function Base.show(io::IO, l::ExplicitGCNConv)
 end
 
 function initialparameters(rng::AbstractRNG, d::ExplicitGCNConv)
-        return (weight=d.init_weight(rng, d.out_chs, d.in_chs),
-                bias=d.init_bias(rng, d.out_chs, 1))
+    return (weight = d.init_weight(rng, d.out_chs, d.in_chs),
+            bias = d.init_bias(rng, d.out_chs, 1))
 end
 
-function ExplicitGCNConv(Ã, ch::Pair{Int,Int}, activation = identity;
-                         init_weight=glorot_normal, init_bias=zeros32)
-    return ExplicitGCNConv{typeof(activation), typeof(init_weight), typeof(init_bias)}(Ã, first(ch), last(ch), activation,
-                                                                                       init_weight, init_bias)
+function ExplicitGCNConv(Ã, ch::Pair{Int, Int}, activation = identity;
+                         init_weight = glorot_normal, init_bias = zeros32)
+    return ExplicitGCNConv{typeof(activation), typeof(init_weight), typeof(init_bias)}(Ã,
+                                                                                       first(ch),
+                                                                                       last(ch),
+                                                                                       activation,
+                                                                                       init_weight,
+                                                                                       init_bias)
 end
 
 function (l::ExplicitGCNConv)(x::AbstractMatrix, ps, st::NamedTuple)
@@ -244,14 +250,14 @@ Let us now define the final model. We will use two GNN layers for approximating
 
 ```julia
 function diffeqsol_to_array(x::ODESolution{T, N, <:AbstractVector{<:CuArray}}) where {T, N}
-    return dropdims(gpu(x); dims=3)
+    return dropdims(gpu(x); dims = 3)
 end
-diffeqsol_to_array(x::ODESolution) = dropdims(Array(x); dims=3)
+diffeqsol_to_array(x::ODESolution) = dropdims(Array(x); dims = 3)
 
 gnn = Chain(ExplicitGCNConv(Ã, nhidden => nhidden, relu),
             ExplicitGCNConv(Ã, nhidden => nhidden, relu))
 
-node = NeuralODE(gnn, (0.f0, 1.f0), Tsit5(), save_everystep = false,
+node = NeuralODE(gnn, (0.0f0, 1.0f0), Tsit5(), save_everystep = false,
                  reltol = 1e-3, abstol = 1e-3, save_start = false)
 
 model = Chain(ExplicitGCNConv(Ã, nin => nhidden, relu),
@@ -267,23 +273,25 @@ model = Chain(ExplicitGCNConv(Ã, nin => nhidden, relu),
 We shall be using the standard categorical crossentropy loss function, which is used for multiclass classification tasks.
 
 ```julia
-logitcrossentropy(ŷ, y) = mean(-sum(y .* logsoftmax(ŷ); dims=1))
+logitcrossentropy(ŷ, y) = mean(-sum(y .* logsoftmax(ŷ); dims = 1))
 
 function loss(x, y, mask, model, ps, st)
     ŷ, st = model(x, ps, st)
-    return logitcrossentropy(ŷ[:,mask], y), st
+    return logitcrossentropy(ŷ[:, mask], y), st
 end
 
 function eval_loss_accuracy(X, y, mask, model, ps, st)
     ŷ, _ = model(X, ps, st)
-    l = logitcrossentropy(ŷ[:,mask], y[:,mask])
-    acc = mean(onecold(ŷ[:,mask]) .== onecold(y[:,mask]))
-    return (loss = round(l, digits=4), acc = round(acc*100, digits=2))
+    l = logitcrossentropy(ŷ[:, mask], y[:, mask])
+    acc = mean(onecold(ŷ[:, mask]) .== onecold(y[:, mask]))
+    return (loss = round(l, digits = 4), acc = round(acc * 100, digits = 2))
 end
 ```
 
 ### Setup Model
+
 We need to manually set up our mode with `Lux`, and convert the parameters to `ComponentArray` so that they can work well with sensitivity algorithms.
+
 ```julia
 rng = Random.default_rng()
 Random.seed!(rng, 0)
@@ -292,13 +300,14 @@ ps, st = Lux.setup(rng, model)
 ps = ComponentArray(ps) |> device
 st = st |> device
 ```
+
 ### Optimizer
 
 For this task, we will be using the `ADAM` optimizer with a learning rate of `0.01`.
 
 ```julia
 opt = Optimisers.Adam(0.01f0)
-st_opt = Optimisers.setup(opt,ps)
+st_opt = Optimisers.setup(opt, ps)
 ```
 
 ## Training Loop
@@ -307,7 +316,7 @@ Finally, we use the package `Optimisers` to learn the parameters `ps`. We run th
 
 ```julia
 for _ in 1:epochs
-    (l,st), back = pullback(p->loss(X, ytrain, train_mask, model, p, st), ps)
+    (l, st), back = pullback(p -> loss(X, ytrain, train_mask, model, p, st), ps)
     gs = back((one(l), nothing))[1]
     st_opt, ps = Optimisers.update(st_opt, ps, gs)
     @show eval_loss_accuracy(X, y, val_mask, model, ps, st)
diff --git a/docs/src/examples/neural_ode/neural_ode_flux.md b/docs/src/examples/neural_ode/neural_ode_flux.md
index 517746b7b..ff82a38dc 100644
--- a/docs/src/examples/neural_ode/neural_ode_flux.md
+++ b/docs/src/examples/neural_ode/neural_ode_flux.md
@@ -12,52 +12,52 @@ example of optimizing `u0` and `p`.
 ```@example neuralode1
 using OrdinaryDiffEq, SciMLSensitivity, Flux, Plots
 
-u0 = Float32[2.; 0.]
+u0 = Float32[2.0; 0.0]
 datasize = 30
-tspan = (0.0f0,1.5f0)
+tspan = (0.0f0, 1.5f0)
 
-function trueODEfunc(du,u,p,t)
+function trueODEfunc(du, u, p, t)
     true_A = [-0.1 2.0; -2.0 -0.1]
-    du .= ((u.^3)'true_A)'
+    du .= ((u .^ 3)'true_A)'
 end
-t = range(tspan[1],tspan[2],length=datasize)
-prob = ODEProblem(trueODEfunc,u0,tspan)
-ode_data = Array(solve(prob,Tsit5(),saveat=t))
+t = range(tspan[1], tspan[2], length = datasize)
+prob = ODEProblem(trueODEfunc, u0, tspan)
+ode_data = Array(solve(prob, Tsit5(), saveat = t))
 
-dudt2 = Flux.Chain(x -> x.^3,
-             Flux.Dense(2,50,tanh),
-             Flux.Dense(50,2))
-p,re = Flux.destructure(dudt2) # use this p as the initial condition!
-dudt(u,p,t) = re(p)(u) # need to restrcture for backprop!
-prob = ODEProblem(dudt,u0,tspan)
+dudt2 = Flux.Chain(x -> x .^ 3,
+                   Flux.Dense(2, 50, tanh),
+                   Flux.Dense(50, 2))
+p, re = Flux.destructure(dudt2) # use this p as the initial condition!
+dudt(u, p, t) = re(p)(u) # need to restructure for backprop!
+prob = ODEProblem(dudt, u0, tspan)
 
 function predict_n_ode()
-  Array(solve(prob,Tsit5(),u0=u0,p=p,saveat=t))
+    Array(solve(prob, Tsit5(), u0 = u0, p = p, saveat = t))
 end
 
 function loss_n_ode()
     pred = predict_n_ode()
-    loss = sum(abs2,ode_data .- pred)
+    loss = sum(abs2, ode_data .- pred)
     loss
 end
 
 loss_n_ode() # n_ode.p stores the initial parameters of the neural ODE
 
-callback = function (;doplot=false) #callback function to observe training
-  pred = predict_n_ode()
-  display(sum(abs2,ode_data .- pred))
-  # plot current prediction against data
-  pl = scatter(t,ode_data[1,:],label="data")
-  scatter!(pl,t,pred[1,:],label="prediction")
-  display(plot(pl))
-  return false
+callback = function (; doplot = false) #callback function to observe training
+    pred = predict_n_ode()
+    display(sum(abs2, ode_data .- pred))
+    # plot current prediction against data
+    pl = scatter(t, ode_data[1, :], label = "data")
+    scatter!(pl, t, pred[1, :], label = "prediction")
+    display(plot(pl))
+    return false
 end
 
 # Display the ODE with the initial parameter values.
 callback()
 
 data = Iterators.repeated((), 1000)
-res1 = Flux.train!(loss_n_ode, Flux.params(u0,p), data, ADAM(0.05), cb = callback)
+res1 = Flux.train!(loss_n_ode, Flux.params(u0, p), data, ADAM(0.05), cb = callback)
 
 callback()
 ```
@@ -69,7 +69,7 @@ the `Flux.destructure` function. In this case, if `dudt` is a Flux
 chain, then:
 
 ```julia
-p,re = Flux.destructure(chain)
+p, re = Flux.destructure(chain)
 ```
 
 returns `p` which is the vector of parameters for the chain and `re` which is
@@ -83,69 +83,70 @@ Notice that Optimization.jl works on a vector input, so we have to concatenate `
 and `p` and then in the loss function split to the pieces.
 
 ```@example neuralode2
-using Flux, OrdinaryDiffEq, SciMLSensitivity, Optimization, OptimizationOptimisers, OptimizationNLopt, Plots
+using Flux, OrdinaryDiffEq, SciMLSensitivity, Optimization, OptimizationOptimisers,
+      OptimizationNLopt, Plots
 
-u0 = Float32[2.; 0.]
+u0 = Float32[2.0; 0.0]
 datasize = 30
-tspan = (0.0f0,1.5f0)
+tspan = (0.0f0, 1.5f0)
 
-function trueODEfunc(du,u,p,t)
+function trueODEfunc(du, u, p, t)
     true_A = [-0.1 2.0; -2.0 -0.1]
-    du .= ((u.^3)'true_A)'
+    du .= ((u .^ 3)'true_A)'
 end
-t = range(tspan[1],tspan[2],length=datasize)
-prob = ODEProblem(trueODEfunc,u0,tspan)
-ode_data = Array(solve(prob,Tsit5(),saveat=t))
+t = range(tspan[1], tspan[2], length = datasize)
+prob = ODEProblem(trueODEfunc, u0, tspan)
+ode_data = Array(solve(prob, Tsit5(), saveat = t))
 
-dudt2 = Flux.Chain(x -> x.^3,
-             Flux.Dense(2,50,tanh),
-             Flux.Dense(50,2))
-p,re = Flux.destructure(dudt2) # use this p as the initial condition!
-dudt(u,p,t) = re(p)(u) # need to restrcture for backprop!
-prob = ODEProblem(dudt,u0,tspan)
+dudt2 = Flux.Chain(x -> x .^ 3,
+                   Flux.Dense(2, 50, tanh),
+                   Flux.Dense(50, 2))
+p, re = Flux.destructure(dudt2) # use this p as the initial condition!
+dudt(u, p, t) = re(p)(u) # need to restructure for backprop!
+prob = ODEProblem(dudt, u0, tspan)
 
-θ = [u0;p] # the parameter vector to optimize
+θ = [u0; p] # the parameter vector to optimize
 
 function predict_n_ode(θ)
-  Array(solve(prob,Tsit5(),u0=θ[1:2],p=θ[3:end],saveat=t))
+    Array(solve(prob, Tsit5(), u0 = θ[1:2], p = θ[3:end], saveat = t))
 end
 
 function loss_n_ode(θ)
     pred = predict_n_ode(θ)
-    loss = sum(abs2,ode_data .- pred)
-    loss,pred
+    loss = sum(abs2, ode_data .- pred)
+    loss, pred
 end
 
 loss_n_ode(θ)
 
-callback = function (θ,l,pred;doplot=false) #callback function to observe training
-  display(l)
-  # plot current prediction against data
-  pl = scatter(t,ode_data[1,:],label="data")
-  scatter!(pl,t,pred[1,:],label="prediction")
-  display(plot(pl))
-  return false
+callback = function (θ, l, pred; doplot = false) #callback function to observe training
+    display(l)
+    # plot current prediction against data
+    pl = scatter(t, ode_data[1, :], label = "data")
+    scatter!(pl, t, pred[1, :], label = "prediction")
+    display(plot(pl))
+    return false
 end
 
 # Display the ODE with the initial parameter values.
-callback(θ,loss_n_ode(θ)...)
+callback(θ, loss_n_ode(θ)...)
 
 # use Optimization.jl to solve the problem
 adtype = Optimization.AutoZygote()
 
-optf = Optimization.OptimizationFunction((p,_)->loss_n_ode(p), adtype)
+optf = Optimization.OptimizationFunction((p, _) -> loss_n_ode(p), adtype)
 optprob = Optimization.OptimizationProblem(optf, θ)
 
 result_neuralode = Optimization.solve(optprob,
-                                       OptimizationOptimisers.Adam(0.05),
-                                       callback = callback,
-                                       maxiters = 300)
+                                      OptimizationOptimisers.Adam(0.05),
+                                      callback = callback,
+                                      maxiters = 300)
 
-optprob2 = remake(optprob,u0 = result_neuralode.u)
+optprob2 = remake(optprob, u0 = result_neuralode.u)
 
 result_neuralode2 = Optimization.solve(optprob2,
-                                        NLopt.LD_LBFGS(),
-                                        callback = callback)
+                                       NLopt.LD_LBFGS(),
+                                       callback = callback)
 ```
 
 Notice that the advantage of this format is that we can use Optim's optimizers, like
diff --git a/docs/src/examples/neural_ode/simplechains.md b/docs/src/examples/neural_ode/simplechains.md
index 50e7580e6..c2dfa7da8 100644
--- a/docs/src/examples/neural_ode/simplechains.md
+++ b/docs/src/examples/neural_ode/simplechains.md
@@ -7,7 +7,8 @@
 First, we'll need data for training the NeuralODE, which can be obtained by solving the ODE `u' = f(u,p,t)` numerically using the SciML ecosystem in Julia.
 
 ```@example sc_neuralode
-using SimpleChains, StaticArrays, OrdinaryDiffEq, SciMLSensitivity, Optimization, OptimizationFlux, Plots
+using SimpleChains, StaticArrays, OrdinaryDiffEq, SciMLSensitivity, Optimization,
+      OptimizationFlux, Plots
 
 u0 = @SArray Float32[2.0, 0.0]
 datasize = 30
@@ -16,7 +17,7 @@ tsteps = range(tspan[1], tspan[2], length = datasize)
 
 function trueODE(u, p, t)
     true_A = @SMatrix Float32[-0.1 2.0; -2.0 -0.1]
-    ((u.^3)'true_A)'
+    ((u .^ 3)'true_A)'
 end
 
 prob = ODEProblem(trueODE, u0, tspan)
@@ -28,16 +29,14 @@ data = Array(solve(prob, Tsit5(), saveat = tsteps))
 Next, we set up a small neural network. It will be trained to output the derivative of the solution at each time step given the value of the solution at the previous time step, and the parameters of the network. Thus, we are treating the neural network as a function `f(u,p,t)`. The difference is that instead of relying on knowing the exact equation for the ODE, we get to solve it only with the data.
 
 ```@example sc_neuralode
-sc = SimpleChain(
-                static(2),
-                Activation(x -> x.^3),
-                TurboDense{true}(tanh, static(50)),
-                TurboDense{true}(identity, static(2))
-            )
+sc = SimpleChain(static(2),
+                 Activation(x -> x .^ 3),
+                 TurboDense{true}(tanh, static(50)),
+                 TurboDense{true}(identity, static(2)))
 
 p_nn = SimpleChains.init_params(sc)
 
-f(u,p,t) = sc(u,p)
+f(u, p, t) = sc(u, p)
 ```
 
 ## NeuralODE, Prediction and Loss
@@ -48,7 +47,8 @@ Now instead of the function `trueODE(u,p,t)` in the first code block, we pass th
 prob_nn = ODEProblem(f, u0, tspan)
 
 function predict_neuralode(p)
-    Array(solve(prob_nn, Tsit5();p=p,saveat=tsteps,sensealg=QuadratureAdjoint(autojacvec=ZygoteVJP())))
+    Array(solve(prob_nn, Tsit5(); p = p, saveat = tsteps,
+                sensealg = QuadratureAdjoint(autojacvec = ZygoteVJP())))
 end
 
 function loss_neuralode(p)
@@ -67,16 +67,17 @@ The adjoint of a neural ODE can be calculated through the various AD algorithms
 ```@example sc_neuralode
 callback = function (p, l, pred; doplot = true)
     display(l)
-    plt = scatter(tsteps, data[1,:],label="data")
-    scatter!(plt, tsteps, pred[1,:], label = "prediction")
+    plt = scatter(tsteps, data[1, :], label = "data")
+    scatter!(plt, tsteps, pred[1, :], label = "prediction")
     if doplot
         display(plot(plt))
     end
     return false
 end
 
-optf = Optimization.OptimizationFunction((x,p)->loss_neuralode(x), Optimization.AutoZygote())
+optf = Optimization.OptimizationFunction((x, p) -> loss_neuralode(x),
+                                         Optimization.AutoZygote())
 optprob = Optimization.OptimizationProblem(optf, p_nn)
 
-res = Optimization.solve(optprob, ADAM(0.05),callback=callback,maxiters=300)
-```
\ No newline at end of file
+res = Optimization.solve(optprob, ADAM(0.05), callback = callback, maxiters = 300)
+```
diff --git a/docs/src/examples/ode/exogenous_input.md b/docs/src/examples/ode/exogenous_input.md
index 2bbdc4805..9d5588497 100644
--- a/docs/src/examples/ode/exogenous_input.md
+++ b/docs/src/examples/ode/exogenous_input.md
@@ -8,9 +8,9 @@ use the form
 ```julia
 I(t) = t^2
 
-function f(du,u,p,t)
-  du[1] = I(t)
-  du[2] = u[1]
+function f(du, u, p, t)
+    du[1] = I(t)
+    du[2] = u[1]
 end
 ```
 
@@ -18,12 +18,12 @@ so that `I(t)` is an exogenous input signal into `f`. Another form that could be
 useful is a closure. For example:
 
 ```julia
-function f(du,u,p,t,I)
-  du[1] = I(t)
-  du[2] = u[1]
+function f(du, u, p, t, I)
+    du[1] = I(t)
+    du[2] = u[1]
 end
 
-_f(du,u,p,t) = f(du,u,p,t,x -> x^2)
+_f(du, u, p, t) = f(du, u, p, t, x -> x^2)
 ```
 
 which encloses an extra argument into `f` so that `_f` is now the interface-compliant
@@ -40,61 +40,62 @@ In the following example, a discrete exogenous input signal `ex` is defined and
 used as an input into the neural network of a neural ODE system.
 
 ```@example exogenous
-using DifferentialEquations, Lux, DiffEqFlux, Optimization, OptimizationPolyalgorithms, OptimizationFlux, Plots, Random
+using DifferentialEquations, Lux, DiffEqFlux, Optimization, OptimizationPolyalgorithms,
+      OptimizationFlux, Plots, Random
 
 rng = Random.default_rng()
 tspan = (0.1f0, Float32(10.0))
 tsteps = range(tspan[1], tspan[2], length = 100)
 t_vec = collect(tsteps)
-ex = vec(ones(Float32,length(tsteps), 1))
+ex = vec(ones(Float32, length(tsteps), 1))
 f(x) = (atan(8.0 * x - 4.0) + atan(4.0)) / (2.0 * atan(4.0))
 
 function hammerstein_system(u)
-    y= zeros(size(u))
+    y = zeros(size(u))
     for k in 2:length(u)
-        y[k] = 0.2 * f(u[k-1]) + 0.8 * y[k-1]
+        y[k] = 0.2 * f(u[k - 1]) + 0.8 * y[k - 1]
     end
     return y
 end
 
 y = Float32.(hammerstein_system(ex))
-plot(collect(tsteps), y, ticks=:native)
+plot(collect(tsteps), y, ticks = :native)
 
-nn_model = Lux.Chain(Lux.Dense(2,8, tanh), Lux.Dense(8, 1))
-p_model,st = Lux.setup(rng, nn_model)
+nn_model = Lux.Chain(Lux.Dense(2, 8, tanh), Lux.Dense(8, 1))
+p_model, st = Lux.setup(rng, nn_model)
 
 u0 = Float32.([0.0])
 
 function dudt(u, p, t)
     global st
     #input_val = u_vals[Int(round(t*10)+1)]
-    out,st = nn_model(vcat(u[1], ex[Int(round(10*0.1))]), p, st)
+    out, st = nn_model(vcat(u[1], ex[Int(round(10 * 0.1))]), p, st)
     return out
 end
 
-prob = ODEProblem(dudt,u0,tspan,nothing)
+prob = ODEProblem(dudt, u0, tspan, nothing)
 
 function predict_neuralode(p)
-    _prob = remake(prob,p=p)
-    Array(solve(_prob, Tsit5(), saveat=tsteps, abstol = 1e-8, reltol = 1e-6))
+    _prob = remake(prob, p = p)
+    Array(solve(_prob, Tsit5(), saveat = tsteps, abstol = 1e-8, reltol = 1e-6))
 end
 
 function loss(p)
     sol = predict_neuralode(p)
     N = length(sol)
-    return sum(abs2.(y[1:N] .- sol'))/N
+    return sum(abs2.(y[1:N] .- sol')) / N
 end
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, Lux.ComponentArray(p_model))
 
-res0 = Optimization.solve(optprob, PolyOpt(),maxiters=100)
+res0 = Optimization.solve(optprob, PolyOpt(), maxiters = 100)
 
 sol = predict_neuralode(res0.u)
-plot(tsteps,sol')
+plot(tsteps, sol')
 N = length(sol)
-scatter!(tsteps,y[1:N])
+scatter!(tsteps, y[1:N])
 ```
 
 ![](https://aws1.discourse-cdn.com/business5/uploads/julialang/original/3X/f/3/f3c2727af36ac20e114fe3c9798e567cc9d22b9e.png)
diff --git a/docs/src/examples/ode/prediction_error_method.md b/docs/src/examples/ode/prediction_error_method.md
index 4316ab9c1..93726c445 100644
--- a/docs/src/examples/ode/prediction_error_method.md
+++ b/docs/src/examples/ode/prediction_error_method.md
@@ -3,54 +3,56 @@
 When identifying linear systems from noisy data, the prediction-error method [^Ljung] is close to a gold standard when it comes to the quality of the models it produces, but is also one of the computationally more expensive methods due to its reliance on iterative, gradient-based estimation. When we are identifying nonlinear models, we typically do not have the luxury of closed-form, non-iterative solutions, while PEM is easier to adapt to the nonlinear setting.[^Larsson]
 
 Fundamentally, PEM changes the problem from minimizing a loss based on the simulation performance, to minimizing a loss based on shorter-term predictions. There are several benefits of doing so, and this example will highlight two:
-- The loss is often easier to optimize.
-- In addition to an accurate simulator, you also obtain a prediction for the system.
-- With PEM, it's possible to estimate *disturbance models*.
 
-The last point will not be illustrated in this tutorial, but we will briefly expand upon it here. Gaussian, zero-mean measurement noise is usually not very hard to handle. Disturbances that affect the state of the system may, however, cause all sorts of havoc on the estimate. Consider wind affecting an aircraft, deriving a statistical and dynamical model of the wind may be doable, but unless you measure the exact wind affecting the aircraft, making use of the model during parameter estimation is impossible. The wind is an *unmeasured load disturbance* that affects the state of the system through its own dynamics model. Using the techniques illustrated in this tutorial, it's possible to estimate the influence of the wind during the experiment that generated the data and reduce or eliminate the bias it otherwise causes in the parameter estimates. 
+  - The loss is often easier to optimize.
+  - In addition to an accurate simulator, you also obtain a prediction for the system.
+  - With PEM, it's possible to estimate *disturbance models*.
+
+The last point will not be illustrated in this tutorial, but we will briefly expand upon it here. Gaussian, zero-mean measurement noise is usually not very hard to handle. Disturbances that affect the state of the system may, however, cause all sorts of havoc on the estimate. Consider wind affecting an aircraft: deriving a statistical and dynamical model of the wind may be doable, but unless you measure the exact wind affecting the aircraft, making use of the model during parameter estimation is impossible. The wind is an *unmeasured load disturbance* that affects the state of the system through its own dynamics model. Using the techniques illustrated in this tutorial, it's possible to estimate the influence of the wind during the experiment that generated the data and reduce or eliminate the bias it otherwise causes in the parameter estimates.
 
 We will start by illustrating a common problem with simulation-error minimization. Imagine a pendulum with unknown length that is to be estimated. A small error in the pendulum length causes the frequency of oscillation to change. Over a sufficiently large horizon, two sinusoidal signals with different frequencies become close to orthogonal to each other. If some form of squared-error loss is used, the loss landscape will be horribly non-convex in this case; indeed, we will illustrate exactly this below.
 
 Another case that poses a problem for simulation-error estimation is when the system is unstable or chaotic. A small error in either the initial condition or the parameters may cause the simulation error to diverge and its gradient to become meaningless.
 
-In both of these examples, we may make use of measurements we have of the evolution of the system to prevent the simulation error from diverging. For instance, if we have measured the angle of the pendulum, we can make use of this measurement to adjust the angle during the simulation to make sure it stays close to the measured angle. Instead of performing a pure simulation, we instead say that we *predict* the state a while forward in time, given all the measurements until the current time point. By minimizing this prediction rather than the pure simulation, we can often prevent the model error from diverging even though we have a poor initial guess. 
+In both of these examples, we may make use of measurements we have of the evolution of the system to prevent the simulation error from diverging. For instance, if we have measured the angle of the pendulum, we can make use of this measurement to adjust the angle during the simulation to make sure it stays close to the measured angle. Instead of performing a pure simulation, we instead say that we *predict* the state a while forward in time, given all the measurements until the current time point. By minimizing this prediction rather than the pure simulation, we can often prevent the model error from diverging even though we have a poor initial guess.
 
-We start by defining a model of the pendulum. The model takes a parameter $L$ corresponding to the length of the pendulum. 
+We start by defining a model of the pendulum. The model takes a parameter $L$ corresponding to the length of the pendulum.
 
 ```@example PEM
-using DifferentialEquations, Optimization,  OptimizationPolyalgorithms, Plots, Statistics, DataInterpolations, ForwardDiff
+using DifferentialEquations, Optimization, OptimizationPolyalgorithms, Plots, Statistics,
+      DataInterpolations, ForwardDiff
 
 tspan = (0.1, 20.0)
 tsteps = range(tspan[1], tspan[2], length = 1000)
 
 u0 = [0.0, 3.0] # Initial angle and angular velocity
 
-function simulator(du,u,p,t) # Pendulum dynamics
+function simulator(du, u, p, t) # Pendulum dynamics
     g = 9.82 # Gravitational constant
     L = p isa Number ? p : p[1] # Length of the pendulum
-    gL = g/L
-    θ  = u[1]
+    gL = g / L
+    θ = u[1]
     dθ = u[2]
     du[1] = dθ
     du[2] = -gL * sin(θ)
 end
 ```
+
 We assume that the true length of the pendulum is $L = 1$, and generate some data from this system.
 
 ```@example PEM
-prob = ODEProblem(simulator,u0,tspan,1.0) # Simulate with L = 1
-sol = solve(prob, Tsit5(), saveat=tsteps, abstol = 1e-8, reltol = 1e-8)
-y = sol[1,:] # This is the data we have available for parameter estimation
-plot(y, title="Pendulum simulation", label="angle")
+prob = ODEProblem(simulator, u0, tspan, 1.0) # Simulate with L = 1
+sol = solve(prob, Tsit5(), saveat = tsteps, abstol = 1e-8, reltol = 1e-8)
+y = sol[1, :] # This is the data we have available for parameter estimation
+plot(y, title = "Pendulum simulation", label = "angle")
 ```
 
-
 We also define functions that simulate the system and calculate the loss, given a parameter `p` corresponding to the length.
 
 ```@example PEM
 function simulate(p)
-    _prob = remake(prob,p=p)
-    solve(_prob, Tsit5(), saveat=tsteps, abstol = 1e-8, reltol = 1e-8)
+    _prob = remake(prob, p = p)
+    solve(_prob, Tsit5(), saveat = tsteps, abstol = 1e-8, reltol = 1e-8)
 end
 
 function simloss(p)
@@ -58,47 +60,48 @@ function simloss(p)
     if !SciMLBase.successful_retcode(yh.retcode)
         return Inf
     end
-    e2 = yh[1,:]
+    e2 = yh[1, :]
     e2 .= abs2.(y .- e2)
     return mean(e2)
 end
 ```
+
 We now look at the loss landscape as a function of the pendulum length:
 
 ```@example PEM
 Ls = 0.01:0.01:2
 simlosses = simloss.(Ls)
-fig_loss = plot(Ls, simlosses, title = "Loss landscape", xlabel="Pendulum length", ylabel = "MSE loss", lab="Simulation loss")
+fig_loss = plot(Ls, simlosses, title = "Loss landscape", xlabel = "Pendulum length",
+                ylabel = "MSE loss", lab = "Simulation loss")
 ```
 
-
 This figure is interesting, the loss is of course 0 for the true value $L=1$, but for values $L < 1$, the overall slope actually points in the wrong direction! Moreover, the loss is oscillatory, indicating that this is a terrible function to optimize, and that we would need a very good initial guess for a local search to converge to the true value. Note, this example is chosen to be one-dimensional in order to allow these kinds of visualizations, and one-dimensional problems are typically not hard to solve, but the reasoning extends to higher-dimensional and harder problems.
 
-We will now move on to defining a *predictor* model. Our predictor will be very simple, each time step, we will calculate the error $e$ between the simulated angle $\theta$ and the measured angle $y$. A part of this error will be used to correct the state of the pendulum. The correction we use is linear and looks like $Ke = K(y - \theta)$. We have formed what is commonly referred to as a (linear) *observer*. The [Kalman filter](https://en.wikipedia.org/wiki/Kalman_filter) is a particular kind of linear observer, where $K$ is calculated based on a statistical model of the disturbances that act on the system. We will stay with a simple, fixed-gain observer here for simplicity. 
+We will now move on to defining a *predictor* model. Our predictor will be very simple: at each time step, we will calculate the error $e$ between the simulated angle $\theta$ and the measured angle $y$. A part of this error will be used to correct the state of the pendulum. The correction we use is linear and looks like $Ke = K(y - \theta)$. We have formed what is commonly referred to as a (linear) *observer*. The [Kalman filter](https://en.wikipedia.org/wiki/Kalman_filter) is a particular kind of linear observer, where $K$ is calculated based on a statistical model of the disturbances that act on the system. We will stay with a simple, fixed-gain observer here for simplicity.
 
 To feed the sampled data into the continuous-time simulation, we make use of an interpolator. We also define new functions, `predictor` that contains the pendulum dynamics with the observer correction, a `prediction` function that performs the rollout (we're not using the word simulation to not confuse with the setting above) and a loss function.
 
 ```@example PEM
-y_int = LinearInterpolation(y,tsteps)
+y_int = LinearInterpolation(y, tsteps)
 
-function predictor(du,u,p,t)
+function predictor(du, u, p, t)
     g = 9.82
     L, K, y = p # pendulum length, observer gain and measurements
-    gL = g/L
-    θ  = u[1]
+    gL = g / L
+    θ = u[1]
     dθ = u[2]
     yt = y(t)
     e = yt - θ
-    du[1] = dθ + K*e
-    du[2] = -gL * sin(θ) 
+    du[1] = dθ + K * e
+    du[2] = -gL * sin(θ)
 end
 
-predprob = ODEProblem(predictor,u0,tspan,nothing)
+predprob = ODEProblem(predictor, u0, tspan, nothing)
 
 function prediction(p)
     p_full = (p..., y_int)
-    _prob = remake(predprob,u0=eltype(p).(u0),p=p_full)
-    solve(_prob, Tsit5(), saveat=tsteps, abstol = 1e-8, reltol = 1e-8)
+    _prob = remake(predprob, u0 = eltype(p).(u0), p = p_full)
+    solve(_prob, Tsit5(), saveat = tsteps, abstol = 1e-8, reltol = 1e-8)
 end
 
 function predloss(p)
@@ -106,7 +109,7 @@ function predloss(p)
     if !SciMLBase.successful_retcode(yh.retcode)
         return Inf
     end
-    e2 = yh[1,:]
+    e2 = yh[1, :]
     e2 .= abs2.(y .- e2)
     return mean(e2)
 end
@@ -116,10 +119,9 @@ predlosses = map(Ls) do L
     predloss(p)
 end
 
-plot!(Ls, predlosses, lab="Prediction loss")
+plot!(Ls, predlosses, lab = "Prediction loss")
 ```
 
-
 Once again, we look at the loss as a function of the parameter, and this time it looks a lot better. The loss is not convex, but the gradient points in the right direction over a much larger interval. Here, we arbitrarily set the observer gain to $K=1$; we will later let the optimizer learn this parameter.
 
 For completeness, we also perform estimation using both losses. We choose an initial guess we know will be hard for the simulation-error minimization just to drive home the point:
@@ -127,27 +129,26 @@ For completeness, we also perform estimation using both losses. We choose an ini
 ```@example PEM
 L0 = [0.7] # Initial guess of pendulum length
 adtype = Optimization.AutoForwardDiff()
-optf = Optimization.OptimizationFunction((x,p)->simloss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> simloss(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, L0)
 
 ressim = Optimization.solve(optprob, PolyOpt(),
-                                    maxiters = 5000)
-ysim = simulate(ressim.u)[1,:]
+                            maxiters = 5000)
+ysim = simulate(ressim.u)[1, :]
 
-plot(tsteps, [y ysim], label=["Data" "Simulation model"])
+plot(tsteps, [y ysim], label = ["Data" "Simulation model"])
 
 p0 = [0.7, 1.0] # Initial guess of length and observer gain K
-optf2 = Optimization.OptimizationFunction((p,_)->predloss(p), adtype)
+optf2 = Optimization.OptimizationFunction((p, _) -> predloss(p), adtype)
 optprob2 = Optimization.OptimizationProblem(optf2, p0)
 
 respred = Optimization.solve(optprob2, PolyOpt(),
-                                    maxiters = 5000)
-ypred = simulate(respred.u)[1,:]
+                             maxiters = 5000)
+ypred = simulate(respred.u)[1, :]
 
-plot!(tsteps, ypred, label="Prediction model")
+plot!(tsteps, ypred, label = "Prediction model")
 ```
 
-
 The estimated parameters $(L, K)$ are
 
 ```@example PEM
@@ -155,38 +156,37 @@ respred.u
 ```
 
 Now, we might ask ourselves why we used a correction of the form $Ke$ and didn't instead set the angle in the simulation *equal* to the measurement. The reason is twofold:
-1. If our prediction of the angle is 100% based on the measurements, the model parameters do not matter for the prediction, and we thus cannot hope to learn their values.
-2. The measurement is usually noisy, and we thus want to *fuse* the predictive power of the model with the information of the measurements. The Kalman filter is an optimal approach to this information fusion under special circumstances (linear model, Gaussian noise).
 
-We thus let the optimization *learn* the best value of the observer gain in order to make the best predictions. 
+ 1. If our prediction of the angle is 100% based on the measurements, the model parameters do not matter for the prediction, and we thus cannot hope to learn their values.
+ 2. The measurement is usually noisy, and we thus want to *fuse* the predictive power of the model with the information of the measurements. The Kalman filter is an optimal approach to this information fusion under special circumstances (linear model, Gaussian noise).
+
+We thus let the optimization *learn* the best value of the observer gain in order to make the best predictions.
 
 As a last step, we perform the estimation also with some measurement noise to verify that it does something reasonable:
 
 ```@example PEM
 yn = y .+ 0.1f0 .* randn.(Float32)
-y_int = LinearInterpolation(yn,tsteps) # redefine the interpolator to contain noisy measurements
+y_int = LinearInterpolation(yn, tsteps) # redefine the interpolator to contain noisy measurements
 
-optf = Optimization.OptimizationFunction((x,p)->predloss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> predloss(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, p0)
 
 resprednoise = Optimization.solve(optprob, PolyOpt(),
-                                    maxiters = 5000)
+                                  maxiters = 5000)
 
-yprednoise = prediction(resprednoise.u)[1,:]
-plot!(tsteps, yprednoise, label="Prediction model with noisy measurements")
+yprednoise = prediction(resprednoise.u)[1, :]
+plot!(tsteps, yprednoise, label = "Prediction model with noisy measurements")
 ```
 
-
 ```@example PEM
 resprednoise.u
 ```
 
 This example has illustrated basic use of the prediction-error method for parameter estimation. In our example, the measurement we had corresponded directly to one of the states, and coming up with an observer/predictor that worked was not too hard. For more difficult cases, we may opt to use a nonlinear observer, such as an extended Kalman filter (EKF) or design a Kalman filter based on a linearization of the system around some operating point.
 
-As a last note, there are several other methods available to improve the loss landscape and avoid local minima, such as multiple-shooting. The prediction-error method can easily be combined with most of those methods. 
+As a last note, there are several other methods available to improve the loss landscape and avoid local minima, such as multiple-shooting. The prediction-error method can easily be combined with most of those methods.
 
 References:
 
 [^Ljung]: Ljung, Lennart. "System identification---Theory for the user".
-
 [^Larsson]: Larsson, Roger, et al. "Direct prediction-error identification of unstable nonlinear systems applied to flight test data."
diff --git a/docs/src/examples/ode/second_order_adjoints.md b/docs/src/examples/ode/second_order_adjoints.md
index 5f59afb4e..2b72b4874 100644
--- a/docs/src/examples/ode/second_order_adjoints.md
+++ b/docs/src/examples/ode/second_order_adjoints.md
@@ -23,19 +23,19 @@ tsteps = range(tspan[1], tspan[2], length = datasize)
 
 function trueODEfunc(du, u, p, t)
     true_A = [-0.1 2.0; -2.0 -0.1]
-    du .= ((u.^3)'true_A)'
+    du .= ((u .^ 3)'true_A)'
 end
 
 prob_trueode = ODEProblem(trueODEfunc, u0, tspan)
 ode_data = Array(solve(prob_trueode, Tsit5(), saveat = tsteps))
 
-dudt2 = Flux.Chain(x -> x.^3,
+dudt2 = Flux.Chain(x -> x .^ 3,
                    Flux.Dense(2, 50, tanh),
                    Flux.Dense(50, 2))
 prob_neuralode = NeuralODE(dudt2, tspan, Tsit5(), saveat = tsteps)
 
 function predict_neuralode(p)
-  Array(prob_neuralode(u0, p)[1])
+    Array(prob_neuralode(u0, p)[1])
 end
 
 function loss_neuralode(p)
@@ -48,35 +48,37 @@ end
 list_plots = []
 iter = 0
 callback = function (p, l, pred; doplot = false)
-  global list_plots, iter
+    global list_plots, iter
 
-  if iter == 0
-    list_plots = []
-  end
-  iter += 1
+    if iter == 0
+        list_plots = []
+    end
+    iter += 1
 
-  display(l)
+    display(l)
 
-  # plot current prediction against data
-  plt = scatter(tsteps, ode_data[1,:], label = "data")
-  scatter!(plt, tsteps, pred[1,:], label = "prediction")
-  push!(list_plots, plt)
-  if doplot
-    display(plot(plt))
-  end
+    # plot current prediction against data
+    plt = scatter(tsteps, ode_data[1, :], label = "data")
+    scatter!(plt, tsteps, pred[1, :], label = "prediction")
+    push!(list_plots, plt)
+    if doplot
+        display(plot(plt))
+    end
 
-  return l < 0.01
+    return l < 0.01
 end
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_neuralode(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_neuralode(x), adtype)
 
 optprob1 = Optimization.OptimizationProblem(optf, prob_neuralode.p)
-pstart = Optimization.solve(optprob1, ADAM(0.01), callback=callback, maxiters = 100).u
+pstart = Optimization.solve(optprob1, ADAM(0.01), callback = callback, maxiters = 100).u
 
 optprob2 = Optimization.OptimizationProblem(optf, pstart)
-pmin = Optimization.solve(optprob2, NewtonTrustRegion(), callback=callback, maxiters = 200)
-pmin = Optimization.solve(optprob2, Optim.KrylovTrustRegion(), callback=callback, maxiters = 200)
+pmin = Optimization.solve(optprob2, NewtonTrustRegion(), callback = callback,
+                          maxiters = 200)
+pmin = Optimization.solve(optprob2, Optim.KrylovTrustRegion(), callback = callback,
+                          maxiters = 200)
 ```
 
 Note that we do not demonstrate `Newton()` because we have not found a single
diff --git a/docs/src/examples/ode/second_order_neural.md b/docs/src/examples/ode/second_order_neural.md
index 330d70a42..3085b4e17 100644
--- a/docs/src/examples/ode/second_order_neural.md
+++ b/docs/src/examples/ode/second_order_neural.md
@@ -21,21 +21,22 @@ neural network by the mass!)
 An example of training a neural network on a second order ODE is as follows:
 
 ```@example secondorderneural
-using DifferentialEquations, Flux, Optimization, OptimizationFlux, RecursiveArrayTools, Random
+using DifferentialEquations, Flux, Optimization, OptimizationFlux, RecursiveArrayTools,
+      Random
 
-u0 = Float32[0.; 2.]
-du0 = Float32[0.; 0.]
+u0 = Float32[0.0; 2.0]
+du0 = Float32[0.0; 0.0]
 tspan = (0.0f0, 1.0f0)
-t = range(tspan[1], tspan[2], length=20)
+t = range(tspan[1], tspan[2], length = 20)
 
 model = Flux.Chain(Flux.Dense(2, 50, tanh), Flux.Dense(50, 2))
-p,re = Flux.destructure(model)
+p, re = Flux.destructure(model)
 
-ff(du,u,p,t) = re(p)(u)
+ff(du, u, p, t) = re(p)(u)
 prob = SecondOrderODEProblem{false}(ff, du0, u0, tspan, p)
 
 function predict(p)
-    Array(solve(prob, Tsit5(), p=p, saveat=t))
+    Array(solve(prob, Tsit5(), p = p, saveat = t))
 end
 
 correct_pos = Float32.(transpose(hcat(collect(0:0.05:1)[2:end], collect(2:-0.05:1)[2:end])))
@@ -50,13 +51,13 @@ opt = ADAM(0.01)
 
 l1 = loss_n_ode(p)
 
-callback = function (p,l,pred)
+callback = function (p, l, pred)
     println(l)
     l < 0.01
 end
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_n_ode(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_n_ode(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, p)
 
-res = Optimization.solve(optprob, opt; callback = callback, maxiters=1000)
+res = Optimization.solve(optprob, opt; callback = callback, maxiters = 1000)
 ```
diff --git a/docs/src/examples/optimal_control/feedback_control.md b/docs/src/examples/optimal_control/feedback_control.md
index ca6371949..413261023 100644
--- a/docs/src/examples/optimal_control/feedback_control.md
+++ b/docs/src/examples/optimal_control/feedback_control.md
@@ -10,7 +10,7 @@ on the current state of the dynamical system that will control the second
 equation to stay close to 1.
 
 ```@example udeneuralcontrol
-using Flux, Optimization, OptimizationPolyalgorithms, 
+using Flux, Optimization, OptimizationPolyalgorithms,
       SciMLSensitivity, Zygote, DifferentialEquations, Plots, Random
 
 rng = Random.default_rng()
@@ -19,11 +19,11 @@ tspan = (0.0f0, 25.0f0)
 tsteps = 0.0f0:1.0:25.0f0
 
 model_univ = Flux.Chain(Flux.Dense(2, 16, tanh),
-                       Flux.Dense(16, 16, tanh),
-                       Flux.Dense(16, 1))
+                        Flux.Dense(16, 16, tanh),
+                        Flux.Dense(16, 1))
 
 # The model weights are destructured into a vector of parameters
-p_model,re = Flux.destructure(model_univ)
+p_model, re = Flux.destructure(model_univ)
 n_weights = length(p_model)
 
 # Parameters of the second equation (linear dynamics)
@@ -44,23 +44,23 @@ function dudt_univ!(du, u, p, t)
 
     # Dynamics of the control and system
     dmodel_control = re(model_weights)(u)[1]
-    dsystem_output = α*system_output + β*model_control
+    dsystem_output = α * system_output + β * model_control
 
     # Update in place
     du[1] = dmodel_control
     du[2] = dsystem_output
 end
 
-prob_univ = ODEProblem(dudt_univ!, [0f0, u0], tspan, p_all)
-sol_univ = solve(prob_univ, Tsit5(),abstol = 1e-8, reltol = 1e-6)
+prob_univ = ODEProblem(dudt_univ!, [0.0f0, u0], tspan, p_all)
+sol_univ = solve(prob_univ, Tsit5(), abstol = 1e-8, reltol = 1e-6)
 
 function predict_univ(θ)
-  return Array(solve(prob_univ, Tsit5(), u0=[0f0, θ[1]], p=θ[2:end],
-                              sensealg = InterpolatingAdjoint(autojacvec=ReverseDiffVJP(true)),
-                              saveat = tsteps))
+    return Array(solve(prob_univ, Tsit5(), u0 = [0.0f0, θ[1]], p = θ[2:end],
+                       sensealg = InterpolatingAdjoint(autojacvec = ReverseDiffVJP(true)),
+                       saveat = tsteps))
 end
 
-loss_univ(θ) = sum(abs2, predict_univ(θ)[2,:] .- 1)
+loss_univ(θ) = sum(abs2, predict_univ(θ)[2, :] .- 1)
 l = loss_univ(θ)
 ```
 
@@ -68,26 +68,26 @@ l = loss_univ(θ)
 list_plots = []
 iter = 0
 callback = function (θ, l)
-  global list_plots, iter
+    global list_plots, iter
 
-  if iter == 0
-    list_plots = []
-  end
-  iter += 1
+    if iter == 0
+        list_plots = []
+    end
+    iter += 1
 
-  println(l)
+    println(l)
 
-  plt = plot(predict_univ(θ)', ylim = (0, 6))
-  push!(list_plots, plt)
-  display(plt)
-  return false
+    plt = plot(predict_univ(θ)', ylim = (0, 6))
+    push!(list_plots, plt)
+    display(plt)
+    return false
 end
 ```
 
 ```@example udeneuralcontrol
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_univ(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_univ(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, θ)
 result_univ = Optimization.solve(optprob, PolyOpt(),
-                                     callback = callback)
+                                 callback = callback)
 ```
diff --git a/docs/src/examples/optimal_control/optimal_control.md b/docs/src/examples/optimal_control/optimal_control.md
index 1890344da..a9e10f3a1 100644
--- a/docs/src/examples/optimal_control/optimal_control.md
+++ b/docs/src/examples/optimal_control/optimal_control.md
@@ -36,60 +36,62 @@ will first reduce control cost (the last term) by 10x in order to bump the netwo
 of a local minimum. This looks like:
 
 ```@example neuraloptimalcontrol
-using Flux, DifferentialEquations, Optimization, OptimizationNLopt, OptimizationFlux, 
+using Flux, DifferentialEquations, Optimization, OptimizationNLopt, OptimizationFlux,
       SciMLSensitivity, Zygote, Plots, Statistics, Random
 
 rng = Random.default_rng()
-tspan = (0.0f0,8.0f0)
-ann = Flux.Chain(Flux.Dense(1,32,tanh), Flux.Dense(32,32,tanh), Flux.Dense(32,1))
+tspan = (0.0f0, 8.0f0)
+ann = Flux.Chain(Flux.Dense(1, 32, tanh), Flux.Dense(32, 32, tanh), Flux.Dense(32, 1))
 θ, re = Flux.destructure(ann)
-function dxdt_(dx,x,p,t)
+function dxdt_(dx, x, p, t)
     x1, x2 = x
     dx[1] = x[2]
     dx[2] = re(p)([t])[1]^3
 end
-x0 = [-4f0,0f0]
+x0 = [-4.0f0, 0.0f0]
 ts = Float32.(collect(0.0:0.01:tspan[2]))
-prob = ODEProblem(dxdt_,x0,tspan,θ)
-solve(prob,Vern9(),abstol=1e-10,reltol=1e-10)
+prob = ODEProblem(dxdt_, x0, tspan, θ)
+solve(prob, Vern9(), abstol = 1e-10, reltol = 1e-10)
 
 function predict_adjoint(θ)
-  Array(solve(prob,Vern9(),p=θ,saveat=ts,sensealg=InterpolatingAdjoint(autojacvec=ReverseDiffVJP(true))))
+    Array(solve(prob, Vern9(), p = θ, saveat = ts,
+                sensealg = InterpolatingAdjoint(autojacvec = ReverseDiffVJP(true))))
 end
 function loss_adjoint(θ)
-  x = predict_adjoint(θ)
-  mean(abs2,4.0 .- x[1,:]) + 2mean(abs2,x[2,:]) + mean(abs2,[first(re(θ)([t])) for t in ts])/10
+    x = predict_adjoint(θ)
+    mean(abs2, 4.0 .- x[1, :]) + 2mean(abs2, x[2, :]) +
+    mean(abs2, [first(re(θ)([t])) for t in ts]) / 10
 end
 
 l = loss_adjoint(θ)
-callback = function (θ,l; doplot=false)
-  println(l)
+callback = function (θ, l; doplot = false)
+    println(l)
 
-  if doplot
-    p = plot(solve(remake(prob,p=θ),Tsit5(),saveat=0.01),ylim=(-6,6),lw=3)
-    plot!(p,ts,[first(re(θ)([t])) for t in ts],label="u(t)",lw=3)
-    display(p)
-  end
+    if doplot
+        p = plot(solve(remake(prob, p = θ), Tsit5(), saveat = 0.01), ylim = (-6, 6), lw = 3)
+        plot!(p, ts, [first(re(θ)([t])) for t in ts], label = "u(t)", lw = 3)
+        display(p)
+    end
 
-  return false
+    return false
 end
 
 # Display the ODE with the current parameter values.
 
-callback(θ,l)
+callback(θ, l)
 
 # Setup and run the optimization
 
 loss1 = loss_adjoint(θ)
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_adjoint(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_adjoint(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, θ)
-res1 = Optimization.solve(optprob, ADAM(0.005), callback = callback,maxiters=100)
+res1 = Optimization.solve(optprob, ADAM(0.005), callback = callback, maxiters = 100)
 
 optprob2 = Optimization.OptimizationProblem(optf, res1.u)
 res2 = Optimization.solve(optprob2,
-                              NLopt.LD_LBFGS(), maxiters=100)
+                          NLopt.LD_LBFGS(), maxiters = 100)
 ```
 
 Now that the system is in a better behaved part of parameter space, we return to
@@ -97,23 +99,24 @@ the original loss function to finish the optimization:
 
 ```@example neuraloptimalcontrol
 function loss_adjoint(θ)
-  x = predict_adjoint(θ)
-  mean(abs2,4.0 .- x[1,:]) + 2mean(abs2,x[2,:]) + mean(abs2,[first(re(θ)([t])) for t in ts])
+    x = predict_adjoint(θ)
+    mean(abs2, 4.0 .- x[1, :]) + 2mean(abs2, x[2, :]) +
+    mean(abs2, [first(re(θ)([t])) for t in ts])
 end
-optf3 = Optimization.OptimizationFunction((x,p)->loss_adjoint(x), adtype)
+optf3 = Optimization.OptimizationFunction((x, p) -> loss_adjoint(x), adtype)
 
 optprob3 = Optimization.OptimizationProblem(optf3, res2.u)
 res3 = Optimization.solve(optprob3,
-                              NLopt.LD_LBFGS(),maxiters=100)
+                          NLopt.LD_LBFGS(), maxiters = 100)
 ```
 
 Now let's see what we received:
 
 ```@example neuraloptimalcontrol
 l = loss_adjoint(res3.u)
-callback(res3.u,l)
-p = plot(solve(remake(prob,p=res3.u),Tsit5(),saveat=0.01),ylim=(-6,6),lw=3)
-plot!(p,ts,[first(re(res3.u)([t])) for t in ts],label="u(t)",lw=3)
+callback(res3.u, l)
+p = plot(solve(remake(prob, p = res3.u), Tsit5(), saveat = 0.01), ylim = (-6, 6), lw = 3)
+plot!(p, ts, [first(re(res3.u)([t])) for t in ts], label = "u(t)", lw = 3)
 ```
 
 ![](https://user-images.githubusercontent.com/1814174/81859169-db65b280-9532-11ea-8394-dbb5efcd4036.png)
diff --git a/docs/src/examples/pde/pde_constrained.md b/docs/src/examples/pde/pde_constrained.md
index bad9dd86c..6f62bb09c 100644
--- a/docs/src/examples/pde/pde_constrained.md
+++ b/docs/src/examples/pde/pde_constrained.md
@@ -4,91 +4,92 @@ This example uses a prediction model to optimize the one-dimensional Heat Equati
 (Step-by-step description below)
 
 ```@example pde
-using DelimitedFiles,Plots
+using DelimitedFiles, Plots
 using DifferentialEquations, Optimization, OptimizationPolyalgorithms, Zygote
 
 # Problem setup parameters:
 Lx = 10.0
-x  = 0.0:0.01:Lx
+x = 0.0:0.01:Lx
 dx = x[2] - x[1]
 Nx = size(x)
 
-u0 = exp.(-(x.-3.0).^2) # I.C
+u0 = exp.(-(x .- 3.0) .^ 2) # I.C
 
 ## Problem Parameters
-p        = [1.0,1.0]    # True solution parameters
-xtrs     = [dx,Nx]      # Extra parameters
-dt       = 0.40*dx^2    # CFL condition
-t0, tMax = 0.0 ,1000*dt
-tspan    = (t0,tMax)
-t        = t0:dt:tMax;
+p = [1.0, 1.0]    # True solution parameters
+xtrs = [dx, Nx]      # Extra parameters
+dt = 0.40 * dx^2    # CFL condition
+t0, tMax = 0.0, 1000 * dt
+tspan = (t0, tMax)
+t = t0:dt:tMax;
 
 ## Definition of Auxiliary functions
-function ddx(u,dx)
+function ddx(u, dx)
     """
     2nd order Central difference for 1st degree derivative
     """
-    return [[zero(eltype(u))] ; (u[3:end] - u[1:end-2]) ./ (2.0*dx) ; [zero(eltype(u))]]
+    return [[zero(eltype(u))]; (u[3:end] - u[1:(end - 2)]) ./ (2.0 * dx); [zero(eltype(u))]]
 end
 
-
-function d2dx(u,dx)
+function d2dx(u, dx)
     """
     2nd order Central difference for 2nd degree derivative
     """
-    return [[zero(eltype(u))]; (u[3:end] - 2.0.*u[2:end-1] + u[1:end-2]) ./ (dx^2); [zero(eltype(u))]]
+    return [[zero(eltype(u))];
+            (u[3:end] - 2.0 .* u[2:(end - 1)] + u[1:(end - 2)]) ./ (dx^2);
+            [zero(eltype(u))]]
 end
 
 ## ODE description of the Physics:
-function heat(u,p,t)
+function heat(u, p, t)
     # Model parameters
     a0, a1 = p
-    dx,Nx = xtrs #[1.0,3.0,0.125,100]
-    return 2.0*a0 .* u +  a1 .* d2dx(u, dx)
+    dx, Nx = xtrs #[1.0,3.0,0.125,100]
+    return 2.0 * a0 .* u + a1 .* d2dx(u, dx)
 end
 
 # Testing Solver on linear PDE
-prob = ODEProblem(heat,u0,tspan,p)
-sol = solve(prob,Tsit5(), dt=dt,saveat=t);
+prob = ODEProblem(heat, u0, tspan, p)
+sol = solve(prob, Tsit5(), dt = dt, saveat = t);
 
-plot(x, sol.u[1], lw=3, label="t0", size=(800,500))
-plot!(x, sol.u[end],lw=3, ls=:dash, label="tMax")
+plot(x, sol.u[1], lw = 3, label = "t0", size = (800, 500))
+plot!(x, sol.u[end], lw = 3, ls = :dash, label = "tMax")
 
-ps  = [0.1, 0.2];   # Initial guess for model parameters
+ps = [0.1, 0.2];   # Initial guess for model parameters
 function predict(θ)
-    Array(solve(prob,Tsit5(),p=θ,dt=dt,saveat=t))
+    Array(solve(prob, Tsit5(), p = θ, dt = dt, saveat = t))
 end
 
 ## Defining Loss function
 function loss(θ)
     pred = predict(θ)
-    l = predict(θ)  - sol
+    l = predict(θ) - sol
     return sum(abs2, l), pred # Mean squared error
 end
 
-l,pred   = loss(ps)
+l, pred = loss(ps)
 size(pred), size(sol), size(t) # Checking sizes
 
-LOSS  = []                              # Loss accumulator
-PRED  = []                              # prediction accumulator
-PARS  = []                              # parameters accumulator
+LOSS = []                              # Loss accumulator
+PRED = []                              # prediction accumulator
+PARS = []                              # parameters accumulator
 
-callback = function (θ,l,pred) #callback function to observe training
-  display(l)
-  append!(PRED, [pred])
-  append!(LOSS, l)
-  append!(PARS, [θ])
-  false
+callback = function (θ, l, pred) #callback function to observe training
+    display(l)
+    append!(PRED, [pred])
+    append!(LOSS, l)
+    append!(PARS, [θ])
+    false
 end
 
-callback(ps,loss(ps)...) # Testing callback function
+callback(ps, loss(ps)...) # Testing callback function
 
 # Let's see prediction vs. Truth
-scatter(sol[:,end], label="Truth", size=(800,500))
-plot!(PRED[end][:,end], lw=2, label="Prediction")
+scatter(sol[:, end], label = "Truth", size = (800, 500))
+plot!(PRED[end][:, end], lw = 2, label = "Prediction")
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, ps)
 res = Optimization.solve(optprob, PolyOpt(), callback = callback)
@@ -100,8 +101,8 @@ res = Optimization.solve(optprob, PolyOpt(), callback = callback)
 ### Load Packages
 
 ```@example pde2
-using DelimitedFiles,Plots
-using DifferentialEquations, Optimization, OptimizationPolyalgorithms, 
+using DelimitedFiles, Plots
+using DifferentialEquations, Optimization, OptimizationPolyalgorithms,
       Zygote
 ```
 
@@ -114,32 +115,32 @@ steps of **4.0e-5**.
 ```@example pde2
 # Problem setup parameters:
 Lx = 10.0
-x  = 0.0:0.01:Lx
+x = 0.0:0.01:Lx
 dx = x[2] - x[1]
 Nx = size(x)
 
-u0 = exp.(-(x.-3.0).^2) # I.C
+u0 = exp.(-(x .- 3.0) .^ 2) # I.C
 
 ## Problem Parameters
-p        = [1.0,1.0]    # True solution parameters
-xtrs     = [dx,Nx]      # Extra parameters
-dt       = 0.40*dx^2    # CFL condition
-t0, tMax = 0.0 ,1000*dt
-tspan    = (t0,tMax)
-t        = t0:dt:tMax;
+p = [1.0, 1.0]    # True solution parameters
+xtrs = [dx, Nx]      # Extra parameters
+dt = 0.40 * dx^2    # CFL condition
+t0, tMax = 0.0, 1000 * dt
+tspan = (t0, tMax)
+t = t0:dt:tMax;
 ```
 
 In plain terms, the quantities that were defined are:
 
-- `x` (to `Lx`) spans the specified 1D space
-- `dx` = distance between two points
-- `Nx` = total size of space
-- `u0` = initial condition
-- `p` = true solution
-- `xtrs` = convenient grouping of `dx` and `Nx` into Array
-- `dt` = time distance between two points
-- `t` (`t0` to `tMax`) spans the specified time frame
-- `tspan` = span of `t`
+  - `x` (to `Lx`) spans the specified 1D space
+  - `dx` = distance between two points
+  - `Nx` = total size of space
+  - `u0` = initial condition
+  - `p` = true solution
+  - `xtrs` = convenient grouping of `dx` and `Nx` into an array
+  - `dt` = time distance between two points
+  - `t` (`t0` to `tMax`) spans the specified time frame
+  - `tspan` = span of `t`
 
 ### Auxiliary Functions
 
@@ -148,19 +149,20 @@ Difference** is used in both the 1st and 2nd degree derivatives.
 
 ```@example pde2
 ## Definition of Auxiliary functions
-function ddx(u,dx)
+function ddx(u, dx)
     """
     2nd order Central difference for 1st degree derivative
     """
-    return [[zero(eltype(u))] ; (u[3:end] - u[1:end-2]) ./ (2.0*dx) ; [zero(eltype(u))]]
+    return [[zero(eltype(u))]; (u[3:end] - u[1:(end - 2)]) ./ (2.0 * dx); [zero(eltype(u))]]
 end
 
-
-function d2dx(u,dx)
+function d2dx(u, dx)
     """
     2nd order Central difference for 2nd degree derivative
     """
-    return [[zero(eltype(u))]; (u[3:end] - 2.0.*u[2:end-1] + u[1:end-2]) ./ (dx^2); [zero(eltype(u))]]
+    return [[zero(eltype(u))];
+            (u[3:end] - 2.0 .* u[2:(end - 1)] + u[1:(end - 2)]) ./ (dx^2);
+            [zero(eltype(u))]]
 end
 ```
 
@@ -170,11 +172,11 @@ Next, we set up our desired set of equations in order to define our problem.
 
 ```@example pde2
 ## ODE description of the Physics:
-function heat(u,p,t)
+function heat(u, p, t)
     # Model parameters
     a0, a1 = p
-    dx,Nx = xtrs #[1.0,3.0,0.125,100]
-    return 2.0*a0 .* u +  a1 .* d2dx(u, dx)
+    dx, Nx = xtrs # unpack the global `xtrs = [dx, Nx]` from the problem setup
+    return 2.0 * a0 .* u + a1 .* d2dx(u, dx)
 end
 ```
 
@@ -185,11 +187,11 @@ will compare to further on.
 
 ```@example pde2
 # Testing Solver on linear PDE
-prob = ODEProblem(heat,u0,tspan,p)
-sol = solve(prob,Tsit5(), dt=dt,saveat=t);
+prob = ODEProblem(heat, u0, tspan, p)
+sol = solve(prob, Tsit5(), dt = dt, saveat = t);
 
-plot(x, sol.u[1], lw=3, label="t0", size=(800,500))
-plot!(x, sol.u[end],lw=3, ls=:dash, label="tMax")
+plot(x, sol.u[1], lw = 3, label = "t0", size = (800, 500))
+plot!(x, sol.u[end], lw = 3, ls = :dash, label = "tMax")
 ```
 
 ### Building the Prediction Model
@@ -200,9 +202,9 @@ non-linear transformation in one layer using `solve`. If unfamiliar with the con
 refer to [here](https://julialang.org/blog/2019/01/fluxdiffeq/).
 
 ```@example pde2
-ps  = [0.1, 0.2];   # Initial guess for model parameters
+ps = [0.1, 0.2];   # Initial guess for model parameters
 function predict(θ)
-    Array(solve(prob,Tsit5(),p=θ,dt=dt,saveat=t))
+    Array(solve(prob, Tsit5(), p = θ, dt = dt, saveat = t))
 end
 ```
 
@@ -221,11 +223,11 @@ use the **mean squared error**.
 ## Defining Loss function
 function loss(θ)
     pred = predict(θ)
-    l = predict(θ)  - sol
+    l = pred - sol # reuse `pred`; avoids solving the ODE a second time
     return sum(abs2, l), pred # Mean squared error
 end
 
-l,pred   = loss(ps)
+l, pred = loss(ps)
 size(pred), size(sol), size(t) # Checking sizes
 ```
 
@@ -241,19 +243,19 @@ loss, the previous predictions and the previous parameters with `LOSS`, `PRED` a
 accumulators.
 
 ```@example pde2
-LOSS  = []                              # Loss accumulator
-PRED  = []                              # prediction accumulator
-PARS  = []                              # parameters accumulator
-
-callback = function (θ,l,pred) #callback function to observe training
-  display(l)
-  append!(PRED, [pred])
-  append!(LOSS, l)
-  append!(PARS, [θ])
-  false
+LOSS = []                              # Loss accumulator
+PRED = []                              # prediction accumulator
+PARS = []                              # parameters accumulator
+
+callback = function (θ, l, pred) #callback function to observe training
+    display(l)
+    append!(PRED, [pred])
+    append!(LOSS, l)
+    append!(PARS, [θ])
+    false
 end
 
-callback(ps,loss(ps)...) # Testing callback function
+callback(ps, loss(ps)...) # Testing callback function
 ```
 
 ### Plotting Prediction vs Ground Truth
@@ -264,8 +266,8 @@ almost perfectly when the PDE finishes its training and the loss is close to 0.
 
 ```@example pde2
 # Let see prediction vs. Truth
-scatter(sol[:,end], label="Truth", size=(800,500))
-plot!(PRED[end][:,end], lw=2, label="Prediction")
+scatter(sol[:, end], label = "Truth", size = (800, 500))
+plot!(PRED[end][:, end], lw = 2, label = "Prediction")
 ```
 
 ### Train
@@ -276,7 +278,7 @@ parameters that minimize the cost function.
 
 ```@example pde2
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, ps)
 res = Optimization.solve(optprob, PolyOpt(), callback = callback)
@@ -284,4 +286,4 @@ res = Optimization.solve(optprob, PolyOpt(), callback = callback)
 ```
 
 We successfully predict the final `ps` to be equal to **[0.999999999999975,
-1.0000000000000213]** vs the true solution of `p` = **[1.0, 1.0]**
\ No newline at end of file
+1.0000000000000213]** vs the true solution of `p` = **[1.0, 1.0]**
diff --git a/docs/src/examples/sde/SDE_control.md b/docs/src/examples/sde/SDE_control.md
index 20f845f27..1fd43560a 100644
--- a/docs/src/examples/sde/SDE_control.md
+++ b/docs/src/examples/sde/SDE_control.md
@@ -14,6 +14,7 @@ to the control parameter Ω(t) which rotates the quantum state about the `x`-axi
 to ultimately prepare and stabilize the qubit in the excited state.
 
 ## Copy-Pasteable Code
+
 Before getting to the explanation, here's some code to start with. We will
 follow a full explanation of the definition and training process:
 
@@ -26,7 +27,6 @@ using StochasticDiffEq, DiffEqCallbacks, DiffEqNoiseProcess
 using Statistics, LinearAlgebra, Random
 using Plots
 
-
 #################################################
 lr = 0.01f0
 epochs = 100
@@ -39,8 +39,8 @@ dt = 0.0005f0
 tinterval = 0.05f0
 tstart = 0.0f0
 Nintervals = 20 # total number of intervals, total time = t_interval*Nintervals
-tspan = (tstart,tinterval*Nintervals)
-ts = Array(tstart:dt:(Nintervals*tinterval+dt)) # time array for noise grid
+tspan = (tstart, tinterval * Nintervals)
+ts = Array(tstart:dt:(Nintervals * tinterval + dt)) # time array for noise grid
 
 # Hamiltonian parameters
 Δ = 20.0f0
@@ -50,48 +50,53 @@ ts = Array(tstart:dt:(Nintervals*tinterval+dt)) # time array for noise grid
 # loss hyperparameters
 C1 = Float32(1.0)  # evolution state fidelity
 
-struct Parameters{flType,intType,tType}
-  lr::flType
-  epochs::intType
-  numtraj::intType
-  numtrajplot::intType
-  dt::flType
-  tinterval::flType
-  tspan::tType
-  Nintervals::intType
-  ts::Vector{flType}
-  Δ::flType
-  Ωmax::flType
-  κ::flType
-  C1::flType
+struct Parameters{flType, intType, tType}
+    lr::flType
+    epochs::intType
+    numtraj::intType
+    numtrajplot::intType
+    dt::flType
+    tinterval::flType
+    tspan::tType
+    Nintervals::intType
+    ts::Vector{flType}
+    Δ::flType
+    Ωmax::flType
+    κ::flType
+    C1::flType
 end
 
-myparameters = Parameters{typeof(dt),typeof(numtraj), typeof(tspan)}(
-  lr, epochs, numtraj, numtrajplot, dt, tinterval, tspan, Nintervals, ts,
-  Δ, Ωmax, κ, C1)
+myparameters = Parameters{typeof(dt), typeof(numtraj), typeof(tspan)}(lr, epochs, numtraj,
+                                                                      numtrajplot, dt,
+                                                                      tinterval, tspan,
+                                                                      Nintervals, ts,
+                                                                      Δ, Ωmax, κ, C1)
 
 ################################################
 # Define Neural Network
 
 # state-aware
-nn = FastChain(
-  FastDense(4, 32, relu),
-  FastDense(32, 1, tanh))
+nn = FastChain(FastDense(4, 32, relu),
+               FastDense(32, 1, tanh))
 
 p_nn = initial_params(nn) # random initial parameters
 
-
 ###############################################
 # initial state anywhere on the Bloch sphere
 function prepare_initial(dt, n_par)
-  # shape 4 x n_par
-  # input number of parallel realizations and dt for type inference
-  # random position on the Bloch sphere
-  theta = acos.(2*rand(typeof(dt),n_par).-1)  # uniform sampling for cos(theta) between -1 and 1
-  phi = rand(typeof(dt),n_par)*2*pi  # uniform sampling for phi between 0 and 2pi
-  # real and imaginary parts ceR, cdR, ceI, cdI
-  u0 = [cos.(theta/2), sin.(theta/2).*cos.(phi), false*theta, sin.(theta/2).*sin.(phi)]
-  return vcat(transpose.(u0)...) # build matrix
+    # shape 4 x n_par
+    # input number of parallel realizations and dt for type inference
+    # random position on the Bloch sphere
+    theta = acos.(2 * rand(typeof(dt), n_par) .- 1)  # uniform sampling for cos(theta) between -1 and 1
+    phi = rand(typeof(dt), n_par) * 2 * pi  # uniform sampling for phi between 0 and 2pi
+    # real and imaginary parts ceR, cdR, ceI, cdI
+    u0 = [
+        cos.(theta / 2),
+        sin.(theta / 2) .* cos.(phi),
+        false * theta,
+        sin.(theta / 2) .* sin.(phi),
+    ]
+    return vcat(transpose.(u0)...) # build matrix
 end
 
 # target state
@@ -102,186 +107,180 @@ u0 = prepare_initial(myparameters.dt, myparameters.numtraj)
 ###############################################
 # Define SDE
 
-function qubit_drift!(du,u,p,t)
-  # expansion coefficients |Ψ> = ce |e> + cd |d>
-  ceR, cdR, ceI, cdI = u # real and imaginary parts
-
-  # Δ: atomic frequency
-  # Ω: Rabi frequency for field in x direction
-  # κ: spontaneous emission
-  Δ, Ωmax, κ = p[end-2:end]
-  nn_weights = p[1:end-3]
-  Ω = (nn(u, nn_weights).*Ωmax)[1]
-
-  @inbounds begin
-    du[1] = 1//2*(ceI*Δ-ceR*κ+cdI*Ω)
-    du[2] = -cdI*Δ/2 + 1*ceR*(cdI*ceI+cdR*ceR)*κ+ceI*Ω/2
-    du[3] = 1//2*(-ceR*Δ-ceI*κ-cdR*Ω)
-    du[4] = cdR*Δ/2 + 1*ceI*(cdI*ceI+cdR*ceR)*κ-ceR*Ω/2
-  end
-  return nothing
+function qubit_drift!(du, u, p, t)
+    # expansion coefficients |Ψ> = ce |e> + cd |d>
+    ceR, cdR, ceI, cdI = u # real and imaginary parts
+
+    # Δ: atomic frequency
+    # Ω: Rabi frequency for field in x direction
+    # κ: spontaneous emission
+    Δ, Ωmax, κ = p[(end - 2):end]
+    nn_weights = p[1:(end - 3)]
+    Ω = (nn(u, nn_weights) .* Ωmax)[1]
+
+    @inbounds begin
+        du[1] = 1 // 2 * (ceI * Δ - ceR * κ + cdI * Ω)
+        du[2] = -cdI * Δ / 2 + 1 * ceR * (cdI * ceI + cdR * ceR) * κ + ceI * Ω / 2
+        du[3] = 1 // 2 * (-ceR * Δ - ceI * κ - cdR * Ω)
+        du[4] = cdR * Δ / 2 + 1 * ceI * (cdI * ceI + cdR * ceR) * κ - ceR * Ω / 2
+    end
+    return nothing
 end
 
-function qubit_diffusion!(du,u,p,t)
-  ceR, cdR, ceI, cdI = u # real and imaginary parts
+function qubit_diffusion!(du, u, p, t)
+    ceR, cdR, ceI, cdI = u # real and imaginary parts
 
-  κ = p[end]
+    κ = p[end]
 
-  du .= false
+    du .= false
 
-  @inbounds begin
-    #du[1] = zero(ceR)
-    du[2] += sqrt(κ)*ceR
-    #du[3] = zero(ceR)
-    du[4] += sqrt(κ)*ceI
-  end
-  return nothing
+    @inbounds begin
+        #du[1] = zero(ceR)
+        du[2] += sqrt(κ) * ceR
+        #du[3] = zero(ceR)
+        du[4] += sqrt(κ) * ceI
+    end
+    return nothing
 end
 
 # normalization callback
-condition(u,t,integrator) = true
+condition(u, t, integrator) = true
 function affect!(integrator)
-  integrator.u .= integrator.u/norm(integrator.u)
+    integrator.u .= integrator.u / norm(integrator.u)
 end
-callback = DiscreteCallback(condition, affect!, save_positions=(false, false))
+callback = DiscreteCallback(condition, affect!, save_positions = (false, false))
 
-CreateGrid(t,W1) = NoiseGrid(t,W1)
+CreateGrid(t, W1) = NoiseGrid(t, W1)
 Zygote.@nograd CreateGrid #avoid taking grads of this function
 
 # set scalar random process
-W = sqrt(myparameters.dt)*randn(typeof(myparameters.dt),size(myparameters.ts)) #for 1 trajectory
-W1 = cumsum([zero(myparameters.dt); W[1:end-1]], dims=1)
-NG = CreateGrid(myparameters.ts,W1)
+W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
+W1 = cumsum([zero(myparameters.dt); W[1:(end - 1)]], dims = 1)
+NG = CreateGrid(myparameters.ts, W1)
 
 # get control pulses
 p_all = [p_nn; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
 # define SDE problem
-prob = SDEProblem{true}(qubit_drift!, qubit_diffusion!, vec(u0[:,1]), myparameters.tspan, p_all,
-   callback=callback, noise=NG
-   )
+prob = SDEProblem{true}(qubit_drift!, qubit_diffusion!, vec(u0[:, 1]), myparameters.tspan,
+                        p_all,
+                        callback = callback, noise = NG)
 
 #########################################
 # compute loss
-function g(u,p,t)
-  ceR = @view u[1,:,:]
-  cdR = @view u[2,:,:]
-  ceI = @view u[3,:,:]
-  cdI = @view u[4,:,:]
-  p[1]*mean((cdR.^2 + cdI.^2) ./ (ceR.^2 + cdR.^2 + ceI.^2 + cdI.^2))
+function g(u, p, t)
+    ceR = @view u[1, :, :]
+    cdR = @view u[2, :, :]
+    ceI = @view u[3, :, :]
+    cdI = @view u[4, :, :]
+    p[1] * mean((cdR .^ 2 + cdI .^ 2) ./ (ceR .^ 2 + cdR .^ 2 + ceI .^ 2 + cdI .^ 2))
 end
 
-
-function loss(p; alg=EM(), sensealg=BacksolveAdjoint(autojacvec=ReverseDiffVJP()))
-
-  pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
-  u0 = prepare_initial(myparameters.dt, myparameters.numtraj)
-
-  function prob_func(prob, i, repeat)
-    # prepare initial state and applied control pulse
-    u0tmp = deepcopy(vec(u0[:, i]))
-    W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
-    W1 = cumsum([zero(myparameters.dt); W[1:end-1]], dims=1)
-    NG = CreateGrid(myparameters.ts, W1)
-
-    remake(prob,
-      p=pars,
-      u0=u0tmp,
-      callback=callback,
-      noise=NG)
-  end
-
-  ensembleprob = EnsembleProblem(prob,
-    prob_func=prob_func,
-    safetycopy=true
-  )
-
-  _sol = solve(ensembleprob, alg, EnsembleThreads(),
-    sensealg=sensealg,
-    saveat=myparameters.tinterval,
-    dt=myparameters.dt,
-    adaptive=false,
-    trajectories=myparameters.numtraj, batch_size=myparameters.numtraj)
-  A = convert(Array, _sol)
-
-  l = g(A, [myparameters.C1], nothing)
-  # returns loss value
-  return l
+function loss(p; alg = EM(), sensealg = BacksolveAdjoint(autojacvec = ReverseDiffVJP()))
+    pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
+    u0 = prepare_initial(myparameters.dt, myparameters.numtraj)
+
+    function prob_func(prob, i, repeat)
+        # prepare initial state and applied control pulse
+        u0tmp = deepcopy(vec(u0[:, i]))
+        W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
+        W1 = cumsum([zero(myparameters.dt); W[1:(end - 1)]], dims = 1)
+        NG = CreateGrid(myparameters.ts, W1)
+
+        remake(prob,
+               p = pars,
+               u0 = u0tmp,
+               callback = callback,
+               noise = NG)
+    end
+
+    ensembleprob = EnsembleProblem(prob,
+                                   prob_func = prob_func,
+                                   safetycopy = true)
+
+    _sol = solve(ensembleprob, alg, EnsembleThreads(),
+                 sensealg = sensealg,
+                 saveat = myparameters.tinterval,
+                 dt = myparameters.dt,
+                 adaptive = false,
+                 trajectories = myparameters.numtraj, batch_size = myparameters.numtraj)
+    A = convert(Array, _sol)
+
+    l = g(A, [myparameters.C1], nothing)
+    # returns loss value
+    return l
 end
 
 #########################################
 # visualization -- run for new batch
-function visualize(p; alg=EM())
-
-  u0 = prepare_initial(myparameters.dt, myparameters.numtrajplot)
-  pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
-
-  function prob_func(prob, i, repeat)
-    # prepare initial state and applied control pulse
-    u0tmp = deepcopy(vec(u0[:, i]))
-    W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
-    W1 = cumsum([zero(myparameters.dt); W[1:end-1]], dims=1)
-    NG = CreateGrid(myparameters.ts, W1)
-
-    remake(prob,
-      p=pars,
-      u0=u0tmp,
-      callback=callback,
-      noise=NG)
-  end
-
-  ensembleprob = EnsembleProblem(prob,
-    prob_func=prob_func,
-    safetycopy=true
-  )
-
-  u = solve(ensembleprob, alg, EnsembleThreads(),
-    saveat=myparameters.tinterval,
-    dt=myparameters.dt,
-    adaptive=false, #abstol=1e-6, reltol=1e-6,
-    trajectories=myparameters.numtrajplot, batch_size=myparameters.numtrajplot)
-
-
-  ceR = @view u[1, :, :]
-  cdR = @view u[2, :, :]
-  ceI = @view u[3, :, :]
-  cdI = @view u[4, :, :]
-  infidelity = @. (cdR^2 + cdI^2) / (ceR^2 + cdR^2 + ceI^2 + cdI^2)
-  meaninfidelity = mean(infidelity)
-  loss = myparameters.C1 * meaninfidelity
-
-  @info "Loss: " loss
-
-  fidelity = @. (ceR^2 + ceI^2) / (ceR^2 + cdR^2 + ceI^2 + cdI^2)
-
-  mf = mean(fidelity, dims=2)[:]
-  sf = std(fidelity, dims=2)[:]
-
-  pl1 = plot(0:myparameters.Nintervals, mf,
-    ribbon=sf,
-    ylim=(0, 1), xlim=(0, myparameters.Nintervals),
-    c=1, lw=1.5, xlabel="steps i", ylabel="Fidelity", legend=false)
-
-  pl = plot(pl1, legend=false, size=(400, 360))
-  return pl, loss
+function visualize(p; alg = EM())
+    u0 = prepare_initial(myparameters.dt, myparameters.numtrajplot)
+    pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
+
+    function prob_func(prob, i, repeat)
+        # prepare initial state and applied control pulse
+        u0tmp = deepcopy(vec(u0[:, i]))
+        W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
+        W1 = cumsum([zero(myparameters.dt); W[1:(end - 1)]], dims = 1)
+        NG = CreateGrid(myparameters.ts, W1)
+
+        remake(prob,
+               p = pars,
+               u0 = u0tmp,
+               callback = callback,
+               noise = NG)
+    end
+
+    ensembleprob = EnsembleProblem(prob,
+                                   prob_func = prob_func,
+                                   safetycopy = true)
+
+    u = solve(ensembleprob, alg, EnsembleThreads(),
+              saveat = myparameters.tinterval,
+              dt = myparameters.dt,
+              adaptive = false, #abstol=1e-6, reltol=1e-6,
+              trajectories = myparameters.numtrajplot,
+              batch_size = myparameters.numtrajplot)
+
+    ceR = @view u[1, :, :]
+    cdR = @view u[2, :, :]
+    ceI = @view u[3, :, :]
+    cdI = @view u[4, :, :]
+    infidelity = @. (cdR^2 + cdI^2) / (ceR^2 + cdR^2 + ceI^2 + cdI^2)
+    meaninfidelity = mean(infidelity)
+    loss = myparameters.C1 * meaninfidelity
+
+    @info "Loss: " loss
+
+    fidelity = @. (ceR^2 + ceI^2) / (ceR^2 + cdR^2 + ceI^2 + cdI^2)
+
+    mf = mean(fidelity, dims = 2)[:]
+    sf = std(fidelity, dims = 2)[:]
+
+    pl1 = plot(0:(myparameters.Nintervals), mf,
+               ribbon = sf,
+               ylim = (0, 1), xlim = (0, myparameters.Nintervals),
+               c = 1, lw = 1.5, xlabel = "steps i", ylabel = "Fidelity", legend = false)
+
+    pl = plot(pl1, legend = false, size = (400, 360))
+    return pl, loss
 end
 
 # burn-in loss
 l = loss(p_nn)
 # callback to visualize training
-visualization_callback = function (p, l; doplot=false)
-  println(l)
+visualization_callback = function (p, l; doplot = false)
+    println(l)
 
-  if doplot
-    pl, _ = visualize(p)
-    display(pl)
-  end
+    if doplot
+        pl, _ = visualize(p)
+        display(pl)
+    end
 
-  return false
+    return false
 end
 
 # Display the ODE with the initial parameter values.
-visualization_callback(p_nn, l; doplot=true)
-
+visualization_callback(p_nn, l; doplot = true)
 
 ###################################
 # training loop
@@ -290,18 +289,20 @@ visualization_callback(p_nn, l; doplot=true)
 # optimize the parameters for a few epochs with ADAM on time span
 # Setup and run the optimization
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, p_nn)
-res = Optimization.solve(optprob, ADAM(myparameters.lr), callback=visualization_callback, maxiters=100)
+res = Optimization.solve(optprob, ADAM(myparameters.lr), callback = visualization_callback,
+                         maxiters = 100)
 
 # plot optimized control
-visualization_callback(res.u, loss(res.u); doplot=true)
+visualization_callback(res.u, loss(res.u); doplot = true)
 ```
 
 ## Step-by-step description
 
 ### Load packages
+
 ```@example sdecontrol
 using DiffEqFlux
 using SciMLSensitivity
@@ -312,7 +313,9 @@ using Plots
 ```
 
 ### Parameters
+
 We define the parameters of the qubit and hyperparameters of the training process.
+
 ```@example sdecontrol
 lr = 0.01f0
 epochs = 100
@@ -325,8 +328,8 @@ dt = 0.0005f0
 tinterval = 0.05f0
 tstart = 0.0f0
 Nintervals = 20 # total number of intervals, total time = t_interval*Nintervals
-tspan = (tstart,tinterval*Nintervals)
-ts = Array(tstart:dt:(Nintervals*tinterval+dt)) # time array for noise grid
+tspan = (tstart, tinterval * Nintervals)
+ts = Array(tstart:dt:(Nintervals * tinterval + dt)) # time array for noise grid
 
 # Hamiltonian parameters
 Δ = 20.0f0
@@ -336,75 +339,85 @@ ts = Array(tstart:dt:(Nintervals*tinterval+dt)) # time array for noise grid
 # loss hyperparameters
 C1 = Float32(1.0)  # evolution state fidelity
 
-struct Parameters{flType,intType,tType}
-  lr::flType
-  epochs::intType
-  numtraj::intType
-  numtrajplot::intType
-  dt::flType
-  tinterval::flType
-  tspan::tType
-  Nintervals::intType
-  ts::Vector{flType}
-  Δ::flType
-  Ωmax::flType
-  κ::flType
-  C1::flType
+struct Parameters{flType, intType, tType}
+    lr::flType
+    epochs::intType
+    numtraj::intType
+    numtrajplot::intType
+    dt::flType
+    tinterval::flType
+    tspan::tType
+    Nintervals::intType
+    ts::Vector{flType}
+    Δ::flType
+    Ωmax::flType
+    κ::flType
+    C1::flType
 end
 
-myparameters = Parameters{typeof(dt),typeof(numtraj), typeof(tspan)}(
-  lr, epochs, numtraj, numtrajplot, dt, tinterval, tspan, Nintervals, ts,
-  Δ, Ωmax, κ, C1)
+myparameters = Parameters{typeof(dt), typeof(numtraj), typeof(tspan)}(lr, epochs, numtraj,
+                                                                      numtrajplot, dt,
+                                                                      tinterval, tspan,
+                                                                      Nintervals, ts,
+                                                                      Δ, Ωmax, κ, C1)
 ```
 
 In plain terms, the quantities that were defined are:
 
-- `lr` = learning rate of the optimizer
-- `epochs` = number of epochs in the training process
-- `numtraj` = number of simulated trajectories in the training process
-- `numtrajplot` = number of simulated trajectories to visualize the performance
-- `dt` = time step for solver (initial `dt` if adaptive)
-- `tinterval` = time spacing between checkpoints
-- `tspan` = time span
-- `Nintervals` = number of checkpoints
-- `ts` = discretization of the entire time interval, used for `NoiseGrid`
-- `Δ` = detuning between the qubit and the laser
-- `Ωmax` = maximum frequency of the control laser
-- `κ` = decay rate
-- `C1` = loss function hyperparameter
+  - `lr` = learning rate of the optimizer
+  - `epochs` = number of epochs in the training process
+  - `numtraj` = number of simulated trajectories in the training process
+  - `numtrajplot` = number of simulated trajectories to visualize the performance
+  - `dt` = time step for solver (initial `dt` if adaptive)
+  - `tinterval` = time spacing between checkpoints
+  - `tspan` = time span
+  - `Nintervals` = number of checkpoints
+  - `ts` = discretization of the entire time interval, used for `NoiseGrid`
+  - `Δ` = detuning between the qubit and the laser
+  - `Ωmax` = maximum frequency of the control laser
+  - `κ` = decay rate
+  - `C1` = loss function hyperparameter
 
 ### Controller
+
 We use a neural network to control the parameter Ω(t). Alternatively, one could
 also, e.g., use [tensor layers](https://docs.sciml.ai/DiffEqFlux/stable/layers/TensorLayer/), Flux.jl, or Lux.jl.
 
 ```@example sdecontrol
 # state-aware
-nn = FastChain(
-  FastDense(4, 32, relu),
-  FastDense(32, 1, tanh))
+nn = FastChain(FastDense(4, 32, relu),
+               FastDense(32, 1, tanh))
 
 p_nn = initial_params(nn) # random initial parameters
 ```
 
 ### Initial state
+
 We prepare `n_par` initial states, uniformly distributed over the Bloch sphere.
 To avoid complex numbers in our simulations, we split the state of the qubit
+
 ```math
   ψ(t) = c_e(t) (1,0) + c_d(t) (0,1)
 ```
+
 into its real and imaginary part.
 
 ```@example sdecontrol
 # initial state anywhere on the Bloch sphere
 function prepare_initial(dt, n_par)
-  # shape 4 x n_par
-  # input number of parallel realizations and dt for type inference
-  # random position on the Bloch sphere
-  theta = acos.(2*rand(typeof(dt),n_par).-1)  # uniform sampling for cos(theta) between -1 and 1
-  phi = rand(typeof(dt),n_par)*2*pi  # uniform sampling for phi between 0 and 2pi
-  # real and imaginary parts ceR, cdR, ceI, cdI
-  u0 = [cos.(theta/2), sin.(theta/2).*cos.(phi), false*theta, sin.(theta/2).*sin.(phi)]
-  return vcat(transpose.(u0)...) # build matrix
+    # shape 4 x n_par
+    # input number of parallel realizations and dt for type inference
+    # random position on the Bloch sphere
+    theta = acos.(2 * rand(typeof(dt), n_par) .- 1)  # uniform sampling for cos(theta) between -1 and 1
+    phi = rand(typeof(dt), n_par) * 2 * pi  # uniform sampling for phi between 0 and 2pi
+    # real and imaginary parts ceR, cdR, ceI, cdI
+    u0 = [
+        cos.(theta / 2),
+        sin.(theta / 2) .* cos.(phi),
+        false * theta,
+        sin.(theta / 2) .* sin.(phi),
+    ]
+    return vcat(transpose.(u0)...) # build matrix
 end
 
 # target state
@@ -414,6 +427,7 @@ u0 = prepare_initial(myparameters.dt, myparameters.numtraj)
 ```
 
 ### Defining the SDE
+
 We define the drift and diffusion term of the qubit. The SDE doesn't preserve the
 norm of the quantum state. To ensure the normalization of the state, we add a
 `DiscreteCallback` after each time step. Further, we use a NoiseGrid
@@ -424,63 +438,63 @@ continuously updated.
 
 ```@example sdecontrol
 # Define SDE
-function qubit_drift!(du,u,p,t)
-  # expansion coefficients |Ψ> = ce |e> + cd |d>
-  ceR, cdR, ceI, cdI = u # real and imaginary parts
-
-  # Δ: atomic frequency
-  # Ω: Rabi frequency for field in x direction
-  # κ: spontaneous emission
-  Δ, Ωmax, κ = p[end-2:end]
-  nn_weights = p[1:end-3]
-  Ω = (nn(u, nn_weights).*Ωmax)[1]
-
-  @inbounds begin
-    du[1] = 1//2*(ceI*Δ-ceR*κ+cdI*Ω)
-    du[2] = -cdI*Δ/2 + 1*ceR*(cdI*ceI+cdR*ceR)*κ+ceI*Ω/2
-    du[3] = 1//2*(-ceR*Δ-ceI*κ-cdR*Ω)
-    du[4] = cdR*Δ/2 + 1*ceI*(cdI*ceI+cdR*ceR)*κ-ceR*Ω/2
-  end
-  return nothing
+function qubit_drift!(du, u, p, t)
+    # expansion coefficients |Ψ> = ce |e> + cd |d>
+    ceR, cdR, ceI, cdI = u # real and imaginary parts
+
+    # Δ: atomic frequency
+    # Ω: Rabi frequency for field in x direction
+    # κ: spontaneous emission
+    Δ, Ωmax, κ = p[(end - 2):end]
+    nn_weights = p[1:(end - 3)]
+    Ω = (nn(u, nn_weights) .* Ωmax)[1]
+
+    @inbounds begin
+        du[1] = 1 // 2 * (ceI * Δ - ceR * κ + cdI * Ω)
+        du[2] = -cdI * Δ / 2 + 1 * ceR * (cdI * ceI + cdR * ceR) * κ + ceI * Ω / 2
+        du[3] = 1 // 2 * (-ceR * Δ - ceI * κ - cdR * Ω)
+        du[4] = cdR * Δ / 2 + 1 * ceI * (cdI * ceI + cdR * ceR) * κ - ceR * Ω / 2
+    end
+    return nothing
 end
 
-function qubit_diffusion!(du,u,p,t)
-  ceR, cdR, ceI, cdI = u # real and imaginary parts
+function qubit_diffusion!(du, u, p, t)
+    ceR, cdR, ceI, cdI = u # real and imaginary parts
 
-  κ = p[end]
+    κ = p[end]
 
-  du .= false
+    du .= false
 
-  @inbounds begin
-    #du[1] = zero(ceR)
-    du[2] += sqrt(κ)*ceR
-    #du[3] = zero(ceR)
-    du[4] += sqrt(κ)*ceI
-  end
-  return nothing
+    @inbounds begin
+        #du[1] = zero(ceR)
+        du[2] += sqrt(κ) * ceR
+        #du[3] = zero(ceR)
+        du[4] += sqrt(κ) * ceI
+    end
+    return nothing
 end
 
 # normalization callback
-condition(u,t,integrator) = true
+condition(u, t, integrator) = true
 function affect!(integrator)
-  integrator.u.=integrator.u/norm(integrator.u)
+    integrator.u .= integrator.u / norm(integrator.u)
 end
-callback = DiscreteCallback(condition,affect!,save_positions=(false,false))
+callback = DiscreteCallback(condition, affect!, save_positions = (false, false))
 
-CreateGrid(t,W1) = NoiseGrid(t,W1)
+CreateGrid(t, W1) = NoiseGrid(t, W1)
 Zygote.@nograd CreateGrid #avoid taking grads of this function
 
 # set scalar random process
-W = sqrt(myparameters.dt)*randn(typeof(myparameters.dt),size(myparameters.ts)) #for 1 trajectory
-W1 = cumsum([zero(myparameters.dt); W[1:end-1]], dims=1)
-NG = CreateGrid(myparameters.ts,W1)
+W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
+W1 = cumsum([zero(myparameters.dt); W[1:(end - 1)]], dims = 1)
+NG = CreateGrid(myparameters.ts, W1)
 
 # get control pulses
 p_all = [p_nn; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
 # define SDE problem
-prob = SDEProblem{true}(qubit_drift!, qubit_diffusion!, vec(u0[:,1]), myparameters.tspan, p_all,
-   callback=callback, noise=NG
-   )
+prob = SDEProblem{true}(qubit_drift!, qubit_diffusion!, vec(u0[:, 1]), myparameters.tspan,
+                        p_all,
+                        callback = callback, noise = NG)
 ```
 
 ### Compute loss function
@@ -495,130 +509,126 @@ parallel ensemble simulation docs](https://docs.sciml.ai/DiffEqDocs/stable/featu
 for a description of the available ensemble algorithms. To optimize only the parameters
 of the neural network, we use `pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]`
 
-``` @example sdecontrol
+```@example sdecontrol
 # compute loss
-function g(u,p,t)
-  ceR = @view u[1,:,:]
-  cdR = @view u[2,:,:]
-  ceI = @view u[3,:,:]
-  cdI = @view u[4,:,:]
-  p[1]*mean((cdR.^2 + cdI.^2) ./ (ceR.^2 + cdR.^2 + ceI.^2 + cdI.^2))
+function g(u, p, t)
+    ceR = @view u[1, :, :]
+    cdR = @view u[2, :, :]
+    ceI = @view u[3, :, :]
+    cdI = @view u[4, :, :]
+    p[1] * mean((cdR .^ 2 + cdI .^ 2) ./ (ceR .^ 2 + cdR .^ 2 + ceI .^ 2 + cdI .^ 2))
 end
 
-function loss(p; alg=EM(), sensealg=BacksolveAdjoint(autojacvec=ReverseDiffVJP()))
-
-  pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
-  u0 = prepare_initial(myparameters.dt, myparameters.numtraj)
-
-  function prob_func(prob, i, repeat)
-    # prepare initial state and applied control pulse
-    u0tmp = deepcopy(vec(u0[:, i]))
-    W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
-    W1 = cumsum([zero(myparameters.dt); W[1:end-1]], dims=1)
-    NG = CreateGrid(myparameters.ts, W1)
-
-    remake(prob,
-      p=pars,
-      u0=u0tmp,
-      callback=callback,
-      noise=NG)
-  end
-
-  ensembleprob = EnsembleProblem(prob,
-    prob_func=prob_func,
-    safetycopy=true
-  )
-
-  _sol = solve(ensembleprob, alg, EnsembleThreads(),
-    sensealg=sensealg,
-    saveat=myparameters.tinterval,
-    dt=myparameters.dt,
-    adaptive=false,
-    trajectories=myparameters.numtraj, batch_size=myparameters.numtraj)
-  A = convert(Array, _sol)
-
-  l = g(A, [myparameters.C1], nothing)
-  # returns loss value
-  return l
+function loss(p; alg = EM(), sensealg = BacksolveAdjoint(autojacvec = ReverseDiffVJP()))
+    pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
+    u0 = prepare_initial(myparameters.dt, myparameters.numtraj)
+
+    function prob_func(prob, i, repeat)
+        # prepare initial state and applied control pulse
+        u0tmp = deepcopy(vec(u0[:, i]))
+        W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
+        W1 = cumsum([zero(myparameters.dt); W[1:(end - 1)]], dims = 1)
+        NG = CreateGrid(myparameters.ts, W1)
+
+        remake(prob,
+               p = pars,
+               u0 = u0tmp,
+               callback = callback,
+               noise = NG)
+    end
+
+    ensembleprob = EnsembleProblem(prob,
+                                   prob_func = prob_func,
+                                   safetycopy = true)
+
+    _sol = solve(ensembleprob, alg, EnsembleThreads(),
+                 sensealg = sensealg,
+                 saveat = myparameters.tinterval,
+                 dt = myparameters.dt,
+                 adaptive = false,
+                 trajectories = myparameters.numtraj, batch_size = myparameters.numtraj)
+    A = convert(Array, _sol)
+
+    l = g(A, [myparameters.C1], nothing)
+    # returns loss value
+    return l
 end
 ```
 
-
 ### Visualization
+
 To visualize the performance of the controller, we plot the mean value and
 standard deviation of the fidelity of a bunch of trajectories (`myparameters.numtrajplot`) as
 a function of the time steps at which loss values are computed.
 
 ```@example sdecontrol
-function visualize(p; alg=EM())
-
-  u0 = prepare_initial(myparameters.dt, myparameters.numtrajplot)
-  pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
-
-  function prob_func(prob, i, repeat)
-    # prepare initial state and applied control pulse
-    u0tmp = deepcopy(vec(u0[:, i]))
-    W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
-    W1 = cumsum([zero(myparameters.dt); W[1:end-1]], dims=1)
-    NG = CreateGrid(myparameters.ts, W1)
-
-    remake(prob,
-      p=pars,
-      u0=u0tmp,
-      callback=callback,
-      noise=NG)
-  end
-
-  ensembleprob = EnsembleProblem(prob,
-    prob_func=prob_func,
-    safetycopy=true
-  )
-
-  u = solve(ensembleprob, alg, EnsembleThreads(),
-    saveat=myparameters.tinterval,
-    dt=myparameters.dt,
-    adaptive=false, #abstol=1e-6, reltol=1e-6,
-    trajectories=myparameters.numtrajplot, batch_size=myparameters.numtrajplot)
-
-
-  ceR = @view u[1, :, :]
-  cdR = @view u[2, :, :]
-  ceI = @view u[3, :, :]
-  cdI = @view u[4, :, :]
-  infidelity = @. (cdR^2 + cdI^2) / (ceR^2 + cdR^2 + ceI^2 + cdI^2)
-  meaninfidelity = mean(infidelity)
-  loss = myparameters.C1 * meaninfidelity
-
-  @info "Loss: " loss
-
-  fidelity = @. (ceR^2 + ceI^2) / (ceR^2 + cdR^2 + ceI^2 + cdI^2)
-
-  mf = mean(fidelity, dims=2)[:]
-  sf = std(fidelity, dims=2)[:]
-
-  pl1 = plot(0:myparameters.Nintervals, mf,
-    ribbon=sf,
-    ylim=(0, 1), xlim=(0, myparameters.Nintervals),
-    c=1, lw=1.5, xlabel="steps i", ylabel="Fidelity", legend=false)
-
-  pl = plot(pl1, legend=false, size=(400, 360))
-  return pl, loss
+function visualize(p; alg = EM())
+    u0 = prepare_initial(myparameters.dt, myparameters.numtrajplot)
+    pars = [p; myparameters.Δ; myparameters.Ωmax; myparameters.κ]
+
+    function prob_func(prob, i, repeat)
+        # prepare initial state and applied control pulse
+        u0tmp = deepcopy(vec(u0[:, i]))
+        W = sqrt(myparameters.dt) * randn(typeof(myparameters.dt), size(myparameters.ts)) #for 1 trajectory
+        W1 = cumsum([zero(myparameters.dt); W[1:(end - 1)]], dims = 1)
+        NG = CreateGrid(myparameters.ts, W1)
+
+        remake(prob,
+               p = pars,
+               u0 = u0tmp,
+               callback = callback,
+               noise = NG)
+    end
+
+    ensembleprob = EnsembleProblem(prob,
+                                   prob_func = prob_func,
+                                   safetycopy = true)
+
+    u = solve(ensembleprob, alg, EnsembleThreads(),
+              saveat = myparameters.tinterval,
+              dt = myparameters.dt,
+              adaptive = false, #abstol=1e-6, reltol=1e-6,
+              trajectories = myparameters.numtrajplot,
+              batch_size = myparameters.numtrajplot)
+
+    ceR = @view u[1, :, :]
+    cdR = @view u[2, :, :]
+    ceI = @view u[3, :, :]
+    cdI = @view u[4, :, :]
+    infidelity = @. (cdR^2 + cdI^2) / (ceR^2 + cdR^2 + ceI^2 + cdI^2)
+    meaninfidelity = mean(infidelity)
+    loss = myparameters.C1 * meaninfidelity
+
+    @info "Loss: " loss
+
+    fidelity = @. (ceR^2 + ceI^2) / (ceR^2 + cdR^2 + ceI^2 + cdI^2)
+
+    mf = mean(fidelity, dims = 2)[:]
+    sf = std(fidelity, dims = 2)[:]
+
+    pl1 = plot(0:(myparameters.Nintervals), mf,
+               ribbon = sf,
+               ylim = (0, 1), xlim = (0, myparameters.Nintervals),
+               c = 1, lw = 1.5, xlabel = "steps i", ylabel = "Fidelity", legend = false)
+
+    pl = plot(pl1, legend = false, size = (400, 360))
+    return pl, loss
 end
 # callback to visualize training
-visualization_callback = function (p, l; doplot=false)
-  println(l)
+visualization_callback = function (p, l; doplot = false)
+    println(l)
 
-  if doplot
-    pl, _ = visualize(p)
-    display(pl)
-  end
+    if doplot
+        pl, _ = visualize(p)
+        display(pl)
+    end
 
-  return false
+    return false
 end
 ```
 
-
 ### Training
+
 We use the `ADAM` optimizer to optimize the parameters of the neural network.
 In each epoch, we draw new initial quantum states, compute the forward evolution,
 and, subsequently, the gradients of the loss function with respect to the parameters
@@ -632,18 +642,18 @@ is computed under the hood in the SciMLSensitivity package.
 # optimize the parameters for a few epochs with ADAM on time span
 # Setup and run the optimization
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, p_nn)
-res = Optimization.solve(optprob, ADAM(myparameters.lr), callback=visualization_callback, maxiters=100)
+res = Optimization.solve(optprob, ADAM(myparameters.lr), callback = visualization_callback,
+                         maxiters = 100)
 
 # plot optimized control
-visualization_callback(res.u, loss(res.u); doplot=true)
+visualization_callback(res.u, loss(res.u); doplot = true)
 ```
 
 ![Evolution of the fidelity as a function of time](https://user-images.githubusercontent.com/42201748/107991039-10c59200-6fd6-11eb-8a97-a1c8d18a266b.png)
 
-
 ## References
 
 [1] Schäfer, Frank, Pavel Sekatski, Martin Koppenhöfer, Christoph Bruder, and Michal Kloc. "Control of stochastic quantum dynamics by differentiable programming." Machine Learning: Science and Technology 2, no. 3 (2021): 035004.
diff --git a/docs/src/examples/sde/optimization_sde.md b/docs/src/examples/sde/optimization_sde.md
index f6f306eed..e8c915cb2 100644
--- a/docs/src/examples/sde/optimization_sde.md
+++ b/docs/src/examples/sde/optimization_sde.md
@@ -17,23 +17,23 @@ solution, so we need a sensible data source.
 ```@example sde
 using DifferentialEquations, SciMLSensitivity, Plots
 
-function lotka_volterra!(du,u,p,t)
-  x,y = u
-  α,β,γ,δ = p
-  du[1] = dx = α*x - β*x*y
-  du[2] = dy = δ*x*y - γ*y
+function lotka_volterra!(du, u, p, t)
+    x, y = u
+    α, β, γ, δ = p
+    du[1] = dx = α * x - β * x * y
+    du[2] = dy = δ * x * y - γ * y
 end
-u0 = [1.0,1.0]
-tspan = (0.0,10.0)
+u0 = [1.0, 1.0]
+tspan = (0.0, 10.0)
 
-function multiplicative_noise!(du,u,p,t)
-  x,y = u
-  du[1] = p[5]*x
-  du[2] = p[6]*y
+function multiplicative_noise!(du, u, p, t)
+    x, y = u
+    du[1] = p[5] * x
+    du[2] = p[6] * y
 end
-p = [1.5,1.0,3.0,1.0,0.3,0.3]
+p = [1.5, 1.0, 3.0, 1.0, 0.3, 0.3]
 
-prob = SDEProblem(lotka_volterra!,multiplicative_noise!,u0,tspan,p)
+prob = SDEProblem(lotka_volterra!, multiplicative_noise!, u0, tspan, p)
 sol = solve(prob)
 plot(sol)
 ```
@@ -48,9 +48,9 @@ scenario, we will generate 10,000 trajectories from the SDE to build our dataset
 ```@example sde
 using Statistics
 ensembleprob = EnsembleProblem(prob)
-@time sol = solve(ensembleprob,SOSRI(),saveat=0.1,trajectories=10_000)
-truemean = mean(sol,dims=3)[:,:]
-truevar  = var(sol,dims=3)[:,:]
+@time sol = solve(ensembleprob, SOSRI(), saveat = 0.1, trajectories = 10_000)
+truemean = mean(sol, dims = 3)[:, :]
+truevar = var(sol, dims = 3)[:, :]
 ```
 
 From here, we wish to utilize the method of moments to fit the SDE's parameters.
@@ -60,24 +60,25 @@ then plot the evolution of the means and variances to verify the fit. For exampl
 
 ```@example sde
 function loss(p)
-  tmp_prob = remake(prob,p=p)
-  ensembleprob = EnsembleProblem(tmp_prob)
-  tmp_sol = solve(ensembleprob,SOSRI(),saveat=0.1,trajectories=1000)
-  arrsol = Array(tmp_sol)
-  sum(abs2,truemean - mean(arrsol,dims=3)) + 0.1sum(abs2,truevar - var(arrsol,dims=3)),arrsol
+    tmp_prob = remake(prob, p = p)
+    ensembleprob = EnsembleProblem(tmp_prob)
+    tmp_sol = solve(ensembleprob, SOSRI(), saveat = 0.1, trajectories = 1000)
+    arrsol = Array(tmp_sol)
+    sum(abs2, truemean - mean(arrsol, dims = 3)) +
+    0.1sum(abs2, truevar - var(arrsol, dims = 3)), arrsol
 end
 
-function cb2(p,l,arrsol)
-  @show p,l
-  means = mean(arrsol,dims=3)[:,:]
-  vars = var(arrsol,dims=3)[:,:]
-  p1 = plot(sol[1].t,means',lw=5)
-  scatter!(p1,sol[1].t,truemean')
-  p2 = plot(sol[1].t,vars',lw=5)
-  scatter!(p2,sol[1].t,truevar')
-  p = plot(p1,p2,layout = (2,1))
-  display(p)
-  false
+function cb2(p, l, arrsol)
+    @show p, l
+    means = mean(arrsol, dims = 3)[:, :]
+    vars = var(arrsol, dims = 3)[:, :]
+    p1 = plot(sol[1].t, means', lw = 5)
+    scatter!(p1, sol[1].t, truemean')
+    p2 = plot(sol[1].t, vars', lw = 5)
+    scatter!(p2, sol[1].t, truevar')
+    p = plot(p1, p2, layout = (2, 1))
+    display(p)
+    false
 end
 ```
 
@@ -85,11 +86,11 @@ We can then use `Optimization.solve` to fit the SDE:
 
 ```@example sde
 using Optimization, Zygote, OptimizationFlux
-pinit = [1.2,0.8,2.5,0.8,0.1,0.1]
+pinit = [1.2, 0.8, 2.5, 0.8, 0.1, 0.1]
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p) -> loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, pinit)
-@time res = Optimization.solve(optprob,ADAM(0.05),callback=cb2,maxiters = 100)
+@time res = Optimization.solve(optprob, ADAM(0.05), callback = cb2, maxiters = 100)
 ```
 
 Notice that **both the parameters of the deterministic drift equations and the
@@ -122,29 +123,28 @@ solution to be close to the constant 1.
 using DifferentialEquations, DiffEqFlux, Optimization, OptimizationFlux, Plots
 
 function lotka_volterra!(du, u, p, t)
-  x, y = u
-  α, β, δ, γ = p
-  du[1] = dx = α*x - β*x*y
-  du[2] = dy = -δ*y + γ*x*y
+    x, y = u
+    α, β, δ, γ = p
+    du[1] = dx = α * x - β * x * y
+    du[2] = dy = -δ * y + γ * x * y
 end
 
 function lotka_volterra_noise!(du, u, p, t)
-  du[1] = 0.1u[1]
-  du[2] = 0.1u[2]
+    du[1] = 0.1u[1]
+    du[2] = 0.1u[2]
 end
 
-u0 = [1.0,1.0]
+u0 = [1.0, 1.0]
 tspan = (0.0, 10.0)
 p = [2.2, 1.0, 2.0, 0.4]
 prob_sde = SDEProblem(lotka_volterra!, lotka_volterra_noise!, u0, tspan)
 
-
 function predict_sde(p)
-  return Array(solve(prob_sde, SOSRI(), p=p,
-               sensealg = ForwardDiffSensitivity(), saveat = 0.1))
+    return Array(solve(prob_sde, SOSRI(), p = p,
+                       sensealg = ForwardDiffSensitivity(), saveat = 0.1))
 end
 
-loss_sde(p) = sum(abs2, x-1 for x in predict_sde(p))
+loss_sde(p) = sum(abs2, x - 1 for x in predict_sde(p))
 ```
 
 For this training process, because the loss function is stochastic, we will use
@@ -155,11 +155,11 @@ like:
 
 ```@example sde
 callback = function (p, l)
-  display(l)
-  remade_solution = solve(remake(prob_sde, p = p), SOSRI(), saveat = 0.1)
-  plt = plot(remade_solution, ylim = (0, 6))
-  display(plt)
-  return false
+    display(l)
+    remade_solution = solve(remake(prob_sde, p = p), SOSRI(), saveat = 0.1)
+    plt = plot(remade_solution, ylim = (0, 6))
+    display(plt)
+    return false
 end
 ```
 
@@ -167,11 +167,11 @@ Let's optimize
 
 ```@example sde
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p) -> loss_sde(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_sde(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, p)
 result_sde = Optimization.solve(optprob, ADAM(0.1),
-                                    callback = callback, maxiters = 100)
+                                callback = callback, maxiters = 100)
 ```
 
 ![](https://user-images.githubusercontent.com/1814174/51399524-2c6abf80-1b14-11e9-96ae-0192f7debd03.gif)
diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md
index a08555a77..f0cb3f016 100644
--- a/docs/src/getting_started.md
+++ b/docs/src/getting_started.md
@@ -1,10 +1,10 @@
 # [Getting Started with SciMLSensitivity: Differentiating ODE Solutions](@id auto_diff)
 
 !!! warn
-
-      This tutorial assumes familiarity with DifferentialEquations.jl.
-      If you are not familiar with DifferentialEquations.jl, please consult
-      [the DifferentialEquations.jl documentation](https://docs.sciml.ai/DiffEqDocs/stable/).
+    
+    This tutorial assumes familiarity with DifferentialEquations.jl.
+    If you are not familiar with DifferentialEquations.jl, please consult
+    [the DifferentialEquations.jl documentation](https://docs.sciml.ai/DiffEqDocs/stable/).
 
 SciMLSensitivity.jl is a tool for obtaining derivatives of equation solvers,
 such as differential equation solvers. These can be used in many ways, such as
@@ -14,7 +14,7 @@ tutorial, we will show how to make use of the tooling in SciMLSensitivity.jl
 to differentiate the ODE solvers.
 
 !!! note
-
+    
     SciMLSensitivity.jl applies to all equation solvers of the SciML ecosystem,
     such as linear solvers, nonlinear solvers, nonlinear optimization,
     and more. This tutorial focuses on differential equations, so please see
@@ -30,13 +30,14 @@ Lotka-Volterra equation. This is done via DifferentialEquations.jl using:
 ```@example diffode
 using DifferentialEquations
 
-function lotka_volterra!(du,u,p,t)
-  du[1] = dx = p[1]*u[1] - p[2]*u[1]*u[2]
-  du[2] = dy = -p[3]*u[2] + p[4]*u[1]*u[2]
+function lotka_volterra!(du, u, p, t)
+    du[1] = dx = p[1] * u[1] - p[2] * u[1] * u[2]
+    du[2] = dy = -p[3] * u[2] + p[4] * u[1] * u[2]
 end
-p = [1.5,1.0,3.0,1.0]; u0 = [1.0;1.0]
-prob = ODEProblem(lotka_volterra!,u0,(0.0,10.0),p)
-sol = solve(prob,Tsit5(),reltol=1e-6,abstol=1e-6)
+p = [1.5, 1.0, 3.0, 1.0];
+u0 = [1.0; 1.0];
+prob = ODEProblem(lotka_volterra!, u0, (0.0, 10.0), p)
+sol = solve(prob, Tsit5(), reltol = 1e-6, abstol = 1e-6)
 ```
 
 Now let's differentiate the solution to this ODE using a few different automatic
@@ -46,28 +47,28 @@ differentiation methods.
 
 Let's say we need the derivative of the solution with respect to the initial condition
 `u0` and its parameters `p`. One of the simplest ways to do this is via ForwardDiff.jl.
-All one needs to do is to use 
+All one needs to do is to use
 [the ForwardDiff.jl library](https://juliadiff.org/ForwardDiff.jl/stable/) to differentiate
 some function `f` which uses a differential equation `solve` inside of it. For example,
-let's say we want the derivative of the first component of the ODE solution with respect to 
+let's say we want the derivative of the first component of the ODE solution with respect to
 these quantities at evenly spaced time points of `dt = 1`. We can compute this via:
 
 ```@example diffode
 using ForwardDiff
 
 function f(x)
-    _prob = remake(prob,u0=x[1:2],p=x[3:end])
-    solve(_prob,Tsit5(),reltol=1e-6,abstol=1e-6,saveat=1)[1,:]
+    _prob = remake(prob, u0 = x[1:2], p = x[3:end])
+    solve(_prob, Tsit5(), reltol = 1e-6, abstol = 1e-6, saveat = 1)[1, :]
 end
-x = [u0;p]
-dx = ForwardDiff.jacobian(f,x)
+x = [u0; p]
+dx = ForwardDiff.jacobian(f, x)
 ```
 
 Let's dig into what this is saying a bit. `x` is a vector which concatenates the initial condition
 and parameters, meaning that the first 2 values are the initial conditions and the last 4 are the
 parameters. We use the `remake` function to build a function `f(x)` which uses these new initial
 conditions and parameters to solve the differential equation and return the time series of the first
-component. 
+component.
 
 Then `ForwardDiff.jacobian(f,x)` computes the Jacobian of `f` with respect to `x`. The
 output `dx[i,j]` corresponds to the derivative of the solution of the first component at time `t=j-1`
@@ -75,31 +76,31 @@ with respect to `x[i]`. For example, `dx[3,2]` is the derivative of the first co
 solution at time `t=1` with respect to `p[1]`.
 
 !!! note
-
-      Since [the global error is 1-2 orders of magnitude higher than the local error](https://docs.sciml.ai/DiffEqDocs/stable/basics/faq/#What-does-tolerance-mean-and-how-much-error-should-I-expect), we use accuracies of 1e-6 (instead of the default 1e-3) to get reasonable sensitivities
+    
+    Since [the global error is 1-2 orders of magnitude higher than the local error](https://docs.sciml.ai/DiffEqDocs/stable/basics/faq/#What-does-tolerance-mean-and-how-much-error-should-I-expect), we use tolerances of 1e-6 (instead of the default 1e-3) to get reasonable sensitivities.
 
 ## Reverse-Mode Automatic Differentiation
 
 [The `solve` function is automatically compatible with AD systems like Zygote.jl](https://docs.sciml.ai/SciMLSensitivity/stable/)
 and thus there is no machinery that is necessary to use other than to put `solve` inside
-a function that is differentiated by Zygote. For example, the following computes the solution 
-to an ODE and computes the gradient of a loss function (the sum of the ODE's output at each 
+a function that is differentiated by Zygote. For example, the following computes the solution
+to an ODE and computes the gradient of a loss function (the sum of the ODE's output at each
 timepoint with dt=0.1) via the adjoint method:
 
 ```@example diffode
 using Zygote, SciMLSensitivity
 
-function sum_of_solution(u0,p)
-  _prob = remake(prob,u0=u0,p=p)
-  sum(solve(_prob,Tsit5(),reltol=1e-6,abstol=1e-6,saveat=0.1))
+function sum_of_solution(u0, p)
+    _prob = remake(prob, u0 = u0, p = p)
+    sum(solve(_prob, Tsit5(), reltol = 1e-6, abstol = 1e-6, saveat = 0.1))
 end
-du01,dp1 = Zygote.gradient(sum_of_solution,u0,p)
+du01, dp1 = Zygote.gradient(sum_of_solution, u0, p)
 ```
 
 Zygote.jl's automatic differentiation system is overloaded to allow SciMLSensitivity.jl
 to redefine the way the derivatives are computed, allowing trade-offs between numerical
 stability, memory, and compute performance, similar to how ODE solver algorithms are
-chosen. 
+chosen.
 
 ### Choosing Sensitivity Algorithms
 
@@ -109,11 +110,12 @@ Let's demonstrate this by choosing the `QuadratureAdjoint` `sensealg` for the di
 this system:
 
 ```@example diffode
-function sum_of_solution(u0,p)
-  _prob = remake(prob,u0=u0,p=p)
-  sum(solve(_prob,Tsit5(),reltol=1e-6,abstol=1e-6,saveat=0.1,sensealg=QuadratureAdjoint()))
+function sum_of_solution(u0, p)
+    _prob = remake(prob, u0 = u0, p = p)
+    sum(solve(_prob, Tsit5(), reltol = 1e-6, abstol = 1e-6, saveat = 0.1,
+              sensealg = QuadratureAdjoint()))
 end
-du01,dp1 = Zygote.gradient(sum_of_solution,u0,p)
+du01, dp1 = Zygote.gradient(sum_of_solution, u0, p)
 ```
 
 Here this computes the derivative of the output with respect to the initial
@@ -122,13 +124,14 @@ using the `QuadratureAdjoint()`. For more information on the choices of sensitiv
 algorithms, see the [reference documentation in choosing sensitivity algorithms](@ref sensitivity_diffeq).
 
 !!! note
+    
     ForwardDiff.jl's automatic differentiation system ignores the sensitivity algorithms.
 
 ## When Should You Use Forward or Reverse Mode?
 
 Good question! The simple answer is, if you are differentiating a system of
 fewer than 100 equations, use forward-mode, otherwise reverse-mode. But it can
-be a lot more complicated than that! For more information, see the 
+be a lot more complicated than that! For more information, see the
 [reference documentation in choosing sensitivity algorithms](@ref sensitivity_diffeq).
 
 ## And that is it! Where should you go from here?
@@ -136,5 +139,5 @@ be a lot more complicated than that! For more information, see the
 That's all there is to the basics of differentiating the ODE solvers with SciMLSensitivity.jl.
 That said, check out the following tutorials to dig into more detail:
 
-* See the [ODE parameter estimation tutorial](@ref odeparamestim) to learn how to fit the parameters of ODE systems
-* See the [direct sensitivity tutorial](@ref direct_sensitivity) to dig into the lower level API for more performance
\ No newline at end of file
+  - See the [ODE parameter estimation tutorial](@ref odeparamestim) to learn how to fit the parameters of ODE systems.
+  - See the [direct sensitivity tutorial](@ref direct_sensitivity) to dig into the lower level API for more performance.
diff --git a/docs/src/index.md b/docs/src/index.md
index 49babf6c5..dcb1d1559 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -11,20 +11,20 @@ as easy as possible without losing efficiency.
 
 Thus, what SciMLSensitivity.jl provides is:
 
-- Automatic differentiation overloads for improving the performance and flexibility
-  of AD calls over `solve`.
-- A lower level direct interface for defining forward sensitivity and adjoint problems
-  to allow for minimal overhead and maximal performance.
-- A bunch of tutorials, documentation, and test cases for this combination
-  with parameter estimation (data fitting / model calibration), neural network 
-  libraries and GPUs.
+  - Automatic differentiation overloads for improving the performance and flexibility
+    of AD calls over `solve`.
+  - A lower level direct interface for defining forward sensitivity and adjoint problems
+    to allow for minimal overhead and maximal performance.
+  - A bunch of tutorials, documentation, and test cases for this combination
+    with parameter estimation (data fitting / model calibration), neural network
+    libraries and GPUs.
 
 !!! note
-
+    
     This documentation assumes familiarity with the solver packages for the respective problem
     types. If one is not familiar with the solver packages, please consult the documentation
-    for pieces like [DifferentialEquations.jl](https://docs.sciml.ai/DiffEqDocs/stable/), 
-    [NonlinearSolve.jl](https://docs.sciml.ai/NonlinearSolve/stable/), 
+    for pieces like [DifferentialEquations.jl](https://docs.sciml.ai/DiffEqDocs/stable/),
+    [NonlinearSolve.jl](https://docs.sciml.ai/NonlinearSolve/stable/),
     [LinearSolve.jl](https://docs.sciml.ai/LinearSolve/stable/), etc. first.
 
 ## Installation
@@ -41,17 +41,17 @@ Pkg.add("SciMLSensitivity")
 The highest level interface is provided by the function `solve`:
 
 ```julia
-solve(prob,args...;sensealg=InterpolatingAdjoint(),
-      checkpoints=sol.t,kwargs...)
+solve(prob, args...; sensealg = InterpolatingAdjoint(),
+      checkpoints = sol.t, kwargs...)
 ```
 
 `solve` is fully compatible with automatic differentiation libraries
 like:
 
-- [Zygote.jl](https://fluxml.ai/Zygote.jl/stable/)
-- [ReverseDiff.jl](https://juliadiff.org/ReverseDiff.jl/)
-- [Tracker.jl](https://github.com/FluxML/Tracker.jl)
-- [ForwardDiff.jl](https://juliadiff.org/ForwardDiff.jl/stable/)
+  - [Zygote.jl](https://fluxml.ai/Zygote.jl/stable/)
+  - [ReverseDiff.jl](https://juliadiff.org/ReverseDiff.jl/)
+  - [Tracker.jl](https://github.com/FluxML/Tracker.jl)
+  - [ForwardDiff.jl](https://juliadiff.org/ForwardDiff.jl/stable/)
 
 and will automatically replace any calculations of the solution's derivative
 with a fast method. The keyword argument `sensealg` controls the dispatch to the
@@ -61,48 +61,54 @@ interpolations unless `sensealg=DiffEqBase.SensitivityADPassThrough()`
 is used, i.e. going back to the AD mechanism.
 
 !!! note
-
+    
     The behavior of ForwardDiff.jl is different from the other automatic differentiation libraries mentioned above.
     The `sensealg` keyword is ignored. Instead, the differential equations are solved using `Dual` numbers for `u0` and `p`.
     If only `p` is perturbed in the sensitivity analysis, but not `u0`, the state is still implemented as a `Dual` number.
     ForwardDiff.jl will thus not dispatch into continuous forward nor adjoint sensitivity analysis even if a `sensealg` is provided.
-    
+
 ## Equation Scope
 
-SciMLSensitivity.jl supports all the equation types of the 
+SciMLSensitivity.jl supports all the equation types of the
 [SciML Common Interface](https://docs.sciml.ai/SciMLBase/stable/), extending the problem
 types by adding overloads for automatic differentiation to improve the performance
 and flexibility of the differentiation system. This includes:
 
-- Linear systems (`LinearProblem`)
-  - Direct methods for dense and sparse
-  - Iterative solvers with preconditioning
-- Nonlinear Systems (`NonlinearProblem`)
-  - Systems of nonlinear equations
-  - Scalar bracketing systems
-- Integrals (quadrature) (`QuadratureProblem`)
-- Differential Equations
-  - Discrete equations (function maps, discrete stochastic (Gillespie/Markov)
-    simulations) (`DiscreteProblem`)
-  - Ordinary differential equations (ODEs) (`ODEProblem`)
-  - Split and Partitioned ODEs (Symplectic integrators, IMEX Methods) (`SplitODEProblem`)
-  - Stochastic ordinary differential equations (SODEs or SDEs) (`SDEProblem`)
-  - Stochastic differential-algebraic equations (SDAEs) (`SDEProblem` with mass matrices)
-  - Random differential equations (RODEs or RDEs) (`RODEProblem`)
-  - Differential algebraic equations (DAEs) (`DAEProblem` and `ODEProblem` with mass matrices)
-  - Delay differential equations (DDEs) (`DDEProblem`)
-  - Neutral, retarded, and algebraic delay differential equations (NDDEs, RDDEs, and DDAEs)
-  - Stochastic delay differential equations (SDDEs) (`SDDEProblem`)
-  - Experimental support for stochastic neutral, retarded, and algebraic delay differential equations (SNDDEs, SRDDEs, and SDDAEs)
-  - Mixed discrete and continuous equations (Hybrid Equations, Jump Diffusions) (`DEProblem`s with callbacks)
-- Optimization (`OptimizationProblem`)
-  - Nonlinear (constrained) optimization
-- (Stochastic/Delay/Differential-Algebraic) Partial Differential Equations (`PDESystem`)
-  - Finite difference and finite volume methods
-  - Interfaces to finite element methods
-  - Physics-Informed Neural Networks (PINNs)
-  - Integro-Differential Equations
-  - Fractional Differential Equations
+  - Linear systems (`LinearProblem`)
+    
+      + Direct methods for dense and sparse
+      + Iterative solvers with preconditioning
+
+  - Nonlinear Systems (`NonlinearProblem`)
+    
+      + Systems of nonlinear equations
+      + Scalar bracketing systems
+  - Integrals (quadrature) (`QuadratureProblem`)
+  - Differential Equations
+    
+      + Discrete equations (function maps, discrete stochastic (Gillespie/Markov)
+        simulations) (`DiscreteProblem`)
+      + Ordinary differential equations (ODEs) (`ODEProblem`)
+      + Split and Partitioned ODEs (Symplectic integrators, IMEX Methods) (`SplitODEProblem`)
+      + Stochastic ordinary differential equations (SODEs or SDEs) (`SDEProblem`)
+      + Stochastic differential-algebraic equations (SDAEs) (`SDEProblem` with mass matrices)
+      + Random differential equations (RODEs or RDEs) (`RODEProblem`)
+      + Differential algebraic equations (DAEs) (`DAEProblem` and `ODEProblem` with mass matrices)
+      + Delay differential equations (DDEs) (`DDEProblem`)
+      + Neutral, retarded, and algebraic delay differential equations (NDDEs, RDDEs, and DDAEs)
+      + Stochastic delay differential equations (SDDEs) (`SDDEProblem`)
+      + Experimental support for stochastic neutral, retarded, and algebraic delay differential equations (SNDDEs, SRDDEs, and SDDAEs)
+      + Mixed discrete and continuous equations (Hybrid Equations, Jump Diffusions) (`DEProblem`s with callbacks)
+  - Optimization (`OptimizationProblem`)
+    
+      + Nonlinear (constrained) optimization
+  - (Stochastic/Delay/Differential-Algebraic) Partial Differential Equations (`PDESystem`)
+    
+      + Finite difference and finite volume methods
+      + Interfaces to finite element methods
+      + Physics-Informed Neural Networks (PINNs)
+      + Integro-Differential Equations
+      + Fractional Differential Equations
 
 ## SciMLSensitivity and Universal Differential Equations
 
@@ -122,15 +128,15 @@ Learning](https://arxiv.org/abs/2001.04385).
 
 You can efficiently use the package for:
 
-- Parameter estimation of scientific models (ODEs, SDEs, DDEs, DAEs, etc.)
-- Neural ODEs, Neural SDE, Neural DAEs, Neural DDEs, etc.
-- Nonlinear optimal control, including training neural controllers
-- (Stiff) universal ordinary differential equations (universal ODEs)
-- Universal stochastic differential equations (universal SDEs)
-- Universal delay differential equations (universal DDEs)
-- Universal partial differential equations (universal PDEs)
-- Universal jump stochastic differential equations (universal jump diffusions)
-- Hybrid universal differential equations (universal DEs with event handling)
+  - Parameter estimation of scientific models (ODEs, SDEs, DDEs, DAEs, etc.)
+  - Neural ODEs, Neural SDEs, Neural DAEs, Neural DDEs, etc.
+  - Nonlinear optimal control, including training neural controllers
+  - (Stiff) universal ordinary differential equations (universal ODEs)
+  - Universal stochastic differential equations (universal SDEs)
+  - Universal delay differential equations (universal DDEs)
+  - Universal partial differential equations (universal PDEs)
+  - Universal jump stochastic differential equations (universal jump diffusions)
+  - Hybrid universal differential equations (universal DEs with event handling)
 
 with high order, adaptive, implicit, GPU-accelerated, Newton-Krylov, etc.
 methods. For examples, please refer to [the DiffEqFlux release blog
@@ -146,19 +152,21 @@ while use with CPUs uses specialized kernels for accelerating differential equat
 
 Many training techniques are supported by this package, including:
 
-- Optimize-then-discretize (backsolve adjoints, checkpointed adjoints, quadrature adjoints)
-- Discretize-then-optimize (forward and reverse mode discrete sensitivity analysis)
-  - This is a generalization of [ANODE](https://arxiv.org/pdf/1902.10298.pdf) and
-    [ANODEv2](https://arxiv.org/pdf/1906.04596.pdf) to all
-    [DifferentialEquations.jl ODE solvers](https://docs.sciml.ai/DiffEqDocs/stable/solvers/ode_solve/)
-- Hybrid approaches (adaptive time stepping + AD for adaptive discretize-then-optimize)
-- O(1) memory backprop of ODEs via BacksolveAdjoint, and Virtual Brownian Trees for O(1) backprop of SDEs
-- [Continuous adjoints for integral loss functions](@ref continuous_loss)
-- Probabilistic programming and variational inference on ODEs/SDEs/DAEs/DDEs/hybrid
-  equations etc. is provided by integration with [Turing.jl](https://turing.ml/stable/docs/using-turing/)
-  and [Gen.jl](https://github.com/probcomp/Gen.jl). Reproduce
-  [variational loss functions](https://arxiv.org/abs/2001.01328) by plugging
-  [composable libraries together](https://turing.ml/stable/tutorials/09-variational-inference/).
+  - Optimize-then-discretize (backsolve adjoints, checkpointed adjoints, quadrature adjoints)
+
+  - Discretize-then-optimize (forward and reverse mode discrete sensitivity analysis)
+    
+      + This is a generalization of [ANODE](https://arxiv.org/pdf/1902.10298.pdf) and
+        [ANODEv2](https://arxiv.org/pdf/1906.04596.pdf) to all
+        [DifferentialEquations.jl ODE solvers](https://docs.sciml.ai/DiffEqDocs/stable/solvers/ode_solve/)
+  - Hybrid approaches (adaptive time stepping + AD for adaptive discretize-then-optimize)
+  - O(1) memory backprop of ODEs via BacksolveAdjoint, and Virtual Brownian Trees for O(1) backprop of SDEs
+  - [Continuous adjoints for integral loss functions](@ref continuous_loss)
+  - Probabilistic programming and variational inference on ODEs/SDEs/DAEs/DDEs/hybrid
+    equations etc. is provided by integration with [Turing.jl](https://turing.ml/stable/docs/using-turing/)
+    and [Gen.jl](https://github.com/probcomp/Gen.jl). Reproduce
+    [variational loss functions](https://arxiv.org/abs/2001.01328) by plugging
+    [composable libraries together](https://turing.ml/stable/tutorials/09-variational-inference/).
 
 all while mixing forward mode and reverse mode approaches as appropriate for the
 most speed. For more details on the adjoint sensitivity analysis methods for
@@ -166,25 +174,25 @@ computing fast gradients, see the [adjoints details page](@ref sensitivity_diffe
 
 With this package, you can explore various ways to integrate the two methodologies:
 
-- Neural networks can be defined where the “activations” are nonlinear functions
-  described by differential equations
-- Neural networks can be defined where some layers are ODE solves
-- ODEs can be defined where some terms are neural networks
-- Cost functions on ODEs can define neural networks
+  - Neural networks can be defined where the “activations” are nonlinear functions
+    described by differential equations
+  - Neural networks can be defined where some layers are ODE solves
+  - ODEs can be defined where some terms are neural networks
+  - Cost functions on ODEs can define neural networks
 
 ## Note on Modularity and Composability with Solvers
 
-Note that SciMLSensitivity.jl purely built on composable and modular infrastructure. 
+Note that SciMLSensitivity.jl is purely built on composable and modular infrastructure.
 SciMLSensitivity provides high-level helper functions and documentation for the user, but the
 code generation stack is modular and composes in many ways. For example, one can
 use and swap out the ODE solver between any common interface compatible library, like:
 
-- Sundials.jl
-- OrdinaryDiffEq.jl
-- LSODA.jl
-- [IRKGaussLegendre.jl](https://github.com/mikelehu/IRKGaussLegendre.jl)
-- [SciPyDiffEq.jl](https://github.com/SciML/SciPyDiffEq.jl)
-- [… etc. many other choices!](https://docs.sciml.ai/DiffEqDocs/stable/solvers/ode_solve/)
+  - Sundials.jl
+  - OrdinaryDiffEq.jl
+  - LSODA.jl
+  - [IRKGaussLegendre.jl](https://github.com/mikelehu/IRKGaussLegendre.jl)
+  - [SciPyDiffEq.jl](https://github.com/SciML/SciPyDiffEq.jl)
+  - [… etc. many other choices!](https://docs.sciml.ai/DiffEqDocs/stable/solvers/ode_solve/)
 
 In addition, due to the composability of the system, none of the components are directly
 tied to the Flux.jl machine learning framework. For example, you can [use SciMLSensitivity.jl
@@ -193,8 +201,8 @@ to generate TensorFlow graphs and train the neural network with TensorFlow.jl](h
 single line code changes by utilizing the underlying code generation. The tutorials shown here
 are thus mostly a guide on how to use the ecosystem as a whole, only showing a small snippet
 of the possible ways to compose the thousands of differentiable libraries together! Swap out
-ODEs for SDEs, DDEs, DAEs, etc., put quadrature libraries or 
-[Tullio.jl](https://github.com/mcabbott/Tullio.jl) in the loss function, the world is your 
+ODEs for SDEs, DDEs, DAEs, etc., put quadrature libraries or
+[Tullio.jl](https://github.com/mcabbott/Tullio.jl) in the loss function, the world is your
 oyster!
 
 As a proof of composability, note that the implementation of Bayesian neural ODEs required
@@ -203,18 +211,19 @@ Julia packages.
 
 ## Contributing
 
-- Please refer to the
-  [SciML ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://github.com/SciML/ColPrac/blob/master/README.md)
-  for guidance on PRs, issues, and other matters relating to contributing to SciML.
-- See the [SciML Style Guide](https://github.com/SciML/SciMLStyle) for common coding practices and other style decisions.
-- There are a few community forums:
-    - The #diffeq-bridged and #sciml-bridged channels in the
-      [Julia Slack](https://julialang.org/slack/)
-    - The #diffeq-bridged and #sciml-bridged channels in the
-      [Julia Zulip](https://julialang.zulipchat.com/#narrow/stream/279055-sciml-bridged)
-    - On the [Julia Discourse forums](https://discourse.julialang.org)
-    - See also [SciML Community page](https://sciml.ai/community/)
+  - Please refer to the
+    [SciML ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://github.com/SciML/ColPrac/blob/master/README.md)
+    for guidance on PRs, issues, and other matters relating to contributing to SciML.
 
+  - See the [SciML Style Guide](https://github.com/SciML/SciMLStyle) for common coding practices and other style decisions.
+  - There are a few community forums:
+    
+      + The #diffeq-bridged and #sciml-bridged channels in the
+        [Julia Slack](https://julialang.org/slack/)
+      + The #diffeq-bridged and #sciml-bridged channels in the
+        [Julia Zulip](https://julialang.zulipchat.com/#narrow/stream/279055-sciml-bridged)
+      + On the [Julia Discourse forums](https://discourse.julialang.org)
+      + See also [SciML Community page](https://sciml.ai/community/)
 
 ## Citation
 
@@ -230,56 +239,72 @@ If you use SciMLSensitivity.jl or are influenced by its ideas, please cite:
 ```
 
 ## Reproducibility
+
 ```@raw html
 <details><summary>The documentation of this SciML package was built using these direct dependencies,</summary>
 ```
+
 ```@example
 using Pkg # hide
 Pkg.status() # hide
 ```
+
 ```@raw html
 </details>
 ```
+
 ```@raw html
 <details><summary>and using this machine and Julia version.</summary>
 ```
+
 ```@example
 using InteractiveUtils # hide
 versioninfo() # hide
 ```
+
 ```@raw html
 </details>
 ```
+
 ```@raw html
 <details><summary>A more complete overview of all dependencies and their versions is also provided.</summary>
 ```
+
 ```@example
 using Pkg # hide
-Pkg.status(;mode = PKGMODE_MANIFEST) # hide
+Pkg.status(; mode = PKGMODE_MANIFEST) # hide
 ```
+
 ```@raw html
 </details>
 ```
+
 ```@raw html
 You can also download the 
 <a href="
 ```
+
 ```@eval
 using TOML
-version = TOML.parse(read("../../Project.toml",String))["version"]
-name = TOML.parse(read("../../Project.toml",String))["name"]
-link = "https://github.com/SciML/"*name*".jl/tree/gh-pages/v"*version*"/assets/Manifest.toml"
+version = TOML.parse(read("../../Project.toml", String))["version"]
+name = TOML.parse(read("../../Project.toml", String))["name"]
+link = "https://github.com/SciML/" * name * ".jl/tree/gh-pages/v" * version *
+       "/assets/Manifest.toml"
 ```
+
 ```@raw html
 ">manifest</a> file and the
 <a href="
 ```
+
 ```@eval
 using TOML
-version = TOML.parse(read("../../Project.toml",String))["version"]
-name = TOML.parse(read("../../Project.toml",String))["name"]
-link = "https://github.com/SciML/"*name*".jl/tree/gh-pages/v"*version*"/assets/Project.toml"
+version = TOML.parse(read("../../Project.toml", String))["version"]
+name = TOML.parse(read("../../Project.toml", String))["name"]
+link = "https://github.com/SciML/" * name * ".jl/tree/gh-pages/v" * version *
+       "/assets/Project.toml"
 ```
+
 ```@raw html
 ">project</a> file.
-```
\ No newline at end of file
+```
diff --git a/docs/src/manual/differential_equation_sensitivities.md b/docs/src/manual/differential_equation_sensitivities.md
index ae01af309..9297abbe7 100644
--- a/docs/src/manual/differential_equation_sensitivities.md
+++ b/docs/src/manual/differential_equation_sensitivities.md
@@ -1,18 +1,18 @@
 # [Sensitivity Algorithms for Differential Equations with Automatic Differentiation (AD)](@id sensitivity_diffeq)
 
-SciMLSensitivity.jl's high-level interface allows for specifying a 
+SciMLSensitivity.jl's high-level interface allows for specifying a
 sensitivity algorithm (`sensealg`) to control the method by which
 `solve` is differentiated in an automatic differentiation (AD)
-context by a compatible AD library. The underlying algorithms then 
-use the direct interface methods, like `ODEForwardSensitivityProblem` 
-and `adjoint_sensitivities`, to compute the derivatives without 
+context by a compatible AD library. The underlying algorithms then
+use the direct interface methods, like `ODEForwardSensitivityProblem`
+and `adjoint_sensitivities`, to compute the derivatives without
 requiring the user to do any of the setup.
 
 Current AD libraries whose calls are captured by the sensitivity
 system are:
 
-- [Zygote.jl](https://fluxml.ai/Zygote.jl/stable/)
-- [Diffractor.jl](https://github.com/JuliaDiff/Diffractor.jl)
+  - [Zygote.jl](https://fluxml.ai/Zygote.jl/stable/)
+  - [Diffractor.jl](https://github.com/JuliaDiff/Diffractor.jl)
 
 ## Using and Controlling Sensitivity Algorithms within AD
 
@@ -21,15 +21,16 @@ Take for example this simple differential equation solve on Lotka-Volterra:
 ```julia
 using SciMLSensitivity, OrdinaryDiffEq, Zygote
 
-function fiip(du,u,p,t)
-  du[1] = dx = p[1]*u[1] - p[2]*u[1]*u[2]
-  du[2] = dy = -p[3]*u[2] + p[4]*u[1]*u[2]
+function fiip(du, u, p, t)
+    du[1] = dx = p[1] * u[1] - p[2] * u[1] * u[2]
+    du[2] = dy = -p[3] * u[2] + p[4] * u[1] * u[2]
 end
-p = [1.5,1.0,3.0,1.0]; u0 = [1.0;1.0]
-prob = ODEProblem(fiip,u0,(0.0,10.0),p)
-sol = solve(prob,Tsit5())
-loss(u0,p) = sum(solve(prob,Tsit5(),u0=u0,p=p,saveat=0.1))
-du0,dp = Zygote.gradient(loss,u0,p)
+p = [1.5, 1.0, 3.0, 1.0];
+u0 = [1.0; 1.0];
+prob = ODEProblem(fiip, u0, (0.0, 10.0), p)
+sol = solve(prob, Tsit5())
+loss(u0, p) = sum(solve(prob, Tsit5(), u0 = u0, p = p, saveat = 0.1))
+du0, dp = Zygote.gradient(loss, u0, p)
 ```
 
 This will compute the gradient of the loss function "sum of the values of the
@@ -48,8 +49,10 @@ Likewise, the `sensealg` argument can be given to directly control the method
 by which the derivative is computed. For example:
 
 ```julia
-loss(u0,p) = sum(solve(prob,Tsit5(),u0=u0,p=p,saveat=0.1,sensealg=ForwardSensitivity()))
-du0,dp = Zygote.gradient(loss,u0,p)
+function loss(u0, p)
+    sum(solve(prob, Tsit5(), u0 = u0, p = p, saveat = 0.1, sensealg = ForwardSensitivity()))
+end
+du0, dp = Zygote.gradient(loss, u0, p)
 ```
 
 would do reverse-mode automatic differentiation of the loss function, but when reversing
@@ -62,48 +65,50 @@ There are two classes of algorithms: the continuous sensitivity analysis
 methods, and the discrete sensitivity analysis methods (direct automatic
 differentiation). Generally:
 
-- [Continuous sensitivity analysis are more efficient while the discrete 
-  sensitivity analysis is more stable](https://arxiv.org/abs/2001.04385)
-  (full discussion is in the appendix of that paper)
-- Continuous sensitivity analysis methods only support a subset of
-  equations, which currently includes:
-    - ODEProblem (with mass matrices for differential-algebraic equations (DAEs)
-    - SDEProblem
-    - SteadyStateProblem / NonlinearProblem
-- Discrete sensitivity analysis methods only support a subset of algorithms,
-  namely, the pure Julia solvers which are written generically.
+  - [Continuous sensitivity analysis is more efficient while the discrete
+    sensitivity analysis is more stable](https://arxiv.org/abs/2001.04385)
+    (full discussion is in the appendix of that paper)
+
+  - Continuous sensitivity analysis methods only support a subset of
+    equations, which currently includes:
+    
+      + ODEProblem (with mass matrices for differential-algebraic equations (DAEs))
+      + SDEProblem
+      + SteadyStateProblem / NonlinearProblem
+  - Discrete sensitivity analysis methods only support a subset of algorithms,
+    namely, the pure Julia solvers which are written generically.
 
 For an analysis of which methods will be most efficient for computing the
 solution derivatives for a given problem, consult our analysis
 [in this arXiv paper](https://arxiv.org/abs/1812.01892). A general rule of thumb
 is:
 
-- `ForwardDiffSensitivity` is the fastest for differential equations with small
-  numbers of parameters (<100) and can be used on any differential equation
-  solver that is native Julia. If the chosen ODE solver is incompatible
-  with direct automatic differentiation, `ForwardSensitivty` may be used instead.
-- Adjoint sensitivity analysis is the fastest when the number of parameters is
-  sufficiently large. There are three configurations of note. Using
-  `QuadratureAdjoint` is the fastest but uses the most memory, `BacksolveAdjoint`
-  uses the least memory but on very stiff problems it may be unstable and
-  requires many checkpoints, while `InterpolatingAdjoint` is in the middle,
-  allowing checkpointing to control total memory use.
-- The methods which use direct automatic differentiation (`ReverseDiffAdjoint`,
-  `TrackerAdjoint`, `ForwardDiffSensitivity`, and `ZygoteAdjoint`) support
-  the full range of DifferentialEquations.jl features (SDEs, DDEs, events, etc.),
-  but only work on native Julia solvers.
-- For non-ODEs with large numbers of parameters, `TrackerAdjoint` in out-of-place
-  form may be the best performer on GPUs, and `ReverseDiffAdjoint`
-- `TrackerAdjoint` is able to use a `TrackedArray` form with out-of-place
-  functions `du = f(u,p,t)` but requires an `Array{TrackedReal}` form for
-  `f(du,u,p,t)` mutating `du`. The latter has much more overhead, and should be
-  avoided if possible. When solving non-ODEs with lots of parameters, using
-  `TrackerAdjoint` with an out-of-place definition may currently be the best option.
+  - `ForwardDiffSensitivity` is the fastest for differential equations with small
+    numbers of parameters (<100) and can be used on any differential equation
+    solver that is native Julia. If the chosen ODE solver is incompatible
+    with direct automatic differentiation, `ForwardSensitivity` may be used instead.
+  - Adjoint sensitivity analysis is the fastest when the number of parameters is
+    sufficiently large. There are three configurations of note. Using
+    `QuadratureAdjoint` is the fastest but uses the most memory, `BacksolveAdjoint`
+    uses the least memory but on very stiff problems it may be unstable and
+    requires many checkpoints, while `InterpolatingAdjoint` is in the middle,
+    allowing checkpointing to control total memory use.
+  - The methods which use direct automatic differentiation (`ReverseDiffAdjoint`,
+    `TrackerAdjoint`, `ForwardDiffSensitivity`, and `ZygoteAdjoint`) support
+    the full range of DifferentialEquations.jl features (SDEs, DDEs, events, etc.),
+    but only work on native Julia solvers.
+  - For non-ODEs with large numbers of parameters, `TrackerAdjoint` in out-of-place
+    form may be the best performer on GPUs, and `ReverseDiffAdjoint`
+  - `TrackerAdjoint` is able to use a `TrackedArray` form with out-of-place
+    functions `du = f(u,p,t)` but requires an `Array{TrackedReal}` form for
+    `f(du,u,p,t)` mutating `du`. The latter has much more overhead, and should be
+    avoided if possible. When solving non-ODEs with lots of parameters, using
+    `TrackerAdjoint` with an out-of-place definition may currently be the best option.
 
 !!! note
-
+    
     Compatibility with direct automatic differentiation algorithms (`ForwardDiffSensitivity`,
-    `ReverseDiffAdjoint`, etc.) can be queried using the 
+    `ReverseDiffAdjoint`, etc.) can be queried using the
     `SciMLBase.isautodifferentiable(::SciMLAlgorithm)` trait function.
 
 If the chosen algorithm is a continuous sensitivity analysis algorithm, then an `autojacvec`
@@ -113,12 +118,12 @@ is the most efficient, though `autojacvec=false` is slightly less accurate but v
 efficiency. For adjoint methods, it's more complicated and dependent on the way that the user's
 `f` function is implemented:
 
-- `EnzymeVJP()` is the most efficient if it's applicable on your equation.
-- If your function has no branching (no if statements) but uses mutation, `ReverseDiffVJP(true)`
-  will be the most efficient after Enzyme. Otherwise, `ReverseDiffVJP()`, but you may wish to
-  proceed with eliminating mutation as without compilation enabled this can be slow.
-- If you are on the CPU or GPU and your function is very vectorized and has no mutation, choose `ZygoteVJP()`.
-- Else fallback to `TrackerVJP()` if Zygote does not support the function.
+  - `EnzymeVJP()` is the most efficient if it's applicable on your equation.
+  - If your function has no branching (no if statements) but uses mutation, `ReverseDiffVJP(true)`
+    will be the most efficient after Enzyme. Otherwise, `ReverseDiffVJP()`, but you may wish to
+    proceed with eliminating mutation as without compilation enabled this can be slow.
+  - If you are on the CPU or GPU and your function is very vectorized and has no mutation, choose `ZygoteVJP()`.
+  - Else fallback to `TrackerVJP()` if Zygote does not support the function.
 
 ## Special Notes on Non-ODE Differential Equation Problems
 
@@ -150,10 +155,10 @@ differentiation techniques.
 ### Hybrid Equations (Equations with events/callbacks) and Jump Equations
 
 `ForwardDiffSensitivity` can differentiate code with callbacks when `convert_tspan=true`.
-`ForwardSensitivity` is incompatible with hybrid equations. The shadowing methods are 
-incompatible with callbacks. All methods based on discrete adjoint sensitivity analysis 
-via automatic differentiation, like `ReverseDiffAdjoint`, `TrackerAdjoint`, or 
-`QuadratureAdjoint` are fully compatible with events. This applies to ODEs, SDEs, DAEs, 
+`ForwardSensitivity` is incompatible with hybrid equations. The shadowing methods are
+incompatible with callbacks. All methods based on discrete adjoint sensitivity analysis
+via automatic differentiation, like `ReverseDiffAdjoint`, `TrackerAdjoint`, or
+`QuadratureAdjoint` are fully compatible with events. This applies to ODEs, SDEs, DAEs,
 and DDEs. The continuous adjoint sensitivities `BacksolveAdjoint`, `InterpolatingAdjoint`,
 and `QuadratureAdjoint` are compatible with events for ODEs. `BacksolveAdjoint` and
 `InterpolatingAdjoint` can also handle events for SDEs. Use `BacksolveAdjoint` if
@@ -164,12 +169,12 @@ the continuous adjoint sensitivities do not support multiple events per time poi
 
 Note that when defining your differential equation, the vjp can be
 manually overwritten by providing the `AbstractSciMLFunction` definition
-with  a `vjp(u,p,t)` that returns a tuple `f(u,p,t),v->J*v` in the form of 
+with a `vjp(u,p,t)` that returns a tuple `f(u,p,t),v->J*v` in the form of
 [ChainRules.jl](https://www.juliadiff.org/ChainRulesCore.jl/stable/).
 When this is done, the choice of `ZygoteVJP` will utilize your VJP
 function during the internal steps of the adjoint. This is useful for
 models where automatic differentiation may have trouble producing
-optimal code. This can be paired with 
+optimal code. This can be paired with
 [ModelingToolkit.jl](https://docs.sciml.ai/ModelingToolkit/stable/)
 for producing hyper-optimized, sparse, and parallel VJP functions utilizing
 the automated symbolic conversions.
@@ -223,7 +228,7 @@ like in stiff equations, PDE discretizations, and many other contexts,
 so it is not used by default. When training a neural ODE for machine
 learning applications, the user should try `BacksolveAdjoint` and see
 if it is sufficiently accurate on their problem. More details on this
-topic can be found in 
+topic can be found in
 [Stiff Neural Ordinary Differential Equations](https://aip.scitation.org/doi/10.1063/5.0060697)
 
 Note that DiffEqFlux's implementation of `BacksolveAdjoint` includes
diff --git a/docs/src/manual/direct_adjoint_sensitivities.md b/docs/src/manual/direct_adjoint_sensitivities.md
index c274ce649..a64611b42 100644
--- a/docs/src/manual/direct_adjoint_sensitivities.md
+++ b/docs/src/manual/direct_adjoint_sensitivities.md
@@ -11,4 +11,4 @@ adjoint_sensitivities
 ```@docs
 second_order_sensitivities
 second_order_sensitivity_product
-```
\ No newline at end of file
+```
diff --git a/docs/src/manual/direct_forward_sensitivity.md b/docs/src/manual/direct_forward_sensitivity.md
index c5ad72172..3b78119ee 100644
--- a/docs/src/manual/direct_forward_sensitivity.md
+++ b/docs/src/manual/direct_forward_sensitivity.md
@@ -3,4 +3,4 @@
 ```@docs
 ODEForwardSensitivityProblem
 extract_local_sensitivities
-```
\ No newline at end of file
+```
diff --git a/docs/src/manual/nonlinear_solve_sensitivities.md b/docs/src/manual/nonlinear_solve_sensitivities.md
index 178ea0599..dd8be816b 100644
--- a/docs/src/manual/nonlinear_solve_sensitivities.md
+++ b/docs/src/manual/nonlinear_solve_sensitivities.md
@@ -2,4 +2,4 @@
 
 ```@docs
 SteadyStateAdjoint
-```
\ No newline at end of file
+```
diff --git a/docs/src/tutorials/adjoint_continuous_functional.md b/docs/src/tutorials/adjoint_continuous_functional.md
index 4c8808199..e1d5570e1 100644
--- a/docs/src/tutorials/adjoint_continuous_functional.md
+++ b/docs/src/tutorials/adjoint_continuous_functional.md
@@ -39,14 +39,14 @@ straightforward, since one can simply use the fact that the solution from
 ```@example continuousadjoint
 using OrdinaryDiffEq, SciMLSensitivity
 
-function f(du,u,p,t)
-  du[1] = dx = p[1]*u[1] - p[2]*u[1]*u[2]
-  du[2] = dy = -p[3]*u[2] + u[1]*u[2]
+function f(du, u, p, t)
+    du[1] = dx = p[1] * u[1] - p[2] * u[1] * u[2]
+    du[2] = dy = -p[3] * u[2] + u[1] * u[2]
 end
 
-p = [1.5,1.0,3.0]
-prob = ODEForwardSensitivityProblem(f,[1.0;1.0],(0.0,10.0),p)
-sol = solve(prob,DP8())
+p = [1.5, 1.0, 3.0]
+prob = ODEForwardSensitivityProblem(f, [1.0; 1.0], (0.0, 10.0), p)
+sol = solve(prob, DP8())
 ```
 
 gives a continuous solution `sol(t)` with the derivative at each time point. This
@@ -67,24 +67,25 @@ G(u,p)=\int_{0}^{T}\frac{\sum_{i=1}^{n}u_{i}^{2}(t)}{2}dt
 which is:
 
 ```@example continuousadjoint
-g(u,p,t) = (sum(u).^2) ./ 2
+g(u, p, t) = (sum(u) .^ 2) ./ 2
 ```
 
 Notice that the gradient of this function with respect to the state `u` is:
 
 ```@example continuousadjoint
-function dg(out,u,p,t)
-  out[1]= u[1] + u[2]
-  out[2]= u[1] + u[2]
+function dg(out, u, p, t)
+    out[1] = u[1] + u[2]
+    out[2] = u[1] + u[2]
 end
 ```
 
 To get the adjoint sensitivities, we call:
 
 ```@example continuousadjoint
-prob = ODEProblem(f,[1.0;1.0],(0.0,10.0),p)
-sol = solve(prob,DP8())
-res = adjoint_sensitivities(sol,Vern9(),dgdu_continuous=dg,g=g,abstol=1e-8,reltol=1e-8)
+prob = ODEProblem(f, [1.0; 1.0], (0.0, 10.0), p)
+sol = solve(prob, DP8())
+res = adjoint_sensitivities(sol, Vern9(), dgdu_continuous = dg, g = g, abstol = 1e-8,
+                            reltol = 1e-8)
 ```
 
 Notice that we can check this against autodifferentiation and numerical
@@ -93,11 +94,11 @@ differentiation as follows:
 ```@example continuousadjoint
 using QuadGK, ForwardDiff, Calculus
 function G(p)
-  tmp_prob = remake(prob,p=p)
-  sol = solve(tmp_prob,Vern9(),abstol=1e-14,reltol=1e-14)
-  res,err = quadgk((t)-> (sum(sol(t)).^2)./2,0.0,10.0,atol=1e-14,rtol=1e-10)
-  res
+    tmp_prob = remake(prob, p = p)
+    sol = solve(tmp_prob, Vern9(), abstol = 1e-14, reltol = 1e-14)
+    res, err = quadgk((t) -> (sum(sol(t)) .^ 2) ./ 2, 0.0, 10.0, atol = 1e-14, rtol = 1e-10)
+    res
 end
-res2 = ForwardDiff.gradient(G,[1.5,1.0,3.0])
-res3 = Calculus.gradient(G,[1.5,1.0,3.0])
+res2 = ForwardDiff.gradient(G, [1.5, 1.0, 3.0])
+res3 = Calculus.gradient(G, [1.5, 1.0, 3.0])
 ```
diff --git a/docs/src/tutorials/chaotic_ode.md b/docs/src/tutorials/chaotic_ode.md
index 38cacb42d..6610fb033 100644
--- a/docs/src/tutorials/chaotic_ode.md
+++ b/docs/src/tutorials/chaotic_ode.md
@@ -13,6 +13,7 @@ where
 ```math
 \langle g \rangle_T = \frac{1}{T} \int_0^T g(u,p) \text{d}t,
 ```
+
 under the assumption of ergodicity, ``\langle g \rangle_∞`` only depends on `p`.
 
 In the case of chaotic systems, the trajectories diverge with ``O(1)`` error. This
@@ -23,9 +24,9 @@ can be seen, for instance, when solving the [Lorenz system](https://en.wikipedia
 using OrdinaryDiffEq, SciMLSensitivity, Zygote
 
 function lorenz!(du, u, p, t)
-  du[1] = 10 * (u[2] - u[1])
-  du[2] = u[1] * (p[1] - u[3]) - u[2]
-  du[3] = u[1] * u[2] - (8 // 3) * u[3]
+    du[1] = 10 * (u[2] - u[1])
+    du[2] = u[1] * (p[1] - u[3]) - u[2]
+    du[3] = u[1] * u[2] - (8 // 3) * u[3]
 end
 
 p = [28.0]
@@ -35,6 +36,7 @@ prob = ODEProblem(lorenz!, u0, tspan, p)
 sol = solve(prob, Vern9(), abstol = 1e-14, reltol = 1e-14)
 sol2 = solve(prob, Vern9(), abstol = 1e-14 + eps(Float64), reltol = 1e-14)
 ```
+
 ![Chaotic behavior of the Lorenz system](../assets/chaos_eps_pert.png)
 
 More formally, such chaotic behavior can be analyzed using tools from
@@ -66,23 +68,22 @@ the long-time average quantities.
 
 The following `sensealg` choices exist
 
-- `ForwardLSS(;LSSregularizer=TimeDilation(10.0,0.0,0.0),g=nothing,ADKwargs...)`:
-  An implementation of the forward [least square shadowing](https://arxiv.org/abs/1204.0159) method.
-  For `LSSregularizer`, one can choose between two different windowing options,
-  `TimeDilation` (default) with weight `10.0` and `CosWindowing`, and `Cos2Windowing`.
-- `AdjointLSS(;LSSRegularizer=TimeDilation(10.0, 0.0, 0.0),g=nothing,ADKwargs...)`: An
-  implementation of the adjoint-mode [least square shadowing](https://arxiv.org/abs/1204.0159)
-  method. `10.0` controls the weight of the time dilation term in `AdjointLSS`.
-- `NILSS(nseg,nstep;nus=nothing,rng=Xorshifts.Xoroshiro128Plus(rand(UInt64)),g=nothing,ADKwargs...)`:  
-  An implementation of the [non-intrusive least squares shadowing (NILSS)](https://arxiv.org/abs/1611.00880)
-  method. Here, `nseg` is the number of segments, `nstep` is the number of steps per
-  segment, and `nus` is the number of unstable Lyapunov exponents.
-- `NILSAS(nseg,nstep,M=nothing;rng =Xorshifts.Xoroshiro128Plus(rand(UInt64)),
-          adjoint_sensealg=BacksolveAdjoint(autojacvec=ReverseDiffVJP()),g=nothing,ADKwargs...)`:  
-  An implementation of the [non-intrusive least squares adjoint shadowing (NILSAS)](https://arxiv.org/abs/1801.08674)
-  method. `nseg` is the number of segments. `nstep` is the number of steps per
-  segment, `M >= nus + 1` has to be provided, where `nus` is the number of unstable
-  covariant Lyapunov vectors.
+  - `ForwardLSS(;LSSregularizer=TimeDilation(10.0,0.0,0.0),g=nothing,ADKwargs...)`:
+    An implementation of the forward [least square shadowing](https://arxiv.org/abs/1204.0159) method.
+    For `LSSregularizer`, one can choose between three different regularization options:
+    `TimeDilation` (default) with weight `10.0`, `CosWindowing`, and `Cos2Windowing`.
+  - `AdjointLSS(;LSSregularizer=TimeDilation(10.0, 0.0, 0.0),g=nothing,ADKwargs...)`: An
+    implementation of the adjoint-mode [least square shadowing](https://arxiv.org/abs/1204.0159)
+    method. `10.0` controls the weight of the time dilation term in `AdjointLSS`.
+  - `NILSS(nseg,nstep;nus=nothing,rng=Xorshifts.Xoroshiro128Plus(rand(UInt64)),g=nothing,ADKwargs...)`:
+    An implementation of the [non-intrusive least squares shadowing (NILSS)](https://arxiv.org/abs/1611.00880)
+    method. Here, `nseg` is the number of segments, `nstep` is the number of steps per
+    segment, and `nus` is the number of unstable Lyapunov exponents.
+  - `NILSAS(nseg,nstep,M=nothing;rng =Xorshifts.Xoroshiro128Plus(rand(UInt64)), adjoint_sensealg=BacksolveAdjoint(autojacvec=ReverseDiffVJP()),g=nothing,ADKwargs...)`:
+    An implementation of the [non-intrusive least squares adjoint shadowing (NILSAS)](https://arxiv.org/abs/1801.08674)
+    method. `nseg` is the number of segments. `nstep` is the number of steps per
+    segment, `M >= nus + 1` has to be provided, where `nus` is the number of unstable
+    covariant Lyapunov vectors.
 
 Recommendation: Since the computational and memory costs of `NILSS()` scale with
 the number of positive (unstable) Lyapunov, it is typically less expensive than
@@ -94,36 +95,37 @@ as the instantaneous objective, we can use the direct interface by passing `Forw
 as the `sensealg`:
 
 ```@example chaosode
-function lorenz!(du,u,p,t)
-  du[1] = p[1]*(u[2]-u[1])
-  du[2] = u[1]*(p[2]-u[3]) - u[2]
-  du[3] = u[1]*u[2] - p[3]*u[3]
+function lorenz!(du, u, p, t)
+    du[1] = p[1] * (u[2] - u[1])
+    du[2] = u[1] * (p[2] - u[3]) - u[2]
+    du[3] = u[1] * u[2] - p[3] * u[3]
 end
 
-p = [10.0, 28.0, 8/3]
+p = [10.0, 28.0, 8 / 3]
 
-tspan_init = (0.0,30.0)
-tspan_attractor = (30.0,50.0)
+tspan_init = (0.0, 30.0)
+tspan_attractor = (30.0, 50.0)
 u0 = rand(3)
-prob_init = ODEProblem(lorenz!,u0,tspan_init,p)
-sol_init = solve(prob_init,Tsit5())
-prob_attractor = ODEProblem(lorenz!,sol_init[end],tspan_attractor,p)
+prob_init = ODEProblem(lorenz!, u0, tspan_init, p)
+sol_init = solve(prob_init, Tsit5())
+prob_attractor = ODEProblem(lorenz!, sol_init[end], tspan_attractor, p)
 
-g(u,p,t) = u[end]
+g(u, p, t) = u[end]
 
 function G(p)
-  _prob = remake(prob_attractor,p=p)
-  _sol = solve(_prob,Vern9(),abstol=1e-14,reltol=1e-14,saveat=0.01,sensealg=ForwardLSS(g=g))
-  sum(getindex.(_sol.u,3))
+    _prob = remake(prob_attractor, p = p)
+    _sol = solve(_prob, Vern9(), abstol = 1e-14, reltol = 1e-14, saveat = 0.01,
+                 sensealg = ForwardLSS(g = g))
+    sum(getindex.(_sol.u, 3))
 end
-dp1 = Zygote.gradient(p->G(p),p)
+dp1 = Zygote.gradient(p -> G(p), p)
 ```
 
 Alternatively, we can define the `ForwardLSSProblem` and solve it
 via `shadow_forward` as follows:
 
 ```@example chaosode
-sol_attractor = solve(prob_attractor, Vern9(), abstol=1e-14, reltol=1e-14)
-lss_problem = ForwardLSSProblem(sol_attractor, ForwardLSS(g=g))
+sol_attractor = solve(prob_attractor, Vern9(), abstol = 1e-14, reltol = 1e-14)
+lss_problem = ForwardLSSProblem(sol_attractor, ForwardLSS(g = g))
 resfw = shadow_forward(lss_problem)
 ```
diff --git a/docs/src/tutorials/data_parallel.md b/docs/src/tutorials/data_parallel.md
index c442680e4..94bfd05c6 100644
--- a/docs/src/tutorials/data_parallel.md
+++ b/docs/src/tutorials/data_parallel.md
@@ -20,17 +20,17 @@ define an ODE:
 using Lux, DiffEqFlux, DifferentialEquations, CUDA, Random
 rng = Random.default_rng()
 
-dudt = Lux.Chain(Lux.Dense(2,50,tanh),Lux.Dense(50,2))
-p,st = Lux.setup(rng, dudt)
-f(u,p,t) = dudt(u,p,st)[1]
+dudt = Lux.Chain(Lux.Dense(2, 50, tanh), Lux.Dense(50, 2))
+p, st = Lux.setup(rng, dudt)
+f(u, p, t) = dudt(u, p, st)[1]
 ```
 
 and we can solve this ODE where the initial condition is a vector:
 
 ```@example dataparallel
-u0 = Float32[2.; 0.]
-prob = ODEProblem(f,u0,(0f0,1f0),p)
-solve(prob,Tsit5())
+u0 = Float32[2.0; 0.0]
+prob = ODEProblem(f, u0, (0.0f0, 1.0f0), p)
+solve(prob, Tsit5())
 ```
 
 or we can solve this ODE where the initial condition is a matrix, where
@@ -39,8 +39,8 @@ each column is an independent system:
 ```@example dataparallel
 u0 = Float32.([0 1 2
                0 0 0])
-prob = ODEProblem(f,u0,(0f0,1f0),p)
-solve(prob,Tsit5())
+prob = ODEProblem(f, u0, (0.0f0, 1.0f0), p)
+solve(prob, Tsit5())
 ```
 
 On the CPU this will multithread across the system (due to BLAS) and
@@ -51,8 +51,8 @@ GPU:
 ```@example dataparallel
 xs = Float32.([0 1 2
                0 0 0])
-prob = ODEProblem(f,Lux.gpu(u0),(0f0,1f0),Lux.gpu(p))
-solve(prob,Tsit5())
+prob = ODEProblem(f, Lux.gpu(u0), (0.0f0, 1.0f0), Lux.gpu(p))
+solve(prob, Tsit5())
 ```
 
 This method of parallelism is optimal if all the operations are
@@ -86,41 +86,41 @@ Distributed and GPU minibatching are described below.
 using DifferentialEquations, Optimization, OptimizationFlux
 pa = [1.0]
 u0 = [3.0]
-θ = [u0;pa]
+θ = [u0; pa]
 
-function model1(θ,ensemble)
-  prob = ODEProblem((u, p, t) -> 1.01u .* p, [θ[1]], (0.0, 1.0), [θ[2]])
+function model1(θ, ensemble)
+    prob = ODEProblem((u, p, t) -> 1.01u .* p, [θ[1]], (0.0, 1.0), [θ[2]])
 
-  function prob_func(prob, i, repeat)
-    remake(prob, u0 = 0.5 .+ i/100 .* prob.u0)
-  end
+    function prob_func(prob, i, repeat)
+        remake(prob, u0 = 0.5 .+ i / 100 .* prob.u0)
+    end
 
-  ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
-  sim = solve(ensemble_prob, Tsit5(), ensemble, saveat = 0.1, trajectories = 100)
+    ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
+    sim = solve(ensemble_prob, Tsit5(), ensemble, saveat = 0.1, trajectories = 100)
 end
 
 # loss function
-loss_serial(θ)   = sum(abs2,1.0.-Array(model1(θ,EnsembleSerial())))
-loss_threaded(θ) = sum(abs2,1.0.-Array(model1(θ,EnsembleThreads())))
+loss_serial(θ) = sum(abs2, 1.0 .- Array(model1(θ, EnsembleSerial())))
+loss_threaded(θ) = sum(abs2, 1.0 .- Array(model1(θ, EnsembleThreads())))
 
-callback = function (θ,l) # callback function to observe training
-  @show l
-  false
+callback = function (θ, l) # callback function to observe training
+    @show l
+    false
 end
 
 opt = ADAM(0.1)
 l1 = loss_serial(θ)
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_serial(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_serial(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, θ)
 
-res_serial = Optimization.solve(optprob, opt; callback = callback, maxiters=100)
+res_serial = Optimization.solve(optprob, opt; callback = callback, maxiters = 100)
 
-optf2 = Optimization.OptimizationFunction((x,p)->loss_threaded(x), adtype)
+optf2 = Optimization.OptimizationFunction((x, p) -> loss_threaded(x), adtype)
 optprob2 = Optimization.OptimizationProblem(optf2, θ)
 
-res_threads = Optimization.solve(optprob2, opt; callback = callback, maxiters=100)
+res_threads = Optimization.solve(optprob2, opt; callback = callback, maxiters = 100)
 ```
 
 ## Multithreaded Batching In-Depth
@@ -144,7 +144,7 @@ Thus we use the [remake function from the problem interface](https://docs.sciml.
 
 ```@example dataparallel
 function prob_func(prob, i, repeat)
-  remake(prob, u0 = 0.5 .+ i/100 .* prob.u0)
+    remake(prob, u0 = 0.5 .+ i / 100 .* prob.u0)
 end
 ```
 
@@ -198,38 +198,38 @@ using Distributed
 addprocs(4)
 
 @everywhere begin
-  using DifferentialEquations, Optimization, OptimizationFlux
-  function f(u,p,t)
-    1.01u .* p
-  end
+    using DifferentialEquations, Optimization, OptimizationFlux
+    function f(u, p, t)
+        1.01u .* p
+    end
 end
 
 pa = [1.0]
 u0 = [3.0]
-θ = [u0;pa]
+θ = [u0; pa]
 
-function model1(θ,ensemble)
-  prob = ODEProblem(f, [θ[1]], (0.0, 1.0), [θ[2]])
+function model1(θ, ensemble)
+    prob = ODEProblem(f, [θ[1]], (0.0, 1.0), [θ[2]])
 
-  function prob_func(prob, i, repeat)
-    remake(prob, u0 = 0.5 .+ i/100 .* prob.u0)
-  end
+    function prob_func(prob, i, repeat)
+        remake(prob, u0 = 0.5 .+ i / 100 .* prob.u0)
+    end
 
-  ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
-  sim = solve(ensemble_prob, Tsit5(), ensemble, saveat = 0.1, trajectories = 100)
+    ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
+    sim = solve(ensemble_prob, Tsit5(), ensemble, saveat = 0.1, trajectories = 100)
 end
 
-callback = function (θ,l) # callback function to observe training
-  @show l
-  false
+callback = function (θ, l) # callback function to observe training
+    @show l
+    false
 end
 
 opt = ADAM(0.1)
-loss_distributed(θ) = sum(abs2,1.0.-Array(model1(θ,EnsembleDistributed())))
+loss_distributed(θ) = sum(abs2, 1.0 .- Array(model1(θ, EnsembleDistributed())))
 l1 = loss_distributed(θ)
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_distributed(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_distributed(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, θ)
 
 res_distributed = Optimization.solve(optprob, opt; callback = callback, maxiters = 100)
@@ -252,38 +252,36 @@ a GPU:
 
 ```julia
 using DifferentialEquations, Optimization, OptimizationFlux, DiffEqGPU
-function f(du,u,p,t)
-  @inbounds begin
-    du[1] = 1.01 * u[1] * p[1] * p[2]
-  end
+function f(du, u, p, t)
+    @inbounds begin du[1] = 1.01 * u[1] * p[1] * p[2] end
 end
 
 pa = [1.0]
 u0 = [3.0]
-θ = [u0;pa]
+θ = [u0; pa]
 
-function model1(θ,ensemble)
-  prob = ODEProblem(f, [θ[1]], (0.0, 1.0), [θ[2]])
+function model1(θ, ensemble)
+    prob = ODEProblem(f, [θ[1]], (0.0, 1.0), [θ[2]])
 
-  function prob_func(prob, i, repeat)
-    remake(prob, u0 = 0.5 .+ i/100 .* prob.u0)
-  end
+    function prob_func(prob, i, repeat)
+        remake(prob, u0 = 0.5 .+ i / 100 .* prob.u0)
+    end
 
-  ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
-  sim = solve(ensemble_prob, Tsit5(), ensemble, saveat = 0.1, trajectories = 100)
+    ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
+    sim = solve(ensemble_prob, Tsit5(), ensemble, saveat = 0.1, trajectories = 100)
 end
 
-callback = function (θ,l) # callback function to observe training
-  @show l
-  false
+callback = function (θ, l) # callback function to observe training
+    @show l
+    false
 end
 
 opt = ADAM(0.1)
-loss_gpu(θ) = sum(abs2,1.0.-Array(model1(θ,EnsembleGPUArray())))
+loss_gpu(θ) = sum(abs2, 1.0 .- Array(model1(θ, EnsembleGPUArray())))
 l1 = loss_gpu(θ)
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss_gpu(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_gpu(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, θ)
 
 res_gpu = Optimization.solve(optprob, opt; callback = callback, maxiters = 100)
@@ -291,6 +289,6 @@ res_gpu = Optimization.solve(optprob, opt; callback = callback, maxiters = 100)
 
 ## Multi-GPU Batching
 
-DiffEqGPU supports batching across multiple GPUs. See 
+DiffEqGPU supports batching across multiple GPUs. See
 [its README](https://github.com/SciML/DiffEqGPU.jl#setting-up-multi-gpu)
 for details on setting it up.
diff --git a/docs/src/tutorials/direct_sensitivity.md b/docs/src/tutorials/direct_sensitivity.md
index 96f99aa54..6a224adf1 100644
--- a/docs/src/tutorials/direct_sensitivity.md
+++ b/docs/src/tutorials/direct_sensitivity.md
@@ -15,28 +15,28 @@ equations attached to the Lotka-Volterra equations by:
 ```@example directsense
 using OrdinaryDiffEq, SciMLSensitivity
 
-function f(du,u,p,t)
-  du[1] = dx = p[1]*u[1] - p[2]*u[1]*u[2]
-  du[2] = dy = -p[3]*u[2] + u[1]*u[2]
+function f(du, u, p, t)
+    du[1] = dx = p[1] * u[1] - p[2] * u[1] * u[2]
+    du[2] = dy = -p[3] * u[2] + u[1] * u[2]
 end
 
-p = [1.5,1.0,3.0]
-prob = ODEForwardSensitivityProblem(f,[1.0;1.0],(0.0,10.0),p)
+p = [1.5, 1.0, 3.0]
+prob = ODEForwardSensitivityProblem(f, [1.0; 1.0], (0.0, 10.0), p)
 ```
 
 This generates a problem which the ODE solvers can solve:
 
 ```@example directsense
-sol = solve(prob,DP8())
+sol = solve(prob, DP8())
 ```
 
 Note that the solution is the standard ODE system and the sensitivity system combined.
 We can use the following helper functions to extract the sensitivity information:
 
 ```julia
-x,dp = extract_local_sensitivities(sol)
-x,dp = extract_local_sensitivities(sol,i)
-x,dp = extract_local_sensitivities(sol,t)
+x, dp = extract_local_sensitivities(sol)
+x, dp = extract_local_sensitivities(sol, i)
+x, dp = extract_local_sensitivities(sol, t)
 ```
 
 In each case, `x` is the ODE values and `dp` is the matrix of sensitivities
@@ -46,7 +46,7 @@ The second returns the `i`th time step, while the third
 interpolates to calculate the sensitivities at time `t`. For example, if we do:
 
 ```@example directsense
-x,dp = extract_local_sensitivities(sol)
+x, dp = extract_local_sensitivities(sol)
 da = dp[1]
 ```
 
@@ -55,7 +55,7 @@ plot this
 
 ```@example directsense
 using Plots
-plot(sol.t,da',lw=3)
+plot(sol.t, da', lw = 3)
 ```
 
 transposing so that the rows (the timeseries) is plotted.
@@ -72,14 +72,14 @@ cost functional. First, let's solve the ODE and get a high quality continuous
 solution:
 
 ```@example directsense
-function f(du,u,p,t)
-  du[1] = dx = p[1]*u[1] - p[2]*u[1]*u[2]
-  du[2] = dy = -p[3]*u[2] + u[1]*u[2]
+function f(du, u, p, t)
+    du[1] = dx = p[1] * u[1] - p[2] * u[1] * u[2]
+    du[2] = dy = -p[3] * u[2] + u[1] * u[2]
 end
 
-p = [1.5,1.0,3.0]
-prob = ODEProblem(f,[1.0;1.0],(0.0,10.0),p)
-sol = solve(prob,Vern9(),abstol=1e-10,reltol=1e-10)
+p = [1.5, 1.0, 3.0]
+prob = ODEProblem(f, [1.0; 1.0], (0.0, 10.0), p)
+sol = solve(prob, Vern9(), abstol = 1e-10, reltol = 1e-10)
 ```
 
 Now let's calculate the sensitivity of the ``\ell_2`` error against 1 at evenly spaced
@@ -103,7 +103,7 @@ dg_{2}&=1-u_{2} \\
 and thus:
 
 ```@example directsense
-dg(out,u,p,t,i) = (out.=1.0.-u)
+dg(out, u, p, t, i) = (out .= 1.0 .- u)
 ```
 
 Also, we can omit `dgdp`, because the cost function doesn't dependent on `p`.
@@ -112,8 +112,8 @@ sensitivities, call:
 
 ```@example directsense
 ts = 0:0.5:10
-res = adjoint_sensitivities(sol,Vern9(),t=ts,dgdu_discrete=dg,abstol=1e-14,
-                            reltol=1e-14)
+res = adjoint_sensitivities(sol, Vern9(), t = ts, dgdu_discrete = dg, abstol = 1e-14,
+                            reltol = 1e-14)
 ```
 
 This is super high accuracy. As always, there's a tradeoff between accuracy
@@ -121,15 +121,15 @@ and computation time. We can check this almost exactly matches the
 autodifferentiation and numerical differentiation results:
 
 ```@example directsense
-using ForwardDiff,Calculus,ReverseDiff,Tracker
+using ForwardDiff, Calculus, ReverseDiff, Tracker
 function G(p)
-  tmp_prob = remake(prob,u0=convert.(eltype(p),prob.u0),p=p)
-  sol = solve(tmp_prob,Vern9(),abstol=1e-14,reltol=1e-14,saveat=ts,
-              sensealg=SensitivityADPassThrough())
-  A = convert(Array,sol)
-  sum(((1 .- A).^2)./2)
+    tmp_prob = remake(prob, u0 = convert.(eltype(p), prob.u0), p = p)
+    sol = solve(tmp_prob, Vern9(), abstol = 1e-14, reltol = 1e-14, saveat = ts,
+                sensealg = SensitivityADPassThrough())
+    A = convert(Array, sol)
+    sum(((1 .- A) .^ 2) ./ 2)
 end
-res2 = ForwardDiff.gradient(G,[1.5,1.0,3.0])
+res2 = ForwardDiff.gradient(G, [1.5, 1.0, 3.0])
 ```
 
 and see this gives the same values.
diff --git a/docs/src/tutorials/parameter_estimation_ode.md b/docs/src/tutorials/parameter_estimation_ode.md
index d20d7dfd4..b1db7468c 100644
--- a/docs/src/tutorials/parameter_estimation_ode.md
+++ b/docs/src/tutorials/parameter_estimation_ode.md
@@ -6,13 +6,14 @@ If you want to just get things running, try the following! Explanation will
 follow.
 
 ```@example optode_cp
-using DifferentialEquations, Optimization, OptimizationPolyalgorithms, SciMLSensitivity, Zygote, Plots
+using DifferentialEquations, Optimization, OptimizationPolyalgorithms, SciMLSensitivity,
+      Zygote, Plots
 
 function lotka_volterra!(du, u, p, t)
-  x, y = u
-  α, β, δ, γ = p
-  du[1] = dx = α*x - β*x*y
-  du[2] = dy = -δ*y + γ*x*y
+    x, y = u
+    α, β, δ, γ = p
+    du[1] = dx = α * x - β * x * y
+    du[2] = dy = -δ * y + γ * x * y
 end
 
 # Initial condition
@@ -35,22 +36,22 @@ plot(sol)
 savefig("LV_ode.png")
 
 function loss(p)
-  sol = solve(prob, Tsit5(), p=p, saveat = tsteps)
-  loss = sum(abs2, sol.-1)
-  return loss, sol
+    sol = solve(prob, Tsit5(), p = p, saveat = tsteps)
+    loss = sum(abs2, sol .- 1)
+    return loss, sol
 end
 
 callback = function (p, l, pred)
-  display(l)
-  plt = plot(pred, ylim = (0, 6))
-  display(plt)
-  # Tell Optimization.solve to not halt the optimization. If return true, then
-  # optimization stops.
-  return false
+    display(l)
+    plt = plot(pred, ylim = (0, 6))
+    display(plt)
+    # Tell Optimization.solve to not halt the optimization. If return true, then
+    # optimization stops.
+    return false
 end
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, p)
 
 result_ode = Optimization.solve(optprob, PolyOpt(),
@@ -71,14 +72,14 @@ more details, [see the DifferentialEquations.jl documentation](https://docs.scim
 ```
 
 ```@example optode
-using DifferentialEquations, Optimization, OptimizationPolyalgorithms, 
+using DifferentialEquations, Optimization, OptimizationPolyalgorithms,
       SciMLSensitivity, Zygote, Plots
 
 function lotka_volterra!(du, u, p, t)
-  x, y = u
-  α, β, δ, γ = p
-  du[1] = dx = α*x - β*x*y
-  du[2] = dy = -δ*y + γ*x*y
+    x, y = u
+    α, β, δ, γ = p
+    du[1] = dx = α * x - β * x * y
+    du[2] = dy = -δ * y + γ * x * y
 end
 
 # Initial condition
@@ -113,9 +114,9 @@ define our loss as the squared distance from 1.
 
 ```@example optode
 function loss(p)
-  sol = solve(prob, Tsit5(), p=p, saveat = tsteps)
-  loss = sum(abs2, sol.-1)
-  return loss, sol
+    sol = solve(prob, Tsit5(), p = p, saveat = tsteps)
+    loss = sum(abs2, sol .- 1)
+    return loss, sol
 end
 ```
 
@@ -128,12 +129,12 @@ situation:
 
 ```@example optode
 callback = function (p, l, pred)
-  display(l)
-  plt = plot(pred, ylim = (0, 6))
-  display(plt)
-  # Tell Optimization.solve to not halt the optimization. If return true, then
-  # optimization stops.
-  return false
+    display(l)
+    plt = plot(pred, ylim = (0, 6))
+    display(plt)
+    # Tell Optimization.solve to not halt the optimization. If return true, then
+    # optimization stops.
+    return false
 end
 ```
 
@@ -141,7 +142,7 @@ Let's optimize the model.
 
 ```@example optode
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p)->loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 optprob = Optimization.OptimizationProblem(optf, p)
 
 result_ode = Optimization.solve(optprob, PolyOpt(),
@@ -156,7 +157,7 @@ that we solved the control problem and successfully found parameters to make the
 ODE solution constant:
 
 ```@example optode
-remade_solution = solve(remake(prob, p = result_ode.u), Tsit5(),      
+remade_solution = solve(remake(prob, p = result_ode.u), Tsit5(),
                         saveat = tsteps)
 plot(remade_solution, ylim = (0, 6))
 ```
diff --git a/docs/src/tutorials/training_tips/divergence.md b/docs/src/tutorials/training_tips/divergence.md
index 953b89fa3..7dd5724f1 100644
--- a/docs/src/tutorials/training_tips/divergence.md
+++ b/docs/src/tutorials/training_tips/divergence.md
@@ -15,77 +15,77 @@ parameters. This is shown in the loss function:
 
 ```julia
 function loss(p)
-  tmp_prob = remake(prob, p=p)
-  tmp_sol = solve(tmp_prob,Tsit5(),saveat=0.1)
-  if tmp_sol.retcode == ReturnCode.Success
-    return sum(abs2,Array(tmp_sol) - dataset)
-  else
-    return Inf
-  end
+    tmp_prob = remake(prob, p = p)
+    tmp_sol = solve(tmp_prob, Tsit5(), saveat = 0.1)
+    if tmp_sol.retcode == ReturnCode.Success
+        return sum(abs2, Array(tmp_sol) - dataset)
+    else
+        return Inf
+    end
 end
 ```
 
 A full example making use of this trick is:
 
 ```@example divergence
-using DifferentialEquations, SciMLSensitivity, Optimization, OptimizationFlux, OptimizationNLopt, Plots
+using DifferentialEquations, SciMLSensitivity, Optimization, OptimizationFlux,
+      OptimizationNLopt, Plots
 
-function lotka_volterra!(du,u,p,t)
+function lotka_volterra!(du, u, p, t)
     rab, wol = u
-    α,β,γ,δ=p
-    du[1] = drab = α*rab - β*rab*wol
-    du[2] = dwol = γ*rab*wol - δ*wol
+    α, β, γ, δ = p
+    du[1] = drab = α * rab - β * rab * wol
+    du[2] = dwol = γ * rab * wol - δ * wol
     nothing
 end
 
-u0 = [1.0,1.0]
-tspan = (0.0,10.0)
-p = [1.5,1.0,3.0,1.0]
-prob = ODEProblem(lotka_volterra!,u0,tspan,p)
-sol = solve(prob,saveat=0.1)
+u0 = [1.0, 1.0]
+tspan = (0.0, 10.0)
+p = [1.5, 1.0, 3.0, 1.0]
+prob = ODEProblem(lotka_volterra!, u0, tspan, p)
+sol = solve(prob, saveat = 0.1)
 plot(sol)
 
 dataset = Array(sol)
-scatter!(sol.t,dataset')
+scatter!(sol.t, dataset')
 
-tmp_prob = remake(prob, p=[1.2,0.8,2.5,0.8])
+tmp_prob = remake(prob, p = [1.2, 0.8, 2.5, 0.8])
 tmp_sol = solve(tmp_prob)
 plot(tmp_sol)
-scatter!(sol.t,dataset')
+scatter!(sol.t, dataset')
 
 function loss(p)
-  tmp_prob = remake(prob, p=p)
-  tmp_sol = solve(tmp_prob,Tsit5(),saveat=0.1)
-  if tmp_sol.retcode == ReturnCode.Success
-    return sum(abs2,Array(tmp_sol) - dataset)
-  else
-    return Inf
-  end
+    tmp_prob = remake(prob, p = p)
+    tmp_sol = solve(tmp_prob, Tsit5(), saveat = 0.1)
+    if tmp_sol.retcode == ReturnCode.Success
+        return sum(abs2, Array(tmp_sol) - dataset)
+    else
+        return Inf
+    end
 end
 
-
-pinit = [1.2,0.8,2.5,0.8]
+pinit = [1.2, 0.8, 2.5, 0.8]
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p) -> loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, pinit)
-res = Optimization.solve(optprob,ADAM(), maxiters = 1000)
+res = Optimization.solve(optprob, ADAM(), maxiters = 1000)
 
 # res = Optimization.solve(optprob,NLopt.LD_LBFGS(), maxiters = 1000) ### errors!
 ```
 
 You might notice that `AutoZygote` (default) fails for the above `Optimization.solve` call with Optim's optimizers, which happens because
-of Zygote's behavior for zero gradients, in which case it returns `nothing`. To avoid such issues, you can just use a different version of the same check which compares the size of the obtained 
+of Zygote's behavior for zero gradients, in which case it returns `nothing`. To avoid such issues, you can just use a different version of the same check which compares the size of the obtained
 solution and the data we have, shown below, which is easier to AD.
 
 ```julia
 function loss(p)
-  tmp_prob = remake(prob, p=p)
-  tmp_sol = solve(tmp_prob,Tsit5(),saveat=0.1)
-  if size(tmp_sol) == size(dataset)
-    return sum(abs2,Array(tmp_sol) .- dataset)
-  else
-    return Inf
-  end
+    tmp_prob = remake(prob, p = p)
+    tmp_sol = solve(tmp_prob, Tsit5(), saveat = 0.1)
+    if size(tmp_sol) == size(dataset)
+        return sum(abs2, Array(tmp_sol) .- dataset)
+    else
+        return Inf
+    end
 end
 ```
diff --git a/docs/src/tutorials/training_tips/local_minima.md b/docs/src/tutorials/training_tips/local_minima.md
index 52d2eb215..7e4d23d43 100644
--- a/docs/src/tutorials/training_tips/local_minima.md
+++ b/docs/src/tutorials/training_tips/local_minima.md
@@ -3,10 +3,10 @@
 Local minima can be an issue with fitting neural differential equations. However,
 there are many strategies to avoid local minima:
 
-1. Insert stochasticity into the loss function through minibatching
-2. Weigh the loss function to allow for fitting earlier portions first
-3. Iteratively grow the fit
-4. Training the initial conditions and the parameters to start
+ 1. Insert stochasticity into the loss function through minibatching
+ 2. Weight the loss function to allow for fitting earlier portions first
+ 3. Iteratively grow the fit
+ 4. Train the initial conditions and the parameters to start
 
 ## Iterative Growing Of Fits to Reduce Probability of Bad Local Minima
 
@@ -27,13 +27,13 @@ tsteps = range(tspan[1], tspan[2], length = datasize)
 
 function trueODEfunc(du, u, p, t)
     true_A = Float32[-0.1 2.0; -2.0 -0.1]
-    du .= ((u.^3)'true_A)'
+    du .= ((u .^ 3)'true_A)'
 end
 
 prob_trueode = ODEProblem(trueODEfunc, u0, tspan)
 ode_data = Array(solve(prob_trueode, Tsit5(), saveat = tsteps))
 
-dudt2 = Lux.Chain(ActivationFunction(x -> x.^3),
+dudt2 = Lux.Chain(ActivationFunction(x -> x .^ 3),
                   Lux.Dense(2, 16, tanh),
                   Lux.Dense(16, 2))
 
@@ -41,39 +41,39 @@ pinit, st = Lux.setup(rng, dudt2)
 pinit = Lux.ComponentArray(pinit)
 
 function neuralode_f(u, p, t)
-  dudt2(u, p, st)[1]
+    dudt2(u, p, st)[1]
 end
 
 function predict_neuralode(p)
-  prob = ODEProblem(neuralode_f, u0, tspan, p)
-  sol = solve(prob, Vern7(), saveat = tsteps, abstol=1e-6, reltol=1e-6)
-  Array(sol)
+    prob = ODEProblem(neuralode_f, u0, tspan, p)
+    sol = solve(prob, Vern7(), saveat = tsteps, abstol = 1e-6, reltol = 1e-6)
+    Array(sol)
 end
 
 function loss_neuralode(p)
     pred = predict_neuralode(p)
-    loss = sum(abs2, (ode_data[:,1:size(pred,2)] .- pred))
+    loss = sum(abs2, (ode_data[:, 1:size(pred, 2)] .- pred))
     return loss, pred
 end
 
 iter = 0
 callback = function (p, l, pred; doplot = false)
-  global iter
-  iter += 1
-
-  println(l)
-  if doplot
-    # plot current prediction against data
-    plt = scatter(tsteps[1:size(pred,2)], ode_data[1,1:size(pred,2)], label = "data")
-    scatter!(plt, tsteps[1:size(pred,2)], pred[1,:], label = "prediction")
-    display(plot(plt))
-  end
-
-  return false
+    global iter
+    iter += 1
+
+    println(l)
+    if doplot
+        # plot current prediction against data
+        plt = scatter(tsteps[1:size(pred, 2)], ode_data[1, 1:size(pred, 2)], label = "data")
+        scatter!(plt, tsteps[1:size(pred, 2)], pred[1, :], label = "prediction")
+        display(plot(plt))
+    end
+
+    return false
 end
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p) -> loss_neuralode(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_neuralode(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, pinit)
 result_neuralode = Optimization.solve(optprob,
@@ -81,8 +81,8 @@ result_neuralode = Optimization.solve(optprob,
                                       maxiters = 300)
 
 pred = predict_neuralode(result_neuralode.u)
-plt = scatter(tsteps[1:size(pred,2)], ode_data[1,1:size(pred,2)], label = "data")
-scatter!(plt, tsteps[1:size(pred,2)], pred[1,:], label = "prediction")
+plt = scatter(tsteps[1:size(pred, 2)], ode_data[1, 1:size(pred, 2)], label = "data")
+scatter!(plt, tsteps[1:size(pred, 2)], pred[1, :], label = "prediction")
 ```
 
 However, we've now fallen into a trap of a local minimum. If the optimizer changes
@@ -96,22 +96,22 @@ Let's start by reducing the timespan to `(0,1.5)`:
 
 ```@example iterativefit
 function predict_neuralode(p)
-  prob = ODEProblem(neuralode_f, u0, (0.0f0, 1.5f0), p)
-  sol = solve(prob, Vern7(), saveat = tsteps, abstol=1e-6, reltol=1e-6)
-  Array(sol)
+    prob = ODEProblem(neuralode_f, u0, (0.0f0, 1.5f0), p)
+    sol = solve(prob, Vern7(), saveat = tsteps, abstol = 1e-6, reltol = 1e-6)
+    Array(sol)
 end
 
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p) -> loss_neuralode(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss_neuralode(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, pinit)
 result_neuralode2 = Optimization.solve(optprob,
-                                      ADAM(0.05), callback = callback,
-                                      maxiters = 300)
+                                       ADAM(0.05), callback = callback,
+                                       maxiters = 300)
 
 pred = predict_neuralode(result_neuralode2.u)
-plt = scatter(tsteps[1:size(pred,2)], ode_data[1,1:size(pred,2)], label = "data")
-scatter!(plt, tsteps[1:size(pred,2)], pred[1,:], label = "prediction")
+plt = scatter(tsteps[1:size(pred, 2)], ode_data[1, 1:size(pred, 2)], label = "data")
+scatter!(plt, tsteps[1:size(pred, 2)], pred[1, :], label = "prediction")
 ```
 
 This fits beautifully. Now let's grow the timespan and utilize the parameters
@@ -119,19 +119,19 @@ from our `(0,1.5)` fit as the initial condition to our next fit:
 
 ```@example iterativefit
 function predict_neuralode(p)
-  prob = ODEProblem(neuralode_f, u0, (0.0f0, 3.0f0), p)
-  sol = solve(prob, Vern7(), saveat = tsteps, abstol=1e-6, reltol=1e-6)
-  Array(sol)
+    prob = ODEProblem(neuralode_f, u0, (0.0f0, 3.0f0), p)
+    sol = solve(prob, Vern7(), saveat = tsteps, abstol = 1e-6, reltol = 1e-6)
+    Array(sol)
 end
 
 optprob = Optimization.OptimizationProblem(optf, result_neuralode2.u)
 result_neuralode3 = Optimization.solve(optprob,
-                                        ADAM(0.05), maxiters = 300,
-                                        callback = callback)
+                                       ADAM(0.05), maxiters = 300,
+                                       callback = callback)
 
 pred = predict_neuralode(result_neuralode3.u)
-plt = scatter(tsteps[1:size(pred,2)], ode_data[1,1:size(pred,2)], label = "data")
-scatter!(plt, tsteps[1:size(pred,2)], pred[1,:], label = "prediction")
+plt = scatter(tsteps[1:size(pred, 2)], ode_data[1, 1:size(pred, 2)], label = "data")
+scatter!(plt, tsteps[1:size(pred, 2)], pred[1, :], label = "prediction")
 ```
 
 Once again, a great fit. Now we utilize these parameters as the initial condition
@@ -139,19 +139,19 @@ to the full fit:
 
 ```@example iterativefit
 function predict_neuralode(p)
-  prob = ODEProblem(neuralode_f, u0, (0.0f0, 5.0f0), p)
-  sol = solve(prob, Vern7(), saveat = tsteps, abstol=1e-6, reltol=1e-6)
-  Array(sol)
+    prob = ODEProblem(neuralode_f, u0, (0.0f0, 5.0f0), p)
+    sol = solve(prob, Vern7(), saveat = tsteps, abstol = 1e-6, reltol = 1e-6)
+    Array(sol)
 end
 
 optprob = Optimization.OptimizationProblem(optf, result_neuralode3.u)
 result_neuralode4 = Optimization.solve(optprob,
-                                      ADAM(0.01), maxiters = 500,
-                                      callback = callback)
+                                       ADAM(0.01), maxiters = 500,
+                                       callback = callback)
 
 pred = predict_neuralode(result_neuralode4.u)
-plt = scatter(tsteps[1:size(pred,2)], ode_data[1,1:size(pred,2)], label = "data")
-scatter!(plt, tsteps[1:size(pred,2)], pred[1,:], label = "prediction")
+plt = scatter(tsteps[1:size(pred, 2)], ode_data[1, 1:size(pred, 2)], label = "data")
+scatter!(plt, tsteps[1:size(pred, 2)], pred[1, :], label = "prediction")
 ```
 
 ## Training both the initial conditions and the parameters to start
@@ -167,7 +167,6 @@ one could use a mix of (4) and (5), or breaking up the trajectory into chunks an
 ```@example resetic
 using Flux, Plots, DifferentialEquations, SciMLSensitivity
 
-
 #Starting example with tspan (0, 5)
 u0 = Float32[2.0; 0.0]
 datasize = 30
@@ -176,40 +175,39 @@ tsteps = range(tspan[1], tspan[2], length = datasize)
 
 function trueODEfunc(du, u, p, t)
     true_A = [-0.1 2.0; -2.0 -0.1]
-    du .= ((u.^3)'true_A)'
+    du .= ((u .^ 3)'true_A)'
 end
 
 prob_trueode = ODEProblem(trueODEfunc, u0, tspan)
 ode_data = Array(solve(prob_trueode, Tsit5(), saveat = tsteps))
 
 #Using flux here to easily demonstrate the idea, but this can be done with Optimization.solve!
-dudt2 = Chain(Dense(2,16, tanh),
-             Dense(16,2))
-
+dudt2 = Chain(Dense(2, 16, tanh),
+              Dense(16, 2))
 
-p,re = Flux.destructure(dudt2) # use this p as the initial condition!
-dudt(u,p,t) = re(p)(u) # need to restrcture for backprop!
-prob = ODEProblem(dudt,u0,tspan)
+p, re = Flux.destructure(dudt2) # use this p as the initial condition!
+dudt(u, p, t) = re(p)(u) # need to restructure for backprop!
+prob = ODEProblem(dudt, u0, tspan)
 
 function predict_n_ode()
-    Array(solve(prob,u0=u0,p=p, saveat=tsteps))
+    Array(solve(prob, u0 = u0, p = p, saveat = tsteps))
 end
 
 function loss_n_ode()
-      pred = predict_n_ode()
-      sqnorm(x) = sum(abs2, x)
-      loss = sum(abs2,ode_data .- pred)
-      loss
+    pred = predict_n_ode()
+    sqnorm(x) = sum(abs2, x)
+    loss = sum(abs2, ode_data .- pred)
+    loss
 end
 
-function callback(;doplot=true) #callback function to observe training
+function callback(; doplot = true) #callback function to observe training
     pred = predict_n_ode()
-    display(sum(abs2,ode_data .- pred))
+    display(sum(abs2, ode_data .- pred))
     if doplot
-      # plot current prediction against data
-      pl = plot(tsteps,ode_data[1,:],label="data")
-      plot!(pl,tsteps,pred[1,:],label="prediction")
-      display(plot(pl))
+        # plot current prediction against data
+        pl = plot(tsteps, ode_data[1, :], label = "data")
+        plot!(pl, tsteps, pred[1, :], label = "prediction")
+        display(plot(pl))
     end
     return false
 end
@@ -221,7 +219,7 @@ data = Iterators.repeated((), 1000)
 
 #Specify to flux to include both the initial conditions (IC) and parameters of the NODE to train
 Flux.train!(loss_n_ode, Flux.params(u0, p), data,
-                    Flux.Optimise.ADAM(0.05), cb = callback)
+            Flux.Optimise.ADAM(0.05), cb = callback)
 
 #Here we reset the IC back to the original and train only the NODE parameters
 u0 = Float32[2.0; 0.0]
@@ -238,28 +236,23 @@ tsteps = range(tspan[1], tspan[2], length = datasize)
 prob_trueode = ODEProblem(trueODEfunc, u0, tspan)
 ode_data = Array(solve(prob_trueode, Tsit5(), saveat = tsteps))
 
-dudt2 = Chain(Dense(2,16, tanh),
-             Dense(16,2))
-
-p,re = Flux.destructure(dudt2) # use this p as the initial condition!
-dudt(u,p,t) = re(p)(u) # need to restrcture for backprop!
-prob = ODEProblem(dudt,u0,tspan)
-
+dudt2 = Chain(Dense(2, 16, tanh),
+              Dense(16, 2))
 
+p, re = Flux.destructure(dudt2) # use this p as the initial condition!
+dudt(u, p, t) = re(p)(u) # need to restructure for backprop!
+prob = ODEProblem(dudt, u0, tspan)
 
 data = Iterators.repeated((), 1500)
 
 Flux.train!(loss_n_ode, Flux.params(u0, p), data,
-                    Flux.Optimise.ADAM(0.05), cb = callback)
-
-
+            Flux.Optimise.ADAM(0.05), cb = callback)
 
 u0 = Float32[2.0; 0.0]
 Flux.train!(loss_n_ode, Flux.params(p), data,
             Flux.Optimise.ADAM(0.05), cb = callback)
 
 callback()
-
 ```
 
 And there we go, a set of robust strategies for fitting an equation that would otherwise
diff --git a/docs/src/tutorials/training_tips/multiple_nn.md b/docs/src/tutorials/training_tips/multiple_nn.md
index 96a72068f..cf3ecd08f 100644
--- a/docs/src/tutorials/training_tips/multiple_nn.md
+++ b/docs/src/tutorials/training_tips/multiple_nn.md
@@ -10,55 +10,55 @@ The following is a fully working demo on the Fitzhugh-Nagumo ODE:
 using Lux, DiffEqFlux, Optimization, OptimizationNLopt, DifferentialEquations, Random
 
 rng = Random.default_rng()
-Random.seed!(rng,1)
+Random.seed!(rng, 1)
 
-function fitz(du,u,p,t)
-  v,w = u
-  a,b,τinv,l = p
-  du[1] = v - v^3/3 -w + l
-  du[2] = τinv*(v +  a - b*w)
+function fitz(du, u, p, t)
+    v, w = u
+    a, b, τinv, l = p
+    du[1] = v - v^3 / 3 - w + l
+    du[2] = τinv * (v + a - b * w)
 end
 
-p_ = Float32[0.7,0.8,1/12.5,0.5]
-u0 = [1f0;1f0]
-tspan = (0f0,10f0)
-prob = ODEProblem(fitz,u0,tspan,p_)
-sol = solve(prob, Tsit5(), saveat = 0.5 )
+p_ = Float32[0.7, 0.8, 1 / 12.5, 0.5]
+u0 = [1.0f0; 1.0f0]
+tspan = (0.0f0, 10.0f0)
+prob = ODEProblem(fitz, u0, tspan, p_)
+sol = solve(prob, Tsit5(), saveat = 0.5)
 
 # Ideal data
 X = Array(sol)
-Xₙ = X + Float32(1e-3)*randn(eltype(X), size(X))  #noisy data
+Xₙ = X + Float32(1e-3) * randn(eltype(X), size(X))  #noisy data
 
 # For xz term
 NN_1 = Lux.Chain(Lux.Dense(2, 16, tanh), Lux.Dense(16, 1))
-p1,st1 = Lux.setup(rng, NN_1)
+p1, st1 = Lux.setup(rng, NN_1)
 
 # for xy term
 NN_2 = Lux.Chain(Lux.Dense(3, 16, tanh), Lux.Dense(16, 1))
 p2, st2 = Lux.setup(rng, NN_2)
-scaling_factor = 1f0
+scaling_factor = 1.0f0
 
 p1 = Lux.ComponentArray(p1)
 p2 = Lux.ComponentArray(p2)
 
 p = Lux.ComponentArray{eltype(p1)}()
-p = Lux.ComponentArray(p;p1)
-p = Lux.ComponentArray(p;p2)
-p = Lux.ComponentArray(p;scaling_factor)
-
-function dudt_(u,p,t)
-    v,w = u
-    z1 = NN_1([v,w], p.p1, st1)[1]
-    z2 = NN_2([v,w,t], p.p2, st2)[1]
-    [z1[1],p.scaling_factor*z2[1]]
+p = Lux.ComponentArray(p; p1)
+p = Lux.ComponentArray(p; p2)
+p = Lux.ComponentArray(p; scaling_factor)
+
+function dudt_(u, p, t)
+    v, w = u
+    z1 = NN_1([v, w], p.p1, st1)[1]
+    z2 = NN_2([v, w, t], p.p2, st2)[1]
+    [z1[1], p.scaling_factor * z2[1]]
 end
-prob_nn = ODEProblem(dudt_,u0, tspan, p)
-sol_nn = solve(prob_nn, Tsit5(),saveat = sol.t)
+prob_nn = ODEProblem(dudt_, u0, tspan, p)
+sol_nn = solve(prob_nn, Tsit5(), saveat = sol.t)
 
 function predict(θ)
-    Array(solve(prob_nn, Vern7(), p=θ, saveat = sol.t,
-                         abstol=1e-6, reltol=1e-6,
-                         sensealg = InterpolatingAdjoint(autojacvec=ReverseDiffVJP(true))))
+    Array(solve(prob_nn, Vern7(), p = θ, saveat = sol.t,
+                abstol = 1e-6, reltol = 1e-6,
+                sensealg = InterpolatingAdjoint(autojacvec = ReverseDiffVJP(true))))
 end
 
 # No regularisation right now
@@ -68,21 +68,22 @@ function loss(θ)
 end
 loss(p)
 const losses = []
-callback(θ,l,pred) = begin
+callback(θ, l, pred) = begin
     push!(losses, l)
-    if length(losses)%50==0
+    if length(losses) % 50 == 0
         println(losses[end])
     end
     false
 end
 adtype = Optimization.AutoZygote()
-optf = Optimization.OptimizationFunction((x,p) -> loss(x), adtype)
+optf = Optimization.OptimizationFunction((x, p) -> loss(x), adtype)
 
 optprob = Optimization.OptimizationProblem(optf, p)
-res1_uode = Optimization.solve(optprob, ADAM(0.01), callback=callback, maxiters = 500)
+res1_uode = Optimization.solve(optprob, ADAM(0.01), callback = callback, maxiters = 500)
 
 optprob2 = Optimization.OptimizationProblem(optf, res1_uode.u)
-res2_uode = Optimization.solve(optprob2, NLopt.LD_LBFGS(), maxiters = 10000, callback = callback)
+res2_uode = Optimization.solve(optprob2, NLopt.LD_LBFGS(), maxiters = 10000,
+                               callback = callback)
 ```
 
 The key is that `Optimization.solve` acts on a single parameter vector `p`.
diff --git a/src/derivative_wrappers.jl b/src/derivative_wrappers.jl
index 2215c1a9b..14f50c382 100644
--- a/src/derivative_wrappers.jl
+++ b/src/derivative_wrappers.jl
@@ -175,7 +175,7 @@ function gradient!(df::AbstractArray{<:Number}, f,
 end
 
 """
-  jacobianvec!(Jv, f, x, v, alg, (buffer, seed)) -> nothing
+jacobianvec!(Jv, f, x, v, alg, (buffer, seed)) -> nothing
 
 ``Jv <- J(f(x))v``
 """
diff --git a/src/forward_sensitivity.jl b/src/forward_sensitivity.jl
index 3c70e123f..729f1a1cd 100644
--- a/src/forward_sensitivity.jl
+++ b/src/forward_sensitivity.jl
@@ -495,9 +495,10 @@ Extracts the time series for the local sensitivities from the ODE solution. This
 that the ODE was defined via `ODEForwardSensitivityProblem`.
 
 ```julia
-extract_local_sensitivities(sol, asmatrix::Val=Val(false)) # Decompose the entire time series
-extract_local_sensitivities(sol, i::Integer, asmatrix::Val=Val(false)) # Decompose sol[i]
-extract_local_sensitivities(sol, t::Union{Number,AbstractVector}, asmatrix::Val=Val(false)) # Decompose sol(t)
+extract_local_sensitivities(sol, asmatrix::Val = Val(false)) # Decompose the entire time series
+extract_local_sensitivities(sol, i::Integer, asmatrix::Val = Val(false)) # Decompose sol[i]
+extract_local_sensitivities(sol, t::Union{Number, AbstractVector},
+                            asmatrix::Val = Val(false)) # Decompose sol(t)
 ```
 """
 function extract_local_sensitivities(sol, asmatrix::Val = Val(false))
diff --git a/src/sensitivity_algorithms.jl b/src/sensitivity_algorithms.jl
index 665b618ee..660354391 100644
--- a/src/sensitivity_algorithms.jl
+++ b/src/sensitivity_algorithms.jl
@@ -16,7 +16,7 @@ abstract type AbstractShadowingSensitivityAlgorithm{CS, AD, FDT} <:
 
 """
 ```julia
-ForwardSensitivity{CS,AD,FDT} <: AbstractForwardSensitivityAlgorithm{CS,AD,FDT}
+ForwardSensitivity{CS, AD, FDT} <: AbstractForwardSensitivityAlgorithm{CS, AD, FDT}
 ```
 
 An implementation of continuous forward sensitivity analysis for propagating
@@ -27,37 +27,37 @@ within the reverse-mode automatic differentiation environment.
 ## Constructor
 
 ```julia
-function ForwardSensitivity(;
-                            chunk_size=0,autodiff=true,
-                            diff_type=Val{:central},
-                            autojacvec=autodiff,
-                            autojacmat=false)
+ForwardSensitivity(;
+                   chunk_size = 0, autodiff = true,
+                   diff_type = Val{:central},
+                   autojacvec = autodiff,
+                   autojacmat = false)
 ```
 
 ## Keyword Arguments
 
-* `autodiff`: Use automatic differentiation in the internal sensitivity algorithm
-  computations. Default is `true`.
-* `chunk_size`: Chunk size for forward mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `autojacvec`: Calculate the Jacobian-vector product via automatic
-  differentiation with special seeding.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
+  - `autodiff`: Use automatic differentiation in the internal sensitivity algorithm
+    computations. Default is `true`.
+  - `chunk_size`: Chunk size for forward mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `autojacvec`: Calculate the Jacobian-vector product via automatic
+    differentiation with special seeding.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
 
 Further details:
 
-- If `autodiff=true` and `autojacvec=true`, then the one chunk `J*v` forward-mode
-  directional derivative calculation trick is used to compute the product without
-  constructing the Jacobian (via ForwardDiff.jl).
-- If `autodiff=false` and `autojacvec=true`, then the numerical direction derivative
-  trick `(f(x+epsilon*v)-f(x))/epsilon` is used to compute `J*v` without constructing
-  the Jacobian.
-- If `autodiff=true` and `autojacvec=false`, then the Jacobian is constructed via
-  chunked forward-mode automatic differentiation (via ForwardDiff.jl).
-- If `autodiff=false` and `autojacvec=false`, then the Jacobian is constructed via
-  finite differences via FiniteDiff.jl.
+  - If `autodiff=true` and `autojacvec=true`, then the one chunk `J*v` forward-mode
+    directional derivative calculation trick is used to compute the product without
+    constructing the Jacobian (via ForwardDiff.jl).
+  - If `autodiff=false` and `autojacvec=true`, then the numerical directional derivative
+    trick `(f(x+epsilon*v)-f(x))/epsilon` is used to compute `J*v` without constructing
+    the Jacobian.
+  - If `autodiff=true` and `autojacvec=false`, then the Jacobian is constructed via
+    chunked forward-mode automatic differentiation (via ForwardDiff.jl).
+  - If `autodiff=false` and `autojacvec=false`, then the Jacobian is constructed via
+    finite differences via FiniteDiff.jl.
 
 ## SciMLProblem Support
 
@@ -80,7 +80,7 @@ end
 
 """
 ```julia
-ForwardDiffSensitivity{CS,CTS} <: AbstractForwardSensitivityAlgorithm{CS,Nothing,Nothing}
+ForwardDiffSensitivity{CS, CTS} <: AbstractForwardSensitivityAlgorithm{CS, Nothing, Nothing}
 ```
 
 An implementation of discrete forward sensitivity analysis through ForwardDiff.jl.
@@ -91,16 +91,16 @@ environment.
 ## Constructor
 
 ```julia
-ForwardDiffSensitivity(;chunk_size=0,convert_tspan=nothing)
+ForwardDiffSensitivity(; chunk_size = 0, convert_tspan = nothing)
 ```
 
 ## Keyword Arguments
 
-* `chunk_size`: the chunk size used by ForwardDiff for computing the Jacobian, i.e. the
-  number of simultaneous columns computed.
-* `convert_tspan`: whether to convert time to also be `Dual` valued. By default this is
-  `nothing` which will only convert if callbacks are found. Conversion is required in order
-  to accurately differentiate callbacks (hybrid equations).
+  - `chunk_size`: the chunk size used by ForwardDiff for computing the Jacobian, i.e. the
+    number of simultaneous columns computed.
+  - `convert_tspan`: whether to convert time to also be `Dual` valued. By default this is
+    `nothing` which will only convert if callbacks are found. Conversion is required in order
+    to accurately differentiate callbacks (hybrid equations).
 
 ## SciMLProblem Support
 
@@ -116,7 +116,7 @@ end
 
 """
 ```julia
-BacksolveAdjoint{CS,AD,FDT,VJP} <: AbstractAdjointSensitivityAlgorithm{CS,AD,FDT}
+BacksolveAdjoint{CS, AD, FDT, VJP} <: AbstractAdjointSensitivityAlgorithm{CS, AD, FDT}
 ```
 
 An implementation of adjoint sensitivity analysis using a backwards solution of the ODE.
@@ -127,44 +127,48 @@ stabilization is included for additional numerical stability over the naive impl
 ## Constructor
 
 ```julia
-BacksolveAdjoint(;chunk_size=0,autodiff=true,
-                  diff_type=Val{:central},
-                  autojacvec=nothing,
-                  checkpointing=true, noisemixing=false)
+BacksolveAdjoint(; chunk_size = 0, autodiff = true,
+                 diff_type = Val{:central},
+                 autojacvec = nothing,
+                 checkpointing = true, noisemixing = false)
 ```
 
 ## Keyword Arguments
 
-* `autodiff`: Use automatic differentiation for constructing the Jacobian
-  if the Jacobian needs to be constructed.  Defaults to `true`.
-* `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
-* `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
-  differentiation with special seeding. The default is `true`. The total set
-  of choices are:
-    - `false`: the Jacobian is constructed via FiniteDiff.jl
-    - `true`: the Jacobian is constructed via ForwardDiff.jl
-    - `TrackerVJP`: Uses Tracker.jl for the vjp.
-    - `ZygoteVJP`: Uses Zygote.jl for the vjp.
-    - `EnzymeVJP`: Uses Enzyme.jl for the vjp.
-    - `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
-      is a boolean for whether to precompile the tape, which should only be done
-      if there are no branches (`if` or `while` statements) in the `f` function.
-* `checkpointing`: whether checkpointing is enabled for the reverse pass. Defaults
-  to `true`.
-* `noisemixing`: Handle noise processes that are not of the form `du[i] = f(u[i])`.
-  For example, to compute the sensitivities of an SDE with diagonal diffusion
-  ```julia
-  function g_mixing!(du,u,p,t)
-    du[1] = p[3]*u[1] + p[4]*u[2]
-    du[2] = p[3]*u[1] + p[4]*u[2]
-    nothing
-  end
-  ```
-  correctly, `noisemixing=true` must be enabled. The default is `false`.
+  - `autodiff`: Use automatic differentiation for constructing the Jacobian
+    if the Jacobian needs to be constructed.  Defaults to `true`.
+
+  - `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
+  - `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
+    differentiation with special seeding. The default is `true`. The total set
+    of choices are:
+
+      + `false`: the Jacobian is constructed via FiniteDiff.jl
+      + `true`: the Jacobian is constructed via ForwardDiff.jl
+      + `TrackerVJP`: Uses Tracker.jl for the vjp.
+      + `ZygoteVJP`: Uses Zygote.jl for the vjp.
+      + `EnzymeVJP`: Uses Enzyme.jl for the vjp.
+      + `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
+        is a boolean for whether to precompile the tape, which should only be done
+        if there are no branches (`if` or `while` statements) in the `f` function.
+  - `checkpointing`: whether checkpointing is enabled for the reverse pass. Defaults
+    to `true`.
+  - `noisemixing`: Handle noise processes that are not of the form `du[i] = f(u[i])`.
+    For example, to compute the sensitivities of an SDE with diagonal diffusion
+
+    ```julia
+    function g_mixing!(du, u, p, t)
+        du[1] = p[3] * u[1] + p[4] * u[2]
+        du[2] = p[3] * u[1] + p[4] * u[2]
+        nothing
+    end
+    ```
+
+    correctly, `noisemixing=true` must be enabled. The default is `false`.
 
 For more details on the vjp choices, please consult the sensitivity algorithms
 documentation page or the docstrings of the vjp types.
@@ -180,18 +184,18 @@ from the forward solution. As a quick demonstration:
 
 ```julia
 using Sundials
-function lorenz(du,u,p,t)
- du[1] = 10.0*(u[2]-u[1])
- du[2] = u[1]*(28.0-u[3]) - u[2]
- du[3] = u[1]*u[2] - (8/3)*u[3]
+function lorenz(du, u, p, t)
+    du[1] = 10.0 * (u[2] - u[1])
+    du[2] = u[1] * (28.0 - u[3]) - u[2]
+    du[3] = u[1] * u[2] - (8 / 3) * u[3]
 end
-u0 = [1.0;0.0;0.0]
-tspan = (0.0,100.0)
-prob = ODEProblem(lorenz,u0,tspan)
-sol = solve(prob,Tsit5(),reltol=1e-12,abstol=1e-12)
-prob2 = ODEProblem(lorenz,sol[end],(100.0,0.0))
-sol = solve(prob,Tsit5(),reltol=1e-12,abstol=1e-12)
-@show sol[end]-u0 #[-3.22091, -1.49394, 21.3435]
+u0 = [1.0; 0.0; 0.0]
+tspan = (0.0, 100.0)
+prob = ODEProblem(lorenz, u0, tspan)
+sol = solve(prob, Tsit5(), reltol = 1e-12, abstol = 1e-12)
+prob2 = ODEProblem(lorenz, sol[end], (100.0, 0.0))
+sol = solve(prob2, Tsit5(), reltol = 1e-12, abstol = 1e-12)
+@show sol[end] - u0 #[-3.22091, -1.49394, 21.3435]
 ```
 
 Thus, one should check the stability of the backsolve on their type of problem before
@@ -216,40 +220,40 @@ callback functions (events).
 ## References
 
 ODE:
- Rackauckas, C. and Ma, Y. and Martensen, J. and Warner, C. and Zubov, K. and Supekar,
- R. and Skinner, D. and Ramadhana, A. and Edelman, A., Universal Differential Equations
- for Scientific Machine Learning,	arXiv:2001.04385
+Rackauckas, C. and Ma, Y. and Martensen, J. and Warner, C. and Zubov, K. and Supekar,
+R. and Skinner, D. and Ramadhana, A. and Edelman, A., Universal Differential Equations
+for Scientific Machine Learning, arXiv:2001.04385
 
- Hindmarsh, A. C. and Brown, P. N. and Grant, K. E. and Lee, S. L. and Serban, R.
- and Shumaker, D. E. and Woodward, C. S., SUNDIALS: Suite of nonlinear and
- differential/algebraic equation solvers, ACM Transactions on Mathematical
- Software (TOMS), 31, pp:363–396 (2005)
+Hindmarsh, A. C. and Brown, P. N. and Grant, K. E. and Lee, S. L. and Serban, R.
+and Shumaker, D. E. and Woodward, C. S., SUNDIALS: Suite of nonlinear and
+differential/algebraic equation solvers, ACM Transactions on Mathematical
+Software (TOMS), 31, pp:363–396 (2005)
 
- Chen, R.T.Q. and Rubanova, Y. and Bettencourt, J. and Duvenaud, D. K.,
- Neural ordinary differential equations. In Advances in neural information processing
- systems, pp. 6571–6583 (2018)
+Chen, R.T.Q. and Rubanova, Y. and Bettencourt, J. and Duvenaud, D. K.,
+Neural ordinary differential equations. In Advances in neural information processing
+systems, pp. 6571–6583 (2018)
 
- Pontryagin, L. S. and Mishchenko, E.F. and Boltyanskii, V.G. and Gamkrelidze, R.V.
- The mathematical theory of optimal processes. Routledge, (1962)
+Pontryagin, L. S. and Mishchenko, E.F. and Boltyanskii, V.G. and Gamkrelidze, R.V.
+The mathematical theory of optimal processes. Routledge, (1962)
 
- Rackauckas, C. and Ma, Y. and Dixit, V. and Guo, X. and Innes, M. and Revels, J.
- and Nyberg, J. and Ivaturi, V., A comparison of automatic differentiation and
- continuous sensitivity analysis for derivatives of differential equation solutions,
- arXiv:1812.01892
+Rackauckas, C. and Ma, Y. and Dixit, V. and Guo, X. and Innes, M. and Revels, J.
+and Nyberg, J. and Ivaturi, V., A comparison of automatic differentiation and
+continuous sensitivity analysis for derivatives of differential equation solutions,
+arXiv:1812.01892
 
 DAE:
- Cao, Y. and Li, S. and Petzold, L. and Serban, R., Adjoint sensitivity analysis
- for differential-algebraic equations: The adjoint DAE system and its numerical
- solution, SIAM journal on scientific computing 24 pp: 1076-1089 (2003)
+Cao, Y. and Li, S. and Petzold, L. and Serban, R., Adjoint sensitivity analysis
+for differential-algebraic equations: The adjoint DAE system and its numerical
+solution, SIAM journal on scientific computing 24 pp: 1076-1089 (2003)
 
 SDE:
- Gobet, E. and Munos, R., Sensitivity Analysis Using Ito-Malliavin Calculus and
- Martingales, and Application to Stochastic Optimal Control,
- SIAM Journal on control and optimization, 43, pp. 1676-1713 (2005)
+Gobet, E. and Munos, R., Sensitivity Analysis Using Ito-Malliavin Calculus and
+Martingales, and Application to Stochastic Optimal Control,
+SIAM Journal on control and optimization, 43, pp. 1676-1713 (2005)
 
- Li, X. and Wong, T.-K. L.and Chen, R. T. Q. and Duvenaud, D.,
- Scalable Gradients for Stochastic Differential Equations,
- PMLR 108, pp. 3870-3882 (2020), http://proceedings.mlr.press/v108/li20i.html
+Li, X. and Wong, T.-K. L. and Chen, R. T. Q. and Duvenaud, D.,
+Scalable Gradients for Stochastic Differential Equations,
+PMLR 108, pp. 3870-3882 (2020), http://proceedings.mlr.press/v108/li20i.html
 """
 struct BacksolveAdjoint{CS, AD, FDT, VJP} <:
        AbstractAdjointSensitivityAlgorithm{CS, AD, FDT}
@@ -272,7 +276,7 @@ end
 
 """
 ```julia
-InterpolatingAdjoint{CS,AD,FDT,VJP} <: AbstractAdjointSensitivityAlgorithm{CS,AD,FDT}
+InterpolatingAdjoint{CS, AD, FDT, VJP} <: AbstractAdjointSensitivityAlgorithm{CS, AD, FDT}
 ```
 
 An implementation of adjoint sensitivity analysis which uses the interpolation of
@@ -284,44 +288,48 @@ enabled, it will only require the memory to interpolate between checkpoints.
 ## Constructor
 
 ```julia
-function InterpolatingAdjoint(;chunk_size=0,autodiff=true,
-                               diff_type=Val{:central},
-                               autojacvec=nothing,
-                               checkpointing=false, noisemixing=false)
+InterpolatingAdjoint(; chunk_size = 0, autodiff = true,
+                     diff_type = Val{:central},
+                     autojacvec = nothing,
+                     checkpointing = false, noisemixing = false)
 ```
 
 ## Keyword Arguments
 
-* `autodiff`: Use automatic differentiation for constructing the Jacobian
-  if the Jacobian needs to be constructed.  Defaults to `true`.
-* `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
-* `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
-  differentiation with special seeding. The default is `true`. The total set
-  of choices are:
-    - `false`: the Jacobian is constructed via FiniteDiff.jl
-    - `true`: the Jacobian is constructed via ForwardDiff.jl
-    - `TrackerVJP`: Uses Tracker.jl for the vjp.
-    - `ZygoteVJP`: Uses Zygote.jl for the vjp.
-    - `EnzymeVJP`: Uses Enzyme.jl for the vjp.
-    - `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
-      is a boolean for whether to precompile the tape, which should only be done
-      if there are no branches (`if` or `while` statements) in the `f` function.
-* `checkpointing`: whether checkpointing is enabled for the reverse pass. Defaults
-  to `false`.
-* `noisemixing`: Handle noise processes that are not of the form `du[i] = f(u[i])`.
-  For example, to compute the sensitivities of an SDE with diagonal diffusion
-  ```julia
-  function g_mixing!(du,u,p,t)
-    du[1] = p[3]*u[1] + p[4]*u[2]
-    du[2] = p[3]*u[1] + p[4]*u[2]
-    nothing
-  end
-  ```
-  correctly, `noisemixing=true` must be enabled. The default is `false`.
+  - `autodiff`: Use automatic differentiation for constructing the Jacobian
+    if the Jacobian needs to be constructed.  Defaults to `true`.
+
+  - `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
+  - `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
+    differentiation with special seeding. The default is `true`. The total set
+    of choices are:
+
+      + `false`: the Jacobian is constructed via FiniteDiff.jl
+      + `true`: the Jacobian is constructed via ForwardDiff.jl
+      + `TrackerVJP`: Uses Tracker.jl for the vjp.
+      + `ZygoteVJP`: Uses Zygote.jl for the vjp.
+      + `EnzymeVJP`: Uses Enzyme.jl for the vjp.
+      + `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
+        is a boolean for whether to precompile the tape, which should only be done
+        if there are no branches (`if` or `while` statements) in the `f` function.
+  - `checkpointing`: whether checkpointing is enabled for the reverse pass. Defaults
+    to `false`.
+  - `noisemixing`: Handle noise processes that are not of the form `du[i] = f(u[i])`.
+    For example, to compute the sensitivities of an SDE with diagonal diffusion
+
+    ```julia
+    function g_mixing!(du, u, p, t)
+        du[1] = p[3] * u[1] + p[4] * u[2]
+        du[2] = p[3] * u[1] + p[4] * u[2]
+        nothing
+    end
+    ```
+
+    correctly, `noisemixing=true` must be enabled. The default is `false`.
 
 For more details on the vjp choices, please consult the sensitivity algorithms
 documentation page or the docstrings of the vjp types.
@@ -343,19 +351,19 @@ supports callbacks (events).
 
 ## References
 
- Rackauckas, C. and Ma, Y. and Martensen, J. and Warner, C. and Zubov, K. and Supekar,
- R. and Skinner, D. and Ramadhana, A. and Edelman, A., Universal Differential Equations
- for Scientific Machine Learning,	arXiv:2001.04385
+Rackauckas, C. and Ma, Y. and Martensen, J. and Warner, C. and Zubov, K. and Supekar,
+R. and Skinner, D. and Ramadhana, A. and Edelman, A., Universal Differential Equations
+for Scientific Machine Learning, arXiv:2001.04385
 
- Hindmarsh, A. C. and Brown, P. N. and Grant, K. E. and Lee, S. L. and Serban, R.
- and Shumaker, D. E. and Woodward, C. S., SUNDIALS: Suite of nonlinear and
- differential/algebraic equation solvers, ACM Transactions on Mathematical
- Software (TOMS), 31, pp:363–396 (2005)
+Hindmarsh, A. C. and Brown, P. N. and Grant, K. E. and Lee, S. L. and Serban, R.
+and Shumaker, D. E. and Woodward, C. S., SUNDIALS: Suite of nonlinear and
+differential/algebraic equation solvers, ACM Transactions on Mathematical
+Software (TOMS), 31, pp:363–396 (2005)
 
- Rackauckas, C. and Ma, Y. and Dixit, V. and Guo, X. and Innes, M. and Revels, J.
- and Nyberg, J. and Ivaturi, V., A comparison of automatic differentiation and
- continuous sensitivity analysis for derivatives of differential equation solutions,
- arXiv:1812.01892
+Rackauckas, C. and Ma, Y. and Dixit, V. and Guo, X. and Innes, M. and Revels, J.
+and Nyberg, J. and Ivaturi, V., A comparison of automatic differentiation and
+continuous sensitivity analysis for derivatives of differential equation solutions,
+arXiv:1812.01892
 """
 struct InterpolatingAdjoint{CS, AD, FDT, VJP} <:
        AbstractAdjointSensitivityAlgorithm{CS, AD, FDT}
@@ -379,7 +387,7 @@ end
 
 """
 ```julia
-QuadratureAdjoint{CS,AD,FDT,VJP} <: AbstractAdjointSensitivityAlgorithm{CS,AD,FDT}
+QuadratureAdjoint{CS, AD, FDT, VJP} <: AbstractAdjointSensitivityAlgorithm{CS, AD, FDT}
 ```
 
 An implementation of adjoint sensitivity analysis which develops a full
@@ -396,34 +404,36 @@ pass and is thus memory intensive.
 ## Constructor
 
 ```julia
-function QuadratureAdjoint(;chunk_size=0,autodiff=true,
-                            diff_type=Val{:central},
-                            autojacvec=nothing,abstol=1e-6,
-                            reltol=1e-3)
+QuadratureAdjoint(; chunk_size = 0, autodiff = true,
+                  diff_type = Val{:central},
+                  autojacvec = nothing, abstol = 1e-6,
+                  reltol = 1e-3)
 ```
 
 ## Keyword Arguments
 
-* `autodiff`: Use automatic differentiation for constructing the Jacobian
-  if the Jacobian needs to be constructed.  Defaults to `true`.
-* `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
-* `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
-  differentiation with special seeding. The default is `true`. The total set
-  of choices are:
-    - `false`: the Jacobian is constructed via FiniteDiff.jl
-    - `true`: the Jacobian is constructed via ForwardDiff.jl
-    - `TrackerVJP`: Uses Tracker.jl for the vjp.
-    - `ZygoteVJP`: Uses Zygote.jl for the vjp.
-    - `EnzymeVJP`: Uses Enzyme.jl for the vjp.
-    - `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
-      is a boolean for whether to precompile the tape, which should only be done
-      if there are no branches (`if` or `while` statements) in the `f` function.
-* `abstol`: absolute tolerance for the quadrature calculation
-* `reltol`: relative tolerance for the quadrature calculation
+  - `autodiff`: Use automatic differentiation for constructing the Jacobian
+    if the Jacobian needs to be constructed.  Defaults to `true`.
+
+  - `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
+  - `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
+    differentiation with special seeding. The default is `true`. The total set
+    of choices are:
+
+      + `false`: the Jacobian is constructed via FiniteDiff.jl
+      + `true`: the Jacobian is constructed via ForwardDiff.jl
+      + `TrackerVJP`: Uses Tracker.jl for the vjp.
+      + `ZygoteVJP`: Uses Zygote.jl for the vjp.
+      + `EnzymeVJP`: Uses Enzyme.jl for the vjp.
+      + `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
+        is a boolean for whether to precompile the tape, which should only be done
+        if there are no branches (`if` or `while` statements) in the `f` function.
+  - `abstol`: absolute tolerance for the quadrature calculation
+  - `reltol`: relative tolerance for the quadrature calculation
 
 For more details on the vjp choices, please consult the sensitivity algorithms
 documentation page or the docstrings of the vjp types.
@@ -434,22 +444,22 @@ This `sensealg` only supports `ODEProblem`s. This `sensealg` supports events (ca
 
 ## References
 
- Rackauckas, C. and Ma, Y. and Martensen, J. and Warner, C. and Zubov, K. and Supekar,
- R. and Skinner, D. and Ramadhana, A. and Edelman, A., Universal Differential Equations
- for Scientific Machine Learning,	arXiv:2001.04385
+Rackauckas, C. and Ma, Y. and Martensen, J. and Warner, C. and Zubov, K. and Supekar,
+R. and Skinner, D. and Ramadhana, A. and Edelman, A., Universal Differential Equations
+for Scientific Machine Learning, arXiv:2001.04385
 
- Hindmarsh, A. C. and Brown, P. N. and Grant, K. E. and Lee, S. L. and Serban, R.
- and Shumaker, D. E. and Woodward, C. S., SUNDIALS: Suite of nonlinear and
- differential/algebraic equation solvers, ACM Transactions on Mathematical
- Software (TOMS), 31, pp:363–396 (2005)
+Hindmarsh, A. C. and Brown, P. N. and Grant, K. E. and Lee, S. L. and Serban, R.
+and Shumaker, D. E. and Woodward, C. S., SUNDIALS: Suite of nonlinear and
+differential/algebraic equation solvers, ACM Transactions on Mathematical
+Software (TOMS), 31, pp:363–396 (2005)
 
- Rackauckas, C. and Ma, Y. and Dixit, V. and Guo, X. and Innes, M. and Revels, J.
- and Nyberg, J. and Ivaturi, V., A comparison of automatic differentiation and
- continuous sensitivity analysis for derivatives of differential equation solutions,
- arXiv:1812.01892
+Rackauckas, C. and Ma, Y. and Dixit, V. and Guo, X. and Innes, M. and Revels, J.
+and Nyberg, J. and Ivaturi, V., A comparison of automatic differentiation and
+continuous sensitivity analysis for derivatives of differential equation solutions,
+arXiv:1812.01892
 
- Kim, S., Ji, W., Deng, S., Ma, Y., & Rackauckas, C. (2021). Stiff neural ordinary
- differential equations. Chaos: An Interdisciplinary Journal of Nonlinear Science, 31(9), 093122.
+Kim, S., Ji, W., Deng, S., Ma, Y., & Rackauckas, C. (2021). Stiff neural ordinary
+differential equations. Chaos: An Interdisciplinary Journal of Nonlinear Science, 31(9), 093122.
 """
 struct QuadratureAdjoint{CS, AD, FDT, VJP} <:
        AbstractAdjointSensitivityAlgorithm{CS, AD, FDT}
@@ -471,7 +481,7 @@ end
 
 """
 ```julia
-TrackerAdjoint <: AbstractAdjointSensitivityAlgorithm{nothing,true,nothing}
+TrackerAdjoint <: AbstractAdjointSensitivityAlgorithm{nothing, true, nothing}
 ```
 
 An implementation of discrete adjoint sensitivity analysis
@@ -503,7 +513,7 @@ struct TrackerAdjoint <: AbstractAdjointSensitivityAlgorithm{nothing, true, noth
 
 """
 ```julia
-ReverseDiffAdjoint <: AbstractAdjointSensitivityAlgorithm{nothing,true,nothing}
+ReverseDiffAdjoint <: AbstractAdjointSensitivityAlgorithm{nothing, true, nothing}
 ```
 
 An implementation of discrete adjoint sensitivity analysis using the ReverseDiff.jl
@@ -544,7 +554,7 @@ struct ZygoteAdjoint <: AbstractAdjointSensitivityAlgorithm{nothing, true, nothi
 
 """
 ```julia
-ForwardLSS{CS,AD,FDT,RType,gType} <: AbstractShadowingSensitivityAlgorithm{CS,AD,FDT}
+ForwardLSS{CS, AD, FDT, RType, gType} <: AbstractShadowingSensitivityAlgorithm{CS, AD, FDT}
 ```
 
 An implementation of the discrete, forward-mode
@@ -560,30 +570,32 @@ See `NILSS()` and `NILSAS()` for a more efficient non-intrusive formulation.
 
 ```julia
 ForwardLSS(;
-          chunk_size=0,autodiff=true,
-          diff_type=Val{:central},
-          LSSregularizer=TimeDilation(10.0,0.0,0.0),
-          g=nothing)
+           chunk_size = 0, autodiff = true,
+           diff_type = Val{:central},
+           LSSregularizer = TimeDilation(10.0, 0.0, 0.0),
+           g = nothing)
 ```
 
 ## Keyword Arguments
 
-* `autodiff`: Use automatic differentiation for constructing the Jacobian
-  if the Jacobian needs to be constructed.  Defaults to `true`.
-* `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
-* `LSSregularizer`: Using `LSSregularizer`, one can choose between three different
-  regularization routines. The default choice is `TimeDilation(10.0,0.0,0.0)`.
-    - `CosWindowing()`: cos windowing of the time grid, i.e. the time grid (saved
-      time steps) is transformed using a cosine.
-    - `Cos2Windowing()`: cos^2 windowing of the time grid.
-    - `TimeDilation(alpha::Number,t0skip::Number,t1skip::Number)`: Corresponds to
-      a time dilation. `alpha` controls the weight. `t0skip` and `t1skip` indicate
-      the times truncated at the beginning and end of the trajectory, respectively.
-* `g`: instantaneous objective function of the long-time averaged objective.
+  - `autodiff`: Use automatic differentiation for constructing the Jacobian
+    if the Jacobian needs to be constructed.  Defaults to `true`.
+
+  - `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
+  - `LSSregularizer`: Using `LSSregularizer`, one can choose between three different
+    regularization routines. The default choice is `TimeDilation(10.0,0.0,0.0)`.
+
+      + `CosWindowing()`: cos windowing of the time grid, i.e. the time grid (saved
+        time steps) is transformed using a cosine.
+      + `Cos2Windowing()`: cos^2 windowing of the time grid.
+      + `TimeDilation(alpha::Number,t0skip::Number,t1skip::Number)`: Corresponds to
+        a time dilation. `alpha` controls the weight. `t0skip` and `t1skip` indicate
+        the times truncated at the beginning and end of the trajectory, respectively.
+  - `g`: instantaneous objective function of the long-time averaged objective.
 
 ## SciMLProblem Support
 
@@ -620,7 +632,7 @@ end
 
 """
 ```julia
-AdjointLSS{CS,AD,FDT,RType,gType} <: AbstractShadowingSensitivityAlgorithm{CS,AD,FDT}
+AdjointLSS{CS, AD, FDT, RType, gType} <: AbstractShadowingSensitivityAlgorithm{CS, AD, FDT}
 ```
 
 An implementation of the discrete, adjoint-mode
@@ -636,28 +648,30 @@ See `NILSS()` and `NILSAS()` for a more efficient non-intrusive formulation.
 
 ```julia
 AdjointLSS(;
-          chunk_size=0,autodiff=true,
-          diff_type=Val{:central},
-          LSSRegularizer=TimeDilation(10.0,0.0,0.0),
-          g=nothing)
+           chunk_size = 0, autodiff = true,
+           diff_type = Val{:central},
+           LSSregularizer = TimeDilation(10.0, 0.0, 0.0),
+           g = nothing)
 ```
 
 ## Keyword Arguments
 
-* `autodiff`: Use automatic differentiation for constructing the Jacobian
-  if the Jacobian needs to be constructed.  Defaults to `true`.
-* `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
-* `LSSregularizer`: Using `LSSregularizer`, one can choose between different
-  regularization routines. The default choice is `TimeDilation(10.0,0.0,0.0)`.
-    - `TimeDilation(alpha::Number,t0skip::Number,t1skip::Number)`: Corresponds to
-      a time dilation. `alpha` controls the weight. `t0skip` and `t1skip` indicate
-      the times truncated at the beginning and end of the trajectory, respectively.
-      The default value for `t0skip` and `t1skip` is `zero(alpha)`.
-* `g`: instantaneous objective function of the long-time averaged objective.
+  - `autodiff`: Use automatic differentiation for constructing the Jacobian
+    if the Jacobian needs to be constructed.  Defaults to `true`.
+
+  - `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
+  - `LSSregularizer`: Using `LSSregularizer`, one can choose between different
+    regularization routines. The default choice is `TimeDilation(10.0,0.0,0.0)`.
+
+      + `TimeDilation(alpha::Number,t0skip::Number,t1skip::Number)`: Corresponds to
+        a time dilation. `alpha` controls the weight. `t0skip` and `t1skip` indicate
+        the times truncated at the beginning and end of the trajectory, respectively.
+        The default value for `t0skip` and `t1skip` is `zero(alpha)`.
+  - `g`: instantaneous objective function of the long-time averaged objective.
 
 ## SciMLProblem Support
 
@@ -693,7 +707,7 @@ struct Cos2Windowing <: AbstractCosWindowing end
 
 """
 ```julia
-TimeDilation{T1<:Number} <: AbstractLSSregularizer
+TimeDilation{T1 <: Number} <: AbstractLSSregularizer
 ```
 
 A regularization method for `LSS`. See `?LSS` for
@@ -703,8 +717,8 @@ additional information and other methods.
 
 ```julia
 TimeDilation(alpha;
-          t0skip=zero(alpha),
-          t1skip=zero(alpha))
+             t0skip = zero(alpha),
+             t1skip = zero(alpha))
 ```
 """
 struct TimeDilation{T1 <: Number} <: AbstractLSSregularizer
@@ -716,7 +730,7 @@ function TimeDilation(alpha, t0skip = zero(alpha), t1skip = zero(alpha))
     TimeDilation{typeof(alpha)}(alpha, t0skip, t1skip)
 end
 """
-```julia
+```
 struct NILSS{CS,AD,FDT,RNG,nType,gType} <: AbstractShadowingSensitivityAlgorithm{CS,AD,FDT}
 ```
 
@@ -736,35 +750,35 @@ step, and thus should generally be preferred (for large system sizes) over `Forw
 
 ```julia
 NILSS(nseg, nstep; nus = nothing,
-                   rng = Xorshifts.Xoroshiro128Plus(rand(UInt64)),
-                   chunk_size=0,autodiff=true,
-                   diff_type=Val{:central},
-                   autojacvec=autodiff,
-                   g=nothing)
+      rng = Xorshifts.Xoroshiro128Plus(rand(UInt64)),
+      chunk_size = 0, autodiff = true,
+      diff_type = Val{:central},
+      autojacvec = autodiff,
+      g = nothing)
 ```
 
 ## Arguments
 
-* `nseg`: Number of segments on full time interval on the attractor.
-* `nstep`: number of steps on each segment.
+  - `nseg`: Number of segments on full time interval on the attractor.
+  - `nstep`: number of steps on each segment.
 
 ## Keyword Arguments
 
-* `nus`: Dimension of the unstable subspace. Default is `nothing`. `nus` must be
-  smaller or equal to the state dimension (`length(u0)`). With the default choice,
-  `nus = length(u0) - 1` will be set at compile time.
-* `rng`: (Pseudo) random number generator. Used for initializing the homogeneous
-  tangent states (`w`). Default is `Xorshifts.Xoroshiro128Plus(rand(UInt64))`.
-* `autodiff`: Use automatic differentiation in the internal sensitivity algorithm
-  computations. Default is `true`.
-* `chunk_size`: Chunk size for forward mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `autojacvec`: Calculate the Jacobian-vector product via automatic
-  differentiation with special seeding.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
-* `g`: instantaneous objective function of the long-time averaged objective.
+  - `nus`: Dimension of the unstable subspace. Default is `nothing`. `nus` must be
+    smaller than or equal to the state dimension (`length(u0)`). With the default choice,
+    `nus = length(u0) - 1` will be set at compile time.
+  - `rng`: (Pseudo) random number generator. Used for initializing the homogeneous
+    tangent states (`w`). Default is `Xorshifts.Xoroshiro128Plus(rand(UInt64))`.
+  - `autodiff`: Use automatic differentiation in the internal sensitivity algorithm
+    computations. Default is `true`.
+  - `chunk_size`: Chunk size for forward mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `autojacvec`: Calculate the Jacobian-vector product via automatic
+    differentiation with special seeding.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
+  - `g`: instantaneous objective function of the long-time averaged objective.
 
 ## SciMLProblem Support
 
@@ -775,6 +789,7 @@ same over infinite time independent of the specified initial conditions, such th
 the sensitivity with respect to the parameters is of interest.
 
 ## References
+
 Ni, A., Blonigan, P. J., Chater, M., Wang, Q., Zhang, Z., Sensitivity analy-
 sis on chaotic dynamical system by Non-Intrusive Least Square Shadowing
 (NI-LSS), in: 46th AIAA Fluid Dynamics Conference, AIAA AVIATION Forum (AIAA 2016-4399),
@@ -806,7 +821,7 @@ end
 
 """
 ```julia
-NILSAS{CS,AD,FDT,RNG,SENSE,gType} <: AbstractShadowingSensitivityAlgorithm{CS,AD,FDT}
+NILSAS{CS, AD, FDT, RNG, SENSE, gType} <: AbstractShadowingSensitivityAlgorithm{CS, AD, FDT}
 ```
 
 An implementation of the adjoint-mode, continuous
@@ -826,46 +841,48 @@ with respect to multiple parameters with negligible additional cost.
 ## Constructor
 
 ```julia
-NILSAS(nseg, nstep, M=nothing; rng = Xorshifts.Xoroshiro128Plus(rand(UInt64)),
-                                adjoint_sensealg = BacksolveAdjoint(autojacvec=ReverseDiffVJP()),
-                                chunk_size=0,autodiff=true,
-                                diff_type=Val{:central},
-                                g=nothing
-                                )
+NILSAS(nseg, nstep, M = nothing; rng = Xorshifts.Xoroshiro128Plus(rand(UInt64)),
+       adjoint_sensealg = BacksolveAdjoint(autojacvec = ReverseDiffVJP()),
+       chunk_size = 0, autodiff = true,
+       diff_type = Val{:central},
+       g = nothing)
 ```
 
 ## Arguments
 
-* `nseg`: Number of segments on full time interval on the attractor.
-* `nstep`: number of steps on each segment.
-* `M`: number of homogeneous adjoint solutions. This number must be bigger or equal
-  than the number of (positive, adjoint) Lyapunov exponents. Default is `nothing`.
+  - `nseg`: Number of segments on full time interval on the attractor.
+  - `nstep`: number of steps on each segment.
+  - `M`: number of homogeneous adjoint solutions. This number must be greater than or
+    equal to the number of (positive, adjoint) Lyapunov exponents. Default is `nothing`.
 
 ## Keyword Arguments
 
-* `rng`: (Pseudo) random number generator. Used for initializing the terminate
-  conditions of the homogeneous adjoint states (`w`). Default is `Xorshifts.Xoroshiro128Plus(rand(UInt64))`.
-* `adjoint_sensealg`: Continuous adjoint sensitivity method to compute homogeneous
-  and inhomogeneous adjoint solutions on each segment. Default is `BacksolveAdjoint(autojacvec=ReverseDiffVJP())`.
-  * `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
-  differentiation with special seeding. The default is `true`. The total set
-  of choices are:
-    - `false`: the Jacobian is constructed via FiniteDiff.jl
-    - `true`: the Jacobian is constructed via ForwardDiff.jl
-    - `TrackerVJP`: Uses Tracker.jl for the vjp.
-    - `ZygoteVJP`: Uses Zygote.jl for the vjp.
-    - `EnzymeVJP`: Uses Enzyme.jl for the vjp.
-    - `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
-      is a boolean for whether to precompile the tape, which should only be done
-      if there are no branches (`if` or `while` statements) in the `f` function.
-* `autodiff`: Use automatic differentiation for constructing the Jacobian
-  if the Jacobian needs to be constructed.  Defaults to `true`.
-* `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
-* `g`: instantaneous objective function of the long-time averaged objective.
+  - `rng`: (Pseudo) random number generator. Used for initializing the terminal
+    conditions of the homogeneous adjoint states (`w`). Default is `Xorshifts.Xoroshiro128Plus(rand(UInt64))`.
+
+  - `adjoint_sensealg`: Continuous adjoint sensitivity method to compute homogeneous
+    and inhomogeneous adjoint solutions on each segment. Default is `BacksolveAdjoint(autojacvec=ReverseDiffVJP())`.
+
+      + `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
+        differentiation with special seeding. The default is `true`. The total set
+        of choices is:
+
+          * `false`: the Jacobian is constructed via FiniteDiff.jl
+          * `true`: the Jacobian is constructed via ForwardDiff.jl
+          * `TrackerVJP`: Uses Tracker.jl for the vjp.
+          * `ZygoteVJP`: Uses Zygote.jl for the vjp.
+          * `EnzymeVJP`: Uses Enzyme.jl for the vjp.
+          * `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
+            is a boolean for whether to precompile the tape, which should only be done
+            if there are no branches (`if` or `while` statements) in the `f` function.
+  - `autodiff`: Use automatic differentiation for constructing the Jacobian
+    if the Jacobian needs to be constructed.  Defaults to `true`.
+  - `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
+  - `g`: instantaneous objective function of the long-time averaged objective.
 
 ## SciMLProblem Support
 
@@ -908,7 +925,7 @@ end
 
 """
 ```julia
-SteadyStateAdjoint{CS,AD,FDT,VJP,LS} <: AbstractAdjointSensitivityAlgorithm{CS,AD,FDT}
+SteadyStateAdjoint{CS, AD, FDT, VJP, LS} <: AbstractAdjointSensitivityAlgorithm{CS, AD, FDT}
 ```
 
 An implementation of the adjoint differentiation of a nonlinear solve. Uses the
@@ -918,34 +935,36 @@ implicit function theorem to directly compute the derivative of the solution to
 ## Constructor
 
 ```julia
-SteadyStateAdjoint(;chunk_size = 0, autodiff = true,
-                    diff_type = Val{:central},
-                    autojacvec = autodiff, linsolve = nothing)
+SteadyStateAdjoint(; chunk_size = 0, autodiff = true,
+                   diff_type = Val{:central},
+                   autojacvec = autodiff, linsolve = nothing)
 ```
 
 ## Keyword Arguments
 
-* `autodiff`: Use automatic differentiation for constructing the Jacobian
-  if the Jacobian needs to be constructed.  Defaults to `true`.
-* `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
-  built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
-  choice of chunk size.
-* `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
-  if the full Jacobian is required with `autodiff=false`.
-* `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
-  differentiation with special seeding. The default is `nothing`. The total set
-  of choices are:
-    - `false`: the Jacobian is constructed via FiniteDiff.jl
-    - `true`: the Jacobian is constructed via ForwardDiff.jl
-    - `TrackerVJP`: Uses Tracker.jl for the vjp.
-    - `ZygoteVJP`: Uses Zygote.jl for the vjp.
-    - `EnzymeVJP`: Uses Enzyme.jl for the vjp.
-    - `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
-      is a boolean for whether to precompile the tape, which should only be done
-      if there are no branches (`if` or `while` statements) in the `f` function.
-* `linsolve`: the linear solver used in the adjoint solve. Defaults to `nothing`,
-  which uses a polyalgorithm to choose an efficient
-  algorithm automatically.
+  - `autodiff`: Use automatic differentiation for constructing the Jacobian
+    if the Jacobian needs to be constructed.  Defaults to `true`.
+
+  - `chunk_size`: Chunk size for forward-mode differentiation if full Jacobians are
+    built (`autojacvec=false` and `autodiff=true`). Default is `0` for automatic
+    choice of chunk size.
+  - `diff_type`: The method used by FiniteDiff.jl for constructing the Jacobian
+    if the full Jacobian is required with `autodiff=false`.
+  - `autojacvec`: Calculate the vector-Jacobian product (`J'*v`) via automatic
+    differentiation with special seeding. The default is `nothing`. The total set
+    of choices is:
+
+      + `false`: the Jacobian is constructed via FiniteDiff.jl
+      + `true`: the Jacobian is constructed via ForwardDiff.jl
+      + `TrackerVJP`: Uses Tracker.jl for the vjp.
+      + `ZygoteVJP`: Uses Zygote.jl for the vjp.
+      + `EnzymeVJP`: Uses Enzyme.jl for the vjp.
+      + `ReverseDiffVJP(compile=false)`: Uses ReverseDiff.jl for the vjp. `compile`
+        is a boolean for whether to precompile the tape, which should only be done
+        if there are no branches (`if` or `while` statements) in the `f` function.
+  - `linsolve`: the linear solver used in the adjoint solve. Defaults to `nothing`,
+    which uses a polyalgorithm to choose an efficient
+    algorithm automatically.
 
 For more details on the vjp choices, please consult the sensitivity algorithms
 documentation page or the docstrings of the vjp types.
@@ -987,18 +1006,17 @@ performance of the VJP method.
 ## Constructor
 
 ```julia
-ZygoteVJP(;allow_nothing=false)
+ZygoteVJP(; allow_nothing = false)
 ```
 
 Keyword arguments:
 
-* `allow_nothing`: whether `nothing`s should be implicitly converted to zeros. In Zygote,
-  the derivative of a function with respect to `p` which does not use `p` in any possible
-  calculation is given a derivative of `nothing` instead of zero. By default, this `nothing`
-  is caught in order to throw an informative error message about a potentially unintentional
-  misdefined function. However, if this was intentional, setting `allow_nothing=true` will
-  remove the error message.
-
+  - `allow_nothing`: whether `nothing`s should be implicitly converted to zeros. In Zygote,
+    the derivative of a function with respect to `p` which does not use `p` in any possible
+    calculation is given a derivative of `nothing` instead of zero. By default, this `nothing`
+    is caught in order to throw an informative error message about a potentially unintentional
+    misdefined function. However, if this was intentional, setting `allow_nothing=true` will
+    remove the error message.
 """
 struct ZygoteVJP <: VJPChoice
     allow_nothing::Bool
@@ -1020,18 +1038,17 @@ like BLAS/LAPACK are used) and this will be the most efficient adjoint implement
 ## Constructor
 
 ```julia
-EnzymeVJP(;chunksize=0)
+EnzymeVJP(; chunksize = 0)
 ```
 
 ## Keyword Arguments
 
-- `chunksize`: the default chunk size for the temporary variables inside the vjp's right
-  hand side definition. This is used for compatibility with ODE solves that default to using
-  ForwardDiff.jl for the Jacobian of the stiff ODE solve, such as OrdinaryDiffEq.jl. This
-  should be set to the maximum chunksize that can occur during an integration to preallocate
-  the `DualCaches` for PreallocationTools.jl. It defaults to 0, using `ForwardDiff.pickchunksize`
-  but could be decreased if this value is known to be lower to conserve memory.
-
+  - `chunksize`: the default chunk size for the temporary variables inside the vjp's right
+    hand side definition. This is used for compatibility with ODE solves that default to using
+    ForwardDiff.jl for the Jacobian of the stiff ODE solve, such as OrdinaryDiffEq.jl. This
+    should be set to the maximum chunksize that can occur during an integration to preallocate
+    the `DualCaches` for PreallocationTools.jl. It defaults to 0, using `ForwardDiff.pickchunksize`
+    but could be decreased if this value is known to be lower to conserve memory.
 """
 struct EnzymeVJP <: VJPChoice
     chunksize::Int
@@ -1054,17 +1071,17 @@ reverse mode.
 ## Constructor
 
 ```julia
-TrackerVJP(;allow_nothing=false)
+TrackerVJP(; allow_nothing = false)
 ```
 
 Keyword arguments:
 
-* `allow_nothing`: whether non-tracked values should be implicitly converted to zeros. In Tracker,
-  the derivative of a function with respect to `p` which does not use `p` in any possible
-  calculation is given an untracked return instead of zero. By default, this `nothing` Trackedness
-  is caught in order to throw an informative error message about a potentially unintentional
-  misdefined function. However, if this was intentional, setting `allow_nothing=true` will
-  remove the error message.
+  - `allow_nothing`: whether non-tracked values should be implicitly converted to zeros. In Tracker,
+    the derivative of a function with respect to `p` which does not use `p` in any possible
+    calculation is given an untracked return instead of zero. By default, this untracked return
+    is caught in order to throw an informative error message about a potentially unintentional
+    misdefined function. However, if this was intentional, setting `allow_nothing=true` will
+    remove the error message.
 """
 struct TrackerVJP <: VJPChoice
     allow_nothing::Bool
@@ -1090,14 +1107,14 @@ Does not support GPUs (CuArrays).
 ## Constructor
 
 ```julia
-ReverseDiffVJP(compile=false)
+ReverseDiffVJP(compile = false)
 ```
 
 ## Keyword Arguments
 
-* `compile`: Whether to cache the compilation of the reverse tape. This heavily increases
-  the performance of the method, but requires that the `f` function of the ODE/DAE/SDE/DDE
-  has no branching.
+  - `compile`: Whether to cache the compilation of the reverse tape. This heavily increases
+    the performance of the method, but requires that the `f` function of the ODE/DAE/SDE/DDE
+    has no branching.
 """
 struct ReverseDiffVJP{compile} <: VJPChoice
     ReverseDiffVJP(compile = false) = new{compile}()
@@ -1146,7 +1163,7 @@ end
 
 """
 ```julia
-ForwardDiffOverAdjoint{A} <: AbstractSecondOrderSensitivityAlgorithm{nothing,true,nothing}
+ForwardDiffOverAdjoint{A} <: AbstractSecondOrderSensitivityAlgorithm{nothing, true, nothing}
 ```
 
 ForwardDiff.jl over a choice of `sensealg` method for the adjoint.