diff --git a/lib/statistex.ex b/lib/statistex.ex index 9d6b539..088a81a 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -27,6 +27,8 @@ defmodule Statistex do :mode, :minimum, :maximum, + :outlier_bounds, + :outliers, sample_size: 0 ] @@ -47,6 +49,8 @@ defmodule Statistex do mode: mode, minimum: number, maximum: number, + outlier_bounds: {number, number}, + outliers: [number], sample_size: non_neg_integer } @@ -81,6 +85,10 @@ defmodule Statistex do @empty_list_error_message "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least on number." + @first_quartile 25 + @third_quartile 75 + @iqr_factor 1.5 + @doc """ Calculate all statistics Statistex offers for a given list of numbers. @@ -89,7 +97,15 @@ defmodule Statistex do `Argumenterror` is raised if the given list is empty. ## Options - In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can be given. The 50th percentile is always calculated as it is the median. + + With the `percentiles` option, arguments for the calculation of percentiles (see `percentiles/2`) can + be given. The 25th, 50th (median) and 75th percentiles are always calculated. + + The option `exclude_outliers` can be set to `:once`, `:repeatedly` or `nil`; + `nil` is the default. If this option is set to `:once`, the outliers are excluded + and the statistics are calculated with the rest of the samples. The value + `:repeatedly` repeats the outlier exclusion until the samples no longer + contain outliers. 
## Examples @@ -100,7 +116,7 @@ defmodule Statistex do standard_deviation: 200.0, standard_deviation_ratio: 0.4, median: 500.0, - percentiles: %{50 => 500.0}, + percentiles: %{25 => 400.0, 50 => 500.0, 75 => 600.0}, frequency_distribution: %{ 200 => 1, 400 => 3, @@ -112,7 +128,9 @@ defmodule Statistex do minimum: 200, maximum: 900, sample_size: 9, - total: 4500 + total: 4500, + outliers: [], + outlier_bounds: {200, 900.0} } iex> Statistex.statistics([]) @@ -125,13 +143,15 @@ defmodule Statistex do standard_deviation: 0.0, standard_deviation_ratio: 0.0, median: 0.0, - percentiles: %{50 => 0.0}, + percentiles: %{25 => 0.0, 50 => 0.0, 75 => 0.0}, frequency_distribution: %{0 => 4}, mode: 0, minimum: 0, maximum: 0, sample_size: 4, - total: 0 + total: 0, + outliers: [], + outlier_bounds: {0.0, 0.0} } """ @@ -143,33 +163,64 @@ defmodule Statistex do end def statistics(samples, configuration) do - total = total(samples) - sample_size = length(samples) - average = average(samples, total: total, sample_size: sample_size) - variance = variance(samples, average: average, sample_size: sample_size) - standard_deviation = standard_deviation(samples, variance: variance) + samples = Enum.sort(samples) - standard_deviation_ratio = - standard_deviation_ratio(samples, standard_deviation: standard_deviation) + minimum = hd(samples) + maximum = List.last(samples) percentiles = calculate_percentiles(samples, configuration) - frequency_distribution = frequency_distribution(samples) - - %__MODULE__{ - total: total, - average: average, - variance: variance, - standard_deviation: standard_deviation, - standard_deviation_ratio: standard_deviation_ratio, - median: median(samples, percentiles: percentiles), - percentiles: percentiles, - frequency_distribution: frequency_distribution, - mode: mode(samples, frequency_distribution: frequency_distribution), - minimum: minimum(samples), - maximum: maximum(samples), - sample_size: sample_size - } + outlier_bounds = + do_outlier_bounds(samples, 
percentiles: percentiles, minimum: minimum, maximum: maximum) + + {outliers, rest} = do_outliers(samples, outlier_bounds: outlier_bounds) + + if exclude_outliers?(configuration) and Enum.any?(outliers) do + configuration = + configuration + |> Keyword.update!(:exclude_outliers, fn + :once -> :stop + :repeatedly -> :repeatedly + end) + |> Keyword.update(:acc_outliers, outliers, fn list -> list ++ outliers end) + + statistics(rest, configuration) + else + outliers = outliers ++ Keyword.get(configuration, :acc_outliers, []) + + total = total(samples) + sample_size = length(samples) + average = average(samples, total: total, sample_size: sample_size) + variance = variance(samples, average: average, sample_size: sample_size) + + frequency_distribution = frequency_distribution(samples) + + standard_deviation = standard_deviation(samples, variance: variance) + + standard_deviation_ratio = + standard_deviation_ratio(samples, standard_deviation: standard_deviation) + + %__MODULE__{ + total: total, + average: average, + variance: variance, + standard_deviation: standard_deviation, + standard_deviation_ratio: standard_deviation_ratio, + median: median(samples, percentiles: percentiles), + percentiles: percentiles, + frequency_distribution: frequency_distribution, + mode: mode(samples, frequency_distribution: frequency_distribution), + minimum: minimum, + maximum: maximum, + outlier_bounds: outlier_bounds, + outliers: outliers, + sample_size: sample_size + } + end + end + + defp exclude_outliers?(configuration) do + Keyword.get(configuration, :exclude_outliers) in [:once, :repeatedly] end @doc """ @@ -396,8 +447,10 @@ defmodule Statistex do percentiles_configuration = Keyword.get(configuration, :percentiles, []) # median_percentile is manually added so that it can be used directly by median - percentiles_configuration = Enum.uniq([@median_percentile | percentiles_configuration]) - percentiles(samples, percentiles_configuration) + percentiles_configuration = + Enum.uniq([25, 
@median_percentile, 75 | percentiles_configuration]) + + Percentile.percentiles(samples, percentiles_configuration) end @doc """ @@ -447,7 +500,9 @@ defmodule Statistex do """ @spec percentiles(samples, number | [number(), ...]) :: percentiles() - defdelegate(percentiles(samples, percentiles), to: Percentile) + def percentiles(samples, percentiles) do + samples |> Enum.sort() |> Percentile.percentiles(percentiles) + end @doc """ A map showing which sample occurs how often in the samples. @@ -541,10 +596,87 @@ defmodule Statistex do def median(samples, options) do percentiles = - Keyword.get_lazy(options, :percentiles, fn -> percentiles(samples, @median_percentile) end) + Keyword.get_lazy(options, :percentiles, fn -> + percentiles(samples, @median_percentile) + end) + + get_percentile(samples, @median_percentile, percentiles) + end + + @doc """ + Calculates the lower and upper bound for outliers. + + Any sample `<` the lower bound or `>` the upper bound is an outlier of + the given `samples`. 
+ + ## Examples + + iex> Statistex.outlier_bounds([3, 4, 5]) + {3, 5} + + iex> Statistex.outlier_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) + {22.5, 50} + + iex> Statistex.outlier_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) + {50, 80.625} + """ + @spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number} + def outlier_bounds(samples, options \\ []) + def outlier_bounds([], _), do: raise(ArgumentError, @empty_list_error_message) + def outlier_bounds(samples, options), do: samples |> Enum.sort() |> do_outlier_bounds(options) + + defp do_outlier_bounds(samples, options) do + percentiles = + Keyword.get_lazy(options, :percentiles, fn -> + Percentile.percentiles(samples, [@first_quartile, @third_quartile]) + end) + + minimum = Keyword.get_lazy(options, :minimum, fn -> hd(samples) end) + maximum = Keyword.get_lazy(options, :maximum, fn -> List.last(samples) end) + + q1 = get_percentile(samples, @first_quartile, percentiles) + q3 = get_percentile(samples, @third_quartile, percentiles) + iqr = q3 - q1 + + {max(q1 - iqr * @iqr_factor, minimum), min(q3 + iqr * @iqr_factor, maximum)} + end + + @doc """ + Returns all outliers for the given `samples`. 
+ + ## Examples + + iex> Statistex.outliers([3, 4, 5]) + [] + + iex> Statistex.outliers([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) + [1, 2, 6] + + iex> Statistex.outliers([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) + [99, 99, 99] + """ + @spec outliers(samples, keyword) :: samples | [] + def outliers(samples, options \\ []) do + {outliers, _rest} = samples |> Enum.sort() |> do_outliers(options) + + outliers + end + + defp do_outliers(samples, options) do + {lower_bound, upper_bound} = + Keyword.get_lazy(options, :outlier_bounds, fn -> do_outlier_bounds(samples, options) end) + + {min, rest} = Enum.split_while(samples, fn sample -> sample < lower_bound end) + + {max, rest} = + rest |> Enum.reverse() |> Enum.split_while(fn sample -> sample > upper_bound end) + + {min ++ max, rest} + end - Map.get_lazy(percentiles, @median_percentile, fn -> - samples |> percentiles(@median_percentile) |> Map.fetch!(@median_percentile) + defp get_percentile(samples, percentile, percentiles) do + Map.get_lazy(percentiles, percentile, fn -> + samples |> Percentile.percentiles(percentile) |> Map.fetch!(percentile) end) end diff --git a/lib/statistex/percentile.ex b/lib/statistex/percentile.ex index 162ccb7..c475da6 100644 --- a/lib/statistex/percentile.ex +++ b/lib/statistex/percentile.ex @@ -12,12 +12,11 @@ defmodule Statistex.Percentile do def percentiles(samples, percentile_ranks) do number_of_samples = length(samples) - sorted_samples = Enum.sort(samples) percentile_ranks |> List.wrap() |> Enum.reduce(%{}, fn percentile_rank, acc -> - perc = percentile(sorted_samples, number_of_samples, percentile_rank) + perc = percentile(samples, number_of_samples, percentile_rank) Map.put(acc, percentile_rank, perc) end) end diff --git a/test/statistex/percentile_test.exs b/test/statistex/percentile_test.exs index fbc03a5..020f523 100644 --- a/test/statistex/percentile_test.exs +++ b/test/statistex/percentile_test.exs @@ -4,20 +4,20 @@ defmodule Statistex.PercentileTest do 
doctest Statistex.Percentile - @nist_sample_data [ - 95.1772, - 95.1567, - 95.1937, - 95.1959, - 95.1442, - 95.0610, - 95.1591, - 95.1195, - 95.1065, - 95.0925, - 95.1990, - 95.1682 - ] + @nist_sample_data Enum.sort([ + 95.1772, + 95.1567, + 95.1937, + 95.1959, + 95.1442, + 95.0610, + 95.1591, + 95.1195, + 95.1065, + 95.0925, + 95.1990, + 95.1682 + ]) # Test data from: # http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm @@ -49,7 +49,7 @@ defmodule Statistex.PercentileTest do end describe "a list of two elements" do - @samples [300, 200] + @samples [200, 300] test "1st percentile (small sample size simply picks first element)" do %{1 => result} = percentiles(@samples, [1]) assert result == 200.0 @@ -67,7 +67,7 @@ defmodule Statistex.PercentileTest do end describe "seemingly problematic 2 element list [9, 1]" do - @samples [9, 1] + @samples [1, 9] percentiles = %{ 25 => 1, @@ -88,7 +88,7 @@ defmodule Statistex.PercentileTest do end describe "a list of three elements" do - @samples [100, 300, 200] + @samples [100, 200, 300] test "1st percentile (small sample size simply picks first element)" do %{1 => result} = percentiles(@samples, [1]) assert result == 100.0 diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 3c602a2..301ce2a 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -12,6 +12,104 @@ defmodule Statistex.StatistexTest do end end + describe ".outlier_bounds/2" do + test "returns outlier bounds for samples without outliers" do + assert Statistex.outlier_bounds([200, 400, 400, 400, 500, 500, 500, 700, 900]) == + {200, 900.0} + end + + test "returns outlier bounds for samples with outliers" do + assert Statistex.outlier_bounds([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) == + {87.5, 787.5} + end + end + + describe ".statistics/2" do + test "returns Statistex struct without outliers" do + assert Statistex.statistics([200, 400, 400, 400, 500, 500, 500, 700, 900]) == + %Statistex{ + total: 4500, + average: 
500.0, + variance: 40000.0, + standard_deviation: 200.0, + standard_deviation_ratio: 0.4, + median: 500.0, + percentiles: %{25 => 400.0, 50 => 500.0, 75 => 600.0}, + frequency_distribution: %{200 => 1, 400 => 3, 500 => 3, 700 => 1, 900 => 1}, + mode: [500, 400], + minimum: 200, + maximum: 900, + outlier_bounds: {200, 900.0}, + outliers: [], + sample_size: 9 + } + end + + test "returns Statistex struct with outliers" do + assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) == + %Statistex{ + total: 4450, + average: 445.0, + variance: 61361.11111111111, + standard_deviation: 247.71175004652304, + standard_deviation_ratio: 0.5566556180820742, + median: 475.0, + percentiles: %{25 => 350.0, 50 => 475.0, 75 => 525.0}, + frequency_distribution: %{50 => 2, 450 => 3, 500 => 3, 600 => 1, 900 => 1}, + mode: [500, 450], + minimum: 50, + maximum: 900, + outlier_bounds: {87.5, 787.5}, + outliers: [50, 50, 900], + sample_size: 10 + } + end + + test "returns Statistex struct with excluded outliers once" do + assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], + exclude_outliers: :once + ) == + %Statistex{ + total: 3450, + average: 492.85714285714283, + variance: 2857.142857142857, + standard_deviation: 53.452248382484875, + standard_deviation_ratio: 0.1084538372977954, + median: 500.0, + percentiles: %{25 => 450.0, 50 => 500.0, 75 => 500.0}, + frequency_distribution: %{450 => 3, 500 => 3, 600 => 1}, + mode: [500, 450], + minimum: 450, + maximum: 600, + outlier_bounds: {450, 575.0}, + outliers: [600, 50, 50, 900], + sample_size: 7 + } + end + + test "returns Statistex struct with excluded outliers repeatedly" do + assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], + exclude_outliers: :repeatedly + ) == + %Statistex{ + total: 2850, + average: 475.0, + variance: 750.0, + standard_deviation: 27.386127875258307, + standard_deviation_ratio: 0.05765500605317538, + median: 475.0, + percentiles: %{25 => 450.0, 
50 => 475.0, 75 => 500.0}, + frequency_distribution: %{450 => 3, 500 => 3}, + mode: [500, 450], + minimum: 450, + maximum: 500, + outlier_bounds: {450, 500}, + outliers: [50, 50, 900, 600], + sample_size: 6 + } + end + end + describe "property testing as we might get loads of data" do property "doesn't blow up no matter what kind of nonempty list of floats it's given" do check all(samples <- list_of(float(), min_length: 1)) do