diff --git a/.gitignore b/.gitignore index dbc9658..33b2fa2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ docs # Don't feel like tracking that gives me what I want any more :) .tool-versions + +# dialyzer +/tools diff --git a/CHANGELOG.md b/CHANGELOG.md index 34d69b2..2f43b51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ +## 1.1 (Unreleased) + +This release adds functionality around identifying outliers. + +* the Statistex struct comes with more keys: `:lower_outlier_bound`, `:upper_outlier_bound` & `:outliers`, +along with the new public functions `outliers/2` and `outlier_bounds/2`. +* `statistics/2` now also accepts `exclude_outliers: true` to exclude the outliers from the calculation +of statistics. +* some functions have also been updated to accept more optional arguments such as `:sorted?` to avoid unnecessary extra work. + +Huge thanks for these changes go to [@NickNeck](https://github.com/NickNeck)! + ## 1.0 2019-07-05 Import of the initial functionality from [benchee](github.com/bencheeorg/benchee). -Dubbed 1.0 because many people had already been running this code indirectly through benchee. \ No newline at end of file +Dubbed 1.0 because many people had already been running this code indirectly through benchee. 
diff --git a/lib/statistex.ex b/lib/statistex.ex index 9fcf019..8b67d9c 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -15,6 +15,8 @@ defmodule Statistex do alias Statistex.{Mode, Percentile} require Integer + import Statistex.Helper, only: [maybe_sort: 2] + defstruct [ :total, :average, @@ -27,6 +29,9 @@ defmodule Statistex do :mode, :minimum, :maximum, + :lower_outlier_bound, + :upper_outlier_bound, + :outliers, sample_size: 0 ] @@ -47,6 +52,9 @@ defmodule Statistex do mode: mode, minimum: number, maximum: number, + lower_outlier_bound: number, + upper_outlier_bound: number, + outliers: [number], sample_size: non_neg_integer } @@ -81,59 +89,74 @@ defmodule Statistex do @empty_list_error_message "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number." + @first_quartile 25 + @median_percentile 50 + @third_quartile 75 + # https://en.wikipedia.org/wiki/Interquartile_range#Outliers + # https://builtin.com/articles/1-5-iqr-rule + @iqr_factor 1.5 + @doc """ Calculate all statistics Statistex offers for a given list of numbers. The statistics themselves are described in the individual samples that can be used to calculate individual values. - `Argumenterror` is raised if the given list is empty. + `ArgumentError` is raised if the given list is empty. ## Options - In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can be given. The 50th percentile is always calculated as it is the median. + + * `:percentiles`: percentiles to calculate (see `percentiles/2`). + The percentiles 25th, 50th (median) and 75th are always calculated. + * `:exclude_outliers` can be set to `true` or `false`. Defaults to `false`. + If this option is set to `true` the outliers are excluded from the calculation + of the statistics. + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. 
Only set this, + if they are truly sorted - otherwise your results will be wrong. ## Examples - iex> Statistex.statistics([200, 400, 400, 400, 500, 500, 500, 700, 900]) + iex> Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) %Statistex{ - average: 500.0, - variance: 40_000.0, - standard_deviation: 200.0, - standard_deviation_ratio: 0.4, - median: 500.0, - percentiles: %{50 => 500.0}, - frequency_distribution: %{ - 200 => 1, - 400 => 3, - 500 => 3, - 700 => 1, - 900 => 1 - }, - mode: [500, 400], - minimum: 200, - maximum: 900, - sample_size: 9, - total: 4500 + total: 4450, + average: 445.0, + variance: 61_361.11111111111, + standard_deviation: 247.71175004652304, + standard_deviation_ratio: 0.5566556180820742, + median: 475.0, + percentiles: %{25 => 350.0, 50 => 475.0, 75 => 525.0}, + frequency_distribution: %{50 => 2, 450 => 3, 500 => 3, 600 => 1, 900 => 1}, + mode: [500, 450], + minimum: 50, + maximum: 900, + lower_outlier_bound: 87.5, + upper_outlier_bound: 787.5, + outliers: [50, 50, 900], + sample_size: 10 } - iex> Statistex.statistics([]) - ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number. 
- - iex> Statistex.statistics([0, 0, 0, 0]) + # excluding outliers changes the results + iex> Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], exclude_outliers: true) %Statistex{ - average: 0.0, - variance: 0.0, - standard_deviation: 0.0, - standard_deviation_ratio: 0.0, - median: 0.0, - percentiles: %{50 => 0.0}, - frequency_distribution: %{0 => 4}, - mode: 0, - minimum: 0, - maximum: 0, - sample_size: 4, - total: 0 + total: 3450, + average: 492.85714285714283, + variance: 2857.142857142857, + standard_deviation: 53.452248382484875, + standard_deviation_ratio: 0.1084538372977954, + median: 500.0, + percentiles: %{25 => 450.0, 50 => 500.0, 75 => 500.0}, + frequency_distribution: %{450 => 3, 500 => 3, 600 => 1}, + mode: [500, 450], + maximum: 600, + minimum: 450, + lower_outlier_bound: 87.5, + upper_outlier_bound: 787.5, + outliers: [50, 50, 900], + sample_size: 7 } + iex> Statistex.statistics([]) + ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number. 
+ """ @spec statistics(samples, configuration) :: t() def statistics(samples, configuration \\ []) @@ -143,18 +166,45 @@ defmodule Statistex do end def statistics(samples, configuration) do - total = total(samples) - sample_size = length(samples) - average = average(samples, total: total, sample_size: sample_size) - variance = variance(samples, average: average, sample_size: sample_size) - standard_deviation = standard_deviation(samples, variance: variance) + sorted_samples = maybe_sort(samples, configuration) - standard_deviation_ratio = - standard_deviation_ratio(samples, standard_deviation: standard_deviation) + percentiles = calculate_percentiles(sorted_samples, configuration) + outlier_bounds = outlier_bounds(sorted_samples, percentiles: percentiles) + + # rest remains sorted here/it's an important property + {outliers, rest} = outliers(sorted_samples, outlier_bounds: outlier_bounds) + + if exclude_outliers?(configuration) and Enum.any?(outliers) do + # need to recalculate with the outliers removed + percentiles = calculate_percentiles(rest, configuration) + + create_full_statistics(rest, percentiles, outliers, outlier_bounds) + else + create_full_statistics(sorted_samples, percentiles, outliers, outlier_bounds) + end + end + + defp exclude_outliers?(configuration) do + Access.get(configuration, :exclude_outliers) == true + end + + defp create_full_statistics(sorted_samples, percentiles, outliers, outlier_bounds) do + total = total(sorted_samples) + sample_size = length(sorted_samples) + minimum = hd(sorted_samples) + maximum = List.last(sorted_samples) - percentiles = calculate_percentiles(samples, configuration) + average = average(sorted_samples, total: total, sample_size: sample_size) + variance = variance(sorted_samples, average: average, sample_size: sample_size) - frequency_distribution = frequency_distribution(samples) + frequency_distribution = frequency_distribution(sorted_samples) + + standard_deviation = standard_deviation(sorted_samples, variance: 
variance) + + standard_deviation_ratio = + standard_deviation_ratio(sorted_samples, standard_deviation: standard_deviation) + + {lower_outlier_bound, upper_outlier_bound} = outlier_bounds %__MODULE__{ total: total, @@ -162,12 +212,15 @@ defmodule Statistex do variance: variance, standard_deviation: standard_deviation, standard_deviation_ratio: standard_deviation_ratio, - median: median(samples, percentiles: percentiles), + median: median(sorted_samples, percentiles: percentiles), percentiles: percentiles, frequency_distribution: frequency_distribution, - mode: mode(samples, frequency_distribution: frequency_distribution), - minimum: minimum(samples), - maximum: maximum(samples), + mode: mode(sorted_samples, frequency_distribution: frequency_distribution), + minimum: minimum, + maximum: maximum, + lower_outlier_bound: lower_outlier_bound, + upper_outlier_bound: upper_outlier_bound, + outliers: outliers, sample_size: sample_size } end @@ -322,7 +375,7 @@ defmodule Statistex do iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12]) 4.0 - iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12], variance: 16.0) + iex> Statistex.standard_deviation(:dontcare, variance: 16.0) 4.0 iex> Statistex.standard_deviation([42]) @@ -391,21 +444,25 @@ defmodule Statistex do end end - @median_percentile 50 - defp calculate_percentiles(samples, configuration) do + defp calculate_percentiles(sorted_samples, configuration) do percentiles_configuration = Keyword.get(configuration, :percentiles, []) # median_percentile is manually added so that it can be used directly by median - percentiles_configuration = Enum.uniq([@median_percentile | percentiles_configuration]) - percentiles(samples, percentiles_configuration) + percentiles_configuration = + Enum.uniq([ + @first_quartile, + @median_percentile, + @third_quartile | percentiles_configuration + ]) + + Percentile.percentiles(sorted_samples, percentiles_configuration, sorted: true) end @doc """ Calculates the value 
at the `percentile_rank`-th percentile. - Think of this as the - value below which `percentile_rank` percent of the samples lie. For example, - if `Statistex.percentile(samples, 99)` == 123.45, + Think of this as the value below which `percentile_rank` percent of the samples lie. + For example, if `Statistex.percentile(samples, 99) == 123.45`, 99% of samples are less than 123.45. Passing a number for `percentile_rank` calculates a single percentile. @@ -419,11 +476,19 @@ defmodule Statistex do `Argumenterror` is raised if the given list is empty. + ## Options + + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, + if they are truly sorted - otherwise your results will be wrong. + ## Examples iex> Statistex.percentiles([5, 3, 4, 5, 1, 3, 1, 3], 12.5) %{12.5 => 1.0} + iex> Statistex.percentiles([1, 1, 3, 3, 3, 4, 5, 5], 12.5, sorted?: true) + %{12.5 => 1.0} + iex> Statistex.percentiles([5, 3, 4, 5, 1, 3, 1, 3], [50]) %{50 => 3.0} @@ -447,7 +512,8 @@ defmodule Statistex do """ @spec percentiles(samples, number | [number(), ...]) :: percentiles() - defdelegate(percentiles(samples, percentiles), to: Percentile) + defdelegate percentiles(samples, percentiles, options), to: Percentile + defdelegate percentiles(samples, percentiles), to: Percentile @doc """ A map showing which sample occurs how often in the samples. @@ -521,11 +587,26 @@ defmodule Statistex do `Argumenterror` is raised if the given list is empty. + ## Options + * `:percentiles` - you can pass it a map of calculated percentiles to fetch the median from (it is the 50th percentile). + If it doesn't include the median/50th percentile - it will still be computed. + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, + if they are truly sorted - otherwise your results will be wrong. Sorting only occurs when percentiles aren't provided. 
+ ## Examples iex> Statistex.median([1, 3, 4, 6, 7, 8, 9]) 6.0 + iex> Statistex.median([1, 3, 4, 6, 7, 8, 9], percentiles: %{50 => 6.0}) + 6.0 + + iex> Statistex.median([1, 3, 4, 6, 7, 8, 9], percentiles: %{25 => 3.0}) + 6.0 + + iex> Statistex.median([1, 3, 4, 6, 7, 8, 9], sorted?: true) + 6.0 + iex> Statistex.median([1, 2, 3, 4, 5, 6, 8, 9]) 4.5 @@ -540,12 +621,117 @@ defmodule Statistex do def median([], _), do: raise(ArgumentError, @empty_list_error_message) def median(samples, options) do + percentiles = Access.get(options, :percentiles, %{}) + percentiles = - Keyword.get_lazy(options, :percentiles, fn -> percentiles(samples, @median_percentile) end) + case percentiles do + %{@median_percentile => _} -> + percentiles - Map.get_lazy(percentiles, @median_percentile, fn -> - samples |> percentiles(@median_percentile) |> Map.fetch!(@median_percentile) - end) + # missing necessary keys + %{} -> + Percentile.percentiles(samples, @median_percentile, options) + end + + Map.fetch!(percentiles, @median_percentile) + end + + @doc """ + Calculates the lower and upper bound for outliers. + + Any sample that is `<` the lower bound or `>` the upper bound is an outlier of + the given `samples`. + + List passed needs to be non empty, otherwise an `ArgumentError` is raised. + + ## Options + * `:percentiles` - you can pass it a map of calculated percentiles (25th and 75th are needed). + If it doesn't include them - it will still be computed. + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, + if they are truly sorted - otherwise your results will be wrong. Sorting only occurs when percentiles aren't provided. 
+ + ## Examples + + iex> Statistex.outlier_bounds([3, 4, 5]) + {0.0, 8.0} + + iex> Statistex.outlier_bounds([4, 5, 3]) + {0.0, 8.0} + + iex> Statistex.outlier_bounds([3, 4, 5], sorted?: true) + {0.0, 8.0} + + iex> Statistex.outlier_bounds([3, 4, 5], percentiles: %{25 => 3.0, 75 => 5.0}) + {0.0, 8.0} + + iex> Statistex.outlier_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) + {22.5, 66.5} + + iex> Statistex.outlier_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) + {31.625, 80.625} + + iex> Statistex.outlier_bounds([]) + ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number. + """ + @spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number} + def outlier_bounds(samples, options \\ []) + def outlier_bounds([], _), do: raise(ArgumentError, @empty_list_error_message) + + def outlier_bounds(samples, options) do + percentiles = Access.get(options, :percentiles, %{}) + + percentiles = + case percentiles do + %{@first_quartile => _, @third_quartile => _} -> + percentiles + + # missing necessary keys + %{} -> + Percentile.percentiles(samples, [@first_quartile, @third_quartile], options) + end + + q1 = Map.fetch!(percentiles, @first_quartile) + q3 = Map.fetch!(percentiles, @third_quartile) + iqr = q3 - q1 + outlier_tolerance = iqr * @iqr_factor + + {q1 - outlier_tolerance, q3 + outlier_tolerance} + end + + @doc """ + Returns all outliers for the given `samples`, along with the remaining values. + + Returns: `{outliers, remaining_samples}` where `remaining_samples` has the outliers removed. + + ## Options + * `:outlier_bounds` - if you already have calculated the outlier bounds. + * `:percentiles` - you can pass it a map of calculated percentiles (25th and 75th are needed). + If it doesn't include them - it will still be computed. + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. 
Only set this, + if they are truly sorted - otherwise your results will be wrong. Sorting only occurs when percentiles aren't provided. + + ## Examples + + iex> Statistex.outliers([3, 4, 5]) + {[], [3, 4, 5]} + + iex> Statistex.outliers([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) + {[1, 2, 6], [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]} + + iex> Statistex.outliers([50, 50, 1, 50, 50, 50, 50, 50, 2, 50, 50, 50, 50, 6]) + {[1, 2, 6], [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]} + + iex> Statistex.outliers([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) + {[99, 99, 99], [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]} + """ + @spec outliers(samples, keyword) :: {samples | [], samples} + def outliers(samples, options \\ []) do + {lower_bound, upper_bound} = + Keyword.get_lazy(options, :outlier_bounds, fn -> + outlier_bounds(samples, options) + end) + + Enum.split_with(samples, fn sample -> sample < lower_bound || sample > upper_bound end) end @doc """ diff --git a/lib/statistex/helper.ex b/lib/statistex/helper.ex new file mode 100644 index 0000000..958d034 --- /dev/null +++ b/lib/statistex/helper.ex @@ -0,0 +1,18 @@ +defmodule Statistex.Helper do + @moduledoc false + # Everyone loves helper modules... ok ok, no. But I needed/wanted this function, + # but didn't wanna put it on the main module. + + # With the design goal that we don't want to needlessly do operations, esp. big ones + # like sorting we need an optional `sorted?` arguments in a bunch of places. + # This unifies the handling of that. + def maybe_sort(samples, options) do + sorted? = Access.get(options, :sorted?, false) + + if sorted? 
do + samples + else + Enum.sort(samples) + end + end +end diff --git a/lib/statistex/percentile.ex b/lib/statistex/percentile.ex index dd6bd31..91688b6 100644 --- a/lib/statistex/percentile.ex +++ b/lib/statistex/percentile.ex @@ -1,24 +1,28 @@ defmodule Statistex.Percentile do @moduledoc false - @spec percentiles(Statistex.samples(), number | [number, ...]) :: + import Statistex.Helper, only: [maybe_sort: 2] + + @spec percentiles(Statistex.samples(), number | [number, ...], keyword()) :: Statistex.percentiles() - def percentiles([], _) do + def percentiles(samples, percentiles, options \\ []) + + def percentiles([], _, _) do raise( ArgumentError, "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number." ) end - def percentiles(samples, percentile_ranks) do + def percentiles(samples, percentile_ranks, options) do number_of_samples = length(samples) - sorted_samples = Enum.sort(samples) + sorted_samples = maybe_sort(samples, options) percentile_ranks |> List.wrap() - |> Enum.reduce(%{}, fn percentile_rank, acc -> + |> Map.new(fn percentile_rank -> perc = percentile(sorted_samples, number_of_samples, percentile_rank) - Map.put(acc, percentile_rank, perc) + {percentile_rank, perc} end) end @@ -63,11 +67,20 @@ defmodule Statistex.Percentile do # particular sample). Of the 9 main strategies, (types 1-9), types 6, 7, and 8 # are generally acceptable and give similar results. # + # R uses type 7, but you can change the strategies used in R with arguments. 
+ # + # > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 6) + # 25% 50% 75% + # 9.25 10.00 11.75 + # > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 7) + # 25% 50% 75% + # 9.75 10.00 11.25 + # # For more information on interpolation strategies, see: # - https://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html # - http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm defp interpolation_value(lower_bound, upper_bound, rank) do - # in our source rank is k, and interpolation_weitgh is d + # in our source rank is k, and interpolation_weight is d interpolation_weight = rank - trunc(rank) interpolation_weight * (upper_bound - lower_bound) end diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 3c602a2..5292270 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -10,6 +10,164 @@ defmodule Statistex.StatistexTest do test "if handed percentiles missing the median percentile still calculates it" do assert Statistex.median([1, 2, 3, 4, 5, 6, 8, 9], percentiles: %{}) == 4.5 end + + # what an odd test to write, huh? Well that way we can see we trust the `sorted?` + # value not resorting. + test "if told that the list is sorted while it isn't the result will be wrong" do + assert Statistex.median([1, 6, 4, 3, 5, 9, 2, 8], sorted?: true) != 4.5 + end + end + + describe ".outlier_bounds/2" do + # examples doubled up, maybe get rid of them? 
+ test "returns outlier bounds for samples without outliers" do + assert Statistex.outlier_bounds([200, 400, 400, 400, 500, 500, 500, 700, 900]) == + {100.0, 900.0} + end + + test "returns outlier bounds for samples with outliers" do + assert Statistex.outlier_bounds([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) == + {87.5, 787.5} + end + end + + describe ".statistics/2" do + test "all 0 values do what you think they would" do + assert Statistex.statistics([0, 0, 0, 0]) == %Statistex{ + average: 0.0, + variance: 0.0, + standard_deviation: 0.0, + standard_deviation_ratio: 0.0, + median: 0.0, + percentiles: %{25 => 0.0, 50 => 0.0, 75 => 0.0}, + frequency_distribution: %{0 => 4}, + mode: 0, + minimum: 0, + maximum: 0, + sample_size: 4, + total: 0, + outliers: [], + lower_outlier_bound: 0.0, + upper_outlier_bound: 0.0 + } + end + + test "returns Statistex struct without outliers" do + assert Statistex.statistics([200, 400, 400, 400, 500, 500, 500, 700, 900]) == + %Statistex{ + total: 4500, + average: 500.0, + variance: 40_000.0, + standard_deviation: 200.0, + standard_deviation_ratio: 0.4, + median: 500.0, + percentiles: %{25 => 400.0, 50 => 500.0, 75 => 600.0}, + frequency_distribution: %{200 => 1, 400 => 3, 500 => 3, 700 => 1, 900 => 1}, + mode: [500, 400], + minimum: 200, + maximum: 900, + lower_outlier_bound: 100.0, + upper_outlier_bound: 900.0, + outliers: [], + sample_size: 9 + } + end + + test "returns Statistex struct with outliers" do + assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) == + %Statistex{ + total: 4450, + average: 445.0, + variance: 61_361.11111111111, + standard_deviation: 247.71175004652304, + standard_deviation_ratio: 0.5566556180820742, + median: 475.0, + percentiles: %{25 => 350.0, 50 => 475.0, 75 => 525.0}, + frequency_distribution: %{50 => 2, 450 => 3, 500 => 3, 600 => 1, 900 => 1}, + mode: [500, 450], + minimum: 50, + maximum: 900, + lower_outlier_bound: 87.5, + upper_outlier_bound: 787.5, + outliers: [50, 
50, 900], + sample_size: 10 + } + end + + # https://www.youtube.com/watch?v=rZJbj2I-_Ek + test "gets outliers from the sample right" do + # One could argue that this is controversial, R comes up with these results (by default): + # > summary(c(9, 9, 10, 10, 10, 11, 12, 36)) + # Min. 1st Qu. Median Mean 3rd Qu. Max. + # 9.00 9.75 10.00 13.38 11.25 36.00 + # + # R by default uses type 7 interpolation, we implemented type 6 interpolation though. Which + # R can also use: + # > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 6) + # 25% 50% 75% + # 9.25 10.00 11.75 + # Which is our result. + + assert %Statistex{ + median: 10.0, + percentiles: %{25 => 9.25, 50 => 10.0, 75 => 11.75}, + minimum: 9, + maximum: 36, + lower_outlier_bound: 5.5, + upper_outlier_bound: 15.5, + outliers: [36] + } = Statistex.statistics([9, 9, 10, 10, 10, 11, 12, 36], exclude_outliers: false) + end + + # https://en.wikipedia.org/wiki/Box_plot#Example_with_outliers + test "another example with outliers" do + data = [ + 52, + 57, + 57, + 58, + 63, + 66, + 66, + 67, + 67, + 68, + 69, + 70, + 70, + 70, + 70, + 72, + 73, + 75, + 75, + 76, + 76, + 78, + 79, + 89 + ] + + assert %Statistex{ + median: 70.0, + percentiles: %{25 => 66.0, 50 => 70.0, 75 => 75.0}, + # report interquantile range? 
+ lower_outlier_bound: 52.5, + upper_outlier_bound: 88.5, + outliers: [52, 89] + } = Statistex.statistics(data, exclude_outliers: false) + end + + # https://en.wikipedia.org/wiki/Interquartile_range#Data_set_in_a_table + test "quartile example" do + assert %Statistex{ + median: 87.0, + percentiles: %{25 => 31.0, 50 => 87.0, 75 => 119.0} + } = + Statistex.statistics([7, 7, 31, 31, 47, 75, 87, 115, 116, 119, 119, 155, 177], + exclude_outliers: false + ) + end end describe "property testing as we might get loads of data" do @@ -30,6 +188,17 @@ defmodule Statistex.StatistexTest do defp assert_statistics_properties(samples) do stats = statistics(samples) + assert_basic_statistics(stats) + assert_mode_in_samples(stats, samples) + assert_frequencies(stats, samples) + assert_bounds_and_outliers(stats, samples) + + # shuffling values around shouldn't change the results + shuffled_stats = samples |> Enum.shuffle() |> statistics() + assert stats == shuffled_stats + end + + defp assert_basic_statistics(stats) do assert stats.sample_size >= 1 assert stats.minimum <= stats.maximum @@ -41,11 +210,15 @@ defmodule Statistex.StatistexTest do assert stats.median == stats.percentiles[50] + assert stats.median >= stats.percentiles[25] + assert stats.percentiles[75] >= stats.median + assert stats.variance >= 0 assert stats.standard_deviation >= 0 assert stats.standard_deviation_ratio >= 0 + end - # mode actually occurs in the samples + defp assert_mode_in_samples(stats, samples) do case stats.mode do [_ | _] -> Enum.each(stats.mode, fn mode -> @@ -59,7 +232,9 @@ defmodule Statistex.StatistexTest do mode -> assert mode in samples end + end + defp assert_frequencies(stats, samples) do frequency_distribution = stats.frequency_distribution frequency_entry_count = map_size(frequency_distribution) @@ -76,7 +251,7 @@ defmodule Statistex.StatistexTest do # all samples are in frequencies Enum.each(samples, fn sample -> assert Map.has_key?(frequency_distribution, sample) end) - # counts some up 
to sample_size + # counts of frequencies sum up to sample_size count_sum = frequency_distribution |> Map.values() @@ -85,6 +260,37 @@ defmodule Statistex.StatistexTest do assert count_sum == stats.sample_size end + defp assert_bounds_and_outliers(stats, samples) do + Enum.each(stats.outliers, fn outlier -> + assert outlier in samples + assert outlier < stats.lower_outlier_bound || outlier > stats.upper_outlier_bound + end) + + assert stats.lower_outlier_bound <= stats.percentiles[25] + assert stats.upper_outlier_bound >= stats.percentiles[75] + + non_outlier_statistics = Statistex.statistics(samples, exclude_outliers: true) + # outlier or not, outliers or bounds aren't changed + assert non_outlier_statistics.outliers == stats.outliers + assert non_outlier_statistics.lower_outlier_bound == stats.lower_outlier_bound + assert non_outlier_statistics.upper_outlier_bound == stats.upper_outlier_bound + + if Enum.empty?(stats.outliers) do + # no outliers? Then excluding outliers shouldn't change anything! + assert non_outlier_statistics == stats + else + assert non_outlier_statistics.sample_size < stats.sample_size + assert non_outlier_statistics.standard_deviation < stats.standard_deviation + # property may not hold for the std_dev ratio seemingly as values may be skewed too much + + frequency_occurrences = Map.keys(non_outlier_statistics.percentiles) + + # outliers don't make an appearance in the frequency occurrences + assert MapSet.intersection(MapSet.new(stats.outliers), MapSet.new(frequency_occurrences)) == + MapSet.new([]) + end + end + defp big_list_big_floats do sized(fn size -> resize(