Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 163 additions & 34 deletions lib/statistex.ex
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ defmodule Statistex do
:mode,
:minimum,
:maximum,
:outliers_bounds,
:outliers,
sample_size: 0
]

Expand All @@ -47,6 +49,8 @@ defmodule Statistex do
mode: mode,
minimum: number,
maximum: number,
outliers_bounds: {number, number},
outliers: [number],
sample_size: non_neg_integer
}

Expand Down Expand Up @@ -81,6 +85,8 @@ defmodule Statistex do

@empty_list_error_message "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least on number."

@iqr_factor 1.5

@doc """
Calculate all statistics Statistex offers for a given list of numbers.

Expand All @@ -89,7 +95,15 @@ defmodule Statistex do
`ArgumentError` is raised if the given list is empty.

## Options
In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can be given. The 50th percentile is always calculated as it is the median.

The `percentiles` option takes the percentile ranks to calculate (see `percentiles/2`).
The 25th, 50th (median) and 75th percentiles are always calculated.

The option `exclude_outliers` can be set to `:once`, `:repeatedly` or `nil`;
`nil` is the default. If this option is set to `:once`, the outliers are excluded
and the statistics are calculated with the rest of the samples. The value
`:repeatedly` repeats the outlier exclusion until the samples no longer
contain outliers.

## Examples

Expand All @@ -100,7 +114,7 @@ defmodule Statistex do
standard_deviation: 200.0,
standard_deviation_ratio: 0.4,
median: 500.0,
percentiles: %{50 => 500.0},
percentiles: %{25 => 400.0, 50 => 500.0, 75 => 600.0},
frequency_distribution: %{
200 => 1,
400 => 3,
Expand All @@ -112,7 +126,9 @@ defmodule Statistex do
minimum: 200,
maximum: 900,
sample_size: 9,
total: 4500
total: 4500,
outliers: [],
outliers_bounds: {200, 900.0}
}

iex> Statistex.statistics([])
Expand All @@ -125,13 +141,15 @@ defmodule Statistex do
standard_deviation: 0.0,
standard_deviation_ratio: 0.0,
median: 0.0,
percentiles: %{50 => 0.0},
percentiles: %{25 => 0.0, 50 => 0.0, 75 => 0.0},
frequency_distribution: %{0 => 4},
mode: 0,
minimum: 0,
maximum: 0,
sample_size: 4,
total: 0
total: 0,
outliers: [],
outliers_bounds: {0.0, 0.0}
}

"""
Expand All @@ -143,33 +161,65 @@ defmodule Statistex do
end

def statistics(samples, configuration) do
total = total(samples)
sample_size = length(samples)
average = average(samples, total: total, sample_size: sample_size)
variance = variance(samples, average: average, sample_size: sample_size)
standard_deviation = standard_deviation(samples, variance: variance)
samples = Enum.sort(samples)

standard_deviation_ratio =
standard_deviation_ratio(samples, standard_deviation: standard_deviation)
minimum = hd(samples)
maximum = List.last(samples)

percentiles = calculate_percentiles(samples, configuration)

frequency_distribution = frequency_distribution(samples)

%__MODULE__{
total: total,
average: average,
variance: variance,
standard_deviation: standard_deviation,
standard_deviation_ratio: standard_deviation_ratio,
median: median(samples, percentiles: percentiles),
percentiles: percentiles,
frequency_distribution: frequency_distribution,
mode: mode(samples, frequency_distribution: frequency_distribution),
minimum: minimum(samples),
maximum: maximum(samples),
sample_size: sample_size
}
outliers_bounds =
do_outliers_bounds(samples, percentiles: percentiles, minimum: minimum, maximum: maximum)

{outliers, rest} = do_outliers(samples, outliers_bounds: outliers_bounds)

if exclude_outliers?(configuration) and not Enum.empty?(outliers) do
configuration =
configuration
|> Keyword.put(:outliers_excluded, true)
|> Keyword.update!(:exclude_outliers, fn
:once -> :stop
:repeatedly -> :repeatedly
end)
|> Keyword.update(:acc_outliers, outliers, fn list -> list ++ outliers end)

statistics(rest, configuration)
else
outliers = outliers ++ Keyword.get(configuration, :acc_outliers, [])

total = total(samples)
sample_size = length(samples)
average = average(samples, total: total, sample_size: sample_size)
variance = variance(samples, average: average, sample_size: sample_size)

frequency_distribution = frequency_distribution(samples)

standard_deviation = standard_deviation(samples, variance: variance)

standard_deviation_ratio =
standard_deviation_ratio(samples, standard_deviation: standard_deviation)

%__MODULE__{
total: total,
average: average,
variance: variance,
standard_deviation: standard_deviation,
standard_deviation_ratio: standard_deviation_ratio,
median: median(samples, percentiles: percentiles),
percentiles: percentiles,
frequency_distribution: frequency_distribution,
mode: mode(samples, frequency_distribution: frequency_distribution),
minimum: minimum,
maximum: maximum,
outliers_bounds: outliers_bounds,
outliers: outliers,
sample_size: sample_size
}
end
end

# Tells whether the `:exclude_outliers` entry of the configuration asks for
# outliers to be excluded. Only `:once` and `:repeatedly` count; any other
# value (including a missing key) yields `false`.
defp exclude_outliers?(configuration) do
  case Keyword.get(configuration, :exclude_outliers) do
    :once -> true
    :repeatedly -> true
    _other -> false
  end
end

@doc """
Expand Down Expand Up @@ -396,8 +446,10 @@ defmodule Statistex do
percentiles_configuration = Keyword.get(configuration, :percentiles, [])

# the 25th, median and 75th percentiles are always added so that the median and
# the outlier bounds can use them directly
percentiles_configuration = Enum.uniq([@median_percentile | percentiles_configuration])
percentiles(samples, percentiles_configuration)
percentiles_configuration =
Enum.uniq([25, @median_percentile, 75 | percentiles_configuration])

Percentile.percentiles(samples, percentiles_configuration)
end

@doc """
Expand Down Expand Up @@ -447,7 +499,9 @@ defmodule Statistex do
"""
@spec percentiles(samples, number | [number(), ...]) ::
percentiles()
defdelegate(percentiles(samples, percentiles), to: Percentile)
def percentiles(samples, percentiles) do
  # The samples are sorted here before they are handed to the Percentile module.
  sorted_samples = Enum.sort(samples)

  Percentile.percentiles(sorted_samples, percentiles)
end

@doc """
A map showing which sample occurs how often in the samples.
Expand Down Expand Up @@ -541,10 +595,85 @@ defmodule Statistex do

def median(samples, options) do
percentiles =
Keyword.get_lazy(options, :percentiles, fn -> percentiles(samples, @median_percentile) end)
Keyword.get_lazy(options, :percentiles, fn ->
Percentile.percentiles(samples, @median_percentile)
end)

get_percentile(samples, @median_percentile, percentiles)
end

@doc """
Calculates the lower and upper bound for outliers.

Any sample that is `<` the lower bound and any sample that is `>` the upper
bound is an outlier of the given `samples`.

The bounds are based on the 25th and 75th percentile: the distance between
both (the interquartile range) is multiplied by `1.5` and then subtracted from
the 25th percentile and added to the 75th percentile. The lower bound is never
smaller than the minimum and the upper bound never greater than the maximum of
the samples.

`ArgumentError` is raised if the given list is empty.

## Options

Already calculated values can be passed as `:percentiles`, `:minimum` and
`:maximum`. Values that are not given are calculated from the samples.

## Examples

    iex> Statistex.outliers_bounds([3, 4, 5])
    {3, 5}

    iex> Statistex.outliers_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50])
    {22.5, 50}

    iex> Statistex.outliers_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99])
    {50, 80.625}
"""
@spec outliers_bounds(samples, keyword) :: {lower :: number, upper :: number}
def outliers_bounds(samples, options \\ [])

def outliers_bounds([], _options) do
  raise ArgumentError, @empty_list_error_message
end

def outliers_bounds(samples, options) do
  sorted_samples = Enum.sort(samples)

  do_outliers_bounds(sorted_samples, options)
end

# Computes the `{lower, upper}` outlier bounds. Values passed as `:percentiles`,
# `:minimum` and `:maximum` in `options` are used as given; missing ones are
# derived from `samples` (first element as minimum, last element as maximum, so
# callers pass the samples sorted).
defp do_outliers_bounds(samples, options) do
  percentiles =
    case Keyword.fetch(options, :percentiles) do
      {:ok, given} -> given
      :error -> Percentile.percentiles(samples, [25, 75])
    end

  minimum =
    case Keyword.fetch(options, :minimum) do
      {:ok, given} -> given
      :error -> hd(samples)
    end

  maximum =
    case Keyword.fetch(options, :maximum) do
      {:ok, given} -> given
      :error -> List.last(samples)
    end

  lower_quartile = get_percentile(samples, 25, percentiles)
  upper_quartile = get_percentile(samples, 75, percentiles)
  reach = (upper_quartile - lower_quartile) * @iqr_factor

  # The bounds are clamped so that they never leave the range of the samples.
  lower_bound = max(lower_quartile - reach, minimum)
  upper_bound = min(upper_quartile + reach, maximum)

  {lower_bound, upper_bound}
end

@doc """
Returns all outliers for the given `samples`.

Outliers are the samples below the lower bound or above the upper bound (see
`outliers_bounds/2`). Samples below the lower bound are listed first, in
ascending order, followed by the samples above the upper bound, in descending
order.

## Options

Already calculated bounds can be passed as `:outliers_bounds`. If they are not
given, they are calculated from the samples and the remaining options.

## Examples

    iex> Statistex.outliers([3, 4, 5])
    []

    iex> Statistex.outliers([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50])
    [1, 2, 6]

    iex> Statistex.outliers([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99])
    [99, 99, 99]
"""
@spec outliers(samples, keyword) :: samples | []
def outliers(samples, options \\ []) do
  samples
  |> Enum.sort()
  |> do_outliers(options)
  |> elem(0)
end

# Splits ascending-sorted `samples` into `{outliers, rest}`, using the bounds
# given as `:outliers_bounds` or, when absent, the ones calculated from the
# samples.
#
# Note on ordering: the upper end is cut off from the reversed list, so the
# samples above the upper bound and the returned rest are both in descending
# order.
defp do_outliers(samples, options) do
  {lower_bound, upper_bound} =
    case Keyword.fetch(options, :outliers_bounds) do
      {:ok, bounds} -> bounds
      :error -> do_outliers_bounds(samples, options)
    end

  {too_low, not_too_low} = Enum.split_while(samples, &(&1 < lower_bound))

  {too_high, remaining} =
    not_too_low
    |> Enum.reverse()
    |> Enum.split_while(&(&1 > upper_bound))

  {too_low ++ too_high, remaining}
end

Map.get_lazy(percentiles, @median_percentile, fn ->
samples |> percentiles(@median_percentile) |> Map.fetch!(@median_percentile)
# Looks up `percentile` in the already calculated `percentiles` map and only
# calculates it from `samples` when the map has no entry for it.
defp get_percentile(samples, percentile, percentiles) do
  case Map.fetch(percentiles, percentile) do
    {:ok, value} ->
      value

    :error ->
      samples
      |> Percentile.percentiles(percentile)
      |> Map.fetch!(percentile)
  end
end

Expand Down
3 changes: 1 addition & 2 deletions lib/statistex/percentile.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@ defmodule Statistex.Percentile do

def percentiles(samples, percentile_ranks) do
number_of_samples = length(samples)
sorted_samples = Enum.sort(samples)

percentile_ranks
|> List.wrap()
|> Enum.reduce(%{}, fn percentile_rank, acc ->
perc = percentile(sorted_samples, number_of_samples, percentile_rank)
perc = percentile(samples, number_of_samples, percentile_rank)
Map.put(acc, percentile_rank, perc)
end)
end
Expand Down
34 changes: 17 additions & 17 deletions test/statistex/percentile_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@ defmodule Statistex.PercentileTest do

doctest Statistex.Percentile

@nist_sample_data [
95.1772,
95.1567,
95.1937,
95.1959,
95.1442,
95.0610,
95.1591,
95.1195,
95.1065,
95.0925,
95.1990,
95.1682
]
@nist_sample_data Enum.sort([
95.1772,
95.1567,
95.1937,
95.1959,
95.1442,
95.0610,
95.1591,
95.1195,
95.1065,
95.0925,
95.1990,
95.1682
])

# Test data from:
# http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
Expand Down Expand Up @@ -49,7 +49,7 @@ defmodule Statistex.PercentileTest do
end

describe "a list of two elements" do
@samples [300, 200]
@samples [200, 300]
test "1st percentile (small sample size simply picks first element)" do
%{1 => result} = percentiles(@samples, [1])
assert result == 200.0
Expand All @@ -67,7 +67,7 @@ defmodule Statistex.PercentileTest do
end

describe "seemingly problematic 2 element list [9, 1]" do
@samples [9, 1]
@samples [1, 9]

percentiles = %{
25 => 1,
Expand All @@ -88,7 +88,7 @@ defmodule Statistex.PercentileTest do
end

describe "a list of three elements" do
@samples [100, 300, 200]
@samples [100, 200, 300]
test "1st percentile (small sample size simply picks first element)" do
%{1 => result} = percentiles(@samples, [1])
assert result == 100.0
Expand Down
Loading