-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathexp-CLT.lisp
106 lines (95 loc) · 9.58 KB
/
exp-CLT.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
;; -*- Mode:Lisp; Syntax:ANSI-Common-LISP; Coding:us-ascii-unix; fill-column:158 -*-
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;; @file exp-CLT.lisp
;; @author Mitch Richling <https://www.mitchr.me>
;; @brief Theoretical family of probability distributions of sums of an input PDF represented as a historgram.@EOL
;; @std Common Lisp
;; @see
;; @copyright
;; @parblock
;; Copyright (c) 1993,1995,1996,1997,2003,2008,2009,2010,2012, 2015, Mitchell Jay Richling <https://www.mitchr.me> All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
;;
;; 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer.
;;
;; 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the following disclaimer in the documentation
;; and/or other materials provided with the distribution.
;;
;; 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
;; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
;; OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;; LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
;; DAMAGE.
;; @endparblock
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defun exCLT (rv-dat wt-dat max-nsum)
"Compute PDFs for the repeated sum of a random variable
rv-dat is an array of discrete random variable values (non-negative integers). wt-dat is an array of weights i.e. the probability, density, or count. This
function transforms the weights into a PDF by dividing the weights by a constant value so that they will all sum to 1.
A family of max-nsum theoretical probability distributions is returned, each as a column of the returned 2D array. The i'th PDF is the PDF resulting from
summing i independent copies of the input PDF.
This function allows us to see the central limit theorem in action if we divide each derived random variable by i. It can be a real eye-opener to see just
how quickly, or slowly, a particular distribution's mean sampling distribution approaches normality!
The algorithm for doing the PDF computations is iterative using the i'th PDF for the computation of the (i+1)'th PDF. This renders the PDFs very quickly when
compared to a naive implementation. Even with a good algorithm, this kind of computation is best done in a fast language (C++, Fortran, or a fast LISP
implementation like SCBL)."
(let* ((cvp nil) ; Column j has the PDF for nsum==j+1
(mx-in-ut (reduce #'max rv-dat)) ; Maximum value in input histogram
(num-buck (1+ mx-in-ut)) ; Size of buckets in our base histogram
(max-buck (make-array (1+ max-nsum) :initial-element 0)) ; (aref max-buck j) has the max ut for nsum=j+1
(cut-tall (1+ (* max-nsum num-buck)))) ; Number of rows in cvp
(setq cvp (make-array (list cut-tall max-nsum) :initial-element 0)) ; Initialize cvp to 0
(loop for ut across rv-dat ; Populate first column of cvp (nsum==1)
for wt across wt-dat ; After this loop, rv-dat & wt-dat are never used
when (<= 0 ut) ; Ignore negative values
do (incf (aref cvp ut 0) wt)) ; incf in case we have duplicate rv-dat values
(loop with tot-p = (reduce #'+ (mjr_arr_get-col cvp 0)) ; Fix the PDF so the total probably sums to 1
for i from 0 to mx-in-ut ; Multiply all weight values by the same
do (setf (aref cvp i 0) (/ (aref cvp i 0) tot-p))) ; factor.
(setf (aref max-buck 0) (1- num-buck)) ; Set the max bucket index for col 0
(loop for nsum from 2 upto max-nsum ; Compute the PDFs for the remaining nsum values
for nsum-1 = (1- nsum) ; Store nsum-1 away for later use
do (format 't "NSUM: ~3d~%" nsum) ; Can take a while, print status
do (loop for i from 0 upto (aref max-buck 0) ; Value for nsum=1 case
for piv = (aref cvp i 0) ; Prob for nsum=1 col
do (loop for j from 0 upto (aref max-buck (1- nsum-1)) ; Value for nsum-1 case
for pjv = (aref cvp j (1- nsum-1)) ; Prob for nsum-1 case
do (incf (aref cvp (+ i j) nsum-1) (* piv pjv)) ; Add up joint probability
do (setf (aref max-buck nsum-1) (+ i j))))) ; Update max value so far (slow, but easy)
cvp)) ; Return the result
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defun exCLT-CSV (in-file out-file max-nsum)
"Read a histogram from a CSV file, compute sum PDFs with exCLT, and write result to a CSV file.
This function first reads in a CSV file containing a histogram. The first column should be the value of a discreet random
variable (a non-negative integer). The second column is the weights."
(let* ((in-hist (mjr_util_read-csv-file ; Load histograms from disk
in-file ; The file with the data
:return-as-array 't ; Return a matrix, not a list of lists
:filter-func #'mjr_string_read-as-lisp)) ; Convert input strings into numbers
(rv-dat (mjr_arr_get-col in-hist 0)) ; Column with variable values
(wt-dat (mjr_arr_get-col in-hist 1)) ; Column with weights/probabilities
(cvp (exCLT rv-dat wt-dat max-nsum))) ; Compute PDFs
(mjr_util_write-csv-file cvp out-file ; Write out the PDFs in CSV form
:titles (loop for i from 1 upto max-nsum ; Put numeric column titles on the first
collect (format nil "n.~d" i))) ; line of the file.
cvp)) ; Return the result
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Mathematical application: Most lower level statistics texts will describe an application of the central limit theorem to the sampling distribution from
;; arbitrary populations. The fundamental result is that the sampling distribution will approach normality as the sample size grows to infinity. Students
;; are normally given advice like "a sample of 30 is sufficient". The data in exCLT-IN.csv describes a very non-normal PDF. The following code will read in
;; that distribution, and compute the sampling distribution from sample sizes up to 30. The R code in exCLT-AUX.R illustrates how to use the results.
(defvar cvp) ; Global var to store the results
(null (setq cvp (exCLT-csv "exp-CLT-IN.csv" "exCLT-OUT.csv" 30))) ; Run the func, and store the results
(format 't "Data stored in variable CVP~%") ; Remind the user about the CVP variable.
(format 't "Data written to file exCLT-OUT.csv~%") ; Remind the user about the output file.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Interesting compute farm/grid/cloud application: If the random variable is serial job CPU utilization (an integer from 0 to 100) and the weights are the
;; number of wall clock seconds consumed by jobs with that job CPU utilization, then the output PDFs will represent HOST utilization (when we divide by the
;; number of CPUs on the host) for various job counts.