Name | Executed | Routines | % | Executed | Lines | % | Unexecuted |
/home/matt/eu/rds/include/std/stats.e | 21 | 21 | 100.00% | 277 | 278 | 99.64% | 1 |
Routine | Executed | Lines | Unexecuted | |
movavg() | 29 | 30 | 96.67% | 1 |
avedev() | 21 | 21 | 100.00% | 0 |
average() | 7 | 7 | 100.00% | 0 |
central_moment() | 6 | 6 | 100.00% | 0 |
count() | 4 | 4 | 100.00% | 0 |
emovavg() | 14 | 14 | 100.00% | 0 |
geomean() | 19 | 19 | 100.00% | 0 |
harmean() | 19 | 19 | 100.00% | 0 |
kurtosis() | 10 | 10 | 100.00% | 0 |
largest() | 16 | 16 | 100.00% | 0 |
massage() | 5 | 5 | 100.00% | 0 |
median() | 10 | 10 | 100.00% | 0 |
mode() | 12 | 12 | 100.00% | 0 |
range() | 20 | 20 | 100.00% | 0 |
raw_frequency() | 22 | 22 | 100.00% | 0 |
skewness() | 7 | 7 | 100.00% | 0 |
small() | 5 | 5 | 100.00% | 0 |
smallest() | 16 | 16 | 100.00% | 0 |
stdev() | 19 | 19 | 100.00% | 0 |
sum() | 9 | 9 | 100.00% | 0 |
sum_central_moments() | 2 | 2 | 100.00% | 0 |
# | Executed | |
1 | -- (c) Copyright - See License.txt | |
2 | -- | |
3 | namespace stats | |
4 | ||
5 | --**** | |
6 | -- == Statistics | |
7 | -- **Page Contents** | |
8 | -- | |
9 | -- < | |
10 | -- | |
11 | -- === Routines | |
12 | ||
13 | include std/math.e | |
14 | include std/sort.e | |
15 | include std/sequence.e | |
16 | ||
17 | ||
18 | --** | |
19 | -- Determines the k-th smallest value from the supplied set of numbers. | |
20 | -- | |
21 | -- Parameters: | |
22 | -- # ##data_set## : The list of values from which the smallest value is chosen. | |
23 | -- # ##ordinal_idx## : The relative index of the desired smallest value. | |
24 | -- | |
25 | -- Returns: | |
26 | -- A **sequence**, {The k-th smallest value, its index in the set} | |
27 | -- | |
28 | -- Comments: | |
29 | -- ##small##() is used to return a value based on its size relative to | |
30 | -- all the other elements in the sequence. When ##ordinal_idx## is 1, the smallest value is returned. Use ##ordinal_idx = length(data_set)## to return the highest. | |
31 | -- | |
32 | -- If ##ordinal_idx## is less than one, or greater than the length of ##data_set##, | |
33 | -- an empty sequence is returned. | |
34 | -- | |
35 | -- The set of values does not have to be in any particular order. The values may be any Euphoria object. | |
36 | -- | |
37 | -- Example 1: | |
38 | -- | |
39 | -- ? small( {4,5,6,8,5,4,3,"text"}, 3 ) -- Ans: {4,1} (The 3rd smallest value) | |
40 | -- ? small( {4,5,6,8,5,4,3,"text"}, 1 ) -- Ans: {3,7} (The 1st smallest value) | |
41 | -- ? small( {4,5,6,8,5,4,3,"text"}, 7 ) -- Ans: {8,4} (The 7th smallest value) | |
42 | -- ? small( {"def", "qwe", "abc", "try"}, 2 ) -- Ans: {"def", 1} (The 2nd smallest value) | |
43 | -- ? small( {1,2,3,4}, -1) -- Ans: {} -- no-value | |
44 | -- ? small( {1,2,3,4}, 10) -- Ans: {} -- no-value | |
45 | -- | |
46 | -- | |
47 | ||
48 | 4 | |
49 | sequence lSortedData | |
50 | ||
51 | 4 | if ordinal_idx < 1 or ordinal_idx > length(data_set) then |
52 | 3 | return {} |
53 | end if | |
54 | ||
55 | 1 | lSortedData = sort(data_set) |
56 | ||
57 | 1 | return {lSortedData[ordinal_idx], find(lSortedData[ordinal_idx], data_set)} |
58 | end function | |
59 | ||
60 | --** | |
61 | -- Returns the largest of the data points that are atoms. | |
62 | -- | |
63 | -- Parameters: | |
64 | -- # ##data_set## : a list of 1 or more numbers among which you want the largest. | |
65 | -- | |
66 | -- Returns: | |
67 | -- An **object**, either of: | |
68 | -- * an atom (the largest value) if there is at least one atom item in the set\\ | |
69 | -- * ##{}## if there //is// no largest value. | |
70 | -- | |
71 | -- Comments: | |
72 | -- Any ##data_set## element which is not an atom is ignored. | |
73 | -- | |
74 | -- Example 1: | |
75 | -- | |
76 | -- ? largest( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"} ) -- Ans: 8 | |
77 | -- ? largest( {"just","text"} ) -- Ans: {} | |
78 | -- | |
79 | -- | |
80 | -- See also: | |
81 | -- [[:range]] | |
82 | -- | |
83 | 3 | |
84 | atom result_, temp_ | |
85 | integer lFoundAny | |
86 | 3 | if atom(data_set) then |
87 | 1 | return data_set |
88 | end if | |
89 | 2 | lFoundAny = 0 |
90 | 2 | for i = 1 to length(data_set) do |
91 | 18 | if atom(data_set[i]) then |
92 | 15 | temp_ = data_set[i] |
93 | 15 | if lFoundAny then |
94 | 14 | if temp_ > result_ then |
95 | 1 | result_ = temp_ |
96 | end if | |
97 | else | |
98 | 1 | result_ = temp_ |
99 | 1 | lFoundAny = 1 |
100 | end if | |
101 | end if | |
102 | 18 | end for |
103 | 2 | if lFoundAny = 0 then |
104 | 1 | return {} |
105 | end if | |
106 | 1 | return result_ |
107 | end function | |
108 | ||
109 | --** | |
110 | -- Returns the smallest of the data points. | |
111 | -- | |
112 | -- Parameters: | |
113 | -- # ##data_set## : A list of 1 or more numbers for which you want the smallest. | |
114 | -- **Note:** only atom elements are included and any sub-sequence | |
115 | -- elements are ignored. | |
116 | -- | |
117 | -- Returns: | |
118 | -- An **object**, either of: | |
119 | -- * an atom (the smallest value) if there is at least one atom item in the set\\ | |
120 | -- * ##{}## if there //is// no smallest value. | |
121 | -- | |
122 | -- Comments: | |
123 | -- Any ##data_set## element which is not an atom is ignored. | |
124 | -- | |
125 | -- Example 1: | |
126 | -- | |
127 | -- ? smallest( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"} ) -- Ans: 1 | |
128 | -- ? smallest( {"just","text"} ) -- Ans: {} | |
129 | -- | |
130 | -- | |
131 | -- See also: | |
132 | -- [[:range]] | |
133 | 3 | |
134 | atom result_, temp_ | |
135 | integer lFoundAny | |
136 | 3 | if atom(data_set) then |
137 | 1 | return data_set |
138 | end if | |
139 | 2 | lFoundAny = 0 |
140 | 2 | for i = 1 to length(data_set) do |
141 | 18 | if atom(data_set[i]) then |
142 | 15 | temp_ = data_set[i] |
143 | 15 | if lFoundAny then |
144 | 14 | if temp_ < result_ then |
145 | 2 | result_ = temp_ |
146 | end if | |
147 | else | |
148 | 1 | result_ = temp_ |
149 | 1 | lFoundAny = 1 |
150 | end if | |
151 | end if | |
152 | 18 | end for |
153 | 2 | if lFoundAny = 0 then |
154 | 1 | return {} |
155 | end if | |
156 | 1 | return result_ |
157 | end function | |
158 | ||
159 | --** | |
160 | -- Determines a number of //range// statistics for the data set. | |
161 | -- | |
162 | -- Parameters: | |
163 | -- # ##data_set## : a list of 1 or more numbers for which you want the range data. | |
164 | -- | |
165 | -- Returns: | |
166 | -- A **sequence**, empty if no atoms were found, else like {Lowest, Highest, Range, Mid-range} | |
167 | -- | |
168 | -- Comments: | |
169 | -- Any sequence element in ##data_set## is ignored. | |
170 | -- | |
171 | -- Example 1: | |
172 | -- | |
173 | -- ? range( {7,2,8,5,6,6,4,8,6,16,3,3,4,1,8,"text"} ) -- Ans: {1, 16, 15, 8.5} | |
174 | -- | |
175 | -- | |
176 | -- See also: | |
177 | -- [[:smallest]] [[:largest]] | |
178 | -- | |
179 | 3 | |
180 | sequence result_ | |
181 | atom temp_ | |
182 | 3 | integer lFoundAny = 0 |
183 | ||
184 | 3 | if atom(data_set) then |
185 | 1 | data_set = {data_set} |
186 | end if | |
187 | ||
188 | 3 | for i = 1 to length(data_set) do |
189 | 17 | if atom(data_set[i]) then |
190 | 16 | temp_ = data_set[i] |
191 | 16 | if lFoundAny then |
192 | 14 | if temp_ < result_[1] then |
193 | 2 | result_[1] = temp_ |
194 | 12 | elsif temp_ > result_[2] then |
195 | 2 | result_[2] = temp_ |
196 | end if | |
197 | else | |
198 | 2 | result_ = {temp_, temp_, 0, 0} |
199 | 2 | lFoundAny = 1 |
200 | end if | |
201 | end if | |
202 | 17 | end for |
203 | 3 | if lFoundAny = 0 then |
204 | 1 | return {} |
205 | end if | |
206 | 2 | result_[3] = result_[2] - result_[1] |
207 | 2 | result_[4] = (result_[1] + result_[2]) / 2 |
208 | 2 | return result_ |
209 | end function | |
210 | ||
211 | --**** | |
212 | -- Enums used to influence the results of some of these functions. | |
213 | ||
214 | public enum | |
215 | --** | |
216 | -- The supplied data is the entire population. | |
217 | 21 | ST_FULLPOP, |
218 | ||
219 | --** | |
220 | -- The supplied data is only a random sample of the population. | |
221 | 21 | ST_SAMPLE |
222 | ||
223 | public enum | |
224 | --** | |
225 | -- The supplied data consists of only atoms. | |
226 | 21 | ST_ALLNUM, |
227 | ||
228 | --** | |
229 | -- Any sub-sequences (eg. strings) in the supplied data are ignored. | |
230 | 21 | ST_IGNSTR, |
231 | ||
232 | --** | |
233 | -- Any sub-sequences (eg. strings) in the supplied data are assumed to | |
234 | -- have the value zero. | |
235 | 21 | ST_ZEROSTR, |
236 | ||
237 | $ | |
238 | ||
239 | 151 | |
240 | 151 | switch subseq_opt do |
241 | case ST_IGNSTR then | |
242 | 16 | return remove_subseq(data_set, SEQ_NOALT) |
243 | ||
244 | case ST_ZEROSTR then | |
245 | 3 | return remove_subseq(data_set, 0) |
246 | ||
247 | case else | |
248 | 132 | return data_set |
249 | end switch | |
250 | end function | |
251 | ||
252 | --** | |
253 | -- Returns the standard deviation based on the population. | |
254 | -- | |
255 | -- Parameters: | |
256 | -- # ##data_set## : a list of 1 or more numbers for which you want the estimated standard deviation. | |
257 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
258 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
259 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
260 | -- # ##population_type## : an integer. ST_SAMPLE (the default) assumes that ##data_set## is a random | |
261 | -- sample of the total population. ST_FULLPOP means that ##data_set## is the | |
262 | -- entire population. | |
263 | -- | |
264 | -- Returns: | |
265 | -- An **atom**, the estimated standard deviation. | |
266 | -- An empty **sequence** means that there is no meaningful data to calculate from. | |
267 | -- | |
268 | -- Comments: | |
269 | -- ##stdev##() is a measure of how values are different from the average. | |
270 | -- | |
271 | -- The numbers in ##data_set## can either be the entire population of values or | |
272 | -- just a random subset. You indicate which in the ##population_type## parameter. By default | |
273 | -- ##data_set## represents a sample and not the entire population. When using this | |
274 | -- function with sample data, the result is an //estimated// standard deviation. | |
275 | -- | |
276 | -- If the data can contain sub-sequences, such as strings, you need to let the | |
277 | -- function know about this, otherwise it assumes every value in ##data_set## is | |
278 | -- a number. If that is not the case then the function will crash. So it is | |
279 | -- important that if it can possibly contain sub-sequences that you tell this | |
280 | -- function what to do with them. Your choices are to ignore them or assume they | |
281 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
282 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
283 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
284 | -- **Note** It is faster if the data only contains numbers. | |
285 | -- | |
286 | -- The equation for standard deviation (full population; for ST_SAMPLE the divisor is N-1) is: | |
287 | -- {{{ | |
288 | -- stdev(X) ==> SQRT(SUM(SQ(X{1..N} - MEAN)) / (N)) | |
289 | -- }}} | |
290 | -- | |
291 | -- Example 1: | |
292 | -- | |
293 | -- ? stdev( {4,5,6,7,5,4,3,7} ) -- Ans: 1.457737974 | |
294 | -- ? stdev( {4,5,6,7,5,4,3,7} ,, ST_FULLPOP) -- Ans: 1.363589014 | |
295 | -- ? stdev( {4,5,6,7,5,4,3,"text"} , ST_IGNSTR) -- Ans: 1.345185418 | |
296 | -- ? stdev( {4,5,6,7,5,4,3,"text"}, ST_IGNSTR, ST_FULLPOP ) -- Ans: 1.245399698 | |
297 | -- ? stdev( {4,5,6,7,5,4,3,"text"} , ST_ZEROSTR) -- Ans: 2.121320344 | |
298 | -- ? stdev( {4,5,6,7,5,4,3,"text"}, ST_ZEROSTR, ST_FULLPOP ) -- Ans: 1.984313483 | |
299 | -- | |
300 | -- | |
301 | -- See also: | |
302 | -- [[:average]], [[:avedev]] | |
303 | -- | |
304 | ||
305 | 39 | |
306 | atom lSum | |
307 | atom lMean | |
308 | integer lCnt | |
309 | ||
310 | 39 | data_set = massage(data_set, subseq_opt) |
311 | ||
312 | 39 | lCnt = length(data_set) |
313 | ||
314 | 39 | if lCnt = 0 then |
315 | 6 | return {} |
316 | end if | |
317 | 33 | if lCnt = 1 then |
318 | 6 | return 0 |
319 | end if | |
320 | ||
321 | 27 | lSum = 0 |
322 | 27 | for i = 1 to length(data_set) do |
323 | 23535 | lSum += data_set[i] |
324 | 23535 | end for |
325 | ||
326 | 27 | lMean = lSum / lCnt |
327 | 27 | lSum = 0 |
328 | 27 | for i = 1 to length(data_set) do |
329 | 23535 | lSum += power(data_set[i] - lMean, 2) |
330 | 23535 | end for |
331 | ||
332 | 27 | if population_type = ST_SAMPLE then |
333 | 25 | lCnt -= 1 |
334 | end if | |
335 | ||
336 | 27 | return power(lSum / lCnt, 0.5) |
337 | end function | |
338 | ||
339 | --** | |
340 | -- Returns the average of the absolute deviations of data points from their mean. | |
341 | -- | |
342 | -- Parameters: | |
343 | -- # ##data_set## : a list of 1 or more numbers for which you want the mean of the absolute deviations. | |
344 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
345 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
346 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
347 | -- # ##population_type## : an integer. ST_SAMPLE (the default) assumes that ##data_set## is a random | |
348 | -- sample of the total population. ST_FULLPOP means that ##data_set## is the | |
349 | -- entire population. | |
350 | -- | |
351 | -- Returns: | |
352 | -- An **atom**, the deviation from the mean.\\ | |
353 | -- An empty **sequence** means that there is no meaningful data to calculate from. | |
354 | -- | |
355 | -- Comments: | |
356 | -- ##avedev##() is a measure of the variability in a data set. Its statistical | |
357 | -- properties are less well behaved than those of the standard deviation, which is | |
358 | -- why it is used less. | |
359 | -- | |
360 | -- The numbers in ##data_set## can either be the entire population of values or | |
361 | -- just a random subset. You indicate which in the ##population_type## parameter. By default | |
362 | -- ##data_set## represents a sample and not the entire population. When using this | |
363 | -- function with sample data, the result is an //estimated// deviation. | |
364 | -- | |
365 | -- If the data can contain sub-sequences, such as strings, you need to let the | |
366 | -- function know about this, otherwise it assumes every value in ##data_set## is | |
367 | -- a number. If that is not the case then the function will crash. So it is | |
368 | -- important that if it can possibly contain sub-sequences that you tell this | |
369 | -- function what to do with them. Your choices are to ignore them or assume they | |
370 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
371 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
372 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
373 | -- **Note** It is faster if the data only contains numbers. | |
374 | -- | |
375 | -- The equation for absolute average deviation (full population; for ST_SAMPLE the divisor is N-1) is~: | |
376 | -- {{{ | |
377 | -- avedev(X) ==> SUM( ABS(X{1..N} - MEAN(X)) ) / N | |
378 | -- }}} | |
379 | -- | |
380 | -- Example 1: | |
381 | -- | |
382 | -- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,7} ) -- Ans: 1.966666667 | |
383 | -- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,7},, ST_FULLPOP ) -- Ans: 1.84375 | |
384 | -- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_IGNSTR ) -- Ans: 1.99047619 | |
385 | -- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_IGNSTR,ST_FULLPOP ) -- Ans: 1.857777778 | |
386 | -- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_ZEROSTR ) -- Ans: 2.225 | |
387 | -- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_ZEROSTR, ST_FULLPOP ) -- Ans: 2.0859375 | |
388 | -- | |
389 | -- | |
390 | -- See also: | |
391 | -- [[:average]], [[:stdev]] | |
392 | -- | |
393 | ||
394 | 4 | |
395 | atom lSum | |
396 | atom lMean | |
397 | integer lCnt | |
398 | ||
399 | 4 | data_set = massage(data_set, subseq_opt) |
400 | ||
401 | 4 | lCnt = length(data_set) |
402 | ||
403 | 4 | if lCnt = 0 then |
404 | 2 | return {} |
405 | end if | |
406 | 2 | if lCnt = 1 then |
407 | 1 | return 0 |
408 | end if | |
409 | 1 | lSum = 0 |
410 | ||
411 | 1 | for i = 1 to length(data_set) do |
412 | 15 | lSum += data_set[i] |
413 | 15 | end for |
414 | ||
415 | 1 | lMean = lSum / lCnt |
416 | 1 | lSum = 0 |
417 | 1 | for i = 1 to length(data_set) do |
418 | 15 | if data_set[i] > lMean then |
419 | 8 | lSum += data_set[i] - lMean |
420 | else | |
421 | 7 | lSum += lMean - data_set[i] |
422 | end if | |
423 | 15 | end for |
424 | ||
425 | 1 | if population_type = ST_SAMPLE then |
426 | 1 | lCnt -= 1 |
427 | end if | |
428 | 1 | return lSum / lCnt |
429 | end function | |
430 | ||
431 | --** | |
432 | -- Returns the sum of all the atoms in an object. | |
433 | -- | |
434 | -- Parameters: | |
435 | -- # ##data_set## : Either an atom or a list of numbers to sum. | |
436 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
437 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
438 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
439 | -- | |
440 | -- Returns: | |
441 | -- An **atom**, the sum of the set. | |
442 | -- | |
443 | -- Comments: | |
444 | -- ##sum##() is used as a measure of the magnitude of a sequence of positive values. | |
445 | -- | |
446 | -- If the data can contain sub-sequences, such as strings, you need to let the | |
447 | -- function know about this, otherwise it assumes every value in ##data_set## is | |
448 | -- a number. If that is not the case then the function will crash. So it is | |
449 | -- important that if it can possibly contain sub-sequences that you tell this | |
450 | -- function what to do with them. Your choices are to ignore them or assume they | |
451 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
452 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
453 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
454 | -- **Note** It is faster if the data only contains numbers. | |
455 | -- | |
456 | -- The equation is~: | |
457 | -- | |
458 | -- {{{ | |
459 | -- sum(X) ==> SUM( X{1..N} ) | |
460 | -- }}} | |
461 | -- | |
462 | -- Example 1: | |
463 | -- | |
464 | -- ? sum( {7,2,8.5,6,6,-4.8,6,6,3.341,-8,"text"}, ST_ZEROSTR ) -- Ans: 32.041 | |
465 | -- | |
466 | -- | |
467 | -- See also: | |
468 | -- [[:average]] | |
469 | ||
470 | 36 | |
471 | atom result_ | |
472 | 36 | if atom(data_set) then |
473 | 1 | return data_set |
474 | end if | |
475 | ||
476 | 35 | data_set = massage(data_set, subseq_opt) |
477 | 35 | result_ = 0 |
478 | 35 | for i = 1 to length(data_set) do |
479 | 23720 | result_ += data_set[i] |
480 | 23720 | end for |
481 | ||
482 | 35 | return result_ |
483 | end function | |
484 | ||
485 | --** | |
486 | -- Returns the count of all the atoms in an object. | |
487 | -- | |
488 | -- Parameters: | |
489 | -- # ##data_set## : either an atom or a list. | |
490 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
491 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
492 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
493 | -- | |
494 | -- Comments: | |
495 | -- This returns the number of numbers in ##data_set## | |
496 | -- | |
497 | -- If the data can contain sub-sequences, such as strings, you need to let the | |
498 | -- function know about this, otherwise it assumes every value in ##data_set## is | |
499 | -- a number. If that is not the case then the function will crash. So it is | |
500 | -- important that if it can possibly contain sub-sequences that you tell this | |
501 | -- function what to do with them. Your choices are to ignore them or assume they | |
502 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
503 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
504 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
505 | -- **Note** It is faster if the data only contains numbers. | |
506 | -- | |
507 | -- Returns: | |
508 | -- | |
509 | -- An **integer**, the number of atoms in the set. When ##data_set## is an atom, 1 is returned. | |
510 | -- | |
511 | -- Example 1: | |
512 | -- | |
513 | -- ? count( {7,2,8.5,6,6,-4.8,6,6,3.341,-8,"text"}, ST_IGNSTR ) -- Ans: 10 | |
514 | -- ? count( {"cat", "dog", "lamb", "cow", "rabbit"}, ST_IGNSTR ) -- Ans: 0 (no atoms) | |
515 | -- ? count( 5 ) -- Ans: 1 | |
516 | -- | |
517 | -- | |
518 | -- See also: | |
519 | -- [[:average]], [[:sum]] | |
520 | ||
521 | 13 | |
522 | 13 | if atom(data_set) then |
523 | 1 | return 1 |
524 | end if | |
525 | ||
526 | 12 | return length(massage(data_set, subseq_opt)) |
527 | ||
528 | end function | |
529 | ||
530 | ||
531 | --** | |
532 | -- Returns the average (mean) of the data points. | |
533 | -- | |
534 | -- Parameters: | |
535 | -- # ##data_set## : A list of 1 or more numbers for which you want the mean. | |
536 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
537 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
538 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
539 | -- | |
540 | -- | |
541 | -- Returns: | |
542 | -- An **object**, | |
543 | -- * ##{}## (the empty sequence) if there are no atoms in the set. | |
544 | -- * an atom (the mean) if there are one or more atoms in the set. | |
545 | -- | |
546 | -- Comments: | |
547 | -- | |
548 | -- ##average##() is the theoretical probable value of a randomly selected item from the set. | |
549 | -- | |
550 | -- The equation for average is: | |
551 | -- | |
552 | -- {{{ | |
553 | -- average(X) ==> SUM( X{1..N} ) / N | |
554 | -- }}} | |
555 | -- | |
556 | -- If the data can contain sub-sequences, such as strings, you need to let the | |
557 | -- function know about this, otherwise it assumes every value in ##data_set## is | |
558 | -- a number. If that is not the case then the function will crash. So it is | |
559 | -- important that if it can possibly contain sub-sequences that you tell this | |
560 | -- function what to do with them. Your choices are to ignore them or assume they | |
561 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
562 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
563 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
564 | -- **Note** It is faster if the data only contains numbers. | |
565 | -- | |
566 | -- Example 1: | |
567 | -- | |
568 | -- ? average( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_IGNSTR ) -- Ans: 5.13333333 | |
569 | -- | |
570 | -- | |
571 | -- See also: | |
572 | -- [[:geomean]], [[:harmean]], [[:movavg]], [[:emovavg]] | |
573 | -- | |
574 | 30 | |
575 | ||
576 | 30 | if atom(data_set) then |
577 | 1 | return data_set |
578 | end if | |
579 | ||
580 | 29 | data_set = massage(data_set, subseq_opt) |
581 | ||
582 | 29 | if length(data_set) = 0 then |
583 | 1 | return {} |
584 | end if | |
585 | 28 | return sum(data_set) / length(data_set) |
586 | end function | |
587 | ||
588 | --** | |
589 | -- Returns the geometric mean of the atoms in a sequence. | |
590 | -- | |
591 | -- Parameters: | |
592 | -- # ##data_set## : the values to take the geometric mean of. | |
593 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
594 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
595 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
596 | -- | |
597 | -- Returns: | |
598 | -- | |
599 | -- An **atom**, the geometric mean of the atoms in ##data_set##. | |
600 | -- If there is no atom to take the mean of, 1 is returned. | |
601 | -- | |
602 | -- Comments: | |
603 | -- | |
604 | -- The geometric mean of ##N## atoms is the N-th root of their product. Signs are ignored. | |
605 | -- | |
606 | -- This is useful to compute average growth rates. | |
607 | -- | |
608 | -- If the data can contain sub-sequences, such as strings, you need to let the | |
609 | -- function know about this, otherwise it assumes every value in ##data_set## is | |
610 | -- a number. If that is not the case then the function will crash. So it is | |
611 | -- important that if it can possibly contain sub-sequences that you tell this | |
612 | -- function what to do with them. Your choices are to ignore them or assume they | |
613 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
614 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
615 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
616 | -- **Note** It is faster if the data only contains numbers. | |
617 | -- | |
618 | -- Example 1: | |
619 | -- | |
620 | -- ? geomean({3, "abc", -2, 6}, ST_IGNSTR) -- prints out power(36,1/3) = 3.30192724889462669 | |
621 | -- ? geomean({1,2,3,4,5,6,7,8,9,10}) -- = 4.528728688 | |
622 | -- | |
623 | -- | |
624 | -- See Also: | |
625 | -- [[:average]] | |
626 | ||
627 | 6 | |
628 | 6 | atom prod_ = 1.0 |
629 | integer count_ | |
630 | ||
631 | 6 | if atom(data_set) then |
632 | 1 | return data_set |
633 | end if | |
634 | ||
635 | 5 | data_set = massage(data_set, subseq_opt) |
636 | ||
637 | 5 | count_ = length(data_set) |
638 | 5 | if count_ = 0 then |
639 | 1 | return 1 |
640 | end if | |
641 | 4 | if count_ = 1 then |
642 | 1 | return data_set[1] |
643 | end if | |
644 | ||
645 | 3 | for i = 1 to length(data_set) do |
646 | 9 | atom x = data_set[i] |
647 | ||
648 | 9 | if x = 0 then |
649 | 1 | return 0 |
650 | else | |
651 | 8 | prod_ *= x |
652 | end if | |
653 | ||
654 | 8 | end for |
655 | ||
656 | 2 | if prod_ < 0 then |
657 | 1 | return power(-prod_, 1/count_) |
658 | else | |
659 | 1 | return power(prod_, 1/count_) |
660 | end if | |
661 | ||
662 | end function | |
663 | ||
664 | --** | |
665 | -- Returns the harmonic mean of the atoms in a sequence. | |
666 | -- | |
667 | -- Parameters: | |
668 | -- # ##data_set## : the values to take the harmonic mean of. | |
669 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
670 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
671 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
672 | -- | |
673 | -- Returns: | |
674 | -- | |
675 | -- An **atom**, the harmonic mean of the atoms in ##data_set##. | |
676 | -- | |
677 | -- Comments: | |
678 | -- The harmonic mean is the inverse of the average of their inverses. | |
679 | -- | |
680 | -- This is useful in engineering to compute equivalent capacities and resistances. | |
681 | -- | |
682 | -- If the data can contain sub-sequences, such as strings, you need to let the | |
683 | -- function know about this, otherwise it assumes every value in ##data_set## is | |
684 | -- a number. If that is not the case then the function will crash. So it is | |
685 | -- important that if it can possibly contain sub-sequences that you tell this | |
686 | -- function what to do with them. Your choices are to ignore them or assume they | |
687 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
688 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
689 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
690 | -- **Note** It is faster if the data only contains numbers. | |
691 | -- | |
692 | -- Example 1: | |
693 | -- | |
694 | -- ? harmean({3, "abc", -2, 6}, ST_IGNSTR) -- = 0. | |
695 | -- ? harmean({2, 3, 4}) -- 3 / (1/2 + 1/3 + 1/4) = 2.769230769 | |
696 | -- | |
697 | -- | |
698 | -- See Also: | |
699 | -- [[:average]] | |
700 | ||
701 | 3 | |
702 | integer count_ | |
703 | ||
704 | 3 | data_set = massage(data_set, subseq_opt) |
705 | ||
706 | 3 | count_ = length(data_set) |
707 | 3 | if count_ = 1 then |
708 | 1 | return data_set[1] |
709 | end if | |
710 | ||
711 | 2 | atom y = 0 |
712 | 2 | atom z = 1 |
713 | 2 | for i = 1 to count_ do |
714 | 3 | atom x = 1 |
715 | 3 | z *= data_set[i] |
716 | 3 | for j = 1 to count_ do |
717 | 9 | if j != i then |
718 | 6 | x *= data_set[j] |
719 | end if | |
720 | 9 | end for |
721 | 3 | y += x |
722 | 3 | end for |
723 | ||
724 | 2 | if y = 0 then |
725 | 1 | return 0 |
726 | end if | |
727 | ||
728 | 1 | return count_ * z / y |
729 | end function | |
730 | ||
731 | --** | |
732 | -- Returns the average (mean) of the data points for overlapping periods. This | |
733 | -- can be either a simple or weighted moving average. | |
734 | -- | |
735 | -- Parameters: | |
736 | -- # ##data_set## : a list of 1 or more numbers for which you want a moving average. | |
737 | -- # ##period_delta## : an object, either | |
738 | -- * an integer representing the size of the period, or | |
739 | -- * a list of weightings to apply to the respective period positions. | |
740 | -- | |
741 | -- Returns: | |
742 | -- A **sequence**, either the requested averages or ##{}## if the Data sequence is empty or | |
743 | -- the supplied period is less than one. | |
744 | -- | |
745 | -- If a list of weights was supplied, the result is a weighted average; otherwise, it is a simple average. | |
746 | -- | |
747 | -- Comments: | |
748 | -- | |
749 | -- A moving average is used to smooth out a set of data points over a period.\\ | |
750 | -- For example, given a period of 5: | |
751 | -- # the first returned element is the average | |
752 | -- of the first five data points [1..5], | |
753 | -- # the second returned element is | |
754 | -- the average of the second five data points [2..6], \\and so on \\until | |
755 | -- the last returned value is the average of the last 5 data points | |
756 | -- [$-4 .. $]. | |
757 | -- | |
758 | -- When ##period_delta## is an atom, it is rounded down to the width of the average. When it is a | |
759 | -- sequence, the width is its length. If there are not enough data points, zeroes are inserted. | |
760 | -- | |
761 | -- Note that only atom elements are included and any sub-sequence elements are ignored. | |
762 | -- | |
763 | -- Example 1: | |
764 | -- | |
765 | -- ? movavg( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8}, 10 ) | |
766 | -- -- Ans: {5.8, 5.4, 5.5, 5.1, 4.7, 4.9} | |
767 | -- ? movavg( {7,2,8,5,6}, 2 ) | |
768 | -- -- Ans: {4.5, 5, 6.5, 5.5} | |
769 | -- ? movavg( {7,2,8,5,6}, {0.5, 1.5} ) | |
770 | -- -- Ans: {3.25, 6.5, 5.75, 5.75} | |
771 | -- | |
772 | -- | |
773 | -- See also: | |
774 | -- [[:average]] | |
775 | -- | |
776 | 10 | |
777 | sequence result_ | |
778 | integer lLow | |
779 | integer lHigh | |
780 | integer j | |
781 | integer n | |
782 | ||
783 | 10 | if atom(data_set) then |
784 | 2 | data_set = {data_set} |
785 | ||
786 | 8 | elsif count(data_set) = 0 then |
787 | 1 | return data_set |
788 | end if | |
789 | ||
790 | 9 | if atom(period_delta) then |
791 | 7 | if floor(period_delta) < 1 then |
792 | 2 | return {} |
793 | end if | |
794 | 5 | period_delta = repeat(1, floor(period_delta)) |
795 | end if | |
796 | ||
797 | 7 | if length(data_set) < length(period_delta) then |
798 | 1 | data_set = repeat(0, length(period_delta) - length(data_set)) & data_set |
799 | end if | |
800 | 7 | lLow = 1 |
801 | 7 | lHigh = length(period_delta) |
802 | 7 | result_ = repeat(0, length(data_set) - length(period_delta) + 1) |
803 | 7 | while lHigh <= length(data_set) do |
804 | 26 | j = 1 |
805 | 26 | n = 0 |
806 | 26 | for i = lLow to lHigh do |
807 | 147 | if atom(data_set[i]) then |
808 | 147 | result_[lLow] += data_set[i] * period_delta[j] |
809 | 147 | n += 1 |
810 | end if | |
811 | 147 | j += 1 |
812 | 147 | end for |
813 | 26 | if n > 0 then |
814 | 26 | result_[lLow] /= n |
815 | else | |
816 | 0 | result_[lLow] = 0 |
817 | end if | |
818 | ||
819 | 26 | lLow += 1 |
820 | 26 | lHigh += 1 |
821 | 26 | end while |
822 | ||
823 | 7 | return result_ |
824 | end function | |
825 | ||
826 | --** | |
827 | -- Returns the exponential moving average of a set of data points. | |
828 | -- | |
829 | -- Parameters: | |
830 | -- # ##data_set## : a list of 1 or more numbers for which you want a moving average. | |
831 | -- # ##smoothing_factor## : an atom, the smoothing factor, typically between 0 and 1. | |
832 | -- | |
833 | -- Returns: | |
834 | -- A **sequence**, made of the requested averages, or ##{}## if ##data_set## is | |
835 | -- empty. | |
836 | -- | |
837 | -- Comments: | |
838 | -- | |
839 | -- A moving average is used to smooth out a set of data points over a period. | |
840 | -- | |
841 | -- The formula used is:\\ | |
842 | -- : ##Y,,i,, = Y,,i-1,, + F * (X,,i,, - Y,,i-1,,)## | |
843 | -- | |
844 | -- Note that only atom elements are included and any sub-sequence elements are ignored. | |
845 | -- | |
846 | -- The smoothing factor controls how data is smoothed. 0 smooths everything to the average of the data set, and 1 means no smoothing at all. | |
847 | -- | |
848 | -- Any value for ##smoothing_factor## outside the 0.0..1.0 range causes ##smoothing_factor## | |
849 | -- to be set to the periodic factor ##(2/(N+1))##. | |
850 | -- | |
851 | -- Example 1: | |
852 | -- | |
853 | -- ? emovavg( {7,2,8,5,6}, 0.75 ) | |
854 | -- -- Ans: {6.65,3.1625,6.790625,5.44765625,5.861914063} | |
855 | -- ? emovavg( {7,2,8,5,6}, 0.25 ) | |
856 | -- -- Ans: {5.95,4.9625,5.721875,5.54140625,5.656054688} | |
857 | -- ? emovavg( {7,2,8,5,6}, -1 ) | |
858 | -- -- Ans: {6.066666667,4.711111111,5.807407407,5.538271605,5.69218107} | |
859 | -- | |
860 | -- | |
861 | -- See also: | |
862 | -- [[:average]] | |
863 | ||
864 | 4 | |
865 | atom lPrev | |
866 | ||
867 | 4 | if atom(data_set) then |
868 | 1 | data_set = {data_set} |
869 | ||
870 | 3 | elsif count(data_set) = 0 then |
871 | 1 | return data_set |
872 | end if | |
873 | ||
874 | 3 | if smoothing_factor < 0 or smoothing_factor > 1 then |
875 | 1 | smoothing_factor = (2 / (count(data_set) + 1)) |
876 | end if | |
877 | ||
878 | 3 | lPrev = average(data_set) |
879 | 3 | for i = 1 to length(data_set) do |
880 | 11 | if atom(data_set[i]) then |
881 | 11 | data_set[i] = (data_set[i] - lPrev) * smoothing_factor + lPrev |
882 | 11 | lPrev = data_set[i] |
883 | end if | |
884 | 11 | end for |
885 | 3 | return data_set |
886 | end function | |
887 | ||
888 | --** | |
889 | -- Returns the mid point of the data points. | |
890 | -- | |
891 | -- Parameters: | |
892 | -- # ##data_set## : a list of 1 or more numbers for which you want the median. | |
893 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
894 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
895 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
896 | -- | |
897 | -- Returns: | |
898 | -- An **object**, either ##{}## if there are no items in the set, or an **atom** (the median) otherwise. | |
899 | -- | |
900 | -- Comments: | |
901 | -- | |
902 | -- ##median##() is the item for which half the items are below it and half | |
903 | -- are above it. | |
904 | -- | |
905 | -- All elements are included; any sequence elements are assumed to have the value zero. | |
906 | -- | |
907 | -- The equation for median is: | |
908 | -- | |
909 | -- {{{ | |
910 | -- median(X) ==> sort(X)[floor((N+1)/2)] | |
911 | -- }}} | |
912 | -- | |
913 | -- If the data can contain sub-sequences, such as strings, you need to let | |
914 | -- the function know about this, otherwise it assumes every value in ##data_set## is | |
915 | -- a number. If that is not the case then the function will crash. So it is | |
916 | -- important that if it can possibly contain sub-sequences that you tell this | |
917 | -- function what to do with them. Your choices are to ignore them or assume they | |
918 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
919 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
920 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
921 | -- **Note** It is faster if the data only contains numbers. | |
922 | -- | |
923 | -- Example 1: | |
924 | -- | |
925 | -- ? median( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,4} ) -- Ans: 5 | |
926 | -- | |
927 | -- | |
928 | -- See also: | |
929 | -- [[:average]], [[:geomean]], [[:harmean]], [[:movavg]], [[:emovavg]] | |
930 | -- | |
931 | ||
932 | 4 | |
933 | ||
934 | 4 | if atom(data_set) then |
935 | 1 | return data_set |
936 | end if | |
937 | ||
938 | 3 | data_set = massage(data_set, subseq_opt) |
939 | ||
940 | 3 | if length(data_set) = 0 then |
941 | 1 | return data_set |
942 | end if | |
943 | ||
944 | 2 | if length(data_set) < 3 then |
945 | 1 | return data_set[1] |
946 | end if | |
947 | 1 | data_set = sort(data_set) |
948 | 1 | return data_set[ floor((length(data_set) + 1) / 2) ] |
949 | ||
950 | end function | |
951 | ||
952 | --** | |
953 | -- Returns the frequency of each unique item in the data set. | |
954 | -- | |
955 | -- Parameters: | |
956 | -- # ##data_set## : a list of 1 or more numbers for which you want the frequencies. | |
957 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
958 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
959 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
960 | -- | |
961 | -- Returns: | |
962 | -- A **sequence**. This will contain zero or more 2-element sub-sequences. The | |
963 | -- first element is the frequency count and the second element is the data item | |
964 | -- that was counted. The returned values are in descending order, meaning that | |
965 | -- the highest frequencies are at the beginning of the returned list. | |
966 | -- | |
967 | -- Comments: | |
968 | -- If the data can contain sub-sequences, such as strings, you need to let | |
969 | -- the function know about this, otherwise it assumes every value in ##data_set## is | |
970 | -- a number. If that is not the case then the function will crash. So it is | |
971 | -- important that if it can possibly contain sub-sequences that you tell this | |
972 | -- function what to do with them. Your choices are to ignore them or assume they | |
973 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
974 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
975 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
976 | -- **Note** It is faster if the data only contains numbers. | |
977 | -- | |
978 | -- Example 1: | |
979 | -- | |
980 | -- ? raw_frequency("the cat is the hatter") | |
981 | -- | |
982 | -- This returns | |
983 | -- {{{ | |
984 | -- { | |
985 | -- {5,116}, | |
986 | -- {4,32}, | |
987 | -- {3,104}, | |
988 | -- {3,101}, | |
989 | -- {2,97}, | |
990 | -- {1,115}, | |
991 | -- {1,114}, | |
992 | -- {1,105}, | |
993 | -- {1,99} | |
994 | -- } | |
995 | -- }}} | |
996 | -- | |
997 | ||
998 | 5 | |
999 | ||
1000 | sequence lCounts | |
1001 | sequence lKeys | |
1002 | 5 | integer lNew = 0 |
1003 | integer lPos | |
1004 | 5 | integer lMax = -1 |
1005 | ||
1006 | 5 | if atom(data_set) then |
1007 | 1 | return {{1,data_set}} |
1008 | end if | |
1009 | ||
1010 | 4 | data_set = massage(data_set, subseq_opt) |
1011 | ||
1012 | 4 | if length(data_set) = 0 then |
1013 | 1 | return {{1,data_set}} |
1014 | end if | |
1015 | 3 | lCounts = repeat({0,0}, length(data_set)) |
1016 | 3 | lKeys = repeat(0, length(data_set)) |
1017 | 3 | for i = 1 to length(data_set) do |
1018 | 53 | lPos = find(data_set[i], lKeys) |
1019 | 53 | if lPos = 0 then |
1020 | 24 | lNew += 1 |
1021 | 24 | lPos = lNew |
1022 | 24 | lCounts[lPos][2] = data_set[i] |
1023 | 24 | lKeys[lPos] = data_set[i] |
1024 | 24 | if lPos > lMax then |
1025 | 24 | lMax = lPos |
1026 | end if | |
1027 | end if | |
1028 | 53 | lCounts[lPos][1] += 1 |
1029 | 53 | end for |
1030 | 3 | return sort(lCounts[1..lMax], DESCENDING) |
1031 | ||
1032 | end function | |
1033 | ||
1034 | --** | |
1035 | -- Returns the most frequent point(s) of the data set. | |
1036 | -- | |
1037 | -- Parameters: | |
1038 | -- # ##data_set## : a list of 1 or more numbers for which you want the mode. | |
1039 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
1040 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
1041 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
1042 | -- | |
1043 | -- Returns: | |
1044 | -- A **sequence**. The list of modal items in the data set. | |
1045 | -- | |
1046 | -- Comments: | |
1047 | -- | |
1048 | -- It is possible for the ##mode##() to return more than one item when more than | |
1049 | -- one item in the set has the same highest frequency count. | |
1050 | -- | |
1051 | -- If the data can contain sub-sequences, such as strings, you need to let | |
1052 | -- the function know about this, otherwise it assumes every value in ##data_set## is | |
1053 | -- a number. If that is not the case then the function will crash. So it is | |
1054 | -- important that if it can possibly contain sub-sequences that you tell this | |
1055 | -- function what to do with them. Your choices are to ignore them or assume they | |
1056 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
1057 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
1058 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
1059 | -- **Note** It is faster if the data only contains numbers. | |
1060 | -- | |
1061 | -- Example 1: | |
1062 | -- | |
1063 | -- ? mode( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,4} ) -- Ans: {6} | |
1064 | -- ? mode( {8,2,8,5,6,6,4,8,6,6,3,3,4,1,8,4} ) -- Ans: {8,6} | |
1065 | -- | |
1066 | -- | |
1067 | -- See also: | |
1068 | -- [[:average]], [[:geomean]], [[:harmean]], [[:movavg]], [[:emovavg]] | |
1069 | -- | |
1070 | ||
1071 | 3 | |
1072 | ||
1073 | sequence lCounts | |
1074 | sequence lRes | |
1075 | ||
1076 | 3 | data_set = massage(data_set, subseq_opt) |
1077 | ||
1078 | 3 | if not length( data_set ) then |
1079 | 1 | return {} |
1080 | end if | |
1081 | ||
1082 | 2 | lCounts = raw_frequency(data_set, subseq_opt) |
1083 | ||
1084 | 2 | lRes = {lCounts[1][2]} |
1085 | 2 | for i = 2 to length(lCounts) do |
1086 | 3 | if lCounts[i][1] < lCounts[1][1] then |
1087 | 2 | exit |
1088 | end if | |
1089 | 1 | lRes = append(lRes, lCounts[i][2]) |
1090 | 1 | end for |
1091 | ||
1092 | 2 | return lRes |
1093 | ||
1094 | end function | |
1095 | ||
1096 | --** | |
1097 | -- Returns the distance between a supplied value and the mean, to some supplied | |
1098 | -- order of magnitude. This is used to get a measure of the //shape// of a | |
1099 | -- data set. | |
1100 | -- | |
1101 | -- Parameters: | |
1102 | -- # ##data_set## : a list of 1 or more numbers whose mean is used. | |
1103 | -- # ##datum##: either a single value or a list of values for which you require | |
1104 | -- the central moments. | |
1105 | -- # ##order_mag##: An integer. This is the order of magnitude required. Usually | |
1106 | -- a number from 1 to 4, but can be anything. | |
1107 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
1108 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
1109 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
1110 | -- | |
1111 | -- Returns: | |
1112 | -- An **object**. The same data type as ##datum##. This is the set of calculated | |
1113 | -- central moments. | |
1114 | -- | |
1115 | -- Comments: | |
1116 | -- | |
1117 | -- For each of the items in ##datum##, its central moment is calculated as ... | |
1118 | -- {{{ | |
1119 | -- CM = power( ITEM - AVG, MAGNITUDE) | |
1120 | -- }}} | |
1121 | -- | |
1122 | -- If the data can contain sub-sequences, such as strings, you need to let | |
1123 | -- the function know about this, otherwise it assumes every value in ##data_set## is | |
1124 | -- a number. If that is not the case then the function will crash. So it is | |
1125 | -- important that if it can possibly contain sub-sequences that you tell this | |
1126 | -- function what to do with them. Your choices are to ignore them or assume they | |
1127 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
1128 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
1129 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
1130 | -- **Note** It is faster if the data only contains numbers. | |
1131 | -- | |
1132 | -- Example 1: | |
1133 | -- | |
1134 | -- ? central_moment("the cat is the hatter", "the",1) --> {23.14285714, 11.14285714, 8.142857143} | |
1135 | -- ? central_moment("the cat is the hatter", 't',2) --> 535.5918367 | |
1136 | -- ? central_moment("the cat is the hatter", 't',3) --> 12395.12536 | |
1137 | -- | |
1138 | -- | |
1139 | -- See also: | |
1140 | -- [[:average]] | |
1141 | -- | |
1142 | 8 | |
1143 | ||
1144 | atom lMean | |
1145 | ||
1146 | 8 | data_set = massage(data_set, subseq_opt) |
1147 | ||
1148 | 8 | if length(data_set) = 0 then |
1149 | 1 | return 0 |
1150 | end if | |
1151 | ||
1152 | 7 | lMean = average(data_set) |
1153 | ||
1154 | 7 | return power( datum - lMean, order_mag) |
1155 | ||
1156 | end function | |
1157 | ||
1158 | --** | |
1159 | -- Returns sum of the central moments of each item in a data set. | |
1160 | -- | |
1161 | -- Parameters: | |
1162 | -- # ##data_set## : a list of 1 or more numbers whose mean is used. | |
1163 | -- # ##order_mag##: An integer. This is the order of magnitude required. Usually | |
1164 | -- a number from 1 to 4, but can be anything. | |
1165 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
1166 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
1167 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
1168 | -- | |
1169 | -- Returns: | |
1170 | -- An **atom**. The total of the central moments calculated for each of the | |
1171 | -- items in ##data_set##. | |
1172 | -- | |
1173 | -- Comments: | |
1174 | -- If the data can contain sub-sequences, such as strings, you need to let | |
1175 | -- the function know about this, otherwise it assumes every value in ##data_set## is | |
1176 | -- a number. If that is not the case then the function will crash. So it is | |
1177 | -- important that if it can possibly contain sub-sequences that you tell this | |
1178 | -- function what to do with them. Your choices are to ignore them or assume they | |
1179 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
1180 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
1181 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
1182 | -- **Note** It is faster if the data only contains numbers. | |
1183 | -- | |
1184 | -- Example 1: | |
1185 | -- | |
1186 | -- ? sum_central_moments("the cat is the hatter", 1) --> -8.526512829e-14 | |
1187 | -- ? sum_central_moments("the cat is the hatter", 2) --> 19220.57143 | |
1188 | -- ? sum_central_moments("the cat is the hatter", 3) --> -811341.551 | |
1189 | -- ? sum_central_moments("the cat is the hatter", 4) --> 56824083.71 | |
1190 | -- | |
1191 | -- | |
1192 | -- See also: | |
1193 | -- [[:central_moment]], [[:average]] | |
1194 | -- | |
1195 | 7 | |
1196 | 7 | return sum( central_moment(data_set, data_set, order_mag, subseq_opt) ) |
1197 | end function | |
1198 | ||
1199 | --** | |
1200 | -- Returns a measure of the asymmetry of a data set. Usually the data_set is a | |
1201 | -- probability distribution but it can be anything. This value is used to assess | |
1202 | -- how suitable the data set is in representing the required analysis. It can | |
1203 | -- help detect if there are too many extreme values in the data set. | |
1204 | -- | |
1205 | -- Parameters: | |
1206 | -- # ##data_set## : a list of 1 or more numbers whose mean is used. | |
1207 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
1208 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
1209 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
1210 | -- | |
1211 | -- Returns: | |
1212 | -- An **atom**. The skewness measure of the data set. | |
1213 | -- | |
1214 | -- Comments: | |
1215 | -- Generally speaking, a negative return indicates that most of the values are | |
1216 | -- lower than the mean, while positive values indicate that most values are | |
1217 | -- greater than the mean. However this might not be the case when there are a few | |
1218 | -- extreme values on one side of the mean. | |
1219 | -- | |
1220 | -- The larger the magnitude of the returned value, the more the data is skewed | |
1221 | -- in that direction. | |
1222 | -- | |
1223 | -- A returned value of zero indicates that the mean and median values are identical | |
1224 | -- and that the data is symmetrical. | |
1225 | -- | |
1226 | -- | |
1227 | -- If the data can contain sub-sequences, such as strings, you need to let | |
1228 | -- the function know about this, otherwise it assumes every value in ##data_set## is | |
1229 | -- a number. If that is not the case then the function will crash. So it is | |
1230 | -- important that if it can possibly contain sub-sequences that you tell this | |
1231 | -- function what to do with them. Your choices are to ignore them or assume they | |
1232 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
1233 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
1234 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
1235 | -- **Note** It is faster if the data only contains numbers. | |
1236 | -- | |
1237 | -- Example 1: | |
1238 | -- | |
1239 | -- ? skewness("the cat is the hatter") --> -1.296820819 | |
1240 | -- ? skewness("thecatisthehatter") --> 0.1029393238 | |
1241 | -- | |
1242 | -- | |
1243 | -- See also: | |
1244 | -- [[:kurtosis]] | |
1245 | -- | |
1246 | 4 | |
1247 | ||
1248 | 4 | if atom(data_set) then |
1249 | 1 | return data_set |
1250 | end if | |
1251 | ||
1252 | 3 | data_set = massage(data_set, subseq_opt) |
1253 | ||
1254 | 3 | if length(data_set) = 0 then |
1255 | 1 | return data_set |
1256 | end if | |
1257 | 2 | return sum_central_moments(data_set, 3) / ((length(data_set) - 1) * power(stdev(data_set), 3)) |
1258 | ||
1259 | end function | |
1260 | ||
1261 | --** | |
1262 | -- Returns a measure of the spread of values in a dataset when compared to a | |
1263 | -- //normal// probability curve. | |
1264 | -- | |
1265 | -- Parameters: | |
1266 | -- # ##data_set## : a list of 1 or more numbers whose kurtosis is required. | |
1267 | -- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it | |
1268 | -- means that ##data_set## is assumed to contain no sub-sequences otherwise this | |
1269 | -- gives instructions about how to treat sub-sequences. See comments for details. | |
1270 | -- | |
1271 | -- Returns: | |
1272 | -- An **object**. If this is an atom it is the kurtosis measure of the data set. | |
1273 | -- Otherwise it is a sequence containing an error integer. The return value {0} | |
1274 | -- indicates that an empty dataset was passed, {1} indicates that the standard | |
1275 | -- deviation is zero (all values are the same). | |
1276 | -- | |
1277 | -- Comments: | |
1278 | -- Generally speaking, a negative return indicates that most of the values are | |
1279 | -- further from the mean, while positive values indicate that most values are | |
1280 | -- nearer to the mean. | |
1281 | -- | |
1282 | -- The larger the magnitude of the returned value, the more the data is 'peaked' | |
1283 | -- or 'flatter' in that direction. | |
1284 | -- | |
1285 | -- If the data can contain sub-sequences, such as strings, you need to let | |
1286 | -- the function know about this, otherwise it assumes every value in ##data_set## is | |
1287 | -- a number. If that is not the case then the function will crash. So it is | |
1288 | -- important that if it can possibly contain sub-sequences that you tell this | |
1289 | -- function what to do with them. Your choices are to ignore them or assume they | |
1290 | -- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter | |
1291 | -- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only | |
1292 | -- contains numbers use the default ##subseq_opt## value, ST_ALLNUM. | |
1293 | -- **Note** It is faster if the data only contains numbers. | |
1294 | -- | |
1295 | -- Example 1: | |
1296 | -- | |
1297 | -- ? kurtosis("thecatisthehatter") --> -1.737889192 | |
1298 | -- | |
1299 | -- | |
1300 | -- See also: | |
1301 | -- [[:skewness]] | |
1302 | -- | |
1303 | 4 | |
1304 | atom sd | |
1305 | ||
1306 | 4 | if atom(data_set) then |
1307 | 1 | return data_set |
1308 | end if | |
1309 | 3 | data_set = massage(data_set, subseq_opt) |
1310 | 3 | if length(data_set) = 0 then |
1311 | 1 | return {0} |
1312 | end if | |
1313 | 2 | sd = stdev(data_set) |
1314 | 2 | if sd = 0 then |
1315 | 1 | return {1} |
1316 | end if | |
1317 | ||
1318 | 1 | return (sum_central_moments(data_set, 4) / ((length(data_set) - 1) * power(stdev(data_set), 4))) - 3 |
1319 | ||
1320 | end function |