COVERAGE SUMMARY
FILE SUMMARY
Name | Executed | Routines | % | Executed | Lines | % | Unexecuted
/home/matt/eu/rds/include/std/stats.e | 21 | 21 | 100.00% | 277 | 278 | 99.64% | 1
ROUTINE SUMMARY
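Routine | Executed | Lines | % | Unexecuted
movavg() | 29 | 30 | 96.67% | 1
avedev() | 21 | 21 | 100.00% | 0
average() | 7 | 7 | 100.00% | 0
central_moment() | 6 | 6 | 100.00% | 0
count() | 4 | 4 | 100.00% | 0
emovavg() | 14 | 14 | 100.00% | 0
geomean() | 19 | 19 | 100.00% | 0
harmean() | 19 | 19 | 100.00% | 0
kurtosis() | 10 | 10 | 100.00% | 0
largest() | 16 | 16 | 100.00% | 0
massage() | 5 | 5 | 100.00% | 0
median() | 10 | 10 | 100.00% | 0
mode() | 12 | 12 | 100.00% | 0
range() | 20 | 20 | 100.00% | 0
raw_frequency() | 22 | 22 | 100.00% | 0
skewness() | 7 | 7 | 100.00% | 0
small() | 5 | 5 | 100.00% | 0
smallest() | 16 | 16 | 100.00% | 0
stdev() | 19 | 19 | 100.00% | 0
sum() | 9 | 9 | 100.00% | 0
sum_central_moments() | 2 | 2 | 100.00% | 0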
LINE COVERAGE DETAIL
#Executed
1
-- (c) Copyright - See License.txt
2
--
3
namespace stats
4
5
--****
6
-- == Statistics
7
-- **Page Contents**
8
--
9
-- <>
10
--
11
-- === Routines
12
13
include std/math.e
14
include std/sort.e
15
include std/sequence.e
16
17
18
--**
19
-- Determines the k-th smallest value from the supplied set of numbers.
20
--
21
-- Parameters:
22
-- # ##data_set## : The list of values from which the smallest value is chosen.
23
-- # ##ordinal_idx## : The relative index of the desired smallest value.
24
--
25
-- Returns:
26
-- A **sequence**, {The k-th smallest value, its index in the set}
27
--
28
-- Comments:
29
-- ##small##() is used to return a value based on its size relative to
30
-- all the other elements in the sequence. When ##ordinal_idx## is 1, the smallest value is returned. Use ##ordinal_idx = length(data_set)## to return the highest.
31
--
32
-- If ##ordinal_idx## is less than one, or greater than the length of ##data_set##,
33
-- an empty sequence is returned.
34
--
35
-- The set of values does not have to be in any particular order. The values may be any Euphoria object.
36
--
37
-- Example 1:
38
--
39
-- ? small( {4,5,6,8,5,4,3,"text"}, 3 ) -- Ans: {4,1} (The 3rd smallest value)
40
-- ? small( {4,5,6,8,5,4,3,"text"}, 1 ) -- Ans: {3,7} (The 1st smallest value)
41
-- ? small( {4,5,6,8,5,4,3,"text"}, 7 ) -- Ans: {8,4} (The 7th smallest value)
42
-- ? small( {"def", "qwe", "abc", "try"}, 2 ) -- Ans: {"def", 1} (The 2nd smallest value)
43
-- ? small( {1,2,3,4}, -1) -- Ans: {} -- no-value
44
-- ? small( {1,2,3,4}, 10) -- Ans: {} -- no-value
45
--
46
--
47
484
49
sequence lSortedData
50
514
if ordinal_idx < 1 or ordinal_idx > length(data_set) then
523
return {}
53
end if
54
551
lSortedData = sort(data_set)
56
571
return {lSortedData[ordinal_idx], find(lSortedData[ordinal_idx], data_set)}
58
end function
59
60
--**
61
-- Returns the largest of the data points that are atoms.
62
--
63
-- Parameters:
64
-- # ##data_set## : a list of 1 or more numbers among which you want the largest.
65
--
66
-- Returns:
67
-- An **object**, either of:
68
-- * an atom (the largest value) if there is at least one atom item in the set\\
69
-- * ##{} ##if there //is// no largest value.
70
--
71
-- Comments:
72
-- Any ##data_set## element which is not an atom is ignored.
73
--
74
-- Example 1:
75
--
76
-- ? largest( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"} ) -- Ans: 8
77
-- ? largest( {"just","text"} ) -- Ans: {}
78
--
79
--
80
-- See also:
81
-- [[:range]]
82
--
833
84
atom result_, temp_
85
integer lFoundAny
863
if atom(data_set) then
871
return data_set
88
end if
892
lFoundAny = 0
902
for i = 1 to length(data_set) do
9118
if atom(data_set[i]) then
9215
temp_ = data_set[i]
9315
if lFoundAny then
9414
if temp_ > result_ then
951
result_ = temp_
96
end if
97
else
981
result_ = temp_
991
lFoundAny = 1
100
end if
101
end if
10218
end for
1032
if lFoundAny = 0 then
1041
return {}
105
end if
1061
return result_
107
end function
108
109
--**
110
-- Returns the smallest of the data points.
111
--
112
-- Parameters:
113
-- # ##data_set## : A list of 1 or more numbers for which you want the smallest.
114
-- **Note:** only atom elements are included and any sub-sequences
115
-- elements are ignored.
116
--
117
-- Returns:
118
-- An **object**, either of:
119
-- * an atom (the smallest value) if there is at least one atom item in the set\\
120
-- * ##{}## if there //is// no smallest value.
121
--
122
-- Comments:
123
-- Any ##data_set## element which is not an atom is ignored.
124
--
125
-- Example 1:
126
--
127
-- ? smallest( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"} ) -- Ans: 1
128
-- ? smallest( {"just","text"} ) -- Ans: {}
129
--
130
--
131
-- See also:
132
-- [[:range]]
1333
134
atom result_, temp_
135
integer lFoundAny
1363
if atom(data_set) then
1371
return data_set
138
end if
1392
lFoundAny = 0
1402
for i = 1 to length(data_set) do
14118
if atom(data_set[i]) then
14215
temp_ = data_set[i]
14315
if lFoundAny then
14414
if temp_ < result_ then
1452
result_ = temp_
146
end if
147
else
1481
result_ = temp_
1491
lFoundAny = 1
150
end if
151
end if
15218
end for
1532
if lFoundAny = 0 then
1541
return {}
155
end if
1561
return result_
157
end function
158
159
--**
160
-- Determines a number of //range// statistics for the data set.
161
--
162
-- Parameters:
163
-- # ##data_set## : a list of 1 or more numbers for which you want the range data.
164
--
165
-- Returns:
166
-- A **sequence**, empty if no atoms were found, else like {Lowest, Highest, Range, Mid-range}
167
--
168
-- Comments:
169
-- Any sequence element in ##data_set## is ignored.
170
--
171
-- Example 1:
172
--
173
-- ? range( {7,2,8,5,6,6,4,8,6,16,3,3,4,1,8,"text"} ) -- Ans: {1, 16, 15, 8.5}
174
--
175
--
176
-- See also:
177
-- [[:smallest]] [[:largest]]
178
--
1793
180
sequence result_
181
atom temp_
1823
integer lFoundAny = 0
183
1843
if atom(data_set) then
1851
data_set = {data_set}
186
end if
187
1883
for i = 1 to length(data_set) do
18917
if atom(data_set[i]) then
19016
temp_ = data_set[i]
19116
if lFoundAny then
19214
if temp_ < result_[1] then
1932
result_[1] = temp_
19412
elsif temp_ > result_[2] then
1952
result_[2] = temp_
196
end if
197
else
1982
result_ = {temp_, temp_, 0, 0}
1992
lFoundAny = 1
200
end if
201
end if
20217
end for
2033
if lFoundAny = 0 then
2041
return {}
205
end if
2062
result_[3] = result_[2] - result_[1]
2072
result_[4] = (result_[1] + result_[2]) / 2
2082
return result_
209
end function
210
211
--****
212
-- Enums used to influence the results of some of these functions.
213
214
public enum
215
--**
216
-- The supplied data is the entire population.
21721
ST_FULLPOP,
218
219
--**
220
-- The supplied data is only a random sample of the population.
22121
ST_SAMPLE
222
223
public enum
224
--**
225
-- The supplied data consists of only atoms.
22621
ST_ALLNUM,
227
228
--**
229
-- Any sub-sequences (eg. strings) in the supplied data are ignored.
23021
ST_IGNSTR,
231
232
--**
233
-- Any sub-sequences (eg. strings) in the supplied data are assumed to
234
-- have the value zero.
23521
ST_ZEROSTR,
236
237
$
238
239151
240151
switch subseq_opt do
241
case ST_IGNSTR then
24216
return remove_subseq(data_set, SEQ_NOALT)
243
244
case ST_ZEROSTR then
2453
return remove_subseq(data_set, 0)
246
247
case else
248132
return data_set
249
end switch
250
end function
251
252
--**
253
-- Returns the standard deviation of the data set, based on the population type.
254
--
255
-- Parameters:
256
-- # ##data_set## : a list of 1 or more numbers for which you want the estimated standard deviation.
257
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
258
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
259
-- gives instructions about how to treat sub-sequences. See comments for details.
260
-- # ##population_type## : an integer. ST_SAMPLE (the default) assumes that ##data_set## is a random
261
-- sample of the total population. ST_FULLPOP means that ##data_set## is the
262
-- entire population.
263
--
264
-- Returns:
265
-- An **atom**, the estimated standard deviation.
266
-- An empty **sequence** means that there is no meaningful data to calculate from.
267
--
268
-- Comments:
269
-- ##stdev##() is a measure of how values are different from the average.
270
--
271
-- The numbers in ##data_set## can either be the entire population of values or
272
-- just a random subset. You indicate which in the ##population_type## parameter. By default
273
-- ##data_set## represents a sample and not the entire population. When using this
274
-- function with sample data, the result is an //estimated// standard deviation.
275
--
276
-- If the data can contain sub-sequences, such as strings, you need to let the
277
-- function know about this otherwise it assumes every value in ##data_set## is
278
-- a number. If that is not the case then the function will crash. So it is
279
-- important that if it can possibly contain sub-sequences that you tell this
280
-- function what to do with them. Your choices are to ignore them or assume they
281
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
282
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
283
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
284
-- **Note** It is faster if the data only contains numbers.
285
--
286
-- The equation for standard deviation is (for ST_SAMPLE the divisor is N - 1 instead of N):
287
-- {{{
288
-- stdev(X) ==> SQRT(SUM(SQ(X{1..N} - MEAN)) / (N))
289
-- }}}
290
--
291
-- Example 1:
292
--
293
-- ? stdev( {4,5,6,7,5,4,3,7} ) -- Ans: 1.457737974
294
-- ? stdev( {4,5,6,7,5,4,3,7} ,, ST_FULLPOP) -- Ans: 1.363589014
295
-- ? stdev( {4,5,6,7,5,4,3,"text"} , ST_IGNSTR) -- Ans: 1.345185418
296
-- ? stdev( {4,5,6,7,5,4,3,"text"}, ST_IGNSTR, ST_FULLPOP ) -- Ans: 1.245399698
297
-- ? stdev( {4,5,6,7,5,4,3,"text"} , ST_ZEROSTR) -- Ans: 2.121320344
298
-- ? stdev( {4,5,6,7,5,4,3,"text"}, ST_ZEROSTR, ST_FULLPOP ) -- Ans: 1.984313483
299
--
300
--
301
-- See also:
302
-- [[:average]], [[:avedev]]
303
--
304
30539
306
atom lSum
307
atom lMean
308
integer lCnt
309
31039
data_set = massage(data_set, subseq_opt)
311
31239
lCnt = length(data_set)
313
31439
if lCnt = 0 then
3156
return {}
316
end if
31733
if lCnt = 1 then
3186
return 0
319
end if
320
32127
lSum = 0
32227
for i = 1 to length(data_set) do
32323535
lSum += data_set[i]
32423535
end for
325
32627
lMean = lSum / lCnt
32727
lSum = 0
32827
for i = 1 to length(data_set) do
32923535
lSum += power(data_set[i] - lMean, 2)
33023535
end for
331
33227
if population_type = ST_SAMPLE then
33325
lCnt -= 1
334
end if
335
33627
return power(lSum / lCnt, 0.5)
337
end function
338
339
--**
340
-- Returns the average of the absolute deviations of data points from their mean.
341
--
342
-- Parameters:
343
-- # ##data_set## : a list of 1 or more numbers for which you want the mean of the absolute deviations.
344
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
345
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
346
-- gives instructions about how to treat sub-sequences. See comments for details.
347
-- # ##population_type## : an integer. ST_SAMPLE (the default) assumes that ##data_set## is a random
348
-- sample of the total population. ST_FULLPOP means that ##data_set## is the
349
-- entire population.
350
--
351
-- Returns:
352
-- An **atom** , the deviation from the mean.\\
353
-- An empty **sequence**, means that there is no meaningful data to calculate from.
354
--
355
-- Comments:
356
-- ##avedev##() is a measure of the variability in a data set. Its statistical
357
-- properties are less well behaved than those of the standard deviation, which is
358
-- why it is used less.
359
--
360
-- The numbers in ##data_set## can either be the entire population of values or
361
-- just a random subset. You indicate which in the ##population_type## parameter. By default
362
-- ##data_set## represents a sample and not the entire population. When using this
363
-- function with sample data, the result is an //estimated// deviation.
364
--
365
-- If the data can contain sub-sequences, such as strings, you need to let the
366
-- function know about this otherwise it assumes every value in ##data_set## is
367
-- a number. If that is not the case then the function will crash. So it is
368
-- important that if it can possibly contain sub-sequences that you tell this
369
-- function what to do with them. Your choices are to ignore them or assume they
370
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
371
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
372
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
373
-- **Note** It is faster if the data only contains numbers.
374
--
375
-- The equation for absolute average deviation is (for ST_SAMPLE the divisor is N - 1 instead of N)~:
376
-- {{{
377
-- avedev(X) ==> SUM( ABS(X{1..N} - MEAN(X)) ) / N
378
-- }}}
379
--
380
-- Example 1:
381
--
382
-- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,7} ) -- Ans: 1.966666667
383
-- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,7},, ST_FULLPOP ) -- Ans: 1.84375
384
-- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_IGNSTR ) -- Ans: 1.99047619
385
-- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_IGNSTR,ST_FULLPOP ) -- Ans: 1.857777778
386
-- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_ZEROSTR ) -- Ans: 2.225
387
-- ? avedev( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_ZEROSTR, ST_FULLPOP ) -- Ans: 2.0859375
388
--
389
--
390
-- See also:
391
-- [[:average]], [[:stdev]]
392
--
393
3944
395
atom lSum
396
atom lMean
397
integer lCnt
398
3994
data_set = massage(data_set, subseq_opt)
400
4014
lCnt = length(data_set)
402
4034
if lCnt = 0 then
4042
return {}
405
end if
4062
if lCnt = 1 then
4071
return 0
408
end if
4091
lSum = 0
410
4111
for i = 1 to length(data_set) do
41215
lSum += data_set[i]
41315
end for
414
4151
lMean = lSum / lCnt
4161
lSum = 0
4171
for i = 1 to length(data_set) do
41815
if data_set[i] > lMean then
4198
lSum += data_set[i] - lMean
420
else
4217
lSum += lMean - data_set[i]
422
end if
42315
end for
424
4251
if population_type = ST_SAMPLE then
4261
lCnt -= 1
427
end if
4281
return lSum / lCnt
429
end function
430
431
--**
432
-- Returns the sum of all the atoms in an object.
433
--
434
-- Parameters:
435
-- # ##data_set## : Either an atom or a list of numbers to sum.
436
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
437
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
438
-- gives instructions about how to treat sub-sequences. See comments for details.
439
--
440
-- Returns:
441
-- An **atom**, the sum of the set.
442
--
443
-- Comments:
444
-- ##sum##() is used as a measure of the magnitude of a sequence of positive values.
445
--
446
-- If the data can contain sub-sequences, such as strings, you need to let the
447
-- function know about this otherwise it assumes every value in ##data_set## is
448
-- a number. If that is not the case then the function will crash. So it is
449
-- important that if it can possibly contain sub-sequences that you tell this
450
-- function what to do with them. Your choices are to ignore them or assume they
451
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
452
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
453
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
454
-- **Note** It is faster if the data only contains numbers.
455
--
456
-- The equation is~:
457
--
458
-- {{{
459
-- sum(X) ==> SUM( X{1..N} )
460
-- }}}
461
--
462
-- Example 1:
463
--
464
-- ? sum( {7,2,8.5,6,6,-4.8,6,6,3.341,-8,"text"}, ST_ZEROSTR ) -- Ans: 32.041
465
--
466
--
467
-- See also:
468
-- [[:average]]
469
47036
471
atom result_
47236
if atom(data_set) then
4731
return data_set
474
end if
475
47635
data_set = massage(data_set, subseq_opt)
47735
result_ = 0
47835
for i = 1 to length(data_set) do
47923720
result_ += data_set[i]
48023720
end for
481
48235
return result_
483
end function
484
485
--**
486
-- Returns the count of all the atoms in an object.
487
--
488
-- Parameters:
489
-- # ##data_set## : either an atom or a list.
490
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
491
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
492
-- gives instructions about how to treat sub-sequences. See comments for details.
493
--
494
-- Comments:
495
-- This returns the number of numbers in ##data_set##
496
--
497
-- If the data can contain sub-sequences, such as strings, you need to let the
498
-- function know about this otherwise it assumes every value in ##data_set## is
499
-- a number. If that is not the case then the function will crash. So it is
500
-- important that if it can possibly contain sub-sequences that you tell this
501
-- function what to do with them. Your choices are to ignore them or assume they
502
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
503
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
504
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
505
-- **Note** It is faster if the data only contains numbers.
506
--
507
-- Returns:
508
--
509
-- An **integer**, the number of atoms in the set. When ##data_set## is an atom, 1 is returned.
510
--
511
-- Example 1:
512
--
513
-- ? count( {7,2,8.5,6,6,-4.8,6,6,3.341,-8,"text"}, ST_IGNSTR ) -- Ans: 10
514
-- ? count( {"cat", "dog", "lamb", "cow", "rabbit"}, ST_IGNSTR ) -- Ans: 0 (no atoms)
515
-- ? count( 5 ) -- Ans: 1
516
--
517
--
518
-- See also:
519
-- [[:average]], [[:sum]]
520
52113
52213
if atom(data_set) then
5231
return 1
524
end if
525
52612
return length(massage(data_set, subseq_opt))
527
528
end function
529
530
531
--**
532
-- Returns the average (mean) of the data points.
533
--
534
-- Parameters:
535
-- # ##data_set## : A list of 1 or more numbers for which you want the mean.
536
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
537
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
538
-- gives instructions about how to treat sub-sequences. See comments for details.
539
--
540
--
541
-- Returns:
542
-- An **object**,
543
-- * ##{}## (the empty sequence) if there are no atoms in the set.
544
-- * an atom (the mean) if there are one or more atoms in the set.
545
--
546
-- Comments:
547
--
548
-- ##average##() is the theoretical probable value of a randomly selected item from the set.
549
--
550
-- The equation for average is:
551
--
552
-- {{{
553
-- average(X) ==> SUM( X{1..N} ) / N
554
-- }}}
555
--
556
-- If the data can contain sub-sequences, such as strings, you need to let the
557
-- function know about this otherwise it assumes every value in ##data_set## is
558
-- a number. If that is not the case then the function will crash. So it is
559
-- important that if it can possibly contain sub-sequences that you tell this
560
-- function what to do with them. Your choices are to ignore them or assume they
561
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
562
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
563
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
564
-- **Note** It is faster if the data only contains numbers.
565
--
566
-- Example 1:
567
--
568
-- ? average( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,"text"}, ST_IGNSTR ) -- Ans: 5.13333333
569
--
570
--
571
-- See also:
572
-- [[:geomean]], [[:harmean]], [[:movavg]], [[:emovavg]]
573
--
57430
575
57630
if atom(data_set) then
5771
return data_set
578
end if
579
58029
data_set = massage(data_set, subseq_opt)
581
58229
if length(data_set) = 0 then
5831
return {}
584
end if
58528
return sum(data_set) / length(data_set)
586
end function
587
588
--**
589
-- Returns the geometric mean of the atoms in a sequence.
590
--
591
-- Parameters:
592
-- # ##data_set## : the values to take the geometric mean of.
593
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
594
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
595
-- gives instructions about how to treat sub-sequences. See comments for details.
596
--
597
-- Returns:
598
--
599
-- An **atom**, the geometric mean of the atoms in ##data_set##.
600
-- If there is no atom to take the mean of, 1 is returned.
601
--
602
-- Comments:
603
--
604
-- The geometric mean of ##N## atoms is the N-th root of their product. Signs are ignored.
605
--
606
-- This is useful to compute average growth rates.
607
--
608
-- If the data can contain sub-sequences, such as strings, you need to let the
609
-- function know about this otherwise it assumes every value in ##data_set## is
610
-- a number. If that is not the case then the function will crash. So it is
611
-- important that if it can possibly contain sub-sequences that you tell this
612
-- function what to do with them. Your choices are to ignore them or assume they
613
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
614
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
615
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
616
-- **Note** It is faster if the data only contains numbers.
617
--
618
-- Example 1:
619
--
620
-- ? geomean({3, "abc", -2, 6}, ST_IGNSTR) -- prints out power(36,1/3) = 3.30192724889462669
621
-- ? geomean({1,2,3,4,5,6,7,8,9,10}) -- = 4.528728688
622
--
623
--
624
-- See Also:
625
-- [[:average]]
626
6276
6286
atom prod_ = 1.0
629
integer count_
630
6316
if atom(data_set) then
6321
return data_set
633
end if
634
6355
data_set = massage(data_set, subseq_opt)
636
6375
count_ = length(data_set)
6385
if count_ = 0 then
6391
return 1
640
end if
6414
if count_ = 1 then
6421
return data_set[1]
643
end if
644
6453
for i = 1 to length(data_set) do
6469
atom x = data_set[i]
647
6489
if x = 0 then
6491
return 0
650
else
6518
prod_ *= x
652
end if
653
6548
end for
655
6562
if prod_ < 0 then
6571
return power(-prod_, 1/count_)
658
else
6591
return power(prod_, 1/count_)
660
end if
661
662
end function
663
664
--**
665
-- Returns the harmonic mean of the atoms in a sequence.
666
--
667
-- Parameters:
668
-- # ##data_set## : the values to take the harmonic mean of.
669
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
670
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
671
-- gives instructions about how to treat sub-sequences. See comments for details.
672
--
673
-- Returns:
674
--
675
-- An **atom**, the harmonic mean of the atoms in ##data_set##.
676
--
677
-- Comments:
678
-- The harmonic mean is the inverse of the average of their inverses.
679
--
680
-- This is useful in engineering to compute equivalent capacities and resistances.
681
--
682
-- If the data can contain sub-sequences, such as strings, you need to let the
683
-- function know about this otherwise it assumes every value in ##data_set## is
684
-- a number. If that is not the case then the function will crash. So it is
685
-- important that if it can possibly contain sub-sequences that you tell this
686
-- function what to do with them. Your choices are to ignore them or assume they
687
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
688
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
689
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
690
-- **Note** It is faster if the data only contains numbers.
691
--
692
-- Example 1:
693
--
694
-- ? harmean({3, "abc", -2, 6}, ST_IGNSTR) -- = 0.
695
-- ? harmean({2, 3, 4}) -- 3 / (1/2 + 1/3 + 1/4) = 2.769230769
696
--
697
--
698
-- See Also:
699
-- [[:average]]
700
7013
702
integer count_
703
7043
data_set = massage(data_set, subseq_opt)
705
7063
count_ = length(data_set)
7073
if count_ = 1 then
7081
return data_set[1]
709
end if
710
7112
atom y = 0
7122
atom z = 1
7132
for i = 1 to count_ do
7143
atom x = 1
7153
z *= data_set[i]
7163
for j = 1 to count_ do
7179
if j != i then
7186
x *= data_set[j]
719
end if
7209
end for
7213
y += x
7223
end for
723
7242
if y = 0 then
7251
return 0
726
end if
727
7281
return count_ * z / y
729
end function
730
731
--**
732
-- Returns the average (mean) of the data points for overlapping periods. This
733
-- can be either a simple or weighted moving average.
734
--
735
-- Parameters:
736
-- # ##data_set## : a list of 1 or more numbers for which you want a moving average.
737
-- # ##period_delta## : an object, either
738
-- * an integer representing the size of the period, or
739
-- * a list of weightings to apply to the respective period positions.
740
--
741
-- Returns:
742
-- A **sequence**, either the requested averages or ##{}## if the Data sequence is empty or
743
-- the supplied period is less than one.
744
--
745
-- If a list of weights was supplied, the result is a weighted average; otherwise, it is a simple average.
746
--
747
-- Comments:
748
--
749
-- A moving average is used to smooth out a set of data points over a period.\\
750
-- For example, given a period of 5:
751
-- # the first returned element is the average
752
-- of the first five data points [1..5],
753
-- # the second returned element is
754
-- the average of the second five data points [2..6], \\and so on \\until
755
-- the last returned value is the average of the last 5 data points
756
-- [$-4 .. $].
757
--
758
-- When ##period_delta## is an atom, it is rounded down to the width of the average. When it is a
759
-- sequence, the width is its length. If there are not enough data points, zeroes are inserted.
760
--
761
-- Note that only atom elements are included and any sub-sequence elements are ignored.
762
--
763
-- Example 1:
764
--
765
-- ? movavg( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8}, 10 )
766
-- -- Ans: {5.8, 5.4, 5.5, 5.1, 4.7, 4.9}
767
-- ? movavg( {7,2,8,5,6}, 2 )
768
-- -- Ans: {4.5, 5, 6.5, 5.5}
769
-- ? movavg( {7,2,8,5,6}, {0.5, 1.5} )
770
-- -- Ans: {3.25, 6.5, 5.75, 5.75}
771
--
772
--
773
-- See also:
774
-- [[:average]]
775
--
77610
777
sequence result_
778
integer lLow
779
integer lHigh
780
integer j
781
integer n
782
78310
if atom(data_set) then
7842
data_set = {data_set}
785
7868
elsif count(data_set) = 0 then
7871
return data_set
788
end if
789
7909
if atom(period_delta) then
7917
if floor(period_delta) < 1 then
7922
return {}
793
end if
7945
period_delta = repeat(1, floor(period_delta))
795
end if
796
7977
if length(data_set) < length(period_delta) then
7981
data_set = repeat(0, length(period_delta) - length(data_set)) & data_set
799
end if
8007
lLow = 1
8017
lHigh = length(period_delta)
8027
result_ = repeat(0, length(data_set) - length(period_delta) + 1)
8037
while lHigh <= length(data_set) do
80426
j = 1
80526
n = 0
80626
for i = lLow to lHigh do
807147
if atom(data_set[i]) then
808147
result_[lLow] += data_set[i] * period_delta[j]
809147
n += 1
810
end if
811147
j += 1
812147
end for
81326
if n > 0 then
81426
result_[lLow] /= n
815
else
8160
result_[lLow] = 0
817
end if
818
81926
lLow += 1
82026
lHigh += 1
82126
end while
822
8237
return result_
824
end function
825
826
--**
827
-- Returns the exponential moving average of a set of data points.
828
--
829
-- Parameters:
830
-- # ##data_set## : a list of 1 or more numbers for which you want a moving average.
831
-- # ##smoothing_factor## : an atom, the smoothing factor, typically between 0 and 1.
832
--
833
-- Returns:
834
-- A **sequence**, made of the requested averages, or ##{}## if ##data_set##
835
-- is empty.
836
--
837
-- Comments:
838
--
839
-- A moving average is used to smooth out a set of data points over a period.
840
--
841
-- The formula used is:\\
842
-- : ##Y,,i,, = Y,,i-1,, + F * (X,,i,, - Y,,i-1,,)##
843
--
844
-- Note that only atom elements are included and any sub-sequences elements are ignored.
845
--
846
-- The smoothing factor controls how data is smoothed. 0 smooths everything to the average of the data set, and 1 means no smoothing at all.
847
--
848
-- Any value for ##smoothing_factor## outside the 0.0..1.0 range causes ##smoothing_factor##
849
-- to be set to the periodic factor ##(2/(N+1))##.
850
--
851
-- Example 1:
852
--
853
-- ? emovavg( {7,2,8,5,6}, 0.75 )
854
-- -- Ans: {5.25,2.8125,6.703125,5.42578125,5.856445313}
855
-- ? emovavg( {7,2,8,5,6}, 0.25 )
856
-- -- Ans: {1.75,1.8125,3.359375,3.76953125,4.327148438}
857
-- ? emovavg( {7,2,8,5,6}, -1 )
858
-- -- Ans: {2.333333333,2.222222222,4.148148148,4.432098765,4.95473251}
859
--
860
--
861
-- See also:
862
-- [[:average]]
863
8644
865
atom lPrev
866
8674
if atom(data_set) then
8681
data_set = {data_set}
869
8703
elsif count(data_set) = 0 then
8711
return data_set
872
end if
873
8743
if smoothing_factor < 0 or smoothing_factor > 1 then
8751
smoothing_factor = (2 / (count(data_set) + 1))
876
end if
877
8783
lPrev = average(data_set)
8793
for i = 1 to length(data_set) do
88011
if atom(data_set[i]) then
88111
data_set[i] = (data_set[i] - lPrev) * smoothing_factor + lPrev
88211
lPrev = data_set[i]
883
end if
88411
end for
8853
return data_set
886
end function
887
888
--**
889
-- Returns the mid point of the data points.
890
--
891
-- Parameters:
892
-- # ##data_set## : a list of 1 or more numbers for which you want the median.
893
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
894
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
895
-- gives instructions about how to treat sub-sequences. See comments for details.
896
--
897
-- Returns:
898
-- An **object**, either ##{}## if there are no items in the set, or an **atom** (the median) otherwise.
899
--
900
-- Comments:
901
--
902
-- ##median##() is the item for which half the items are below it and half
903
-- are above it.
904
--
905
-- All elements are included; any sequence elements are assumed to have the value zero.
906
--
907
-- The equation for median is:
908
--
909
-- {{{
910
-- median(X) ==> sort(X)[N/2]
911
-- }}}
912
--
913
-- If the data can contain sub-sequences, such as strings, you need to let the
914
-- function know about this otherwise it assumes every value in ##data_set## is
915
-- a number. If that is not the case then the function will crash. So it is
916
-- important that if it can possibly contain sub-sequences that you tell this
917
-- function what to do with them. Your choices are to ignore them or assume they
918
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
919
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
920
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
921
-- **Note** It is faster if the data only contains numbers.
922
--
923
-- Example 1:
924
--
925
-- ? median( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,4} ) -- Ans: 5
926
--
927
--
928
-- See also:
929
-- [[:average]], [[:geomean]], [[:harmean]], [[:movavg]], [[:emovavg]]
930
--
931
9324
933
9344
if atom(data_set) then
9351
return data_set
936
end if
937
9383
data_set = massage(data_set, subseq_opt)
939
9403
if length(data_set) = 0 then
9411
return data_set
942
end if
943
9442
if length(data_set) < 3 then
9451
return data_set[1]
946
end if
9471
data_set = sort(data_set)
9481
return data_set[ floor((length(data_set) + 1) / 2) ]
949
950
end function
951
952
--**
953
-- Returns the frequency of each unique item in the data set.
954
--
955
-- Parameters:
956
-- # ##data_set## : a list of 1 or more numbers for which you want the frequencies.
957
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
958
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
959
-- gives instructions about how to treat sub-sequences. See comments for details.
960
--
961
-- Returns:
962
-- A **sequence**. This will contain zero or more 2-element sub-sequences. The
963
-- first element is the frequency count and the second element is the data item
964
-- that was counted. The returned values are in descending order, meaning that
965
-- the highest frequencies are at the beginning of the returned list.
966
--
967
-- Comments:
968
-- If the data can contain sub-sequences, such as strings, you need to let the
969
-- function know about this; otherwise it assumes every value in ##data_set## is
970
-- a number. If that is not the case then the function will crash. So it is
971
-- important that if it can possibly contain sub-sequences that you tell this
972
-- function what to do with them. Your choices are to ignore them or assume they
973
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
974
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
975
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
976
-- **Note** It is faster if the data only contains numbers.
977
--
978
-- Example 1:
979
--
980
-- ? raw_frequency("the cat is the hatter")
981
--
982
-- This returns
983
-- {{{
984
-- {
985
-- {5,116},
986
-- {4,32},
987
-- {3,104},
988
-- {3,101},
989
-- {2,97},
990
-- {1,115},
991
-- {1,114},
992
-- {1,105},
993
-- {1,99}
994
-- }
995
-- }}}
996
--
997
-- raw_frequency(): the signature line (report line 998) has no text in this
-- listing; the routine name is taken from the example in the doc comment above.
9985
999
-- lCounts holds one {count, item} pair per distinct item found.
1000
sequence lCounts
-- lKeys mirrors the items stored in lCounts so find() can locate them.
1001
sequence lKeys
-- lNew is the number of distinct items recorded so far.
10025
integer lNew = 0
1003
integer lPos
-- lMax is the highest slot of lCounts that has been claimed.
10045
integer lMax = -1
1005
-- A lone atom occurs exactly once.
10065
if atom(data_set) then
10071
return {{1,data_set}}
1008
end if
1009
-- massage() is defined elsewhere in this file; presumably it applies
-- subseq_opt to any sub-sequences as the doc comment describes (TODO confirm).
10104
data_set = massage(data_set, subseq_opt)
1011
-- NOTE(review): an empty data set is reported as the empty sequence occurring
-- once ({{1,{}}}) rather than as an empty result; confirm this is intended.
10124
if length(data_set) = 0 then
10131
return {{1,data_set}}
1014
end if
-- Worst case every item is distinct, so both tables get one slot per item.
10153
lCounts = repeat({0,0}, length(data_set))
10163
lKeys = repeat(0, length(data_set))
10173
for i = 1 to length(data_set) do
-- NOTE(review): lKeys is pre-filled with 0, so when data_set[i] is 0 this
-- find() can match a slot that has not been claimed yet. The count is then
-- bumped in that slot without advancing lNew or lMax, so a later new item can
-- claim the same slot, or the final slice can cut it off. Verify with data
-- that contains zeros.
101853
lPos = find(data_set[i], lKeys)
101953
if lPos = 0 then
-- First sighting: claim the next free slot and record the item in both tables.
102024
lNew += 1
102124
lPos = lNew
102224
lCounts[lPos][2] = data_set[i]
102324
lKeys[lPos] = data_set[i]
-- lPos equals lNew here and lMax only ever receives earlier lNew values,
-- so this test is true every time it is reached.
102424
if lPos > lMax then
102524
lMax = lPos
1026
end if
1027
end if
102853
lCounts[lPos][1] += 1
102953
end for
-- Only the claimed slots are kept. Sorting the {count, item} pairs with
-- DESCENDING puts the highest counts first, as the doc comment states.
10303
return sort(lCounts[1..lMax], DESCENDING)
1031
1032
end function
1033
1034
--**
1035
-- Returns the most frequent point(s) of the data set.
1036
--
1037
-- Parameters:
1038
-- # ##data_set## : a list of 1 or more numbers for which you want the mode.
1039
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
1040
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
1041
-- gives instructions about how to treat sub-sequences. See comments for details.
1042
--
1043
-- Returns:
1044
-- A **sequence**. The list of modal items in the data set.
1045
--
1046
-- Comments:
1047
--
1048
-- It is possible for the ##mode##() to return more than one item when more than
1049
-- one item in the set has the same highest frequency count.
1050
--
1051
-- If the data can contain sub-sequences, such as strings, you need to let the
1052
-- function know about this; otherwise it assumes every value in ##data_set## is
1053
-- a number. If that is not the case then the function will crash. So it is
1054
-- important that if it can possibly contain sub-sequences that you tell this
1055
-- function what to do with them. Your choices are to ignore them or assume they
1056
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
1057
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
1058
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
1059
-- **Note** It is faster if the data only contains numbers.
1060
--
1061
-- Example 1:
1062
--
1063
-- ? mode( {7,2,8,5,6,6,4,8,6,6,3,3,4,1,8,4} ) -- Ans: {6}
1064
-- ? mode( {8,2,8,5,6,6,4,8,6,6,3,3,4,1,8,4} ) -- Ans: {8,6}
1065
--
1066
--
1067
-- See also:
1068
-- [[:average]], [[:geomean]], [[:harmean]], [[:movavg]], [[:emovavg]]
1069
--
1070
-- mode(): the signature line (report line 1071) has no text in this listing;
-- the routine name is taken from the example in the doc comment above.
10713
1072
-- lCounts receives the {count, item} pairs produced by raw_frequency().
1073
sequence lCounts
-- lRes accumulates the modal items.
1074
sequence lRes
1075
-- massage() is defined elsewhere in this file; presumably it applies
-- subseq_opt to any sub-sequences as the doc comment describes (TODO confirm).
10763
data_set = massage(data_set, subseq_opt)
1077
-- No data means no mode.
10783
if not length( data_set ) then
10791
return {}
1080
end if
1081
-- NOTE(review): the already-massaged data is passed on with the same
-- subseq_opt, and raw_frequency() calls massage() again in its own body;
-- the second pass looks redundant. Confirm.
10822
lCounts = raw_frequency(data_set, subseq_opt)
1083
-- The first pair is taken as a mode; this relies on raw_frequency() returning
-- the highest counts first.
10842
lRes = {lCounts[1][2]}
-- Keep collecting items while their count is not below the first count; stop
-- at the first lower count.
10852
for i = 2 to length(lCounts) do
10863
if lCounts[i][1] < lCounts[1][1] then
10872
exit
1088
end if
10891
lRes = append(lRes, lCounts[i][2])
10901
end for
1091
10922
return lRes
1093
1094
end function
1095
1096
--**
1097
-- Returns the distance between a supplied value and the mean, to some supplied
1098
-- order of magnitude. This is used to get a measure of the //shape// of a
1099
-- data set.
1100
--
1101
-- Parameters:
1102
-- # ##data_set## : a list of 1 or more numbers whose mean is used.
1103
-- # ##datum##: either a single value or a list of values for which you require
1104
-- the central moments.
1105
-- # ##order_mag##: An integer. This is the order of magnitude required. Usually
1106
-- a number from 1 to 4, but can be anything.
1107
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
1108
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
1109
-- gives instructions about how to treat sub-sequences. See comments for details.
1110
--
1111
-- Returns:
1112
-- An **object**. The same data type as ##datum##. This is the set of calculated
1113
-- central moments.
1114
--
1115
-- Comments:
1116
--
1117
-- For each of the items in ##datum##, its central moment is calculated as ...
1118
-- {{{
1119
-- CM = power( ITEM - AVG, MAGNITUDE)
1120
-- }}}
1121
--
1122
-- If the data can contain sub-sequences, such as strings, you need to let the
1123
-- function know about this; otherwise it assumes every value in ##data_set## is
1124
-- a number. If that is not the case then the function will crash. So it is
1125
-- important that if it can possibly contain sub-sequences that you tell this
1126
-- function what to do with them. Your choices are to ignore them or assume they
1127
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
1128
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
1129
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
1130
-- **Note** It is faster if the data only contains numbers.
1131
--
1132
-- Example 1:
1133
--
1134
-- ? central_moment("the cat is the hatter", "the",1) --> {23.14285714, 11.14285714, 8.142857143}
1135
-- ? central_moment("the cat is the hatter", 't',2) --> 535.5918367
1136
-- ? central_moment("the cat is the hatter", 't',3) --> 12395.12536
1137
--
1138
--
1139
-- See also:
1140
-- [[:average]]
1141
--
-- central_moment(): the signature line (report line 1142) has no text in this
-- listing; the routine name is taken from the example in the doc comment above.
11428
1143
-- lMean holds the average of the massaged data set.
1144
atom lMean
1145
-- Only data_set is massaged; datum is used exactly as supplied.
-- massage() is defined elsewhere in this file; presumably it applies
-- subseq_opt to any sub-sequences as the doc comment describes (TODO confirm).
11468
data_set = massage(data_set, subseq_opt)
1147
-- With no data there is no mean, so 0 is returned whatever datum is.
11488
if length(data_set) = 0 then
11491
return 0
1150
end if
1151
11527
lMean = average(data_set)
1153
-- CM = power(ITEM - AVG, MAGNITUDE). Per the doc comment the result has the
-- same type as datum, i.e. a list of values when datum is a list.
11547
return power( datum - lMean, order_mag)
1155
1156
end function
1157
1158
--**
1159
-- Returns sum of the central moments of each item in a data set.
1160
--
1161
-- Parameters:
1162
-- # ##data_set## : a list of 1 or more numbers whose mean is used.
1163
-- # ##order_mag##: An integer. This is the order of magnitude required. Usually
1164
-- a number from 1 to 4, but can be anything.
1165
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
1166
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
1167
-- gives instructions about how to treat sub-sequences. See comments for details.
1168
--
1169
-- Returns:
1170
-- An **atom**. The total of the central moments calculated for each of the
1171
-- items in ##data_set##.
1172
--
1173
-- Comments:
1174
-- If the data can contain sub-sequences, such as strings, you need to let the
1175
-- function know about this; otherwise it assumes every value in ##data_set## is
1176
-- a number. If that is not the case then the function will crash. So it is
1177
-- important that if it can possibly contain sub-sequences that you tell this
1178
-- function what to do with them. Your choices are to ignore them or assume they
1179
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
1180
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
1181
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
1182
-- **Note** It is faster if the data only contains numbers.
1183
--
1184
-- Example 1:
1185
--
1186
-- ? sum_central_moments("the cat is the hatter", 1) --> -8.526512829e-14
1187
-- ? sum_central_moments("the cat is the hatter", 2) --> 19220.57143
1188
-- ? sum_central_moments("the cat is the hatter", 3) --> -811341.551
1189
-- ? sum_central_moments("the cat is the hatter", 4) --> 56824083.71
1190
--
1191
--
1192
-- See also:
1193
-- [[:central_moment]], [[:average]]
1194
--
-- sum_central_moments(): the signature line (report line 1195) has no text in
-- this listing; the routine name is taken from the example in the doc comment above.
11957
-- data_set is passed twice: once as the population whose mean is used and once
-- as the list of items whose moments are wanted; the moments are then totalled.
-- NOTE(review): central_moment() massages only its first argument, so the
-- second copy reaches the subtraction un-massaged. Confirm this is acceptable
-- when subseq_opt is not ST_ALLNUM.
11967
return sum( central_moment(data_set, data_set, order_mag, subseq_opt) )
1197
end function
1198
1199
--**
1200
-- Returns a measure of the asymmetry of a data set. Usually the data_set is a
1201
-- probability distribution but it can be anything. This value is used to assess
1202
-- how suitable the data set is in representing the required analysis. It can
1203
-- help detect if there are too many extreme values in the data set.
1204
--
1205
-- Parameters:
1206
-- # ##data_set## : a list of 1 or more numbers whose mean is used.
1207
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
1208
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
1209
-- gives instructions about how to treat sub-sequences. See comments for details.
1210
--
1211
-- Returns:
1212
-- An **atom**. The skewness measure of the data set.
1213
--
1214
-- Comments:
1215
-- Generally speaking, a negative return indicates that most of the values are
1216
-- lower than the mean, while positive values indicate that most values are
1217
-- greater than the mean. However this might not be the case when there are a few
1218
-- extreme values on one side of the mean.
1219
--
1220
-- The larger the magnitude of the returned value, the more the data is skewed
1221
-- in that direction.
1222
--
1223
-- A returned value of zero indicates that the mean and median values are identical
1224
-- and that the data is symmetrical.
1225
--
1226
--
1227
-- If the data can contain sub-sequences, such as strings, you need to let the
1228
-- function know about this; otherwise it assumes every value in ##data_set## is
1229
-- a number. If that is not the case then the function will crash. So it is
1230
-- important that if it can possibly contain sub-sequences that you tell this
1231
-- function what to do with them. Your choices are to ignore them or assume they
1232
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
1233
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
1234
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
1235
-- **Note** It is faster if the data only contains numbers.
1236
--
1237
-- Example 1:
1238
--
1239
-- ? skewness("the cat is the hatter") --> -1.296820819
1240
-- ? skewness("thecatisthehatter") --> 0.1029393238
1241
--
1242
--
1243
-- See also:
1244
-- [[:kurtosis]]
1245
--
-- skewness(): the signature line (report line 1246) has no text in this
-- listing; the routine name is taken from the example in the doc comment above.
12464
1247
-- A lone atom is handed back unchanged.
12484
if atom(data_set) then
12491
return data_set
1250
end if
1251
-- massage() is defined elsewhere in this file; presumably it applies
-- subseq_opt to any sub-sequences as the doc comment describes (TODO confirm).
12523
data_set = massage(data_set, subseq_opt)
1253
-- Nothing left to measure: the (empty) sequence itself is returned, so the
-- result is not an atom in this case.
12543
if length(data_set) = 0 then
12551
return data_set
1256
end if
-- Sum of the third central moments divided by (n - 1) * stdev^3. The helpers
-- are called with their default subseq_opt because the data is already massaged.
-- NOTE(review): nothing guards the divisor; it is zero when there is a single
-- item or when stdev() returns 0. Confirm how callers are expected to handle that.
12572
return sum_central_moments(data_set, 3) / ((length(data_set) - 1) * power(stdev(data_set), 3))
1258
1259
end function
1260
1261
--**
1262
-- Returns a measure of the spread of values in a dataset when compared to a
1263
-- //normal// probability curve.
1264
--
1265
-- Parameters:
1266
-- # ##data_set## : a list of 1 or more numbers whose kurtosis is required.
1267
-- # ##subseq_opt## : an object. When this is ST_ALLNUM (the default) it
1268
-- means that ##data_set## is assumed to contain no sub-sequences otherwise this
1269
-- gives instructions about how to treat sub-sequences. See comments for details.
1270
--
1271
-- Returns:
1272
-- An **object**. If this is an atom it is the kurtosis measure of the data set.
1273
-- Otherwise it is a sequence containing an error integer. The return value {0}
1274
-- indicates that an empty dataset was passed, {1} indicates that the standard
1275
-- deviation is zero (all values are the same).
1276
--
1277
-- Comments:
1278
-- Generally speaking, a negative return indicates that most of the values are
1279
-- further from the mean, while positive values indicate that most values are
1280
-- nearer to the mean.
1281
--
1282
-- The larger the magnitude of the returned value, the more the data is 'peaked'
1283
-- or 'flatter' in that direction.
1284
--
1285
-- If the data can contain sub-sequences, such as strings, you need to let the
1286
-- function know about this; otherwise it assumes every value in ##data_set## is
1287
-- a number. If that is not the case then the function will crash. So it is
1288
-- important that if it can possibly contain sub-sequences that you tell this
1289
-- function what to do with them. Your choices are to ignore them or assume they
1290
-- have the value zero. To ignore them, use ST_IGNSTR as the ##subseq_opt## parameter
1291
-- value otherwise use ST_ZEROSTR. However, if you know that ##data_set## only
1292
-- contains numbers use the default ##subseq_opt## value, ST_ALLNUM.
1293
-- **Note** It is faster if the data only contains numbers.
1294
--
1295
-- Example 1:
1296
--
1297
-- ? kurtosis("thecatisthehatter") --> -1.737889192
1298
--
1299
--
1300
-- See also:
1301
-- [[:skewness]]
1302
--
-- kurtosis(): the signature line (report line 1303) has no text in this
-- listing; the routine name is taken from the example in the doc comment above.
13034
-- sd caches the standard deviation of the massaged data.
1304
atom sd
1305
-- A lone atom is handed back unchanged.
13064
if atom(data_set) then
13071
return data_set
1308
end if
-- massage() is defined elsewhere in this file; presumably it applies
-- subseq_opt to any sub-sequences as the doc comment describes (TODO confirm).
13093
data_set = massage(data_set, subseq_opt)
-- Error result {0}: no data to measure.
13103
if length(data_set) = 0 then
13111
return {0}
1312
end if
13132
sd = stdev(data_set)
-- Error result {1}: zero standard deviation, which would make the divisor zero.
13142
if sd = 0 then
13151
return {1}
1316
end if
1317
-- Sum of the fourth central moments divided by (n - 1) * stdev^4, minus 3.
-- NOTE(review): stdev(data_set) is evaluated again here although sd already
-- holds that value.
-- NOTE(review): with a single item (n - 1) is zero; whether the sd = 0 guard
-- above already covers that case depends on stdev(). Confirm.
13181
return (sum_central_moments(data_set, 4) / ((length(data_set) - 1) * power(stdev(data_set), 4))) - 3
1319
1320
end function