statistics.py - OpenGrok cross reference for /third

Lines Matching refs:data
4 This module provides functions for calculating statistics of data, including
13 mean                Arithmetic mean (average) of data.
15 geometric_mean      Geometric mean of data.
16 harmonic_mean       Harmonic mean of data.
17 median              Median (middle value) of data.
18 median_low          Low median of data.
19 median_high         High median of data.
20 median_grouped      Median, or 50th percentile, of grouped data.
21 mode                Mode (most common value) of data.
22 multimode           List of modes (most common values of data).
23 quantiles           Divide data into intervals with equal probability.
26 Calculate the arithmetic mean ("the average") of data:
32 Calculate the standard median of discrete data:
38 Calculate the median, or 50th percentile, of data grouped into class intervals
39 centred on the data values provided. E.g. if your data points are rounded to
45 This should be interpreted in this way: you have two data points in the class
46 interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47 the class interval 3.5-4.5. The median of these data points is 2.8333...
56 pvariance           Population variance of data.
57 variance            Sample variance of data.
58 pstdev              Population standard deviation of data.
59 stdev               Sample standard deviation of data.
62 Calculate the standard deviation of sample data:
70 >>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71 >>> mu = mean(data)
72 >>> pvariance(data, mu)
154 def _sum(data):
155     """_sum(data) -> (type, sum, count)
157     Return a high-precision sum of the given numeric data as a fraction,
179     >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
180     >>> _sum(data)
191     for typ, values in groupby(data, type):
208 def _ss(data, c=None):
209     """Return the exact mean and sum of square deviations of sequence data.
213     If *c* is given, it is used as the mean; otherwise, it is calculated from the data.
218         T, ssd, count = _sum((d := x - c) * d for x in data)
225     for typ, values in groupby(data, type):
414 def mean(data):
415     """Return the sample arithmetic mean of data.
428     If ``data`` is empty, StatisticsError will be raised.
430     T, total, n = _sum(data)
432         raise StatisticsError('mean requires at least one data point')
436 def fmean(data, weights=None):
437     """Convert data to floats and compute the arithmetic mean.
446         n = len(data)
454         data = count(data)
456         total = fsum(data)
458             raise StatisticsError('fmean requires at least one data point')
465     num = fsum(map(mul, data, weights))
467         raise StatisticsError('data and weights must be the same length')
474 def geometric_mean(data):
475     """Convert data to floats and compute the geometric mean.
487         return exp(fmean(map(log, data)))
493 def harmonic_mean(data, weights=None):
494     """Return the harmonic mean of data.
497     reciprocals of the data.  It can be used for averaging ratios or
513     If ``data`` is empty, or any element is less than zero,
516     if iter(data) is data:
517         data = list(data)
519     n = len(data)
521         raise StatisticsError('harmonic_mean requires at least one data point')
523         x = data[0]
537             raise StatisticsError('Number of weights does not match data size')
540         data = _fail_neg(data, errmsg)
541         T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
549 def median(data):
550     """Return the median (middle value) of numeric data.
552     When the number of data points is odd, return the middle data point.
553     When the number of data points is even, the median is interpolated by
562     data = sorted(data)
563     n = len(data)
565         raise StatisticsError("no median for empty data")
567         return data[n // 2]
570         return (data[i - 1] + data[i]) / 2
573 def median_low(data):
574     """Return the low median of numeric data.
576     When the number of data points is odd, the middle value is returned.
585     data = sorted(data)
586     n = len(data)
588         raise StatisticsError("no median for empty data")
590         return data[n // 2]
592         return data[n // 2 - 1]
595 def median_high(data):
596     """Return the high median of data.
598     When the number of data points is odd, the middle value is returned.
607     data = sorted(data)
608     n = len(data)
610         raise StatisticsError("no median for empty data")
611     return data[n // 2]
614 def median_grouped(data, interval=1.0):
615     """Estimates the median for numeric data binned around the midpoints
618     The *data* can be any iterable of numeric data with each value being
643         >>> data = list(demographics.elements())
644         >>> median(data)
646         >>> round(median_grouped(data, interval=10), 1)
649     The caller is responsible for making sure the data points are separated
657     data = sorted(data)
658     n = len(data)
660         raise StatisticsError("no median for empty data")
664     x = data[n // 2]
666     # Using O(log n) bisection, find where all the x values occur in the data.
667     # All x will lie within data[i:j].
668     i = bisect_left(data, x)
669     j = bisect_right(data, x, lo=i)
679     # https://www.cuemath.com/data/median-of-grouped-data/
686 def mode(data):
687     """Return the most common data point from discrete or nominal data.
689     ``mode`` assumes discrete data, and returns a single value. This is the
695     This also works with nominal (non-numeric) data:
706     If *data* is empty, ``mode`` raises StatisticsError.
709     pairs = Counter(iter(data)).most_common(1)
713         raise StatisticsError('no mode for empty data') from None
716 def multimode(data):
720     or an empty list if *data* is empty.
729     counts = Counter(iter(data))
750 # For sample data where there is a positive probability for values
751 # beyond the range of the data, the R6 exclusive method is a
761 # For describing population data where the end points are known to
762 # be included in the data, the R7 inclusive method is a reasonable
773 def quantiles(data, *, n=4, method='exclusive'):
774     """Divide *data* into *n* continuous intervals with equal probability.
780     separate *data* into 100 equal-sized groups.
782     The *data* can be any iterable containing sample data.
783     The cut points are linearly interpolated between data points.
785     If *method* is set to *inclusive*, *data* is treated as population
786     data.  The minimum value is treated as the 0th percentile and the
791     data = sorted(data)
792     ld = len(data)
794         raise StatisticsError('must have at least two data points')
800             interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
810             interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
822 def variance(data, xbar=None):
823     """Return the sample variance of data.
825     data should be an iterable of Real-valued numbers, with at least two
827     the data. If it is missing or None, the mean is automatically calculated.
829     Use this function when your data is a sample from a population. To
834     >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
835     >>> variance(data)
838     If you have already calculated the mean of your data, you can pass it as
841     >>> m = mean(data)
842     >>> variance(data, m)
846     ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
860     T, ss, c, n = _ss(data, xbar)
862         raise StatisticsError('variance requires at least two data points')
866 def pvariance(data, mu=None):
867     """Return the population variance of ``data``.
869     data should be a sequence or iterable of Real-valued numbers, with at least one
871     the data. If it is missing or None, the mean is automatically calculated.
879     >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
880     >>> pvariance(data)
883     If you have already calculated the mean of the data, you can pass it as
886     >>> mu = mean(data)
887     >>> pvariance(data, mu)
901     T, ss, c, n = _ss(data, mu)
903         raise StatisticsError('pvariance requires at least one data point')
907 def stdev(data, xbar=None):
916     T, ss, c, n = _ss(data, xbar)
918         raise StatisticsError('stdev requires at least two data points')
925 def pstdev(data, mu=None):
934     T, ss, c, n = _ss(data, mu)
936         raise StatisticsError('pstdev requires at least one data point')
943 def _mean_stdev(data):
945     T, ss, xbar, n = _ss(data)
947         raise StatisticsError('stdev requires at least two data points')
982         raise StatisticsError('covariance requires that both inputs have same number of data points')
984         raise StatisticsError('covariance requires at least two data points')
1010         raise StatisticsError('correlation requires that both inputs have same number of data points')
1012         raise StatisticsError('correlation requires at least two data points')
1038     estimated, and noise represents the variability of the data that was
1053     The data is fit to a line passing through the origin.
1067         raise StatisticsError('linear regression requires that both inputs have same number of data points')
1069         raise StatisticsError('linear regression requires at least two data points')
1188     def from_samples(cls, data):
1189         "Make a normal distribution instance from sample data."
1190         return cls(*_mean_stdev(data))