[frames] | no frames]

# Source Code for Module igraph.statistics

```  1  # vim:ts=4:sw=4:sts=4:et
2  # -*- coding: utf-8 -*-
3  """
4  Statistics related stuff in igraph
5  """
6
8  Copyright (C) 2006-2012  Tamas Nepusz <ntamas@gmail.com>
9  Pázmány Péter sétány 1/a, 1117 Budapest, Hungary
10
11  This program is free software; you can redistribute it and/or modify
13  the Free Software Foundation; either version 2 of the License, or
14  (at your option) any later version.
15
16  This program is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  GNU General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with this program; if not, write to the Free Software
23  Foundation, Inc.,  51 Franklin Street, Fifth Floor, Boston, MA
24  02110-1301 USA
25  """
26
27  import math
28
29  __all__ = ["FittedPowerLaw", "Histogram", "RunningMean", "mean", "median", \
30          "percentile", "quantile", "power_law_fit"]
31
32
33 -class FittedPowerLaw(object):
34      """Result of fitting a power-law to a vector of samples
35
36      Example:
37
38          >>> result = power_law_fit([1, 2, 3, 4, 5, 6], return_alpha_only=False)
39          >>> result                   # doctest:+ELLIPSIS
40          FittedPowerLaw(continuous=False, alpha=2.425828..., xmin=3.0, L=-7.54633..., D=0.2138..., p=0.99311...)
41          >>> print result             # doctest:+ELLIPSIS
42          Fitted power-law distribution on discrete data
43          <BLANKLINE>
44          Exponent (alpha)  = 2.425828
45          Cutoff (xmin)     = 3.000000
46          <BLANKLINE>
47          Log-likelihood    = -7.546337
48          <BLANKLINE>
49          H0: data was drawn from the fitted distribution
50          <BLANKLINE>
51          KS test statistic = 0.213817
52          p-value           = 0.993111
53          <BLANKLINE>
54          H0 could not be rejected at significance level 0.05
55          >>> result.alpha             # doctest:+ELLIPSIS
56          2.425828...
57          >>> result.xmin
58          3.0
59          >>> result.continuous
60          False
61      """
62
63 -    def __init__(self, continuous, alpha, xmin, L, D, p):
64          self.continuous = continuous
65          self.xmin = xmin
66          self.alpha = alpha
67          self.L = L
68          self.D = D
69          self.p = p
70
71 -    def __repr__(self):
72          return "%s(continuous=%r, alpha=%r, xmin=%r, L=%r, D=%r, p=%r)" % \
73                  (self.__class__.__name__, self.continuous, self.alpha, \
74                  self.xmin, self.L, self.D, self.p)
75
76 -    def __str__(self):
77          return self.summary(significance=0.05)
78
79 -    def summary(self, significance=0.05):
80          """Returns the summary of the power law fit.
81
82          @param significance: the significance level of the Kolmogorov-Smirnov test
83            used to decide whether the input data could have come from the fitted
84            distribution
85          @return: the summary as a string
86          """
87          result = ["Fitted power-law distribution on %s data" % \
88                  ("discrete", "continuous")[bool(self.continuous)]]
89          result.append("")
90          result.append("Exponent (alpha)  = %f" % self.alpha)
91          result.append("Cutoff (xmin)     = %f" % self.xmin)
92          result.append("")
93          result.append("Log-likelihood    = %f" % self.L)
94          result.append("")
95          result.append("H0: data was drawn from the fitted distribution")
96          result.append("")
97          result.append("KS test statistic = %f" % self.D)
98          result.append("p-value           = %f" % self.p)
99          result.append("")
100          if self.p < significance:
101              result.append("H0 rejected at significance level %g" \
102                      % significance)
103          else:
104              result.append("H0 could not be rejected at significance "\
105                      "level %g" % significance)
106
107          return "\n".join(result)
108
109
110 -class Histogram(object):
111      """Generic histogram class for real numbers
112
113      Example:
114
115          >>> h = Histogram(5)     # Initializing, bin width = 5
116          >>> h << [2,3,2,7,8,5,5,0,7,9]     # Adding more items
117          >>> print h
118          N = 10, mean +- sd: 4.8000 +- 2.9740
119          [ 0,  5): **** (4)
120          [ 5, 10): ****** (6)
121      """
122
123 -    def __init__(self, bin_width = 1, data = None):
124          """Initializes the histogram with the given data set.
125
126          @param bin_width: the bin width of the histogram.
127          @param data: the data set to be used. Must contain real numbers.
128          """
129          self._bin_width = float(bin_width)
130          self._bins = None
131          self._min, self._max = None, None
132          self._running_mean = RunningMean()
133          self.clear()
134
135          if data:
137
138 -    def _get_bin(self, num, create = False):
139          """Returns the bin index corresponding to the given number.
140
141          @param num: the number for which the bin is being sought
142          @param create: whether to create a new bin if no bin exists yet.
143          @return: the index of the bin or C{None} if no bin exists yet and
144            {create} is C{False}."""
145          if len(self._bins) == 0:
146              if not create:
147                  result = None
148              else:
149                  self._min = int(num/self._bin_width)*self._bin_width
150                  self._max = self._min+self._bin_width
151                  self._bins = [0]
152                  result = 0
153              return result
154
155          if num >= self._min:
156              binidx = int((num-self._min)/self._bin_width)
157              if binidx < len(self._bins):
158                  return binidx
159              if not create:
160                  return None
161              extra_bins = binidx-len(self._bins)+1
162              self._bins.extend([0]*extra_bins)
163              self._max = self._min + len(self._bins)*self._bin_width
164              return binidx
165
166          if not create:
167              return None
168
169          extra_bins = int(math.ceil((self._min-num)/self._bin_width))
170          self._bins[0:0] = [0]*extra_bins
171          self._min -= extra_bins*self._bin_width
172          self._max = self._min + len(self._bins)*self._bin_width
173          return 0
174
175      @property
176 -    def n(self):
177          """Returns the number of elements in the histogram"""
178          return len(self._running_mean)
179
180      @property
181 -    def mean(self):
182          """Returns the mean of the elements in the histogram"""
183          return self._running_mean.mean
184
185      # pylint: disable-msg=C0103
186      @property
187 -    def sd(self):
188          """Returns the standard deviation of the elements in
189          the histogram"""
190          return self._running_mean.sd
191
192      @property
193 -    def var(self):
194          """Returns the variance of the elements in the histogram"""
195          return self._running_mean.var
196
197 -    def add(self, num, repeat=1):
198          """Adds a single number to the histogram.
199
200          @param num: the number to be added
201          @param repeat: number of repeated additions
202          """
203          num = float(num)
204          binidx = self._get_bin(num, True)
205          self._bins[binidx] += repeat
207
209          """Adds a single number or the elements of an iterable to the histogram.
210
211          @param data: the data to be added"""
212          try:
213              iterator = iter(data)
214          except TypeError:
215              iterator = iter([data])
216          for x in iterator:
219
220 -    def clear(self):
221          """Clears the collected data"""
222          self._bins = []
223          self._min, self._max = None, None
224          self._running_mean = RunningMean()
225
226 -    def bins(self):
227          """Generator returning the bins of the histogram in increasing order
228
229          @return: a tuple with the following elements: left bound, right bound,
230            number of elements in the bin"""
231          x = self._min
232          for elem in self._bins:
233              yield (x, x+self._bin_width, elem)
234              x += self._bin_width
235
236 -    def __plot__(self, context, bbox, _, **kwds):
237          """Plotting support"""
238          from igraph.drawing.coord import DescartesCoordinateSystem
239          coord_system = DescartesCoordinateSystem(context, bbox, \
240              (kwds.get("min", self._min), 0, \
241               kwds.get("max", self._max), kwds.get("max_value", max(self._bins))
242              ))
243
244          # Draw the boxes
245          context.set_line_width(1)
246          context.set_source_rgb(1., 0., 0.)
247          x = self._min
248          for value in self._bins:
249              top_left_x, top_left_y = coord_system.local_to_context(x, value)
250              x += self._bin_width
251              bottom_right_x, bottom_right_y = coord_system.local_to_context(x, 0)
252              context.rectangle(top_left_x, top_left_y, \
253                                bottom_right_x - top_left_x, \
254                                bottom_right_y - top_left_y)
255              context.fill()
256
257          # Draw the axes
258          coord_system.draw()
259
260 -    def to_string(self, max_width=78, show_bars=True, show_counts=True):
261          """Returns the string representation of the histogram.
262
263          @param max_width: the maximal width of each line of the string
264            This value may not be obeyed if it is too small.
265          @param show_bars: specify whether the histogram bars should be shown
266          @param show_counts: specify whether the histogram counts should be
267            shown. If both I{show_bars} and I{show_counts} are C{False},
268            only a general descriptive statistics (number of elements, mean and
269            standard deviation) is shown.
270          """
271
272          if self._min is None or self._max is None:
273              return "N = 0"
274
275          # Determine how many decimal digits should we use
276          if int(self._min) == self._min and int(self._bin_width) == self._bin_width:
277              number_format = "%d"
278          else:
279              number_format = "%.3f"
280          num_length = max(len(number_format % self._min), \
281                           len(number_format % self._max))
282          number_format = "%" + str(num_length) + number_format[1:]
283          format_string = "[%s, %s): %%s" % (number_format, number_format)
284
285          # Calculate the scale of the bars on the histogram
286          if show_bars:
287              maxval = max(self._bins)
288              if show_counts:
289                  maxval_length = len(str(maxval))
290                  scale = maxval // (max_width-2*num_length-maxval_length-9)
291              else:
292                  scale = maxval // (max_width-2*num_length-6)
293              scale = max(scale, 1)
294
295          result = ["N = %d, mean +- sd: %.4f +- %.4f" % \
296              (self.n, self.mean, self.sd)]
297
298          if show_bars:
299              # Print the bars
300              if scale > 1:
301                  result.append("Each * represents %d items" % scale)
302              if show_counts:
303                  format_string += " (%d)"
304                  for left, right, cnt in self.bins():
305                      result.append(format_string % (left, right, '*'*(cnt//scale), cnt))
306              else:
307                  for left, right, cnt in self.bins():
308                      result.append(format_string % (left, right, '*'*(cnt//scale)))
309          elif show_counts:
310              # Print the counts only
311              for left, right, cnt in self.bins():
312                  result.append(format_string % (left, right, cnt))
313
314          return "\n".join(result)
315
316 -    def __str__(self):
317          return self.to_string()
318
319
320
321 -class RunningMean(object):
322      """Running mean calculator.
323
324      This class can be used to calculate the mean of elements from a
325      list, tuple, iterable or any other data source. The mean is
326      calculated on the fly without explicitly summing the values,
327      so it can be used for data sets with arbitrary item count. Also
328      capable of returning the standard deviation (also calculated on
329      the fly)
330      """
331
332      # pylint: disable-msg=C0103
333 -    def __init__(self, items=None, n=0.0, mean=0.0, sd=0.0):
334          """RunningMean(items=None, n=0.0, mean=0.0, sd=0.0)
335
336          Initializes the running mean calculator.
337
338          There are two possible ways to initialize the calculator.
339          First, one can provide an iterable of items; alternatively,
340          one can specify the number of items, the mean and the
341          standard deviation if we want to continue an interrupted
342          calculation.
343
344          @param items: the items that are used to initialize the
345            running mean calcuator. If C{items} is given, C{n},
346            C{mean} and C{sd} must be zeros.
347          @param n: the initial number of elements already processed.
348            If this is given, C{items} must be C{None}.
349          @param mean: the initial mean. If this is given, C{items}
350            must be C{None}.
351          @param sd: the initial standard deviation. If this is given,
352            C{items} must be C{None}."""
353          if items is not None:
354              if n != 0 or mean != 0 or sd != 0:
355                  raise ValueError("n, mean and sd must be zeros if items is not None")
356              self.clear()
358          else:
359              self._nitems = float(n)
360              self._mean = float(mean)
361              if n > 1:
362                  self._sqdiff = float(sd) ** 2 * float(n-1)
363                  self._sd = float(sd)
364              else:
365                  self._sqdiff = 0.0
366                  self._sd = 0.0
367
368 -    def add(self, value, repeat=1):
370
371          Adds the given value to the elements from which we calculate
372          the mean and the standard deviation.
373
374          @param value: the element to be added
375          @param repeat: number of repeated additions
376          """
377          repeat = int(repeat)
378          self._nitems += repeat
379          delta = value - self._mean
380          self._mean += (repeat*delta / self._nitems)
381          self._sqdiff += (repeat*delta) * (value - self._mean)
382          if self._nitems > 1:
383              self._sd = (self._sqdiff / (self._nitems-1)) ** 0.5
384
387
388          Adds the values in the given iterable to the elements from
389          which we calculate the mean. Can also accept a single number.
390          The left shift (C{<<}) operator is aliased to this function,
391          so you can use it to add elements as well:
392
393            >>> rm=RunningMean()
394            >>> rm << [1,2,3,4]
395            >>> rm.result               # doctest:+ELLIPSIS
396            (2.5, 1.290994...)
397
398          @param values: the element(s) to be added
399          @type values: iterable"""
400          try:
401              iterator = iter(values)
402          except TypeError:
403              iterator = iter([values])
404          for value in iterator:
406
407 -    def clear(self):
408          """Resets the running mean calculator."""
409          self._nitems, self._mean = 0.0, 0.0
410          self._sqdiff, self._sd = 0.0, 0.0
411
412      @property
413 -    def result(self):
414          """Returns the current mean and standard deviation as a tuple"""
415          return self._mean, self._sd
416
417      @property
418 -    def mean(self):
419          """Returns the current mean"""
420          return self._mean
421
422      @property
423 -    def sd(self):
424          """Returns the current standard deviation"""
425          return self._sd
426
427      @property
428 -    def var(self):
429          """Returns the current variation"""
430          return self._sd ** 2
431
432 -    def __repr__(self):
433          return "%s(n=%r, mean=%r, sd=%r)" % \
434                  (self.__class__.__name__, int(self._nitems),
435                          self._mean, self._sd)
436
437 -    def __str__(self):
438          return "Running mean (N=%d, %f +- %f)" % \
439              (self._nitems, self._mean, self._sd)
440
442
443 -    def __float__(self):
444          return float(self._mean)
445
446 -    def __int__(self):
447          return int(self._mean)
448
449 -    def __long__(self):
450          return long(self._mean)
451
452 -    def __complex__(self):
453          return complex(self._mean)
454
455 -    def __len__(self):
456          return self._nitems
457
458
459 -def mean(xs):
460      """Returns the mean of an iterable.
461
462      Example:
463
464          >>> mean([1, 4, 7, 11])
465          5.75
466
467      @param xs: an iterable yielding numbers.
468      @return: the mean of the numbers provided by the iterable.
469
470      @see: RunningMean() if you also need the variance or the standard deviation
471      """
472      return RunningMean(xs).mean
473
474 -def median(xs, sort=True):
475      """Returns the median of an unsorted or sorted numeric vector.
476
477      @param xs: the vector itself.
478      @param sort: whether to sort the vector. If you know that the vector is
479        sorted already, pass C{False} here.
480      @return: the median, which will always be a float, even if the vector
481        contained integers originally.
482      """
483      if sort:
484          xs = sorted(xs)
485
486      mid = int(len(xs) / 2)
487      if 2 * mid == len(xs):
488          return float(xs[mid-1] + xs[mid]) / 2
489      else:
490          return float(xs[mid])
491
492 -def percentile(xs, p=(25, 50, 75), sort=True):
493      """Returns the pth percentile of an unsorted or sorted numeric vector.
494
495      This is equivalent to calling quantile(xs, p/100.0); see L{quantile}
496      for more details on the calculation.
497
498      Example:
499
500          >>> round(percentile([15, 20, 40, 35, 50], 40), 2)
501          26.0
502          >>> for perc in percentile([15, 20, 40, 35, 50], (0, 25, 50, 75, 100)):
503          ...     print "%.2f" % perc
504          ...
505          15.00
506          17.50
507          35.00
508          45.00
509          50.00
510
511      @param xs: the vector itself.
512      @param p: the percentile we are looking for. It may also be a list if you
513        want to calculate multiple quantiles with a single call. The default
514        value calculates the 25th, 50th and 75th percentile.
515      @param sort: whether to sort the vector. If you know that the vector is
516        sorted already, pass C{False} here.
517      @return: the pth percentile, which will always be a float, even if the vector
518        contained integers originally. If p is a list, the result will also be a
519        list containing the percentiles for each item in the list.
520      """
521      if hasattr(p, "__iter__"):
522          return quantile(xs, (x/100.0 for x in p), sort)
523      return quantile(xs, p/100.0, sort)
524
525 -def power_law_fit(data, xmin=None, method="auto", return_alpha_only=True):
526      """Fitting a power-law distribution to empirical data
527
528      @param data: the data to fit, a list containing integer values
529      @param xmin: the lower bound for fitting the power-law. If C{None},
530        the optimal xmin value will be estimated as well. Zero means that
531        the smallest possible xmin value will be used.
532      @param method: the fitting method to use. The following methods are
533        implemented so far:
534
535          - C{continuous}, C{hill}: exact maximum likelihood estimation
536            when the input data comes from a continuous scale. This is
537            known as the Hill estimator. The statistical error of
538            this estimator is M{(alpha-1) / sqrt(n)}, where alpha is the
539            estimated exponent and M{n} is the number of data points above
540            M{xmin}. The estimator is known to exhibit a small finite
541            sample-size bias of order M{O(n^-1)}, which is small when
542            M{n > 100}. igraph will try to compensate for the finite sample
543            size if n is small.
544
545          - C{discrete}: exact maximum likelihood estimation when the
546            input comes from a discrete scale (see Clauset et al among the
547            references).
548
549          - C{auto}: exact maximum likelihood estimation where the continuous
550            method is used if the input vector contains at least one fractional
551            value and the discrete method is used if the input vector contains
552            integers only.
553
554      @param return_alpha_only: whether to return the fitted exponent only.
555        When this argument is C{True}, the function will return the fitted power-law
556        exponent only. When C{False}, the function will return a L{FittedPowerLaw}
557        object with much more details. The default value is C{True} for the time
558        being for sake of compatibility with earlier releases, but it will be changed
559        to C{False} from igraph 0.7 onwards.
560
561      @return: the fitted exponent or a L{FittedPowerLaw} object, depending on the
562        value of C{return_alpha_only}.
563
564      @newfield ref: Reference
565      @ref: MEJ Newman: Power laws, Pareto distributions and Zipf's law.
566        Contemporary Physics 46, 323-351 (2005)
567      @ref: A Clauset, CR Shalizi, MEJ Newman: Power-law distributions
568        in empirical data. E-print (2007). arXiv:0706.1062"""
569      from igraph._igraph import _power_law_fit
570
571      if xmin is None or xmin < 0:
572          xmin = -1
573
574      method = method.lower()
575      if method not in ("continuous", "hill", "discrete", "auto"):
576          raise ValueError("unknown method: %s" % method)
577
578      force_continuous = method in ("continuous", "hill")
579      fit = FittedPowerLaw(*_power_law_fit(data, xmin, force_continuous))
580      if return_alpha_only:
581          from warnings import warn
582          warn("power_law_fit will return a FittedPowerLaw object from igraph "\
583                  "0.7 onwards. Better prepare for that by setting return_alpha_only "\
584                  "to False when calling power_law_fit()", PendingDeprecationWarning,
585                  stacklevel=3)
586          return fit.alpha
587      else:
588          return fit
589
590 -def quantile(xs, q=(0.25, 0.5, 0.75), sort=True):
591      """Returns the qth quantile of an unsorted or sorted numeric vector.
592
593      There are a number of different ways to calculate the sample quantile. The
594      method implemented by igraph is the one recommended by NIST. First we
595      calculate a rank n as q(N+1), where N is the number of items in xs, then we
596      split n into its integer component k and decimal component d. If k <= 1,
597      we return the first element; if k >= N, we return the last element,
598      otherwise we return the linear interpolation between xs[k-1] and xs[k]
599      using a factor d.
600
601      Example:
602
603          >>> round(quantile([15, 20, 40, 35, 50], 0.4), 2)
604          26.0
605
606      @param xs: the vector itself.
607      @param q: the quantile we are looking for. It may also be a list if you
608        want to calculate multiple quantiles with a single call. The default
609        value calculates the 25th, 50th and 75th percentile.
610      @param sort: whether to sort the vector. If you know that the vector is
611        sorted already, pass C{False} here.
612      @return: the qth quantile, which will always be a float, even if the vector
613        contained integers originally. If q is a list, the result will also be a
614        list containing the quantiles for each item in the list.
615      """
616      if not xs:
617          raise ValueError("xs must not be empty")
618
619      if sort:
620          xs = sorted(xs)
621
622      if hasattr(q, "__iter__"):
623          qs = q
624          return_single = False
625      else:
626          qs = [q]
627          return_single = True
628
629      result = []
630      for q in qs:
631          if q < 0 or q > 1:
632              raise ValueError("q must be between 0 and 1")
633          n = float(q) * (len(xs)+1)
634          k, d = int(n), n-int(n)
635          if k >= len(xs):
636              result.append(xs[-1])
637          elif k < 1:
638              result.append(xs[0])
639          else:
640              result.append((1-d) * xs[k-1] + d * xs[k])
641      if return_single:
642          result = result[0]
643      return result
644
645 -def sd(xs):
646      """Returns the standard deviation of an iterable.
647
648      Example:
649
650          >>> sd([1, 4, 7, 11])       #doctest:+ELLIPSIS
651          4.2720...
652
653      @param xs: an iterable yielding numbers.
654      @return: the standard deviation of the numbers provided by the iterable.
655
656      @see: RunningMean() if you also need the mean
657      """
658      return RunningMean(xs).sd
659
660 -def var(xs):
661      """Returns the variance of an iterable.
662
663      Example:
664
665          >>> var([1, 4, 8, 11])            #doctest:+ELLIPSIS
666          19.333333...
667
668      @param xs: an iterable yielding numbers.
669      @return: the variance of the numbers provided by the iterable.
670
671      @see: RunningMean() if you also need the mean
672      """
673      return RunningMean(xs).var
674
<!--
expandto(location.href);
// -->

```

 Generated by Epydoc 3.0.1 on Fri Mar 1 23:27:18 2013 http://epydoc.sourceforge.net