1
2
3 """
4 Statistics related stuff in igraph
5 """
6
7 __license__ = u"""\
8 Copyright (C) 2006-2012 Tamas Nepusz <ntamas@gmail.com>
9 Pázmány Péter sétány 1/a, 1117 Budapest, Hungary
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with this program; if not, write to the Free Software
23 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 02110-1301 USA
25 """
26
27 import math
28
29 __all__ = ["FittedPowerLaw", "Histogram", "RunningMean", "mean", "median", \
30 "percentile", "quantile", "power_law_fit"]
34 """Result of fitting a power-law to a vector of samples
35
36 Example:
37
38 >>> result = power_law_fit([1, 2, 3, 4, 5, 6], return_alpha_only=False)
39 >>> result # doctest:+ELLIPSIS
40 FittedPowerLaw(continuous=False, alpha=2.425828..., xmin=3.0, L=-7.54633..., D=0.2138..., p=0.99311...)
41 >>> print result # doctest:+ELLIPSIS
42 Fitted power-law distribution on discrete data
43 <BLANKLINE>
44 Exponent (alpha) = 2.425828
45 Cutoff (xmin) = 3.000000
46 <BLANKLINE>
47 Log-likelihood = -7.546337
48 <BLANKLINE>
49 H0: data was drawn from the fitted distribution
50 <BLANKLINE>
51 KS test statistic = 0.213817
52 p-value = 0.993111
53 <BLANKLINE>
54 H0 could not be rejected at significance level 0.05
55 >>> result.alpha # doctest:+ELLIPSIS
56 2.425828...
57 >>> result.xmin
58 3.0
59 >>> result.continuous
60 False
61 """
62
63 - def __init__(self, continuous, alpha, xmin, L, D, p):
64 self.continuous = continuous
65 self.xmin = xmin
66 self.alpha = alpha
67 self.L = L
68 self.D = D
69 self.p = p
70
72 return "%s(continuous=%r, alpha=%r, xmin=%r, L=%r, D=%r, p=%r)" % \
73 (self.__class__.__name__, self.continuous, self.alpha, \
74 self.xmin, self.L, self.D, self.p)
75
77 return self.summary(significance=0.05)
78
79 - def summary(self, significance=0.05):
80 """Returns the summary of the power law fit.
81
82 @param significance: the significance level of the Kolmogorov-Smirnov test
83 used to decide whether the input data could have come from the fitted
84 distribution
85 @return: the summary as a string
86 """
87 result = ["Fitted power-law distribution on %s data" % \
88 ("discrete", "continuous")[bool(self.continuous)]]
89 result.append("")
90 result.append("Exponent (alpha) = %f" % self.alpha)
91 result.append("Cutoff (xmin) = %f" % self.xmin)
92 result.append("")
93 result.append("Log-likelihood = %f" % self.L)
94 result.append("")
95 result.append("H0: data was drawn from the fitted distribution")
96 result.append("")
97 result.append("KS test statistic = %f" % self.D)
98 result.append("p-value = %f" % self.p)
99 result.append("")
100 if self.p < significance:
101 result.append("H0 rejected at significance level %g" \
102 % significance)
103 else:
104 result.append("H0 could not be rejected at significance "\
105 "level %g" % significance)
106
107 return "\n".join(result)
108
111 """Generic histogram class for real numbers
112
113 Example:
114
115 >>> h = Histogram(5) # Initializing, bin width = 5
116 >>> h << [2,3,2,7,8,5,5,0,7,9] # Adding more items
117 >>> print h
118 N = 10, mean +- sd: 4.8000 +- 2.9740
119 [ 0, 5): **** (4)
120 [ 5, 10): ****** (6)
121 """
122
123 - def __init__(self, bin_width = 1, data = None):
124 """Initializes the histogram with the given data set.
125
126 @param bin_width: the bin width of the histogram.
127 @param data: the data set to be used. Must contain real numbers.
128 """
129 self._bin_width = float(bin_width)
130 self._bins = None
131 self._min, self._max = None, None
132 self._running_mean = RunningMean()
133 self.clear()
134
135 if data:
136 self.add_many(data)
137
138 - def _get_bin(self, num, create = False):
139 """Returns the bin index corresponding to the given number.
140
141 @param num: the number for which the bin is being sought
142 @param create: whether to create a new bin if no bin exists yet.
143 @return: the index of the bin or C{None} if no bin exists yet and
144 {create} is C{False}."""
145 if len(self._bins) == 0:
146 if not create:
147 result = None
148 else:
149 self._min = int(num/self._bin_width)*self._bin_width
150 self._max = self._min+self._bin_width
151 self._bins = [0]
152 result = 0
153 return result
154
155 if num >= self._min:
156 binidx = int((num-self._min)/self._bin_width)
157 if binidx < len(self._bins):
158 return binidx
159 if not create:
160 return None
161 extra_bins = binidx-len(self._bins)+1
162 self._bins.extend([0]*extra_bins)
163 self._max = self._min + len(self._bins)*self._bin_width
164 return binidx
165
166 if not create:
167 return None
168
169 extra_bins = int(math.ceil((self._min-num)/self._bin_width))
170 self._bins[0:0] = [0]*extra_bins
171 self._min -= extra_bins*self._bin_width
172 self._max = self._min + len(self._bins)*self._bin_width
173 return 0
174
175 @property
177 """Returns the number of elements in the histogram"""
178 return len(self._running_mean)
179
180 @property
182 """Returns the mean of the elements in the histogram"""
183 return self._running_mean.mean
184
185
186 @property
188 """Returns the standard deviation of the elements in
189 the histogram"""
190 return self._running_mean.sd
191
192 @property
194 """Returns the variance of the elements in the histogram"""
195 return self._running_mean.var
196
197 - def add(self, num, repeat=1):
198 """Adds a single number to the histogram.
199
200 @param num: the number to be added
201 @param repeat: number of repeated additions
202 """
203 num = float(num)
204 binidx = self._get_bin(num, True)
205 self._bins[binidx] += repeat
206 self._running_mean.add(num, repeat)
207
209 """Adds a single number or the elements of an iterable to the histogram.
210
211 @param data: the data to be added"""
212 try:
213 iterator = iter(data)
214 except TypeError:
215 iterator = iter([data])
216 for x in iterator:
217 self.add(x)
218 __lshift__ = add_many
219
221 """Clears the collected data"""
222 self._bins = []
223 self._min, self._max = None, None
224 self._running_mean = RunningMean()
225
227 """Generator returning the bins of the histogram in increasing order
228
229 @return: a tuple with the following elements: left bound, right bound,
230 number of elements in the bin"""
231 x = self._min
232 for elem in self._bins:
233 yield (x, x+self._bin_width, elem)
234 x += self._bin_width
235
236 - def __plot__(self, context, bbox, _, **kwds):
237 """Plotting support"""
238 from igraph.drawing.coord import DescartesCoordinateSystem
239 coord_system = DescartesCoordinateSystem(context, bbox, \
240 (kwds.get("min", self._min), 0, \
241 kwds.get("max", self._max), kwds.get("max_value", max(self._bins))
242 ))
243
244
245 context.set_line_width(1)
246 context.set_source_rgb(1., 0., 0.)
247 x = self._min
248 for value in self._bins:
249 top_left_x, top_left_y = coord_system.local_to_context(x, value)
250 x += self._bin_width
251 bottom_right_x, bottom_right_y = coord_system.local_to_context(x, 0)
252 context.rectangle(top_left_x, top_left_y, \
253 bottom_right_x - top_left_x, \
254 bottom_right_y - top_left_y)
255 context.fill()
256
257
258 coord_system.draw()
259
260 - def to_string(self, max_width=78, show_bars=True, show_counts=True):
261 """Returns the string representation of the histogram.
262
263 @param max_width: the maximal width of each line of the string
264 This value may not be obeyed if it is too small.
265 @param show_bars: specify whether the histogram bars should be shown
266 @param show_counts: specify whether the histogram counts should be
267 shown. If both I{show_bars} and I{show_counts} are C{False},
268 only a general descriptive statistics (number of elements, mean and
269 standard deviation) is shown.
270 """
271
272 if self._min is None or self._max is None:
273 return "N = 0"
274
275
276 if int(self._min) == self._min and int(self._bin_width) == self._bin_width:
277 number_format = "%d"
278 else:
279 number_format = "%.3f"
280 num_length = max(len(number_format % self._min), \
281 len(number_format % self._max))
282 number_format = "%" + str(num_length) + number_format[1:]
283 format_string = "[%s, %s): %%s" % (number_format, number_format)
284
285
286 if show_bars:
287 maxval = max(self._bins)
288 if show_counts:
289 maxval_length = len(str(maxval))
290 scale = maxval // (max_width-2*num_length-maxval_length-9)
291 else:
292 scale = maxval // (max_width-2*num_length-6)
293 scale = max(scale, 1)
294
295 result = ["N = %d, mean +- sd: %.4f +- %.4f" % \
296 (self.n, self.mean, self.sd)]
297
298 if show_bars:
299
300 if scale > 1:
301 result.append("Each * represents %d items" % scale)
302 if show_counts:
303 format_string += " (%d)"
304 for left, right, cnt in self.bins():
305 result.append(format_string % (left, right, '*'*(cnt//scale), cnt))
306 else:
307 for left, right, cnt in self.bins():
308 result.append(format_string % (left, right, '*'*(cnt//scale)))
309 elif show_counts:
310
311 for left, right, cnt in self.bins():
312 result.append(format_string % (left, right, cnt))
313
314 return "\n".join(result)
315
318
322 """Running mean calculator.
323
324 This class can be used to calculate the mean of elements from a
325 list, tuple, iterable or any other data source. The mean is
326 calculated on the fly without explicitly summing the values,
327 so it can be used for data sets with arbitrary item count. Also
328 capable of returning the standard deviation (also calculated on
329 the fly)
330 """
331
332
333 - def __init__(self, items=None, n=0.0, mean=0.0, sd=0.0):
334 """RunningMean(items=None, n=0.0, mean=0.0, sd=0.0)
335
336 Initializes the running mean calculator.
337
338 There are two possible ways to initialize the calculator.
339 First, one can provide an iterable of items; alternatively,
340 one can specify the number of items, the mean and the
341 standard deviation if we want to continue an interrupted
342 calculation.
343
344 @param items: the items that are used to initialize the
345 running mean calcuator. If C{items} is given, C{n},
346 C{mean} and C{sd} must be zeros.
347 @param n: the initial number of elements already processed.
348 If this is given, C{items} must be C{None}.
349 @param mean: the initial mean. If this is given, C{items}
350 must be C{None}.
351 @param sd: the initial standard deviation. If this is given,
352 C{items} must be C{None}."""
353 if items is not None:
354 if n != 0 or mean != 0 or sd != 0:
355 raise ValueError("n, mean and sd must be zeros if items is not None")
356 self.clear()
357 self.add_many(items)
358 else:
359 self._nitems = float(n)
360 self._mean = float(mean)
361 if n > 1:
362 self._sqdiff = float(sd) ** 2 * float(n-1)
363 self._sd = float(sd)
364 else:
365 self._sqdiff = 0.0
366 self._sd = 0.0
367
368 - def add(self, value, repeat=1):
369 """RunningMean.add(value, repeat=1)
370
371 Adds the given value to the elements from which we calculate
372 the mean and the standard deviation.
373
374 @param value: the element to be added
375 @param repeat: number of repeated additions
376 """
377 repeat = int(repeat)
378 self._nitems += repeat
379 delta = value - self._mean
380 self._mean += (repeat*delta / self._nitems)
381 self._sqdiff += (repeat*delta) * (value - self._mean)
382 if self._nitems > 1:
383 self._sd = (self._sqdiff / (self._nitems-1)) ** 0.5
384
386 """RunningMean.add(values)
387
388 Adds the values in the given iterable to the elements from
389 which we calculate the mean. Can also accept a single number.
390 The left shift (C{<<}) operator is aliased to this function,
391 so you can use it to add elements as well:
392
393 >>> rm=RunningMean()
394 >>> rm << [1,2,3,4]
395 >>> rm.result # doctest:+ELLIPSIS
396 (2.5, 1.290994...)
397
398 @param values: the element(s) to be added
399 @type values: iterable"""
400 try:
401 iterator = iter(values)
402 except TypeError:
403 iterator = iter([values])
404 for value in iterator:
405 self.add(value)
406
408 """Resets the running mean calculator."""
409 self._nitems, self._mean = 0.0, 0.0
410 self._sqdiff, self._sd = 0.0, 0.0
411
412 @property
414 """Returns the current mean and standard deviation as a tuple"""
415 return self._mean, self._sd
416
417 @property
419 """Returns the current mean"""
420 return self._mean
421
422 @property
424 """Returns the current standard deviation"""
425 return self._sd
426
427 @property
429 """Returns the current variation"""
430 return self._sd ** 2
431
433 return "%s(n=%r, mean=%r, sd=%r)" % \
434 (self.__class__.__name__, int(self._nitems),
435 self._mean, self._sd)
436
438 return "Running mean (N=%d, %f +- %f)" % \
439 (self._nitems, self._mean, self._sd)
440
441 __lshift__ = add_many
442
444 return float(self._mean)
445
447 return int(self._mean)
448
450 return long(self._mean)
451
453 return complex(self._mean)
454
457
460 """Returns the mean of an iterable.
461
462 Example:
463
464 >>> mean([1, 4, 7, 11])
465 5.75
466
467 @param xs: an iterable yielding numbers.
468 @return: the mean of the numbers provided by the iterable.
469
470 @see: RunningMean() if you also need the variance or the standard deviation
471 """
472 return RunningMean(xs).mean
473
491
493 """Returns the pth percentile of an unsorted or sorted numeric vector.
494
495 This is equivalent to calling quantile(xs, p/100.0); see L{quantile}
496 for more details on the calculation.
497
498 Example:
499
500 >>> round(percentile([15, 20, 40, 35, 50], 40), 2)
501 26.0
502 >>> for perc in percentile([15, 20, 40, 35, 50], (0, 25, 50, 75, 100)):
503 ... print "%.2f" % perc
504 ...
505 15.00
506 17.50
507 35.00
508 45.00
509 50.00
510
511 @param xs: the vector itself.
512 @param p: the percentile we are looking for. It may also be a list if you
513 want to calculate multiple quantiles with a single call. The default
514 value calculates the 25th, 50th and 75th percentile.
515 @param sort: whether to sort the vector. If you know that the vector is
516 sorted already, pass C{False} here.
517 @return: the pth percentile, which will always be a float, even if the vector
518 contained integers originally. If p is a list, the result will also be a
519 list containing the percentiles for each item in the list.
520 """
521 if hasattr(p, "__iter__"):
522 return quantile(xs, (x/100.0 for x in p), sort)
523 return quantile(xs, p/100.0, sort)
524
525 -def power_law_fit(data, xmin=None, method="auto", return_alpha_only=True):
526 """Fitting a power-law distribution to empirical data
527
528 @param data: the data to fit, a list containing integer values
529 @param xmin: the lower bound for fitting the power-law. If C{None},
530 the optimal xmin value will be estimated as well. Zero means that
531 the smallest possible xmin value will be used.
532 @param method: the fitting method to use. The following methods are
533 implemented so far:
534
535 - C{continuous}, C{hill}: exact maximum likelihood estimation
536 when the input data comes from a continuous scale. This is
537 known as the Hill estimator. The statistical error of
538 this estimator is M{(alpha-1) / sqrt(n)}, where alpha is the
539 estimated exponent and M{n} is the number of data points above
540 M{xmin}. The estimator is known to exhibit a small finite
541 sample-size bias of order M{O(n^-1)}, which is small when
542 M{n > 100}. igraph will try to compensate for the finite sample
543 size if n is small.
544
545 - C{discrete}: exact maximum likelihood estimation when the
546 input comes from a discrete scale (see Clauset et al among the
547 references).
548
549 - C{auto}: exact maximum likelihood estimation where the continuous
550 method is used if the input vector contains at least one fractional
551 value and the discrete method is used if the input vector contains
552 integers only.
553
554 @param return_alpha_only: whether to return the fitted exponent only.
555 When this argument is C{True}, the function will return the fitted power-law
556 exponent only. When C{False}, the function will return a L{FittedPowerLaw}
557 object with much more details. The default value is C{True} for the time
558 being for sake of compatibility with earlier releases, but it will be changed
559 to C{False} from igraph 0.7 onwards.
560
561 @return: the fitted exponent or a L{FittedPowerLaw} object, depending on the
562 value of C{return_alpha_only}.
563
564 @newfield ref: Reference
565 @ref: MEJ Newman: Power laws, Pareto distributions and Zipf's law.
566 Contemporary Physics 46, 323-351 (2005)
567 @ref: A Clauset, CR Shalizi, MEJ Newman: Power-law distributions
568 in empirical data. E-print (2007). arXiv:0706.1062"""
569 from igraph._igraph import _power_law_fit
570
571 if xmin is None or xmin < 0:
572 xmin = -1
573
574 method = method.lower()
575 if method not in ("continuous", "hill", "discrete", "auto"):
576 raise ValueError("unknown method: %s" % method)
577
578 force_continuous = method in ("continuous", "hill")
579 fit = FittedPowerLaw(*_power_law_fit(data, xmin, force_continuous))
580 if return_alpha_only:
581 from warnings import warn
582 warn("power_law_fit will return a FittedPowerLaw object from igraph "\
583 "0.7 onwards. Better prepare for that by setting return_alpha_only "\
584 "to False when calling power_law_fit()", PendingDeprecationWarning,
585 stacklevel=3)
586 return fit.alpha
587 else:
588 return fit
589
590 -def quantile(xs, q=(0.25, 0.5, 0.75), sort=True):
591 """Returns the qth quantile of an unsorted or sorted numeric vector.
592
593 There are a number of different ways to calculate the sample quantile. The
594 method implemented by igraph is the one recommended by NIST. First we
595 calculate a rank n as q(N+1), where N is the number of items in xs, then we
596 split n into its integer component k and decimal component d. If k <= 1,
597 we return the first element; if k >= N, we return the last element,
598 otherwise we return the linear interpolation between xs[k-1] and xs[k]
599 using a factor d.
600
601 Example:
602
603 >>> round(quantile([15, 20, 40, 35, 50], 0.4), 2)
604 26.0
605
606 @param xs: the vector itself.
607 @param q: the quantile we are looking for. It may also be a list if you
608 want to calculate multiple quantiles with a single call. The default
609 value calculates the 25th, 50th and 75th percentile.
610 @param sort: whether to sort the vector. If you know that the vector is
611 sorted already, pass C{False} here.
612 @return: the qth quantile, which will always be a float, even if the vector
613 contained integers originally. If q is a list, the result will also be a
614 list containing the quantiles for each item in the list.
615 """
616 if not xs:
617 raise ValueError("xs must not be empty")
618
619 if sort:
620 xs = sorted(xs)
621
622 if hasattr(q, "__iter__"):
623 qs = q
624 return_single = False
625 else:
626 qs = [q]
627 return_single = True
628
629 result = []
630 for q in qs:
631 if q < 0 or q > 1:
632 raise ValueError("q must be between 0 and 1")
633 n = float(q) * (len(xs)+1)
634 k, d = int(n), n-int(n)
635 if k >= len(xs):
636 result.append(xs[-1])
637 elif k < 1:
638 result.append(xs[0])
639 else:
640 result.append((1-d) * xs[k-1] + d * xs[k])
641 if return_single:
642 result = result[0]
643 return result
644
646 """Returns the standard deviation of an iterable.
647
648 Example:
649
650 >>> sd([1, 4, 7, 11]) #doctest:+ELLIPSIS
651 4.2720...
652
653 @param xs: an iterable yielding numbers.
654 @return: the standard deviation of the numbers provided by the iterable.
655
656 @see: RunningMean() if you also need the mean
657 """
658 return RunningMean(xs).sd
659
661 """Returns the variance of an iterable.
662
663 Example:
664
665 >>> var([1, 4, 8, 11]) #doctest:+ELLIPSIS
666 19.333333...
667
668 @param xs: an iterable yielding numbers.
669 @return: the variance of the numbers provided by the iterable.
670
671 @see: RunningMean() if you also need the mean
672 """
673 return RunningMean(xs).var
674