ayousanz committed
Commit 45b0b28 · verified · 1 Parent(s): b35b196

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. .venv/Lib/site-packages/scipy/sparse/linalg/tests/propack_test_data.npz +3 -0
  3. .venv/Lib/site-packages/scipy/spatial/_distance_pybind.cp39-win_amd64.pyd +3 -0
  4. .venv/Lib/site-packages/scipy/special/__pycache__/__init__.cpython-39.pyc +0 -0
  5. .venv/Lib/site-packages/scipy/special/__pycache__/_orthogonal.cpython-39.pyc +0 -0
  6. .venv/Lib/site-packages/scipy/special/__pycache__/_sf_error.cpython-39.pyc +0 -0
  7. .venv/Lib/site-packages/scipy/special/__pycache__/_spfun_stats.cpython-39.pyc +0 -0
  8. .venv/Lib/site-packages/scipy/special/__pycache__/_spherical_bessel.cpython-39.pyc +0 -0
  9. .venv/Lib/site-packages/scipy/special/__pycache__/_support_alternative_backends.cpython-39.pyc +0 -0
  10. .venv/Lib/site-packages/scipy/stats/__init__.py +643 -0
  11. .venv/Lib/site-packages/scipy/stats/_ansari_swilk_statistics.cp39-win_amd64.dll.a +0 -0
  12. .venv/Lib/site-packages/scipy/stats/_ansari_swilk_statistics.cp39-win_amd64.pyd +0 -0
  13. .venv/Lib/site-packages/scipy/stats/_axis_nan_policy.py +642 -0
  14. .venv/Lib/site-packages/scipy/stats/_biasedurn.cp39-win_amd64.dll.a +0 -0
  15. .venv/Lib/site-packages/scipy/stats/_biasedurn.cp39-win_amd64.pyd +0 -0
  16. .venv/Lib/site-packages/scipy/stats/_biasedurn.pxd +27 -0
  17. .venv/Lib/site-packages/scipy/stats/_binned_statistic.py +795 -0
  18. .venv/Lib/site-packages/scipy/stats/_binomtest.py +375 -0
  19. .venv/Lib/site-packages/scipy/stats/_bws_test.py +177 -0
  20. .venv/Lib/site-packages/scipy/stats/_censored_data.py +459 -0
  21. .venv/Lib/site-packages/scipy/stats/_common.py +5 -0
  22. .venv/Lib/site-packages/scipy/stats/_constants.py +39 -0
  23. .venv/Lib/site-packages/scipy/stats/_continuous_distns.py +0 -0
  24. .venv/Lib/site-packages/scipy/stats/_covariance.py +633 -0
  25. .venv/Lib/site-packages/scipy/stats/_crosstab.py +204 -0
  26. .venv/Lib/site-packages/scipy/stats/_discrete_distns.py +1954 -0
  27. .venv/Lib/site-packages/scipy/stats/_distn_infrastructure.py +0 -0
  28. .venv/Lib/site-packages/scipy/stats/_distr_params.py +288 -0
  29. .venv/Lib/site-packages/scipy/stats/_entropy.py +423 -0
  30. .venv/Lib/site-packages/scipy/stats/_fit.py +1351 -0
  31. .venv/Lib/site-packages/scipy/stats/_generate_pyx.py +27 -0
  32. .venv/Lib/site-packages/scipy/stats/_hypotests.py +2021 -0
  33. .venv/Lib/site-packages/scipy/stats/_kde.py +728 -0
  34. .venv/Lib/site-packages/scipy/stats/_ksstats.py +600 -0
  35. .venv/Lib/site-packages/scipy/stats/_mannwhitneyu.py +519 -0
  36. .venv/Lib/site-packages/scipy/stats/_morestats.py +0 -0
  37. .venv/Lib/site-packages/scipy/stats/_mstats_basic.py +0 -0
  38. .venv/Lib/site-packages/scipy/stats/_mstats_extras.py +521 -0
  39. .venv/Lib/site-packages/scipy/stats/_multicomp.py +459 -0
  40. .venv/Lib/site-packages/scipy/stats/_multivariate.py +0 -0
  41. .venv/Lib/site-packages/scipy/stats/_mvn.cp39-win_amd64.dll.a +0 -0
  42. .venv/Lib/site-packages/scipy/stats/_mvn.cp39-win_amd64.pyd +0 -0
  43. .venv/Lib/site-packages/scipy/stats/_odds_ratio.py +482 -0
  44. .venv/Lib/site-packages/scipy/stats/_page_trend_test.py +479 -0
  45. .venv/Lib/site-packages/scipy/stats/_qmc.py +0 -0
  46. .venv/Lib/site-packages/scipy/stats/_qmc_cy.cp39-win_amd64.dll.a +0 -0
  47. .venv/Lib/site-packages/scipy/stats/_qmc_cy.cp39-win_amd64.pyd +0 -0
  48. .venv/Lib/site-packages/scipy/stats/_qmc_cy.pyi +54 -0
  49. .venv/Lib/site-packages/scipy/stats/_qmvnt.py +533 -0
  50. .venv/Lib/site-packages/scipy/stats/_relative_risk.py +263 -0
.gitattributes CHANGED
@@ -90,3 +90,4 @@ reference_sample_wavs/syuukovoice_200918_3_01.wav filter=lfs diff=lfs merge=lfs
  .venv/Lib/site-packages/torio/lib/_torio_ffmpeg5.pyd filter=lfs diff=lfs merge=lfs -text
  .venv/Lib/site-packages/torio/lib/_torio_ffmpeg6.pyd filter=lfs diff=lfs merge=lfs -text
  .venv/Lib/site-packages/torch/lib/cudnn_adv64_9.dll filter=lfs diff=lfs merge=lfs -text
+ .venv/Lib/site-packages/scipy/spatial/_distance_pybind.cp39-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
.venv/Lib/site-packages/scipy/sparse/linalg/tests/propack_test_data.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bfe34d9a92353e08f400f3837136e553a8e91d441186913d39b59bf8a627bba3
+ size 600350
.venv/Lib/site-packages/scipy/spatial/_distance_pybind.cp39-win_amd64.pyd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7bdc657c7357110d74977999bce28b06bdcc7dedf675339d5462ec030c6da0ac
+ size 1372160
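
Note: both files above are stored with Git LFS, so the commit only adds small pointer files in the `version` / `oid` / `size` format shown. As a minimal sketch (not part of this commit), such a pointer can be parsed with plain Python; the path in the comment is only an example:

def read_lfs_pointer(path):
    """Parse a Git LFS pointer file into a {key: value} dict."""
    fields = {}
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

# e.g. read_lfs_pointer("path/to/propack_test_data.npz") ->
# {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:...', 'size': '600350'}
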
.venv/Lib/site-packages/scipy/special/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (31.8 kB).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_orthogonal.cpython-39.pyc ADDED
Binary file (74.5 kB).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_sf_error.cpython-39.pyc ADDED
Binary file (784 Bytes).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_spfun_stats.cpython-39.pyc ADDED
Binary file (2.6 kB).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_spherical_bessel.cpython-39.pyc ADDED
Binary file (10.7 kB).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_support_alternative_backends.cpython-39.pyc ADDED
Binary file (2.24 kB).
 
.venv/Lib/site-packages/scipy/stats/__init__.py ADDED
@@ -0,0 +1,643 @@
1
+ """
2
+ .. _statsrefmanual:
3
+
4
+ ==========================================
5
+ Statistical functions (:mod:`scipy.stats`)
6
+ ==========================================
7
+
8
+ .. currentmodule:: scipy.stats
9
+
10
+ This module contains a large number of probability distributions,
11
+ summary and frequency statistics, correlation functions and statistical
12
+ tests, masked statistics, kernel density estimation, quasi-Monte Carlo
13
+ functionality, and more.
14
+
15
+ Statistics is a very large area, and there are topics that are out of scope
16
+ for SciPy and are covered by other packages. Some of the most important ones
17
+ are:
18
+
19
+ - `statsmodels <https://www.statsmodels.org/stable/index.html>`__:
20
+ regression, linear models, time series analysis, extensions to topics
21
+ also covered by ``scipy.stats``.
22
+ - `Pandas <https://pandas.pydata.org/>`__: tabular data, time series
23
+ functionality, interfaces to other statistical languages.
24
+ - `PyMC <https://docs.pymc.io/>`__: Bayesian statistical
25
+ modeling, probabilistic machine learning.
26
+ - `scikit-learn <https://scikit-learn.org/>`__: classification, regression,
27
+ model selection.
28
+ - `Seaborn <https://seaborn.pydata.org/>`__: statistical data visualization.
29
+ - `rpy2 <https://rpy2.github.io/>`__: Python to R bridge.
30
+
31
+
32
+ Probability distributions
33
+ =========================
34
+
35
+ Each univariate distribution is an instance of a subclass of `rv_continuous`
36
+ (`rv_discrete` for discrete distributions):
37
+
38
+ .. autosummary::
39
+ :toctree: generated/
40
+
41
+ rv_continuous
42
+ rv_discrete
43
+ rv_histogram
44
+
45
+ Continuous distributions
46
+ ------------------------
47
+
48
+ .. autosummary::
49
+ :toctree: generated/
50
+
51
+ alpha -- Alpha
52
+ anglit -- Anglit
53
+ arcsine -- Arcsine
54
+ argus -- Argus
55
+ beta -- Beta
56
+ betaprime -- Beta Prime
57
+ bradford -- Bradford
58
+ burr -- Burr (Type III)
59
+ burr12 -- Burr (Type XII)
60
+ cauchy -- Cauchy
61
+ chi -- Chi
62
+ chi2 -- Chi-squared
63
+ cosine -- Cosine
64
+ crystalball -- Crystalball
65
+ dgamma -- Double Gamma
66
+ dweibull -- Double Weibull
67
+ erlang -- Erlang
68
+ expon -- Exponential
69
+ exponnorm -- Exponentially Modified Normal
70
+ exponweib -- Exponentiated Weibull
71
+ exponpow -- Exponential Power
72
+ f -- F (Snedecor F)
73
+ fatiguelife -- Fatigue Life (Birnbaum-Saunders)
74
+ fisk -- Fisk
75
+ foldcauchy -- Folded Cauchy
76
+ foldnorm -- Folded Normal
77
+ genlogistic -- Generalized Logistic
78
+ gennorm -- Generalized normal
79
+ genpareto -- Generalized Pareto
80
+ genexpon -- Generalized Exponential
81
+ genextreme -- Generalized Extreme Value
82
+ gausshyper -- Gauss Hypergeometric
83
+ gamma -- Gamma
84
+ gengamma -- Generalized gamma
85
+ genhalflogistic -- Generalized Half Logistic
86
+ genhyperbolic -- Generalized Hyperbolic
87
+ geninvgauss -- Generalized Inverse Gaussian
88
+ gibrat -- Gibrat
89
+ gompertz -- Gompertz (Truncated Gumbel)
90
+ gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
91
+ gumbel_l -- Left Sided Gumbel, etc.
92
+ halfcauchy -- Half Cauchy
93
+ halflogistic -- Half Logistic
94
+ halfnorm -- Half Normal
95
+ halfgennorm -- Generalized Half Normal
96
+ hypsecant -- Hyperbolic Secant
97
+ invgamma -- Inverse Gamma
98
+ invgauss -- Inverse Gaussian
99
+ invweibull -- Inverse Weibull
100
+ jf_skew_t -- Jones and Faddy Skew-T
101
+ johnsonsb -- Johnson SB
102
+ johnsonsu -- Johnson SU
103
+ kappa4 -- Kappa 4 parameter
104
+ kappa3 -- Kappa 3 parameter
105
+ ksone -- Distribution of Kolmogorov-Smirnov one-sided test statistic
106
+ kstwo -- Distribution of Kolmogorov-Smirnov two-sided test statistic
107
+ kstwobign -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
108
+ laplace -- Laplace
109
+ laplace_asymmetric -- Asymmetric Laplace
110
+ levy -- Levy
111
+ levy_l
112
+ levy_stable
113
+ logistic -- Logistic
114
+ loggamma -- Log-Gamma
115
+ loglaplace -- Log-Laplace (Log Double Exponential)
116
+ lognorm -- Log-Normal
117
+ loguniform -- Log-Uniform
118
+ lomax -- Lomax (Pareto of the second kind)
119
+ maxwell -- Maxwell
120
+ mielke -- Mielke's Beta-Kappa
121
+ moyal -- Moyal
122
+ nakagami -- Nakagami
123
+ ncx2 -- Non-central chi-squared
124
+ ncf -- Non-central F
125
+ nct -- Non-central Student's T
126
+ norm -- Normal (Gaussian)
127
+ norminvgauss -- Normal Inverse Gaussian
128
+ pareto -- Pareto
129
+ pearson3 -- Pearson type III
130
+ powerlaw -- Power-function
131
+ powerlognorm -- Power log normal
132
+ powernorm -- Power normal
133
+ rdist -- R-distribution
134
+ rayleigh -- Rayleigh
135
+ rel_breitwigner -- Relativistic Breit-Wigner
136
+ rice -- Rice
137
+ recipinvgauss -- Reciprocal Inverse Gaussian
138
+ semicircular -- Semicircular
139
+ skewcauchy -- Skew Cauchy
140
+ skewnorm -- Skew normal
141
+ studentized_range -- Studentized Range
142
+ t -- Student's T
143
+ trapezoid -- Trapezoidal
144
+ triang -- Triangular
145
+ truncexpon -- Truncated Exponential
146
+ truncnorm -- Truncated Normal
147
+ truncpareto -- Truncated Pareto
148
+ truncweibull_min -- Truncated minimum Weibull distribution
149
+ tukeylambda -- Tukey-Lambda
150
+ uniform -- Uniform
151
+ vonmises -- Von-Mises (Circular)
152
+ vonmises_line -- Von-Mises (Line)
153
+ wald -- Wald
154
+ weibull_min -- Minimum Weibull (see Frechet)
155
+ weibull_max -- Maximum Weibull (see Frechet)
156
+ wrapcauchy -- Wrapped Cauchy
157
+
158
+ The ``fit`` method of the univariate continuous distributions uses
159
+ maximum likelihood estimation to fit the distribution to a data set.
160
+ The ``fit`` method can accept regular data or *censored data*.
161
+ Censored data is represented with instances of the `CensoredData`
162
+ class.
163
+
164
+ .. autosummary::
165
+ :toctree: generated/
166
+
167
+ CensoredData
168
+
169
+
170
+ Multivariate distributions
171
+ --------------------------
172
+
173
+ .. autosummary::
174
+ :toctree: generated/
175
+
176
+ multivariate_normal -- Multivariate normal distribution
177
+ matrix_normal -- Matrix normal distribution
178
+ dirichlet -- Dirichlet
179
+ dirichlet_multinomial -- Dirichlet multinomial distribution
180
+ wishart -- Wishart
181
+ invwishart -- Inverse Wishart
182
+ multinomial -- Multinomial distribution
183
+ special_ortho_group -- SO(N) group
184
+ ortho_group -- O(N) group
185
+ unitary_group -- U(N) group
186
+ random_correlation -- random correlation matrices
187
+ multivariate_t -- Multivariate t-distribution
188
+ multivariate_hypergeom -- Multivariate hypergeometric distribution
189
+ random_table -- Distribution of random tables with given marginals
190
+ uniform_direction -- Uniform distribution on S(N-1)
191
+ vonmises_fisher -- Von Mises-Fisher distribution
192
+
193
+ `scipy.stats.multivariate_normal` methods accept instances
194
+ of the following class to represent the covariance.
195
+
196
+ .. autosummary::
197
+ :toctree: generated/
198
+
199
+ Covariance -- Representation of a covariance matrix
200
+
201
+
202
+ Discrete distributions
203
+ ----------------------
204
+
205
+ .. autosummary::
206
+ :toctree: generated/
207
+
208
+ bernoulli -- Bernoulli
209
+ betabinom -- Beta-Binomial
210
+ betanbinom -- Beta-Negative Binomial
211
+ binom -- Binomial
212
+ boltzmann -- Boltzmann (Truncated Discrete Exponential)
213
+ dlaplace -- Discrete Laplacian
214
+ geom -- Geometric
215
+ hypergeom -- Hypergeometric
216
+ logser -- Logarithmic (Log-Series, Series)
217
+ nbinom -- Negative Binomial
218
+ nchypergeom_fisher -- Fisher's Noncentral Hypergeometric
219
+ nchypergeom_wallenius -- Wallenius's Noncentral Hypergeometric
220
+ nhypergeom -- Negative Hypergeometric
221
+ planck -- Planck (Discrete Exponential)
222
+ poisson -- Poisson
223
+ randint -- Discrete Uniform
224
+ skellam -- Skellam
225
+ yulesimon -- Yule-Simon
226
+ zipf -- Zipf (Zeta)
227
+ zipfian -- Zipfian
228
+
229
+
230
+ An overview of statistical functions is given below. Many of these functions
231
+ have a similar version in `scipy.stats.mstats` which works for masked arrays.
232
+
233
+ Summary statistics
234
+ ==================
235
+
236
+ .. autosummary::
237
+ :toctree: generated/
238
+
239
+ describe -- Descriptive statistics
240
+ gmean -- Geometric mean
241
+ hmean -- Harmonic mean
242
+ pmean -- Power mean
243
+ kurtosis -- Fisher or Pearson kurtosis
244
+ mode -- Modal value
245
+ moment -- Central moment
246
+ expectile -- Expectile
247
+ skew -- Skewness
248
+ kstat --
249
+ kstatvar --
250
+ tmean -- Truncated arithmetic mean
251
+ tvar -- Truncated variance
252
+ tmin --
253
+ tmax --
254
+ tstd --
255
+ tsem --
256
+ variation -- Coefficient of variation
257
+ find_repeats
258
+ rankdata
259
+ tiecorrect
260
+ trim_mean
261
+ gstd -- Geometric Standard Deviation
262
+ iqr
263
+ sem
264
+ bayes_mvs
265
+ mvsdist
266
+ entropy
267
+ differential_entropy
268
+ median_abs_deviation
269
+
270
+ Frequency statistics
271
+ ====================
272
+
273
+ .. autosummary::
274
+ :toctree: generated/
275
+
276
+ cumfreq
277
+ percentileofscore
278
+ scoreatpercentile
279
+ relfreq
280
+
281
+ .. autosummary::
282
+ :toctree: generated/
283
+
284
+ binned_statistic -- Compute a binned statistic for a set of data.
285
+ binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
286
+ binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
287
+
288
+ Hypothesis Tests and related functions
289
+ ======================================
290
+ SciPy has many functions for performing hypothesis tests that return a
291
+ test statistic and a p-value, and several of them return confidence intervals
292
+ and/or other related information.
293
+
294
+ The headings below are based on common uses of the functions within, but due to
295
+ the wide variety of statistical procedures, any attempt at coarse-grained
296
+ categorization will be imperfect. Also, note that tests within the same heading
297
+ are not interchangeable in general (e.g. many have different distributional
298
+ assumptions).
299
+
300
+ One Sample Tests / Paired Sample Tests
301
+ --------------------------------------
302
+ One sample tests are typically used to assess whether a single sample was
303
+ drawn from a specified distribution or a distribution with specified properties
304
+ (e.g. zero mean).
305
+
306
+ .. autosummary::
307
+ :toctree: generated/
308
+
309
+ ttest_1samp
310
+ binomtest
311
+ quantile_test
312
+ skewtest
313
+ kurtosistest
314
+ normaltest
315
+ jarque_bera
316
+ shapiro
317
+ anderson
318
+ cramervonmises
319
+ ks_1samp
320
+ goodness_of_fit
321
+ chisquare
322
+ power_divergence
323
+
324
+ Paired sample tests are often used to assess whether two samples were drawn
325
+ from the same distribution; they differ from the independent sample tests below
326
+ in that each observation in one sample is treated as paired with a
327
+ closely-related observation in the other sample (e.g. when environmental
328
+ factors are controlled between observations within a pair but not among pairs).
329
+ They can also be interpreted or used as one-sample tests (e.g. tests on the
330
+ mean or median of *differences* between paired observations).
331
+
332
+ .. autosummary::
333
+ :toctree: generated/
334
+
335
+ ttest_rel
336
+ wilcoxon
337
+
338
+ Association/Correlation Tests
339
+ -----------------------------
340
+
341
+ These tests are often used to assess whether there is a relationship (e.g.
342
+ linear) between paired observations in multiple samples or among the
343
+ coordinates of multivariate observations.
344
+
345
+ .. autosummary::
346
+ :toctree: generated/
347
+
348
+ linregress
349
+ pearsonr
350
+ spearmanr
351
+ pointbiserialr
352
+ kendalltau
353
+ weightedtau
354
+ somersd
355
+ siegelslopes
356
+ theilslopes
357
+ page_trend_test
358
+ multiscale_graphcorr
359
+
360
+ These association tests are designed to work with samples in the form of contingency
361
+ tables. Supporting functions are available in `scipy.stats.contingency`.
362
+
363
+ .. autosummary::
364
+ :toctree: generated/
365
+
366
+ chi2_contingency
367
+ fisher_exact
368
+ barnard_exact
369
+ boschloo_exact
370
+
371
+ Independent Sample Tests
372
+ ------------------------
373
+ Independent sample tests are typically used to assess whether multiple samples
374
+ were independently drawn from the same distribution or different distributions
375
+ with a shared property (e.g. equal means).
376
+
377
+ Some tests are specifically for comparing two samples.
378
+
379
+ .. autosummary::
380
+ :toctree: generated/
381
+
382
+ ttest_ind_from_stats
383
+ poisson_means_test
384
+ ttest_ind
385
+ mannwhitneyu
386
+ bws_test
387
+ ranksums
388
+ brunnermunzel
389
+ mood
390
+ ansari
391
+ cramervonmises_2samp
392
+ epps_singleton_2samp
393
+ ks_2samp
394
+ kstest
395
+
396
+ Others are generalized to multiple samples.
397
+
398
+ .. autosummary::
399
+ :toctree: generated/
400
+
401
+ f_oneway
402
+ tukey_hsd
403
+ dunnett
404
+ kruskal
405
+ alexandergovern
406
+ fligner
407
+ levene
408
+ bartlett
409
+ median_test
410
+ friedmanchisquare
411
+ anderson_ksamp
412
+
413
+ Resampling and Monte Carlo Methods
414
+ ----------------------------------
415
+ The following functions can reproduce the p-value and confidence interval
416
+ results of most of the functions above, and often produce accurate results in a
417
+ wider variety of conditions. They can also be used to perform hypothesis tests
418
+ and generate confidence intervals for custom statistics. This flexibility comes
419
+ at the cost of greater computational requirements and stochastic results.
420
+
421
+ .. autosummary::
422
+ :toctree: generated/
423
+
424
+ monte_carlo_test
425
+ permutation_test
426
+ bootstrap
427
+
428
+ Instances of the following object can be passed into some hypothesis test
429
+ functions to perform a resampling or Monte Carlo version of the hypothesis
430
+ test.
431
+
432
+ .. autosummary::
433
+ :toctree: generated/
434
+
435
+ MonteCarloMethod
436
+ PermutationMethod
437
+ BootstrapMethod
438
+
439
+ Multiple Hypothesis Testing and Meta-Analysis
440
+ ---------------------------------------------
441
+ These functions are for assessing the results of individual tests as a whole.
442
+ Functions for performing specific multiple hypothesis tests (e.g. post hoc
443
+ tests) are listed above.
444
+
445
+ .. autosummary::
446
+ :toctree: generated/
447
+
448
+ combine_pvalues
449
+ false_discovery_control
450
+
451
+
452
+ The following functions are related to the tests above but do not belong in the
453
+ above categories.
454
+
455
+ Quasi-Monte Carlo
456
+ =================
457
+
458
+ .. toctree::
459
+ :maxdepth: 4
460
+
461
+ stats.qmc
462
+
463
+ Contingency Tables
464
+ ==================
465
+
466
+ .. toctree::
467
+ :maxdepth: 4
468
+
469
+ stats.contingency
470
+
471
+ Masked statistics functions
472
+ ===========================
473
+
474
+ .. toctree::
475
+
476
+ stats.mstats
477
+
478
+
479
+ Other statistical functionality
480
+ ===============================
481
+
482
+ Transformations
483
+ ---------------
484
+
485
+ .. autosummary::
486
+ :toctree: generated/
487
+
488
+ boxcox
489
+ boxcox_normmax
490
+ boxcox_llf
491
+ yeojohnson
492
+ yeojohnson_normmax
493
+ yeojohnson_llf
494
+ obrientransform
495
+ sigmaclip
496
+ trimboth
497
+ trim1
498
+ zmap
499
+ zscore
500
+ gzscore
501
+
502
+ Statistical distances
503
+ ---------------------
504
+
505
+ .. autosummary::
506
+ :toctree: generated/
507
+
508
+ wasserstein_distance
509
+ wasserstein_distance_nd
510
+ energy_distance
511
+
512
+ Sampling
513
+ --------
514
+
515
+ .. toctree::
516
+ :maxdepth: 4
517
+
518
+ stats.sampling
519
+
520
+ Random variate generation / CDF Inversion
521
+ -----------------------------------------
522
+
523
+ .. autosummary::
524
+ :toctree: generated/
525
+
526
+ rvs_ratio_uniforms
527
+
528
+ Fitting / Survival Analysis
529
+ ---------------------------
530
+
531
+ .. autosummary::
532
+ :toctree: generated/
533
+
534
+ fit
535
+ ecdf
536
+ logrank
537
+
538
+ Directional statistical functions
539
+ ---------------------------------
540
+
541
+ .. autosummary::
542
+ :toctree: generated/
543
+
544
+ directional_stats
545
+ circmean
546
+ circvar
547
+ circstd
548
+
549
+ Sensitivity Analysis
550
+ --------------------
551
+
552
+ .. autosummary::
553
+ :toctree: generated/
554
+
555
+ sobol_indices
556
+
557
+ Plot-tests
558
+ ----------
559
+
560
+ .. autosummary::
561
+ :toctree: generated/
562
+
563
+ ppcc_max
564
+ ppcc_plot
565
+ probplot
566
+ boxcox_normplot
567
+ yeojohnson_normplot
568
+
569
+ Univariate and multivariate kernel density estimation
570
+ -----------------------------------------------------
571
+
572
+ .. autosummary::
573
+ :toctree: generated/
574
+
575
+ gaussian_kde
576
+
577
+ Warnings / Errors used in :mod:`scipy.stats`
578
+ --------------------------------------------
579
+
580
+ .. autosummary::
581
+ :toctree: generated/
582
+
583
+ DegenerateDataWarning
584
+ ConstantInputWarning
585
+ NearConstantInputWarning
586
+ FitError
587
+
588
+ Result classes used in :mod:`scipy.stats`
589
+ -----------------------------------------
590
+
591
+ .. warning::
592
+
593
+ These classes are private, but they are included here because instances
594
+ of them are returned by other statistical functions. User import and
595
+ instantiation is not supported.
596
+
597
+ .. toctree::
598
+ :maxdepth: 2
599
+
600
+ stats._result_classes
601
+
602
+ """ # noqa: E501
603
+
604
+ from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
605
+ DegenerateDataWarning, FitError)
606
+ from ._stats_py import *
607
+ from ._variation import variation
608
+ from .distributions import *
609
+ from ._morestats import *
610
+ from ._multicomp import *
611
+ from ._binomtest import binomtest
612
+ from ._binned_statistic import *
613
+ from ._kde import gaussian_kde
614
+ from . import mstats
615
+ from . import qmc
616
+ from ._multivariate import *
617
+ from . import contingency
618
+ from .contingency import chi2_contingency
619
+ from ._censored_data import CensoredData
620
+ from ._resampling import (bootstrap, monte_carlo_test, permutation_test,
621
+ MonteCarloMethod, PermutationMethod, BootstrapMethod)
622
+ from ._entropy import *
623
+ from ._hypotests import *
624
+ from ._rvs_sampling import rvs_ratio_uniforms
625
+ from ._page_trend_test import page_trend_test
626
+ from ._mannwhitneyu import mannwhitneyu
627
+ from ._bws_test import bws_test
628
+ from ._fit import fit, goodness_of_fit
629
+ from ._covariance import Covariance
630
+ from ._sensitivity_analysis import *
631
+ from ._survival import *
632
+
633
+ # Deprecated namespaces, to be removed in v2.0.0
634
+ from . import (
635
+ biasedurn, kde, morestats, mstats_basic, mstats_extras, mvn, stats
636
+ )
637
+
638
+
639
+ __all__ = [s for s in dir() if not s.startswith("_")] # Remove dunders.
640
+
641
+ from scipy._lib._testutils import PytestTester
642
+ test = PytestTester(__name__)
643
+ del PytestTester
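
For orientation only (not part of the committed file): the docstring above catalogues the public `scipy.stats` API. A minimal usage sketch of a few of the listed entry points, assuming SciPy and NumPy are installed:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
data = stats.norm.rvs(loc=2.0, scale=0.5, size=500, random_state=rng)

# Maximum likelihood fit of a continuous distribution (the `fit` method
# described in the "Continuous distributions" section above).
loc, scale = stats.norm.fit(data)

# A one-sample hypothesis test returning a statistic and a p-value.
res = stats.ttest_1samp(data, popmean=2.0)
print(loc, scale, res.statistic, res.pvalue)
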
.venv/Lib/site-packages/scipy/stats/_ansari_swilk_statistics.cp39-win_amd64.dll.a ADDED
Binary file (1.74 kB).
 
.venv/Lib/site-packages/scipy/stats/_ansari_swilk_statistics.cp39-win_amd64.pyd ADDED
Binary file (259 kB).
 
.venv/Lib/site-packages/scipy/stats/_axis_nan_policy.py ADDED
@@ -0,0 +1,642 @@
1
+ # Many scipy.stats functions support `axis` and `nan_policy` parameters.
2
+ # When the two are combined, it can be tricky to get all the behavior just
3
+ # right. This file contains utility functions useful for scipy.stats functions
4
+ # that support `axis` and `nan_policy`, including a decorator that
5
+ # automatically adds `axis` and `nan_policy` arguments to a function.
6
+
7
+ import numpy as np
8
+ from functools import wraps
9
+ from scipy._lib._docscrape import FunctionDoc, Parameter
10
+ from scipy._lib._util import _contains_nan, AxisError, _get_nan
11
+ import inspect
12
+
13
+
14
+ def _broadcast_arrays(arrays, axis=None):
15
+ """
16
+ Broadcast shapes of arrays, ignoring incompatibility of specified axes
17
+ """
18
+ new_shapes = _broadcast_array_shapes(arrays, axis=axis)
19
+ if axis is None:
20
+ new_shapes = [new_shapes]*len(arrays)
21
+ return [np.broadcast_to(array, new_shape)
22
+ for array, new_shape in zip(arrays, new_shapes)]
23
+
24
+
25
+ def _broadcast_array_shapes(arrays, axis=None):
26
+ """
27
+ Broadcast shapes of arrays, ignoring incompatibility of specified axes
28
+ """
29
+ shapes = [np.asarray(arr).shape for arr in arrays]
30
+ return _broadcast_shapes(shapes, axis)
31
+
32
+
33
+ def _broadcast_shapes(shapes, axis=None):
34
+ """
35
+ Broadcast shapes, ignoring incompatibility of specified axes
36
+ """
37
+ if not shapes:
38
+ return shapes
39
+
40
+ # input validation
41
+ if axis is not None:
42
+ axis = np.atleast_1d(axis)
43
+ axis_int = axis.astype(int)
44
+ if not np.array_equal(axis_int, axis):
45
+ raise AxisError('`axis` must be an integer, a '
46
+ 'tuple of integers, or `None`.')
47
+ axis = axis_int
48
+
49
+ # First, ensure all shapes have same number of dimensions by prepending 1s.
50
+ n_dims = max([len(shape) for shape in shapes])
51
+ new_shapes = np.ones((len(shapes), n_dims), dtype=int)
52
+ for row, shape in zip(new_shapes, shapes):
53
+ row[len(row)-len(shape):] = shape # can't use negative indices (-0:)
54
+
55
+ # Remove the shape elements of the axes to be ignored, but remember them.
56
+ if axis is not None:
57
+ axis[axis < 0] = n_dims + axis[axis < 0]
58
+ axis = np.sort(axis)
59
+ if axis[-1] >= n_dims or axis[0] < 0:
60
+ message = (f"`axis` is out of bounds "
61
+ f"for array of dimension {n_dims}")
62
+ raise AxisError(message)
63
+
64
+ if len(np.unique(axis)) != len(axis):
65
+ raise AxisError("`axis` must contain only distinct elements")
66
+
67
+ removed_shapes = new_shapes[:, axis]
68
+ new_shapes = np.delete(new_shapes, axis, axis=1)
69
+
70
+ # If arrays are broadcastable, shape elements that are 1 may be replaced
71
+ # with a corresponding non-1 shape element. Assuming arrays are
72
+ # broadcastable, that final shape element can be found with:
73
+ new_shape = np.max(new_shapes, axis=0)
74
+ # except in case of an empty array:
75
+ new_shape *= new_shapes.all(axis=0)
76
+
77
+ # Among all arrays, there can only be one unique non-1 shape element.
78
+ # Therefore, if any non-1 shape element does not match what we found
79
+ # above, the arrays must not be broadcastable after all.
80
+ if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
81
+ raise ValueError("Array shapes are incompatible for broadcasting.")
82
+
83
+ if axis is not None:
84
+ # Add back the shape elements that were ignored
85
+ new_axis = axis - np.arange(len(axis))
86
+ new_shapes = [tuple(np.insert(new_shape, new_axis, removed_shape))
87
+ for removed_shape in removed_shapes]
88
+ return new_shapes
89
+ else:
90
+ return tuple(new_shape)
91
+
92
+
93
+ def _broadcast_array_shapes_remove_axis(arrays, axis=None):
94
+ """
95
+ Broadcast shapes of arrays, dropping specified axes
96
+
97
+ Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
98
+ the shape of the broadcast result after consuming/dropping `axis`.
99
+ In other words, return output shape of a typical hypothesis test on
100
+ `arrays` vectorized along `axis`.
101
+
102
+ Examples
103
+ --------
104
+ >>> import numpy as np
105
+ >>> from scipy.stats._axis_nan_policy import _broadcast_array_shapes_remove_axis
106
+ >>> a = np.zeros((5, 2, 1))
107
+ >>> b = np.zeros((9, 3))
108
+ >>> _broadcast_array_shapes_remove_axis((a, b), 1)
109
+ (5, 3)
110
+ """
111
+ # Note that here, `axis=None` means do not consume/drop any axes - _not_
112
+ # ravel arrays before broadcasting.
113
+ shapes = [arr.shape for arr in arrays]
114
+ return _broadcast_shapes_remove_axis(shapes, axis)
115
+
116
+
117
+ def _broadcast_shapes_remove_axis(shapes, axis=None):
118
+ """
119
+ Broadcast shapes, dropping specified axes
120
+
121
+ Same as _broadcast_array_shapes, but given a sequence
122
+ of array shapes `shapes` instead of the arrays themselves.
123
+ """
124
+ shapes = _broadcast_shapes(shapes, axis)
125
+ shape = shapes[0]
126
+ if axis is not None:
127
+ shape = np.delete(shape, axis)
128
+ return tuple(shape)
129
+
130
+
131
+ def _broadcast_concatenate(arrays, axis, paired=False):
132
+ """Concatenate arrays along an axis with broadcasting."""
133
+ arrays = _broadcast_arrays(arrays, axis if not paired else None)
134
+ res = np.concatenate(arrays, axis=axis)
135
+ return res
136
+
137
+
138
+ # TODO: add support for `axis` tuples
139
+ def _remove_nans(samples, paired):
140
+ "Remove nans from paired or unpaired 1D samples"
141
+ # potential optimization: don't copy arrays that don't contain nans
142
+ if not paired:
143
+ return [sample[~np.isnan(sample)] for sample in samples]
144
+
145
+ # for paired samples, we need to remove the whole pair when any part
146
+ # has a nan
147
+ nans = np.isnan(samples[0])
148
+ for sample in samples[1:]:
149
+ nans = nans | np.isnan(sample)
150
+ not_nans = ~nans
151
+ return [sample[not_nans] for sample in samples]
152
+
153
+
154
+ def _remove_sentinel(samples, paired, sentinel):
155
+ "Remove sentinel values from paired or unpaired 1D samples"
156
+ # could consolidate with `_remove_nans`, but it's not quite as simple as
157
+ # passing `sentinel=np.nan` because `(np.nan == np.nan) is False`
158
+
159
+ # potential optimization: don't copy arrays that don't contain sentinel
160
+ if not paired:
161
+ return [sample[sample != sentinel] for sample in samples]
162
+
163
+ # for paired samples, we need to remove the whole pair when any part
164
+ # has a nan
165
+ sentinels = (samples[0] == sentinel)
166
+ for sample in samples[1:]:
167
+ sentinels = sentinels | (sample == sentinel)
168
+ not_sentinels = ~sentinels
169
+ return [sample[not_sentinels] for sample in samples]
170
+
171
+
172
+ def _masked_arrays_2_sentinel_arrays(samples):
173
+ # masked arrays in `samples` are converted to regular arrays, and values
174
+ # corresponding with masked elements are replaced with a sentinel value
175
+
176
+ # return without modifying arrays if none have a mask
177
+ has_mask = False
178
+ for sample in samples:
179
+ mask = getattr(sample, 'mask', False)
180
+ has_mask = has_mask or np.any(mask)
181
+ if not has_mask:
182
+ return samples, None # None means there is no sentinel value
183
+
184
+ # Choose a sentinel value. We can't use `np.nan`, because sentinel (masked)
185
+ # values are always omitted, but there are different nan policies.
186
+ dtype = np.result_type(*samples)
187
+ dtype = dtype if np.issubdtype(dtype, np.number) else np.float64
188
+ for i in range(len(samples)):
189
+ # Things get more complicated if the arrays are of different types.
190
+ # We could have different sentinel values for each array, but
191
+ # the purpose of this code is convenience, not efficiency.
192
+ samples[i] = samples[i].astype(dtype, copy=False)
193
+
194
+ inexact = np.issubdtype(dtype, np.inexact)
195
+ info = np.finfo if inexact else np.iinfo
196
+ max_possible, min_possible = info(dtype).max, info(dtype).min
197
+ nextafter = np.nextafter if inexact else (lambda x, _: x - 1)
198
+
199
+ sentinel = max_possible
200
+ # For simplicity, min_possible/np.infs are not candidate sentinel values
201
+ while sentinel > min_possible:
202
+ for sample in samples:
203
+ if np.any(sample == sentinel): # choose a new sentinel value
204
+ sentinel = nextafter(sentinel, -np.inf)
205
+ break
206
+ else: # when sentinel value is OK, break the while loop
207
+ break
208
+ else:
209
+ message = ("This function replaces masked elements with sentinel "
210
+ "values, but the data contains all distinct values of this "
211
+ "data type. Consider promoting the dtype to `np.float64`.")
212
+ raise ValueError(message)
213
+
214
+ # replace masked elements with sentinel value
215
+ out_samples = []
216
+ for sample in samples:
217
+ mask = getattr(sample, 'mask', None)
218
+ if mask is not None: # turn all masked arrays into sentinel arrays
219
+ mask = np.broadcast_to(mask, sample.shape)
220
+ sample = sample.data.copy() if np.any(mask) else sample.data
221
+ sample = np.asarray(sample) # `sample.data` could be a memoryview?
222
+ sample[mask] = sentinel
223
+ out_samples.append(sample)
224
+
225
+ return out_samples, sentinel
226
+
227
+
228
+ def _check_empty_inputs(samples, axis):
229
+ """
230
+ Check for empty sample; return appropriate output for a vectorized hypotest
231
+ """
232
+ # if none of the samples are empty, we need to perform the test
233
+ if not any(sample.size == 0 for sample in samples):
234
+ return None
235
+ # otherwise, the statistic and p-value will be either empty arrays or
236
+ # arrays with NaNs. Produce the appropriate array and return it.
237
+ output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
238
+ output = np.ones(output_shape) * _get_nan(*samples)
239
+ return output
240
+
241
+
242
+ def _add_reduced_axes(res, reduced_axes, keepdims):
243
+ """
244
+ Add reduced axes back to all the arrays in the result object
245
+ if keepdims = True.
246
+ """
247
+ return ([np.expand_dims(output, reduced_axes) for output in res]
248
+ if keepdims else res)
249
+
250
+
251
+ # Standard docstring / signature entries for `axis`, `nan_policy`, `keepdims`
252
+ _name = 'axis'
253
+ _desc = (
254
+ """If an int, the axis of the input along which to compute the statistic.
255
+ The statistic of each axis-slice (e.g. row) of the input will appear in a
256
+ corresponding element of the output.
257
+ If ``None``, the input will be raveled before computing the statistic."""
258
+ .split('\n'))
259
+
260
+
261
+ def _get_axis_params(default_axis=0, _name=_name, _desc=_desc): # bind NOW
262
+ _type = f"int or None, default: {default_axis}"
263
+ _axis_parameter_doc = Parameter(_name, _type, _desc)
264
+ _axis_parameter = inspect.Parameter(_name,
265
+ inspect.Parameter.KEYWORD_ONLY,
266
+ default=default_axis)
267
+ return _axis_parameter_doc, _axis_parameter
268
+
269
+
270
+ _name = 'nan_policy'
271
+ _type = "{'propagate', 'omit', 'raise'}"
272
+ _desc = (
273
+ """Defines how to handle input NaNs.
274
+
275
+ - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
276
+ which the statistic is computed, the corresponding entry of the output
277
+ will be NaN.
278
+ - ``omit``: NaNs will be omitted when performing the calculation.
279
+ If insufficient data remains in the axis slice along which the
280
+ statistic is computed, the corresponding entry of the output will be
281
+ NaN.
282
+ - ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
283
+ .split('\n'))
284
+ _nan_policy_parameter_doc = Parameter(_name, _type, _desc)
285
+ _nan_policy_parameter = inspect.Parameter(_name,
286
+ inspect.Parameter.KEYWORD_ONLY,
287
+ default='propagate')
288
+
289
+ _name = 'keepdims'
290
+ _type = "bool, default: False"
291
+ _desc = (
292
+ """If this is set to True, the axes which are reduced are left
293
+ in the result as dimensions with size one. With this option,
294
+ the result will broadcast correctly against the input array."""
295
+ .split('\n'))
296
+ _keepdims_parameter_doc = Parameter(_name, _type, _desc)
297
+ _keepdims_parameter = inspect.Parameter(_name,
298
+ inspect.Parameter.KEYWORD_ONLY,
299
+ default=False)
300
+
301
+ _standard_note_addition = (
302
+ """\nBeginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
303
+ code) are converted to ``np.ndarray`` before the calculation is performed. In
304
+ this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
305
+ rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
306
+ arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
307
+ masked array with ``mask=False``.""").split('\n')
308
+
309
+
310
+ def _axis_nan_policy_factory(tuple_to_result, default_axis=0,
311
+ n_samples=1, paired=False,
312
+ result_to_tuple=None, too_small=0,
313
+ n_outputs=2, kwd_samples=[], override=None):
314
+ """Factory for a wrapper that adds axis/nan_policy params to a function.
315
+
316
+ Parameters
317
+ ----------
318
+ tuple_to_result : callable
319
+ Callable that returns an object of the type returned by the function
320
+ being wrapped (e.g. the namedtuple or dataclass returned by a
321
+ statistical test) provided the separate components (e.g. statistic,
322
+ pvalue).
323
+ default_axis : int, default: 0
324
+ The default value of the axis argument. Standard is 0 except when
325
+ backwards compatibility demands otherwise (e.g. `None`).
326
+ n_samples : int or callable, default: 1
327
+ The number of data samples accepted by the function
328
+ (e.g. `mannwhitneyu`), a callable that accepts a dictionary of
329
+ parameters passed into the function and returns the number of data
330
+ samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
331
+ of samples (e.g. `kruskal`).
332
+ paired : {False, True}
333
+ Whether the function being wrapped treats the samples as paired (i.e.
334
+ corresponding elements of each sample should be considered as different
335
+ components of the same sample.)
336
+ result_to_tuple : callable, optional
337
+ Function that unpacks the results of the function being wrapped into
338
+ a tuple. This is essentially the inverse of `tuple_to_result`. Default
339
+ is `None`, which is appropriate for statistical tests that return a
340
+ statistic, pvalue tuple (rather than, e.g., a non-iterable dataclass).
341
+ too_small : int or callable, default: 0
342
+ The largest unacceptably small sample for the function being wrapped.
343
+ For example, some functions require samples of size two or more or they
344
+ raise an error. This argument prevents the error from being raised when
345
+ input is not 1D and instead places a NaN in the corresponding element
346
+ of the result. If callable, it must accept a list of samples, axis,
347
+ and a dictionary of keyword arguments passed to the wrapper function as
348
+ arguments and return a bool indicating whether the samples passed are
349
+ too small.
350
+ n_outputs : int or callable, default: 2
351
+ The number of outputs produced by the function given 1d sample(s). For
352
+ example, hypothesis tests that return a namedtuple or result object
353
+ with attributes ``statistic`` and ``pvalue`` use the default
354
+ ``n_outputs=2``; summary statistics with scalar output use
355
+ ``n_outputs=1``. Alternatively, may be a callable that accepts a
356
+ dictionary of arguments passed into the wrapped function and returns
357
+ the number of outputs corresponding with those arguments.
358
+ kwd_samples : sequence, default: []
359
+ The names of keyword parameters that should be treated as samples. For
360
+ example, `gmean` accepts as its first argument a sample `a` but
361
+ also `weights` as a fourth, optional keyword argument. In this case, we
362
+ use `n_samples=1` and kwd_samples=['weights'].
363
+ override : dict, default: {'vectorization': False, 'nan_propagation': True}
364
+ Pass a dictionary with ``'vectorization': True`` to ensure that the
365
+ decorator overrides the function's behavior for multidimensional input.
366
+ Use ``'nan_propagation': False`` to ensure that the decorator does not
367
+ override the function's behavior for ``nan_policy='propagate'``.
368
+ (See `scipy.stats.mode`, for example.)
369
+ """
370
+ # Specify which existing behaviors the decorator must override
371
+ temp = override or {}
372
+ override = {'vectorization': False,
373
+ 'nan_propagation': True}
374
+ override.update(temp)
375
+
376
+ if result_to_tuple is None:
377
+ def result_to_tuple(res):
378
+ return res
379
+
380
+ if not callable(too_small):
381
+ def is_too_small(samples, *ts_args, axis=-1, **ts_kwargs):
382
+ for sample in samples:
383
+ if sample.shape[axis] <= too_small:
384
+ return True
385
+ return False
386
+ else:
387
+ is_too_small = too_small
388
+
389
+ def axis_nan_policy_decorator(hypotest_fun_in):
390
+ @wraps(hypotest_fun_in)
391
+ def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):
392
+
393
+ if _no_deco: # for testing, decorator does nothing
394
+ return hypotest_fun_in(*args, **kwds)
395
+
396
+ # We need to be flexible about whether position or keyword
397
+ # arguments are used, but we need to make sure users don't pass
398
+ # both for the same parameter. To complicate matters, some
399
+ # functions accept samples with *args, and some functions already
400
+ # accept `axis` and `nan_policy` as positional arguments.
401
+ # The strategy is to make sure that there is no duplication
402
+ # between `args` and `kwds`, combine the two into `kwds`, then extract
403
+ # the samples, `nan_policy`, and `axis` from `kwds`, as they are
404
+ # dealt with separately.
405
+
406
+ # Check for intersection between positional and keyword args
407
+ params = list(inspect.signature(hypotest_fun_in).parameters)
408
+ if n_samples is None:
409
+ # Give unique names to each positional sample argument
410
+ # Note that *args can't be provided as a keyword argument
411
+ params = [f"arg{i}" for i in range(len(args))] + params[1:]
412
+
413
+ # raise if there are too many positional args
414
+ maxarg = (np.inf if inspect.getfullargspec(hypotest_fun_in).varargs
415
+ else len(inspect.getfullargspec(hypotest_fun_in).args))
416
+ if len(args) > maxarg: # let the function raise the right error
417
+ hypotest_fun_in(*args, **kwds)
418
+
419
+ # raise if multiple values passed for same parameter
420
+ d_args = dict(zip(params, args))
421
+ intersection = set(d_args) & set(kwds)
422
+ if intersection: # let the function raise the right error
423
+ hypotest_fun_in(*args, **kwds)
424
+
425
+ # Consolidate other positional and keyword args into `kwds`
426
+ kwds.update(d_args)
427
+
428
+ # rename avoids UnboundLocalError
429
+ if callable(n_samples):
430
+ # Future refactoring idea: no need for callable n_samples.
431
+ # Just replace `n_samples` and `kwd_samples` with a single
432
+ # list of the names of all samples, and treat all of them
433
+ # as `kwd_samples` are treated below.
434
+ n_samp = n_samples(kwds)
435
+ else:
436
+ n_samp = n_samples or len(args)
437
+
438
+ # get the number of outputs
439
+ n_out = n_outputs # rename to avoid UnboundLocalError
440
+ if callable(n_out):
441
+ n_out = n_out(kwds)
442
+
443
+ # If necessary, rearrange function signature: accept other samples
444
+ # as positional args right after the first n_samp args
445
+ kwd_samp = [name for name in kwd_samples
446
+ if kwds.get(name, None) is not None]
447
+ n_kwd_samp = len(kwd_samp)
448
+ if not kwd_samp:
449
+ hypotest_fun_out = hypotest_fun_in
450
+ else:
451
+ def hypotest_fun_out(*samples, **kwds):
452
+ new_kwds = dict(zip(kwd_samp, samples[n_samp:]))
453
+ kwds.update(new_kwds)
454
+ return hypotest_fun_in(*samples[:n_samp], **kwds)
455
+
456
+ # Extract the things we need here
457
+ try: # if something is missing
458
+ samples = [np.atleast_1d(kwds.pop(param))
459
+ for param in (params[:n_samp] + kwd_samp)]
460
+ except KeyError: # let the function raise the right error
461
+ # might need to revisit this if required arg is not a "sample"
462
+ hypotest_fun_in(*args, **kwds)
463
+ vectorized = True if 'axis' in params else False
464
+ vectorized = vectorized and not override['vectorization']
465
+ axis = kwds.pop('axis', default_axis)
466
+ nan_policy = kwds.pop('nan_policy', 'propagate')
467
+ keepdims = kwds.pop("keepdims", False)
468
+ del args # avoid the possibility of passing both `args` and `kwds`
469
+
470
+ # convert masked arrays to regular arrays with sentinel values
471
+ samples, sentinel = _masked_arrays_2_sentinel_arrays(samples)
472
+
473
+ # standardize to always work along last axis
474
+ reduced_axes = axis
475
+ if axis is None:
476
+ if samples:
477
+ # when axis=None, take the maximum of all dimensions since
478
+ # all the dimensions are reduced.
479
+ n_dims = np.max([sample.ndim for sample in samples])
480
+ reduced_axes = tuple(range(n_dims))
481
+ samples = [np.asarray(sample.ravel()) for sample in samples]
482
+ else:
483
+ samples = _broadcast_arrays(samples, axis=axis)
484
+ axis = np.atleast_1d(axis)
485
+ n_axes = len(axis)
486
+ # move all axes in `axis` to the end to be raveled
487
+ samples = [np.moveaxis(sample, axis, range(-len(axis), 0))
488
+ for sample in samples]
489
+ shapes = [sample.shape for sample in samples]
490
+ # New shape is unchanged for all axes _not_ in `axis`
491
+ # At the end, we append the product of the shapes of the axes
492
+ # in `axis`. Appending -1 doesn't work for zero-size arrays!
493
+ new_shapes = [shape[:-n_axes] + (np.prod(shape[-n_axes:]),)
494
+ for shape in shapes]
495
+ samples = [sample.reshape(new_shape)
496
+ for sample, new_shape in zip(samples, new_shapes)]
497
+ axis = -1 # work over the last axis
498
+ NaN = _get_nan(*samples)
499
+
500
+ # if axis is not needed, just handle nan_policy and return
501
+ ndims = np.array([sample.ndim for sample in samples])
502
+ if np.all(ndims <= 1):
503
+ # Addresses nan_policy == "raise"
504
+ if nan_policy != 'propagate' or override['nan_propagation']:
505
+ contains_nan = [_contains_nan(sample, nan_policy)[0]
506
+ for sample in samples]
507
+ else:
508
+ # Behave as though there are no NaNs (even if there are)
509
+ contains_nan = [False]*len(samples)
510
+
511
+ # Addresses nan_policy == "propagate"
512
+ if any(contains_nan) and (nan_policy == 'propagate'
513
+ and override['nan_propagation']):
514
+ res = np.full(n_out, NaN)
515
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
516
+ return tuple_to_result(*res)
517
+
518
+ # Addresses nan_policy == "omit"
519
+ if any(contains_nan) and nan_policy == 'omit':
520
+ # consider passing in contains_nan
521
+ samples = _remove_nans(samples, paired)
522
+
523
+ # ideally, this is what the behavior would be:
524
+ # if is_too_small(samples):
525
+ # return tuple_to_result(NaN, NaN)
526
+ # but some existing functions raise exceptions, and changing
527
+ # behavior of those would break backward compatibility.
528
+
529
+ if sentinel:
530
+ samples = _remove_sentinel(samples, paired, sentinel)
531
+ res = hypotest_fun_out(*samples, **kwds)
532
+ res = result_to_tuple(res)
533
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
534
+ return tuple_to_result(*res)
535
+
536
+ # check for empty input
537
+ # ideally, move this to the top, but some existing functions raise
538
+ # exceptions for empty input, so overriding it would break
539
+ # backward compatibility.
540
+ empty_output = _check_empty_inputs(samples, axis)
541
+ # only return empty output if zero sized input is too small.
542
+ if (
543
+ empty_output is not None
544
+ and (is_too_small(samples, kwds) or empty_output.size == 0)
545
+ ):
546
+ res = [empty_output.copy() for i in range(n_out)]
547
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
548
+ return tuple_to_result(*res)
549
+
550
+ # otherwise, concatenate all samples along axis, remembering where
551
+ # each separate sample begins
552
+ lengths = np.array([sample.shape[axis] for sample in samples])
553
+ split_indices = np.cumsum(lengths)
554
+ x = _broadcast_concatenate(samples, axis)
555
+
556
+ # Addresses nan_policy == "raise"
557
+ if nan_policy != 'propagate' or override['nan_propagation']:
558
+ contains_nan, _ = _contains_nan(x, nan_policy)
559
+ else:
560
+ contains_nan = False # behave like there are no NaNs
561
+
562
+ if vectorized and not contains_nan and not sentinel:
563
+ res = hypotest_fun_out(*samples, axis=axis, **kwds)
564
+ res = result_to_tuple(res)
565
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
566
+ return tuple_to_result(*res)
567
+
568
+ # Addresses nan_policy == "omit"
569
+ if contains_nan and nan_policy == 'omit':
570
+ def hypotest_fun(x):
571
+ samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
572
+ samples = _remove_nans(samples, paired)
573
+ if sentinel:
574
+ samples = _remove_sentinel(samples, paired, sentinel)
575
+ if is_too_small(samples, kwds):
576
+ return np.full(n_out, NaN)
577
+ return result_to_tuple(hypotest_fun_out(*samples, **kwds))
578
+
579
+ # Addresses nan_policy == "propagate"
580
+ elif (contains_nan and nan_policy == 'propagate'
581
+ and override['nan_propagation']):
582
+ def hypotest_fun(x):
583
+ if np.isnan(x).any():
584
+ return np.full(n_out, NaN)
585
+
586
+ samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
587
+ if sentinel:
588
+ samples = _remove_sentinel(samples, paired, sentinel)
589
+ if is_too_small(samples, kwds):
590
+ return np.full(n_out, NaN)
591
+ return result_to_tuple(hypotest_fun_out(*samples, **kwds))
592
+
593
+ else:
594
+ def hypotest_fun(x):
595
+ samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
596
+ if sentinel:
597
+ samples = _remove_sentinel(samples, paired, sentinel)
598
+ if is_too_small(samples, kwds):
599
+ return np.full(n_out, NaN)
600
+ return result_to_tuple(hypotest_fun_out(*samples, **kwds))
601
+
602
+ x = np.moveaxis(x, axis, 0)
603
+ res = np.apply_along_axis(hypotest_fun, axis=0, arr=x)
604
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
605
+ return tuple_to_result(*res)
606
+
607
+ _axis_parameter_doc, _axis_parameter = _get_axis_params(default_axis)
608
+ doc = FunctionDoc(axis_nan_policy_wrapper)
609
+ parameter_names = [param.name for param in doc['Parameters']]
610
+ if 'axis' in parameter_names:
611
+ doc['Parameters'][parameter_names.index('axis')] = (
612
+ _axis_parameter_doc)
613
+ else:
614
+ doc['Parameters'].append(_axis_parameter_doc)
615
+ if 'nan_policy' in parameter_names:
616
+ doc['Parameters'][parameter_names.index('nan_policy')] = (
617
+ _nan_policy_parameter_doc)
618
+ else:
619
+ doc['Parameters'].append(_nan_policy_parameter_doc)
620
+ if 'keepdims' in parameter_names:
621
+ doc['Parameters'][parameter_names.index('keepdims')] = (
622
+ _keepdims_parameter_doc)
623
+ else:
624
+ doc['Parameters'].append(_keepdims_parameter_doc)
625
+ doc['Notes'] += _standard_note_addition
626
+ doc = str(doc).split("\n", 1)[1] # remove signature
627
+ axis_nan_policy_wrapper.__doc__ = str(doc)
628
+
629
+ sig = inspect.signature(axis_nan_policy_wrapper)
630
+ parameters = sig.parameters
631
+ parameter_list = list(parameters.values())
632
+ if 'axis' not in parameters:
633
+ parameter_list.append(_axis_parameter)
634
+ if 'nan_policy' not in parameters:
635
+ parameter_list.append(_nan_policy_parameter)
636
+ if 'keepdims' not in parameters:
637
+ parameter_list.append(_keepdims_parameter)
638
+ sig = sig.replace(parameters=parameter_list)
639
+ axis_nan_policy_wrapper.__signature__ = sig
640
+
641
+ return axis_nan_policy_wrapper
642
+ return axis_nan_policy_decorator
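
Illustrative sketch (not part of the committed file): how the private `_axis_nan_policy_factory` defined above is typically applied so that a plain 1-D statistic gains `axis`, `nan_policy`, and `keepdims` handling. This is internal SciPy machinery, so treat the example as an assumption about usage rather than a supported API:

import numpy as np
from scipy.stats._axis_nan_policy import _axis_nan_policy_factory

@_axis_nan_policy_factory(
    lambda x: x,                     # tuple_to_result: rebuild the output from its components
    result_to_tuple=lambda x: (x,),  # unpack the scalar result into a tuple
    n_outputs=1,                     # a summary statistic returns a single value
    n_samples=1,
)
def peak_to_peak(sample):
    """Range of a 1-D sample; the decorator adds axis/nan_policy/keepdims."""
    return np.max(sample) - np.min(sample)

x = np.array([[1.0, 2.0, np.nan],
              [4.0, 5.0, 6.0]])
print(peak_to_peak(x, axis=1, nan_policy='omit'))  # NaNs dropped per row-slice
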
.venv/Lib/site-packages/scipy/stats/_biasedurn.cp39-win_amd64.dll.a ADDED
Binary file (1.57 kB).
 
.venv/Lib/site-packages/scipy/stats/_biasedurn.cp39-win_amd64.pyd ADDED
Binary file (399 kB).
 
.venv/Lib/site-packages/scipy/stats/_biasedurn.pxd ADDED
@@ -0,0 +1,27 @@
1
+ # Declare the class with cdef
2
+ cdef extern from "biasedurn/stocc.h" nogil:
3
+ cdef cppclass CFishersNCHypergeometric:
4
+ CFishersNCHypergeometric(int, int, int, double, double) except +
5
+ int mode()
6
+ double mean()
7
+ double variance()
8
+ double probability(int x)
9
+ double moments(double * mean, double * var)
10
+
11
+ cdef cppclass CWalleniusNCHypergeometric:
12
+ CWalleniusNCHypergeometric() except +
13
+ CWalleniusNCHypergeometric(int, int, int, double, double) except +
14
+ int mode()
15
+ double mean()
16
+ double variance()
17
+ double probability(int x)
18
+ double moments(double * mean, double * var)
19
+
20
+ cdef cppclass StochasticLib3:
21
+ StochasticLib3(int seed) except +
22
+ double Random() except +
23
+ void SetAccuracy(double accur)
24
+ int FishersNCHyp (int n, int m, int N, double odds) except +
25
+ int WalleniusNCHyp (int n, int m, int N, double odds) except +
26
+ double(*next_double)()
27
+ double(*next_normal)(const double m, const double s)
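
For context (not part of the committed file): the C++ classes declared above back Fisher's and Wallenius' noncentral hypergeometric distributions; the public Python-level wrappers are `scipy.stats.nchypergeom_fisher` and `scipy.stats.nchypergeom_wallenius`. A quick sanity-check sketch:

from scipy.stats import nchypergeom_fisher, nchypergeom_wallenius

# Shape parameters: M = population size, n = number of "success" items,
# N = number of draws, odds = odds ratio favouring the success items.
M, n, N, odds = 20, 7, 12, 2.5
print(nchypergeom_fisher.pmf(5, M, n, N, odds))
print(nchypergeom_wallenius.mean(M, n, N, odds))
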
.venv/Lib/site-packages/scipy/stats/_binned_statistic.py ADDED
@@ -0,0 +1,795 @@
1
+ import builtins
2
+ from warnings import catch_warnings, simplefilter
3
+ import numpy as np
4
+ from operator import index
5
+ from collections import namedtuple
6
+
7
+ __all__ = ['binned_statistic',
8
+ 'binned_statistic_2d',
9
+ 'binned_statistic_dd']
10
+
11
+
12
+ BinnedStatisticResult = namedtuple('BinnedStatisticResult',
13
+ ('statistic', 'bin_edges', 'binnumber'))
14
+
15
+
16
+ def binned_statistic(x, values, statistic='mean',
17
+ bins=10, range=None):
18
+ """
19
+ Compute a binned statistic for one or more sets of data.
20
+
21
+ This is a generalization of a histogram function. A histogram divides
22
+ the space into bins, and returns the count of the number of points in
23
+ each bin. This function allows the computation of the sum, mean, median,
24
+ or other statistic of the values (or set of values) within each bin.
25
+
26
+ Parameters
27
+ ----------
28
+ x : (N,) array_like
29
+ A sequence of values to be binned.
30
+ values : (N,) array_like or list of (N,) array_like
31
+ The data on which the statistic will be computed. This must be
32
+ the same shape as `x`, or a set of sequences - each the same shape as
33
+ `x`. If `values` is a set of sequences, the statistic will be computed
34
+ on each independently.
35
+ statistic : string or callable, optional
36
+ The statistic to compute (default is 'mean').
37
+ The following statistics are available:
38
+
39
+ * 'mean' : compute the mean of values for points within each bin.
40
+ Empty bins will be represented by NaN.
41
+ * 'std' : compute the standard deviation within each bin. This
42
+ is implicitly calculated with ddof=0.
43
+ * 'median' : compute the median of values for points within each
44
+ bin. Empty bins will be represented by NaN.
45
+ * 'count' : compute the count of points within each bin. This is
46
+ identical to an unweighted histogram. `values` array is not
47
+ referenced.
48
+ * 'sum' : compute the sum of values for points within each bin.
49
+ This is identical to a weighted histogram.
50
+ * 'min' : compute the minimum of values for points within each bin.
51
+ Empty bins will be represented by NaN.
52
+ * 'max' : compute the maximum of values for points within each bin.
53
+ Empty bins will be represented by NaN.
54
+ * function : a user-defined function which takes a 1D array of
55
+ values, and outputs a single numerical statistic. This function
56
+ will be called on the values in each bin. Empty bins will be
57
+ represented by function([]), or NaN if this returns an error.
58
+
59
+ bins : int or sequence of scalars, optional
60
+ If `bins` is an int, it defines the number of equal-width bins in the
61
+ given range (10 by default). If `bins` is a sequence, it defines the
62
+ bin edges, including the rightmost edge, allowing for non-uniform bin
63
+ widths. Values in `x` that are smaller than lowest bin edge are
64
+ assigned to bin number 0, values beyond the highest bin are assigned to
65
+ ``bins[-1]``. If the bin edges are specified, the number of bins will
66
+ be (nx = len(bins)-1).
67
+ range : (float, float) or [(float, float)], optional
68
+ The lower and upper range of the bins. If not provided, range
69
+ is simply ``(x.min(), x.max())``. Values outside the range are
70
+ ignored.
71
+
72
+ Returns
73
+ -------
74
+ statistic : array
75
+ The values of the selected statistic in each bin.
76
+ bin_edges : array of dtype float
77
+ Return the bin edges ``(length(statistic)+1)``.
78
+ binnumber: 1-D ndarray of ints
79
+ Indices of the bins (corresponding to `bin_edges`) in which each value
80
+ of `x` belongs. Same length as `values`. A binnumber of `i` means the
81
+ corresponding value is between (bin_edges[i-1], bin_edges[i]).
82
+
83
+ See Also
84
+ --------
85
+ numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
86
+
87
+ Notes
88
+ -----
89
+ All but the last (righthand-most) bin is half-open. In other words, if
90
+ `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
91
+ but excluding 2) and the second ``[2, 3)``. The last bin, however, is
92
+ ``[3, 4]``, which *includes* 4.
93
+
94
+ .. versionadded:: 0.11.0
95
+
96
+ Examples
97
+ --------
98
+ >>> import numpy as np
99
+ >>> from scipy import stats
100
+ >>> import matplotlib.pyplot as plt
101
+
102
+ First some basic examples:
103
+
104
+ Create two evenly spaced bins in the range of the given sample, and sum the
105
+ corresponding values in each of those bins:
106
+
107
+ >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
108
+ >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
109
+ BinnedStatisticResult(statistic=array([4. , 4.5]),
110
+ bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
111
+
112
+ Multiple arrays of values can also be passed. The statistic is calculated
113
+ on each set independently:
114
+
115
+ >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
116
+ >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
117
+ BinnedStatisticResult(statistic=array([[4. , 4.5],
118
+ [8. , 9. ]]), bin_edges=array([1., 4., 7.]),
119
+ binnumber=array([1, 1, 1, 2, 2]))
120
+
121
+ >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
122
+ ... bins=3)
123
+ BinnedStatisticResult(statistic=array([1., 2., 4.]),
124
+ bin_edges=array([1., 2., 3., 4.]),
125
+ binnumber=array([1, 2, 1, 2, 3]))
126
+
127
+ As a second example, we now generate some random data of sailing boat speed
128
+ as a function of wind speed, and then determine how fast our boat is for
129
+ certain wind speeds:
130
+
131
+ >>> rng = np.random.default_rng()
132
+ >>> windspeed = 8 * rng.random(500)
133
+ >>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
134
+ >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
135
+ ... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
136
+ >>> plt.figure()
137
+ >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
138
+ >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
139
+ ... label='binned statistic of data')
140
+ >>> plt.legend()
141
+
142
+ Now we can use ``binnumber`` to select all datapoints with a windspeed
143
+ below 1:
144
+
145
+ >>> low_boatspeed = boatspeed[binnumber == 0]
146
+
147
+ As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
148
+ plot of a distribution that shows the mean and distribution around that
149
+ mean per bin, on top of a regular histogram and the probability
150
+ distribution function:
151
+
152
+ >>> x = np.linspace(0, 5, num=500)
153
+ >>> x_pdf = stats.maxwell.pdf(x)
154
+ >>> samples = stats.maxwell.rvs(size=10000)
155
+
156
+ >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
157
+ ... statistic='mean', bins=25)
158
+ >>> bin_width = (bin_edges[1] - bin_edges[0])
159
+ >>> bin_centers = bin_edges[1:] - bin_width/2
160
+
161
+ >>> plt.figure()
162
+ >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
163
+ ... alpha=0.2, label='histogram of data')
164
+ >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
165
+ >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
166
+ ... label='binned statistic of data')
167
+ >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
168
+ >>> plt.legend(fontsize=10)
169
+ >>> plt.show()
170
+
171
+ """
172
+ try:
173
+ N = len(bins)
174
+ except TypeError:
175
+ N = 1
176
+
177
+ if N != 1:
178
+ bins = [np.asarray(bins, float)]
179
+
180
+ if range is not None:
181
+ if len(range) == 2:
182
+ range = [range]
183
+
184
+ medians, edges, binnumbers = binned_statistic_dd(
185
+ [x], values, statistic, bins, range)
186
+
187
+ return BinnedStatisticResult(medians, edges[0], binnumbers)
188
+
189
+
190
+ BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
191
+ ('statistic', 'x_edge', 'y_edge',
192
+ 'binnumber'))
193
+
194
+
195
+ def binned_statistic_2d(x, y, values, statistic='mean',
196
+ bins=10, range=None, expand_binnumbers=False):
197
+ """
198
+ Compute a bidimensional binned statistic for one or more sets of data.
199
+
200
+ This is a generalization of a histogram2d function. A histogram divides
201
+ the space into bins, and returns the count of the number of points in
202
+ each bin. This function allows the computation of the sum, mean, median,
203
+ or other statistic of the values (or set of values) within each bin.
204
+
205
+ Parameters
206
+ ----------
207
+ x : (N,) array_like
208
+ A sequence of values to be binned along the first dimension.
209
+ y : (N,) array_like
210
+ A sequence of values to be binned along the second dimension.
211
+ values : (N,) array_like or list of (N,) array_like
212
+ The data on which the statistic will be computed. This must be
213
+ the same shape as `x`, or a list of sequences - each with the same
214
+ shape as `x`. If `values` is such a list, the statistic will be
215
+ computed on each independently.
216
+ statistic : string or callable, optional
217
+ The statistic to compute (default is 'mean').
218
+ The following statistics are available:
219
+
220
+ * 'mean' : compute the mean of values for points within each bin.
221
+ Empty bins will be represented by NaN.
222
+ * 'std' : compute the standard deviation within each bin. This
223
+ is implicitly calculated with ddof=0.
224
+ * 'median' : compute the median of values for points within each
225
+ bin. Empty bins will be represented by NaN.
226
+ * 'count' : compute the count of points within each bin. This is
227
+ identical to an unweighted histogram. `values` array is not
228
+ referenced.
229
+ * 'sum' : compute the sum of values for points within each bin.
230
+ This is identical to a weighted histogram.
231
+ * 'min' : compute the minimum of values for points within each bin.
232
+ Empty bins will be represented by NaN.
233
+ * 'max' : compute the maximum of values for points within each bin.
234
+ Empty bins will be represented by NaN.
235
+ * function : a user-defined function which takes a 1D array of
236
+ values, and outputs a single numerical statistic. This function
237
+ will be called on the values in each bin. Empty bins will be
238
+ represented by function([]), or NaN if this returns an error.
239
+
240
+ bins : int or [int, int] or array_like or [array, array], optional
241
+ The bin specification:
242
+
243
+ * the number of bins for the two dimensions (nx = ny = bins),
244
+ * the number of bins in each dimension (nx, ny = bins),
245
+ * the bin edges for the two dimensions (x_edge = y_edge = bins),
246
+ * the bin edges in each dimension (x_edge, y_edge = bins).
247
+
248
+ If the bin edges are specified, the number of bins will be
249
+ (nx = len(x_edge)-1, ny = len(y_edge)-1).
250
+
251
+ range : (2,2) array_like, optional
252
+ The leftmost and rightmost edges of the bins along each dimension
253
+ (if not specified explicitly in the `bins` parameters):
254
+ [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
255
+ considered outliers and not tallied in the histogram.
256
+ expand_binnumbers : bool, optional
257
+ 'False' (default): the returned `binnumber` is a shape (N,) array of
258
+ linearized bin indices.
259
+ 'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
260
+ ndarray, where each row gives the bin numbers in the corresponding
261
+ dimension.
262
+ See the `binnumber` returned value, and the `Examples` section.
263
+
264
+ .. versionadded:: 0.17.0
265
+
266
+ Returns
267
+ -------
268
+ statistic : (nx, ny) ndarray
269
+ The values of the selected statistic in each two-dimensional bin.
270
+ x_edge : (nx + 1) ndarray
271
+ The bin edges along the first dimension.
272
+ y_edge : (ny + 1) ndarray
273
+ The bin edges along the second dimension.
274
+ binnumber : (N,) array of ints or (2,N) ndarray of ints
275
+ This assigns to each element of `sample` an integer that represents the
276
+ bin in which this observation falls. The representation depends on the
277
+ `expand_binnumbers` argument. See `Notes` for details.
278
+
279
+
280
+ See Also
281
+ --------
282
+ numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
283
+
284
+ Notes
285
+ -----
286
+ Binedges:
287
+ All but the last (righthand-most) bin is half-open. In other words, if
288
+ `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
289
+ but excluding 2) and the second ``[2, 3)``. The last bin, however, is
290
+ ``[3, 4]``, which *includes* 4.
291
+
292
+ `binnumber`:
293
+ This returned argument assigns to each element of `sample` an integer that
294
+ represents the bin in which it belongs. The representation depends on the
295
+ `expand_binnumbers` argument. If 'False' (default): The returned
296
+ `binnumber` is a shape (N,) array of linearized indices mapping each
297
+ element of `sample` to its corresponding bin (using row-major ordering).
298
+ Note that the returned linearized bin indices are used for an array with
299
+ extra bins on the outer binedges to capture values outside of the defined
300
+ bin bounds.
301
+ If 'True': The returned `binnumber` is a shape (2,N) ndarray where
302
+ each row indicates bin placements for each dimension respectively. In each
303
+ dimension, a binnumber of `i` means the corresponding value is between
304
+ (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
305
+
306
+ .. versionadded:: 0.11.0
307
+
308
+ Examples
309
+ --------
310
+ >>> from scipy import stats
311
+
312
+ Calculate the counts with explicit bin-edges:
313
+
314
+ >>> x = [0.1, 0.1, 0.1, 0.6]
315
+ >>> y = [2.1, 2.6, 2.1, 2.1]
316
+ >>> binx = [0.0, 0.5, 1.0]
317
+ >>> biny = [2.0, 2.5, 3.0]
318
+ >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
319
+ >>> ret.statistic
320
+ array([[2., 1.],
321
+ [1., 0.]])
322
+
323
+ The bin in which each sample is placed is given by the `binnumber`
324
+ returned parameter. By default, these are the linearized bin indices:
325
+
326
+ >>> ret.binnumber
327
+ array([5, 6, 5, 9])
328
+
329
+ The bin indices can also be expanded into separate entries for each
330
+ dimension using the `expand_binnumbers` parameter:
331
+
332
+ >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
333
+ ... expand_binnumbers=True)
334
+ >>> ret.binnumber
335
+ array([[1, 1, 1, 2],
336
+ [1, 2, 1, 1]])
337
+
338
+ Which shows that the first three elements belong in the xbin 1, and the
339
+ fourth into xbin 2; and so on for y.
340
+
341
+ """
342
+
343
+ # This code is based on np.histogram2d
344
+ try:
345
+ N = len(bins)
346
+ except TypeError:
347
+ N = 1
348
+
349
+ if N != 1 and N != 2:
350
+ xedges = yedges = np.asarray(bins, float)
351
+ bins = [xedges, yedges]
352
+
353
+ medians, edges, binnumbers = binned_statistic_dd(
354
+ [x, y], values, statistic, bins, range,
355
+ expand_binnumbers=expand_binnumbers)
356
+
357
+ return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
358
+
359
+
360
+ BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
361
+ ('statistic', 'bin_edges',
362
+ 'binnumber'))
363
+
364
+
365
+ def _bincount(x, weights):
366
+ if np.iscomplexobj(weights):
367
+ a = np.bincount(x, np.real(weights))
368
+ b = np.bincount(x, np.imag(weights))
369
+ z = a + b*1j
370
+
371
+ else:
372
+ z = np.bincount(x, weights)
373
+ return z
374
+
375
+
376
+ def binned_statistic_dd(sample, values, statistic='mean',
377
+ bins=10, range=None, expand_binnumbers=False,
378
+ binned_statistic_result=None):
379
+ """
380
+ Compute a multidimensional binned statistic for a set of data.
381
+
382
+ This is a generalization of a histogramdd function. A histogram divides
383
+ the space into bins, and returns the count of the number of points in
384
+ each bin. This function allows the computation of the sum, mean, median,
385
+ or other statistic of the values within each bin.
386
+
387
+ Parameters
388
+ ----------
389
+ sample : array_like
390
+ Data to histogram passed as a sequence of N arrays of length D, or
391
+ as an (N,D) array.
392
+ values : (N,) array_like or list of (N,) array_like
393
+ The data on which the statistic will be computed. This must be
394
+ the same shape as `sample`, or a list of sequences - each with the
395
+ same shape as `sample`. If `values` is such a list, the statistic
396
+ will be computed on each independently.
397
+ statistic : string or callable, optional
398
+ The statistic to compute (default is 'mean').
399
+ The following statistics are available:
400
+
401
+ * 'mean' : compute the mean of values for points within each bin.
402
+ Empty bins will be represented by NaN.
403
+ * 'median' : compute the median of values for points within each
404
+ bin. Empty bins will be represented by NaN.
405
+ * 'count' : compute the count of points within each bin. This is
406
+ identical to an unweighted histogram. `values` array is not
407
+ referenced.
408
+ * 'sum' : compute the sum of values for points within each bin.
409
+ This is identical to a weighted histogram.
410
+ * 'std' : compute the standard deviation within each bin. This
411
+ is implicitly calculated with ddof=0. If the number of values
412
+ within a given bin is 0 or 1, the computed standard deviation value
413
+ will be 0 for the bin.
414
+ * 'min' : compute the minimum of values for points within each bin.
415
+ Empty bins will be represented by NaN.
416
+ * 'max' : compute the maximum of values for points within each bin.
417
+ Empty bins will be represented by NaN.
418
+ * function : a user-defined function which takes a 1D array of
419
+ values, and outputs a single numerical statistic. This function
420
+ will be called on the values in each bin. Empty bins will be
421
+ represented by function([]), or NaN if this returns an error.
422
+
423
+ bins : sequence or positive int, optional
424
+ The bin specification must be in one of the following forms:
425
+
426
+ * A sequence of arrays describing the bin edges along each dimension.
427
+ * The number of bins for each dimension (nx, ny, ... = bins).
428
+ * The number of bins for all dimensions (nx = ny = ... = bins).
429
+ range : sequence, optional
430
+ A sequence of lower and upper bin edges to be used if the edges are
431
+ not given explicitly in `bins`. Defaults to the minimum and maximum
432
+ values along each dimension.
433
+ expand_binnumbers : bool, optional
434
+ 'False' (default): the returned `binnumber` is a shape (N,) array of
435
+ linearized bin indices.
436
+ 'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
437
+ ndarray, where each row gives the bin numbers in the corresponding
438
+ dimension.
439
+ See the `binnumber` returned value, and the `Examples` section of
440
+ `binned_statistic_2d`.
441
+ binned_statistic_result : BinnedStatisticddResult
442
+ Result of a previous call to the function in order to reuse bin edges
443
+ and bin numbers with new values and/or a different statistic.
444
+ To reuse bin numbers, `expand_binnumbers` must have been set to False
445
+ (the default).
446
+
447
+ .. versionadded:: 0.17.0
448
+
449
+ Returns
450
+ -------
451
+ statistic : ndarray, shape(nx1, nx2, nx3,...)
452
+ The values of the selected statistic in each two-dimensional bin.
453
+ bin_edges : list of ndarrays
454
+ A list of D arrays describing the (nxi + 1) bin edges for each
455
+ dimension.
456
+ binnumber : (N,) array of ints or (D,N) ndarray of ints
457
+ This assigns to each element of `sample` an integer that represents the
458
+ bin in which this observation falls. The representation depends on the
459
+ `expand_binnumbers` argument. See `Notes` for details.
460
+
461
+
462
+ See Also
463
+ --------
464
+ numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
465
+
466
+ Notes
467
+ -----
468
+ Binedges:
469
+ All but the last (righthand-most) bin is half-open in each dimension. In
470
+ other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
471
+ ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
472
+ last bin, however, is ``[3, 4]``, which *includes* 4.
473
+
474
+ `binnumber`:
475
+ This returned argument assigns to each element of `sample` an integer that
476
+ represents the bin in which it belongs. The representation depends on the
477
+ `expand_binnumbers` argument. If 'False' (default): The returned
478
+ `binnumber` is a shape (N,) array of linearized indices mapping each
479
+ element of `sample` to its corresponding bin (using row-major ordering).
480
+ If 'True': The returned `binnumber` is a shape (D,N) ndarray where
481
+ each row indicates bin placements for each dimension respectively. In each
482
+ dimension, a binnumber of `i` means the corresponding value is between
483
+ (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
484
+
485
+ .. versionadded:: 0.11.0
486
+
487
+ Examples
488
+ --------
489
+ >>> import numpy as np
490
+ >>> from scipy import stats
491
+ >>> import matplotlib.pyplot as plt
492
+ >>> from mpl_toolkits.mplot3d import Axes3D
493
+
494
+ Take an array of 600 (x, y) coordinates as an example.
495
+ `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
496
+ of dimension `D+1` is required.
497
+
498
+ >>> mu = np.array([0., 1.])
499
+ >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
500
+ >>> multinormal = stats.multivariate_normal(mu, sigma)
501
+ >>> data = multinormal.rvs(size=600, random_state=235412)
502
+ >>> data.shape
503
+ (600, 2)
504
+
505
+ Create bins and count how many arrays fall in each bin:
506
+
507
+ >>> N = 60
508
+ >>> x = np.linspace(-3, 3, N)
509
+ >>> y = np.linspace(-3, 4, N)
510
+ >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
511
+ ... statistic='count')
512
+ >>> bincounts = ret.statistic
513
+
514
+ Set the volume and the location of bars:
515
+
516
+ >>> dx = x[1] - x[0]
517
+ >>> dy = y[1] - y[0]
518
+ >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
519
+ >>> z = 0
520
+
521
+ >>> bincounts = bincounts.ravel()
522
+ >>> x = x.ravel()
523
+ >>> y = y.ravel()
524
+
525
+ >>> fig = plt.figure()
526
+ >>> ax = fig.add_subplot(111, projection='3d')
527
+ >>> with np.errstate(divide='ignore'): # silence random axes3d warning
528
+ ... ax.bar3d(x, y, z, dx, dy, bincounts)
529
+
530
+ Reuse bin numbers and bin edges with new values:
531
+
532
+ >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
533
+ ... binned_statistic_result=ret,
534
+ ... statistic='mean')
535
+ """
536
+ known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
537
+ if not callable(statistic) and statistic not in known_stats:
538
+ raise ValueError(f'invalid statistic {statistic!r}')
539
+
540
+ try:
541
+ bins = index(bins)
542
+ except TypeError:
543
+ # bins is not an integer
544
+ pass
545
+ # If bins was an integer-like object, now it is an actual Python int.
546
+
547
+ # NOTE: for _bin_edges(), see e.g. gh-11365
548
+ if isinstance(bins, int) and not np.isfinite(sample).all():
549
+ raise ValueError(f'{sample!r} contains non-finite values.')
550
+
551
+ # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
552
+ # `Dlen` is the length of elements along each dimension.
553
+ # This code is based on np.histogramdd
554
+ try:
555
+ # `sample` is an ND-array.
556
+ Dlen, Ndim = sample.shape
557
+ except (AttributeError, ValueError):
558
+ # `sample` is a sequence of 1D arrays.
559
+ sample = np.atleast_2d(sample).T
560
+ Dlen, Ndim = sample.shape
561
+
562
+ # Store initial shape of `values` to preserve it in the output
563
+ values = np.asarray(values)
564
+ input_shape = list(values.shape)
565
+ # Make sure that `values` is 2D to iterate over rows
566
+ values = np.atleast_2d(values)
567
+ Vdim, Vlen = values.shape
568
+
569
+ # Make sure `values` match `sample`
570
+ if statistic != 'count' and Vlen != Dlen:
571
+ raise AttributeError('The number of `values` elements must match the '
572
+ 'length of each `sample` dimension.')
573
+
574
+ try:
575
+ M = len(bins)
576
+ if M != Ndim:
577
+ raise AttributeError('The dimension of bins must be equal '
578
+ 'to the dimension of the sample x.')
579
+ except TypeError:
580
+ bins = Ndim * [bins]
581
+
582
+ if binned_statistic_result is None:
583
+ nbin, edges, dedges = _bin_edges(sample, bins, range)
584
+ binnumbers = _bin_numbers(sample, nbin, edges, dedges)
585
+ else:
586
+ edges = binned_statistic_result.bin_edges
587
+ nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
588
+ # +1 for outlier bins
589
+ dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
590
+ binnumbers = binned_statistic_result.binnumber
591
+
592
+ # Avoid overflow with double precision. Complex `values` -> `complex128`.
593
+ result_type = np.result_type(values, np.float64)
594
+ result = np.empty([Vdim, nbin.prod()], dtype=result_type)
595
+
596
+ if statistic in {'mean', np.mean}:
597
+ result.fill(np.nan)
598
+ flatcount = _bincount(binnumbers, None)
599
+ a = flatcount.nonzero()
600
+ for vv in builtins.range(Vdim):
601
+ flatsum = _bincount(binnumbers, values[vv])
602
+ result[vv, a] = flatsum[a] / flatcount[a]
603
+ elif statistic in {'std', np.std}:
604
+ result.fill(np.nan)
605
+ flatcount = _bincount(binnumbers, None)
606
+ a = flatcount.nonzero()
607
+ for vv in builtins.range(Vdim):
608
+ flatsum = _bincount(binnumbers, values[vv])
609
+ delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
610
+ std = np.sqrt(
611
+ _bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
612
+ )
613
+ result[vv, a] = std
614
+ result = np.real(result)
615
+ elif statistic == 'count':
616
+ result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
617
+ result.fill(0)
618
+ flatcount = _bincount(binnumbers, None)
619
+ a = np.arange(len(flatcount))
620
+ result[:, a] = flatcount[np.newaxis, :]
621
+ elif statistic in {'sum', np.sum}:
622
+ result.fill(0)
623
+ for vv in builtins.range(Vdim):
624
+ flatsum = _bincount(binnumbers, values[vv])
625
+ a = np.arange(len(flatsum))
626
+ result[vv, a] = flatsum
627
+ elif statistic in {'median', np.median}:
628
+ result.fill(np.nan)
629
+ for vv in builtins.range(Vdim):
630
+ i = np.lexsort((values[vv], binnumbers))
631
+ _, j, counts = np.unique(binnumbers[i],
632
+ return_index=True, return_counts=True)
633
+ mid = j + (counts - 1) / 2
634
+ mid_a = values[vv, i][np.floor(mid).astype(int)]
635
+ mid_b = values[vv, i][np.ceil(mid).astype(int)]
636
+ medians = (mid_a + mid_b) / 2
637
+ result[vv, binnumbers[i][j]] = medians
638
+ elif statistic in {'min', np.min}:
639
+ result.fill(np.nan)
640
+ for vv in builtins.range(Vdim):
641
+ i = np.argsort(values[vv])[::-1] # Reversed so the min is last
642
+ result[vv, binnumbers[i]] = values[vv, i]
643
+ elif statistic in {'max', np.max}:
644
+ result.fill(np.nan)
645
+ for vv in builtins.range(Vdim):
646
+ i = np.argsort(values[vv])
647
+ result[vv, binnumbers[i]] = values[vv, i]
648
+ elif callable(statistic):
649
+ with np.errstate(invalid='ignore'), catch_warnings():
650
+ simplefilter("ignore", RuntimeWarning)
651
+ try:
652
+ null = statistic([])
653
+ except Exception:
654
+ null = np.nan
655
+ if np.iscomplexobj(null):
656
+ result = result.astype(np.complex128)
657
+ result.fill(null)
658
+ try:
659
+ _calc_binned_statistic(
660
+ Vdim, binnumbers, result, values, statistic
661
+ )
662
+ except ValueError:
663
+ result = result.astype(np.complex128)
664
+ _calc_binned_statistic(
665
+ Vdim, binnumbers, result, values, statistic
666
+ )
667
+
668
+ # Shape into a proper matrix
669
+ result = result.reshape(np.append(Vdim, nbin))
670
+
671
+ # Remove outliers (indices 0 and -1 for each bin-dimension).
672
+ core = tuple([slice(None)] + Ndim * [slice(1, -1)])
673
+ result = result[core]
674
+
675
+ # Unravel binnumbers into an ndarray, each row the bins for each dimension
676
+ if expand_binnumbers and Ndim > 1:
677
+ binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
678
+
679
+ if np.any(result.shape[1:] != nbin - 2):
680
+ raise RuntimeError('Internal Shape Error')
681
+
682
+ # Reshape to have output (`result`) match input (`values`) shape
683
+ result = result.reshape(input_shape[:-1] + list(nbin-2))
684
+
685
+ return BinnedStatisticddResult(result, edges, binnumbers)
686
+
687
+
688
+ def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
689
+ unique_bin_numbers = np.unique(bin_numbers)
690
+ for vv in builtins.range(Vdim):
691
+ bin_map = _create_binned_data(bin_numbers, unique_bin_numbers,
692
+ values, vv)
693
+ for i in unique_bin_numbers:
694
+ stat = stat_func(np.array(bin_map[i]))
695
+ if np.iscomplexobj(stat) and not np.iscomplexobj(result):
696
+ raise ValueError("The statistic function returns complex ")
697
+ result[vv, i] = stat
698
+
699
+
700
+ def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
701
+ """ Create hashmap of bin ids to values in bins
702
+ key: bin number
703
+ value: list of binned data
704
+ """
705
+ bin_map = dict()
706
+ for i in unique_bin_numbers:
707
+ bin_map[i] = []
708
+ for i in builtins.range(len(bin_numbers)):
709
+ bin_map[bin_numbers[i]].append(values[vv, i])
710
+ return bin_map
711
+
712
+
713
+ def _bin_edges(sample, bins=None, range=None):
714
+ """ Create edge arrays
715
+ """
716
+ Dlen, Ndim = sample.shape
717
+
718
+ nbin = np.empty(Ndim, int) # Number of bins in each dimension
719
+ edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
720
+ dedges = Ndim * [None] # Spacing between edges (will be 2D array)
721
+
722
+ # Select range for each dimension
723
+ # Used only if number of bins is given.
724
+ if range is None:
725
+ smin = np.atleast_1d(np.array(sample.min(axis=0), float))
726
+ smax = np.atleast_1d(np.array(sample.max(axis=0), float))
727
+ else:
728
+ if len(range) != Ndim:
729
+ raise ValueError(
730
+ f"range given for {len(range)} dimensions; {Ndim} required")
731
+ smin = np.empty(Ndim)
732
+ smax = np.empty(Ndim)
733
+ for i in builtins.range(Ndim):
734
+ if range[i][1] < range[i][0]:
735
+ raise ValueError(
736
+ "In {}range, start must be <= stop".format(
737
+ f"dimension {i + 1} of " if Ndim > 1 else ""))
738
+ smin[i], smax[i] = range[i]
739
+
740
+ # Make sure the bins have a finite width.
741
+ for i in builtins.range(len(smin)):
742
+ if smin[i] == smax[i]:
743
+ smin[i] = smin[i] - .5
744
+ smax[i] = smax[i] + .5
745
+
746
+ # Preserve sample floating point precision in bin edges
747
+ edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
748
+ else float)
749
+
750
+ # Create edge arrays
751
+ for i in builtins.range(Ndim):
752
+ if np.isscalar(bins[i]):
753
+ nbin[i] = bins[i] + 2 # +2 for outlier bins
754
+ edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
755
+ dtype=edges_dtype)
756
+ else:
757
+ edges[i] = np.asarray(bins[i], edges_dtype)
758
+ nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
759
+ dedges[i] = np.diff(edges[i])
760
+
761
+ nbin = np.asarray(nbin)
762
+
763
+ return nbin, edges, dedges
764
+
765
+
766
+ def _bin_numbers(sample, nbin, edges, dedges):
767
+ """Compute the bin number each sample falls into, in each dimension
768
+ """
769
+ Dlen, Ndim = sample.shape
770
+
771
+ sampBin = [
772
+ np.digitize(sample[:, i], edges[i])
773
+ for i in range(Ndim)
774
+ ]
775
+
776
+ # Using `digitize`, values that fall on an edge are put in the right bin.
777
+ # For the rightmost bin, we want values equal to the right
778
+ # edge to be counted in the last bin, and not as an outlier.
779
+ for i in range(Ndim):
780
+ # Find the rounding precision
781
+ dedges_min = dedges[i].min()
782
+ if dedges_min == 0:
783
+ raise ValueError('The smallest edge difference is numerically 0.')
784
+ decimal = int(-np.log10(dedges_min)) + 6
785
+ # Find which points are on the rightmost edge.
786
+ on_edge = np.where((sample[:, i] >= edges[i][-1]) &
787
+ (np.around(sample[:, i], decimal) ==
788
+ np.around(edges[i][-1], decimal)))[0]
789
+ # Shift these points one bin to the left.
790
+ sampBin[i][on_edge] -= 1
791
+
792
+ # Compute the sample indices in the flattened statistic matrix.
793
+ binnumbers = np.ravel_multi_index(sampBin, nbin)
794
+
795
+ return binnumbers
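
The linearized bin numbers produced by `_bin_numbers` can be decoded by hand with `np.unravel_index`, since each dimension carries one extra outlier bin on each side. A short sketch reusing the docstring example above (expected output values are the ones quoted in that docstring):

    import numpy as np
    from scipy import stats

    x = [0.1, 0.1, 0.1, 0.6]
    y = [2.1, 2.6, 2.1, 2.1]
    edges = [[0.0, 0.5, 1.0], [2.0, 2.5, 3.0]]
    res = stats.binned_statistic_2d(x, y, None, 'count', bins=edges)

    # Each dimension has len(edges)-1 real bins plus two outlier bins,
    # i.e. len(edges)+1 bins in total, and the indices are row-major.
    nbin = [len(e) + 1 for e in edges]               # [4, 4]
    print(res.binnumber)                             # [5 6 5 9]
    print(np.vstack(np.unravel_index(res.binnumber, nbin)))
    # [[1 1 1 2]
    #  [1 2 1 1]]   -- same result as expand_binnumbers=True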
.venv/Lib/site-packages/scipy/stats/_binomtest.py ADDED
@@ -0,0 +1,375 @@
1
+ from math import sqrt
2
+ import numpy as np
3
+ from scipy._lib._util import _validate_int
4
+ from scipy.optimize import brentq
5
+ from scipy.special import ndtri
6
+ from ._discrete_distns import binom
7
+ from ._common import ConfidenceInterval
8
+
9
+
10
+ class BinomTestResult:
11
+ """
12
+ Result of `scipy.stats.binomtest`.
13
+
14
+ Attributes
15
+ ----------
16
+ k : int
17
+ The number of successes (copied from `binomtest` input).
18
+ n : int
19
+ The number of trials (copied from `binomtest` input).
20
+ alternative : str
21
+ Indicates the alternative hypothesis specified in the input
22
+ to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
23
+ or ``'less'``.
24
+ statistic: float
25
+ The estimate of the proportion of successes.
26
+ pvalue : float
27
+ The p-value of the hypothesis test.
28
+
29
+ """
30
+ def __init__(self, k, n, alternative, statistic, pvalue):
31
+ self.k = k
32
+ self.n = n
33
+ self.alternative = alternative
34
+ self.statistic = statistic
35
+ self.pvalue = pvalue
36
+
37
+ # add alias for backward compatibility
38
+ self.proportion_estimate = statistic
39
+
40
+ def __repr__(self):
41
+ s = ("BinomTestResult("
42
+ f"k={self.k}, "
43
+ f"n={self.n}, "
44
+ f"alternative={self.alternative!r}, "
45
+ f"statistic={self.statistic}, "
46
+ f"pvalue={self.pvalue})")
47
+ return s
48
+
49
+ def proportion_ci(self, confidence_level=0.95, method='exact'):
50
+ """
51
+ Compute the confidence interval for ``statistic``.
52
+
53
+ Parameters
54
+ ----------
55
+ confidence_level : float, optional
56
+ Confidence level for the computed confidence interval
57
+ of the estimated proportion. Default is 0.95.
58
+ method : {'exact', 'wilson', 'wilsoncc'}, optional
59
+ Selects the method used to compute the confidence interval
60
+ for the estimate of the proportion:
61
+
62
+ 'exact' :
63
+ Use the Clopper-Pearson exact method [1]_.
64
+ 'wilson' :
65
+ Wilson's method, without continuity correction ([2]_, [3]_).
66
+ 'wilsoncc' :
67
+ Wilson's method, with continuity correction ([2]_, [3]_).
68
+
69
+ Default is ``'exact'``.
70
+
71
+ Returns
72
+ -------
73
+ ci : ``ConfidenceInterval`` object
74
+ The object has attributes ``low`` and ``high`` that hold the
75
+ lower and upper bounds of the confidence interval.
76
+
77
+ References
78
+ ----------
79
+ .. [1] C. J. Clopper and E. S. Pearson, The use of confidence or
80
+ fiducial limits illustrated in the case of the binomial,
81
+ Biometrika, Vol. 26, No. 4, pp 404-413 (Dec. 1934).
82
+ .. [2] E. B. Wilson, Probable inference, the law of succession, and
83
+ statistical inference, J. Amer. Stat. Assoc., 22, pp 209-212
84
+ (1927).
85
+ .. [3] Robert G. Newcombe, Two-sided confidence intervals for the
86
+ single proportion: comparison of seven methods, Statistics
87
+ in Medicine, 17, pp 857-872 (1998).
88
+
89
+ Examples
90
+ --------
91
+ >>> from scipy.stats import binomtest
92
+ >>> result = binomtest(k=7, n=50, p=0.1)
93
+ >>> result.statistic
94
+ 0.14
95
+ >>> result.proportion_ci()
96
+ ConfidenceInterval(low=0.05819170033997342, high=0.26739600249700846)
97
+ """
98
+ if method not in ('exact', 'wilson', 'wilsoncc'):
99
+ raise ValueError(f"method ('{method}') must be one of 'exact', "
100
+ "'wilson' or 'wilsoncc'.")
101
+ if not (0 <= confidence_level <= 1):
102
+ raise ValueError(f'confidence_level ({confidence_level}) must be in '
103
+ 'the interval [0, 1].')
104
+ if method == 'exact':
105
+ low, high = _binom_exact_conf_int(self.k, self.n,
106
+ confidence_level,
107
+ self.alternative)
108
+ else:
109
+ # method is 'wilson' or 'wilsoncc'
110
+ low, high = _binom_wilson_conf_int(self.k, self.n,
111
+ confidence_level,
112
+ self.alternative,
113
+ correction=method == 'wilsoncc')
114
+ return ConfidenceInterval(low=low, high=high)
115
+
116
+
117
+ def _findp(func):
118
+ try:
119
+ p = brentq(func, 0, 1)
120
+ except RuntimeError:
121
+ raise RuntimeError('numerical solver failed to converge when '
122
+ 'computing the confidence limits') from None
123
+ except ValueError as exc:
124
+ raise ValueError('brentq raised a ValueError; report this to the '
125
+ 'SciPy developers') from exc
126
+ return p
127
+
128
+
129
+ def _binom_exact_conf_int(k, n, confidence_level, alternative):
130
+ """
131
+ Compute the estimate and confidence interval for the binomial test.
132
+
133
+ Returns proportion, prop_low, prop_high
134
+ """
135
+ if alternative == 'two-sided':
136
+ alpha = (1 - confidence_level) / 2
137
+ if k == 0:
138
+ plow = 0.0
139
+ else:
140
+ plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
141
+ if k == n:
142
+ phigh = 1.0
143
+ else:
144
+ phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
145
+ elif alternative == 'less':
146
+ alpha = 1 - confidence_level
147
+ plow = 0.0
148
+ if k == n:
149
+ phigh = 1.0
150
+ else:
151
+ phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
152
+ elif alternative == 'greater':
153
+ alpha = 1 - confidence_level
154
+ if k == 0:
155
+ plow = 0.0
156
+ else:
157
+ plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
158
+ phigh = 1.0
159
+ return plow, phigh
160
+
161
+
162
+ def _binom_wilson_conf_int(k, n, confidence_level, alternative, correction):
163
+ # This function assumes that the arguments have already been validated.
164
+ # In particular, `alternative` must be one of 'two-sided', 'less' or
165
+ # 'greater'.
166
+ p = k / n
167
+ if alternative == 'two-sided':
168
+ z = ndtri(0.5 + 0.5*confidence_level)
169
+ else:
170
+ z = ndtri(confidence_level)
171
+
172
+ # For reference, the formulas implemented here are from
173
+ # Newcombe (1998) (ref. [3] in the proportion_ci docstring).
174
+ denom = 2*(n + z**2)
175
+ center = (2*n*p + z**2)/denom
176
+ q = 1 - p
177
+ if correction:
178
+ if alternative == 'less' or k == 0:
179
+ lo = 0.0
180
+ else:
181
+ dlo = (1 + z*sqrt(z**2 - 2 - 1/n + 4*p*(n*q + 1))) / denom
182
+ lo = center - dlo
183
+ if alternative == 'greater' or k == n:
184
+ hi = 1.0
185
+ else:
186
+ dhi = (1 + z*sqrt(z**2 + 2 - 1/n + 4*p*(n*q - 1))) / denom
187
+ hi = center + dhi
188
+ else:
189
+ delta = z/denom * sqrt(4*n*p*q + z**2)
190
+ if alternative == 'less' or k == 0:
191
+ lo = 0.0
192
+ else:
193
+ lo = center - delta
194
+ if alternative == 'greater' or k == n:
195
+ hi = 1.0
196
+ else:
197
+ hi = center + delta
198
+
199
+ return lo, hi
200
+
201
+
202
+ def binomtest(k, n, p=0.5, alternative='two-sided'):
203
+ """
204
+ Perform a test that the probability of success is p.
205
+
206
+ The binomial test [1]_ is a test of the null hypothesis that the
207
+ probability of success in a Bernoulli experiment is `p`.
208
+
209
+ Details of the test can be found in many texts on statistics, such
210
+ as section 24.5 of [2]_.
211
+
212
+ Parameters
213
+ ----------
214
+ k : int
215
+ The number of successes.
216
+ n : int
217
+ The number of trials.
218
+ p : float, optional
219
+ The hypothesized probability of success, i.e. the expected
220
+ proportion of successes. The value must be in the interval
221
+ ``0 <= p <= 1``. The default value is ``p = 0.5``.
222
+ alternative : {'two-sided', 'greater', 'less'}, optional
223
+ Indicates the alternative hypothesis. The default value is
224
+ 'two-sided'.
225
+
226
+ Returns
227
+ -------
228
+ result : `~scipy.stats._result_classes.BinomTestResult` instance
229
+ The return value is an object with the following attributes:
230
+
231
+ k : int
232
+ The number of successes (copied from `binomtest` input).
233
+ n : int
234
+ The number of trials (copied from `binomtest` input).
235
+ alternative : str
236
+ Indicates the alternative hypothesis specified in the input
237
+ to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
238
+ or ``'less'``.
239
+ statistic : float
240
+ The estimate of the proportion of successes.
241
+ pvalue : float
242
+ The p-value of the hypothesis test.
243
+
244
+ The object has the following methods:
245
+
246
+ proportion_ci(confidence_level=0.95, method='exact') :
247
+ Compute the confidence interval for ``statistic``.
248
+
249
+ Notes
250
+ -----
251
+ .. versionadded:: 1.7.0
252
+
253
+ References
254
+ ----------
255
+ .. [1] Binomial test, https://en.wikipedia.org/wiki/Binomial_test
256
+ .. [2] Jerrold H. Zar, Biostatistical Analysis (fifth edition),
257
+ Prentice Hall, Upper Saddle River, New Jersey USA (2010)
258
+
259
+ Examples
260
+ --------
261
+ >>> from scipy.stats import binomtest
262
+
263
+ A car manufacturer claims that no more than 10% of their cars are unsafe.
264
+ 15 cars are inspected for safety, 3 were found to be unsafe. Test the
265
+ manufacturer's claim:
266
+
267
+ >>> result = binomtest(3, n=15, p=0.1, alternative='greater')
268
+ >>> result.pvalue
269
+ 0.18406106910639114
270
+
271
+ The null hypothesis cannot be rejected at the 5% level of significance
272
+ because the returned p-value is greater than the critical value of 5%.
273
+
274
+ The test statistic is equal to the estimated proportion, which is simply
275
+ ``3/15``:
276
+
277
+ >>> result.statistic
278
+ 0.2
279
+
280
+ We can use the `proportion_ci()` method of the result to compute the
281
+ confidence interval of the estimate:
282
+
283
+ >>> result.proportion_ci(confidence_level=0.95)
284
+ ConfidenceInterval(low=0.05684686759024681, high=1.0)
285
+
286
+ """
287
+ k = _validate_int(k, 'k', minimum=0)
288
+ n = _validate_int(n, 'n', minimum=1)
289
+ if k > n:
290
+ raise ValueError(f'k ({k}) must not be greater than n ({n}).')
291
+
292
+ if not (0 <= p <= 1):
293
+ raise ValueError(f"p ({p}) must be in range [0,1]")
294
+
295
+ if alternative not in ('two-sided', 'less', 'greater'):
296
+ raise ValueError(f"alternative ('{alternative}') not recognized; \n"
297
+ "must be 'two-sided', 'less' or 'greater'")
298
+ if alternative == 'less':
299
+ pval = binom.cdf(k, n, p)
300
+ elif alternative == 'greater':
301
+ pval = binom.sf(k-1, n, p)
302
+ else:
303
+ # alternative is 'two-sided'
304
+ d = binom.pmf(k, n, p)
305
+ rerr = 1 + 1e-7
306
+ if k == p * n:
307
+ # special case as shortcut, would also be handled by `else` below
308
+ pval = 1.
309
+ elif k < p * n:
310
+ ix = _binary_search_for_binom_tst(lambda x1: -binom.pmf(x1, n, p),
311
+ -d*rerr, np.ceil(p * n), n)
312
+ # y is the number of terms between mode and n that are <= d*rerr.
313
+ # ix gave us the first term where a(ix) <= d*rerr < a(ix-1)
314
+ # if the first equality doesn't hold, y=n-ix. Otherwise, we
315
+ # need to include ix as well as the equality holds. Note that
316
+ # the equality will hold in very very rare situations due to rerr.
317
+ y = n - ix + int(d*rerr == binom.pmf(ix, n, p))
318
+ pval = binom.cdf(k, n, p) + binom.sf(n - y, n, p)
319
+ else:
320
+ ix = _binary_search_for_binom_tst(lambda x1: binom.pmf(x1, n, p),
321
+ d*rerr, 0, np.floor(p * n))
322
+ # y is the number of terms between 0 and mode that are <= d*rerr.
323
+ # we need to add a 1 to account for the 0 index.
324
+ # For comparing this with old behavior, see
325
+ # tst_binary_srch_for_binom_tst method in test_morestats.
326
+ y = ix + 1
327
+ pval = binom.cdf(y-1, n, p) + binom.sf(k-1, n, p)
328
+
329
+ pval = min(1.0, pval)
330
+
331
+ result = BinomTestResult(k=k, n=n, alternative=alternative,
332
+ statistic=k/n, pvalue=pval)
333
+ return result
334
+
335
+
336
+ def _binary_search_for_binom_tst(a, d, lo, hi):
337
+ """
338
+ Conducts an implicit binary search on a function specified by `a`.
339
+
340
+ Meant to be used on the binomial PMF for the case of two-sided tests
341
+ to obtain the value on the other side of the mode where the tail
342
+ probability should be computed. The values on either side of
343
+ the mode are always in order, meaning binary search is applicable.
344
+
345
+ Parameters
346
+ ----------
347
+ a : callable
348
+ The function over which to perform binary search. Its values
349
+ for inputs lo and hi should be in ascending order.
350
+ d : float
351
+ The value to search.
352
+ lo : int
353
+ The lower end of range to search.
354
+ hi : int
355
+ The higher end of the range to search.
356
+
357
+ Returns
358
+ -------
359
+ int
360
+ The index, i between lo and hi
361
+ such that a(i)<=d<a(i+1)
362
+ """
363
+ while lo < hi:
364
+ mid = lo + (hi-lo)//2
365
+ midval = a(mid)
366
+ if midval < d:
367
+ lo = mid+1
368
+ elif midval > d:
369
+ hi = mid-1
370
+ else:
371
+ return mid
372
+ if a(lo) <= d:
373
+ return lo
374
+ else:
375
+ return lo-1
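
A short usage sketch of the public entry point defined above, comparing the exact (Clopper-Pearson) and Wilson confidence intervals for the same estimate; the inputs are the ones used in the docstring examples:

    from scipy.stats import binomtest

    res = binomtest(k=7, n=50, p=0.1)
    print(res.statistic)                         # 0.14, i.e. k/n
    print(res.pvalue)
    print(res.proportion_ci(method='exact'))     # Clopper-Pearson interval
    print(res.proportion_ci(method='wilsoncc'))  # Wilson, continuity-corrected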
.venv/Lib/site-packages/scipy/stats/_bws_test.py ADDED
@@ -0,0 +1,177 @@
1
+ import numpy as np
2
+ from functools import partial
3
+ from scipy import stats
4
+
5
+
6
+ def _bws_input_validation(x, y, alternative, method):
7
+ ''' Input validation and standardization for bws test'''
8
+ x, y = np.atleast_1d(x, y)
9
+ if x.ndim > 1 or y.ndim > 1:
10
+ raise ValueError('`x` and `y` must be exactly one-dimensional.')
11
+ if np.isnan(x).any() or np.isnan(y).any():
12
+ raise ValueError('`x` and `y` must not contain NaNs.')
13
+ if np.size(x) == 0 or np.size(y) == 0:
14
+ raise ValueError('`x` and `y` must be of nonzero size.')
15
+
16
+ z = stats.rankdata(np.concatenate((x, y)))
17
+ x, y = z[:len(x)], z[len(x):]
18
+
19
+ alternatives = {'two-sided', 'less', 'greater'}
20
+ alternative = alternative.lower()
21
+ if alternative not in alternatives:
22
+ raise ValueError(f'`alternative` must be one of {alternatives}.')
23
+
24
+ method = stats.PermutationMethod() if method is None else method
25
+ if not isinstance(method, stats.PermutationMethod):
26
+ raise ValueError('`method` must be an instance of '
27
+ '`scipy.stats.PermutationMethod`')
28
+
29
+ return x, y, alternative, method
30
+
31
+
32
+ def _bws_statistic(x, y, alternative, axis):
33
+ '''Compute the BWS test statistic for two independent samples'''
34
+ # Public function currently does not accept `axis`, but `permutation_test`
35
+ # uses `axis` to make vectorized call.
36
+
37
+ Ri, Hj = np.sort(x, axis=axis), np.sort(y, axis=axis)
38
+ n, m = Ri.shape[axis], Hj.shape[axis]
39
+ i, j = np.arange(1, n+1), np.arange(1, m+1)
40
+
41
+ Bx_num = Ri - (m + n)/n * i
42
+ By_num = Hj - (m + n)/m * j
43
+
44
+ if alternative == 'two-sided':
45
+ Bx_num *= Bx_num
46
+ By_num *= By_num
47
+ else:
48
+ Bx_num *= np.abs(Bx_num)
49
+ By_num *= np.abs(By_num)
50
+
51
+ Bx_den = i/(n+1) * (1 - i/(n+1)) * m*(m+n)/n
52
+ By_den = j/(m+1) * (1 - j/(m+1)) * n*(m+n)/m
53
+
54
+ Bx = 1/n * np.sum(Bx_num/Bx_den, axis=axis)
55
+ By = 1/m * np.sum(By_num/By_den, axis=axis)
56
+
57
+ B = (Bx + By) / 2 if alternative == 'two-sided' else (Bx - By) / 2
58
+
59
+ return B
60
+
61
+
62
+ def bws_test(x, y, *, alternative="two-sided", method=None):
63
+ r'''Perform the Baumgartner-Weiss-Schindler test on two independent samples.
64
+
65
+ The Baumgartner-Weiss-Schindler (BWS) test is a nonparametric test of
66
+ the null hypothesis that the distribution underlying sample `x`
67
+ is the same as the distribution underlying sample `y`. Unlike
68
+ the Kolmogorov-Smirnov, Wilcoxon, and Cramer-Von Mises tests,
69
+ the BWS test weights the integral by the variance of the difference
70
+ in cumulative distribution functions (CDFs), emphasizing the tails of the
71
+ distributions, which increases the power of the test in many applications.
72
+
73
+ Parameters
74
+ ----------
75
+ x, y : array-like
76
+ 1-d arrays of samples.
77
+ alternative : {'two-sided', 'less', 'greater'}, optional
78
+ Defines the alternative hypothesis. Default is 'two-sided'.
79
+ Let *F(u)* and *G(u)* be the cumulative distribution functions of the
80
+ distributions underlying `x` and `y`, respectively. Then the following
81
+ alternative hypotheses are available:
82
+
83
+ * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
84
+ at least one *u*.
85
+ * 'less': the distribution underlying `x` is stochastically less than
86
+ the distribution underlying `y`, i.e. *F(u) >= G(u)* for all *u*.
87
+ * 'greater': the distribution underlying `x` is stochastically greater
88
+ than the distribution underlying `y`, i.e. *F(u) <= G(u)* for all
89
+ *u*.
90
+
91
+ Under a more restrictive set of assumptions, the alternative hypotheses
92
+ can be expressed in terms of the locations of the distributions;
93
+ see [2]_ section 5.1.
94
+ method : PermutationMethod, optional
95
+ Configures the method used to compute the p-value. The default is
96
+ the default `PermutationMethod` object.
97
+
98
+ Returns
99
+ -------
100
+ res : PermutationTestResult
101
+ An object with attributes:
102
+
103
+ statistic : float
104
+ The observed test statistic of the data.
105
+ pvalue : float
106
+ The p-value for the given alternative.
107
+ null_distribution : ndarray
108
+ The values of the test statistic generated under the null hypothesis.
109
+
110
+ See also
111
+ --------
112
+ scipy.stats.wilcoxon, scipy.stats.mannwhitneyu, scipy.stats.ttest_ind
113
+
114
+ Notes
115
+ -----
116
+ When ``alternative=='two-sided'``, the statistic is defined by the
117
+ equations given in [1]_ Section 2. This statistic is not appropriate for
118
+ one-sided alternatives; in that case, the statistic is the *negative* of
119
+ that given by the equations in [1]_ Section 2. Consequently, when the
120
+ distribution of the first sample is stochastically greater than that of the
121
+ second sample, the statistic will tend to be positive.
122
+
123
+ References
124
+ ----------
125
+ .. [1] Neuhäuser, M. (2005). Exact Tests Based on the
126
+ Baumgartner-Weiss-Schindler Statistic: A Survey. Statistical Papers,
127
+ 46(1), 1-29.
128
+ .. [2] Fay, M. P., & Proschan, M. A. (2010). Wilcoxon-Mann-Whitney or t-test?
129
+ On assumptions for hypothesis tests and multiple interpretations of
130
+ decision rules. Statistics surveys, 4, 1.
131
+
132
+ Examples
133
+ --------
134
+ We follow the example of table 3 in [1]_: Fourteen children were divided
135
+ randomly into two groups. Their ranks at performing a specific tests are
136
+ as follows.
137
+
138
+ >>> import numpy as np
139
+ >>> x = [1, 2, 3, 4, 6, 7, 8]
140
+ >>> y = [5, 9, 10, 11, 12, 13, 14]
141
+
142
+ We use the BWS test to assess whether there is a statistically significant
143
+ difference between the two groups.
144
+ The null hypothesis is that there is no difference in the distributions of
145
+ performance between the two groups. We decide that a significance level of
146
+ 1% is required to reject the null hypothesis in favor of the alternative
147
+ that the distributions are different.
148
+ Since the number of samples is very small, we can compare the observed test
149
+ statistic against the *exact* distribution of the test statistic under the
150
+ null hypothesis.
151
+
152
+ >>> from scipy.stats import bws_test
153
+ >>> res = bws_test(x, y)
154
+ >>> print(res.statistic)
155
+ 5.132167152575315
156
+
157
+ This agrees with :math:`B = 5.132` reported in [1]_. The *p*-value produced
158
+ by `bws_test` also agrees with :math:`p = 0.0029` reported in [1]_.
159
+
160
+ >>> print(res.pvalue)
161
+ 0.002913752913752914
162
+
163
+ Because the p-value is below our threshold of 1%, we take this as evidence
164
+ against the null hypothesis in favor of the alternative that there is a
165
+ difference in performance between the two groups.
166
+ '''
167
+
168
+ x, y, alternative, method = _bws_input_validation(x, y, alternative,
169
+ method)
170
+ bws_statistic = partial(_bws_statistic, alternative=alternative)
171
+
172
+ permutation_alternative = 'less' if alternative == 'less' else 'greater'
173
+ res = stats.permutation_test((x, y), bws_statistic,
174
+ alternative=permutation_alternative,
175
+ **method._asdict())
176
+
177
+ return res
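
A brief usage sketch of the test defined above with an explicit permutation budget; the sample data are made up, and `n_resamples` is a standard `PermutationMethod` option:

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(12345)
    x = rng.normal(size=30)
    y = rng.normal(loc=0.5, size=30)

    # Randomized permutation p-value with a fixed number of resamples.
    method = stats.PermutationMethod(n_resamples=4999)
    res = stats.bws_test(x, y, alternative='two-sided', method=method)
    print(res.statistic, res.pvalue)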
.venv/Lib/site-packages/scipy/stats/_censored_data.py ADDED
@@ -0,0 +1,459 @@
1
+ import numpy as np
2
+
3
+
4
+ def _validate_1d(a, name, allow_inf=False):
5
+ if np.ndim(a) != 1:
6
+ raise ValueError(f'`{name}` must be a one-dimensional sequence.')
7
+ if np.isnan(a).any():
8
+ raise ValueError(f'`{name}` must not contain nan.')
9
+ if not allow_inf and np.isinf(a).any():
10
+ raise ValueError(f'`{name}` must contain only finite values.')
11
+
12
+
13
+ def _validate_interval(interval):
14
+ interval = np.asarray(interval)
15
+ if interval.shape == (0,):
16
+ # The input was a sequence with length 0.
17
+ interval = interval.reshape((0, 2))
18
+ if interval.ndim != 2 or interval.shape[-1] != 2:
19
+ raise ValueError('`interval` must be a two-dimensional array with '
20
+ 'shape (m, 2), where m is the number of '
21
+ 'interval-censored values, but got shape '
22
+ f'{interval.shape}')
23
+
24
+ if np.isnan(interval).any():
25
+ raise ValueError('`interval` must not contain nan.')
26
+ if np.isinf(interval).all(axis=1).any():
27
+ raise ValueError('In each row in `interval`, both values must not'
28
+ ' be infinite.')
29
+ if (interval[:, 0] > interval[:, 1]).any():
30
+ raise ValueError('In each row of `interval`, the left value must not'
31
+ ' exceed the right value.')
32
+
33
+ uncensored_mask = interval[:, 0] == interval[:, 1]
34
+ left_mask = np.isinf(interval[:, 0])
35
+ right_mask = np.isinf(interval[:, 1])
36
+ interval_mask = np.isfinite(interval).all(axis=1) & ~uncensored_mask
37
+
38
+ uncensored2 = interval[uncensored_mask, 0]
39
+ left2 = interval[left_mask, 1]
40
+ right2 = interval[right_mask, 0]
41
+ interval2 = interval[interval_mask]
42
+
43
+ return uncensored2, left2, right2, interval2
44
+
45
+
46
+ def _validate_x_censored(x, censored):
47
+ x = np.asarray(x)
48
+ if x.ndim != 1:
49
+ raise ValueError('`x` must be one-dimensional.')
50
+ censored = np.asarray(censored)
51
+ if censored.ndim != 1:
52
+ raise ValueError('`censored` must be one-dimensional.')
53
+ if (~np.isfinite(x)).any():
54
+ raise ValueError('`x` must not contain nan or inf.')
55
+ if censored.size != x.size:
56
+ raise ValueError('`x` and `censored` must have the same length.')
57
+ return x, censored.astype(bool)
58
+
59
+
60
+ class CensoredData:
61
+ """
62
+ Instances of this class represent censored data.
63
+
64
+ Instances may be passed to the ``fit`` method of continuous
65
+ univariate SciPy distributions for maximum likelihood estimation.
66
+ The *only* method of the univariate continuous distributions that
67
+ understands `CensoredData` is the ``fit`` method. An instance of
68
+ `CensoredData` can not be passed to methods such as ``pdf`` and
69
+ ``cdf``.
70
+
71
+ An observation is said to be *censored* when the precise value is unknown,
72
+ but it has a known upper and/or lower bound. The conventional terminology
73
+ is:
74
+
75
+ * left-censored: an observation is below a certain value but it is
76
+ unknown by how much.
77
+ * right-censored: an observation is above a certain value but it is
78
+ unknown by how much.
79
+ * interval-censored: an observation lies somewhere on an interval between
80
+ two values.
81
+
82
+ Left-, right-, and interval-censored data can be represented by
83
+ `CensoredData`.
84
+
85
+ For convenience, the class methods ``left_censored`` and
86
+ ``right_censored`` are provided to create a `CensoredData`
87
+ instance from a single one-dimensional array of measurements
88
+ and a corresponding boolean array to indicate which measurements
89
+ are censored. The class method ``interval_censored`` accepts two
90
+ one-dimensional arrays that hold the lower and upper bounds of the
91
+ intervals.
92
+
93
+ Parameters
94
+ ----------
95
+ uncensored : array_like, 1D
96
+ Uncensored observations.
97
+ left : array_like, 1D
98
+ Left-censored observations.
99
+ right : array_like, 1D
100
+ Right-censored observations.
101
+ interval : array_like, 2D, with shape (m, 2)
102
+ Interval-censored observations. Each row ``interval[k, :]``
103
+ represents the interval for the kth interval-censored observation.
104
+
105
+ Notes
106
+ -----
107
+ In the input array `interval`, the lower bound of the interval may
108
+ be ``-inf``, and the upper bound may be ``inf``, but at least one must be
109
+ finite. When the lower bound is ``-inf``, the row represents a left-
110
+ censored observation, and when the upper bound is ``inf``, the row
111
+ represents a right-censored observation. If the length of an interval
112
+ is 0 (i.e. ``interval[k, 0] == interval[k, 1]``), the observation is
113
+ treated as uncensored. So one can represent all the types of censored
114
+ and uncensored data in ``interval``, but it is generally more convenient
115
+ to use `uncensored`, `left` and `right` for uncensored, left-censored and
116
+ right-censored observations, respectively.
117
+
118
+ Examples
119
+ --------
120
+ In the most general case, a censored data set may contain values that
121
+ are left-censored, right-censored, interval-censored, and uncensored.
122
+ For example, here we create a data set with five observations. Two
123
+ are uncensored (values 1 and 1.5), one is a left-censored observation
124
+ of 0, one is a right-censored observation of 10 and one is
125
+ interval-censored in the interval [2, 3].
126
+
127
+ >>> import numpy as np
128
+ >>> from scipy.stats import CensoredData
129
+ >>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10],
130
+ ... interval=[[2, 3]])
131
+ >>> print(data)
132
+ CensoredData(5 values: 2 not censored, 1 left-censored,
133
+ 1 right-censored, 1 interval-censored)
134
+
135
+ Equivalently,
136
+
137
+ >>> data = CensoredData(interval=[[1, 1],
138
+ ... [1.5, 1.5],
139
+ ... [-np.inf, 0],
140
+ ... [10, np.inf],
141
+ ... [2, 3]])
142
+ >>> print(data)
143
+ CensoredData(5 values: 2 not censored, 1 left-censored,
144
+ 1 right-censored, 1 interval-censored)
145
+
146
+ A common case is to have a mix of uncensored observations and censored
147
+ observations that are all right-censored (or all left-censored). For
148
+ example, consider an experiment in which six devices are started at
149
+ various times and left running until they fail. Assume that time is
150
+ measured in hours, and the experiment is stopped after 30 hours, even
151
+ if all the devices have not failed by that time. We might end up with
152
+ data such as this::
153
+
154
+ Device   Start-time   Fail-time   Time-to-failure
155
+    1         0           13             13
156
+    2         2           24             22
157
+    3         5           22             17
158
+    4         8           23             15
159
+    5        10          ***            >20
160
+    6        12          ***            >18
161
+
162
+ Two of the devices had not failed when the experiment was stopped;
163
+ the observations of the time-to-failure for these two devices are
164
+ right-censored. We can represent this data with
165
+
166
+ >>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
167
+ >>> print(data)
168
+ CensoredData(6 values: 4 not censored, 2 right-censored)
169
+
170
+ Alternatively, we can use the method `CensoredData.right_censored` to
171
+ create a representation of this data. The time-to-failure observations
172
+ are put in the list ``ttf``. The ``censored`` list indicates which values
173
+ in ``ttf`` are censored.
174
+
175
+ >>> ttf = [13, 22, 17, 15, 20, 18]
176
+ >>> censored = [False, False, False, False, True, True]
177
+
178
+ Pass these lists to `CensoredData.right_censored` to create an
179
+ instance of `CensoredData`.
180
+
181
+ >>> data = CensoredData.right_censored(ttf, censored)
182
+ >>> print(data)
183
+ CensoredData(6 values: 4 not censored, 2 right-censored)
184
+
185
+ If the input data is interval censored and already stored in two
186
+ arrays, one holding the low end of the intervals and another
187
+ holding the high ends, the class method ``interval_censored`` can
188
+ be used to create the `CensoredData` instance.
189
+
190
+ This example creates an instance with four interval-censored values.
191
+ The intervals are [10, 11], [0.5, 1], [2, 3], and [12.5, 13.5].
192
+
193
+ >>> a = [10, 0.5, 2, 12.5] # Low ends of the intervals
194
+ >>> b = [11, 1.0, 3, 13.5] # High ends of the intervals
195
+ >>> data = CensoredData.interval_censored(low=a, high=b)
196
+ >>> print(data)
197
+ CensoredData(4 values: 0 not censored, 4 interval-censored)
198
+
199
+ Finally, we create and censor some data from the `weibull_min`
200
+ distribution, and then fit `weibull_min` to that data. We'll assume
201
+ that the location parameter is known to be 0.
202
+
203
+ >>> from scipy.stats import weibull_min
204
+ >>> rng = np.random.default_rng()
205
+
206
+ Create the random data set.
207
+
208
+ >>> x = weibull_min.rvs(2.5, loc=0, scale=30, size=250, random_state=rng)
209
+ >>> x[x > 40] = 40 # Right-censor values greater than 40.
210
+
211
+ Create the `CensoredData` instance with the `right_censored` method.
212
+ The censored values are those where the value is 40.
213
+
214
+ >>> data = CensoredData.right_censored(x, x == 40)
215
+ >>> print(data)
216
+ CensoredData(250 values: 215 not censored, 35 right-censored)
217
+
218
+ 35 values have been right-censored.
219
+
220
+ Fit `weibull_min` to the censored data. We expect the shape and scale
221
+ to be approximately 2.5 and 30, respectively.
222
+
223
+ >>> weibull_min.fit(data, floc=0)
224
+ (2.3575922823897315, 0, 30.40650074451254)
225
+
226
+ """
227
+
228
+ def __init__(self, uncensored=None, *, left=None, right=None,
229
+ interval=None):
230
+ if uncensored is None:
231
+ uncensored = []
232
+ if left is None:
233
+ left = []
234
+ if right is None:
235
+ right = []
236
+ if interval is None:
237
+ interval = np.empty((0, 2))
238
+
239
+ _validate_1d(uncensored, 'uncensored')
240
+ _validate_1d(left, 'left')
241
+ _validate_1d(right, 'right')
242
+ uncensored2, left2, right2, interval2 = _validate_interval(interval)
243
+
244
+ self._uncensored = np.concatenate((uncensored, uncensored2))
245
+ self._left = np.concatenate((left, left2))
246
+ self._right = np.concatenate((right, right2))
247
+ # Note that by construction, the private attribute _interval
248
+ # will be a 2D array that contains only finite values representing
249
+ # intervals with nonzero but finite length.
250
+ self._interval = interval2
251
+
252
+ def __repr__(self):
253
+ uncensored_str = " ".join(np.array_repr(self._uncensored).split())
254
+ left_str = " ".join(np.array_repr(self._left).split())
255
+ right_str = " ".join(np.array_repr(self._right).split())
256
+ interval_str = " ".join(np.array_repr(self._interval).split())
257
+ return (f"CensoredData(uncensored={uncensored_str}, left={left_str}, "
258
+ f"right={right_str}, interval={interval_str})")
259
+
260
+ def __str__(self):
261
+ num_nc = len(self._uncensored)
262
+ num_lc = len(self._left)
263
+ num_rc = len(self._right)
264
+ num_ic = len(self._interval)
265
+ n = num_nc + num_lc + num_rc + num_ic
266
+ parts = [f'{num_nc} not censored']
267
+ if num_lc > 0:
268
+ parts.append(f'{num_lc} left-censored')
269
+ if num_rc > 0:
270
+ parts.append(f'{num_rc} right-censored')
271
+ if num_ic > 0:
272
+ parts.append(f'{num_ic} interval-censored')
273
+ return f'CensoredData({n} values: ' + ', '.join(parts) + ')'
274
+
275
+ # This is not a complete implementation of the arithmetic operators.
276
+ # All we need is subtracting a scalar and dividing by a scalar.
277
+
278
+ def __sub__(self, other):
279
+ return CensoredData(uncensored=self._uncensored - other,
280
+ left=self._left - other,
281
+ right=self._right - other,
282
+ interval=self._interval - other)
283
+
284
+ def __truediv__(self, other):
285
+ return CensoredData(uncensored=self._uncensored / other,
286
+ left=self._left / other,
287
+ right=self._right / other,
288
+ interval=self._interval / other)
289
+
290
+ def __len__(self):
291
+ """
292
+ The number of values (censored and not censored).
293
+ """
294
+ return (len(self._uncensored) + len(self._left) + len(self._right)
295
+ + len(self._interval))
296
+
297
+ def num_censored(self):
298
+ """
299
+ Number of censored values.
300
+ """
301
+ return len(self._left) + len(self._right) + len(self._interval)
302
+
303
+ @classmethod
304
+ def right_censored(cls, x, censored):
305
+ """
306
+ Create a `CensoredData` instance of right-censored data.
307
+
308
+ Parameters
309
+ ----------
310
+ x : array_like
311
+ `x` is the array of observed data or measurements.
312
+ `x` must be a one-dimensional sequence of finite numbers.
313
+ censored : array_like of bool
314
+ `censored` must be a one-dimensional sequence of boolean
315
+ values. If ``censored[k]`` is True, the corresponding value
316
+ in `x` is right-censored. That is, the value ``x[k]``
317
+ is the lower bound of the true (but unknown) value.
318
+
319
+ Returns
320
+ -------
321
+ data : `CensoredData`
322
+ An instance of `CensoredData` that represents the
323
+ collection of uncensored and right-censored values.
324
+
325
+ Examples
326
+ --------
327
+ >>> from scipy.stats import CensoredData
328
+
329
+ Two uncensored values (4 and 10) and two right-censored values
330
+ (24 and 25).
331
+
332
+ >>> data = CensoredData.right_censored([4, 10, 24, 25],
333
+ ... [False, False, True, True])
334
+ >>> data
335
+ CensoredData(uncensored=array([ 4., 10.]),
336
+ left=array([], dtype=float64), right=array([24., 25.]),
337
+ interval=array([], shape=(0, 2), dtype=float64))
338
+ >>> print(data)
339
+ CensoredData(4 values: 2 not censored, 2 right-censored)
340
+ """
341
+ x, censored = _validate_x_censored(x, censored)
342
+ return cls(uncensored=x[~censored], right=x[censored])
343
+
344
+ @classmethod
345
+ def left_censored(cls, x, censored):
346
+ """
347
+ Create a `CensoredData` instance of left-censored data.
348
+
349
+ Parameters
350
+ ----------
351
+ x : array_like
352
+ `x` is the array of observed data or measurements.
353
+ `x` must be a one-dimensional sequence of finite numbers.
354
+ censored : array_like of bool
355
+ `censored` must be a one-dimensional sequence of boolean
356
+ values. If ``censored[k]`` is True, the corresponding value
357
+ in `x` is left-censored. That is, the value ``x[k]``
358
+ is the upper bound of the true (but unknown) value.
359
+
360
+ Returns
361
+ -------
362
+ data : `CensoredData`
363
+ An instance of `CensoredData` that represents the
364
+ collection of uncensored and left-censored values.
365
+
366
+ Examples
367
+ --------
368
+ >>> from scipy.stats import CensoredData
369
+
370
+ Two uncensored values (0.12 and 0.033) and two left-censored values
371
+ (both 1e-3).
372
+
373
+ >>> data = CensoredData.left_censored([0.12, 0.033, 1e-3, 1e-3],
374
+ ... [False, False, True, True])
375
+ >>> data
376
+ CensoredData(uncensored=array([0.12 , 0.033]),
377
+ left=array([0.001, 0.001]), right=array([], dtype=float64),
378
+ interval=array([], shape=(0, 2), dtype=float64))
379
+ >>> print(data)
380
+ CensoredData(4 values: 2 not censored, 2 left-censored)
381
+ """
382
+ x, censored = _validate_x_censored(x, censored)
383
+ return cls(uncensored=x[~censored], left=x[censored])
384
+
385
+ @classmethod
386
+ def interval_censored(cls, low, high):
387
+ """
388
+ Create a `CensoredData` instance of interval-censored data.
389
+
390
+ This method is useful when all the data is interval-censored, and
391
+ the low and high ends of the intervals are already stored in
392
+ separate one-dimensional arrays.
393
+
394
+ Parameters
395
+ ----------
396
+ low : array_like
397
+ The one-dimensional array containing the low ends of the
398
+ intervals.
399
+ high : array_like
400
+ The one-dimensional array containing the high ends of the
401
+ intervals.
402
+
403
+ Returns
404
+ -------
405
+ data : `CensoredData`
406
+ An instance of `CensoredData` that represents the
407
+ collection of censored values.
408
+
409
+ Examples
410
+ --------
411
+ >>> import numpy as np
412
+ >>> from scipy.stats import CensoredData
413
+
414
+ ``a`` and ``b`` are the low and high ends of a collection of
415
+ interval-censored values.
416
+
417
+ >>> a = [0.5, 2.0, 3.0, 5.5]
418
+ >>> b = [1.0, 2.5, 3.5, 7.0]
419
+ >>> data = CensoredData.interval_censored(low=a, high=b)
420
+ >>> print(data)
421
+ CensoredData(4 values: 0 not censored, 4 interval-censored)
422
+ """
423
+ _validate_1d(low, 'low', allow_inf=True)
424
+ _validate_1d(high, 'high', allow_inf=True)
425
+ if len(low) != len(high):
426
+ raise ValueError('`low` and `high` must have the same length.')
427
+ interval = np.column_stack((low, high))
428
+ uncensored, left, right, interval = _validate_interval(interval)
429
+ return cls(uncensored=uncensored, left=left, right=right,
430
+ interval=interval)
431
+
432
+ def _uncensor(self):
433
+ """
434
+ This function is used when a non-censored version of the data
435
+ is needed to create a rough estimate of the parameters of a
436
+ distribution via the method of moments or some similar method.
437
+ The data is "uncensored" by taking the given endpoints as the
438
+ data for the left- or right-censored data, and the mean for the
439
+ interval-censored data.
440
+ """
441
+ data = np.concatenate((self._uncensored, self._left, self._right,
442
+ self._interval.mean(axis=1)))
443
+ return data
444
+
445
+ def _supported(self, a, b):
446
+ """
447
+ Return a subset of self containing the values that are in
448
+ (or overlap with) the interval (a, b).
449
+ """
450
+ uncensored = self._uncensored
451
+ uncensored = uncensored[(a < uncensored) & (uncensored < b)]
452
+ left = self._left
453
+ left = left[a < left]
454
+ right = self._right
455
+ right = right[right < b]
456
+ interval = self._interval
457
+ interval = interval[(a < interval[:, 1]) & (interval[:, 0] < b)]
458
+ return CensoredData(uncensored, left=left, right=right,
459
+ interval=interval)
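A brief usage sketch for the class defined above: fit an exponential lifetime model to right-censored failure times. This assumes a SciPy version in which `CensoredData` is public and `fit` accepts censored data; the scale of 50 and the cutoff of 80 are arbitrary illustration values.

    import numpy as np
    from scipy.stats import CensoredData, expon

    rng = np.random.default_rng(12345)
    t = expon.rvs(scale=50, size=200, random_state=rng)  # true failure times
    observed = np.minimum(t, 80)   # observation stops at 80 hours
    censored = t > 80              # True where only a lower bound is known

    data = CensoredData.right_censored(observed, censored)
    loc, scale = expon.fit(data, floc=0)  # MLE with the location fixed at 0
    print(scale)                          # roughly 50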
.venv/Lib/site-packages/scipy/stats/_common.py ADDED
@@ -0,0 +1,5 @@
1
+ from collections import namedtuple
2
+
3
+
4
+ ConfidenceInterval = namedtuple("ConfidenceInterval", ["low", "high"])
5
+ ConfidenceInterval.__doc__ = "Class for confidence intervals."
.venv/Lib/site-packages/scipy/stats/_constants.py ADDED
@@ -0,0 +1,39 @@
1
+ """
2
+ Statistics-related constants.
3
+
4
+ """
5
+ import numpy as np
6
+
7
+
8
+ # The smallest representable positive number such that 1.0 + _EPS != 1.0.
9
+ _EPS = np.finfo(float).eps
10
+
11
+ # The largest [in magnitude] usable floating value.
12
+ _XMAX = np.finfo(float).max
13
+
14
+ # The log of the largest usable floating value; useful for knowing
15
+ # when exp(something) will overflow
16
+ _LOGXMAX = np.log(_XMAX)
17
+
18
+ # The smallest [in magnitude] usable (i.e. not subnormal) double precision
19
+ # floating value.
20
+ _XMIN = np.finfo(float).tiny
21
+
22
+ # The log of the smallest [in magnitude] usable (i.e. not subnormal)
23
+ # double precision floating value.
24
+ _LOGXMIN = np.log(_XMIN)
25
+
26
+ # -special.psi(1)
27
+ _EULER = 0.577215664901532860606512090082402431042
28
+
29
+ # special.zeta(3, 1) Apery's constant
30
+ _ZETA3 = 1.202056903159594285399738161511449990765
31
+
32
+ # sqrt(pi)
33
+ _SQRT_PI = 1.772453850905516027298167483341145182798
34
+
35
+ # sqrt(2/pi)
36
+ _SQRT_2_OVER_PI = 0.7978845608028654
37
+
38
+ # log(sqrt(2/pi))
39
+ _LOG_SQRT_2_OVER_PI = -0.22579135264472744
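A small sketch of how a constant like ``_LOGXMAX`` is typically used: arguments to ``exp`` beyond it overflow to ``inf``, so code can branch or clamp at that threshold. The check below is illustrative and not code from this module.

    import numpy as np

    _XMAX = np.finfo(float).max
    _LOGXMAX = np.log(_XMAX)          # about 709.78 for float64

    with np.errstate(over='ignore'):
        print(np.exp(_LOGXMAX + 1))   # inf: past the overflow threshold
    print(np.exp(_LOGXMAX - 1))       # very large, but still finite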
.venv/Lib/site-packages/scipy/stats/_continuous_distns.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_covariance.py ADDED
@@ -0,0 +1,633 @@
1
+ from functools import cached_property
2
+
3
+ import numpy as np
4
+ from scipy import linalg
5
+ from scipy.stats import _multivariate
6
+
7
+
8
+ __all__ = ["Covariance"]
9
+
10
+
11
+ class Covariance:
12
+ """
13
+ Representation of a covariance matrix
14
+
15
+ Calculations involving covariance matrices (e.g. data whitening,
16
+ multivariate normal function evaluation) are often performed more
17
+ efficiently using a decomposition of the covariance matrix instead of the
18
+ covariance matrix itself. This class allows the user to construct an
19
+ object representing a covariance matrix using any of several
20
+ decompositions and perform calculations using a common interface.
21
+
22
+ .. note::
23
+
24
+ The `Covariance` class cannot be instantiated directly. Instead, use
25
+ one of the factory methods (e.g. `Covariance.from_diagonal`).
26
+
27
+ Examples
28
+ --------
29
+ The `Covariance` class is is used by calling one of its
30
+ factory methods to create a `Covariance` object, then pass that
31
+ representation of the `Covariance` matrix as a shape parameter of a
32
+ multivariate distribution.
33
+
34
+ For instance, the multivariate normal distribution can accept an array
35
+ representing a covariance matrix:
36
+
37
+ >>> from scipy import stats
38
+ >>> import numpy as np
39
+ >>> d = [1, 2, 3]
40
+ >>> A = np.diag(d) # a diagonal covariance matrix
41
+ >>> x = [4, -2, 5] # a point of interest
42
+ >>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=A)
43
+ >>> dist.pdf(x)
44
+ 4.9595685102808205e-08
45
+
46
+ but the calculations are performed in a very generic way that does not
47
+ take advantage of any special properties of the covariance matrix. Because
48
+ our covariance matrix is diagonal, we can use ``Covariance.from_diagonal``
49
+ to create an object representing the covariance matrix, and
50
+ `multivariate_normal` can use this to compute the probability density
51
+ function more efficiently.
52
+
53
+ >>> cov = stats.Covariance.from_diagonal(d)
54
+ >>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=cov)
55
+ >>> dist.pdf(x)
56
+ 4.9595685102808205e-08
57
+
58
+ """
59
+ def __init__(self):
60
+ message = ("The `Covariance` class cannot be instantiated directly. "
61
+ "Please use one of the factory methods "
62
+ "(e.g. `Covariance.from_diagonal`).")
63
+ raise NotImplementedError(message)
64
+
65
+ @staticmethod
66
+ def from_diagonal(diagonal):
67
+ r"""
68
+ Return a representation of a covariance matrix from its diagonal.
69
+
70
+ Parameters
71
+ ----------
72
+ diagonal : array_like
73
+ The diagonal elements of a diagonal matrix.
74
+
75
+ Notes
76
+ -----
77
+ Let the diagonal elements of a diagonal covariance matrix :math:`D` be
78
+ stored in the vector :math:`d`.
79
+
80
+ When all elements of :math:`d` are strictly positive, whitening of a
81
+ data point :math:`x` is performed by computing
82
+ :math:`x \cdot d^{-1/2}`, where the inverse square root can be taken
83
+ element-wise.
84
+ :math:`\log\det{D}` is calculated as :math:`-2 \sum(\log{d})`,
85
+ where the :math:`\log` operation is performed element-wise.
86
+
87
+ This `Covariance` class supports singular covariance matrices. When
88
+ computing ``_log_pdet``, non-positive elements of :math:`d` are
89
+ ignored. Whitening is not well defined when the point to be whitened
90
+ does not lie in the span of the columns of the covariance matrix. The
91
+ convention taken here is to treat the inverse square root of
92
+ non-positive elements of :math:`d` as zeros.
93
+
94
+ Examples
95
+ --------
96
+ Prepare a symmetric positive definite covariance matrix ``A`` and a
97
+ data point ``x``.
98
+
99
+ >>> import numpy as np
100
+ >>> from scipy import stats
101
+ >>> rng = np.random.default_rng()
102
+ >>> n = 5
103
+ >>> A = np.diag(rng.random(n))
104
+ >>> x = rng.random(size=n)
105
+
106
+ Extract the diagonal from ``A`` and create the `Covariance` object.
107
+
108
+ >>> d = np.diag(A)
109
+ >>> cov = stats.Covariance.from_diagonal(d)
110
+
111
+ Compare the functionality of the `Covariance` object against a
112
+ reference implementations.
113
+
114
+ >>> res = cov.whiten(x)
115
+ >>> ref = np.diag(d**-0.5) @ x
116
+ >>> np.allclose(res, ref)
117
+ True
118
+ >>> res = cov.log_pdet
119
+ >>> ref = np.linalg.slogdet(A)[-1]
120
+ >>> np.allclose(res, ref)
121
+ True
122
+
123
+ """
124
+ return CovViaDiagonal(diagonal)
125
+
126
+ @staticmethod
127
+ def from_precision(precision, covariance=None):
128
+ r"""
129
+ Return a representation of a covariance from its precision matrix.
130
+
131
+ Parameters
132
+ ----------
133
+ precision : array_like
134
+ The precision matrix; that is, the inverse of a square, symmetric,
135
+ positive definite covariance matrix.
136
+ covariance : array_like, optional
137
+ The square, symmetric, positive definite covariance matrix. If not
138
+ provided, this may need to be calculated (e.g. to evaluate the
139
+ cumulative distribution function of
140
+ `scipy.stats.multivariate_normal`) by inverting `precision`.
141
+
142
+ Notes
143
+ -----
144
+ Let the covariance matrix be :math:`A`, its precision matrix be
145
+ :math:`P = A^{-1}`, and :math:`L` be the lower Cholesky factor such
146
+ that :math:`L L^T = P`.
147
+ Whitening of a data point :math:`x` is performed by computing
148
+ :math:`x^T L`. :math:`\log\det{A}` is calculated as
149
+ :math:`-2tr(\log{L})`, where the :math:`\log` operation is performed
150
+ element-wise.
151
+
152
+ This `Covariance` class does not support singular covariance matrices
153
+ because the precision matrix does not exist for a singular covariance
154
+ matrix.
155
+
156
+ Examples
157
+ --------
158
+ Prepare a symmetric positive definite precision matrix ``P`` and a
159
+ data point ``x``. (If the precision matrix is not already available,
160
+ consider the other factory methods of the `Covariance` class.)
161
+
162
+ >>> import numpy as np
163
+ >>> from scipy import stats
164
+ >>> rng = np.random.default_rng()
165
+ >>> n = 5
166
+ >>> P = rng.random(size=(n, n))
167
+ >>> P = P @ P.T # a precision matrix must be positive definite
168
+ >>> x = rng.random(size=n)
169
+
170
+ Create the `Covariance` object.
171
+
172
+ >>> cov = stats.Covariance.from_precision(P)
173
+
174
+ Compare the functionality of the `Covariance` object against
175
+ reference implementations.
176
+
177
+ >>> res = cov.whiten(x)
178
+ >>> ref = x @ np.linalg.cholesky(P)
179
+ >>> np.allclose(res, ref)
180
+ True
181
+ >>> res = cov.log_pdet
182
+ >>> ref = -np.linalg.slogdet(P)[-1]
183
+ >>> np.allclose(res, ref)
184
+ True
185
+
186
+ """
187
+ return CovViaPrecision(precision, covariance)
188
+
189
+ @staticmethod
190
+ def from_cholesky(cholesky):
191
+ r"""
192
+ Representation of a covariance provided via the (lower) Cholesky factor
193
+
194
+ Parameters
195
+ ----------
196
+ cholesky : array_like
197
+ The lower triangular Cholesky factor of the covariance matrix.
198
+
199
+ Notes
200
+ -----
201
+ Let the covariance matrix be :math:`A` and :math:`L` be the lower
202
+ Cholesky factor such that :math:`L L^T = A`.
203
+ Whitening of a data point :math:`x` is performed by computing
204
+ :math:`L^{-1} x`. :math:`\log\det{A}` is calculated as
205
+ :math:`2tr(\log{L})`, where the :math:`\log` operation is performed
206
+ element-wise.
207
+
208
+ This `Covariance` class does not support singular covariance matrices
209
+ because the Cholesky decomposition does not exist for a singular
210
+ covariance matrix.
211
+
212
+ Examples
213
+ --------
214
+ Prepare a symmetric positive definite covariance matrix ``A`` and a
215
+ data point ``x``.
216
+
217
+ >>> import numpy as np
218
+ >>> from scipy import stats
219
+ >>> rng = np.random.default_rng()
220
+ >>> n = 5
221
+ >>> A = rng.random(size=(n, n))
222
+ >>> A = A @ A.T # make the covariance symmetric positive definite
223
+ >>> x = rng.random(size=n)
224
+
225
+ Perform the Cholesky decomposition of ``A`` and create the
226
+ `Covariance` object.
227
+
228
+ >>> L = np.linalg.cholesky(A)
229
+ >>> cov = stats.Covariance.from_cholesky(L)
230
+
231
+ Compare the functionality of the `Covariance` object against
232
+ reference implementation.
233
+
234
+ >>> from scipy.linalg import solve_triangular
235
+ >>> res = cov.whiten(x)
236
+ >>> ref = solve_triangular(L, x, lower=True)
237
+ >>> np.allclose(res, ref)
238
+ True
239
+ >>> res = cov.log_pdet
240
+ >>> ref = np.linalg.slogdet(A)[-1]
241
+ >>> np.allclose(res, ref)
242
+ True
243
+
244
+ """
245
+ return CovViaCholesky(cholesky)
246
+
247
+ @staticmethod
248
+ def from_eigendecomposition(eigendecomposition):
249
+ r"""
250
+ Representation of a covariance provided via eigendecomposition
251
+
252
+ Parameters
253
+ ----------
254
+ eigendecomposition : sequence
255
+ A sequence (nominally a tuple) containing the eigenvalue and
256
+ eigenvector arrays as computed by `scipy.linalg.eigh` or
257
+ `numpy.linalg.eigh`.
258
+
259
+ Notes
260
+ -----
261
+ Let the covariance matrix be :math:`A`, let :math:`V` be matrix of
262
+ eigenvectors, and let :math:`W` be the diagonal matrix of eigenvalues
263
+ such that `V W V^T = A`.
264
+
265
+ When all of the eigenvalues are strictly positive, whitening of a
266
+ data point :math:`x` is performed by computing
267
+ :math:`x^T (V W^{-1/2})`, where the inverse square root can be taken
268
+ element-wise.
269
+ :math:`\log\det{A}` is calculated as :math:`tr(\log{W})`,
270
+ where the :math:`\log` operation is performed element-wise.
271
+
272
+ This `Covariance` class supports singular covariance matrices. When
273
+ computing ``_log_pdet``, non-positive eigenvalues are ignored.
274
+ Whitening is not well defined when the point to be whitened
275
+ does not lie in the span of the columns of the covariance matrix. The
276
+ convention taken here is to treat the inverse square root of
277
+ non-positive eigenvalues as zeros.
278
+
279
+ Examples
280
+ --------
281
+ Prepare a symmetric positive definite covariance matrix ``A`` and a
282
+ data point ``x``.
283
+
284
+ >>> import numpy as np
285
+ >>> from scipy import stats
286
+ >>> rng = np.random.default_rng()
287
+ >>> n = 5
288
+ >>> A = rng.random(size=(n, n))
289
+ >>> A = A @ A.T # make the covariance symmetric positive definite
290
+ >>> x = rng.random(size=n)
291
+
292
+ Perform the eigendecomposition of ``A`` and create the `Covariance`
293
+ object.
294
+
295
+ >>> w, v = np.linalg.eigh(A)
296
+ >>> cov = stats.Covariance.from_eigendecomposition((w, v))
297
+
298
+ Compare the functionality of the `Covariance` object against
299
+ reference implementations.
300
+
301
+ >>> res = cov.whiten(x)
302
+ >>> ref = x @ (v @ np.diag(w**-0.5))
303
+ >>> np.allclose(res, ref)
304
+ True
305
+ >>> res = cov.log_pdet
306
+ >>> ref = np.linalg.slogdet(A)[-1]
307
+ >>> np.allclose(res, ref)
308
+ True
309
+
310
+ """
311
+ return CovViaEigendecomposition(eigendecomposition)
312
+
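As the notes above point out, ``from_eigendecomposition`` (unlike the Cholesky- and precision-based factories) accepts a singular covariance matrix. A short sketch, with eigenvalues chosen by hand for illustration:

    import numpy as np
    from scipy import stats

    w = np.array([0.0, 1.0, 2.0])   # one zero eigenvalue -> rank-deficient
    v = np.eye(3)                   # eigenvectors of the diagonal matrix diag(w)
    cov = stats.Covariance.from_eigendecomposition((w, v))

    print(cov.rank)       # 2: the zero eigenvalue does not count toward the rank
    print(cov.log_pdet)   # log(1 * 2): the pseudo-determinant ignores the zero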
313
+ def whiten(self, x):
314
+ """
315
+ Perform a whitening transformation on data.
316
+
317
+ "Whitening" ("white" as in "white noise", in which each frequency has
318
+ equal magnitude) transforms a set of random variables into a new set of
319
+ random variables with unit-diagonal covariance. When a whitening
320
+ transform is applied to a sample of points distributed according to
321
+ a multivariate normal distribution with zero mean, the covariance of
322
+ the transformed sample is approximately the identity matrix.
323
+
324
+ Parameters
325
+ ----------
326
+ x : array_like
327
+ An array of points. The last dimension must correspond with the
328
+ dimensionality of the space, i.e., the number of columns in the
329
+ covariance matrix.
330
+
331
+ Returns
332
+ -------
333
+ x_ : array_like
334
+ The transformed array of points.
335
+
336
+ References
337
+ ----------
338
+ .. [1] "Whitening Transformation". Wikipedia.
339
+ https://en.wikipedia.org/wiki/Whitening_transformation
340
+ .. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
341
+ coloring linear transformation". Transactions of VSB 18.2
342
+ (2018): 31-35. :doi:`10.31490/tces-2018-0013`
343
+
344
+ Examples
345
+ --------
346
+ >>> import numpy as np
347
+ >>> from scipy import stats
348
+ >>> rng = np.random.default_rng()
349
+ >>> n = 3
350
+ >>> A = rng.random(size=(n, n))
351
+ >>> cov_array = A @ A.T # make matrix symmetric positive definite
352
+ >>> precision = np.linalg.inv(cov_array)
353
+ >>> cov_object = stats.Covariance.from_precision(precision)
354
+ >>> x = rng.multivariate_normal(np.zeros(n), cov_array, size=(10000))
355
+ >>> x_ = cov_object.whiten(x)
356
+ >>> np.cov(x_, rowvar=False) # near-identity covariance
357
+ array([[0.97862122, 0.00893147, 0.02430451],
358
+ [0.00893147, 0.96719062, 0.02201312],
359
+ [0.02430451, 0.02201312, 0.99206881]])
360
+
361
+ """
362
+ return self._whiten(np.asarray(x))
363
+
364
+ def colorize(self, x):
365
+ """
366
+ Perform a colorizing transformation on data.
367
+
368
+ "Colorizing" ("color" as in "colored noise", in which different
369
+ frequencies may have different magnitudes) transforms a set of
370
+ uncorrelated random variables into a new set of random variables with
371
+ the desired covariance. When a coloring transform is applied to a
372
+ sample of points distributed according to a multivariate normal
373
+ distribution with identity covariance and zero mean, the covariance of
374
+ the transformed sample is approximately the covariance matrix used
375
+ in the coloring transform.
376
+
377
+ Parameters
378
+ ----------
379
+ x : array_like
380
+ An array of points. The last dimension must correspond with the
381
+ dimensionality of the space, i.e., the number of columns in the
382
+ covariance matrix.
383
+
384
+ Returns
385
+ -------
386
+ x_ : array_like
387
+ The transformed array of points.
388
+
389
+ References
390
+ ----------
391
+ .. [1] "Whitening Transformation". Wikipedia.
392
+ https://en.wikipedia.org/wiki/Whitening_transformation
393
+ .. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
394
+ coloring linear transformation". Transactions of VSB 18.2
395
+ (2018): 31-35. :doi:`10.31490/tces-2018-0013`
396
+
397
+ Examples
398
+ --------
399
+ >>> import numpy as np
400
+ >>> from scipy import stats
401
+ >>> rng = np.random.default_rng(1638083107694713882823079058616272161)
402
+ >>> n = 3
403
+ >>> A = rng.random(size=(n, n))
404
+ >>> cov_array = A @ A.T # make matrix symmetric positive definite
405
+ >>> cholesky = np.linalg.cholesky(cov_array)
406
+ >>> cov_object = stats.Covariance.from_cholesky(cholesky)
407
+ >>> x = rng.multivariate_normal(np.zeros(n), np.eye(n), size=(10000))
408
+ >>> x_ = cov_object.colorize(x)
409
+ >>> cov_data = np.cov(x_, rowvar=False)
410
+ >>> np.allclose(cov_data, cov_array, rtol=3e-2)
411
+ True
412
+ """
413
+ return self._colorize(np.asarray(x))
414
+
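Taken together, the two methods documented above are inverses for a full-rank covariance: colorizing a whitened point recovers the original point. A quick sketch, using an arbitrary symmetric positive definite matrix:

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    A = rng.random((4, 4))
    A = A @ A.T + 4 * np.eye(4)      # symmetric positive definite
    cov = stats.Covariance.from_cholesky(np.linalg.cholesky(A))

    x = rng.random(4)
    print(np.allclose(cov.colorize(cov.whiten(x)), x))   # True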
415
+ @property
416
+ def log_pdet(self):
417
+ """
418
+ Log of the pseudo-determinant of the covariance matrix
419
+ """
420
+ return np.array(self._log_pdet, dtype=float)[()]
421
+
422
+ @property
423
+ def rank(self):
424
+ """
425
+ Rank of the covariance matrix
426
+ """
427
+ return np.array(self._rank, dtype=int)[()]
428
+
429
+ @property
430
+ def covariance(self):
431
+ """
432
+ Explicit representation of the covariance matrix
433
+ """
434
+ return self._covariance
435
+
436
+ @property
437
+ def shape(self):
438
+ """
439
+ Shape of the covariance array
440
+ """
441
+ return self._shape
442
+
443
+ def _validate_matrix(self, A, name):
444
+ A = np.atleast_2d(A)
445
+ m, n = A.shape[-2:]
446
+ if m != n or A.ndim != 2 or not (np.issubdtype(A.dtype, np.integer) or
447
+ np.issubdtype(A.dtype, np.floating)):
448
+ message = (f"The input `{name}` must be a square, "
449
+ "two-dimensional array of real numbers.")
450
+ raise ValueError(message)
451
+ return A
452
+
453
+ def _validate_vector(self, A, name):
454
+ A = np.atleast_1d(A)
455
+ if A.ndim != 1 or not (np.issubdtype(A.dtype, np.integer) or
456
+ np.issubdtype(A.dtype, np.floating)):
457
+ message = (f"The input `{name}` must be a one-dimensional array "
458
+ "of real numbers.")
459
+ raise ValueError(message)
460
+ return A
461
+
462
+
463
+ class CovViaPrecision(Covariance):
464
+
465
+ def __init__(self, precision, covariance=None):
466
+ precision = self._validate_matrix(precision, 'precision')
467
+ if covariance is not None:
468
+ covariance = self._validate_matrix(covariance, 'covariance')
469
+ message = "`precision.shape` must equal `covariance.shape`."
470
+ if precision.shape != covariance.shape:
471
+ raise ValueError(message)
472
+
473
+ self._chol_P = np.linalg.cholesky(precision)
474
+ self._log_pdet = -2*np.log(np.diag(self._chol_P)).sum(axis=-1)
475
+ self._rank = precision.shape[-1] # must be full rank if invertible
476
+ self._precision = precision
477
+ self._cov_matrix = covariance
478
+ self._shape = precision.shape
479
+ self._allow_singular = False
480
+
481
+ def _whiten(self, x):
482
+ return x @ self._chol_P
483
+
484
+ @cached_property
485
+ def _covariance(self):
486
+ n = self._shape[-1]
487
+ return (linalg.cho_solve((self._chol_P, True), np.eye(n))
488
+ if self._cov_matrix is None else self._cov_matrix)
489
+
490
+ def _colorize(self, x):
491
+ return linalg.solve_triangular(self._chol_P.T, x.T, lower=False).T
492
+
493
+
494
+ def _dot_diag(x, d):
495
+ # If d were a full diagonal matrix, x @ d would always do what we want.
496
+ # Special treatment is needed for n-dimensional `d` in which each row
497
+ # includes only the diagonal elements of a covariance matrix.
498
+ return x * d if x.ndim < 2 else x * np.expand_dims(d, -2)
499
+
500
+
501
+ class CovViaDiagonal(Covariance):
502
+
503
+ def __init__(self, diagonal):
504
+ diagonal = self._validate_vector(diagonal, 'diagonal')
505
+
506
+ i_zero = diagonal <= 0
507
+ positive_diagonal = np.array(diagonal, dtype=np.float64)
508
+
509
+ positive_diagonal[i_zero] = 1 # ones don't affect determinant
510
+ self._log_pdet = np.sum(np.log(positive_diagonal), axis=-1)
511
+
512
+ psuedo_reciprocals = 1 / np.sqrt(positive_diagonal)
513
+ psuedo_reciprocals[i_zero] = 0
514
+
515
+ self._sqrt_diagonal = np.sqrt(diagonal)
516
+ self._LP = psuedo_reciprocals
517
+ self._rank = positive_diagonal.shape[-1] - i_zero.sum(axis=-1)
518
+ self._covariance = np.apply_along_axis(np.diag, -1, diagonal)
519
+ self._i_zero = i_zero
520
+ self._shape = self._covariance.shape
521
+ self._allow_singular = True
522
+
523
+ def _whiten(self, x):
524
+ return _dot_diag(x, self._LP)
525
+
526
+ def _colorize(self, x):
527
+ return _dot_diag(x, self._sqrt_diagonal)
528
+
529
+ def _support_mask(self, x):
530
+ """
531
+ Check whether x lies in the support of the distribution.
532
+ """
533
+ return ~np.any(_dot_diag(x, self._i_zero), axis=-1)
534
+
535
+
536
+ class CovViaCholesky(Covariance):
537
+
538
+ def __init__(self, cholesky):
539
+ L = self._validate_matrix(cholesky, 'cholesky')
540
+
541
+ self._factor = L
542
+ self._log_pdet = 2*np.log(np.diag(self._factor)).sum(axis=-1)
543
+ self._rank = L.shape[-1] # must be full rank for cholesky
544
+ self._shape = L.shape
545
+ self._allow_singular = False
546
+
547
+ @cached_property
548
+ def _covariance(self):
549
+ return self._factor @ self._factor.T
550
+
551
+ def _whiten(self, x):
552
+ res = linalg.solve_triangular(self._factor, x.T, lower=True).T
553
+ return res
554
+
555
+ def _colorize(self, x):
556
+ return x @ self._factor.T
557
+
558
+
559
+ class CovViaEigendecomposition(Covariance):
560
+
561
+ def __init__(self, eigendecomposition):
562
+ eigenvalues, eigenvectors = eigendecomposition
563
+ eigenvalues = self._validate_vector(eigenvalues, 'eigenvalues')
564
+ eigenvectors = self._validate_matrix(eigenvectors, 'eigenvectors')
565
+ message = ("The shapes of `eigenvalues` and `eigenvectors` "
566
+ "must be compatible.")
567
+ try:
568
+ eigenvalues = np.expand_dims(eigenvalues, -2)
569
+ eigenvectors, eigenvalues = np.broadcast_arrays(eigenvectors,
570
+ eigenvalues)
571
+ eigenvalues = eigenvalues[..., 0, :]
572
+ except ValueError:
573
+ raise ValueError(message)
574
+
575
+ i_zero = eigenvalues <= 0
576
+ positive_eigenvalues = np.array(eigenvalues, dtype=np.float64)
577
+
578
+ positive_eigenvalues[i_zero] = 1 # ones don't affect determinant
579
+ self._log_pdet = np.sum(np.log(positive_eigenvalues), axis=-1)
580
+
581
+ psuedo_reciprocals = 1 / np.sqrt(positive_eigenvalues)
582
+ psuedo_reciprocals[i_zero] = 0
583
+
584
+ self._LP = eigenvectors * psuedo_reciprocals
585
+ self._LA = eigenvectors * np.sqrt(eigenvalues)
586
+ self._rank = positive_eigenvalues.shape[-1] - i_zero.sum(axis=-1)
587
+ self._w = eigenvalues
588
+ self._v = eigenvectors
589
+ self._shape = eigenvectors.shape
590
+ self._null_basis = eigenvectors * i_zero
591
+ # This is only used for `_support_mask`, not to decide whether
592
+ # the covariance is singular or not.
593
+ self._eps = _multivariate._eigvalsh_to_eps(eigenvalues) * 10**3
594
+ self._allow_singular = True
595
+
596
+ def _whiten(self, x):
597
+ return x @ self._LP
598
+
599
+ def _colorize(self, x):
600
+ return x @ self._LA.T
601
+
602
+ @cached_property
603
+ def _covariance(self):
604
+ return (self._v * self._w) @ self._v.T
605
+
606
+ def _support_mask(self, x):
607
+ """
608
+ Check whether x lies in the support of the distribution.
609
+ """
610
+ residual = np.linalg.norm(x @ self._null_basis, axis=-1)
611
+ in_support = residual < self._eps
612
+ return in_support
613
+
614
+
615
+ class CovViaPSD(Covariance):
616
+ """
617
+ Representation of a covariance provided via an instance of _PSD
618
+ """
619
+
620
+ def __init__(self, psd):
621
+ self._LP = psd.U
622
+ self._log_pdet = psd.log_pdet
623
+ self._rank = psd.rank
624
+ self._covariance = psd._M
625
+ self._shape = psd._M.shape
626
+ self._psd = psd
627
+ self._allow_singular = False # by default
628
+
629
+ def _whiten(self, x):
630
+ return x @ self._LP
631
+
632
+ def _support_mask(self, x):
633
+ return self._psd._support_mask(x)
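A closing sketch for this module: the factory methods are alternative views of the same covariance, so quantities exposed through the common interface agree across them. Shown here for a diagonal matrix, where both representations are easy to build by hand (this assumes a SciPy version in which `Covariance` objects are accepted by `multivariate_normal`).

    import numpy as np
    from scipy import stats

    d = np.array([1.0, 2.0, 3.0])
    A = np.diag(d)

    cov_diag = stats.Covariance.from_diagonal(d)
    cov_chol = stats.Covariance.from_cholesky(np.linalg.cholesky(A))

    x = np.array([0.5, -1.0, 2.0])
    print(np.allclose(cov_diag.whiten(x), cov_chol.whiten(x)))  # True
    print(np.allclose(cov_diag.log_pdet, cov_chol.log_pdet))    # True

    # Either object can serve as the covariance of a multivariate normal.
    dist = stats.multivariate_normal(mean=np.zeros(3), cov=cov_diag)
    print(dist.pdf(x))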
.venv/Lib/site-packages/scipy/stats/_crosstab.py ADDED
@@ -0,0 +1,204 @@
1
+ import numpy as np
2
+ from scipy.sparse import coo_matrix
3
+ from scipy._lib._bunch import _make_tuple_bunch
4
+
5
+
6
+ CrosstabResult = _make_tuple_bunch(
7
+ "CrosstabResult", ["elements", "count"]
8
+ )
9
+
10
+
11
+ def crosstab(*args, levels=None, sparse=False):
12
+ """
13
+ Return table of counts for each possible unique combination in ``*args``.
14
+
15
+ When ``len(args) > 1``, the array computed by this function is
16
+ often referred to as a *contingency table* [1]_.
17
+
18
+ The arguments must be sequences with the same length. The second return
19
+ value, `count`, is an integer array with ``len(args)`` dimensions. If
20
+ `levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
21
+ is the number of unique elements in ``args[k]``.
22
+
23
+ Parameters
24
+ ----------
25
+ *args : sequences
26
+ A sequence of sequences whose unique aligned elements are to be
27
+ counted. The sequences in args must all be the same length.
28
+ levels : sequence, optional
29
+ If `levels` is given, it must be a sequence that is the same length as
30
+ `args`. Each element in `levels` is either a sequence or None. If it
31
+ is a sequence, it gives the values in the corresponding sequence in
32
+ `args` that are to be counted. If any value in the sequences in `args`
33
+ does not occur in the corresponding sequence in `levels`, that value
34
+ is ignored and not counted in the returned array `count`. The default
35
+ value of `levels` for ``args[i]`` is ``np.unique(args[i])``
36
+ sparse : bool, optional
37
+ If True, return a sparse matrix. The matrix will be an instance of
38
+ the `scipy.sparse.coo_matrix` class. Because SciPy's sparse matrices
39
+ must be 2-d, only two input sequences are allowed when `sparse` is
40
+ True. Default is False.
41
+
42
+ Returns
43
+ -------
44
+ res : CrosstabResult
45
+ An object containing the following attributes:
46
+
47
+ elements : tuple of numpy.ndarrays.
48
+ Tuple of length ``len(args)`` containing the arrays of elements
49
+ that are counted in `count`. These can be interpreted as the
50
+ labels of the corresponding dimensions of `count`. If `levels` was
51
+ given, then if ``levels[i]`` is not None, ``elements[i]`` will
52
+ hold the values given in ``levels[i]``.
53
+ count : numpy.ndarray or scipy.sparse.coo_matrix
54
+ Counts of the unique elements in ``zip(*args)``, stored in an
55
+ array. Also known as a *contingency table* when ``len(args) > 1``.
56
+
57
+ See Also
58
+ --------
59
+ numpy.unique
60
+
61
+ Notes
62
+ -----
63
+ .. versionadded:: 1.7.0
64
+
65
+ References
66
+ ----------
67
+ .. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
68
+
69
+ Examples
70
+ --------
71
+ >>> from scipy.stats.contingency import crosstab
72
+
73
+ Given the lists `a` and `x`, create a contingency table that counts the
74
+ frequencies of the corresponding pairs.
75
+
76
+ >>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
77
+ >>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
78
+ >>> res = crosstab(a, x)
79
+ >>> avals, xvals = res.elements
80
+ >>> avals
81
+ array(['A', 'B'], dtype='<U1')
82
+ >>> xvals
83
+ array(['X', 'Y', 'Z'], dtype='<U1')
84
+ >>> res.count
85
+ array([[2, 3, 0],
86
+ [1, 0, 4]])
87
+
88
+ So `('A', 'X')` occurs twice, `('A', 'Y')` occurs three times, etc.
89
+
90
+ Higher dimensional contingency tables can be created.
91
+
92
+ >>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
93
+ >>> res = crosstab(a, x, p)
94
+ >>> res.count
95
+ array([[[2, 0],
96
+ [2, 1],
97
+ [0, 0]],
98
+ [[1, 0],
99
+ [0, 0],
100
+ [1, 3]]])
101
+ >>> res.count.shape
102
+ (2, 3, 2)
103
+
104
+ The values to be counted can be set by using the `levels` argument.
105
+ It allows the elements of interest in each input sequence to be
106
+ given explicitly instead of finding the unique elements of the sequence.
107
+
108
+ For example, suppose one of the arguments is an array containing the
109
+ answers to a survey question, with integer values 1 to 4. Even if the
110
+ value 1 does not occur in the data, we want an entry for it in the table.
111
+
112
+ >>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4] # 1 does not occur.
113
+ >>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4] # 3 does not occur.
114
+ >>> options = [1, 2, 3, 4]
115
+ >>> res = crosstab(q1, q2, levels=(options, options))
116
+ >>> res.count
117
+ array([[0, 0, 0, 0],
118
+ [1, 1, 0, 1],
119
+ [1, 4, 0, 1],
120
+ [0, 3, 0, 3]])
121
+
122
+ If `levels` is given, but an element of `levels` is None, the unique values
123
+ of the corresponding argument are used. For example,
124
+
125
+ >>> res = crosstab(q1, q2, levels=(None, options))
126
+ >>> res.elements
127
+ [array([2, 3, 4]), [1, 2, 3, 4]]
128
+ >>> res.count
129
+ array([[1, 1, 0, 1],
130
+ [1, 4, 0, 1],
131
+ [0, 3, 0, 3]])
132
+
133
+ If we want to ignore the pairs where 4 occurs in ``q2``, we can
134
+ give just the values [1, 2] to `levels`, and the 4 will be ignored:
135
+
136
+ >>> res = crosstab(q1, q2, levels=(None, [1, 2]))
137
+ >>> res.elements
138
+ [array([2, 3, 4]), [1, 2]]
139
+ >>> res.count
140
+ array([[1, 1],
141
+ [1, 4],
142
+ [0, 3]])
143
+
144
+ Finally, let's repeat the first example, but return a sparse matrix:
145
+
146
+ >>> res = crosstab(a, x, sparse=True)
147
+ >>> res.count
148
+ <2x3 sparse matrix of type '<class 'numpy.int64'>'
149
+ with 4 stored elements in COOrdinate format>
150
+ >>> res.count.A
151
+ array([[2, 3, 0],
152
+ [1, 0, 4]])
153
+
154
+ """
155
+ nargs = len(args)
156
+ if nargs == 0:
157
+ raise TypeError("At least one input sequence is required.")
158
+
159
+ len0 = len(args[0])
160
+ if not all(len(a) == len0 for a in args[1:]):
161
+ raise ValueError("All input sequences must have the same length.")
162
+
163
+ if sparse and nargs != 2:
164
+ raise ValueError("When `sparse` is True, only two input sequences "
165
+ "are allowed.")
166
+
167
+ if levels is None:
168
+ # Call np.unique with return_inverse=True on each argument.
169
+ actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
170
+ for a in args])
171
+ else:
172
+ # `levels` is not None...
173
+ if len(levels) != nargs:
174
+ raise ValueError('len(levels) must equal the number of input '
175
+ 'sequences')
176
+
177
+ args = [np.asarray(arg) for arg in args]
178
+ mask = np.zeros((nargs, len0), dtype=np.bool_)
179
+ inv = np.zeros((nargs, len0), dtype=np.intp)
180
+ actual_levels = []
181
+ for k, (levels_list, arg) in enumerate(zip(levels, args)):
182
+ if levels_list is None:
183
+ levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
184
+ mask[k, :] = True
185
+ else:
186
+ q = arg == np.asarray(levels_list).reshape(-1, 1)
187
+ mask[k, :] = np.any(q, axis=0)
188
+ qnz = q.T.nonzero()
189
+ inv[k, qnz[0]] = qnz[1]
190
+ actual_levels.append(levels_list)
191
+
192
+ mask_all = mask.all(axis=0)
193
+ indices = tuple(inv[:, mask_all])
194
+
195
+ if sparse:
196
+ count = coo_matrix((np.ones(len(indices[0]), dtype=int),
197
+ (indices[0], indices[1])))
198
+ count.sum_duplicates()
199
+ else:
200
+ shape = [len(u) for u in actual_levels]
201
+ count = np.zeros(shape, dtype=int)
202
+ np.add.at(count, indices, 1)
203
+
204
+ return CrosstabResult(actual_levels, count)
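A typical follow-up to the function above is a chi-squared test of independence on the resulting table. A brief sketch reusing the docstring's example data; interpreting the p-value is left to the caller.

    from scipy.stats import chi2_contingency
    from scipy.stats.contingency import crosstab

    a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
    x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']

    res = crosstab(a, x)
    chi2, p, dof, expected = chi2_contingency(res.count)
    print(chi2, p, dof)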
.venv/Lib/site-packages/scipy/stats/_discrete_distns.py ADDED
@@ -0,0 +1,1954 @@
1
+ #
2
+ # Author: Travis Oliphant 2002-2011 with contributions from
3
+ # SciPy Developers 2004-2011
4
+ #
5
+ from functools import partial
6
+
7
+ from scipy import special
8
+ from scipy.special import entr, logsumexp, betaln, gammaln as gamln, zeta
9
+ from scipy._lib._util import _lazywhere, rng_integers
10
+ from scipy.interpolate import interp1d
11
+
12
+ from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh
13
+
14
+ import numpy as np
15
+
16
+ from ._distn_infrastructure import (rv_discrete, get_distribution_names,
17
+ _check_shape, _ShapeInfo)
18
+ import scipy.stats._boost as _boost
19
+ from ._biasedurn import (_PyFishersNCHypergeometric,
20
+ _PyWalleniusNCHypergeometric,
21
+ _PyStochasticLib3)
22
+
23
+
24
+ def _isintegral(x):
25
+ return x == np.round(x)
26
+
27
+
28
+ class binom_gen(rv_discrete):
29
+ r"""A binomial discrete random variable.
30
+
31
+ %(before_notes)s
32
+
33
+ Notes
34
+ -----
35
+ The probability mass function for `binom` is:
36
+
37
+ .. math::
38
+
39
+ f(k) = \binom{n}{k} p^k (1-p)^{n-k}
40
+
41
+ for :math:`k \in \{0, 1, \dots, n\}`, :math:`0 \leq p \leq 1`
42
+
43
+ `binom` takes :math:`n` and :math:`p` as shape parameters,
44
+ where :math:`p` is the probability of a single success
45
+ and :math:`1-p` is the probability of a single failure.
46
+
47
+ %(after_notes)s
48
+
49
+ %(example)s
50
+
51
+ See Also
52
+ --------
53
+ hypergeom, nbinom, nhypergeom
54
+
55
+ """
56
+ def _shape_info(self):
57
+ return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
58
+ _ShapeInfo("p", False, (0, 1), (True, True))]
59
+
60
+ def _rvs(self, n, p, size=None, random_state=None):
61
+ return random_state.binomial(n, p, size)
62
+
63
+ def _argcheck(self, n, p):
64
+ return (n >= 0) & _isintegral(n) & (p >= 0) & (p <= 1)
65
+
66
+ def _get_support(self, n, p):
67
+ return self.a, n
68
+
69
+ def _logpmf(self, x, n, p):
70
+ k = floor(x)
71
+ combiln = (gamln(n+1) - (gamln(k+1) + gamln(n-k+1)))
72
+ return combiln + special.xlogy(k, p) + special.xlog1py(n-k, -p)
73
+
74
+ def _pmf(self, x, n, p):
75
+ # binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k)
76
+ return _boost._binom_pdf(x, n, p)
77
+
78
+ def _cdf(self, x, n, p):
79
+ k = floor(x)
80
+ return _boost._binom_cdf(k, n, p)
81
+
82
+ def _sf(self, x, n, p):
83
+ k = floor(x)
84
+ return _boost._binom_sf(k, n, p)
85
+
86
+ def _isf(self, x, n, p):
87
+ return _boost._binom_isf(x, n, p)
88
+
89
+ def _ppf(self, q, n, p):
90
+ return _boost._binom_ppf(q, n, p)
91
+
92
+ def _stats(self, n, p, moments='mv'):
93
+ mu = _boost._binom_mean(n, p)
94
+ var = _boost._binom_variance(n, p)
95
+ g1, g2 = None, None
96
+ if 's' in moments:
97
+ g1 = _boost._binom_skewness(n, p)
98
+ if 'k' in moments:
99
+ g2 = _boost._binom_kurtosis_excess(n, p)
100
+ return mu, var, g1, g2
101
+
102
+ def _entropy(self, n, p):
103
+ k = np.r_[0:n + 1]
104
+ vals = self._pmf(k, n, p)
105
+ return np.sum(entr(vals), axis=0)
106
+
107
+
108
+ binom = binom_gen(name='binom')
109
+
110
+
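A quick numerical check of the pmf formula stated in the notes above; the particular values of ``n``, ``p``, and ``k`` are arbitrary.

    import numpy as np
    from scipy.stats import binom
    from scipy.special import comb

    n, p, k = 10, 0.3, 4
    manual = comb(n, k) * p**k * (1 - p)**(n - k)   # the formula from the notes
    print(np.isclose(manual, binom.pmf(k, n, p)))   # True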
111
+ class bernoulli_gen(binom_gen):
112
+ r"""A Bernoulli discrete random variable.
113
+
114
+ %(before_notes)s
115
+
116
+ Notes
117
+ -----
118
+ The probability mass function for `bernoulli` is:
119
+
120
+ .. math::
121
+
122
+ f(k) = \begin{cases}1-p &\text{if } k = 0\\
123
+ p &\text{if } k = 1\end{cases}
124
+
125
+ for :math:`k` in :math:`\{0, 1\}`, :math:`0 \leq p \leq 1`
126
+
127
+ `bernoulli` takes :math:`p` as shape parameter,
128
+ where :math:`p` is the probability of a single success
129
+ and :math:`1-p` is the probability of a single failure.
130
+
131
+ %(after_notes)s
132
+
133
+ %(example)s
134
+
135
+ """
136
+ def _shape_info(self):
137
+ return [_ShapeInfo("p", False, (0, 1), (True, True))]
138
+
139
+ def _rvs(self, p, size=None, random_state=None):
140
+ return binom_gen._rvs(self, 1, p, size=size, random_state=random_state)
141
+
142
+ def _argcheck(self, p):
143
+ return (p >= 0) & (p <= 1)
144
+
145
+ def _get_support(self, p):
146
+ # Overrides binom_gen._get_support.
147
+ return self.a, self.b
148
+
149
+ def _logpmf(self, x, p):
150
+ return binom._logpmf(x, 1, p)
151
+
152
+ def _pmf(self, x, p):
153
+ # bernoulli.pmf(k) = 1-p if k = 0
154
+ # = p if k = 1
155
+ return binom._pmf(x, 1, p)
156
+
157
+ def _cdf(self, x, p):
158
+ return binom._cdf(x, 1, p)
159
+
160
+ def _sf(self, x, p):
161
+ return binom._sf(x, 1, p)
162
+
163
+ def _isf(self, x, p):
164
+ return binom._isf(x, 1, p)
165
+
166
+ def _ppf(self, q, p):
167
+ return binom._ppf(q, 1, p)
168
+
169
+ def _stats(self, p):
170
+ return binom._stats(1, p)
171
+
172
+ def _entropy(self, p):
173
+ return entr(p) + entr(1-p)
174
+
175
+
176
+ bernoulli = bernoulli_gen(b=1, name='bernoulli')
177
+
178
+
179
+ class betabinom_gen(rv_discrete):
180
+ r"""A beta-binomial discrete random variable.
181
+
182
+ %(before_notes)s
183
+
184
+ Notes
185
+ -----
186
+ The beta-binomial distribution is a binomial distribution with a
187
+ probability of success `p` that follows a beta distribution.
188
+
189
+ The probability mass function for `betabinom` is:
190
+
191
+ .. math::
192
+
193
+ f(k) = \binom{n}{k} \frac{B(k + a, n - k + b)}{B(a, b)}
194
+
195
+ for :math:`k \in \{0, 1, \dots, n\}`, :math:`n \geq 0`, :math:`a > 0`,
196
+ :math:`b > 0`, where :math:`B(a, b)` is the beta function.
197
+
198
+ `betabinom` takes :math:`n`, :math:`a`, and :math:`b` as shape parameters.
199
+
200
+ References
201
+ ----------
202
+ .. [1] https://en.wikipedia.org/wiki/Beta-binomial_distribution
203
+
204
+ %(after_notes)s
205
+
206
+ .. versionadded:: 1.4.0
207
+
208
+ See Also
209
+ --------
210
+ beta, binom
211
+
212
+ %(example)s
213
+
214
+ """
215
+ def _shape_info(self):
216
+ return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
217
+ _ShapeInfo("a", False, (0, np.inf), (False, False)),
218
+ _ShapeInfo("b", False, (0, np.inf), (False, False))]
219
+
220
+ def _rvs(self, n, a, b, size=None, random_state=None):
221
+ p = random_state.beta(a, b, size)
222
+ return random_state.binomial(n, p, size)
223
+
224
+ def _get_support(self, n, a, b):
225
+ return 0, n
226
+
227
+ def _argcheck(self, n, a, b):
228
+ return (n >= 0) & _isintegral(n) & (a > 0) & (b > 0)
229
+
230
+ def _logpmf(self, x, n, a, b):
231
+ k = floor(x)
232
+ combiln = -log(n + 1) - betaln(n - k + 1, k + 1)
233
+ return combiln + betaln(k + a, n - k + b) - betaln(a, b)
234
+
235
+ def _pmf(self, x, n, a, b):
236
+ return exp(self._logpmf(x, n, a, b))
237
+
238
+ def _stats(self, n, a, b, moments='mv'):
239
+ e_p = a / (a + b)
240
+ e_q = 1 - e_p
241
+ mu = n * e_p
242
+ var = n * (a + b + n) * e_p * e_q / (a + b + 1)
243
+ g1, g2 = None, None
244
+ if 's' in moments:
245
+ g1 = 1.0 / sqrt(var)
246
+ g1 *= (a + b + 2 * n) * (b - a)
247
+ g1 /= (a + b + 2) * (a + b)
248
+ if 'k' in moments:
249
+ g2 = (a + b).astype(e_p.dtype)
250
+ g2 *= (a + b - 1 + 6 * n)
251
+ g2 += 3 * a * b * (n - 2)
252
+ g2 += 6 * n ** 2
253
+ g2 -= 3 * e_p * b * n * (6 - n)
254
+ g2 -= 18 * e_p * e_q * n ** 2
255
+ g2 *= (a + b) ** 2 * (1 + a + b)
256
+ g2 /= (n * a * b * (a + b + 2) * (a + b + 3) * (a + b + n))
257
+ g2 -= 3
258
+ return mu, var, g1, g2
259
+
260
+
261
+ betabinom = betabinom_gen(name='betabinom')
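+ # Editorial sketch (not part of the original module): the first two moments
+ # computed in ``_stats`` above correspond to mean = n*a/(a+b) and
+ # var = n*a*b*(a+b+n) / ((a+b)**2 * (a+b+1)); the parameter values below are
+ # arbitrary illustrative choices.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import betabinom
+ # >>> n, a, b = 12, 2.3, 0.63
+ # >>> mean, var = betabinom.stats(n, a, b, moments='mv')
+ # >>> np.allclose([mean, var],
+ # ...             [n*a/(a+b), n*a*b*(a+b+n) / ((a+b)**2 * (a+b+1))])
+ # True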
262
+
263
+
264
+ class nbinom_gen(rv_discrete):
265
+ r"""A negative binomial discrete random variable.
266
+
267
+ %(before_notes)s
268
+
269
+ Notes
270
+ -----
271
+ The negative binomial distribution describes a sequence of i.i.d. Bernoulli
272
+ trials, repeated until a predefined, non-random number of successes occurs.
273
+
274
+ The probability mass function of the number of failures for `nbinom` is:
275
+
276
+ .. math::
277
+
278
+ f(k) = \binom{k+n-1}{n-1} p^n (1-p)^k
279
+
280
+ for :math:`k \ge 0`, :math:`0 < p \leq 1`
281
+
282
+ `nbinom` takes :math:`n` and :math:`p` as shape parameters where :math:`n`
283
+ is the number of successes, :math:`p` is the probability of a single
284
+ success, and :math:`1-p` is the probability of a single failure.
285
+
286
+ Another common parameterization of the negative binomial distribution is
287
+ in terms of the mean number of failures :math:`\mu` to achieve :math:`n`
288
+ successes. The mean :math:`\mu` is related to the probability of success
289
+ as
290
+
291
+ .. math::
292
+
293
+ p = \frac{n}{n + \mu}
294
+
295
+ The number of successes :math:`n` may also be specified in terms of a
296
+ "dispersion", "heterogeneity", or "aggregation" parameter :math:`\alpha`,
297
+ which relates the mean :math:`\mu` to the variance :math:`\sigma^2`,
298
+ e.g. :math:`\sigma^2 = \mu + \alpha \mu^2`. Regardless of the convention
299
+ used for :math:`\alpha`,
300
+
301
+ .. math::
302
+
303
+ p &= \frac{\mu}{\sigma^2} \\
304
+ n &= \frac{\mu^2}{\sigma^2 - \mu}
305
+
306
+ %(after_notes)s
307
+
308
+ %(example)s
309
+
310
+ See Also
311
+ --------
312
+ hypergeom, binom, nhypergeom
313
+
314
+ """
315
+ def _shape_info(self):
316
+ return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
317
+ _ShapeInfo("p", False, (0, 1), (True, True))]
318
+
319
+ def _rvs(self, n, p, size=None, random_state=None):
320
+ return random_state.negative_binomial(n, p, size)
321
+
322
+ def _argcheck(self, n, p):
323
+ return (n > 0) & (p > 0) & (p <= 1)
324
+
325
+ def _pmf(self, x, n, p):
326
+ # nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k
327
+ return _boost._nbinom_pdf(x, n, p)
328
+
329
+ def _logpmf(self, x, n, p):
330
+ coeff = gamln(n+x) - gamln(x+1) - gamln(n)
331
+ return coeff + n*log(p) + special.xlog1py(x, -p)
332
+
333
+ def _cdf(self, x, n, p):
334
+ k = floor(x)
335
+ return _boost._nbinom_cdf(k, n, p)
336
+
337
+ def _logcdf(self, x, n, p):
338
+ k = floor(x)
339
+ k, n, p = np.broadcast_arrays(k, n, p)
340
+ cdf = self._cdf(k, n, p)
341
+ cond = cdf > 0.5
342
+ def f1(k, n, p):
343
+ return np.log1p(-special.betainc(k + 1, n, 1 - p))
344
+
345
+ # do calc in place
346
+ logcdf = cdf
347
+ with np.errstate(divide='ignore'):
348
+ logcdf[cond] = f1(k[cond], n[cond], p[cond])
349
+ logcdf[~cond] = np.log(cdf[~cond])
350
+ return logcdf
351
+
352
+ def _sf(self, x, n, p):
353
+ k = floor(x)
354
+ return _boost._nbinom_sf(k, n, p)
355
+
356
+ def _isf(self, x, n, p):
357
+ with np.errstate(over='ignore'): # see gh-17432
358
+ return _boost._nbinom_isf(x, n, p)
359
+
360
+ def _ppf(self, q, n, p):
361
+ with np.errstate(over='ignore'): # see gh-17432
362
+ return _boost._nbinom_ppf(q, n, p)
363
+
364
+ def _stats(self, n, p):
365
+ return (
366
+ _boost._nbinom_mean(n, p),
367
+ _boost._nbinom_variance(n, p),
368
+ _boost._nbinom_skewness(n, p),
369
+ _boost._nbinom_kurtosis_excess(n, p),
370
+ )
371
+
372
+
373
+ nbinom = nbinom_gen(name='nbinom')
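+ # Editorial sketch (not part of the original module): a worked example of the
+ # mean/dispersion parameterization described in the docstring, i.e.
+ # sigma**2 = mu + alpha*mu**2, n = mu**2/(sigma**2 - mu), p = mu/sigma**2.
+ # The numbers are arbitrary illustrative choices.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import nbinom
+ # >>> mu, alpha = 4.0, 0.5
+ # >>> var = mu + alpha * mu**2
+ # >>> n, p = mu**2 / (var - mu), mu / var
+ # >>> np.allclose(nbinom.stats(n, p, moments='mv'), (mu, var))
+ # True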
374
+
375
+
376
+ class betanbinom_gen(rv_discrete):
377
+ r"""A beta-negative-binomial discrete random variable.
378
+
379
+ %(before_notes)s
380
+
381
+ Notes
382
+ -----
383
+ The beta-negative-binomial distribution is a negative binomial
384
+ distribution with a probability of success `p` that follows a
385
+ beta distribution.
386
+
387
+ The probability mass function for `betanbinom` is:
388
+
389
+ .. math::
390
+
391
+ f(k) = \binom{n + k - 1}{k} \frac{B(a + n, b + k)}{B(a, b)}
392
+
393
+ for :math:`k \ge 0`, :math:`n \geq 0`, :math:`a > 0`,
394
+ :math:`b > 0`, where :math:`B(a, b)` is the beta function.
395
+
396
+ `betanbinom` takes :math:`n`, :math:`a`, and :math:`b` as shape parameters.
397
+
398
+ References
399
+ ----------
400
+ .. [1] https://en.wikipedia.org/wiki/Beta_negative_binomial_distribution
401
+
402
+ %(after_notes)s
403
+
404
+ .. versionadded:: 1.12.0
405
+
406
+ See Also
407
+ --------
408
+ betabinom : Beta binomial distribution
409
+
410
+ %(example)s
411
+
412
+ """
413
+ def _shape_info(self):
414
+ return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
415
+ _ShapeInfo("a", False, (0, np.inf), (False, False)),
416
+ _ShapeInfo("b", False, (0, np.inf), (False, False))]
417
+
418
+ def _rvs(self, n, a, b, size=None, random_state=None):
419
+ p = random_state.beta(a, b, size)
420
+ return random_state.negative_binomial(n, p, size)
421
+
422
+ def _argcheck(self, n, a, b):
423
+ return (n >= 0) & _isintegral(n) & (a > 0) & (b > 0)
424
+
425
+ def _logpmf(self, x, n, a, b):
426
+ k = floor(x)
427
+ combiln = -np.log(n + k) - betaln(n, k + 1)
428
+ return combiln + betaln(a + n, b + k) - betaln(a, b)
429
+
430
+ def _pmf(self, x, n, a, b):
431
+ return exp(self._logpmf(x, n, a, b))
432
+
433
+ def _stats(self, n, a, b, moments='mv'):
434
+ # reference: Wolfram Alpha input
435
+ # BetaNegativeBinomialDistribution[a, b, n]
436
+ def mean(n, a, b):
437
+ return n * b / (a - 1.)
438
+ mu = _lazywhere(a > 1, (n, a, b), f=mean, fillvalue=np.inf)
439
+ def var(n, a, b):
440
+ return (n * b * (n + a - 1.) * (a + b - 1.)
441
+ / ((a - 2.) * (a - 1.)**2.))
442
+ var = _lazywhere(a > 2, (n, a, b), f=var, fillvalue=np.inf)
443
+ g1, g2 = None, None
444
+ def skew(n, a, b):
445
+ return ((2 * n + a - 1.) * (2 * b + a - 1.)
446
+ / (a - 3.) / sqrt(n * b * (n + a - 1.) * (b + a - 1.)
447
+ / (a - 2.)))
448
+ if 's' in moments:
449
+ g1 = _lazywhere(a > 3, (n, a, b), f=skew, fillvalue=np.inf)
450
+ def kurtosis(n, a, b):
451
+ term = (a - 2.)
452
+ term_2 = ((a - 1.)**2. * (a**2. + a * (6 * b - 1.)
453
+ + 6. * (b - 1.) * b)
454
+ + 3. * n**2. * ((a + 5.) * b**2. + (a + 5.)
455
+ * (a - 1.) * b + 2. * (a - 1.)**2)
456
+ + 3 * (a - 1.) * n
457
+ * ((a + 5.) * b**2. + (a + 5.) * (a - 1.) * b
458
+ + 2. * (a - 1.)**2.))
459
+ denominator = ((a - 4.) * (a - 3.) * b * n
460
+ * (a + b - 1.) * (a + n - 1.))
461
+ # Wolfram Alpha uses Pearson kurtosis, so we subtract 3 to get
462
+ # scipy's Fisher kurtosis
463
+ return term * term_2 / denominator - 3.
464
+ if 'k' in moments:
465
+ g2 = _lazywhere(a > 4, (n, a, b), f=kurtosis, fillvalue=np.inf)
466
+ return mu, var, g1, g2
467
+
468
+
469
+ betanbinom = betanbinom_gen(name='betanbinom')
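+ # Editorial sketch (not part of the original module): for a > 1 the mean reduces
+ # to n*b/(a - 1), matching the ``_lazywhere`` branch in ``_stats`` above (for
+ # a <= 1 the mean is infinite). Parameter values are arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import betanbinom
+ # >>> n, a, b = 5, 9.3, 1.0
+ # >>> np.allclose(betanbinom.mean(n, a, b), n * b / (a - 1))
+ # True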
470
+
471
+
472
+ class geom_gen(rv_discrete):
473
+ r"""A geometric discrete random variable.
474
+
475
+ %(before_notes)s
476
+
477
+ Notes
478
+ -----
479
+ The probability mass function for `geom` is:
480
+
481
+ .. math::
482
+
483
+ f(k) = (1-p)^{k-1} p
484
+
485
+ for :math:`k \ge 1`, :math:`0 < p \leq 1`
486
+
487
+ `geom` takes :math:`p` as shape parameter,
488
+ where :math:`p` is the probability of a single success
489
+ and :math:`1-p` is the probability of a single failure.
490
+
491
+ %(after_notes)s
492
+
493
+ See Also
494
+ --------
495
+ planck
496
+
497
+ %(example)s
498
+
499
+ """
500
+
501
+ def _shape_info(self):
502
+ return [_ShapeInfo("p", False, (0, 1), (True, True))]
503
+
504
+ def _rvs(self, p, size=None, random_state=None):
505
+ return random_state.geometric(p, size=size)
506
+
507
+ def _argcheck(self, p):
508
+ return (p <= 1) & (p > 0)
509
+
510
+ def _pmf(self, k, p):
511
+ return np.power(1-p, k-1) * p
512
+
513
+ def _logpmf(self, k, p):
514
+ return special.xlog1py(k - 1, -p) + log(p)
515
+
516
+ def _cdf(self, x, p):
517
+ k = floor(x)
518
+ return -expm1(log1p(-p)*k)
519
+
520
+ def _sf(self, x, p):
521
+ return np.exp(self._logsf(x, p))
522
+
523
+ def _logsf(self, x, p):
524
+ k = floor(x)
525
+ return k*log1p(-p)
526
+
527
+ def _ppf(self, q, p):
528
+ vals = ceil(log1p(-q) / log1p(-p))
529
+ temp = self._cdf(vals-1, p)
530
+ return np.where((temp >= q) & (vals > 0), vals-1, vals)
531
+
532
+ def _stats(self, p):
533
+ mu = 1.0/p
534
+ qr = 1.0-p
535
+ var = qr / p / p
536
+ g1 = (2.0-p) / sqrt(qr)
537
+ g2 = np.polyval([1, -6, 6], p)/(1.0-p)
538
+ return mu, var, g1, g2
539
+
540
+ def _entropy(self, p):
541
+ return -np.log(p) - np.log1p(-p) * (1.0-p) / p
542
+
543
+
544
+ geom = geom_gen(a=1, name='geom', longname="A geometric")
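+ # Editorial sketch (not part of the original module): ``_cdf`` above evaluates
+ # -expm1(k*log1p(-p)), which is the numerically careful form of the textbook
+ # cdf 1 - (1-p)**k. Parameter values are arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import geom
+ # >>> p, k = 0.35, np.arange(1, 8)
+ # >>> np.allclose(geom.cdf(k, p), 1 - (1 - p)**k)
+ # True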
545
+
546
+
547
+ class hypergeom_gen(rv_discrete):
548
+ r"""A hypergeometric discrete random variable.
549
+
550
+ The hypergeometric distribution models drawing objects from a bin.
551
+ `M` is the total number of objects, `n` is total number of Type I objects.
552
+ The random variate represents the number of Type I objects in `N` drawn
553
+ without replacement from the total population.
554
+
555
+ %(before_notes)s
556
+
557
+ Notes
558
+ -----
559
+ The symbols used to denote the shape parameters (`M`, `n`, and `N`) are not
560
+ universally accepted. See the Examples for a clarification of the
561
+ definitions used here.
562
+
563
+ The probability mass function is defined as,
564
+
565
+ .. math:: p(k, M, n, N) = \frac{\binom{n}{k} \binom{M - n}{N - k}}
566
+ {\binom{M}{N}}
567
+
568
+ for :math:`k \in [\max(0, N - M + n), \min(n, N)]`, where the binomial
569
+ coefficients are defined as,
570
+
571
+ .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
572
+
573
+ %(after_notes)s
574
+
575
+ Examples
576
+ --------
577
+ >>> import numpy as np
578
+ >>> from scipy.stats import hypergeom
579
+ >>> import matplotlib.pyplot as plt
580
+
581
+ Suppose we have a collection of 20 animals, of which 7 are dogs. Then if
582
+ we want to know the probability of finding a given number of dogs if we
583
+ choose at random 12 of the 20 animals, we can initialize a frozen
584
+ distribution and plot the probability mass function:
585
+
586
+ >>> [M, n, N] = [20, 7, 12]
587
+ >>> rv = hypergeom(M, n, N)
588
+ >>> x = np.arange(0, n+1)
589
+ >>> pmf_dogs = rv.pmf(x)
590
+
591
+ >>> fig = plt.figure()
592
+ >>> ax = fig.add_subplot(111)
593
+ >>> ax.plot(x, pmf_dogs, 'bo')
594
+ >>> ax.vlines(x, 0, pmf_dogs, lw=2)
595
+ >>> ax.set_xlabel('# of dogs in our group of chosen animals')
596
+ >>> ax.set_ylabel('hypergeom PMF')
597
+ >>> plt.show()
598
+
599
+ Instead of using a frozen distribution we can also use `hypergeom`
600
+ methods directly. For example, to obtain the cumulative distribution
601
+ function, use:
602
+
603
+ >>> prb = hypergeom.cdf(x, M, n, N)
604
+
605
+ And to generate random numbers:
606
+
607
+ >>> R = hypergeom.rvs(M, n, N, size=10)
608
+
609
+ See Also
610
+ --------
611
+ nhypergeom, binom, nbinom
612
+
613
+ """
614
+ def _shape_info(self):
615
+ return [_ShapeInfo("M", True, (0, np.inf), (True, False)),
616
+ _ShapeInfo("n", True, (0, np.inf), (True, False)),
617
+ _ShapeInfo("N", True, (0, np.inf), (True, False))]
618
+
619
+ def _rvs(self, M, n, N, size=None, random_state=None):
620
+ return random_state.hypergeometric(n, M-n, N, size=size)
621
+
622
+ def _get_support(self, M, n, N):
623
+ return np.maximum(N-(M-n), 0), np.minimum(n, N)
624
+
625
+ def _argcheck(self, M, n, N):
626
+ cond = (M > 0) & (n >= 0) & (N >= 0)
627
+ cond &= (n <= M) & (N <= M)
628
+ cond &= _isintegral(M) & _isintegral(n) & _isintegral(N)
629
+ return cond
630
+
631
+ def _logpmf(self, k, M, n, N):
632
+ tot, good = M, n
633
+ bad = tot - good
634
+ result = (betaln(good+1, 1) + betaln(bad+1, 1) + betaln(tot-N+1, N+1) -
635
+ betaln(k+1, good-k+1) - betaln(N-k+1, bad-N+k+1) -
636
+ betaln(tot+1, 1))
637
+ return result
638
+
639
+ def _pmf(self, k, M, n, N):
640
+ return _boost._hypergeom_pdf(k, n, N, M)
641
+
642
+ def _cdf(self, k, M, n, N):
643
+ return _boost._hypergeom_cdf(k, n, N, M)
644
+
645
+ def _stats(self, M, n, N):
646
+ M, n, N = 1. * M, 1. * n, 1. * N
647
+ m = M - n
648
+
649
+ # Boost kurtosis_excess doesn't return the same as the value
650
+ # computed here.
651
+ g2 = M * (M + 1) - 6. * N * (M - N) - 6. * n * m
652
+ g2 *= (M - 1) * M * M
653
+ g2 += 6. * n * N * (M - N) * m * (5. * M - 6)
654
+ g2 /= n * N * (M - N) * m * (M - 2.) * (M - 3.)
655
+ return (
656
+ _boost._hypergeom_mean(n, N, M),
657
+ _boost._hypergeom_variance(n, N, M),
658
+ _boost._hypergeom_skewness(n, N, M),
659
+ g2,
660
+ )
661
+
662
+ def _entropy(self, M, n, N):
663
+ k = np.r_[N - (M - n):min(n, N) + 1]
664
+ vals = self.pmf(k, M, n, N)
665
+ return np.sum(entr(vals), axis=0)
666
+
667
+ def _sf(self, k, M, n, N):
668
+ return _boost._hypergeom_sf(k, n, N, M)
669
+
670
+ def _logsf(self, k, M, n, N):
671
+ res = []
672
+ for quant, tot, good, draw in zip(*np.broadcast_arrays(k, M, n, N)):
673
+ if (quant + 0.5) * (tot + 0.5) < (good - 0.5) * (draw - 0.5):
674
+ # Fewer terms to sum if we calculate log(1-cdf)
675
+ res.append(log1p(-exp(self.logcdf(quant, tot, good, draw))))
676
+ else:
677
+ # Integration over probability mass function using logsumexp
678
+ k2 = np.arange(quant + 1, draw + 1)
679
+ res.append(logsumexp(self._logpmf(k2, tot, good, draw)))
680
+ return np.asarray(res)
681
+
682
+ def _logcdf(self, k, M, n, N):
683
+ res = []
684
+ for quant, tot, good, draw in zip(*np.broadcast_arrays(k, M, n, N)):
685
+ if (quant + 0.5) * (tot + 0.5) > (good - 0.5) * (draw - 0.5):
686
+ # Fewer terms to sum if we calculate log(1-sf)
687
+ res.append(log1p(-exp(self.logsf(quant, tot, good, draw))))
688
+ else:
689
+ # Integration over probability mass function using logsumexp
690
+ k2 = np.arange(0, quant + 1)
691
+ res.append(logsumexp(self._logpmf(k2, tot, good, draw)))
692
+ return np.asarray(res)
693
+
694
+
695
+ hypergeom = hypergeom_gen(name='hypergeom')
696
+
697
+
698
+ class nhypergeom_gen(rv_discrete):
699
+ r"""A negative hypergeometric discrete random variable.
700
+
701
+ Consider a box containing :math:`M` balls: :math:`n` red and
702
+ :math:`M-n` blue. We randomly sample balls from the box, one
703
+ at a time and *without* replacement, until we have picked :math:`r`
704
+ blue balls. `nhypergeom` is the distribution of the number of
705
+ red balls :math:`k` we have picked.
706
+
707
+ %(before_notes)s
708
+
709
+ Notes
710
+ -----
711
+ The symbols used to denote the shape parameters (`M`, `n`, and `r`) are not
712
+ universally accepted. See the Examples for a clarification of the
713
+ definitions used here.
714
+
715
+ The probability mass function is defined as,
716
+
717
+ .. math:: f(k; M, n, r) = \frac{{{k+r-1}\choose{k}}{{M-r-k}\choose{n-k}}}
718
+ {{M \choose n}}
719
+
720
+ for :math:`k \in [0, n]`, :math:`n \in [0, M]`, :math:`r \in [0, M-n]`,
721
+ and the binomial coefficient is:
722
+
723
+ .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
724
+
725
+ It is equivalent to observing :math:`k` successes in :math:`k+r-1`
726
+ samples, with the :math:`(k+r)`-th sample being a failure. The former
727
+ can be modelled as a hypergeometric distribution. The probability
728
+ of the latter is simply the number of failures remaining
729
+ :math:`M-n-(r-1)` divided by the size of the remaining population
730
+ :math:`M-(k+r-1)`. This relationship can be shown as:
731
+
732
+ .. math:: NHG(k;M,n,r) = HG(k;M,n,k+r-1)\frac{(M-n-(r-1))}{(M-(k+r-1))}
733
+
734
+ where :math:`NHG` is probability mass function (PMF) of the
735
+ negative hypergeometric distribution and :math:`HG` is the
736
+ PMF of the hypergeometric distribution.
737
+
738
+ %(after_notes)s
739
+
740
+ Examples
741
+ --------
742
+ >>> import numpy as np
743
+ >>> from scipy.stats import nhypergeom
744
+ >>> import matplotlib.pyplot as plt
745
+
746
+ Suppose we have a collection of 20 animals, of which 7 are dogs.
747
+ Then if we want to know the probability of finding a given number
748
+ of dogs (successes) in a sample with exactly 12 animals that
749
+ aren't dogs (failures), we can initialize a frozen distribution
750
+ and plot the probability mass function:
751
+
752
+ >>> M, n, r = [20, 7, 12]
753
+ >>> rv = nhypergeom(M, n, r)
754
+ >>> x = np.arange(0, n+2)
755
+ >>> pmf_dogs = rv.pmf(x)
756
+
757
+ >>> fig = plt.figure()
758
+ >>> ax = fig.add_subplot(111)
759
+ >>> ax.plot(x, pmf_dogs, 'bo')
760
+ >>> ax.vlines(x, 0, pmf_dogs, lw=2)
761
+ >>> ax.set_xlabel('# of dogs in our group with given 12 failures')
762
+ >>> ax.set_ylabel('nhypergeom PMF')
763
+ >>> plt.show()
764
+
765
+ Instead of using a frozen distribution we can also use `nhypergeom`
766
+ methods directly. For example, to obtain the probability mass
767
+ function, use:
768
+
769
+ >>> prb = nhypergeom.pmf(x, M, n, r)
770
+
771
+ And to generate random numbers:
772
+
773
+ >>> R = nhypergeom.rvs(M, n, r, size=10)
774
+
775
+ To verify the relationship between `hypergeom` and `nhypergeom`, use:
776
+
777
+ >>> from scipy.stats import hypergeom, nhypergeom
778
+ >>> M, n, r = 45, 13, 8
779
+ >>> k = 6
780
+ >>> nhypergeom.pmf(k, M, n, r)
781
+ 0.06180776620271643
782
+ >>> hypergeom.pmf(k, M, n, k+r-1) * (M - n - (r-1)) / (M - (k+r-1))
783
+ 0.06180776620271644
784
+
785
+ See Also
786
+ --------
787
+ hypergeom, binom, nbinom
788
+
789
+ References
790
+ ----------
791
+ .. [1] Negative Hypergeometric Distribution on Wikipedia
792
+ https://en.wikipedia.org/wiki/Negative_hypergeometric_distribution
793
+
794
+ .. [2] Negative Hypergeometric Distribution from
795
+ http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Negativehypergeometric.pdf
796
+
797
+ """
798
+
799
+ def _shape_info(self):
800
+ return [_ShapeInfo("M", True, (0, np.inf), (True, False)),
801
+ _ShapeInfo("n", True, (0, np.inf), (True, False)),
802
+ _ShapeInfo("r", True, (0, np.inf), (True, False))]
803
+
804
+ def _get_support(self, M, n, r):
805
+ return 0, n
806
+
807
+ def _argcheck(self, M, n, r):
808
+ cond = (n >= 0) & (n <= M) & (r >= 0) & (r <= M-n)
809
+ cond &= _isintegral(M) & _isintegral(n) & _isintegral(r)
810
+ return cond
811
+
812
+ def _rvs(self, M, n, r, size=None, random_state=None):
813
+
814
+ @_vectorize_rvs_over_shapes
815
+ def _rvs1(M, n, r, size, random_state):
816
+ # invert cdf by calculating all values in support, scalar M, n, r
817
+ a, b = self.support(M, n, r)
818
+ ks = np.arange(a, b+1)
819
+ cdf = self.cdf(ks, M, n, r)
820
+ ppf = interp1d(cdf, ks, kind='next', fill_value='extrapolate')
821
+ rvs = ppf(random_state.uniform(size=size)).astype(int)
822
+ if size is None:
823
+ return rvs.item()
824
+ return rvs
825
+
826
+ return _rvs1(M, n, r, size=size, random_state=random_state)
827
+
828
+ def _logpmf(self, k, M, n, r):
829
+ cond = ((r == 0) & (k == 0))
830
+ result = _lazywhere(~cond, (k, M, n, r),
831
+ lambda k, M, n, r:
832
+ (-betaln(k+1, r) + betaln(k+r, 1) -
833
+ betaln(n-k+1, M-r-n+1) + betaln(M-r-k+1, 1) +
834
+ betaln(n+1, M-n+1) - betaln(M+1, 1)),
835
+ fillvalue=0.0)
836
+ return result
837
+
838
+ def _pmf(self, k, M, n, r):
839
+ # same as the following but numerically more precise
840
+ # return comb(k+r-1, k) * comb(M-r-k, n-k) / comb(M, n)
841
+ return exp(self._logpmf(k, M, n, r))
842
+
843
+ def _stats(self, M, n, r):
844
+ # Promote the datatype to at least float
845
+ # mu = rn / (M-n+1)
846
+ M, n, r = 1.*M, 1.*n, 1.*r
847
+ mu = r*n / (M-n+1)
848
+
849
+ var = r*(M+1)*n / ((M-n+1)*(M-n+2)) * (1 - r / (M-n+1))
850
+
851
+ # The skew and kurtosis are mathematically
852
+ # intractable so return `None`. See [2]_.
853
+ g1, g2 = None, None
854
+ return mu, var, g1, g2
855
+
856
+
857
+ nhypergeom = nhypergeom_gen(name='nhypergeom')
858
+
859
+
860
+ # FIXME: Fails _cdfvec
861
+ class logser_gen(rv_discrete):
862
+ r"""A Logarithmic (Log-Series, Series) discrete random variable.
863
+
864
+ %(before_notes)s
865
+
866
+ Notes
867
+ -----
868
+ The probability mass function for `logser` is:
869
+
870
+ .. math::
871
+
872
+ f(k) = - \frac{p^k}{k \log(1-p)}
873
+
874
+ for :math:`k \ge 1`, :math:`0 < p < 1`
875
+
876
+ `logser` takes :math:`p` as shape parameter,
877
+ where :math:`p` is the probability of a single success
878
+ and :math:`1-p` is the probability of a single failure.
879
+
880
+ %(after_notes)s
881
+
882
+ %(example)s
883
+
884
+ """
885
+
886
+ def _shape_info(self):
887
+ return [_ShapeInfo("p", False, (0, 1), (True, True))]
888
+
889
+ def _rvs(self, p, size=None, random_state=None):
890
+ # looks wrong for p>0.5, too few k=1
891
+ # trying to use generic is worse, no k=1 at all
892
+ return random_state.logseries(p, size=size)
893
+
894
+ def _argcheck(self, p):
895
+ return (p > 0) & (p < 1)
896
+
897
+ def _pmf(self, k, p):
898
+ # logser.pmf(k) = - p**k / (k*log(1-p))
899
+ return -np.power(p, k) * 1.0 / k / special.log1p(-p)
900
+
901
+ def _stats(self, p):
902
+ r = special.log1p(-p)
903
+ mu = p / (p - 1.0) / r
904
+ mu2p = -p / r / (p - 1.0)**2
905
+ var = mu2p - mu*mu
906
+ mu3p = -p / r * (1.0+p) / (1.0 - p)**3
907
+ mu3 = mu3p - 3*mu*mu2p + 2*mu**3
908
+ g1 = mu3 / np.power(var, 1.5)
909
+
910
+ mu4p = -p / r * (
911
+ 1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4)
912
+ mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4
913
+ g2 = mu4 / var**2 - 3.0
914
+ return mu, var, g1, g2
915
+
916
+
917
+ logser = logser_gen(a=1, name='logser', longname='A logarithmic')
918
+
919
+
920
+ class poisson_gen(rv_discrete):
921
+ r"""A Poisson discrete random variable.
922
+
923
+ %(before_notes)s
924
+
925
+ Notes
926
+ -----
927
+ The probability mass function for `poisson` is:
928
+
929
+ .. math::
930
+
931
+ f(k) = \exp(-\mu) \frac{\mu^k}{k!}
932
+
933
+ for :math:`k \ge 0`.
934
+
935
+ `poisson` takes :math:`\mu \geq 0` as shape parameter.
936
+ When :math:`\mu = 0`, the ``pmf`` method
937
+ returns ``1.0`` at quantile :math:`k = 0`.
938
+
939
+ %(after_notes)s
940
+
941
+ %(example)s
942
+
943
+ """
944
+
945
+ def _shape_info(self):
946
+ return [_ShapeInfo("mu", False, (0, np.inf), (True, False))]
947
+
948
+ # Override rv_discrete._argcheck to allow mu=0.
949
+ def _argcheck(self, mu):
950
+ return mu >= 0
951
+
952
+ def _rvs(self, mu, size=None, random_state=None):
953
+ return random_state.poisson(mu, size)
954
+
955
+ def _logpmf(self, k, mu):
956
+ Pk = special.xlogy(k, mu) - gamln(k + 1) - mu
957
+ return Pk
958
+
959
+ def _pmf(self, k, mu):
960
+ # poisson.pmf(k) = exp(-mu) * mu**k / k!
961
+ return exp(self._logpmf(k, mu))
962
+
963
+ def _cdf(self, x, mu):
964
+ k = floor(x)
965
+ return special.pdtr(k, mu)
966
+
967
+ def _sf(self, x, mu):
968
+ k = floor(x)
969
+ return special.pdtrc(k, mu)
970
+
971
+ def _ppf(self, q, mu):
972
+ vals = ceil(special.pdtrik(q, mu))
973
+ vals1 = np.maximum(vals - 1, 0)
974
+ temp = special.pdtr(vals1, mu)
975
+ return np.where(temp >= q, vals1, vals)
976
+
977
+ def _stats(self, mu):
978
+ var = mu
979
+ tmp = np.asarray(mu)
980
+ mu_nonzero = tmp > 0
981
+ g1 = _lazywhere(mu_nonzero, (tmp,), lambda x: sqrt(1.0/x), np.inf)
982
+ g2 = _lazywhere(mu_nonzero, (tmp,), lambda x: 1.0/x, np.inf)
983
+ return mu, var, g1, g2
984
+
985
+
986
+ poisson = poisson_gen(name="poisson", longname='A Poisson')
987
+
988
+
989
+ class planck_gen(rv_discrete):
990
+ r"""A Planck discrete exponential random variable.
991
+
992
+ %(before_notes)s
993
+
994
+ Notes
995
+ -----
996
+ The probability mass function for `planck` is:
997
+
998
+ .. math::
999
+
1000
+ f(k) = (1-\exp(-\lambda)) \exp(-\lambda k)
1001
+
1002
+ for :math:`k \ge 0` and :math:`\lambda > 0`.
1003
+
1004
+ `planck` takes :math:`\lambda` as shape parameter. The Planck distribution
1005
+ can be written as a geometric distribution (`geom`) with
1006
+ :math:`p = 1 - \exp(-\lambda)` shifted by ``loc = -1``.
1007
+
1008
+ %(after_notes)s
1009
+
1010
+ See Also
1011
+ --------
1012
+ geom
1013
+
1014
+ %(example)s
1015
+
1016
+ """
1017
+ def _shape_info(self):
1018
+ return [_ShapeInfo("lambda", False, (0, np.inf), (False, False))]
1019
+
1020
+ def _argcheck(self, lambda_):
1021
+ return lambda_ > 0
1022
+
1023
+ def _pmf(self, k, lambda_):
1024
+ return -expm1(-lambda_)*exp(-lambda_*k)
1025
+
1026
+ def _cdf(self, x, lambda_):
1027
+ k = floor(x)
1028
+ return -expm1(-lambda_*(k+1))
1029
+
1030
+ def _sf(self, x, lambda_):
1031
+ return exp(self._logsf(x, lambda_))
1032
+
1033
+ def _logsf(self, x, lambda_):
1034
+ k = floor(x)
1035
+ return -lambda_*(k+1)
1036
+
1037
+ def _ppf(self, q, lambda_):
1038
+ vals = ceil(-1.0/lambda_ * log1p(-q)-1)
1039
+ vals1 = (vals-1).clip(*(self._get_support(lambda_)))
1040
+ temp = self._cdf(vals1, lambda_)
1041
+ return np.where(temp >= q, vals1, vals)
1042
+
1043
+ def _rvs(self, lambda_, size=None, random_state=None):
1044
+ # use relation to geometric distribution for sampling
1045
+ p = -expm1(-lambda_)
1046
+ return random_state.geometric(p, size=size) - 1.0
1047
+
1048
+ def _stats(self, lambda_):
1049
+ mu = 1/expm1(lambda_)
1050
+ var = exp(-lambda_)/(expm1(-lambda_))**2
1051
+ g1 = 2*cosh(lambda_/2.0)
1052
+ g2 = 4+2*cosh(lambda_)
1053
+ return mu, var, g1, g2
1054
+
1055
+ def _entropy(self, lambda_):
1056
+ C = -expm1(-lambda_)
1057
+ return lambda_*exp(-lambda_)/C - log(C)
1058
+
1059
+
1060
+ planck = planck_gen(a=0, name='planck', longname='A discrete exponential ')
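+ # Editorial sketch (not part of the original module): numerical check of the
+ # docstring's claim that `planck` equals `geom` with p = 1 - exp(-lambda)
+ # shifted by loc = -1. The lambda value is arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import planck, geom
+ # >>> lam = 0.7
+ # >>> k = np.arange(10)
+ # >>> np.allclose(planck.pmf(k, lam), geom.pmf(k, 1 - np.exp(-lam), loc=-1))
+ # True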
1061
+
1062
+
1063
+ class boltzmann_gen(rv_discrete):
1064
+ r"""A Boltzmann (Truncated Discrete Exponential) random variable.
1065
+
1066
+ %(before_notes)s
1067
+
1068
+ Notes
1069
+ -----
1070
+ The probability mass function for `boltzmann` is:
1071
+
1072
+ .. math::
1073
+
1074
+ f(k) = (1-\exp(-\lambda)) \exp(-\lambda k) / (1-\exp(-\lambda N))
1075
+
1076
+ for :math:`k = 0,..., N-1`.
1077
+
1078
+ `boltzmann` takes :math:`\lambda > 0` and :math:`N > 0` as shape parameters.
1079
+
1080
+ %(after_notes)s
1081
+
1082
+ %(example)s
1083
+
1084
+ """
1085
+ def _shape_info(self):
1086
+ return [_ShapeInfo("lambda_", False, (0, np.inf), (False, False)),
1087
+ _ShapeInfo("N", True, (0, np.inf), (False, False))]
1088
+
1089
+ def _argcheck(self, lambda_, N):
1090
+ return (lambda_ > 0) & (N > 0) & _isintegral(N)
1091
+
1092
+ def _get_support(self, lambda_, N):
1093
+ return self.a, N - 1
1094
+
1095
+ def _pmf(self, k, lambda_, N):
1096
+ # boltzmann.pmf(k) =
1097
+ # (1-exp(-lambda_)) * exp(-lambda_*k) / (1-exp(-lambda_*N))
1098
+ fact = (1-exp(-lambda_))/(1-exp(-lambda_*N))
1099
+ return fact*exp(-lambda_*k)
1100
+
1101
+ def _cdf(self, x, lambda_, N):
1102
+ k = floor(x)
1103
+ return (1-exp(-lambda_*(k+1)))/(1-exp(-lambda_*N))
1104
+
1105
+ def _ppf(self, q, lambda_, N):
1106
+ qnew = q*(1-exp(-lambda_*N))
1107
+ vals = ceil(-1.0/lambda_ * log(1-qnew)-1)
1108
+ vals1 = (vals-1).clip(0.0, np.inf)
1109
+ temp = self._cdf(vals1, lambda_, N)
1110
+ return np.where(temp >= q, vals1, vals)
1111
+
1112
+ def _stats(self, lambda_, N):
1113
+ z = exp(-lambda_)
1114
+ zN = exp(-lambda_*N)
1115
+ mu = z/(1.0-z)-N*zN/(1-zN)
1116
+ var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2
1117
+ trm = (1-zN)/(1-z)
1118
+ trm2 = (z*trm**2 - N*N*zN)
1119
+ g1 = z*(1+z)*trm**3 - N**3*zN*(1+zN)
1120
+ g1 = g1 / trm2**(1.5)
1121
+ g2 = z*(1+4*z+z*z)*trm**4 - N**4 * zN*(1+4*zN+zN*zN)
1122
+ g2 = g2 / trm2 / trm2
1123
+ return mu, var, g1, g2
1124
+
1125
+
1126
+ boltzmann = boltzmann_gen(name='boltzmann', a=0,
1127
+ longname='A truncated discrete exponential ')
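+ # Editorial sketch (not part of the original module): the truncated pmf above
+ # is normalized over its finite support {0, ..., N-1}. Parameter values are
+ # arbitrary illustrative choices.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import boltzmann
+ # >>> lam, N = 1.4, 19
+ # >>> np.allclose(boltzmann.pmf(np.arange(N), lam, N).sum(), 1.0)
+ # True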
1128
+
1129
+
1130
+ class randint_gen(rv_discrete):
1131
+ r"""A uniform discrete random variable.
1132
+
1133
+ %(before_notes)s
1134
+
1135
+ Notes
1136
+ -----
1137
+ The probability mass function for `randint` is:
1138
+
1139
+ .. math::
1140
+
1141
+ f(k) = \frac{1}{\texttt{high} - \texttt{low}}
1142
+
1143
+ for :math:`k \in \{\texttt{low}, \dots, \texttt{high} - 1\}`.
1144
+
1145
+ `randint` takes :math:`\texttt{low}` and :math:`\texttt{high}` as shape
1146
+ parameters.
1147
+
1148
+ %(after_notes)s
1149
+
1150
+ Examples
1151
+ --------
1152
+ >>> import numpy as np
1153
+ >>> from scipy.stats import randint
1154
+ >>> import matplotlib.pyplot as plt
1155
+ >>> fig, ax = plt.subplots(1, 1)
1156
+
1157
+ Calculate the first four moments:
1158
+
1159
+ >>> low, high = 7, 31
1160
+ >>> mean, var, skew, kurt = randint.stats(low, high, moments='mvsk')
1161
+
1162
+ Display the probability mass function (``pmf``):
1163
+
1164
+ >>> x = np.arange(low - 5, high + 5)
1165
+ >>> ax.plot(x, randint.pmf(x, low, high), 'bo', ms=8, label='randint pmf')
1166
+ >>> ax.vlines(x, 0, randint.pmf(x, low, high), colors='b', lw=5, alpha=0.5)
1167
+
1168
+ Alternatively, the distribution object can be called (as a function) to
1169
+ fix the shape and location. This returns a "frozen" RV object holding the
1170
+ given parameters fixed.
1171
+
1172
+ Freeze the distribution and display the frozen ``pmf``:
1173
+
1174
+ >>> rv = randint(low, high)
1175
+ >>> ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-',
1176
+ ... lw=1, label='frozen pmf')
1177
+ >>> ax.legend(loc='lower center')
1178
+ >>> plt.show()
1179
+
1180
+ Check the relationship between the cumulative distribution function
1181
+ (``cdf``) and its inverse, the percent point function (``ppf``):
1182
+
1183
+ >>> q = np.arange(low, high)
1184
+ >>> p = randint.cdf(q, low, high)
1185
+ >>> np.allclose(q, randint.ppf(p, low, high))
1186
+ True
1187
+
1188
+ Generate random numbers:
1189
+
1190
+ >>> r = randint.rvs(low, high, size=1000)
1191
+
1192
+ """
1193
+
1194
+ def _shape_info(self):
1195
+ return [_ShapeInfo("low", True, (-np.inf, np.inf), (False, False)),
1196
+ _ShapeInfo("high", True, (-np.inf, np.inf), (False, False))]
1197
+
1198
+ def _argcheck(self, low, high):
1199
+ return (high > low) & _isintegral(low) & _isintegral(high)
1200
+
1201
+ def _get_support(self, low, high):
1202
+ return low, high-1
1203
+
1204
+ def _pmf(self, k, low, high):
1205
+ # randint.pmf(k) = 1./(high - low)
1206
+ p = np.ones_like(k) / (high - low)
1207
+ return np.where((k >= low) & (k < high), p, 0.)
1208
+
1209
+ def _cdf(self, x, low, high):
1210
+ k = floor(x)
1211
+ return (k - low + 1.) / (high - low)
1212
+
1213
+ def _ppf(self, q, low, high):
1214
+ vals = ceil(q * (high - low) + low) - 1
1215
+ vals1 = (vals - 1).clip(low, high)
1216
+ temp = self._cdf(vals1, low, high)
1217
+ return np.where(temp >= q, vals1, vals)
1218
+
1219
+ def _stats(self, low, high):
1220
+ m2, m1 = np.asarray(high), np.asarray(low)
1221
+ mu = (m2 + m1 - 1.0) / 2
1222
+ d = m2 - m1
1223
+ var = (d*d - 1) / 12.0
1224
+ g1 = 0.0
1225
+ g2 = -6.0/5.0 * (d*d + 1.0) / (d*d - 1.0)
1226
+ return mu, var, g1, g2
1227
+
1228
+ def _rvs(self, low, high, size=None, random_state=None):
1229
+ """An array of *size* random integers >= ``low`` and < ``high``."""
1230
+ if np.asarray(low).size == 1 and np.asarray(high).size == 1:
1231
+ # no need to vectorize in that case
1232
+ return rng_integers(random_state, low, high, size=size)
1233
+
1234
+ if size is not None:
1235
+ # NumPy's RandomState.randint() doesn't broadcast its arguments.
1236
+ # Use `broadcast_to()` to extend the shapes of low and high
1237
+ # up to size. Then we can use the numpy.vectorize'd
1238
+ # randint without needing to pass it a `size` argument.
1239
+ low = np.broadcast_to(low, size)
1240
+ high = np.broadcast_to(high, size)
1241
+ randint = np.vectorize(partial(rng_integers, random_state),
1242
+ otypes=[np.dtype(int)])
1243
+ return randint(low, high)
1244
+
1245
+ def _entropy(self, low, high):
1246
+ return log(high - low)
1247
+
1248
+
1249
+ randint = randint_gen(name='randint', longname='A discrete uniform '
1250
+ '(random integer)')
1251
+
1252
+
1253
+ # FIXME: problems sampling.
1254
+ class zipf_gen(rv_discrete):
1255
+ r"""A Zipf (Zeta) discrete random variable.
1256
+
1257
+ %(before_notes)s
1258
+
1259
+ See Also
1260
+ --------
1261
+ zipfian
1262
+
1263
+ Notes
1264
+ -----
1265
+ The probability mass function for `zipf` is:
1266
+
1267
+ .. math::
1268
+
1269
+ f(k, a) = \frac{1}{\zeta(a) k^a}
1270
+
1271
+ for :math:`k \ge 1`, :math:`a > 1`.
1272
+
1273
+ `zipf` takes :math:`a > 1` as shape parameter. :math:`\zeta` is the
1274
+ Riemann zeta function (`scipy.special.zeta`)
1275
+
1276
+ The Zipf distribution is also known as the zeta distribution, which is
1277
+ a special case of the Zipfian distribution (`zipfian`).
1278
+
1279
+ %(after_notes)s
1280
+
1281
+ References
1282
+ ----------
1283
+ .. [1] "Zeta Distribution", Wikipedia,
1284
+ https://en.wikipedia.org/wiki/Zeta_distribution
1285
+
1286
+ %(example)s
1287
+
1288
+ Confirm that `zipf` is the large `n` limit of `zipfian`.
1289
+
1290
+ >>> import numpy as np
1291
+ >>> from scipy.stats import zipf, zipfian
1292
+ >>> k = np.arange(11)
1293
+ >>> np.allclose(zipf.pmf(k, a), zipfian.pmf(k, a, n=10000000))
1294
+ True
1295
+
1296
+ """
1297
+
1298
+ def _shape_info(self):
1299
+ return [_ShapeInfo("a", False, (1, np.inf), (False, False))]
1300
+
1301
+ def _rvs(self, a, size=None, random_state=None):
1302
+ return random_state.zipf(a, size=size)
1303
+
1304
+ def _argcheck(self, a):
1305
+ return a > 1
1306
+
1307
+ def _pmf(self, k, a):
1308
+ k = k.astype(np.float64)
1309
+ # zipf.pmf(k, a) = 1/(zeta(a) * k**a)
1310
+ Pk = 1.0 / special.zeta(a, 1) * k**-a
1311
+ return Pk
1312
+
1313
+ def _munp(self, n, a):
1314
+ return _lazywhere(
1315
+ a > n + 1, (a, n),
1316
+ lambda a, n: special.zeta(a - n, 1) / special.zeta(a, 1),
1317
+ np.inf)
1318
+
1319
+
1320
+ zipf = zipf_gen(a=1, name='zipf', longname='A Zipf')
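+ # Editorial sketch (not part of the original module): the pmf above is
+ # 1/(zeta(a) * k**a), which can be cross-checked against
+ # ``scipy.special.zeta``. Parameter values are arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import zipf
+ # >>> from scipy.special import zeta
+ # >>> a, k = 6.6, np.arange(1, 10)
+ # >>> np.allclose(zipf.pmf(k, a), 1 / (zeta(a) * k**a))
+ # True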
1321
+
1322
+
1323
+ def _gen_harmonic_gt1(n, a):
1324
+ """Generalized harmonic number, a > 1"""
1325
+ # See https://en.wikipedia.org/wiki/Harmonic_number; search for "hurwitz"
1326
+ return zeta(a, 1) - zeta(a, n+1)
1327
+
1328
+
1329
+ def _gen_harmonic_leq1(n, a):
1330
+ """Generalized harmonic number, a <= 1"""
1331
+ if not np.size(n):
1332
+ return n
1333
+ n_max = np.max(n) # loop starts at maximum of all n
1334
+ out = np.zeros_like(a, dtype=float)
1335
+ # add terms of harmonic series; starting from smallest to avoid roundoff
1336
+ for i in np.arange(n_max, 0, -1, dtype=float):
1337
+ mask = i <= n # don't add terms after nth
1338
+ out[mask] += 1/i**a[mask]
1339
+ return out
1340
+
1341
+
1342
+ def _gen_harmonic(n, a):
1343
+ """Generalized harmonic number"""
1344
+ n, a = np.broadcast_arrays(n, a)
1345
+ return _lazywhere(a > 1, (n, a),
1346
+ f=_gen_harmonic_gt1, f2=_gen_harmonic_leq1)
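+ # Editorial sketch (not part of the original module): the identity used by
+ # ``_gen_harmonic_gt1`` above, H_{n,a} = zeta(a, 1) - zeta(a, n+1), agrees with
+ # a direct sum of the generalized harmonic series. Values are arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.special import zeta
+ # >>> n, a = 10, 2.5
+ # >>> np.allclose(zeta(a, 1) - zeta(a, n + 1),
+ # ...             sum(1 / i**a for i in range(1, n + 1)))
+ # True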
1347
+
1348
+
1349
+ class zipfian_gen(rv_discrete):
1350
+ r"""A Zipfian discrete random variable.
1351
+
1352
+ %(before_notes)s
1353
+
1354
+ See Also
1355
+ --------
1356
+ zipf
1357
+
1358
+ Notes
1359
+ -----
1360
+ The probability mass function for `zipfian` is:
1361
+
1362
+ .. math::
1363
+
1364
+ f(k, a, n) = \frac{1}{H_{n,a} k^a}
1365
+
1366
+ for :math:`k \in \{1, 2, \dots, n-1, n\}`, :math:`a \ge 0`,
1367
+ :math:`n \in \{1, 2, 3, \dots\}`.
1368
+
1369
+ `zipfian` takes :math:`a` and :math:`n` as shape parameters.
1370
+ :math:`H_{n,a}` is the :math:`n`:sup:`th` generalized harmonic
1371
+ number of order :math:`a`.
1372
+
1373
+ The Zipfian distribution reduces to the Zipf (zeta) distribution as
1374
+ :math:`n \rightarrow \infty`.
1375
+
1376
+ %(after_notes)s
1377
+
1378
+ References
1379
+ ----------
1380
+ .. [1] "Zipf's Law", Wikipedia, https://en.wikipedia.org/wiki/Zipf's_law
1381
+ .. [2] Larry Leemis, "Zipf Distribution", Univariate Distribution
1382
+ Relationships. http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
1383
+
1384
+ %(example)s
1385
+
1386
+ Confirm that `zipfian` reduces to `zipf` for large `n`, `a > 1`.
1387
+
1388
+ >>> import numpy as np
1389
+ >>> from scipy.stats import zipf, zipfian
1390
+ >>> k = np.arange(11)
1391
+ >>> np.allclose(zipfian.pmf(k, a=3.5, n=10000000), zipf.pmf(k, a=3.5))
1392
+ True
1393
+
1394
+ """
1395
+
1396
+ def _shape_info(self):
1397
+ return [_ShapeInfo("a", False, (0, np.inf), (True, False)),
1398
+ _ShapeInfo("n", True, (0, np.inf), (False, False))]
1399
+
1400
+ def _argcheck(self, a, n):
1401
+ # we need np.asarray here because moment (maybe others) don't convert
1402
+ return (a >= 0) & (n > 0) & (n == np.asarray(n, dtype=int))
1403
+
1404
+ def _get_support(self, a, n):
1405
+ return 1, n
1406
+
1407
+ def _pmf(self, k, a, n):
1408
+ k = k.astype(np.float64)
1409
+ return 1.0 / _gen_harmonic(n, a) * k**-a
1410
+
1411
+ def _cdf(self, k, a, n):
1412
+ return _gen_harmonic(k, a) / _gen_harmonic(n, a)
1413
+
1414
+ def _sf(self, k, a, n):
1415
+ k = k + 1  # to match SciPy convention
1416
+ # see http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
1417
+ return ((k**a*(_gen_harmonic(n, a) - _gen_harmonic(k, a)) + 1)
1418
+ / (k**a*_gen_harmonic(n, a)))
1419
+
1420
+ def _stats(self, a, n):
1421
+ # see http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
1422
+ Hna = _gen_harmonic(n, a)
1423
+ Hna1 = _gen_harmonic(n, a-1)
1424
+ Hna2 = _gen_harmonic(n, a-2)
1425
+ Hna3 = _gen_harmonic(n, a-3)
1426
+ Hna4 = _gen_harmonic(n, a-4)
1427
+ mu1 = Hna1/Hna
1428
+ mu2n = (Hna2*Hna - Hna1**2)
1429
+ mu2d = Hna**2
1430
+ mu2 = mu2n / mu2d
1431
+ g1 = (Hna3/Hna - 3*Hna1*Hna2/Hna**2 + 2*Hna1**3/Hna**3)/mu2**(3/2)
1432
+ g2 = (Hna**3*Hna4 - 4*Hna**2*Hna1*Hna3 + 6*Hna*Hna1**2*Hna2
1433
+ - 3*Hna1**4) / mu2n**2
1434
+ g2 -= 3
1435
+ return mu1, mu2, g1, g2
1436
+
1437
+
1438
+ zipfian = zipfian_gen(a=1, name='zipfian', longname='A Zipfian')
1439
+
1440
+
1441
+ class dlaplace_gen(rv_discrete):
1442
+ r"""A Laplacian discrete random variable.
1443
+
1444
+ %(before_notes)s
1445
+
1446
+ Notes
1447
+ -----
1448
+ The probability mass function for `dlaplace` is:
1449
+
1450
+ .. math::
1451
+
1452
+ f(k) = \tanh(a/2) \exp(-a |k|)
1453
+
1454
+ for integers :math:`k` and :math:`a > 0`.
1455
+
1456
+ `dlaplace` takes :math:`a` as shape parameter.
1457
+
1458
+ %(after_notes)s
1459
+
1460
+ %(example)s
1461
+
1462
+ """
1463
+
1464
+ def _shape_info(self):
1465
+ return [_ShapeInfo("a", False, (0, np.inf), (False, False))]
1466
+
1467
+ def _pmf(self, k, a):
1468
+ # dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k))
1469
+ return tanh(a/2.0) * exp(-a * abs(k))
1470
+
1471
+ def _cdf(self, x, a):
1472
+ k = floor(x)
1473
+
1474
+ def f(k, a):
1475
+ return 1.0 - exp(-a * k) / (exp(a) + 1)
1476
+
1477
+ def f2(k, a):
1478
+ return exp(a * (k + 1)) / (exp(a) + 1)
1479
+
1480
+ return _lazywhere(k >= 0, (k, a), f=f, f2=f2)
1481
+
1482
+ def _ppf(self, q, a):
1483
+ const = 1 + exp(a)
1484
+ vals = ceil(np.where(q < 1.0 / (1 + exp(-a)),
1485
+ log(q*const) / a - 1,
1486
+ -log((1-q) * const) / a))
1487
+ vals1 = vals - 1
1488
+ return np.where(self._cdf(vals1, a) >= q, vals1, vals)
1489
+
1490
+ def _stats(self, a):
1491
+ ea = exp(a)
1492
+ mu2 = 2.*ea/(ea-1.)**2
1493
+ mu4 = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4
1494
+ return 0., mu2, 0., mu4/mu2**2 - 3.
1495
+
1496
+ def _entropy(self, a):
1497
+ return a / sinh(a) - log(tanh(a/2.0))
1498
+
1499
+ def _rvs(self, a, size=None, random_state=None):
1500
+ # The discrete Laplace is equivalent to the two-sided geometric
1501
+ # distribution with PMF:
1502
+ # f(k) = (1 - alpha)/(1 + alpha) * alpha^abs(k)
1503
+ # Reference:
1504
+ # https://www.sciencedirect.com/science/
1505
+ # article/abs/pii/S0378375804003519
1506
+ # Furthermore, the two-sided geometric distribution is
1507
+ # equivalent to the difference between two iid geometric
1508
+ # distributions.
1509
+ # Reference (page 179):
1510
+ # https://pdfs.semanticscholar.org/61b3/
1511
+ # b99f466815808fd0d03f5d2791eea8b541a1.pdf
1512
+ # Thus, we can leverage the following:
1513
+ # 1) alpha = e^-a
1514
+ # 2) probability_of_success = 1 - alpha (Bernoulli trial)
1515
+ probOfSuccess = -np.expm1(-np.asarray(a))
1516
+ x = random_state.geometric(probOfSuccess, size=size)
1517
+ y = random_state.geometric(probOfSuccess, size=size)
1518
+ return x - y
1519
+
1520
+
1521
+ dlaplace = dlaplace_gen(a=-np.inf,
1522
+ name='dlaplace', longname='A discrete Laplacian')
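+ # Editorial sketch (not part of the original module): consistency check for the
+ # sampling note in ``_rvs`` above, which draws the difference of two geometric
+ # variates with success probability p = 1 - exp(-a); the variance of that
+ # difference, 2*(1-p)/p**2, matches ``_stats``. The value of a is arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import dlaplace
+ # >>> a = 0.8
+ # >>> p = -np.expm1(-a)
+ # >>> np.allclose(dlaplace.var(a), 2 * (1 - p) / p**2)
+ # True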
1523
+
1524
+
1525
+ class skellam_gen(rv_discrete):
1526
+ r"""A Skellam discrete random variable.
1527
+
1528
+ %(before_notes)s
1529
+
1530
+ Notes
1531
+ -----
1532
+ Probability distribution of the difference of two correlated or
1533
+ uncorrelated Poisson random variables.
1534
+
1535
+ Let :math:`k_1` and :math:`k_2` be two Poisson-distributed r.v. with
1536
+ expected values :math:`\lambda_1` and :math:`\lambda_2`. Then,
1537
+ :math:`k_1 - k_2` follows a Skellam distribution with parameters
1538
+ :math:`\mu_1 = \lambda_1 - \rho \sqrt{\lambda_1 \lambda_2}` and
1539
+ :math:`\mu_2 = \lambda_2 - \rho \sqrt{\lambda_1 \lambda_2}`, where
1540
+ :math:`\rho` is the correlation coefficient between :math:`k_1` and
1541
+ :math:`k_2`. If the two Poisson-distributed r.v. are independent then
1542
+ :math:`\rho = 0`.
1543
+
1544
+ Parameters :math:`\mu_1` and :math:`\mu_2` must be strictly positive.
1545
+
1546
+ For details see: https://en.wikipedia.org/wiki/Skellam_distribution
1547
+
1548
+ `skellam` takes :math:`\mu_1` and :math:`\mu_2` as shape parameters.
1549
+
1550
+ %(after_notes)s
1551
+
1552
+ %(example)s
1553
+
1554
+ """
1555
+ def _shape_info(self):
1556
+ return [_ShapeInfo("mu1", False, (0, np.inf), (False, False)),
1557
+ _ShapeInfo("mu2", False, (0, np.inf), (False, False))]
1558
+
1559
+ def _rvs(self, mu1, mu2, size=None, random_state=None):
1560
+ n = size
1561
+ return (random_state.poisson(mu1, n) -
1562
+ random_state.poisson(mu2, n))
1563
+
1564
+ def _pmf(self, x, mu1, mu2):
1565
+ with np.errstate(over='ignore'): # see gh-17432
1566
+ px = np.where(x < 0,
1567
+ _boost._ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2,
1568
+ _boost._ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2)
1569
+ # ncx2.pdf() returns nan's for extremely low probabilities
1570
+ return px
1571
+
1572
+ def _cdf(self, x, mu1, mu2):
1573
+ x = floor(x)
1574
+ with np.errstate(over='ignore'): # see gh-17432
1575
+ px = np.where(x < 0,
1576
+ _boost._ncx2_cdf(2*mu2, -2*x, 2*mu1),
1577
+ 1 - _boost._ncx2_cdf(2*mu1, 2*(x+1), 2*mu2))
1578
+ return px
1579
+
1580
+ def _stats(self, mu1, mu2):
1581
+ mean = mu1 - mu2
1582
+ var = mu1 + mu2
1583
+ g1 = mean / sqrt((var)**3)
1584
+ g2 = 1 / var
1585
+ return mean, var, g1, g2
1586
+
1587
+
1588
+ skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam')
1589
+
1590
+
1591
+ class yulesimon_gen(rv_discrete):
1592
+ r"""A Yule-Simon discrete random variable.
1593
+
1594
+ %(before_notes)s
1595
+
1596
+ Notes
1597
+ -----
1598
+
1599
+ The probability mass function for `yulesimon` is:
1600
+
1601
+ .. math::
1602
+
1603
+ f(k) = \alpha B(k, \alpha+1)
1604
+
1605
+ for :math:`k=1,2,3,...`, where :math:`\alpha>0`.
1606
+ Here :math:`B` refers to the `scipy.special.beta` function.
1607
+
1608
+ The sampling of random variates is based on pg 553, Section 6.3 of [1]_.
1609
+ Our notation maps to the referenced logic via :math:`\alpha=a-1`.
1610
+
1611
+ For details see the wikipedia entry [2]_.
1612
+
1613
+ References
1614
+ ----------
1615
+ .. [1] Devroye, Luc. "Non-uniform Random Variate Generation",
1616
+ (1986) Springer, New York.
1617
+
1618
+ .. [2] https://en.wikipedia.org/wiki/Yule-Simon_distribution
1619
+
1620
+ %(after_notes)s
1621
+
1622
+ %(example)s
1623
+
1624
+ """
1625
+ def _shape_info(self):
1626
+ return [_ShapeInfo("alpha", False, (0, np.inf), (False, False))]
1627
+
1628
+ def _rvs(self, alpha, size=None, random_state=None):
1629
+ E1 = random_state.standard_exponential(size)
1630
+ E2 = random_state.standard_exponential(size)
1631
+ ans = ceil(-E1 / log1p(-exp(-E2 / alpha)))
1632
+ return ans
1633
+
1634
+ def _pmf(self, x, alpha):
1635
+ return alpha * special.beta(x, alpha + 1)
1636
+
1637
+ def _argcheck(self, alpha):
1638
+ return (alpha > 0)
1639
+
1640
+ def _logpmf(self, x, alpha):
1641
+ return log(alpha) + special.betaln(x, alpha + 1)
1642
+
1643
+ def _cdf(self, x, alpha):
1644
+ return 1 - x * special.beta(x, alpha + 1)
1645
+
1646
+ def _sf(self, x, alpha):
1647
+ return x * special.beta(x, alpha + 1)
1648
+
1649
+ def _logsf(self, x, alpha):
1650
+ return log(x) + special.betaln(x, alpha + 1)
1651
+
1652
+ def _stats(self, alpha):
1653
+ mu = np.where(alpha <= 1, np.inf, alpha / (alpha - 1))
1654
+ mu2 = np.where(alpha > 2,
1655
+ alpha**2 / ((alpha - 2.0) * (alpha - 1)**2),
1656
+ np.inf)
1657
+ mu2 = np.where(alpha <= 1, np.nan, mu2)
1658
+ g1 = np.where(alpha > 3,
1659
+ sqrt(alpha - 2) * (alpha + 1)**2 / (alpha * (alpha - 3)),
1660
+ np.inf)
1661
+ g1 = np.where(alpha <= 2, np.nan, g1)
1662
+ g2 = np.where(alpha > 4,
1663
+ alpha + 3 + ((11 * alpha**3 - 49 * alpha - 22) /
1664
+ (alpha * (alpha - 4) * (alpha - 3))),
1665
+ np.inf)
1666
+ g2 = np.where(alpha <= 2, np.nan, g2)
1667
+ return mu, mu2, g1, g2
1668
+
1669
+
1670
+ yulesimon = yulesimon_gen(name='yulesimon', a=1)
1671
+
1672
+
1673
+ def _vectorize_rvs_over_shapes(_rvs1):
1674
+ """Decorator that vectorizes _rvs method to work on ndarray shapes"""
1675
+ # _rvs1 must be a _function_ that accepts _scalar_ args as positional
1676
+ # arguments, `size` and `random_state` as keyword arguments.
1677
+ # _rvs1 must return a random variate array with shape `size`. If `size` is
1678
+ # None, _rvs1 must return a scalar.
1679
+ # When applied to _rvs1, this decorator broadcasts ndarray args
1680
+ # and loops over them, calling _rvs1 for each set of scalar args.
1681
+ # For usage example, see _nchypergeom_gen
1682
+ def _rvs(*args, size, random_state):
1683
+ _rvs1_size, _rvs1_indices = _check_shape(args[0].shape, size)
1684
+
1685
+ size = np.array(size)
1686
+ _rvs1_size = np.array(_rvs1_size)
1687
+ _rvs1_indices = np.array(_rvs1_indices)
1688
+
1689
+ if np.all(_rvs1_indices): # all args are scalars
1690
+ return _rvs1(*args, size, random_state)
1691
+
1692
+ out = np.empty(size)
1693
+
1694
+ # out.shape can mix dimensions associated with arg_shape and _rvs1_size
1695
+ # Sort them to arg_shape + _rvs1_size for easy indexing of dimensions
1696
+ # corresponding with the different sets of scalar args
1697
+ j0 = np.arange(out.ndim)
1698
+ j1 = np.hstack((j0[~_rvs1_indices], j0[_rvs1_indices]))
1699
+ out = np.moveaxis(out, j1, j0)
1700
+
1701
+ for i in np.ndindex(*size[~_rvs1_indices]):
1702
+ # arg can be squeezed because singleton dimensions will be
1703
+ # associated with _rvs1_size, not arg_shape per _check_shape
1704
+ out[i] = _rvs1(*[np.squeeze(arg)[i] for arg in args],
1705
+ _rvs1_size, random_state)
1706
+
1707
+ return np.moveaxis(out, j0, j1) # move axes back before returning
1708
+ return _rvs
1709
+
1710
+
1711
+ class _nchypergeom_gen(rv_discrete):
1712
+ r"""A noncentral hypergeometric discrete random variable.
1713
+
1714
+ For subclassing by nchypergeom_fisher_gen and nchypergeom_wallenius_gen.
1715
+
1716
+ """
1717
+
1718
+ rvs_name = None
1719
+ dist = None
1720
+
1721
+ def _shape_info(self):
1722
+ return [_ShapeInfo("M", True, (0, np.inf), (True, False)),
1723
+ _ShapeInfo("n", True, (0, np.inf), (True, False)),
1724
+ _ShapeInfo("N", True, (0, np.inf), (True, False)),
1725
+ _ShapeInfo("odds", False, (0, np.inf), (False, False))]
1726
+
1727
+ def _get_support(self, M, n, N, odds):
1728
+ N, m1, n = M, n, N # follow Wikipedia notation
1729
+ m2 = N - m1
1730
+ x_min = np.maximum(0, n - m2)
1731
+ x_max = np.minimum(n, m1)
1732
+ return x_min, x_max
1733
+
1734
+ def _argcheck(self, M, n, N, odds):
1735
+ M, n = np.asarray(M), np.asarray(n),
1736
+ N, odds = np.asarray(N), np.asarray(odds)
1737
+ cond1 = (M.astype(int) == M) & (M >= 0)
1738
+ cond2 = (n.astype(int) == n) & (n >= 0)
1739
+ cond3 = (N.astype(int) == N) & (N >= 0)
1740
+ cond4 = odds > 0
1741
+ cond5 = N <= M
1742
+ cond6 = n <= M
1743
+ return cond1 & cond2 & cond3 & cond4 & cond5 & cond6
1744
+
1745
+ def _rvs(self, M, n, N, odds, size=None, random_state=None):
1746
+
1747
+ @_vectorize_rvs_over_shapes
1748
+ def _rvs1(M, n, N, odds, size, random_state):
1749
+ length = np.prod(size)
1750
+ urn = _PyStochasticLib3()
1751
+ rv_gen = getattr(urn, self.rvs_name)
1752
+ rvs = rv_gen(N, n, M, odds, length, random_state)
1753
+ rvs = rvs.reshape(size)
1754
+ return rvs
1755
+
1756
+ return _rvs1(M, n, N, odds, size=size, random_state=random_state)
1757
+
1758
+ def _pmf(self, x, M, n, N, odds):
1759
+
1760
+ x, M, n, N, odds = np.broadcast_arrays(x, M, n, N, odds)
1761
+ if x.size == 0: # np.vectorize doesn't work with zero size input
1762
+ return np.empty_like(x)
1763
+
1764
+ @np.vectorize
1765
+ def _pmf1(x, M, n, N, odds):
1766
+ urn = self.dist(N, n, M, odds, 1e-12)
1767
+ return urn.probability(x)
1768
+
1769
+ return _pmf1(x, M, n, N, odds)
1770
+
1771
+ def _stats(self, M, n, N, odds, moments):
1772
+
1773
+ @np.vectorize
1774
+ def _moments1(M, n, N, odds):
1775
+ urn = self.dist(N, n, M, odds, 1e-12)
1776
+ return urn.moments()
1777
+
1778
+ m, v = (_moments1(M, n, N, odds) if ("m" in moments or "v" in moments)
1779
+ else (None, None))
1780
+ s, k = None, None
1781
+ return m, v, s, k
1782
+
1783
+
1784
+ class nchypergeom_fisher_gen(_nchypergeom_gen):
1785
+ r"""A Fisher's noncentral hypergeometric discrete random variable.
1786
+
1787
+ Fisher's noncentral hypergeometric distribution models drawing objects of
1788
+ two types from a bin. `M` is the total number of objects, `n` is the
1789
+ number of Type I objects, and `odds` is the odds ratio: the odds of
1790
+ selecting a Type I object rather than a Type II object when there is only
1791
+ one object of each type.
1792
+ The random variate represents the number of Type I objects drawn if we
1793
+ take a handful of objects from the bin at once and find out afterwards
1794
+ that we took `N` objects.
1795
+
1796
+ %(before_notes)s
1797
+
1798
+ See Also
1799
+ --------
1800
+ nchypergeom_wallenius, hypergeom, nhypergeom
1801
+
1802
+ Notes
1803
+ -----
1804
+ Let mathematical symbols :math:`N`, :math:`n`, and :math:`M` correspond
1805
+ with parameters `N`, `n`, and `M` (respectively) as defined above.
1806
+
1807
+ The probability mass function is defined as
1808
+
1809
+ .. math::
1810
+
1811
+ p(x; M, n, N, \omega) =
1812
+ \frac{\binom{n}{x}\binom{M - n}{N-x}\omega^x}{P_0},
1813
+
1814
+ for
1815
+ :math:`x \in [x_l, x_u]`,
1816
+ :math:`M \in {\mathbb N}`,
1817
+ :math:`n \in [0, M]`,
1818
+ :math:`N \in [0, M]`,
1819
+ :math:`\omega > 0`,
1820
+ where
1821
+ :math:`x_l = \max(0, N - (M - n))`,
1822
+ :math:`x_u = \min(N, n)`,
1823
+
1824
+ .. math::
1825
+
1826
+ P_0 = \sum_{y=x_l}^{x_u} \binom{n}{y}\binom{M - n}{N-y}\omega^y,
1827
+
1828
+ and the binomial coefficients are defined as
1829
+
1830
+ .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
1831
+
1832
+ `nchypergeom_fisher` uses the BiasedUrn package by Agner Fog with
1833
+ permission for it to be distributed under SciPy's license.
1834
+
1835
+ The symbols used to denote the shape parameters (`N`, `n`, and `M`) are not
1836
+ universally accepted; they are chosen for consistency with `hypergeom`.
1837
+
1838
+ Note that Fisher's noncentral hypergeometric distribution is distinct
1839
+ from Wallenius' noncentral hypergeometric distribution, which models
1840
+ drawing a pre-determined `N` objects from a bin one by one.
1841
+ When the odds ratio is unity, however, both distributions reduce to the
1842
+ ordinary hypergeometric distribution.
1843
+
1844
+ %(after_notes)s
1845
+
1846
+ References
1847
+ ----------
1848
+ .. [1] Agner Fog, "Biased Urn Theory".
1849
+ https://cran.r-project.org/web/packages/BiasedUrn/vignettes/UrnTheory.pdf
1850
+
1851
+ .. [2] "Fisher's noncentral hypergeometric distribution", Wikipedia,
1852
+ https://en.wikipedia.org/wiki/Fisher's_noncentral_hypergeometric_distribution
1853
+
1854
+ %(example)s
1855
+
1856
+ """
1857
+
1858
+ rvs_name = "rvs_fisher"
1859
+ dist = _PyFishersNCHypergeometric
1860
+
1861
+
1862
+ nchypergeom_fisher = nchypergeom_fisher_gen(
1863
+ name='nchypergeom_fisher',
1864
+ longname="A Fisher's noncentral hypergeometric")
1865
+
1866
+
1867
+ class nchypergeom_wallenius_gen(_nchypergeom_gen):
1868
+ r"""A Wallenius' noncentral hypergeometric discrete random variable.
1869
+
1870
+ Wallenius' noncentral hypergeometric distribution models drawing objects of
1871
+ two types from a bin. `M` is the total number of objects, `n` is the
1872
+ number of Type I objects, and `odds` is the odds ratio: the odds of
1873
+ selecting a Type I object rather than a Type II object when there is only
1874
+ one object of each type.
1875
+ The random variate represents the number of Type I objects drawn if we
1876
+ draw a pre-determined `N` objects from a bin one by one.
1877
+
1878
+ %(before_notes)s
1879
+
1880
+ See Also
1881
+ --------
1882
+ nchypergeom_fisher, hypergeom, nhypergeom
1883
+
1884
+ Notes
1885
+ -----
1886
+ Let mathematical symbols :math:`N`, :math:`n`, and :math:`M` correspond
1887
+ with parameters `N`, `n`, and `M` (respectively) as defined above.
1888
+
1889
+ The probability mass function is defined as
1890
+
1891
+ .. math::
1892
+
1893
+ p(x; N, n, M) = \binom{n}{x} \binom{M - n}{N-x}
1894
+ \int_0^1 \left(1-t^{\omega/D}\right)^x\left(1-t^{1/D}\right)^{N-x} dt
1895
+
1896
+ for
1897
+ :math:`x \in [x_l, x_u]`,
1898
+ :math:`M \in {\mathbb N}`,
1899
+ :math:`n \in [0, M]`,
1900
+ :math:`N \in [0, M]`,
1901
+ :math:`\omega > 0`,
1902
+ where
1903
+ :math:`x_l = \max(0, N - (M - n))`,
1904
+ :math:`x_u = \min(N, n)`,
1905
+
1906
+ .. math::
1907
+
1908
+ D = \omega(n - x) + ((M - n)-(N-x)),
1909
+
1910
+ and the binomial coefficients are defined as
1911
+
1912
+ .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
1913
+
1914
+ `nchypergeom_wallenius` uses the BiasedUrn package by Agner Fog with
1915
+ permission for it to be distributed under SciPy's license.
1916
+
1917
+ The symbols used to denote the shape parameters (`N`, `n`, and `M`) are not
1918
+ universally accepted; they are chosen for consistency with `hypergeom`.
1919
+
1920
+ Note that Wallenius' noncentral hypergeometric distribution is distinct
1921
+ from Fisher's noncentral hypergeometric distribution, which models
1922
+ taking a handful of objects from the bin at once, finding out afterwards
1923
+ that `N` objects were taken.
1924
+ When the odds ratio is unity, however, both distributions reduce to the
1925
+ ordinary hypergeometric distribution.
1926
+
1927
+ %(after_notes)s
1928
+
1929
+ References
1930
+ ----------
1931
+ .. [1] Agner Fog, "Biased Urn Theory".
1932
+ https://cran.r-project.org/web/packages/BiasedUrn/vignettes/UrnTheory.pdf
1933
+
1934
+ .. [2] "Wallenius' noncentral hypergeometric distribution", Wikipedia,
1935
+ https://en.wikipedia.org/wiki/Wallenius'_noncentral_hypergeometric_distribution
1936
+
1937
+ %(example)s
1938
+
1939
+ """
1940
+
1941
+ rvs_name = "rvs_wallenius"
1942
+ dist = _PyWalleniusNCHypergeometric
1943
+
1944
+
1945
+ nchypergeom_wallenius = nchypergeom_wallenius_gen(
1946
+ name='nchypergeom_wallenius',
1947
+ longname="A Wallenius' noncentral hypergeometric")
1948
+
1949
+
1950
+ # Collect names of classes and objects in this module.
1951
+ pairs = list(globals().copy().items())
1952
+ _distn_names, _distn_gen_names = get_distribution_names(pairs, rv_discrete)
1953
+
1954
+ __all__ = _distn_names + _distn_gen_names
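
Both docstrings above note that the noncentral hypergeometric distributions reduce to the ordinary hypergeometric distribution when the odds ratio is one. A minimal numerical check of that claim (editor's sketch, not part of the diff; the parameter values are arbitrary):

import numpy as np
from scipy.stats import hypergeom, nchypergeom_fisher, nchypergeom_wallenius

M, n, N = 20, 7, 12                                   # total objects, Type I objects, number drawn
x = np.arange(max(0, N - (M - n)), min(N, n) + 1)     # support [x_l, x_u]

pmf_hyper = hypergeom.pmf(x, M, n, N)
pmf_fisher = nchypergeom_fisher.pmf(x, M, n, N, 1)    # odds ratio = 1
pmf_wallenius = nchypergeom_wallenius.pmf(x, M, n, N, 1)

print(np.allclose(pmf_fisher, pmf_hyper))             # expected: True
print(np.allclose(pmf_wallenius, pmf_hyper))          # expected: True
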
.venv/Lib/site-packages/scipy/stats/_distn_infrastructure.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_distr_params.py ADDED
@@ -0,0 +1,288 @@
1
+ """
2
+ Sane parameters for stats.distributions.
3
+ """
4
+ import numpy as np
5
+
6
+ distcont = [
7
+ ['alpha', (3.5704770516650459,)],
8
+ ['anglit', ()],
9
+ ['arcsine', ()],
10
+ ['argus', (1.0,)],
11
+ ['beta', (2.3098496451481823, 0.62687954300963677)],
12
+ ['betaprime', (5, 6)],
13
+ ['bradford', (0.29891359763170633,)],
14
+ ['burr', (10.5, 4.3)],
15
+ ['burr12', (10, 4)],
16
+ ['cauchy', ()],
17
+ ['chi', (78,)],
18
+ ['chi2', (55,)],
19
+ ['cosine', ()],
20
+ ['crystalball', (2.0, 3.0)],
21
+ ['dgamma', (1.1023326088288166,)],
22
+ ['dweibull', (2.0685080649914673,)],
23
+ ['erlang', (10,)],
24
+ ['expon', ()],
25
+ ['exponnorm', (1.5,)],
26
+ ['exponpow', (2.697119160358469,)],
27
+ ['exponweib', (2.8923945291034436, 1.9505288745913174)],
28
+ ['f', (29, 18)],
29
+ ['fatiguelife', (29,)], # correction numargs = 1
30
+ ['fisk', (3.0857548622253179,)],
31
+ ['foldcauchy', (4.7164673455831894,)],
32
+ ['foldnorm', (1.9521253373555869,)],
33
+ ['gamma', (1.9932305483800778,)],
34
+ ['gausshyper', (13.763771604130699, 3.1189636648681431,
35
+ 2.5145980350183019, 5.1811649903971615)], # veryslow
36
+ ['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
37
+ ['genextreme', (-0.1,)],
38
+ ['gengamma', (4.4162385429431925, 3.1193091679242761)],
39
+ ['gengamma', (4.4162385429431925, -3.1193091679242761)],
40
+ ['genhalflogistic', (0.77274727809929322,)],
41
+ ['genhyperbolic', (0.5, 1.5, -0.5,)],
42
+ ['geninvgauss', (2.3, 1.5)],
43
+ ['genlogistic', (0.41192440799679475,)],
44
+ ['gennorm', (1.2988442399460265,)],
45
+ ['halfgennorm', (0.6748054997000371,)],
46
+ ['genpareto', (0.1,)], # use case with finite moments
47
+ ['gibrat', ()],
48
+ ['gompertz', (0.94743713075105251,)],
49
+ ['gumbel_l', ()],
50
+ ['gumbel_r', ()],
51
+ ['halfcauchy', ()],
52
+ ['halflogistic', ()],
53
+ ['halfnorm', ()],
54
+ ['hypsecant', ()],
55
+ ['invgamma', (4.0668996136993067,)],
56
+ ['invgauss', (0.14546264555347513,)],
57
+ ['invweibull', (10.58,)],
58
+ ['jf_skew_t', (8, 4)],
59
+ ['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
60
+ ['johnsonsu', (2.554395574161155, 2.2482281679651965)],
61
+ ['kappa4', (0.0, 0.0)],
62
+ ['kappa4', (-0.1, 0.1)],
63
+ ['kappa4', (0.0, 0.1)],
64
+ ['kappa4', (0.1, 0.0)],
65
+ ['kappa3', (1.0,)],
66
+ ['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956
67
+ ['kstwo', (10,)],
68
+ ['kstwobign', ()],
69
+ ['laplace', ()],
70
+ ['laplace_asymmetric', (2,)],
71
+ ['levy', ()],
72
+ ['levy_l', ()],
73
+ ['levy_stable', (1.8, -0.5)],
74
+ ['loggamma', (0.41411931826052117,)],
75
+ ['logistic', ()],
76
+ ['loglaplace', (3.2505926592051435,)],
77
+ ['lognorm', (0.95368226960575331,)],
78
+ ['loguniform', (0.01, 1.25)],
79
+ ['lomax', (1.8771398388773268,)],
80
+ ['maxwell', ()],
81
+ ['mielke', (10.4, 4.6)],
82
+ ['moyal', ()],
83
+ ['nakagami', (4.9673794866666237,)],
84
+ ['ncf', (27, 27, 0.41578441799226107)],
85
+ ['nct', (14, 0.24045031331198066)],
86
+ ['ncx2', (21, 1.0560465975116415)],
87
+ ['norm', ()],
88
+ ['norminvgauss', (1.25, 0.5)],
89
+ ['pareto', (2.621716532144454,)],
90
+ ['pearson3', (0.1,)],
91
+ ['pearson3', (-2,)],
92
+ ['powerlaw', (1.6591133289905851,)],
93
+ ['powerlaw', (0.6591133289905851,)],
94
+ ['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
95
+ ['powernorm', (4.4453652254590779,)],
96
+ ['rayleigh', ()],
97
+ ['rdist', (1.6,)],
98
+ ['recipinvgauss', (0.63004267809369119,)],
99
+ ['reciprocal', (0.01, 1.25)],
100
+ ['rel_breitwigner', (36.545206797050334, )],
101
+ ['rice', (0.7749725210111873,)],
102
+ ['semicircular', ()],
103
+ ['skewcauchy', (0.5,)],
104
+ ['skewnorm', (4.0,)],
105
+ ['studentized_range', (3.0, 10.0)],
106
+ ['t', (2.7433514990818093,)],
107
+ ['trapezoid', (0.2, 0.8)],
108
+ ['triang', (0.15785029824528218,)],
109
+ ['truncexpon', (4.6907725456810478,)],
110
+ ['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
111
+ ['truncnorm', (0.1, 2.)],
112
+ ['truncpareto', (1.8, 5.3)],
113
+ ['truncpareto', (2, 5)],
114
+ ['truncweibull_min', (2.5, 0.25, 1.75)],
115
+ ['tukeylambda', (3.1321477856738267,)],
116
+ ['uniform', ()],
117
+ ['vonmises', (3.9939042581071398,)],
118
+ ['vonmises_line', (3.9939042581071398,)],
119
+ ['wald', ()],
120
+ ['weibull_max', (2.8687961709100187,)],
121
+ ['weibull_min', (1.7866166930421596,)],
122
+ ['wrapcauchy', (0.031071279018614728,)]]
123
+
124
+
125
+ distdiscrete = [
126
+ ['bernoulli',(0.3,)],
127
+ ['betabinom', (5, 2.3, 0.63)],
128
+ ['betanbinom', (5, 9.3, 1)],
129
+ ['binom', (5, 0.4)],
130
+ ['boltzmann',(1.4, 19)],
131
+ ['dlaplace', (0.8,)], # 0.5
132
+ ['geom', (0.5,)],
133
+ ['hypergeom',(30, 12, 6)],
134
+ ['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921
135
+ ['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921
136
+ ['nchypergeom_fisher', (140, 80, 60, 0.5)],
137
+ ['nchypergeom_wallenius', (140, 80, 60, 0.5)],
138
+ ['logser', (0.6,)], # re-enabled, numpy ticket:921
139
+ ['nbinom', (0.4, 0.4)], # from tickets: 583
140
+ ['nbinom', (5, 0.5)],
141
+ ['planck', (0.51,)], # 4.1
142
+ ['poisson', (0.6,)],
143
+ ['randint', (7, 31)],
144
+ ['skellam', (15, 8)],
145
+ ['zipf', (6.6,)],
146
+ ['zipfian', (0.75, 15)],
147
+ ['zipfian', (1.25, 10)],
148
+ ['yulesimon', (11.0,)],
149
+ ['nhypergeom', (20, 7, 1)]
150
+ ]
151
+
152
+
153
+ invdistdiscrete = [
154
+ # In each of the following, at least one shape parameter is invalid
155
+ ['hypergeom', (3, 3, 4)],
156
+ ['nhypergeom', (5, 2, 8)],
157
+ ['nchypergeom_fisher', (3, 3, 4, 1)],
158
+ ['nchypergeom_wallenius', (3, 3, 4, 1)],
159
+ ['bernoulli', (1.5, )],
160
+ ['binom', (10, 1.5)],
161
+ ['betabinom', (10, -0.4, -0.5)],
162
+ ['betanbinom', (10, -0.4, -0.5)],
163
+ ['boltzmann', (-1, 4)],
164
+ ['dlaplace', (-0.5, )],
165
+ ['geom', (1.5, )],
166
+ ['logser', (1.5, )],
167
+ ['nbinom', (10, 1.5)],
168
+ ['planck', (-0.5, )],
169
+ ['poisson', (-0.5, )],
170
+ ['randint', (5, 2)],
171
+ ['skellam', (-5, -2)],
172
+ ['zipf', (-2, )],
173
+ ['yulesimon', (-2, )],
174
+ ['zipfian', (-0.75, 15)]
175
+ ]
176
+
177
+
178
+ invdistcont = [
179
+ # In each of the following, at least one shape parameter is invalid
180
+ ['alpha', (-1, )],
181
+ ['anglit', ()],
182
+ ['arcsine', ()],
183
+ ['argus', (-1, )],
184
+ ['beta', (-2, 2)],
185
+ ['betaprime', (-2, 2)],
186
+ ['bradford', (-1, )],
187
+ ['burr', (-1, 1)],
188
+ ['burr12', (-1, 1)],
189
+ ['cauchy', ()],
190
+ ['chi', (-1, )],
191
+ ['chi2', (-1, )],
192
+ ['cosine', ()],
193
+ ['crystalball', (-1, 2)],
194
+ ['dgamma', (-1, )],
195
+ ['dweibull', (-1, )],
196
+ ['erlang', (-1, )],
197
+ ['expon', ()],
198
+ ['exponnorm', (-1, )],
199
+ ['exponweib', (1, -1)],
200
+ ['exponpow', (-1, )],
201
+ ['f', (10, -10)],
202
+ ['fatiguelife', (-1, )],
203
+ ['fisk', (-1, )],
204
+ ['foldcauchy', (-1, )],
205
+ ['foldnorm', (-1, )],
206
+ ['genlogistic', (-1, )],
207
+ ['gennorm', (-1, )],
208
+ ['genpareto', (np.inf, )],
209
+ ['genexpon', (1, 2, -3)],
210
+ ['genextreme', (np.inf, )],
211
+ ['genhyperbolic', (0.5, -0.5, -1.5,)],
212
+ ['gausshyper', (1, 2, 3, -4)],
213
+ ['gamma', (-1, )],
214
+ ['gengamma', (-1, 0)],
215
+ ['genhalflogistic', (-1, )],
216
+ ['geninvgauss', (1, 0)],
217
+ ['gibrat', ()],
218
+ ['gompertz', (-1, )],
219
+ ['gumbel_r', ()],
220
+ ['gumbel_l', ()],
221
+ ['halfcauchy', ()],
222
+ ['halflogistic', ()],
223
+ ['halfnorm', ()],
224
+ ['halfgennorm', (-1, )],
225
+ ['hypsecant', ()],
226
+ ['invgamma', (-1, )],
227
+ ['invgauss', (-1, )],
228
+ ['invweibull', (-1, )],
229
+ ['jf_skew_t', (-1, 0)],
230
+ ['johnsonsb', (1, -2)],
231
+ ['johnsonsu', (1, -2)],
232
+ ['kappa4', (np.nan, 0)],
233
+ ['kappa3', (-1, )],
234
+ ['ksone', (-1, )],
235
+ ['kstwo', (-1, )],
236
+ ['kstwobign', ()],
237
+ ['laplace', ()],
238
+ ['laplace_asymmetric', (-1, )],
239
+ ['levy', ()],
240
+ ['levy_l', ()],
241
+ ['levy_stable', (-1, 1)],
242
+ ['logistic', ()],
243
+ ['loggamma', (-1, )],
244
+ ['loglaplace', (-1, )],
245
+ ['lognorm', (-1, )],
246
+ ['loguniform', (10, 5)],
247
+ ['lomax', (-1, )],
248
+ ['maxwell', ()],
249
+ ['mielke', (1, -2)],
250
+ ['moyal', ()],
251
+ ['nakagami', (-1, )],
252
+ ['ncx2', (-1, 2)],
253
+ ['ncf', (10, 20, -1)],
254
+ ['nct', (-1, 2)],
255
+ ['norm', ()],
256
+ ['norminvgauss', (5, -10)],
257
+ ['pareto', (-1, )],
258
+ ['pearson3', (np.nan, )],
259
+ ['powerlaw', (-1, )],
260
+ ['powerlognorm', (1, -2)],
261
+ ['powernorm', (-1, )],
262
+ ['rdist', (-1, )],
263
+ ['rayleigh', ()],
264
+ ['rice', (-1, )],
265
+ ['recipinvgauss', (-1, )],
266
+ ['semicircular', ()],
267
+ ['skewnorm', (np.inf, )],
268
+ ['studentized_range', (-1, 1)],
269
+ ['rel_breitwigner', (-2, )],
270
+ ['t', (-1, )],
271
+ ['trapezoid', (0, 2)],
272
+ ['triang', (2, )],
273
+ ['truncexpon', (-1, )],
274
+ ['truncnorm', (10, 5)],
275
+ ['truncpareto', (-1, 5)],
276
+ ['truncpareto', (1.8, .5)],
277
+ ['truncweibull_min', (-2.5, 0.25, 1.75)],
278
+ ['tukeylambda', (np.nan, )],
279
+ ['uniform', ()],
280
+ ['vonmises', (-1, )],
281
+ ['vonmises_line', (-1, )],
282
+ ['wald', ()],
283
+ ['weibull_min', (-1, )],
284
+ ['weibull_max', (-1, )],
285
+ ['wrapcauchy', (2, )],
286
+ ['reciprocal', (15, 10)],
287
+ ['skewcauchy', (2, )]
288
+ ]
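
The tables in `_distr_params.py` pair each distribution name with shape parameters that are known to be "sane". A small sketch (editor's illustration, not part of the diff) of how such a table can be consumed generically; note that `_distr_params` is a private module, so the import path is an implementation detail and may change:

from scipy import stats
from scipy.stats._distr_params import distcont   # private module; path may change

for name, shapes in distcont[:5]:                 # first few entries only
    frozen = getattr(stats, name)(*shapes)        # freeze with the listed shapes
    print(f"{name:10s} median = {frozen.ppf(0.5):.4f}")
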
.venv/Lib/site-packages/scipy/stats/_entropy.py ADDED
@@ -0,0 +1,423 @@
1
+ """
2
+ Created on Fri Apr 2 09:06:05 2021
3
+
4
+ @author: matth
5
+ """
6
+
7
+ from __future__ import annotations
8
+ import math
9
+ import numpy as np
10
+ from scipy import special
11
+ from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
12
+
13
+ __all__ = ['entropy', 'differential_entropy']
14
+
15
+
16
+ @_axis_nan_policy_factory(
17
+ lambda x: x,
18
+ n_samples=lambda kwgs: (
19
+ 2 if ("qk" in kwgs and kwgs["qk"] is not None)
20
+ else 1
21
+ ),
22
+ n_outputs=1, result_to_tuple=lambda x: (x,), paired=True,
23
+ too_small=-1 # entropy doesn't have too small inputs
24
+ )
25
+ def entropy(pk: np.typing.ArrayLike,
26
+ qk: np.typing.ArrayLike | None = None,
27
+ base: float | None = None,
28
+ axis: int = 0
29
+ ) -> np.number | np.ndarray:
30
+ """
31
+ Calculate the Shannon entropy/relative entropy of given distribution(s).
32
+
33
+ If only probabilities `pk` are given, the Shannon entropy is calculated as
34
+ ``H = -sum(pk * log(pk))``.
35
+
36
+ If `qk` is not None, then compute the relative entropy
37
+ ``D = sum(pk * log(pk / qk))``. This quantity is also known
38
+ as the Kullback-Leibler divergence.
39
+
40
+ This routine will normalize `pk` and `qk` if they don't sum to 1.
41
+
42
+ Parameters
43
+ ----------
44
+ pk : array_like
45
+ Defines the (discrete) distribution. Along each axis-slice of ``pk``,
46
+ element ``i`` is the (possibly unnormalized) probability of event
47
+ ``i``.
48
+ qk : array_like, optional
49
+ Sequence against which the relative entropy is computed. Should be in
50
+ the same format as `pk`.
51
+ base : float, optional
52
+ The logarithmic base to use, defaults to ``e`` (natural logarithm).
53
+ axis : int, optional
54
+ The axis along which the entropy is calculated. Default is 0.
55
+
56
+ Returns
57
+ -------
58
+ S : {float, array_like}
59
+ The calculated entropy.
60
+
61
+ Notes
62
+ -----
63
+ Informally, the Shannon entropy quantifies the expected uncertainty
64
+ inherent in the possible outcomes of a discrete random variable.
65
+ For example,
66
+ if messages consisting of sequences of symbols from a set are to be
67
+ encoded and transmitted over a noiseless channel, then the Shannon entropy
68
+ ``H(pk)`` gives a tight lower bound for the average number of units of
69
+ information needed per symbol if the symbols occur with frequencies
70
+ governed by the discrete distribution `pk` [1]_. The choice of base
71
+ determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
72
+
73
+ The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
74
+ number of units of information needed per symbol if the encoding is
75
+ optimized for the probability distribution `qk` instead of the true
76
+ distribution `pk`. Informally, the relative entropy quantifies the expected
77
+ excess in surprise experienced if one believes the true distribution is
78
+ `qk` when it is actually `pk`.
79
+
80
+ A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
81
+ equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
82
+ the formula ``CE = -sum(pk * log(qk))``. It gives the average
83
+ number of units of information needed per symbol if an encoding is
84
+ optimized for the probability distribution `qk` when the true distribution
85
+ is `pk`. It is not computed directly by `entropy`, but it can be computed
86
+ using two calls to the function (see Examples).
87
+
88
+ See [2]_ for more information.
89
+
90
+ References
91
+ ----------
92
+ .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
93
+ Bell System Technical Journal, 27: 379-423.
94
+ https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
95
+ .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
96
+ Theory (Wiley Series in Telecommunications and Signal Processing).
97
+ Wiley-Interscience, USA.
98
+
99
+
100
+ Examples
101
+ --------
102
+ The outcome of a fair coin is the most uncertain:
103
+
104
+ >>> import numpy as np
105
+ >>> from scipy.stats import entropy
106
+ >>> base = 2 # work in units of bits
107
+ >>> pk = np.array([1/2, 1/2]) # fair coin
108
+ >>> H = entropy(pk, base=base)
109
+ >>> H
110
+ 1.0
111
+ >>> H == -np.sum(pk * np.log(pk)) / np.log(base)
112
+ True
113
+
114
+ The outcome of a biased coin is less uncertain:
115
+
116
+ >>> qk = np.array([9/10, 1/10]) # biased coin
117
+ >>> entropy(qk, base=base)
118
+ 0.46899559358928117
119
+
120
+ The relative entropy between the fair coin and biased coin is calculated
121
+ as:
122
+
123
+ >>> D = entropy(pk, qk, base=base)
124
+ >>> D
125
+ 0.7369655941662062
126
+ >>> D == np.sum(pk * np.log(pk/qk)) / np.log(base)
127
+ True
128
+
129
+ The cross entropy can be calculated as the sum of the entropy and
130
+ relative entropy:
131
+
132
+ >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
133
+ >>> CE
134
+ 1.736965594166206
135
+ >>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
136
+ True
137
+
138
+ """
139
+ if base is not None and base <= 0:
140
+ raise ValueError("`base` must be a positive number or `None`.")
141
+
142
+ pk = np.asarray(pk)
143
+ with np.errstate(invalid='ignore'):
144
+ pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
145
+ if qk is None:
146
+ vec = special.entr(pk)
147
+ else:
148
+ qk = np.asarray(qk)
149
+ pk, qk = _broadcast_arrays((pk, qk), axis=None) # don't ignore any axes
150
+ sum_kwargs = dict(axis=axis, keepdims=True)
151
+ qk = 1.0*qk / np.sum(qk, **sum_kwargs) # type: ignore[operator, call-overload]
152
+ vec = special.rel_entr(pk, qk)
153
+ S = np.sum(vec, axis=axis)
154
+ if base is not None:
155
+ S /= np.log(base)
156
+ return S
157
+
158
+
159
+ def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
160
+ values = samples[0]
161
+ n = values.shape[axis]
162
+ window_length = kwargs.get("window_length",
163
+ math.floor(math.sqrt(n) + 0.5))
164
+ if not 2 <= 2 * window_length < n:
165
+ return True
166
+ return False
167
+
168
+
169
+ @_axis_nan_policy_factory(
170
+ lambda x: x, n_outputs=1, result_to_tuple=lambda x: (x,),
171
+ too_small=_differential_entropy_is_too_small
172
+ )
173
+ def differential_entropy(
174
+ values: np.typing.ArrayLike,
175
+ *,
176
+ window_length: int | None = None,
177
+ base: float | None = None,
178
+ axis: int = 0,
179
+ method: str = "auto",
180
+ ) -> np.number | np.ndarray:
181
+ r"""Given a sample of a distribution, estimate the differential entropy.
182
+
183
+ Several estimation methods are available using the `method` parameter. By
184
+ default, a method is selected based on the size of the sample.
185
+
186
+ Parameters
187
+ ----------
188
+ values : sequence
189
+ Sample from a continuous distribution.
190
+ window_length : int, optional
191
+ Window length for computing Vasicek estimate. Must be an integer
192
+ between 1 and half of the sample size. If ``None`` (the default), it
193
+ uses the heuristic value
194
+
195
+ .. math::
196
+ \left \lfloor \sqrt{n} + 0.5 \right \rfloor
197
+
198
+ where :math:`n` is the sample size. This heuristic was originally
199
+ proposed in [2]_ and has become common in the literature.
200
+ base : float, optional
201
+ The logarithmic base to use, defaults to ``e`` (natural logarithm).
202
+ axis : int, optional
203
+ The axis along which the differential entropy is calculated.
204
+ Default is 0.
205
+ method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
206
+ The method used to estimate the differential entropy from the sample.
207
+ Default is ``'auto'``. See Notes for more information.
208
+
209
+ Returns
210
+ -------
211
+ entropy : float
212
+ The calculated differential entropy.
213
+
214
+ Notes
215
+ -----
216
+ This function will converge to the true differential entropy in the limit
217
+
218
+ .. math::
219
+ n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
220
+
221
+ The optimal choice of ``window_length`` for a given sample size depends on
222
+ the (unknown) distribution. Typically, the smoother the density of the
223
+ distribution, the larger the optimal value of ``window_length`` [1]_.
224
+
225
+ The following options are available for the `method` parameter.
226
+
227
+ * ``'vasicek'`` uses the estimator presented in [1]_. This is
228
+ one of the first and most influential estimators of differential entropy.
229
+ * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
230
+ is not only consistent but, under some conditions, asymptotically normal.
231
+ * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
232
+ in simulation to have smaller bias and mean squared error than
233
+ the Vasicek estimator.
234
+ * ``'correa'`` uses the estimator presented in [5]_ based on local linear
235
+ regression. In a simulation study, it had consistently smaller mean
236
+ square error than the Vasiceck estimator, but it is more expensive to
237
+ compute.
238
+ * ``'auto'`` selects the method automatically (default). Currently,
239
+ this selects ``'van es'`` for very small samples (<10), ``'ebrahimi'``
240
+ for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
241
+ samples, but this behavior is subject to change in future versions.
242
+
243
+ All estimators are implemented as described in [6]_.
244
+
245
+ References
246
+ ----------
247
+ .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
248
+ Journal of the Royal Statistical Society:
249
+ Series B (Methodological), 38(1), 54-59.
250
+ .. [2] Crzcgorzewski, P., & Wirczorkowski, R. (1999). Entropy-based
251
+ goodness-of-fit test for exponentiality. Communications in
252
+ Statistics-Theory and Methods, 28(5), 1183-1202.
253
+ .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
254
+ class of statistics based on spacings. Scandinavian Journal of
255
+ Statistics, 61-72.
256
+ .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
257
+ of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
258
+ .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
259
+ in Statistics-Theory and Methods, 24(10), 2439-2449.
260
+ .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
261
+ Annals of Data Science, 2(2), 231-241.
262
+ https://link.springer.com/article/10.1007/s40745-015-0045-9
263
+
264
+ Examples
265
+ --------
266
+ >>> import numpy as np
267
+ >>> from scipy.stats import differential_entropy, norm
268
+
269
+ Entropy of a standard normal distribution:
270
+
271
+ >>> rng = np.random.default_rng()
272
+ >>> values = rng.standard_normal(100)
273
+ >>> differential_entropy(values)
274
+ 1.3407817436640392
275
+
276
+ Compare with the true entropy:
277
+
278
+ >>> float(norm.entropy())
279
+ 1.4189385332046727
280
+
281
+ For several sample sizes between 5 and 1000, compare the accuracy of
282
+ the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
283
+ compare the root mean squared error (over 1000 trials) between the estimate
284
+ and the true differential entropy of the distribution.
285
+
286
+ >>> from scipy import stats
287
+ >>> import matplotlib.pyplot as plt
288
+ >>>
289
+ >>>
290
+ >>> def rmse(res, expected):
291
+ ... '''Root mean squared error'''
292
+ ... return np.sqrt(np.mean((res - expected)**2))
293
+ >>>
294
+ >>>
295
+ >>> a, b = np.log10(5), np.log10(1000)
296
+ >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
297
+ >>> reps = 1000 # number of repetitions for each sample size
298
+ >>> expected = stats.expon.entropy()
299
+ >>>
300
+ >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
301
+ >>> for method in method_errors:
302
+ ... for n in ns:
303
+ ... rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
304
+ ... res = stats.differential_entropy(rvs, method=method, axis=-1)
305
+ ... error = rmse(res, expected)
306
+ ... method_errors[method].append(error)
307
+ >>>
308
+ >>> for method, errors in method_errors.items():
309
+ ... plt.loglog(ns, errors, label=method)
310
+ >>>
311
+ >>> plt.legend()
312
+ >>> plt.xlabel('sample size')
313
+ >>> plt.ylabel('RMSE (1000 trials)')
314
+ >>> plt.title('Entropy Estimator Error (Exponential Distribution)')
315
+
316
+ """
317
+ values = np.asarray(values)
318
+ values = np.moveaxis(values, axis, -1)
319
+ n = values.shape[-1] # number of observations
320
+
321
+ if window_length is None:
322
+ window_length = math.floor(math.sqrt(n) + 0.5)
323
+
324
+ if not 2 <= 2 * window_length < n:
325
+ raise ValueError(
326
+ f"Window length ({window_length}) must be positive and less "
327
+ f"than half the sample size ({n}).",
328
+ )
329
+
330
+ if base is not None and base <= 0:
331
+ raise ValueError("`base` must be a positive number or `None`.")
332
+
333
+ sorted_data = np.sort(values, axis=-1)
334
+
335
+ methods = {"vasicek": _vasicek_entropy,
336
+ "van es": _van_es_entropy,
337
+ "correa": _correa_entropy,
338
+ "ebrahimi": _ebrahimi_entropy,
339
+ "auto": _vasicek_entropy}
340
+ method = method.lower()
341
+ if method not in methods:
342
+ message = f"`method` must be one of {set(methods)}"
343
+ raise ValueError(message)
344
+
345
+ if method == "auto":
346
+ if n <= 10:
347
+ method = 'van es'
348
+ elif n <= 1000:
349
+ method = 'ebrahimi'
350
+ else:
351
+ method = 'vasicek'
352
+
353
+ res = methods[method](sorted_data, window_length)
354
+
355
+ if base is not None:
356
+ res /= np.log(base)
357
+
358
+ return res
359
+
360
+
361
+ def _pad_along_last_axis(X, m):
362
+ """Pad the data for computing the rolling window difference."""
363
+ # scales a bit better than method in _vasicek_like_entropy
364
+ shape = np.array(X.shape)
365
+ shape[-1] = m
366
+ Xl = np.broadcast_to(X[..., [0]], shape) # [0] vs 0 to maintain shape
367
+ Xr = np.broadcast_to(X[..., [-1]], shape)
368
+ return np.concatenate((Xl, X, Xr), axis=-1)
369
+
370
+
371
+ def _vasicek_entropy(X, m):
372
+ """Compute the Vasicek estimator as described in [6] Eq. 1.3."""
373
+ n = X.shape[-1]
374
+ X = _pad_along_last_axis(X, m)
375
+ differences = X[..., 2 * m:] - X[..., : -2 * m:]
376
+ logs = np.log(n/(2*m) * differences)
377
+ return np.mean(logs, axis=-1)
378
+
379
+
380
+ def _van_es_entropy(X, m):
381
+ """Compute the van Es estimator as described in [6]."""
382
+ # No equation number, but referred to as HVE_mn.
383
+ # Typo: there should be a log within the summation.
384
+ n = X.shape[-1]
385
+ difference = X[..., m:] - X[..., :-m]
386
+ term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
387
+ k = np.arange(m, n+1)
388
+ return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)
389
+
390
+
391
+ def _ebrahimi_entropy(X, m):
392
+ """Compute the Ebrahimi estimator as described in [6]."""
393
+ # No equation number, but referred to as HE_mn
394
+ n = X.shape[-1]
395
+ X = _pad_along_last_axis(X, m)
396
+
397
+ differences = X[..., 2 * m:] - X[..., : -2 * m:]
398
+
399
+ i = np.arange(1, n+1).astype(float)
400
+ ci = np.ones_like(i)*2
401
+ ci[i <= m] = 1 + (i[i <= m] - 1)/m
402
+ ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m
403
+
404
+ logs = np.log(n * differences / (ci * m))
405
+ return np.mean(logs, axis=-1)
406
+
407
+
408
+ def _correa_entropy(X, m):
409
+ """Compute the Correa estimator as described in [6]."""
410
+ # No equation number, but referred to as HC_mn
411
+ n = X.shape[-1]
412
+ X = _pad_along_last_axis(X, m)
413
+
414
+ i = np.arange(1, n+1)
415
+ dj = np.arange(-m, m+1)[:, None]
416
+ j = i + dj
417
+ j0 = j + m - 1 # 0-indexed version of j
418
+
419
+ Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
420
+ difference = X[..., j0] - Xibar
421
+ num = np.sum(difference*dj, axis=-2) # dj is d-i
422
+ den = n*np.sum(difference**2, axis=-2)
423
+ return -np.mean(np.log(num/den), axis=-1)
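
The Vasicek estimator coded above is simple enough to reproduce directly from its definition (edge-padded spacings of the order statistics). A minimal cross-check (editor's sketch, not part of the diff) against `differential_entropy` with `method='vasicek'`:

import math
import numpy as np
from scipy.stats import differential_entropy

rng = np.random.default_rng(12345)
x = rng.standard_normal(1000)
n = x.size
m = math.floor(math.sqrt(n) + 0.5)            # default window-length heuristic

xs = np.sort(x)
padded = np.concatenate((np.repeat(xs[0], m), xs, np.repeat(xs[-1], m)))
spacings = padded[2 * m:] - padded[:-2 * m]   # X_(i+m) - X_(i-m), edge-padded
manual = np.mean(np.log(n / (2 * m) * spacings))

print(manual)
print(differential_entropy(x, method='vasicek'))   # should agree to rounding error
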
.venv/Lib/site-packages/scipy/stats/_fit.py ADDED
@@ -0,0 +1,1351 @@
1
+ import warnings
2
+ from collections import namedtuple
3
+ import numpy as np
4
+ from scipy import optimize, stats
5
+ from scipy._lib._util import check_random_state
6
+
7
+
8
+ def _combine_bounds(name, user_bounds, shape_domain, integral):
9
+ """Intersection of user-defined bounds and distribution PDF/PMF domain"""
10
+
11
+ user_bounds = np.atleast_1d(user_bounds)
12
+
13
+ if user_bounds[0] > user_bounds[1]:
14
+ message = (f"There are no values for `{name}` on the interval "
15
+ f"{list(user_bounds)}.")
16
+ raise ValueError(message)
17
+
18
+ bounds = (max(user_bounds[0], shape_domain[0]),
19
+ min(user_bounds[1], shape_domain[1]))
20
+
21
+ if integral and (np.ceil(bounds[0]) > np.floor(bounds[1])):
22
+ message = (f"There are no integer values for `{name}` on the interval "
23
+ f"defined by the user-provided bounds and the domain "
24
+ "of the distribution.")
25
+ raise ValueError(message)
26
+ elif not integral and (bounds[0] > bounds[1]):
27
+ message = (f"There are no values for `{name}` on the interval "
28
+ f"defined by the user-provided bounds and the domain "
29
+ "of the distribution.")
30
+ raise ValueError(message)
31
+
32
+ if not np.all(np.isfinite(bounds)):
33
+ message = (f"The intersection of user-provided bounds for `{name}` "
34
+ f"and the domain of the distribution is not finite. Please "
35
+ f"provide finite bounds for shape `{name}` in `bounds`.")
36
+ raise ValueError(message)
37
+
38
+ return bounds
39
+
40
+
41
+ class FitResult:
42
+ r"""Result of fitting a discrete or continuous distribution to data
43
+
44
+ Attributes
45
+ ----------
46
+ params : namedtuple
47
+ A namedtuple containing the maximum likelihood estimates of the
48
+ shape parameters, location, and (if applicable) scale of the
49
+ distribution.
50
+ success : bool or None
51
+ Whether the optimizer considered the optimization to terminate
52
+ successfully or not.
53
+ message : str or None
54
+ Any status message provided by the optimizer.
55
+
56
+ """
57
+
58
+ def __init__(self, dist, data, discrete, res):
59
+ self._dist = dist
60
+ self._data = data
61
+ self.discrete = discrete
62
+ self.pxf = getattr(dist, "pmf", None) or getattr(dist, "pdf", None)
63
+
64
+ shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
65
+ if not discrete:
66
+ FitParams = namedtuple('FitParams', shape_names + ['loc', 'scale'])
67
+ else:
68
+ FitParams = namedtuple('FitParams', shape_names + ['loc'])
69
+
70
+ self.params = FitParams(*res.x)
71
+
72
+ # Optimizer can report success even when nllf is infinite
73
+ if res.success and not np.isfinite(self.nllf()):
74
+ res.success = False
75
+ res.message = ("Optimization converged to parameter values that "
76
+ "are inconsistent with the data.")
77
+ self.success = getattr(res, "success", None)
78
+ self.message = getattr(res, "message", None)
79
+
80
+ def __repr__(self):
81
+ keys = ["params", "success", "message"]
82
+ m = max(map(len, keys)) + 1
83
+ return '\n'.join([key.rjust(m) + ': ' + repr(getattr(self, key))
84
+ for key in keys if getattr(self, key) is not None])
85
+
86
+ def nllf(self, params=None, data=None):
87
+ """Negative log-likelihood function
88
+
89
+ Evaluates the negative of the log-likelihood function of the provided
90
+ data at the provided parameters.
91
+
92
+ Parameters
93
+ ----------
94
+ params : tuple, optional
95
+ The shape parameters, location, and (if applicable) scale of the
96
+ distribution as a single tuple. Default is the maximum likelihood
97
+ estimates (``self.params``).
98
+ data : array_like, optional
99
+ The data for which the log-likelihood function is to be evaluated.
100
+ Default is the data to which the distribution was fit.
101
+
102
+ Returns
103
+ -------
104
+ nllf : float
105
+ The negative of the log-likelihood function.
106
+
107
+ """
108
+ params = params if params is not None else self.params
109
+ data = data if data is not None else self._data
110
+ return self._dist.nnlf(theta=params, x=data)
111
+
112
+ def plot(self, ax=None, *, plot_type="hist"):
113
+ """Visually compare the data against the fitted distribution.
114
+
115
+ Available only if `matplotlib` is installed.
116
+
117
+ Parameters
118
+ ----------
119
+ ax : `matplotlib.axes.Axes`
120
+ Axes object to draw the plot onto, otherwise uses the current Axes.
121
+ plot_type : {"hist", "qq", "pp", "cdf"}
122
+ Type of plot to draw. Options include:
123
+
124
+ - "hist": Superposes the PDF/PMF of the fitted distribution
125
+ over a normalized histogram of the data.
126
+ - "qq": Scatter plot of theoretical quantiles against the
127
+ empirical quantiles. Specifically, the x-coordinates are the
128
+ values of the fitted distribution PPF evaluated at the
129
+ percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is the
130
+ number of data points, and the y-coordinates are the sorted
131
+ data points.
132
+ - "pp": Scatter plot of theoretical percentiles against the
133
+ observed percentiles. Specifically, the x-coordinates are the
134
+ percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is
135
+ the number of data points, and the y-coordinates are the values
136
+ of the fitted distribution CDF evaluated at the sorted
137
+ data points.
138
+ - "cdf": Superposes the CDF of the fitted distribution over the
139
+ empirical CDF. Specifically, the x-coordinates of the empirical
140
+ CDF are the sorted data points, and the y-coordinates are the
141
+ percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is
142
+ the number of data points.
143
+
144
+ Returns
145
+ -------
146
+ ax : `matplotlib.axes.Axes`
147
+ The matplotlib Axes object on which the plot was drawn.
148
+
149
+ Examples
150
+ --------
151
+ >>> import numpy as np
152
+ >>> from scipy import stats
153
+ >>> import matplotlib.pyplot as plt # matplotlib must be installed
154
+ >>> rng = np.random.default_rng()
155
+ >>> data = stats.nbinom(5, 0.5).rvs(size=1000, random_state=rng)
156
+ >>> bounds = [(0, 30), (0, 1)]
157
+ >>> res = stats.fit(stats.nbinom, data, bounds)
158
+ >>> ax = res.plot() # save matplotlib Axes object
159
+
160
+ The `matplotlib.axes.Axes` object can be used to customize the plot.
161
+ See `matplotlib.axes.Axes` documentation for details.
162
+
163
+ >>> ax.set_xlabel('number of trials') # customize axis label
164
+ >>> ax.get_children()[0].set_linewidth(5) # customize line widths
165
+ >>> ax.legend()
166
+ >>> plt.show()
167
+ """
168
+ try:
169
+ import matplotlib # noqa: F401
170
+ except ModuleNotFoundError as exc:
171
+ message = "matplotlib must be installed to use method `plot`."
172
+ raise ModuleNotFoundError(message) from exc
173
+
174
+ plots = {'histogram': self._hist_plot, 'qq': self._qq_plot,
175
+ 'pp': self._pp_plot, 'cdf': self._cdf_plot,
176
+ 'hist': self._hist_plot}
177
+ if plot_type.lower() not in plots:
178
+ message = f"`plot_type` must be one of {set(plots.keys())}"
179
+ raise ValueError(message)
180
+ plot = plots[plot_type.lower()]
181
+
182
+ if ax is None:
183
+ import matplotlib.pyplot as plt
184
+ ax = plt.gca()
185
+
186
+ fit_params = np.atleast_1d(self.params)
187
+
188
+ return plot(ax=ax, fit_params=fit_params)
189
+
190
+ def _hist_plot(self, ax, fit_params):
191
+ from matplotlib.ticker import MaxNLocator
192
+
193
+ support = self._dist.support(*fit_params)
194
+ lb = support[0] if np.isfinite(support[0]) else min(self._data)
195
+ ub = support[1] if np.isfinite(support[1]) else max(self._data)
196
+ pxf = "PMF" if self.discrete else "PDF"
197
+
198
+ if self.discrete:
199
+ x = np.arange(lb, ub + 2)
200
+ y = self.pxf(x, *fit_params)
201
+ ax.vlines(x[:-1], 0, y[:-1], label='Fitted Distribution PMF',
202
+ color='C0')
203
+ options = dict(density=True, bins=x, align='left', color='C1')
204
+ ax.xaxis.set_major_locator(MaxNLocator(integer=True))
205
+ ax.set_xlabel('k')
206
+ ax.set_ylabel('PMF')
207
+ else:
208
+ x = np.linspace(lb, ub, 200)
209
+ y = self.pxf(x, *fit_params)
210
+ ax.plot(x, y, '--', label='Fitted Distribution PDF', color='C0')
211
+ options = dict(density=True, bins=50, align='mid', color='C1')
212
+ ax.set_xlabel('x')
213
+ ax.set_ylabel('PDF')
214
+
215
+ if len(self._data) > 50 or self.discrete:
216
+ ax.hist(self._data, label="Histogram of Data", **options)
217
+ else:
218
+ ax.plot(self._data, np.zeros_like(self._data), "*",
219
+ label='Data', color='C1')
220
+
221
+ ax.set_title(rf"Fitted $\tt {self._dist.name}$ {pxf} and Histogram")
222
+ ax.legend(*ax.get_legend_handles_labels())
223
+ return ax
224
+
225
+ def _qp_plot(self, ax, fit_params, qq):
226
+ data = np.sort(self._data)
227
+ ps = self._plotting_positions(len(self._data))
228
+
229
+ if qq:
230
+ qp = "Quantiles"
231
+ plot_type = 'Q-Q'
232
+ x = self._dist.ppf(ps, *fit_params)
233
+ y = data
234
+ else:
235
+ qp = "Percentiles"
236
+ plot_type = 'P-P'
237
+ x = ps
238
+ y = self._dist.cdf(data, *fit_params)
239
+
240
+ ax.plot(x, y, '.', label=f'Fitted Distribution {plot_type}',
241
+ color='C0', zorder=1)
242
+ xlim = ax.get_xlim()
243
+ ylim = ax.get_ylim()
244
+ lim = [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])]
245
+ if not qq:
246
+ lim = max(lim[0], 0), min(lim[1], 1)
247
+
248
+ if self.discrete and qq:
249
+ q_min, q_max = int(lim[0]), int(lim[1]+1)
250
+ q_ideal = np.arange(q_min, q_max)
251
+ # q_ideal = np.unique(self._dist.ppf(ps, *fit_params))
252
+ ax.plot(q_ideal, q_ideal, 'o', label='Reference', color='k',
253
+ alpha=0.25, markerfacecolor='none', clip_on=True)
254
+ elif self.discrete and not qq:
255
+ # The intent of this is to match the plot that would be produced
256
+ # if x were continuous on [0, 1] and y were cdf(ppf(x)).
257
+ # It can be approximated by letting x = np.linspace(0, 1, 1000),
258
+ # but this might not look great when zooming in. The vertical
259
+ # portions are included to indicate where the transition occurs
260
+ # where the data completely obscures the horizontal portions.
261
+ p_min, p_max = lim
262
+ a, b = self._dist.support(*fit_params)
263
+ p_min = max(p_min, 0 if np.isfinite(a) else 1e-3)
264
+ p_max = min(p_max, 1 if np.isfinite(b) else 1-1e-3)
265
+ q_min, q_max = self._dist.ppf([p_min, p_max], *fit_params)
266
+ qs = np.arange(q_min-1, q_max+1)
267
+ ps = self._dist.cdf(qs, *fit_params)
268
+ ax.step(ps, ps, '-', label='Reference', color='k', alpha=0.25,
269
+ clip_on=True)
270
+ else:
271
+ ax.plot(lim, lim, '-', label='Reference', color='k', alpha=0.25,
272
+ clip_on=True)
273
+
274
+ ax.set_xlim(lim)
275
+ ax.set_ylim(lim)
276
+ ax.set_xlabel(rf"Fitted $\tt {self._dist.name}$ Theoretical {qp}")
277
+ ax.set_ylabel(f"Data {qp}")
278
+ ax.set_title(rf"Fitted $\tt {self._dist.name}$ {plot_type} Plot")
279
+ ax.legend(*ax.get_legend_handles_labels())
280
+ ax.set_aspect('equal')
281
+ return ax
282
+
283
+ def _qq_plot(self, **kwargs):
284
+ return self._qp_plot(qq=True, **kwargs)
285
+
286
+ def _pp_plot(self, **kwargs):
287
+ return self._qp_plot(qq=False, **kwargs)
288
+
289
+ def _plotting_positions(self, n, a=.5):
290
+ # See https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Plotting_positions
291
+ k = np.arange(1, n+1)
292
+ return (k-a) / (n + 1 - 2*a)
293
+
294
+ def _cdf_plot(self, ax, fit_params):
295
+ data = np.sort(self._data)
296
+ ecdf = self._plotting_positions(len(self._data))
297
+ ls = '--' if len(np.unique(data)) < 30 else '.'
298
+ xlabel = 'k' if self.discrete else 'x'
299
+ ax.step(data, ecdf, ls, label='Empirical CDF', color='C1', zorder=0)
300
+
301
+ xlim = ax.get_xlim()
302
+ q = np.linspace(*xlim, 300)
303
+ tcdf = self._dist.cdf(q, *fit_params)
304
+
305
+ ax.plot(q, tcdf, label='Fitted Distribution CDF', color='C0', zorder=1)
306
+ ax.set_xlim(xlim)
307
+ ax.set_ylim(0, 1)
308
+ ax.set_xlabel(xlabel)
309
+ ax.set_ylabel("CDF")
310
+ ax.set_title(rf"Fitted $\tt {self._dist.name}$ and Empirical CDF")
311
+ handles, labels = ax.get_legend_handles_labels()
312
+ ax.legend(handles[::-1], labels[::-1])
313
+ return ax
314
+
315
+
316
+ def fit(dist, data, bounds=None, *, guess=None, method='mle',
317
+ optimizer=optimize.differential_evolution):
318
+ r"""Fit a discrete or continuous distribution to data
319
+
320
+ Given a distribution, data, and bounds on the parameters of the
321
+ distribution, return maximum likelihood estimates of the parameters.
322
+
323
+ Parameters
324
+ ----------
325
+ dist : `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`
326
+ The object representing the distribution to be fit to the data.
327
+ data : 1D array_like
328
+ The data to which the distribution is to be fit. If the data contain
329
+ any of ``np.nan``, ``np.inf``, or -``np.inf``, the fit method will
330
+ raise a ``ValueError``.
331
+ bounds : dict or sequence of tuples, optional
332
+ If a dictionary, each key is the name of a parameter of the
333
+ distribution, and the corresponding value is a tuple containing the
334
+ lower and upper bound on that parameter. If the distribution is
335
+ defined only for a finite range of values of that parameter, no entry
336
+ for that parameter is required; e.g., some distributions have
337
+ parameters which must be on the interval [0, 1]. Bounds for parameters
338
+ location (``loc``) and scale (``scale``) are optional; by default,
339
+ they are fixed to 0 and 1, respectively.
340
+
341
+ If a sequence, element *i* is a tuple containing the lower and upper
342
+ bound on the *i*\ th parameter of the distribution. In this case,
343
+ bounds for *all* distribution shape parameters must be provided.
344
+ Optionally, bounds for location and scale may follow the
345
+ distribution shape parameters.
346
+
347
+ If a shape is to be held fixed (e.g. if it is known), the
348
+ lower and upper bounds may be equal. If a user-provided lower or upper
349
+ bound is beyond a bound of the domain for which the distribution is
350
+ defined, the bound of the distribution's domain will replace the
351
+ user-provided value. Similarly, parameters which must be integral
352
+ will be constrained to integral values within the user-provided bounds.
353
+ guess : dict or array_like, optional
354
+ If a dictionary, each key is the name of a parameter of the
355
+ distribution, and the corresponding value is a guess for the value
356
+ of the parameter.
357
+
358
+ If a sequence, element *i* is a guess for the *i*\ th parameter of the
359
+ distribution. In this case, guesses for *all* distribution shape
360
+ parameters must be provided.
361
+
362
+ If `guess` is not provided, guesses for the decision variables will
363
+ not be passed to the optimizer. If `guess` is provided, guesses for
364
+ any missing parameters will be set at the mean of the lower and
365
+ upper bounds. Guesses for parameters which must be integral will be
366
+ rounded to integral values, and guesses that lie outside the
367
+ intersection of the user-provided bounds and the domain of the
368
+ distribution will be clipped.
369
+ method : {'mle', 'mse'}
370
+ With ``method="mle"`` (default), the fit is computed by minimizing
371
+ the negative log-likelihood function. A large, finite penalty
372
+ (rather than infinite negative log-likelihood) is applied for
373
+ observations beyond the support of the distribution.
374
+ With ``method="mse"``, the fit is computed by minimizing
375
+ the negative log-product spacing function. The same penalty is applied
376
+ for observations beyond the support. We follow the approach of [1]_,
377
+ which is generalized for samples with repeated observations.
378
+ optimizer : callable, optional
379
+ `optimizer` is a callable that accepts the following positional
380
+ argument.
381
+
382
+ fun : callable
383
+ The objective function to be optimized. `fun` accepts one argument
384
+ ``x``, candidate shape parameters of the distribution, and returns
385
+ the objective function value given ``x``, `dist`, and the provided
386
+ `data`.
387
+ The job of `optimizer` is to find values of the decision variables
388
+ that minimize `fun`.
389
+
390
+ `optimizer` must also accept the following keyword argument.
391
+
392
+ bounds : sequence of tuples
393
+ The bounds on values of the decision variables; each element will
394
+ be a tuple containing the lower and upper bound on a decision
395
+ variable.
396
+
397
+ If `guess` is provided, `optimizer` must also accept the following
398
+ keyword argument.
399
+
400
+ x0 : array_like
401
+ The guesses for each decision variable.
402
+
403
+ If the distribution has any shape parameters that must be integral or
404
+ if the distribution is discrete and the location parameter is not
405
+ fixed, `optimizer` must also accept the following keyword argument.
406
+
407
+ integrality : array_like of bools
408
+ For each decision variable, True if the decision variable
409
+ must be constrained to integer values and False if the decision
410
+ variable is continuous.
411
+
412
+ `optimizer` must return an object, such as an instance of
413
+ `scipy.optimize.OptimizeResult`, which holds the optimal values of
414
+ the decision variables in an attribute ``x``. If attributes
415
+ ``fun``, ``status``, or ``message`` are provided, they will be
416
+ included in the result object returned by `fit`.
417
+
418
+ Returns
419
+ -------
420
+ result : `~scipy.stats._result_classes.FitResult`
421
+ An object with the following fields.
422
+
423
+ params : namedtuple
424
+ A namedtuple containing the maximum likelihood estimates of the
425
+ shape parameters, location, and (if applicable) scale of the
426
+ distribution.
427
+ success : bool or None
428
+ Whether the optimizer considered the optimization to terminate
429
+ successfully or not.
430
+ message : str or None
431
+ Any status message provided by the optimizer.
432
+
433
+ The object has the following method:
434
+
435
+ nllf(params=None, data=None)
436
+ By default, the negative log-likelihood function at the fitted
437
+ `params` for the given `data`. Accepts a tuple containing
438
+ alternative shapes, location, and scale of the distribution and
439
+ an array of alternative data.
440
+
441
+ plot(ax=None)
442
+ Superposes the PDF/PMF of the fitted distribution over a normalized
443
+ histogram of the data.
444
+
445
+ See Also
446
+ --------
447
+ rv_continuous, rv_discrete
448
+
449
+ Notes
450
+ -----
451
+ Optimization is more likely to converge to the maximum likelihood estimate
452
+ when the user provides tight bounds containing the maximum likelihood
453
+ estimate. For example, when fitting a binomial distribution to data, the
454
+ number of experiments underlying each sample may be known, in which case
455
+ the corresponding shape parameter ``n`` can be fixed.
456
+
457
+ References
458
+ ----------
459
+ .. [1] Shao, Yongzhao, and Marjorie G. Hahn. "Maximum product of spacings
460
+ method: a unified formulation with illustration of strong
461
+ consistency." Illinois Journal of Mathematics 43.3 (1999): 489-499.
462
+
463
+ Examples
464
+ --------
465
+ Suppose we wish to fit a distribution to the following data.
466
+
467
+ >>> import numpy as np
468
+ >>> from scipy import stats
469
+ >>> rng = np.random.default_rng()
470
+ >>> dist = stats.nbinom
471
+ >>> shapes = (5, 0.5)
472
+ >>> data = dist.rvs(*shapes, size=1000, random_state=rng)
473
+
474
+ Suppose we do not know how the data were generated, but we suspect that
475
+ they follow a negative binomial distribution with parameters *n* and *p*\.
476
+ (See `scipy.stats.nbinom`.) We believe that the parameter *n* was less
477
+ than 30, and we know that the parameter *p* must lie on the interval
478
+ [0, 1]. We record this information in a variable `bounds` and pass
479
+ this information to `fit`.
480
+
481
+ >>> bounds = [(0, 30), (0, 1)]
482
+ >>> res = stats.fit(dist, data, bounds)
483
+
484
+ `fit` searches within the user-specified `bounds` for the
485
+ values that best match the data (in the sense of maximum likelihood
486
+ estimation). In this case, it found shape values similar to those
487
+ from which the data were actually generated.
488
+
489
+ >>> res.params
490
+ FitParams(n=5.0, p=0.5028157644634368, loc=0.0) # may vary
491
+
492
+ We can visualize the results by superposing the probability mass function
493
+ of the distribution (with the shapes fit to the data) over a normalized
494
+ histogram of the data.
495
+
496
+ >>> import matplotlib.pyplot as plt # matplotlib must be installed to plot
497
+ >>> res.plot()
498
+ >>> plt.show()
499
+
500
+ Note that the estimate for *n* was exactly integral; this is because
501
+ the domain of the `nbinom` PMF includes only integral *n*, and the `nbinom`
502
+ object "knows" that. `nbinom` also knows that the shape *p* must be a
503
+ value between 0 and 1. In such a case - when the domain of the distribution
504
+ with respect to a parameter is finite - we are not required to specify
505
+ bounds for the parameter.
506
+
507
+ >>> bounds = {'n': (0, 30)} # omit parameter p using a `dict`
508
+ >>> res2 = stats.fit(dist, data, bounds)
509
+ >>> res2.params
510
+ FitParams(n=5.0, p=0.5016492009232932, loc=0.0) # may vary
511
+
512
+ If we wish to force the distribution to be fit with *n* fixed at 6, we can
513
+ set both the lower and upper bounds on *n* to 6. Note, however, that the
514
+ value of the objective function being optimized is typically worse (higher)
515
+ in this case.
516
+
517
+ >>> bounds = {'n': (6, 6)} # fix parameter `n`
518
+ >>> res3 = stats.fit(dist, data, bounds)
519
+ >>> res3.params
520
+ FitParams(n=6.0, p=0.5486556076755706, loc=0.0) # may vary
521
+ >>> res3.nllf() > res.nllf()
522
+ True # may vary
523
+
524
+ Note that the numerical results of the previous examples are typical, but
525
+ they may vary because the default optimizer used by `fit`,
526
+ `scipy.optimize.differential_evolution`, is stochastic. However, we can
527
+ customize the settings used by the optimizer to ensure reproducibility -
528
+ or even use a different optimizer entirely - using the `optimizer`
529
+ parameter.
530
+
531
+ >>> from scipy.optimize import differential_evolution
532
+ >>> rng = np.random.default_rng(767585560716548)
533
+ >>> def optimizer(fun, bounds, *, integrality):
534
+ ... return differential_evolution(fun, bounds, strategy='best2bin',
535
+ ... seed=rng, integrality=integrality)
536
+ >>> bounds = [(0, 30), (0, 1)]
537
+ >>> res4 = stats.fit(dist, data, bounds, optimizer=optimizer)
538
+ >>> res4.params
539
+ FitParams(n=5.0, p=0.5015183149259951, loc=0.0)
540
+
541
+ """
542
+ # --- Input Validation / Standardization --- #
543
+ user_bounds = bounds
544
+ user_guess = guess
545
+
546
+ # distribution input validation and information collection
547
+ if hasattr(dist, "pdf"): # can't use isinstance for types
548
+ default_bounds = {'loc': (0, 0), 'scale': (1, 1)}
549
+ discrete = False
550
+ elif hasattr(dist, "pmf"):
551
+ default_bounds = {'loc': (0, 0)}
552
+ discrete = True
553
+ else:
554
+ message = ("`dist` must be an instance of `rv_continuous` "
555
+ "or `rv_discrete.`")
556
+ raise ValueError(message)
557
+
558
+ try:
559
+ param_info = dist._param_info()
560
+ except AttributeError as e:
561
+ message = (f"Distribution `{dist.name}` is not yet supported by "
562
+ "`scipy.stats.fit` because shape information has "
563
+ "not been defined.")
564
+ raise ValueError(message) from e
565
+
566
+ # data input validation
567
+ data = np.asarray(data)
568
+ if data.ndim != 1:
569
+ message = "`data` must be exactly one-dimensional."
570
+ raise ValueError(message)
571
+ if not (np.issubdtype(data.dtype, np.number)
572
+ and np.all(np.isfinite(data))):
573
+ message = "All elements of `data` must be finite numbers."
574
+ raise ValueError(message)
575
+
576
+ # bounds input validation and information collection
577
+ n_params = len(param_info)
578
+ n_shapes = n_params - (1 if discrete else 2)
579
+ param_list = [param.name for param in param_info]
580
+ param_names = ", ".join(param_list)
581
+ shape_names = ", ".join(param_list[:n_shapes])
582
+
583
+ if user_bounds is None:
584
+ user_bounds = {}
585
+
586
+ if isinstance(user_bounds, dict):
587
+ default_bounds.update(user_bounds)
588
+ user_bounds = default_bounds
589
+ user_bounds_array = np.empty((n_params, 2))
590
+ for i in range(n_params):
591
+ param_name = param_info[i].name
592
+ user_bound = user_bounds.pop(param_name, None)
593
+ if user_bound is None:
594
+ user_bound = param_info[i].domain
595
+ user_bounds_array[i] = user_bound
596
+ if user_bounds:
597
+ message = ("Bounds provided for the following unrecognized "
598
+ f"parameters will be ignored: {set(user_bounds)}")
599
+ warnings.warn(message, RuntimeWarning, stacklevel=2)
600
+
601
+ else:
602
+ try:
603
+ user_bounds = np.asarray(user_bounds, dtype=float)
604
+ if user_bounds.size == 0:
605
+ user_bounds = np.empty((0, 2))
606
+ except ValueError as e:
607
+ message = ("Each element of a `bounds` sequence must be a tuple "
608
+ "containing two elements: the lower and upper bound of "
609
+ "a distribution parameter.")
610
+ raise ValueError(message) from e
611
+ if (user_bounds.ndim != 2 or user_bounds.shape[1] != 2):
612
+ message = ("Each element of `bounds` must be a tuple specifying "
613
+ "the lower and upper bounds of a shape parameter")
614
+ raise ValueError(message)
615
+ if user_bounds.shape[0] < n_shapes:
616
+ message = (f"A `bounds` sequence must contain at least {n_shapes} "
617
+ "elements: tuples specifying the lower and upper "
618
+ f"bounds of all shape parameters {shape_names}.")
619
+ raise ValueError(message)
620
+ if user_bounds.shape[0] > n_params:
621
+ message = ("A `bounds` sequence may not contain more than "
622
+ f"{n_params} elements: tuples specifying the lower and "
623
+ "upper bounds of distribution parameters "
624
+ f"{param_names}.")
625
+ raise ValueError(message)
626
+
627
+ user_bounds_array = np.empty((n_params, 2))
628
+ user_bounds_array[n_shapes:] = list(default_bounds.values())
629
+ user_bounds_array[:len(user_bounds)] = user_bounds
630
+
631
+ user_bounds = user_bounds_array
632
+ validated_bounds = []
633
+ for i in range(n_params):
634
+ name = param_info[i].name
635
+ user_bound = user_bounds_array[i]
636
+ param_domain = param_info[i].domain
637
+ integral = param_info[i].integrality
638
+ combined = _combine_bounds(name, user_bound, param_domain, integral)
639
+ validated_bounds.append(combined)
640
+
641
+ bounds = np.asarray(validated_bounds)
642
+ integrality = [param.integrality for param in param_info]
643
+
644
+ # guess input validation
645
+
646
+ if user_guess is None:
647
+ guess_array = None
648
+ elif isinstance(user_guess, dict):
649
+ default_guess = {param.name: np.mean(bound)
650
+ for param, bound in zip(param_info, bounds)}
651
+ unrecognized = set(user_guess) - set(default_guess)
652
+ if unrecognized:
653
+ message = ("Guesses provided for the following unrecognized "
654
+ f"parameters will be ignored: {unrecognized}")
655
+ warnings.warn(message, RuntimeWarning, stacklevel=2)
656
+ default_guess.update(user_guess)
657
+
658
+ message = ("Each element of `guess` must be a scalar "
659
+ "guess for a distribution parameter.")
660
+ try:
661
+ guess_array = np.asarray([default_guess[param.name]
662
+ for param in param_info], dtype=float)
663
+ except ValueError as e:
664
+ raise ValueError(message) from e
665
+
666
+ else:
667
+ message = ("Each element of `guess` must be a scalar "
668
+ "guess for a distribution parameter.")
669
+ try:
670
+ user_guess = np.asarray(user_guess, dtype=float)
671
+ except ValueError as e:
672
+ raise ValueError(message) from e
673
+ if user_guess.ndim != 1:
674
+ raise ValueError(message)
675
+ if user_guess.shape[0] < n_shapes:
676
+ message = (f"A `guess` sequence must contain at least {n_shapes} "
677
+ "elements: scalar guesses for the distribution shape "
678
+ f"parameters {shape_names}.")
679
+ raise ValueError(message)
680
+ if user_guess.shape[0] > n_params:
681
+ message = ("A `guess` sequence may not contain more than "
682
+ f"{n_params} elements: scalar guesses for the "
683
+ f"distribution parameters {param_names}.")
684
+ raise ValueError(message)
685
+
686
+ guess_array = np.mean(bounds, axis=1)
687
+ guess_array[:len(user_guess)] = user_guess
688
+
689
+ if guess_array is not None:
690
+ guess_rounded = guess_array.copy()
691
+
692
+ guess_rounded[integrality] = np.round(guess_rounded[integrality])
693
+ rounded = np.where(guess_rounded != guess_array)[0]
694
+ for i in rounded:
695
+ message = (f"Guess for parameter `{param_info[i].name}` "
696
+ f"rounded from {guess_array[i]} to {guess_rounded[i]}.")
697
+ warnings.warn(message, RuntimeWarning, stacklevel=2)
698
+
699
+ guess_clipped = np.clip(guess_rounded, bounds[:, 0], bounds[:, 1])
700
+ clipped = np.where(guess_clipped != guess_rounded)[0]
701
+ for i in clipped:
702
+ message = (f"Guess for parameter `{param_info[i].name}` "
703
+ f"clipped from {guess_rounded[i]} to "
704
+ f"{guess_clipped[i]}.")
705
+ warnings.warn(message, RuntimeWarning, stacklevel=2)
706
+
707
+ guess = guess_clipped
708
+ else:
709
+ guess = None
710
+
711
+ # --- Fitting --- #
712
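+ # The objective functions below are assumed (from their names) to be the
+ # penalized negative log-likelihood (method='mle') and the penalized
+ # negative log product of spacings (method='mse'); the penalties keep the
+ # objectives finite where a raw log-probability would be invalid.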
+ def nllf(free_params, data=data): # bind data NOW
713
+ with np.errstate(invalid='ignore', divide='ignore'):
714
+ return dist._penalized_nnlf(free_params, data)
715
+
716
+ def nlpsf(free_params, data=data): # bind data NOW
717
+ with np.errstate(invalid='ignore', divide='ignore'):
718
+ return dist._penalized_nlpsf(free_params, data)
719
+
720
+ methods = {'mle': nllf, 'mse': nlpsf}
721
+ objective = methods[method.lower()]
722
+
723
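+ # Note: with the default optimizer, `scipy.optimize.differential_evolution`,
+ # the keyword arguments assembled below correspond to its `bounds`,
+ # `integrality`, and `x0` parameters; a user-supplied `optimizer` is
+ # expected to accept whichever of these keywords are passed.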
+ with np.errstate(invalid='ignore', divide='ignore'):
724
+ kwds = {}
725
+ if bounds is not None:
726
+ kwds['bounds'] = bounds
727
+ if np.any(integrality):
728
+ kwds['integrality'] = integrality
729
+ if guess is not None:
730
+ kwds['x0'] = guess
731
+ res = optimizer(objective, **kwds)
732
+
733
+ return FitResult(dist, data, discrete, res)
734
+
735
+
736
+ GoodnessOfFitResult = namedtuple('GoodnessOfFitResult',
737
+ ('fit_result', 'statistic', 'pvalue',
738
+ 'null_distribution'))
739
+
740
+
741
+ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
742
+ guessed_params=None, statistic='ad', n_mc_samples=9999,
743
+ random_state=None):
744
+ r"""
745
+ Perform a goodness of fit test comparing data to a distribution family.
746
+
747
+ Given a distribution family and data, perform a test of the null hypothesis
748
+ that the data were drawn from a distribution in that family. Any known
749
+ parameters of the distribution may be specified. Remaining parameters of
750
+ the distribution will be fit to the data, and the p-value of the test
751
+ is computed accordingly. Several statistics for comparing the distribution
752
+ to data are available.
753
+
754
+ Parameters
755
+ ----------
756
+ dist : `scipy.stats.rv_continuous`
757
+ The object representing the distribution family under the null
758
+ hypothesis.
759
+ data : 1D array_like
760
+ Finite, uncensored data to be tested.
761
+ known_params : dict, optional
762
+ A dictionary containing name-value pairs of known distribution
763
+ parameters. Monte Carlo samples are randomly drawn from the
764
+ null-hypothesized distribution with these values of the parameters.
765
+ Before the statistic is evaluated for each Monte Carlo sample, only
766
+ remaining unknown parameters of the null-hypothesized distribution
767
+ family are fit to the samples; the known parameters are held fixed.
768
+ If all parameters of the distribution family are known, then the step
769
+ of fitting the distribution family to each sample is omitted.
770
+ fit_params : dict, optional
771
+ A dictionary containing name-value pairs of distribution parameters
772
+ that have already been fit to the data, e.g. using `scipy.stats.fit`
773
+ or the ``fit`` method of `dist`. Monte Carlo samples are drawn from the
774
+ null-hypothesized distribution with these specified values of the
775
+ parameter. On those Monte Carlo samples, however, these and all other
776
+ unknown parameters of the null-hypothesized distribution family are
777
+ fit before the statistic is evaluated.
778
+ guessed_params : dict, optional
779
+ A dictionary containing name-value pairs of distribution parameters
780
+ which have been guessed. These parameters are always considered as
781
+ free parameters and are fit both to the provided `data` as well as
782
+ to the Monte Carlo samples drawn from the null-hypothesized
783
+ distribution. The purpose of these `guessed_params` is to be used as
784
+ initial values for the numerical fitting procedure.
785
+ statistic : {"ad", "ks", "cvm", "filliben"} or callable, optional
786
+ The statistic used to compare data to a distribution after fitting
787
+ unknown parameters of the distribution family to the data. The
788
+ Anderson-Darling ("ad") [1]_, Kolmogorov-Smirnov ("ks") [1]_,
789
+ Cramer-von Mises ("cvm") [1]_, and Filliben ("filliben") [7]_
790
+ statistics are available. Alternatively, a callable with signature
791
+ ``(dist, data, axis)`` may be supplied to compute the statistic. Here
792
+ ``dist`` is a frozen distribution object (potentially with array
793
+ parameters), ``data`` is an array of Monte Carlo samples (of
794
+ compatible shape), and ``axis`` is the axis of ``data`` along which
795
+ the statistic must be computed.
796
+ n_mc_samples : int, default: 9999
797
+ The number of Monte Carlo samples drawn from the null hypothesized
798
+ distribution to form the null distribution of the statistic. The
799
+ sample size of each is the same as the given `data`.
800
+ random_state : {None, int, `numpy.random.Generator`,
801
+ `numpy.random.RandomState`}, optional
802
+
803
+ Pseudorandom number generator state used to generate the Monte Carlo
804
+ samples.
805
+
806
+ If `random_state` is ``None`` (default), the
807
+ `numpy.random.RandomState` singleton is used.
808
+ If `random_state` is an int, a new ``RandomState`` instance is used,
809
+ seeded with `random_state`.
810
+ If `random_state` is already a ``Generator`` or ``RandomState``
811
+ instance, then the provided instance is used.
812
+
813
+ Returns
814
+ -------
815
+ res : GoodnessOfFitResult
816
+ An object with the following attributes.
817
+
818
+ fit_result : `~scipy.stats._result_classes.FitResult`
819
+ An object representing the fit of the provided `dist` to `data`.
820
+ This object includes the values of distribution family parameters
821
+ that fully define the null-hypothesized distribution, that is,
822
+ the distribution from which Monte Carlo samples are drawn.
823
+ statistic : float
824
+ The value of the statistic comparing provided `data` to the
825
+ null-hypothesized distribution.
826
+ pvalue : float
827
+ The proportion of elements in the null distribution with
828
+ statistic values at least as extreme as the statistic value of the
829
+ provided `data`.
830
+ null_distribution : ndarray
831
+ The value of the statistic for each Monte Carlo sample
832
+ drawn from the null-hypothesized distribution.
833
+
834
+ Notes
835
+ -----
836
+ This is a generalized Monte Carlo goodness-of-fit procedure, special cases
837
+ of which correspond with various Anderson-Darling tests, Lilliefors' test,
838
+ etc. The test is described in [2]_, [3]_, and [4]_ as a parametric
839
+ bootstrap test. This is a Monte Carlo test in which parameters that
840
+ specify the distribution from which samples are drawn have been estimated
841
+ from the data. We describe the test using "Monte Carlo" rather than
842
+ "parametric bootstrap" throughout to avoid confusion with the more familiar
843
+ nonparametric bootstrap, and describe how the test is performed below.
844
+
845
+ *Traditional goodness of fit tests*
846
+
847
+ Traditionally, critical values corresponding with a fixed set of
848
+ significance levels are pre-calculated using Monte Carlo methods. Users
849
+ perform the test by calculating the value of the test statistic only for
850
+ their observed `data` and comparing this value to tabulated critical
851
+ values. This practice is not very flexible, as tables are not available for
852
+ all distributions and combinations of known and unknown parameter values.
853
+ Also, results can be inaccurate when critical values are interpolated from
854
+ limited tabulated data to correspond with the user's sample size and
855
+ fitted parameter values. To overcome these shortcomings, this function
856
+ allows the user to perform the Monte Carlo trials adapted to their
857
+ particular data.
858
+
859
+ *Algorithmic overview*
860
+
861
+ In brief, this routine executes the following steps:
862
+
863
+ 1. Fit unknown parameters to the given `data`, thereby forming the
864
+ "null-hypothesized" distribution, and compute the statistic of
865
+ this pair of data and distribution.
866
+ 2. Draw random samples from this null-hypothesized distribution.
867
+ 3. Fit the unknown parameters to each random sample.
868
+ 4. Calculate the statistic between each sample and the distribution that
869
+ has been fit to the sample.
870
+ 5. Compare the value of the statistic corresponding with `data` from (1)
871
+ against the values of the statistic corresponding with the random
872
+ samples from (4). The p-value is the proportion of samples with a
873
+ statistic value greater than or equal to the statistic of the observed
874
+ data.
875
+
876
+ In more detail, the steps are as follows.
877
+
878
+ First, any unknown parameters of the distribution family specified by
879
+ `dist` are fit to the provided `data` using maximum likelihood estimation.
880
+ (One exception is the normal distribution with unknown location and scale:
881
+ we use the bias-corrected standard deviation ``np.std(data, ddof=1)`` for
882
+ the scale as recommended in [1]_.)
883
+ These values of the parameters specify a particular member of the
884
+ distribution family referred to as the "null-hypothesized distribution",
885
+ that is, the distribution from which the data were sampled under the null
886
+ hypothesis. The `statistic`, which compares data to a distribution, is
887
+ computed between `data` and the null-hypothesized distribution.
888
+
889
+ Next, many (specifically `n_mc_samples`) new samples, each containing the
890
+ same number of observations as `data`, are drawn from the
891
+ null-hypothesized distribution. All unknown parameters of the distribution
892
+ family `dist` are fit to *each resample*, and the `statistic` is computed
893
+ between each sample and its corresponding fitted distribution. These
894
+ values of the statistic form the Monte Carlo null distribution (not to be
895
+ confused with the "null-hypothesized distribution" above).
896
+
897
+ The p-value of the test is the proportion of statistic values in the Monte
898
+ Carlo null distribution that are at least as extreme as the statistic value
899
+ of the provided `data`. More precisely, the p-value is given by
900
+
901
+ .. math::
902
+
903
+ p = \frac{b + 1}
904
+ {m + 1}
905
+
906
+ where :math:`b` is the number of statistic values in the Monte Carlo null
907
+ distribution that are greater than or equal to the statistic value
908
+ calculated for `data`, and :math:`m` is the number of elements in the
909
+ Monte Carlo null distribution (`n_mc_samples`). The addition of :math:`1`
910
+ to the numerator and denominator can be thought of as including the
911
+ value of the statistic corresponding with `data` in the null distribution,
912
+ but a more formal explanation is given in [5]_.
913
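+
+ For illustration only (the numbers below are hypothetical), the formula
+ can be evaluated directly from an observed statistic value and a Monte
+ Carlo null distribution:
+
+ >>> import numpy as np
+ >>> null_distribution = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
+ >>> observed_statistic = 0.6
+ >>> b = np.sum(null_distribution >= observed_statistic)
+ >>> print((b + 1) / (len(null_distribution) + 1))
+ 0.5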
+
914
+ *Limitations*
915
+
916
+ The test can be very slow for some distribution families because unknown
917
+ parameters of the distribution family must be fit to each of the Monte
918
+ Carlo samples, and for most distributions in SciPy, distribution fitting
919
+ is performed via numerical optimization.
920
+
921
+ *Anti-Pattern*
922
+
923
+ For this reason, it may be tempting
924
+ to treat parameters of the distribution pre-fit to `data` (by the user)
925
+ as though they were `known_params`, as specification of all parameters of
926
+ the distribution precludes the need to fit the distribution to each Monte
927
+ Carlo sample. (This is essentially how the original Kolmogorov-Smirnov
928
+ test is performed.) Although such a test can provide evidence against the
929
+ null hypothesis, the test is conservative in the sense that small p-values
930
+ will tend to (greatly) *overestimate* the probability of making a type I
931
+ error (that is, rejecting the null hypothesis although it is true), and the
932
+ power of the test is low (that is, it is less likely to reject the null
933
+ hypothesis even when the null hypothesis is false).
934
+ This is because the Monte Carlo samples are less likely to agree with the
935
+ null-hypothesized distribution as well as `data`. This tends to increase
936
+ the values of the statistic recorded in the null distribution, so that a
937
+ larger number of them exceed the value of statistic for `data`, thereby
938
+ inflating the p-value.
939
+
940
+ References
941
+ ----------
942
+ .. [1] M. A. Stephens (1974). "EDF Statistics for Goodness of Fit and
943
+ Some Comparisons." Journal of the American Statistical Association,
944
+ Vol. 69, pp. 730-737.
945
+ .. [2] W. Stute, W. G. Manteiga, and M. P. Quindimil (1993).
946
+ "Bootstrap based goodness-of-fit-tests." Metrika 40.1: 243-256.
947
+ .. [3] C. Genest, & B Rémillard. (2008). "Validity of the parametric
948
+ bootstrap for goodness-of-fit testing in semiparametric models."
949
+ Annales de l'IHP Probabilités et statistiques. Vol. 44. No. 6.
950
+ .. [4] I. Kojadinovic and J. Yan (2012). "Goodness-of-fit testing based on
951
+ a weighted bootstrap: A fast large-sample alternative to the
952
+ parametric bootstrap." Canadian Journal of Statistics 40.3: 480-500.
953
+ .. [5] B. Phipson and G. K. Smyth (2010). "Permutation P-values Should
954
+ Never Be Zero: Calculating Exact P-values When Permutations Are
955
+ Randomly Drawn." Statistical Applications in Genetics and Molecular
956
+ Biology 9.1.
957
+ .. [6] H. W. Lilliefors (1967). "On the Kolmogorov-Smirnov test for
958
+ normality with mean and variance unknown." Journal of the American
959
+ statistical Association 62.318: 399-402.
960
+ .. [7] Filliben, James J. "The probability plot correlation coefficient
961
+ test for normality." Technometrics 17.1 (1975): 111-117.
962
+
963
+ Examples
964
+ --------
965
+ A well-known test of the null hypothesis that data were drawn from a
966
+ given distribution is the Kolmogorov-Smirnov (KS) test, available in SciPy
967
+ as `scipy.stats.ks_1samp`. Suppose we wish to test whether the following
968
+ data:
969
+
970
+ >>> import numpy as np
971
+ >>> from scipy import stats
972
+ >>> rng = np.random.default_rng()
973
+ >>> x = stats.uniform.rvs(size=75, random_state=rng)
974
+
975
+ were sampled from a normal distribution. To perform a KS test, the
976
+ empirical distribution function of the observed data will be compared
977
+ against the (theoretical) cumulative distribution function of a normal
978
+ distribution. Of course, to do this, the normal distribution under the null
979
+ hypothesis must be fully specified. This is commonly done by first fitting
980
+ the ``loc`` and ``scale`` parameters of the distribution to the observed
981
+ data, then performing the test.
982
+
983
+ >>> loc, scale = np.mean(x), np.std(x, ddof=1)
984
+ >>> cdf = stats.norm(loc, scale).cdf
985
+ >>> stats.ks_1samp(x, cdf)
986
+ KstestResult(statistic=0.1119257570456813, pvalue=0.2827756409939257)
987
+
988
+ An advantage of the KS-test is that the p-value - the probability of
989
+ obtaining a value of the test statistic under the null hypothesis as
990
+ extreme as the value obtained from the observed data - can be calculated
991
+ exactly and efficiently. `goodness_of_fit` can only approximate these
992
+ results.
993
+
994
+ >>> known_params = {'loc': loc, 'scale': scale}
995
+ >>> res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
996
+ ... statistic='ks', random_state=rng)
997
+ >>> res.statistic, res.pvalue
998
+ (0.1119257570456813, 0.2788)
999
+
1000
+ The statistic matches exactly, but the p-value is estimated by forming
1001
+ a "Monte Carlo null distribution", that is, by explicitly drawing random
1002
+ samples from `scipy.stats.norm` with the provided parameters and
1003
+ calculating the statistic for each. The fraction of these statistic values
1004
+ at least as extreme as ``res.statistic`` approximates the exact p-value
1005
+ calculated by `scipy.stats.ks_1samp`.
1006
+
1007
+ However, in many cases, we would prefer to test only that the data were
1008
+ sampled from one of *any* member of the normal distribution family, not
1009
+ specifically from the normal distribution with the location and scale
1010
+ fitted to the observed sample. In this case, Lilliefors [6]_ argued that
1011
+ the KS test is far too conservative (that is, the p-value overstates
1012
+ the actual probability of rejecting a true null hypothesis) and thus lacks
1013
+ power - the ability to reject the null hypothesis when the null hypothesis
1014
+ is actually false.
1015
+ Indeed, our p-value above is approximately 0.28, which is far too large
1016
+ to reject the null hypothesis at any common significance level.
1017
+
1018
+ Consider why this might be. Note that in the KS test above, the statistic
1019
+ always compares data against the CDF of a normal distribution fitted to the
1020
+ *observed data*. This tends to reduce the value of the statistic for the
1021
+ observed data, but it is "unfair" when computing the statistic for other
1022
+ samples, such as those we randomly draw to form the Monte Carlo null
1023
+ distribution. It is easy to correct for this: whenever we compute the KS
1024
+ statistic of a sample, we use the CDF of a normal distribution fitted
1025
+ to *that sample*. The null distribution in this case has not been
1026
+ calculated exactly and is typically approximated using Monte Carlo methods
1027
+ as described above. This is where `goodness_of_fit` excels.
1028
+
1029
+ >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ks',
1030
+ ... random_state=rng)
1031
+ >>> res.statistic, res.pvalue
1032
+ (0.1119257570456813, 0.0196)
1033
+
1034
+ Indeed, this p-value is much smaller, and small enough to (correctly)
1035
+ reject the null hypothesis at common significance levels, including 5% and
1036
+ 2.5%.
1037
+
1038
+ However, the KS statistic is not very sensitive to all deviations from
1039
+ normality. The original advantage of the KS statistic was the ability
1040
+ to compute the null distribution theoretically, but a more sensitive
1041
+ statistic - resulting in a higher test power - can be used now that we can
1042
+ approximate the null distribution
1043
+ computationally. The Anderson-Darling statistic [1]_ tends to be more
1044
+ sensitive, and critical values of this statistic have been tabulated
1045
+ for various significance levels and sample sizes using Monte Carlo methods.
1046
+
1047
+ >>> res = stats.anderson(x, 'norm')
1048
+ >>> print(res.statistic)
1049
+ 1.2139573337497467
1050
+ >>> print(res.critical_values)
1051
+ [0.549 0.625 0.75 0.875 1.041]
1052
+ >>> print(res.significance_level)
1053
+ [15. 10. 5. 2.5 1. ]
1054
+
1055
+ Here, the observed value of the statistic exceeds the critical value
1056
+ corresponding with a 1% significance level. This tells us that the p-value
1057
+ of the observed data is less than 1%, but what is it? We could interpolate
1058
+ from these (already-interpolated) values, but `goodness_of_fit` can
1059
+ estimate it directly.
1060
+
1061
+ >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ad',
1062
+ ... random_state=rng)
1063
+ >>> res.statistic, res.pvalue
1064
+ (1.2139573337497467, 0.0034)
1065
+
1066
+ A further advantage is that use of `goodness_of_fit` is not limited to
1067
+ a particular set of distributions or conditions on which parameters
1068
+ are known versus which must be estimated from data. Instead,
1069
+ `goodness_of_fit` can estimate p-values relatively quickly for any
1070
+ distribution with a sufficiently fast and reliable ``fit`` method. For
1071
+ instance, here we perform a goodness of fit test using the Cramer-von Mises
1072
+ statistic against the Rayleigh distribution with known location and unknown
1073
+ scale.
1074
+
1075
+ >>> rng = np.random.default_rng()
1076
+ >>> x = stats.chi(df=2.2, loc=0, scale=2).rvs(size=1000, random_state=rng)
1077
+ >>> res = stats.goodness_of_fit(stats.rayleigh, x, statistic='cvm',
1078
+ ... known_params={'loc': 0}, random_state=rng)
1079
+
1080
+ This executes fairly quickly, but to check the reliability of the ``fit``
1081
+ method, we should inspect the fit result.
1082
+
1083
+ >>> res.fit_result # location is as specified, and scale is reasonable
1084
+ params: FitParams(loc=0.0, scale=2.1026719844231243)
1085
+ success: True
1086
+ message: 'The fit was performed successfully.'
1087
+ >>> import matplotlib.pyplot as plt # matplotlib must be installed to plot
1088
+ >>> res.fit_result.plot()
1089
+ >>> plt.show()
1090
+
1091
+ If the distribution is not fit to the observed data as well as possible,
1092
+ the test may not control the type I error rate, that is, the chance of
1093
+ rejecting the null hypothesis even when it is true.
1094
+
1095
+ We should also look for extreme outliers in the null distribution that
1096
+ may be caused by unreliable fitting. These do not necessarily invalidate
1097
+ the result, but they tend to reduce the test's power.
1098
+
1099
+ >>> _, ax = plt.subplots()
1100
+ >>> ax.hist(np.log10(res.null_distribution))
1101
+ >>> ax.set_xlabel("log10 of CVM statistic under the null hypothesis")
1102
+ >>> ax.set_ylabel("Frequency")
1103
+ >>> ax.set_title("Histogram of the Monte Carlo null distribution")
1104
+ >>> plt.show()
1105
+
1106
+ This plot seems reassuring.
1107
+
1108
+ If the ``fit`` method is working reliably, and if the distribution of the test
1109
+ statistic is not particularly sensitive to the values of the fitted
1110
+ parameters, then the p-value provided by `goodness_of_fit` is expected to
1111
+ be a good approximation.
1112
+
1113
+ >>> res.statistic, res.pvalue
1114
+ (0.2231991510248692, 0.0525)
1115
+
1116
+ """
1117
+ args = _gof_iv(dist, data, known_params, fit_params, guessed_params,
1118
+ statistic, n_mc_samples, random_state)
1119
+ (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
1120
+ guessed_rfd_params, statistic, n_mc_samples_int, random_state) = args
1121
+
1122
+ # Fit null hypothesis distribution to data
1123
+ nhd_fit_fun = _get_fit_fun(dist, data, guessed_nhd_params,
1124
+ fixed_nhd_params)
1125
+ nhd_vals = nhd_fit_fun(data)
1126
+ nhd_dist = dist(*nhd_vals)
1127
+
1128
+ def rvs(size):
1129
+ return nhd_dist.rvs(size=size, random_state=random_state)
1130
+
1131
+ # Define statistic
1132
+ fit_fun = _get_fit_fun(dist, data, guessed_rfd_params, fixed_rfd_params)
1133
+ if callable(statistic):
1134
+ compare_fun = statistic
1135
+ else:
1136
+ compare_fun = _compare_dict[statistic]
1137
+ alternative = getattr(compare_fun, 'alternative', 'greater')
1138
+
1139
+ def statistic_fun(data, axis):
1140
+ # Make things simple by always working along the last axis.
1141
+ data = np.moveaxis(data, axis, -1)
1142
+ rfd_vals = fit_fun(data)
1143
+ rfd_dist = dist(*rfd_vals)
1144
+ return compare_fun(rfd_dist, data, axis=-1)
1145
+
1146
+ res = stats.monte_carlo_test(data, rvs, statistic_fun, vectorized=True,
1147
+ n_resamples=n_mc_samples, axis=-1,
1148
+ alternative=alternative)
1149
+ opt_res = optimize.OptimizeResult()
1150
+ opt_res.success = True
1151
+ opt_res.message = "The fit was performed successfully."
1152
+ opt_res.x = nhd_vals
1153
+ # Only continuous distributions for now, hence discrete=False
1154
+ # There's no fundamental limitation; it's just that we're not using
1155
+ # stats.fit, discrete distributions don't have a `fit` method, and
1156
+ # we haven't written any vectorized fit functions for a discrete
1157
+ # distribution yet.
1158
+ return GoodnessOfFitResult(FitResult(dist, data, False, opt_res),
1159
+ res.statistic, res.pvalue,
1160
+ res.null_distribution)
1161
+
1162
+
1163
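+ # Returns a function that fits `dist` to a sample (vectorized along the
+ # sample's last axis), holding `fixed_params` fixed; `guessed_params`, when
+ # present, seed `dist.fit` in the general (non-vectorized) branch.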
+ def _get_fit_fun(dist, data, guessed_params, fixed_params):
1164
+
1165
+ shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
1166
+ param_names = shape_names + ['loc', 'scale']
1167
+ fparam_names = ['f'+name for name in param_names]
1168
+ all_fixed = not set(fparam_names).difference(fixed_params)
1169
+ guessed_shapes = [guessed_params.pop(x, None)
1170
+ for x in shape_names if x in guessed_params]
1171
+
1172
+ if all_fixed:
1173
+ def fit_fun(data):
1174
+ return [fixed_params[name] for name in fparam_names]
1175
+ # Define statistic, including fitting distribution to data
1176
+ elif dist in _fit_funs:
1177
+ def fit_fun(data):
1178
+ params = _fit_funs[dist](data, **fixed_params)
1179
+ params = np.asarray(np.broadcast_arrays(*params))
1180
+ if params.ndim > 1:
1181
+ params = params[..., np.newaxis]
1182
+ return params
1183
+ else:
1184
+ def fit_fun_1d(data):
1185
+ return dist.fit(data, *guessed_shapes, **guessed_params,
1186
+ **fixed_params)
1187
+
1188
+ def fit_fun(data):
1189
+ params = np.apply_along_axis(fit_fun_1d, axis=-1, arr=data)
1190
+ if params.ndim > 1:
1191
+ params = params.T[..., np.newaxis]
1192
+ return params
1193
+
1194
+ return fit_fun
1195
+
1196
+
1197
+ # Vectorized fitting functions. These are to accept ND `data` in which each
1198
+ # row (slice along last axis) is a sample to fit and scalar fixed parameters.
1199
+ # They return a tuple of shape parameter arrays, each of shape data.shape[:-1].
1200
+ def _fit_norm(data, floc=None, fscale=None):
1201
+ loc = floc
1202
+ scale = fscale
1203
+ if loc is None and scale is None:
1204
+ loc = np.mean(data, axis=-1)
1205
+ scale = np.std(data, ddof=1, axis=-1)
1206
+ elif loc is None:
1207
+ loc = np.mean(data, axis=-1)
1208
+ elif scale is None:
1209
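+ # with `loc` fixed, use the maximum likelihood estimate of the scale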
+ scale = np.sqrt(((data - loc)**2).mean(axis=-1))
1210
+ return loc, scale
1211
+
1212
+
1213
+ _fit_funs = {stats.norm: _fit_norm} # type: ignore[attr-defined]
1214
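+
+ # Illustrative sketch of the contract above (names are hypothetical):
+ #     rng = np.random.default_rng()
+ #     samples = rng.standard_normal((3, 50))  # three samples of size 50
+ #     loc, scale = _fit_norm(samples)         # loc.shape == scale.shape == (3,)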
+
1215
+
1216
+ # Vectorized goodness of fit statistic functions. These accept a frozen
1217
+ # distribution object and `data` in which each row (slice along last axis) is
1218
+ # a sample.
1219
+
1220
+
1221
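+ # Computes the Anderson-Darling statistic, vectorized along the last axis:
+ #     A^2 = -n - (1/n) * sum_{i=1..n} (2i - 1) * [ln F(x_(i)) + ln(1 - F(x_(n+1-i)))]
+ # where x_(1) <= ... <= x_(n) are the sorted data and F is the CDF of `dist`.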
+ def _anderson_darling(dist, data, axis):
1222
+ x = np.sort(data, axis=-1)
1223
+ n = data.shape[-1]
1224
+ i = np.arange(1, n+1)
1225
+ Si = (2*i - 1)/n * (dist.logcdf(x) + dist.logsf(x[..., ::-1]))
1226
+ S = np.sum(Si, axis=-1)
1227
+ return -n - S
1228
+
1229
+
1230
+ def _compute_dplus(cdfvals): # adapted from _stats_py before gh-17062
1231
+ n = cdfvals.shape[-1]
1232
+ return (np.arange(1.0, n + 1) / n - cdfvals).max(axis=-1)
1233
+
1234
+
1235
+ def _compute_dminus(cdfvals):
1236
+ n = cdfvals.shape[-1]
1237
+ return (cdfvals - np.arange(0.0, n)/n).max(axis=-1)
1238
+
1239
+
1240
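+ # Computes the Kolmogorov-Smirnov statistic D = max(D+, D-), where
+ #     D+ = max_i (i/n - F(x_(i)))  and  D- = max_i (F(x_(i)) - (i-1)/n),
+ # with sorted data x_(1) <= ... <= x_(n), vectorized along the last axis.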
+ def _kolmogorov_smirnov(dist, data, axis):
1241
+ x = np.sort(data, axis=-1)
1242
+ cdfvals = dist.cdf(x)
1243
+ Dplus = _compute_dplus(cdfvals) # always works along last axis
1244
+ Dminus = _compute_dminus(cdfvals)
1245
+ return np.maximum(Dplus, Dminus)
1246
+
1247
+
1248
+ def _corr(X, M):
1249
+ # Correlation coefficient r, simplified and vectorized as we need it.
1250
+ # See [7] Equation (2). Lemma 1/2 are only for distributions symmetric
1251
+ # about 0.
1252
+ Xm = X.mean(axis=-1, keepdims=True)
1253
+ Mm = M.mean(axis=-1, keepdims=True)
1254
+ num = np.sum((X - Xm) * (M - Mm), axis=-1)
1255
+ den = np.sqrt(np.sum((X - Xm)**2, axis=-1) * np.sum((M - Mm)**2, axis=-1))
1256
+ return num/den
1257
+
1258
+
1259
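+ # Computes the Filliben probability plot correlation coefficient: the
+ # correlation between the sorted data and the order-statistic medians of
+ # `dist`. Poor fits give *smaller* values, hence `alternative = 'less'` below.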
+ def _filliben(dist, data, axis):
1260
+ # [7] Section 8 # 1
1261
+ X = np.sort(data, axis=-1)
1262
+
1263
+ # [7] Section 8 # 2
1264
+ n = data.shape[-1]
1265
+ k = np.arange(1, n+1)
1266
+ # Filliben used an approximation for the uniform distribution order
1267
+ # statistic medians.
1268
+ # m = (k - .3175)/(n + 0.365)
1269
+ # m[-1] = 0.5**(1/n)
1270
+ # m[0] = 1 - m[-1]
1271
+ # We can just as easily use the (theoretically) exact values. See e.g.
1272
+ # https://en.wikipedia.org/wiki/Order_statistic
1273
+ # "Order statistics sampled from a uniform distribution"
1274
+ m = stats.beta(k, n + 1 - k).median()
1275
+
1276
+ # [7] Section 8 # 3
1277
+ M = dist.ppf(m)
1278
+
1279
+ # [7] Section 8 # 4
1280
+ return _corr(X, M)
1281
+ _filliben.alternative = 'less' # type: ignore[attr-defined]
1282
+
1283
+
1284
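+ # Computes the Cramér-von Mises statistic, vectorized along the last axis:
+ #     W = 1/(12 n) + sum_{i=1..n} ((2i - 1)/(2 n) - F(x_(i)))^2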
+ def _cramer_von_mises(dist, data, axis):
1285
+ x = np.sort(data, axis=-1)
1286
+ n = data.shape[-1]
1287
+ cdfvals = dist.cdf(x)
1288
+ u = (2*np.arange(1, n+1) - 1)/(2*n)
1289
+ w = 1 / (12*n) + np.sum((u - cdfvals)**2, axis=-1)
1290
+ return w
1291
+
1292
+
1293
+ _compare_dict = {"ad": _anderson_darling, "ks": _kolmogorov_smirnov,
1294
+ "cvm": _cramer_von_mises, "filliben": _filliben}
1295
+
1296
+
1297
+ def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic,
1298
+ n_mc_samples, random_state):
1299
+
1300
+ if not isinstance(dist, stats.rv_continuous):
1301
+ message = ("`dist` must be a (non-frozen) instance of "
1302
+ "`stats.rv_continuous`.")
1303
+ raise TypeError(message)
1304
+
1305
+ data = np.asarray(data, dtype=float)
1306
+ if not data.ndim == 1:
1307
+ message = "`data` must be a one-dimensional array of numbers."
1308
+ raise ValueError(message)
1309
+
1310
+ # Leave validation of these key/value pairs to the `fit` method,
1311
+ # but collect these into dictionaries that will be used
1312
+ known_params = known_params or dict()
1313
+ fit_params = fit_params or dict()
1314
+ guessed_params = guessed_params or dict()
1315
+
1316
+ known_params_f = {("f"+key): val for key, val in known_params.items()}
1317
+ fit_params_f = {("f"+key): val for key, val in fit_params.items()}
1318
+
1319
+ # These are the values of parameters of the null distribution family
1320
+ # with which resamples are drawn
1321
+ fixed_nhd_params = known_params_f.copy()
1322
+ fixed_nhd_params.update(fit_params_f)
1323
+
1324
+ # These are fixed when fitting the distribution family to resamples
1325
+ fixed_rfd_params = known_params_f.copy()
1326
+
1327
+ # These are used as guesses when fitting the distribution family to
1328
+ # the original data
1329
+ guessed_nhd_params = guessed_params.copy()
1330
+
1331
+ # These are used as guesses when fitting the distribution family to
1332
+ # resamples
1333
+ guessed_rfd_params = fit_params.copy()
1334
+ guessed_rfd_params.update(guessed_params)
1335
+
1336
+ if not callable(statistic):
1337
+ statistic = statistic.lower()
1338
+ statistics = {'ad', 'ks', 'cvm', 'filliben'}
1339
+ if statistic not in statistics:
1340
+ message = f"`statistic` must be one of {statistics}."
1341
+ raise ValueError(message)
1342
+
1343
+ n_mc_samples_int = int(n_mc_samples)
1344
+ if n_mc_samples_int != n_mc_samples:
1345
+ message = "`n_mc_samples` must be an integer."
1346
+ raise TypeError(message)
1347
+
1348
+ random_state = check_random_state(random_state)
1349
+
1350
+ return (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
1351
+ guessed_rfd_params, statistic, n_mc_samples_int, random_state)
.venv/Lib/site-packages/scipy/stats/_generate_pyx.py ADDED
@@ -0,0 +1,27 @@
1
+ import pathlib
2
+ import subprocess
3
+ import sys
4
+ import os
5
+ import argparse
6
+
7
+
8
+ def make_boost(outdir):
9
+ # Call code generator inside _boost directory
10
+ code_gen = pathlib.Path(__file__).parent / '_boost/include/code_gen.py'
11
+ subprocess.run([sys.executable, str(code_gen), '-o', outdir],
12
+ check=True)
13
+
14
+
15
+ if __name__ == '__main__':
16
+ parser = argparse.ArgumentParser()
17
+ parser.add_argument("-o", "--outdir", type=str,
18
+ help="Path to the output directory")
19
+ args = parser.parse_args()
20
+
21
+ if not args.outdir:
22
+ raise ValueError("A path to the output directory is required")
23
+ else:
24
+ # Meson build
25
+ srcdir_abs = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
26
+ outdir_abs = pathlib.Path(os.getcwd()) / args.outdir
27
+ make_boost(outdir_abs)
.venv/Lib/site-packages/scipy/stats/_hypotests.py ADDED
@@ -0,0 +1,2021 @@
1
+ from collections import namedtuple
2
+ from dataclasses import dataclass
3
+ from math import comb
4
+ import numpy as np
5
+ import warnings
6
+ from itertools import combinations
7
+ import scipy.stats
8
+ from scipy.optimize import shgo
9
+ from . import distributions
10
+ from ._common import ConfidenceInterval
11
+ from ._continuous_distns import chi2, norm
12
+ from scipy.special import gamma, kv, gammaln
13
+ from scipy.fft import ifft
14
+ from ._stats_pythran import _a_ij_Aij_Dij2
15
+ from ._stats_pythran import (
16
+ _concordant_pairs as _P, _discordant_pairs as _Q
17
+ )
18
+ from ._axis_nan_policy import _axis_nan_policy_factory
19
+ from scipy.stats import _stats_py
20
+
21
+ __all__ = ['epps_singleton_2samp', 'cramervonmises', 'somersd',
22
+ 'barnard_exact', 'boschloo_exact', 'cramervonmises_2samp',
23
+ 'tukey_hsd', 'poisson_means_test']
24
+
25
+ Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult',
26
+ ('statistic', 'pvalue'))
27
+
28
+
29
+ @_axis_nan_policy_factory(Epps_Singleton_2sampResult, n_samples=2, too_small=4)
30
+ def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
31
+ """Compute the Epps-Singleton (ES) test statistic.
32
+
33
+ Test the null hypothesis that two samples have the same underlying
34
+ probability distribution.
35
+
36
+ Parameters
37
+ ----------
38
+ x, y : array-like
39
+ The two samples of observations to be tested. Input must not have more
40
+ than one dimension. Samples can have different lengths.
41
+ t : array-like, optional
42
+ The points (t1, ..., tn) where the empirical characteristic function is
43
+ to be evaluated. These should be positive, distinct numbers. The default
44
+ value (0.4, 0.8) is proposed in [1]_. Input must not have more than
45
+ one dimension.
46
+
47
+ Returns
48
+ -------
49
+ statistic : float
50
+ The test statistic.
51
+ pvalue : float
52
+ The associated p-value based on the asymptotic chi2-distribution.
53
+
54
+ See Also
55
+ --------
56
+ ks_2samp, anderson_ksamp
57
+
58
+ Notes
59
+ -----
60
+ Testing whether two samples are generated by the same underlying
61
+ distribution is a classical question in statistics. A widely used test is
62
+ the Kolmogorov-Smirnov (KS) test which relies on the empirical
63
+ distribution function. Epps and Singleton introduce a test based on the
64
+ empirical characteristic function in [1]_.
65
+
66
+ One advantage of the ES test compared to the KS test is that it does
67
+ not assume a continuous distribution. In [1]_, the authors conclude
68
+ that the test also has a higher power than the KS test in many
69
+ examples. They recommend the use of the ES test for discrete samples as
70
+ well as continuous samples with at least 25 observations each, whereas
71
+ `anderson_ksamp` is recommended for smaller sample sizes in the
72
+ continuous case.
73
+
74
+ The p-value is computed from the asymptotic distribution of the test
75
+ statistic which follows a `chi2` distribution. If the sample size of both
76
+ `x` and `y` is below 25, the small sample correction proposed in [1]_ is
77
+ applied to the test statistic.
78
+
79
+ The default values of `t` are determined in [1]_ by considering
80
+ various distributions and finding good values that lead to a high power
81
+ of the test in general. Table III in [1]_ gives the optimal values for
82
+ the distributions tested in that study. The values of `t` are scaled by
83
+ the semi-interquartile range in the implementation, see [1]_.
84
+
85
+ References
86
+ ----------
87
+ .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
88
+ problem using the empirical characteristic function", Journal of
89
+ Statistical Computation and Simulation 26, p. 177--203, 1986.
90
+
91
+ .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
92
+ - the Epps-Singleton two-sample test using the empirical characteristic
93
+ function", The Stata Journal 9(3), p. 454--465, 2009.
94
+
95
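+ Examples
+ --------
+ A minimal usage sketch (not taken from the references): draw two random
+ samples and test whether they share an underlying distribution. Because
+ the data are randomly generated, the statistic and p-value vary between
+ runs.
+
+ >>> import numpy as np
+ >>> from scipy import stats
+ >>> rng = np.random.default_rng()
+ >>> x = rng.normal(size=30)
+ >>> y = rng.normal(loc=0.5, size=30)
+ >>> res = stats.epps_singleton_2samp(x, y)
+ >>> res.statistic, res.pvalue  # doctest: +SKIP
+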
+ """
96
+ # x and y are converted to arrays by the decorator
97
+ t = np.asarray(t)
98
+ # check if x and y are valid inputs
99
+ nx, ny = len(x), len(y)
100
+ if (nx < 5) or (ny < 5):
101
+ raise ValueError('x and y should have at least 5 elements, but len(x) '
102
+ f'= {nx} and len(y) = {ny}.')
103
+ if not np.isfinite(x).all():
104
+ raise ValueError('x must not contain nonfinite values.')
105
+ if not np.isfinite(y).all():
106
+ raise ValueError('y must not contain nonfinite values.')
107
+ n = nx + ny
108
+
109
+ # check if t is valid
110
+ if t.ndim > 1:
111
+ raise ValueError(f't must be 1d, but t.ndim equals {t.ndim}.')
112
+ if np.less_equal(t, 0).any():
113
+ raise ValueError('t must contain positive elements only.')
114
+
115
+ # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
116
+ # circular import
117
+ from scipy.stats import iqr
118
+ sigma = iqr(np.hstack((x, y))) / 2
119
+ ts = np.reshape(t, (-1, 1)) / sigma
120
+
121
+ # covariance estimation of ES test
122
+ gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T # shape = (nx, 2*len(t))
123
+ gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
124
+ cov_x = np.cov(gx.T, bias=True) # the test uses biased cov-estimate
125
+ cov_y = np.cov(gy.T, bias=True)
126
+ est_cov = (n/nx)*cov_x + (n/ny)*cov_y
127
+ est_cov_inv = np.linalg.pinv(est_cov)
128
+ r = np.linalg.matrix_rank(est_cov_inv)
129
+ if r < 2*len(t):
130
+ warnings.warn('Estimated covariance matrix does not have full rank. '
131
+ 'This indicates a bad choice of the input t and the '
132
+ 'test might not be consistent.', # see p. 183 in [1]_
133
+ stacklevel=2)
134
+
135
+ # compute test statistic w distributed asympt. as chisquare with df=r
136
+ g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
137
+ w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))
138
+
139
+ # apply small-sample correction
140
+ if (max(nx, ny) < 25):
141
+ corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
142
+ w = corr * w
143
+
144
+ p = chi2.sf(w, r)
145
+
146
+ return Epps_Singleton_2sampResult(w, p)
147
+
148
+
149
+ def poisson_means_test(k1, n1, k2, n2, *, diff=0, alternative='two-sided'):
150
+ r"""
151
+ Performs the Poisson means test, also known as the "E-test".
152
+
153
+ This is a test of the null hypothesis that the difference between means of
154
+ two Poisson distributions is `diff`. The samples are provided as the
155
+ number of events `k1` and `k2` observed within measurement intervals
156
+ (e.g. of time, space, number of observations) of sizes `n1` and `n2`.
157
+
158
+ Parameters
159
+ ----------
160
+ k1 : int
161
+ Number of events observed from distribution 1.
162
+ n1 : float
163
+ Size of sample from distribution 1.
164
+ k2 : int
165
+ Number of events observed from distribution 2.
166
+ n2 : float
167
+ Size of sample from distribution 2.
168
+ diff : float, default=0
169
+ The hypothesized difference in means between the distributions
170
+ underlying the samples.
171
+ alternative : {'two-sided', 'less', 'greater'}, optional
172
+ Defines the alternative hypothesis.
173
+ The following options are available (default is 'two-sided'):
174
+
175
+ * 'two-sided': the difference between distribution means is not
176
+ equal to `diff`
177
+ * 'less': the difference between distribution means is less than
178
+ `diff`
179
+ * 'greater': the difference between distribution means is greater
180
+ than `diff`
181
+
182
+ Returns
183
+ -------
184
+ statistic : float
185
+ The test statistic (see [1]_ equation 3.3).
186
+ pvalue : float
187
+ The probability of achieving such an extreme value of the test
188
+ statistic under the null hypothesis.
189
+
190
+ Notes
191
+ -----
192
+
193
+ Let:
194
+
195
+ .. math:: X_1 \sim \mbox{Poisson}(\mathtt{n1}\lambda_1)
196
+
197
+ be a random variable independent of
198
+
199
+ .. math:: X_2 \sim \mbox{Poisson}(\mathtt{n2}\lambda_2)
200
+
201
+ and let ``k1`` and ``k2`` be the observed values of :math:`X_1`
202
+ and :math:`X_2`, respectively. Then `poisson_means_test` uses the number
203
+ of observed events ``k1`` and ``k2`` from samples of size ``n1`` and
204
+ ``n2``, respectively, to test the null hypothesis that
205
+
206
+ .. math::
207
+ H_0: \lambda_1 - \lambda_2 = \mathtt{diff}
208
+
209
+ A benefit of the E-test is that it has good power for small sample sizes,
210
+ which can reduce sampling costs [1]_. It has been evaluated and determined
211
+ to be more powerful than the comparable C-test, sometimes referred to as
212
+ the Poisson exact test.
213
+
214
+ References
215
+ ----------
216
+ .. [1] Krishnamoorthy, K., & Thomson, J. (2004). A more powerful test for
217
+ comparing two Poisson means. Journal of Statistical Planning and
218
+ Inference, 119(1), 23-35.
219
+
220
+ .. [2] Przyborowski, J., & Wilenski, H. (1940). Homogeneity of results in
221
+ testing samples from Poisson series: With an application to testing
222
+ clover seed for dodder. Biometrika, 31(3/4), 313-323.
223
+
224
+ Examples
225
+ --------
226
+
227
+ Suppose that a gardener wishes to test the number of dodder (weed) seeds
228
+ in a sack of clover seeds that they buy from a seed company. It has
229
+ previously been established that the number of dodder seeds in clover
230
+ follows the Poisson distribution.
231
+
232
+ A 100 gram sample is drawn from the sack before being shipped to the
233
+ gardener. The sample is analyzed, and it is found to contain no dodder
234
+ seeds; that is, `k1` is 0. However, upon arrival, the gardener draws
235
+ another 100 gram sample from the sack. This time, three dodder seeds are
236
+ found in the sample; that is, `k2` is 3. The gardener would like to
237
+ know if the difference is significant and not due to chance. The
238
+ null hypothesis is that the difference between the two samples is merely
239
+ due to chance, or that :math:`\lambda_1 - \lambda_2 = \mathtt{diff}`
240
+ where :math:`\mathtt{diff} = 0`. The alternative hypothesis is that the
241
+ difference is not due to chance, or :math:`\lambda_1 - \lambda_2 \ne 0`.
242
+ The gardener selects a significance level of 5% to reject the null
243
+ hypothesis in favor of the alternative [2]_.
244
+
245
+ >>> import scipy.stats as stats
246
+ >>> res = stats.poisson_means_test(0, 100, 3, 100)
247
+ >>> res.statistic, res.pvalue
248
+ (-1.7320508075688772, 0.08837900929018157)
249
+
250
+ The p-value is .088, indicating a near 9% chance of observing a value of
251
+ the test statistic under the null hypothesis. This exceeds 5%, so the
252
+ gardener does not reject the null hypothesis as the difference cannot be
253
+ regarded as significant at this level.
254
+ """
255
+
256
+ _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative)
257
+
258
+ # "for a given k_1 and k_2, an estimate of \lambda_2 is given by" [1] (3.4)
259
+ lmbd_hat2 = ((k1 + k2) / (n1 + n2) - diff * n1 / (n1 + n2))
260
+
261
+ # "\hat{\lambda_{2k}} may be less than or equal to zero ... and in this
262
+ # case the null hypothesis cannot be rejected ... [and] it is not necessary
263
+ # to compute the p-value". [1] page 26 below eq. (3.6).
264
+ if lmbd_hat2 <= 0:
265
+ return _stats_py.SignificanceResult(0, 1)
266
+
267
+ # The unbiased variance estimate [1] (3.2)
268
+ var = k1 / (n1 ** 2) + k2 / (n2 ** 2)
269
+
270
+ # The _observed_ pivot statistic from the input. It follows the
271
+ # unnumbered equation following equation (3.3). This is used later in
272
+ # comparison with the computed pivot statistics in an indicator function.
273
+ t_k1k2 = (k1 / n1 - k2 / n2 - diff) / np.sqrt(var)
274
+
275
+ # Equation (3.5) of [1] is lengthy, so it is broken into several parts,
276
+ # beginning here. Note that the probability mass function of poisson is
277
+ # exp^(-\mu)*\mu^k/k!, so and this is called with shape \mu, here noted
278
+ # here as nlmbd_hat*. The strategy for evaluating the double summation in
279
+ # (3.5) is to create two arrays of the values of the two products inside
280
+ # the summation and then broadcast them together into a matrix, and then
281
+ # sum across the entire matrix.
282
+
283
+ # Compute constants (as seen in the first and second separated products in
284
+ # (3.5)). (This is the shape (\mu) parameter of the poisson distribution.)
285
+ nlmbd_hat1 = n1 * (lmbd_hat2 + diff)
286
+ nlmbd_hat2 = n2 * lmbd_hat2
287
+
288
+ # Determine summation bounds for tail ends of distribution rather than
289
+ # summing to infinity. `x1*` is for the outer sum and `x2*` is the inner
290
+ # sum.
291
+ x1_lb, x1_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat1)
292
+ x2_lb, x2_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat2)
293
+
294
+ # Construct arrays to function as the x_1 and x_2 counters on the summation
295
+ # in (3.5). `x1` is in columns and `x2` is in rows to allow for
296
+ # broadcasting.
297
+ x1 = np.arange(x1_lb, x1_ub + 1)
298
+ x2 = np.arange(x2_lb, x2_ub + 1)[:, None]
299
+
300
+ # These are the two products in equation (3.5) with `prob_x1` being the
301
+ # first (left side) and `prob_x2` being the second (right side). (To
302
+ # make as clear as possible: the 1st contains a "+ d" term, the 2nd does
303
+ # not.)
304
+ prob_x1 = distributions.poisson.pmf(x1, nlmbd_hat1)
305
+ prob_x2 = distributions.poisson.pmf(x2, nlmbd_hat2)
306
+
307
+ # compute constants for use in the "pivot statistic" per the
308
+ # unnumbered equation following (3.3).
309
+ lmbd_x1 = x1 / n1
310
+ lmbd_x2 = x2 / n2
311
+ lmbds_diff = lmbd_x1 - lmbd_x2 - diff
312
+ var_x1x2 = lmbd_x1 / n1 + lmbd_x2 / n2
313
+
314
+ # This is the 'pivot statistic' for use in the indicator of the summation
315
+ # (left side of "I[.]").
316
+ with np.errstate(invalid='ignore', divide='ignore'):
317
+ t_x1x2 = lmbds_diff / np.sqrt(var_x1x2)
318
+
319
+ # `[indicator]` implements the "I[.] ... the indicator function" per
320
+ # the paragraph following equation (3.5).
321
+ if alternative == 'two-sided':
322
+ indicator = np.abs(t_x1x2) >= np.abs(t_k1k2)
323
+ elif alternative == 'less':
324
+ indicator = t_x1x2 <= t_k1k2
325
+ else:
326
+ indicator = t_x1x2 >= t_k1k2
327
+
328
+ # Multiply all combinations of the products together, exclude terms
329
+ # based on the `indicator` and then sum. (3.5)
330
+ pvalue = np.sum((prob_x1 * prob_x2)[indicator])
331
+ return _stats_py.SignificanceResult(t_k1k2, pvalue)
332
+
333
+
334
+ def _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative):
335
+ # """check for valid types and values of input to `poisson_mean_test`."""
336
+ if k1 != int(k1) or k2 != int(k2):
337
+ raise TypeError('`k1` and `k2` must be integers.')
338
+
339
+ count_err = '`k1` and `k2` must be greater than or equal to 0.'
340
+ if k1 < 0 or k2 < 0:
341
+ raise ValueError(count_err)
342
+
343
+ if n1 <= 0 or n2 <= 0:
344
+ raise ValueError('`n1` and `n2` must be greater than 0.')
345
+
346
+ if diff < 0:
347
+ raise ValueError('diff must be greater than or equal to 0.')
348
+
349
+ alternatives = {'two-sided', 'less', 'greater'}
350
+ if alternative.lower() not in alternatives:
351
+ raise ValueError(f"Alternative must be one of '{alternatives}'.")
352
+
353
+
354
+ class CramerVonMisesResult:
355
+ def __init__(self, statistic, pvalue):
356
+ self.statistic = statistic
357
+ self.pvalue = pvalue
358
+
359
+ def __repr__(self):
360
+ return (f"{self.__class__.__name__}(statistic={self.statistic}, "
361
+ f"pvalue={self.pvalue})")
362
+
363
+
364
+ def _psi1_mod(x):
365
+ """
366
+ psi1 is defined in equation 1.10 in Csörgő, S. and Faraway, J. (1996).
367
+ This implements a modified version by excluding the term V(x) / 12
368
+ (here: _cdf_cvm_inf(x) / 12) to avoid evaluating _cdf_cvm_inf(x)
369
+ twice in _cdf_cvm.
370
+
371
+ Implementation based on MAPLE code of Julian Faraway and R code of the
372
+ function pCvM in the package goftest (v1.1.1), permission granted
373
+ by Adrian Baddeley. Main difference in the implementation: the code
374
+ here keeps adding terms of the series until the terms are small enough.
375
+ """
376
+
377
+ def _ed2(y):
378
+ z = y**2 / 4
379
+ b = kv(1/4, z) + kv(3/4, z)
380
+ return np.exp(-z) * (y/2)**(3/2) * b / np.sqrt(np.pi)
381
+
382
+ def _ed3(y):
383
+ z = y**2 / 4
384
+ c = np.exp(-z) / np.sqrt(np.pi)
385
+ return c * (y/2)**(5/2) * (2*kv(1/4, z) + 3*kv(3/4, z) - kv(5/4, z))
386
+
387
+ def _Ak(k, x):
388
+ m = 2*k + 1
389
+ sx = 2 * np.sqrt(x)
390
+ y1 = x**(3/4)
391
+ y2 = x**(5/4)
392
+
393
+ e1 = m * gamma(k + 1/2) * _ed2((4 * k + 3)/sx) / (9 * y1)
394
+ e2 = gamma(k + 1/2) * _ed3((4 * k + 1) / sx) / (72 * y2)
395
+ e3 = 2 * (m + 2) * gamma(k + 3/2) * _ed3((4 * k + 5) / sx) / (12 * y2)
396
+ e4 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 1) / sx) / (144 * y1)
397
+ e5 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 5) / sx) / (144 * y1)
398
+
399
+ return e1 + e2 + e3 + e4 + e5
400
+
401
+ x = np.asarray(x)
402
+ tot = np.zeros_like(x, dtype='float')
403
+ cond = np.ones_like(x, dtype='bool')
404
+ k = 0
405
+ while np.any(cond):
406
+ z = -_Ak(k, x[cond]) / (np.pi * gamma(k + 1))
407
+ tot[cond] = tot[cond] + z
408
+ cond[cond] = np.abs(z) >= 1e-7
409
+ k += 1
410
+
411
+ return tot
412
+
413
+
414
+ def _cdf_cvm_inf(x):
415
+ """
416
+ Calculate the cdf of the Cramér-von Mises statistic (infinite sample size).
417
+
418
+ See equation 1.2 in Csörgő, S. and Faraway, J. (1996).
419
+
420
+ Implementation based on MAPLE code of Julian Faraway and R code of the
421
+ function pCvM in the package goftest (v1.1.1), permission granted
422
+ by Adrian Baddeley. Main difference in the implementation: the code
423
+ here keeps adding terms of the series until the terms are small enough.
424
+
425
+ The function is not expected to be accurate for large values of x, say
426
+ x > 4, when the cdf is very close to 1.
427
+ """
428
+ x = np.asarray(x)
429
+
430
+ def term(x, k):
431
+ # this expression can be found in [2], second line of (1.3)
432
+ u = np.exp(gammaln(k + 0.5) - gammaln(k+1)) / (np.pi**1.5 * np.sqrt(x))
433
+ y = 4*k + 1
434
+ q = y**2 / (16*x)
435
+ b = kv(0.25, q)
436
+ return u * np.sqrt(y) * np.exp(-q) * b
437
+
438
+ tot = np.zeros_like(x, dtype='float')
439
+ cond = np.ones_like(x, dtype='bool')
440
+ k = 0
441
+ while np.any(cond):
442
+ z = term(x[cond], k)
443
+ tot[cond] = tot[cond] + z
444
+ cond[cond] = np.abs(z) >= 1e-7
445
+ k += 1
446
+
447
+ return tot
448
+
449
+
450
+ def _cdf_cvm(x, n=None):
451
+ """
452
+ Calculate the cdf of the Cramér-von Mises statistic for a finite sample
453
+ size n. If N is None, use the asymptotic cdf (n=inf).
454
+
455
+ See equation 1.8 in Csörgő, S. and Faraway, J. (1996) for finite samples,
456
+ 1.2 for the asymptotic cdf.
457
+
458
+ The function is not expected to be accurate for large values of x, say
459
+ x > 2, when the cdf is very close to 1 and it might return values > 1
460
+ in that case, e.g. _cdf_cvm(2.0, 12) = 1.0000027556716846. Moreover, it
461
+ is not accurate for small values of n, especially close to the bounds of
462
+ the distribution's domain, [1/(12*n), n/3], where the value jumps to 0
463
+ and 1, respectively. These are limitations of the approximation by Csörgő
464
+ and Faraway (1996) implemented in this function.
465
+ """
466
+ x = np.asarray(x)
467
+ if n is None:
468
+ y = _cdf_cvm_inf(x)
469
+ else:
470
+ # support of the test statistic is [1/(12*n), n/3], see 1.1 in [2]
471
+ y = np.zeros_like(x, dtype='float')
472
+ sup = (1./(12*n) < x) & (x < n/3.)
473
+ # note: _psi1_mod does not include the term _cdf_cvm_inf(x) / 12
474
+ # therefore, we need to add it here
475
+ y[sup] = _cdf_cvm_inf(x[sup]) * (1 + 1./(12*n)) + _psi1_mod(x[sup]) / n
476
+ y[x >= n/3] = 1
477
+
478
+ if y.ndim == 0:
479
+ return y[()]
480
+ return y
481
+
482
+
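Reading the finite-sample branch above together with the `_psi1_mod` docstring, the approximation being evaluated can be written as follows (a reading aid, not part of the module):

```latex
V_n(x) \;\approx\; V(x)\Bigl(1 + \tfrac{1}{12n}\Bigr) + \frac{\psi_1(x) - V(x)/12}{n}
       \;=\; V(x) + \frac{\psi_1(x)}{n}
```

Here `V` is the asymptotic cdf computed by `_cdf_cvm_inf`, and the second summand of the middle expression is exactly `_psi1_mod(x) / n`, so the `V(x) / 12` piece omitted from `_psi1_mod` is folded into the `(1 + 1/(12*n))` factor.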
483
+ def _cvm_result_to_tuple(res):
484
+ return res.statistic, res.pvalue
485
+
486
+
487
+ @_axis_nan_policy_factory(CramerVonMisesResult, n_samples=1, too_small=1,
488
+ result_to_tuple=_cvm_result_to_tuple)
489
+ def cramervonmises(rvs, cdf, args=()):
490
+ """Perform the one-sample Cramér-von Mises test for goodness of fit.
491
+
492
+ This performs a test of the goodness of fit of a cumulative distribution
493
+ function (cdf) :math:`F` compared to the empirical distribution function
494
+ :math:`F_n` of observed random variates :math:`X_1, ..., X_n` that are
495
+ assumed to be independent and identically distributed ([1]_).
496
+ The null hypothesis is that the :math:`X_i` have cumulative distribution
497
+ :math:`F`.
498
+
499
+ Parameters
500
+ ----------
501
+ rvs : array_like
502
+ A 1-D array of observed values of the random variables :math:`X_i`.
503
+ cdf : str or callable
504
+ The cumulative distribution function :math:`F` to test the
505
+ observations against. If a string, it should be the name of a
506
+ distribution in `scipy.stats`. If a callable, that callable is used
507
+ to calculate the cdf: ``cdf(x, *args) -> float``.
508
+ args : tuple, optional
509
+ Distribution parameters. These are assumed to be known; see Notes.
510
+
511
+ Returns
512
+ -------
513
+ res : object with attributes
514
+ statistic : float
515
+ Cramér-von Mises statistic.
516
+ pvalue : float
517
+ The p-value.
518
+
519
+ See Also
520
+ --------
521
+ kstest, cramervonmises_2samp
522
+
523
+ Notes
524
+ -----
525
+ .. versionadded:: 1.6.0
526
+
527
+ The p-value relies on the approximation given by equation 1.8 in [2]_.
528
+ It is important to keep in mind that the p-value is only accurate if
529
+ one tests a simple hypothesis, i.e. the parameters of the reference
530
+ distribution are known. If the parameters are estimated from the data
531
+ (composite hypothesis), the computed p-value is not reliable.
532
+
533
+ References
534
+ ----------
535
+ .. [1] Cramér-von Mises criterion, Wikipedia,
536
+ https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion
537
+ .. [2] Csörgő, S. and Faraway, J. (1996). The Exact and Asymptotic
538
+ Distribution of Cramér-von Mises Statistics. Journal of the
539
+ Royal Statistical Society, pp. 221-234.
540
+
541
+ Examples
542
+ --------
543
+
544
+ Suppose we wish to test whether data generated by ``scipy.stats.norm.rvs``
545
+ were, in fact, drawn from the standard normal distribution. We choose a
546
+ significance level of ``alpha=0.05``.
547
+
548
+ >>> import numpy as np
549
+ >>> from scipy import stats
550
+ >>> rng = np.random.default_rng(165417232101553420507139617764912913465)
551
+ >>> x = stats.norm.rvs(size=500, random_state=rng)
552
+ >>> res = stats.cramervonmises(x, 'norm')
553
+ >>> res.statistic, res.pvalue
554
+ (0.1072085112565724, 0.5508482238203407)
555
+
556
+ The p-value exceeds our chosen significance level, so we do not
557
+ reject the null hypothesis that the observed sample is drawn from the
558
+ standard normal distribution.
559
+
560
+ Now suppose we wish to check whether the same sample, shifted by 2.1, is
561
+ consistent with being drawn from a normal distribution with a mean of 2.
562
+
563
+ >>> y = x + 2.1
564
+ >>> res = stats.cramervonmises(y, 'norm', args=(2,))
565
+ >>> res.statistic, res.pvalue
566
+ (0.8364446265294695, 0.00596286797008283)
567
+
568
+ Here we have used the `args` keyword to specify the mean (``loc``)
569
+ of the normal distribution to test the data against. This is equivalent
570
+ to the following, in which we create a frozen normal distribution with
571
+ mean 2, then pass its ``cdf`` method as an argument.
572
+
573
+ >>> frozen_dist = stats.norm(loc=2)
574
+ >>> res = stats.cramervonmises(y, frozen_dist.cdf)
575
+ >>> res.statistic, res.pvalue
576
+ (0.8364446265294695, 0.00596286797008283)
577
+
578
+ In either case, we would reject the null hypothesis that the observed
579
+ sample is drawn from a normal distribution with a mean of 2 (and default
580
+ variance of 1) because the p-value is less than our chosen
581
+ significance level.
582
+
583
+ """
584
+ if isinstance(cdf, str):
585
+ cdf = getattr(distributions, cdf).cdf
586
+
587
+ vals = np.sort(np.asarray(rvs))
588
+
589
+ if vals.size <= 1:
590
+ raise ValueError('The sample must contain at least two observations.')
591
+
592
+ n = len(vals)
593
+ cdfvals = cdf(vals, *args)
594
+
595
+ u = (2*np.arange(1, n+1) - 1)/(2*n)
596
+ w = 1/(12*n) + np.sum((u - cdfvals)**2)
597
+
598
+ # avoid small negative values that can occur due to the approximation
599
+ p = max(0, 1. - _cdf_cvm(w, n))
600
+
601
+ return CramerVonMisesResult(statistic=w, pvalue=p)
602
+
603
+
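A minimal sketch, using only public `scipy.stats` APIs, that recomputes the statistic from the formula in the function body above and checks it against `cramervonmises`:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(12345)
x = stats.norm.rvs(size=50, random_state=rng)

# statistic as computed above: w = 1/(12n) + sum(((2i-1)/(2n) - F(x_(i)))**2)
vals = np.sort(x)
n = len(vals)
cdfvals = stats.norm.cdf(vals)
u = (2 * np.arange(1, n + 1) - 1) / (2 * n)
w = 1 / (12 * n) + np.sum((u - cdfvals) ** 2)

res = stats.cramervonmises(x, 'norm')
assert np.isclose(res.statistic, w)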
604
+ def _get_wilcoxon_distr(n):
605
+ """
606
+ Probability distribution of the Wilcoxon signed-rank statistic r_plus (sum
607
+ of ranks of positive differences).
608
+ Returns an array with the probabilities of all the possible ranks
609
+ r = 0, ..., n*(n+1)/2
610
+ """
611
+ c = np.ones(1, dtype=np.float64)
612
+ for k in range(1, n + 1):
613
+ prev_c = c
614
+ c = np.zeros(k * (k + 1) // 2 + 1, dtype=np.float64)
615
+ m = len(prev_c)
616
+ c[:m] = prev_c * 0.5
617
+ c[-m:] += prev_c * 0.5
618
+ return c
619
+
620
+
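A minimal sketch of how the returned array could be turned into an exact one-sided p-value, assuming this private helper is importable as `scipy.stats._hypotests._get_wilcoxon_distr`; the observed value `r_obs` is hypothetical:

```python
import numpy as np
from scipy.stats._hypotests import _get_wilcoxon_distr

n = 8                                   # number of paired differences
pmf = _get_wilcoxon_distr(n)            # P(r_plus = r) for r = 0, ..., n*(n+1)/2
assert np.isclose(pmf.sum(), 1.0)

r_obs = 30                              # hypothetical observed value of r_plus
p_greater = pmf[r_obs:].sum()           # exact P(r_plus >= r_obs) under H0
```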
621
+ def _get_wilcoxon_distr2(n):
622
+ """
623
+ Probability distribution of the Wilcoxon signed-rank statistic r_plus (sum
624
+ of ranks of positive differences).
625
+ Returns an array with the probabilities of all the possible ranks
626
+ r = 0, ..., n*(n+1)/2
627
+ This is a slower reference implementation.
628
+ References
629
+ ----------
630
+ .. [1] Harris T, Hardin JW. Exact Wilcoxon Signed-Rank and Wilcoxon
631
+ Mann-Whitney Ranksum Tests. The Stata Journal. 2013;13(2):337-343.
632
+ """
633
+ ai = np.arange(1, n+1)[:, None]
634
+ t = n*(n+1)/2
635
+ q = 2*t
636
+ j = np.arange(q)
637
+ theta = 2*np.pi/q*j
638
+ phi_sp = np.prod(np.cos(theta*ai), axis=0)
639
+ phi_s = np.exp(1j*theta*t) * phi_sp
640
+ p = np.real(ifft(phi_s))
641
+ res = np.zeros(int(t)+1)
642
+ res[:-1:] = p[::2]
643
+ res[0] /= 2
644
+ res[-1] = res[0]
645
+ return res
646
+
647
+
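Since the two implementations compute the same distribution by different routes (convolution vs. characteristic function and inverse FFT), one would expect them to agree to floating-point tolerance; a quick consistency check, assuming both private helpers are importable from `scipy.stats._hypotests`:

```python
import numpy as np
from scipy.stats._hypotests import _get_wilcoxon_distr, _get_wilcoxon_distr2

for n in (3, 5, 10):
    assert np.allclose(_get_wilcoxon_distr(n), _get_wilcoxon_distr2(n))
```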
648
+ def _tau_b(A):
649
+ """Calculate Kendall's tau-b and p-value from contingency table."""
650
+ # See [2] 2.2 and 4.2
651
+
652
+ # contingency table must be truly 2D
653
+ if A.shape[0] == 1 or A.shape[1] == 1:
654
+ return np.nan, np.nan
655
+
656
+ NA = A.sum()
657
+ PA = _P(A)
658
+ QA = _Q(A)
659
+ Sri2 = (A.sum(axis=1)**2).sum()
660
+ Scj2 = (A.sum(axis=0)**2).sum()
661
+ denominator = (NA**2 - Sri2)*(NA**2 - Scj2)
662
+
663
+ tau = (PA-QA)/(denominator)**0.5
664
+
665
+ numerator = 4*(_a_ij_Aij_Dij2(A) - (PA - QA)**2 / NA)
666
+ s02_tau_b = numerator/denominator
667
+ if s02_tau_b == 0: # Avoid divide by zero
668
+ return tau, 0
669
+ Z = tau/s02_tau_b**0.5
670
+ p = 2*norm.sf(abs(Z)) # 2-sided p-value
671
+
672
+ return tau, p
673
+
674
+
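As a sanity check on `_tau_b` (which relies on `_P`, `_Q`, and `_a_ij_Aij_Dij2` defined earlier in this module), the same tau-b value should be obtainable by expanding the table into raw ordinal pairs and calling the public `kendalltau`, whose default variant is tau-b; the table below is made up for illustration:

```python
import numpy as np
from scipy import stats
from scipy.stats._hypotests import _tau_b   # assumes this module's import path

table = np.array([[10, 5], [4, 12]])

# expand the contingency table into raw ordinal pairs
x, y = [], []
for i, j in np.ndindex(table.shape):
    x += [i] * table[i, j]
    y += [j] * table[i, j]
res = stats.kendalltau(x, y)                # variant='b' is the default

tau, p = _tau_b(table.astype(float))
# tau is expected to match res.statistic; the p-values can differ because the
# two functions use different variance estimates under their null hypotheses
```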
675
+ def _somers_d(A, alternative='two-sided'):
676
+ """Calculate Somers' D and p-value from contingency table."""
677
+ # See [3] page 1740
678
+
679
+ # contingency table must be truly 2D
680
+ if A.shape[0] <= 1 or A.shape[1] <= 1:
681
+ return np.nan, np.nan
682
+
683
+ NA = A.sum()
684
+ NA2 = NA**2
685
+ PA = _P(A)
686
+ QA = _Q(A)
687
+ Sri2 = (A.sum(axis=1)**2).sum()
688
+
689
+ d = (PA - QA)/(NA2 - Sri2)
690
+
691
+ S = _a_ij_Aij_Dij2(A) - (PA-QA)**2/NA
692
+
693
+ with np.errstate(divide='ignore'):
694
+ Z = (PA - QA)/(4*(S))**0.5
695
+
696
+ p = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
697
+
698
+ return d, p
699
+
700
+
701
+ @dataclass
702
+ class SomersDResult:
703
+ statistic: float
704
+ pvalue: float
705
+ table: np.ndarray
706
+
707
+
708
+ def somersd(x, y=None, alternative='two-sided'):
709
+ r"""Calculates Somers' D, an asymmetric measure of ordinal association.
710
+
711
+ Like Kendall's :math:`\tau`, Somers' :math:`D` is a measure of the
712
+ correspondence between two rankings. Both statistics consider the
713
+ difference between the number of concordant and discordant pairs in two
714
+ rankings :math:`X` and :math:`Y`, and both are normalized such that values
715
+ close to 1 indicate strong agreement and values close to -1 indicate
716
+ strong disagreement. They differ in how they are normalized. To show the
717
+ relationship, Somers' :math:`D` can be defined in terms of Kendall's
718
+ :math:`\tau_a`:
719
+
720
+ .. math::
721
+ D(Y|X) = \frac{\tau_a(X, Y)}{\tau_a(X, X)}
722
+
723
+ Suppose the first ranking :math:`X` has :math:`r` distinct ranks and the
724
+ second ranking :math:`Y` has :math:`s` distinct ranks. These two lists of
725
+ :math:`n` rankings can also be viewed as an :math:`r \times s` contingency
726
+ table in which element :math:`i, j` is the number of rank pairs with rank
727
+ :math:`i` in ranking :math:`X` and rank :math:`j` in ranking :math:`Y`.
728
+ Accordingly, `somersd` also allows the input data to be supplied as a
729
+ single, 2D contingency table instead of as two separate, 1D rankings.
730
+
731
+ Note that the definition of Somers' :math:`D` is asymmetric: in general,
732
+ :math:`D(Y|X) \neq D(X|Y)`. ``somersd(x, y)`` calculates Somers'
733
+ :math:`D(Y|X)`: the "row" variable :math:`X` is treated as an independent
734
+ variable, and the "column" variable :math:`Y` is dependent. For Somers'
735
+ :math:`D(X|Y)`, swap the input lists or transpose the input table.
736
+
737
+ Parameters
738
+ ----------
739
+ x : array_like
740
+ 1D array of rankings, treated as the (row) independent variable.
741
+ Alternatively, a 2D contingency table.
742
+ y : array_like, optional
743
+ If `x` is a 1D array of rankings, `y` is a 1D array of rankings of the
744
+ same length, treated as the (column) dependent variable.
745
+ If `x` is 2D, `y` is ignored.
746
+ alternative : {'two-sided', 'less', 'greater'}, optional
747
+ Defines the alternative hypothesis. Default is 'two-sided'.
748
+ The following options are available:
749
+ * 'two-sided': the rank correlation is nonzero
750
+ * 'less': the rank correlation is negative (less than zero)
751
+ * 'greater': the rank correlation is positive (greater than zero)
752
+
753
+ Returns
754
+ -------
755
+ res : SomersDResult
756
+ A `SomersDResult` object with the following fields:
757
+
758
+ statistic : float
759
+ The Somers' :math:`D` statistic.
760
+ pvalue : float
761
+ The p-value for a hypothesis test whose null
762
+ hypothesis is an absence of association, :math:`D=0`.
763
+ See notes for more information.
764
+ table : 2D array
765
+ The contingency table formed from rankings `x` and `y` (or the
766
+ provided contingency table, if `x` is a 2D array)
767
+
768
+ See Also
769
+ --------
770
+ kendalltau : Calculates Kendall's tau, another correlation measure.
771
+ weightedtau : Computes a weighted version of Kendall's tau.
772
+ spearmanr : Calculates a Spearman rank-order correlation coefficient.
773
+ pearsonr : Calculates a Pearson correlation coefficient.
774
+
775
+ Notes
776
+ -----
777
+ This function follows the contingency table approach of [2]_ and
778
+ [3]_. *p*-values are computed based on an asymptotic approximation of
779
+ the test statistic distribution under the null hypothesis :math:`D=0`.
780
+
781
+ Theoretically, hypothesis tests based on Kendall's :math:`\tau` and Somers'
782
+ :math:`D` should be identical.
783
+ However, the *p*-values returned by `kendalltau` are based
784
+ on the null hypothesis of *independence* between :math:`X` and :math:`Y`
785
+ (i.e. the population from which pairs in :math:`X` and :math:`Y` are
786
+ sampled contains equal numbers of all possible pairs), which is more
787
+ specific than the null hypothesis :math:`D=0` used here. If the null
788
+ hypothesis of independence is desired, it is acceptable to use the
789
+ *p*-value returned by `kendalltau` with the statistic returned by
790
+ `somersd` and vice versa. For more information, see [2]_.
791
+
792
+ Contingency tables are formatted according to the convention used by
793
+ SAS and R: the first ranking supplied (``x``) is the "row" variable, and
794
+ the second ranking supplied (``y``) is the "column" variable. This is
795
+ opposite the convention of Somers' original paper [1]_.
796
+
797
+ References
798
+ ----------
799
+ .. [1] Robert H. Somers, "A New Asymmetric Measure of Association for
800
+ Ordinal Variables", *American Sociological Review*, Vol. 27, No. 6,
801
+ pp. 799--811, 1962.
802
+
803
+ .. [2] Morton B. Brown and Jacqueline K. Benedetti, "Sampling Behavior of
804
+ Tests for Correlation in Two-Way Contingency Tables", *Journal of
805
+ the American Statistical Association* Vol. 72, No. 358, pp.
806
+ 309--315, 1977.
807
+
808
+ .. [3] SAS Institute, Inc., "The FREQ Procedure (Book Excerpt)",
809
+ *SAS/STAT 9.2 User's Guide, Second Edition*, SAS Publishing, 2009.
810
+
811
+ .. [4] Laerd Statistics, "Somers' d using SPSS Statistics", *SPSS
812
+ Statistics Tutorials and Statistical Guides*,
813
+ https://statistics.laerd.com/spss-tutorials/somers-d-using-spss-statistics.php,
814
+ Accessed July 31, 2020.
815
+
816
+ Examples
817
+ --------
818
+ We calculate Somers' D for the example given in [4]_, in which a hotel
819
+ chain owner seeks to determine the association between hotel room
820
+ cleanliness and customer satisfaction. The independent variable, hotel
821
+ room cleanliness, is ranked on an ordinal scale: "below average (1)",
822
+ "average (2)", or "above average (3)". The dependent variable, customer
823
+ satisfaction, is ranked on a second scale: "very dissatisfied (1)",
824
+ "moderately dissatisfied (2)", "neither dissatisfied nor satisfied (3)",
825
+ "moderately satisfied (4)", or "very satisfied (5)". 189 customers
826
+ respond to the survey, and the results are cast into a contingency table
827
+ with the hotel room cleanliness as the "row" variable and customer
828
+ satisfaction as the "column" variable.
829
+
830
+ +-----+-----+-----+-----+-----+-----+
831
+ | | (1) | (2) | (3) | (4) | (5) |
832
+ +=====+=====+=====+=====+=====+=====+
833
+ | (1) | 27 | 25 | 14 | 7 | 0 |
834
+ +-----+-----+-----+-----+-----+-----+
835
+ | (2) | 7 | 14 | 18 | 35 | 12 |
836
+ +-----+-----+-----+-----+-----+-----+
837
+ | (3) | 1 | 3 | 2 | 7 | 17 |
838
+ +-----+-----+-----+-----+-----+-----+
839
+
840
+ For example, 27 customers assigned their room a cleanliness ranking of
841
+ "below average (1)" and a corresponding satisfaction of "very
842
+ dissatisfied (1)". We perform the analysis as follows.
843
+
844
+ >>> from scipy.stats import somersd
845
+ >>> table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]]
846
+ >>> res = somersd(table)
847
+ >>> res.statistic
848
+ 0.6032766111513396
849
+ >>> res.pvalue
850
+ 1.0007091191074533e-27
851
+
852
+ The value of the Somers' D statistic is approximately 0.6, indicating
853
+ a positive correlation between room cleanliness and customer satisfaction
854
+ in the sample.
855
+ The *p*-value is very small, indicating a very small probability of
856
+ observing such an extreme value of the statistic under the null
857
+ hypothesis that the statistic of the entire population (from which
858
+ our sample of 189 customers is drawn) is zero. This supports the
859
+ alternative hypothesis that the true value of Somers' D for the population
860
+ is nonzero.
861
+
862
+ """
863
+ x, y = np.array(x), np.array(y)
864
+ if x.ndim == 1:
865
+ if x.size != y.size:
866
+ raise ValueError("Rankings must be of equal length.")
867
+ table = scipy.stats.contingency.crosstab(x, y)[1]
868
+ elif x.ndim == 2:
869
+ if np.any(x < 0):
870
+ raise ValueError("All elements of the contingency table must be "
871
+ "non-negative.")
872
+ if np.any(x != x.astype(int)):
873
+ raise ValueError("All elements of the contingency table must be "
874
+ "integer.")
875
+ if x.nonzero()[0].size < 2:
876
+ raise ValueError("At least two elements of the contingency table "
877
+ "must be nonzero.")
878
+ table = x
879
+ else:
880
+ raise ValueError("x must be either a 1D or 2D array")
881
+ # The table type is converted to a float to avoid an integer overflow
882
+ d, p = _somers_d(table.astype(float), alternative)
883
+
884
+ # add alias for consistency with other correlation functions
885
+ res = SomersDResult(d, p, table)
886
+ res.correlation = d
887
+ return res
888
+
889
+
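A short usage note on the asymmetry discussed in the docstring: transposing the contingency table switches the roles of the row and column variables, i.e. from :math:`D(Y|X)` to :math:`D(X|Y)`.

```python
import numpy as np
from scipy import stats

table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]]
d_yx = stats.somersd(table).statistic                 # D(Y|X), as documented
d_xy = stats.somersd(np.transpose(table)).statistic   # D(X|Y)
# the two values generally differ, reflecting the asymmetry of Somers' D
```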
890
+ # This could be combined with `_all_partitions` in `_resampling.py`
891
+ def _all_partitions(nx, ny):
892
+ """
893
+ Partition a set of indices into two fixed-length sets in all possible ways
894
+
895
+ Partition a set of indices 0 ... nx + ny - 1 into two sets of length nx and
896
+ ny in all possible ways (ignoring order of elements).
897
+ """
898
+ z = np.arange(nx+ny)
899
+ for c in combinations(z, nx):
900
+ x = np.array(c)
901
+ mask = np.ones(nx+ny, bool)
902
+ mask[x] = False
903
+ y = z[mask]
904
+ yield x, y
905
+
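A tiny illustration of the generator (a private helper, shown only to make the enumeration concrete; assumes it is importable from `scipy.stats._hypotests`):

```python
from scipy.stats._hypotests import _all_partitions

# split indices {0, 1, 2} into a set of size 2 and a set of size 1
for x, y in _all_partitions(2, 1):
    print(x, y)
# [0 1] [2]
# [0 2] [1]
# [1 2] [0]
```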
906
+
907
+ def _compute_log_combinations(n):
908
+ """Compute all log combination of C(n, k)."""
909
+ gammaln_arr = gammaln(np.arange(n + 1) + 1)
910
+ return gammaln(n + 1) - gammaln_arr - gammaln_arr[::-1]
911
+
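A quick check that the returned values are the logs of the binomial coefficients, assuming the helper is importable from `scipy.stats._hypotests`:

```python
import numpy as np
from scipy.special import comb
from scipy.stats._hypotests import _compute_log_combinations

n = 10
log_c = _compute_log_combinations(n)           # log C(n, k) for k = 0, ..., n
assert np.allclose(np.exp(log_c), comb(n, np.arange(n + 1)))
```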
912
+
913
+ @dataclass
914
+ class BarnardExactResult:
915
+ statistic: float
916
+ pvalue: float
917
+
918
+
919
+ def barnard_exact(table, alternative="two-sided", pooled=True, n=32):
920
+ r"""Perform a Barnard exact test on a 2x2 contingency table.
921
+
922
+ Parameters
923
+ ----------
924
+ table : array_like of ints
925
+ A 2x2 contingency table. Elements should be non-negative integers.
926
+
927
+ alternative : {'two-sided', 'less', 'greater'}, optional
928
+ Defines the null and alternative hypotheses. Default is 'two-sided'.
929
+ Please see explanations in the Notes section below.
930
+
931
+ pooled : bool, optional
932
+ Whether to compute score statistic with pooled variance (as in
933
+ Student's t-test, for example) or unpooled variance (as in Welch's
934
+ t-test). Default is ``True``.
935
+
936
+ n : int, optional
937
+ Number of sampling points used in the construction of the sampling
938
+ method. Note that this argument will automatically be converted to
939
+ the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
940
+ select sample points. Default is 32. Must be positive. In most cases,
941
+ 32 points is enough to reach good precision. More points comes at
942
+ performance cost.
943
+
944
+ Returns
945
+ -------
946
+ ber : BarnardExactResult
947
+ A result object with the following attributes.
948
+
949
+ statistic : float
950
+ The Wald statistic with pooled or unpooled variance, depending
951
+ on the user choice of `pooled`.
952
+
953
+ pvalue : float
954
+ P-value, the probability of obtaining a distribution at least as
955
+ extreme as the one that was actually observed, assuming that the
956
+ null hypothesis is true.
957
+
958
+ See Also
959
+ --------
960
+ chi2_contingency : Chi-square test of independence of variables in a
961
+ contingency table.
962
+ fisher_exact : Fisher exact test on a 2x2 contingency table.
963
+ boschloo_exact : Boschloo's exact test on a 2x2 contingency table,
964
+ which is a uniformly more powerful alternative to Fisher's exact test.
965
+
966
+ Notes
967
+ -----
968
+ Barnard's test is an exact test used in the analysis of contingency
969
+ tables. It examines the association of two categorical variables, and
970
+ is a more powerful alternative than Fisher's exact test
971
+ for 2x2 contingency tables.
972
+
973
+ Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
974
+ where each column stores the binomial experiment, as in the example
975
+ below. Let's also define :math:`p_1, p_2` the theoretical binomial
976
+ probabilities for :math:`x_{11}` and :math:`x_{12}`. When using
977
+ Barnard exact test, we can assert three different null hypotheses :
978
+
979
+ - :math:`H_0 : p_1 \geq p_2` versus :math:`H_1 : p_1 < p_2`,
980
+ with `alternative` = "less"
981
+
982
+ - :math:`H_0 : p_1 \leq p_2` versus :math:`H_1 : p_1 > p_2`,
983
+ with `alternative` = "greater"
984
+
985
+ - :math:`H_0 : p_1 = p_2` versus :math:`H_1 : p_1 \neq p_2`,
986
+ with `alternative` = "two-sided" (default one)
987
+
988
+ In order to compute Barnard's exact test, we are using the Wald
989
+ statistic [3]_ with pooled or unpooled variance.
990
+ Under the default assumption that both variances are equal
991
+ (``pooled = True``), the statistic is computed as:
992
+
993
+ .. math::
994
+
995
+ T(X) = \frac{
996
+ \hat{p}_1 - \hat{p}_2
997
+ }{
998
+ \sqrt{
999
+ \hat{p}(1 - \hat{p})
1000
+ (\frac{1}{c_1} +
1001
+ \frac{1}{c_2})
1002
+ }
1003
+ }
1004
+
1005
+ with :math:`\hat{p}_1, \hat{p}_2` and :math:`\hat{p}` the estimator of
1006
+ :math:`p_1, p_2` and :math:`p`, the latter being the combined probability,
1007
+ given the assumption that :math:`p_1 = p_2`.
1008
+
1009
+ If this assumption is invalid (``pooled = False``), the statistic is:
1010
+
1011
+ .. math::
1012
+
1013
+ T(X) = \frac{
1014
+ \hat{p}_1 - \hat{p}_2
1015
+ }{
1016
+ \sqrt{
1017
+ \frac{\hat{p}_1 (1 - \hat{p}_1)}{c_1} +
1018
+ \frac{\hat{p}_2 (1 - \hat{p}_2)}{c_2}
1019
+ }
1020
+ }
1021
+
1022
+ The p-value is then computed as:
1023
+
1024
+ .. math::
1025
+
1026
+ \sum
1027
+ \binom{c_1}{x_{11}}
1028
+ \binom{c_2}{x_{12}}
1029
+ \pi^{x_{11} + x_{12}}
1030
+ (1 - \pi)^{t - x_{11} - x_{12}}
1031
+
1032
+ where the sum is over all 2x2 contingency tables :math:`X` such that:
1033
+ * :math:`T(X) \leq T(X_0)` when `alternative` = "less",
1034
+ * :math:`T(X) \geq T(X_0)` when `alternative` = "greater", or
1035
+ * :math:`T(X) \geq |T(X_0)|` when `alternative` = "two-sided".
1036
+ Above, :math:`c_1, c_2` are the sums of columns 1 and 2,
1037
+ and :math:`t` is the total (the sum of all four elements of the table).
1038
+
1039
+ The returned p-value is the maximum p-value taken over the nuisance
1040
+ parameter :math:`\pi`, where :math:`0 \leq \pi \leq 1`.
1041
+
1042
+ This function's complexity is :math:`O(n c_1 c_2)`, where `n` is the
1043
+ number of sample points.
1044
+
1045
+ References
1046
+ ----------
1047
+ .. [1] Barnard, G. A. "Significance Tests for 2x2 Tables". *Biometrika*.
1048
+ 34.1/2 (1947): 123-138. :doi:`dpgkg3`
1049
+
1050
+ .. [2] Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus
1051
+ unconditional exact tests for comparing two binomials."
1052
+ *Cytel Software Corporation* 675 (2003): 1-5.
1053
+
1054
+ .. [3] "Wald Test". *Wikipedia*. https://en.wikipedia.org/wiki/Wald_test
1055
+
1056
+ Examples
1057
+ --------
1058
+ An example use of Barnard's test is presented in [2]_.
1059
+
1060
+ Consider the following example of a vaccine efficacy study
1061
+ (Chan, 1998). In a randomized clinical trial of 30 subjects, 15 were
1062
+ inoculated with a recombinant DNA influenza vaccine and the other 15 were
1063
+ inoculated with a placebo. Twelve of the 15 subjects in the placebo
1064
+ group (80%) eventually became infected with influenza whereas for the
1065
+ vaccine group, only 7 of the 15 subjects (47%) became infected. The
1066
+ data are tabulated as a 2 x 2 table::
1067
+
1068
+ Vaccine Placebo
1069
+ Yes 7 12
1070
+ No 8 3
1071
+
1072
+ When working with statistical hypothesis testing, we usually use a
1073
+ threshold probability or significance level upon which we decide
1074
+ to reject the null hypothesis :math:`H_0`. Suppose we choose the common
1075
+ significance level of 5%.
1076
+
1077
+ Our alternative hypothesis is that the vaccine will lower the chance of
1078
+ becoming infected with the virus; that is, the probability :math:`p_1` of
1079
+ catching the virus with the vaccine will be *less than* the probability
1080
+ :math:`p_2` of catching the virus without the vaccine. Therefore, we call
1081
+ `barnard_exact` with the ``alternative="less"`` option:
1082
+
1083
+ >>> import scipy.stats as stats
1084
+ >>> res = stats.barnard_exact([[7, 12], [8, 3]], alternative="less")
1085
+ >>> res.statistic
1086
+ -1.894...
1087
+ >>> res.pvalue
1088
+ 0.03407...
1089
+
1090
+ Under the null hypothesis that the vaccine will not lower the chance of
1091
+ becoming infected, the probability of obtaining test results at least as
1092
+ extreme as the observed data is approximately 3.4%. Since this p-value is
1093
+ less than our chosen significance level, we have evidence to reject
1094
+ :math:`H_0` in favor of the alternative.
1095
+
1096
+ Suppose we had used Fisher's exact test instead:
1097
+
1098
+ >>> _, pvalue = stats.fisher_exact([[7, 12], [8, 3]], alternative="less")
1099
+ >>> pvalue
1100
+ 0.0640...
1101
+
1102
+ With the same threshold significance of 5%, we would not have been able
1103
+ to reject the null hypothesis in favor of the alternative. As stated in
1104
+ [2]_, Barnard's test is uniformly more powerful than Fisher's exact test
1105
+ because Barnard's test does not condition on any margin. Fisher's test
1106
+ should only be used when both sets of marginals are fixed.
1107
+
1108
+ """
1109
+ if n <= 0:
1110
+ raise ValueError(
1111
+ "Number of points `n` must be strictly positive, "
1112
+ f"found {n!r}"
1113
+ )
1114
+
1115
+ table = np.asarray(table, dtype=np.int64)
1116
+
1117
+ if not table.shape == (2, 2):
1118
+ raise ValueError("The input `table` must be of shape (2, 2).")
1119
+
1120
+ if np.any(table < 0):
1121
+ raise ValueError("All values in `table` must be nonnegative.")
1122
+
1123
+ if 0 in table.sum(axis=0):
1124
+ # If both values in column are zero, the p-value is 1 and
1125
+ # the score's statistic is NaN.
1126
+ return BarnardExactResult(np.nan, 1.0)
1127
+
1128
+ total_col_1, total_col_2 = table.sum(axis=0)
1129
+
1130
+ x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(-1, 1)
1131
+ x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(1, -1)
1132
+
1133
+ # We need to calculate the wald statistics for each combination of x1 and
1134
+ # x2.
1135
+ p1, p2 = x1 / total_col_1, x2 / total_col_2
1136
+
1137
+ if pooled:
1138
+ p = (x1 + x2) / (total_col_1 + total_col_2)
1139
+ variances = p * (1 - p) * (1 / total_col_1 + 1 / total_col_2)
1140
+ else:
1141
+ variances = p1 * (1 - p1) / total_col_1 + p2 * (1 - p2) / total_col_2
1142
+
1143
+ # To avoid warning when dividing by 0
1144
+ with np.errstate(divide="ignore", invalid="ignore"):
1145
+ wald_statistic = np.divide((p1 - p2), np.sqrt(variances))
1146
+
1147
+ wald_statistic[p1 == p2] = 0 # Removing NaN values
1148
+
1149
+ wald_stat_obs = wald_statistic[table[0, 0], table[0, 1]]
1150
+
1151
+ if alternative == "two-sided":
1152
+ index_arr = np.abs(wald_statistic) >= abs(wald_stat_obs)
1153
+ elif alternative == "less":
1154
+ index_arr = wald_statistic <= wald_stat_obs
1155
+ elif alternative == "greater":
1156
+ index_arr = wald_statistic >= wald_stat_obs
1157
+ else:
1158
+ msg = (
1159
+ "`alternative` should be one of {'two-sided', 'less', 'greater'},"
1160
+ f" found {alternative!r}"
1161
+ )
1162
+ raise ValueError(msg)
1163
+
1164
+ x1_sum_x2 = x1 + x2
1165
+
1166
+ x1_log_comb = _compute_log_combinations(total_col_1)
1167
+ x2_log_comb = _compute_log_combinations(total_col_2)
1168
+ x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]
1169
+
1170
+ result = shgo(
1171
+ _get_binomial_log_p_value_with_nuisance_param,
1172
+ args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
1173
+ bounds=((0, 1),),
1174
+ n=n,
1175
+ sampling_method="sobol",
1176
+ )
1177
+
1178
+ # result.fun is the negative log pvalue and therefore needs to be
1179
+ # changed before return
1180
+ p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
1181
+ return BarnardExactResult(wald_stat_obs, p_value)
1182
+
1183
+
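A hand-check of the pooled Wald statistic from the Notes above, using the vaccine table from the Examples; the point is only to make the formula concrete, and the expected value is the one already reported in the docstring:

```python
import numpy as np

x11, x12 = 7, 12            # infected counts in the vaccine / placebo columns
c1, c2 = 15, 15             # column totals
p1, p2 = x11 / c1, x12 / c2
p = (x11 + x12) / (c1 + c2)                               # pooled estimate
t_obs = (p1 - p2) / np.sqrt(p * (1 - p) * (1 / c1 + 1 / c2))
# t_obs is approximately -1.894, the statistic reported by barnard_exact above
```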
1184
+ @dataclass
1185
+ class BoschlooExactResult:
1186
+ statistic: float
1187
+ pvalue: float
1188
+
1189
+
1190
+ def boschloo_exact(table, alternative="two-sided", n=32):
1191
+ r"""Perform Boschloo's exact test on a 2x2 contingency table.
1192
+
1193
+ Parameters
1194
+ ----------
1195
+ table : array_like of ints
1196
+ A 2x2 contingency table. Elements should be non-negative integers.
1197
+
1198
+ alternative : {'two-sided', 'less', 'greater'}, optional
1199
+ Defines the null and alternative hypotheses. Default is 'two-sided'.
1200
+ Please see explanations in the Notes section below.
1201
+
1202
+ n : int, optional
1203
+ Number of sampling points used in the construction of the sampling
1204
+ method. Note that this argument will automatically be converted to
1205
+ the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
1206
+ select sample points. Default is 32. Must be positive. In most cases,
1207
+ 32 points is enough to reach good precision. More points come at a
1208
+ performance cost.
1209
+
1210
+ Returns
1211
+ -------
1212
+ ber : BoschlooExactResult
1213
+ A result object with the following attributes.
1214
+
1215
+ statistic : float
1216
+ The statistic used in Boschloo's test; that is, the p-value
1217
+ from Fisher's exact test.
1218
+
1219
+ pvalue : float
1220
+ P-value, the probability of obtaining a distribution at least as
1221
+ extreme as the one that was actually observed, assuming that the
1222
+ null hypothesis is true.
1223
+
1224
+ See Also
1225
+ --------
1226
+ chi2_contingency : Chi-square test of independence of variables in a
1227
+ contingency table.
1228
+ fisher_exact : Fisher exact test on a 2x2 contingency table.
1229
+ barnard_exact : Barnard's exact test, which is a more powerful alternative
1230
+ than Fisher's exact test for 2x2 contingency tables.
1231
+
1232
+ Notes
1233
+ -----
1234
+ Boschloo's test is an exact test used in the analysis of contingency
1235
+ tables. It examines the association of two categorical variables, and
1236
+ is a uniformly more powerful alternative to Fisher's exact test
1237
+ for 2x2 contingency tables.
1238
+
1239
+ Boschloo's exact test uses the p-value of Fisher's exact test as a
1240
+ statistic, and Boschloo's p-value is the probability under the null
1241
+ hypothesis of observing such an extreme value of this statistic.
1242
+
1243
+ Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
1244
+ where each column stores the binomial experiment, as in the example
1245
+ below. Let's also define :math:`p_1, p_2` the theoretical binomial
1246
+ probabilities for :math:`x_{11}` and :math:`x_{12}`. When using
1247
+ Boschloo exact test, we can assert three different alternative hypotheses:
1248
+
1249
+ - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 < p_2`,
1250
+ with `alternative` = "less"
1251
+
1252
+ - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 > p_2`,
1253
+ with `alternative` = "greater"
1254
+
1255
+ - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 \neq p_2`,
1256
+ with `alternative` = "two-sided" (default)
1257
+
1258
+ There are multiple conventions for computing a two-sided p-value when the
1259
+ null distribution is asymmetric. Here, we apply the convention that the
1260
+ p-value of a two-sided test is twice the minimum of the p-values of the
1261
+ one-sided tests (clipped to 1.0). Note that `fisher_exact` follows a
1262
+ different convention, so for a given `table`, the statistic reported by
1263
+ `boschloo_exact` may differ from the p-value reported by `fisher_exact`
1264
+ when ``alternative='two-sided'``.
1265
+
1266
+ .. versionadded:: 1.7.0
1267
+
1268
+ References
1269
+ ----------
1270
+ .. [1] R.D. Boschloo. "Raised conditional level of significance for the
1271
+ 2 x 2-table when testing the equality of two probabilities",
1272
+ Statistica Neerlandica, 24(1), 1970
1273
+
1274
+ .. [2] "Boschloo's test", Wikipedia,
1275
+ https://en.wikipedia.org/wiki/Boschloo%27s_test
1276
+
1277
+ .. [3] Lise M. Saari et al. "Employee attitudes and job satisfaction",
1278
+ Human Resource Management, 43(4), 395-407, 2004,
1279
+ :doi:`10.1002/hrm.20032`.
1280
+
1281
+ Examples
1282
+ --------
1283
+ In the following example, we consider the article "Employee
1284
+ attitudes and job satisfaction" [3]_
1285
+ which reports the results of a survey from 63 scientists and 117 college
1286
+ professors. Of the 63 scientists, 31 said they were very satisfied with
1287
+ their jobs, whereas 74 of the college professors were very satisfied
1288
+ with their work. Is this significant evidence that college
1289
+ professors are happier with their work than scientists?
1290
+ The following table summarizes the data mentioned above::
1291
+
1292
+ college professors scientists
1293
+ Very Satisfied 74 31
1294
+ Dissatisfied 43 32
1295
+
1296
+ When working with statistical hypothesis testing, we usually use a
1297
+ threshold probability or significance level upon which we decide
1298
+ to reject the null hypothesis :math:`H_0`. Suppose we choose the common
1299
+ significance level of 5%.
1300
+
1301
+ Our alternative hypothesis is that college professors are truly more
1302
+ satisfied with their work than scientists. Therefore, we expect
1303
+ :math:`p_1` the proportion of very satisfied college professors to be
1304
+ greater than :math:`p_2`, the proportion of very satisfied scientists.
1305
+ We thus call `boschloo_exact` with the ``alternative="greater"`` option:
1306
+
1307
+ >>> import scipy.stats as stats
1308
+ >>> res = stats.boschloo_exact([[74, 31], [43, 32]], alternative="greater")
1309
+ >>> res.statistic
1310
+ 0.0483...
1311
+ >>> res.pvalue
1312
+ 0.0355...
1313
+
1314
+ Under the null hypothesis that scientists are happier in their work than
1315
+ college professors, the probability of obtaining test
1316
+ results at least as extreme as the observed data is approximately 3.55%.
1317
+ Since this p-value is less than our chosen significance level, we have
1318
+ evidence to reject :math:`H_0` in favor of the alternative hypothesis.
1319
+
1320
+ """
1321
+ hypergeom = distributions.hypergeom
1322
+
1323
+ if n <= 0:
1324
+ raise ValueError(
1325
+ "Number of points `n` must be strictly positive,"
1326
+ f" found {n!r}"
1327
+ )
1328
+
1329
+ table = np.asarray(table, dtype=np.int64)
1330
+
1331
+ if not table.shape == (2, 2):
1332
+ raise ValueError("The input `table` must be of shape (2, 2).")
1333
+
1334
+ if np.any(table < 0):
1335
+ raise ValueError("All values in `table` must be nonnegative.")
1336
+
1337
+ if 0 in table.sum(axis=0):
1338
+ # If both values in column are zero, the p-value is 1 and
1339
+ # the score's statistic is NaN.
1340
+ return BoschlooExactResult(np.nan, np.nan)
1341
+
1342
+ total_col_1, total_col_2 = table.sum(axis=0)
1343
+ total = total_col_1 + total_col_2
1344
+ x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(1, -1)
1345
+ x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(-1, 1)
1346
+ x1_sum_x2 = x1 + x2
1347
+
1348
+ if alternative == 'less':
1349
+ pvalues = hypergeom.cdf(x1, total, x1_sum_x2, total_col_1).T
1350
+ elif alternative == 'greater':
1351
+ # Same formula as the 'less' case, but with the second column.
1352
+ pvalues = hypergeom.cdf(x2, total, x1_sum_x2, total_col_2).T
1353
+ elif alternative == 'two-sided':
1354
+ boschloo_less = boschloo_exact(table, alternative="less", n=n)
1355
+ boschloo_greater = boschloo_exact(table, alternative="greater", n=n)
1356
+
1357
+ res = (
1358
+ boschloo_less if boschloo_less.pvalue < boschloo_greater.pvalue
1359
+ else boschloo_greater
1360
+ )
1361
+
1362
+ # Two-sided p-value is defined as twice the minimum of the one-sided
1363
+ # p-values
1364
+ pvalue = np.clip(2 * res.pvalue, a_min=0, a_max=1)
1365
+ return BoschlooExactResult(res.statistic, pvalue)
1366
+ else:
1367
+ msg = (
1368
+ f"`alternative` should be one of {'two-sided', 'less', 'greater'},"
1369
+ f" found {alternative!r}"
1370
+ )
1371
+ raise ValueError(msg)
1372
+
1373
+ fisher_stat = pvalues[table[0, 0], table[0, 1]]
1374
+
1375
+ # fisher_stat * (1+1e-13) guards us from small numerical error. It is
1376
+ # equivalent to np.isclose with relative tol of 1e-13 and absolute tol of 0
1377
+ # For more thorough explanations, see gh-14178
1378
+ index_arr = pvalues <= fisher_stat * (1+1e-13)
1379
+
1380
+ x1, x2, x1_sum_x2 = x1.T, x2.T, x1_sum_x2.T
1381
+ x1_log_comb = _compute_log_combinations(total_col_1)
1382
+ x2_log_comb = _compute_log_combinations(total_col_2)
1383
+ x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]
1384
+
1385
+ result = shgo(
1386
+ _get_binomial_log_p_value_with_nuisance_param,
1387
+ args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
1388
+ bounds=((0, 1),),
1389
+ n=n,
1390
+ sampling_method="sobol",
1391
+ )
1392
+
1393
+ # result.fun is the negative log pvalue and therefore needs to be
1394
+ # changed before return
1395
+ p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
1396
+ return BoschlooExactResult(fisher_stat, p_value)
1397
+
1398
+
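Because the docstring defines the statistic as the p-value of Fisher's exact test for the observed table, the two public functions should agree on that quantity (up to floating-point rounding); a small sketch using the survey table from the Examples:

```python
from scipy import stats

table = [[74, 31], [43, 32]]
res = stats.boschloo_exact(table, alternative="greater")
_, fisher_p = stats.fisher_exact(table, alternative="greater")
# res.statistic and fisher_p are expected to agree: Boschloo's statistic is
# the one-sided Fisher exact p-value of the observed table
```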
1399
+ def _get_binomial_log_p_value_with_nuisance_param(
1400
+ nuisance_param, x1_sum_x2, x1_sum_x2_log_comb, index_arr
1401
+ ):
1402
+ r"""
1403
+ Compute the log p-value with respect to a nuisance parameter considering
1404
+ a 2x2 sample space.
1405
+
1406
+ Parameters
1407
+ ----------
1408
+ nuisance_param : float
1409
+ nuisance parameter used in the computation of the maximisation of
1410
+ the p-value. Must be between 0 and 1
1411
+
1412
+ x1_sum_x2 : ndarray
1413
+ Sum of x1 and x2 inside barnard_exact
1414
+
1415
+ x1_sum_x2_log_comb : ndarray
1416
+ sum of the log combination of x1 and x2
1417
+
1418
+ index_arr : ndarray of boolean
1419
+
1420
+ Returns
1421
+ -------
1422
+ p_value : float
1423
+ The negative log p-value evaluated at the given nuisance parameter;
1424
+ minimized over [0, 1] by `shgo` to obtain the maximum p-value.
1425
+
1426
+ Notes
1427
+ -----
1428
+
1429
+ Both Barnard's test and Boschloo's test iterate over a nuisance parameter
1430
+ :math:`\pi \in [0, 1]` to find the maximum p-value. To locate this
1431
+ maximum, this function returns the negative log p-value with respect to the
1432
+ nuisance parameter passed as `nuisance_param`. This negative log p-value is
1433
+ then used in `shgo` to find the minimum negative log p-value, which is our
1434
+ maximum p-value.
1435
+
1436
+ Also, to compute the different combination used in the
1437
+ p-values' computation formula, this function uses `gammaln` which is
1438
+ more tolerant of large values than `scipy.special.comb`. `gammaln` gives
1439
+ the log of the combination. For a small loss of precision, performance is
1440
+ improved considerably.
1441
+ """
1442
+ t1, t2 = x1_sum_x2.shape
1443
+ n = t1 + t2 - 2
1444
+ with np.errstate(divide="ignore", invalid="ignore"):
1445
+ log_nuisance = np.log(
1446
+ nuisance_param,
1447
+ out=np.zeros_like(nuisance_param),
1448
+ where=nuisance_param >= 0,
1449
+ )
1450
+ log_1_minus_nuisance = np.log(
1451
+ 1 - nuisance_param,
1452
+ out=np.zeros_like(nuisance_param),
1453
+ where=1 - nuisance_param >= 0,
1454
+ )
1455
+
1456
+ nuisance_power_x1_x2 = log_nuisance * x1_sum_x2
1457
+ nuisance_power_x1_x2[(x1_sum_x2 == 0)[:, :]] = 0
1458
+
1459
+ nuisance_power_n_minus_x1_x2 = log_1_minus_nuisance * (n - x1_sum_x2)
1460
+ nuisance_power_n_minus_x1_x2[(x1_sum_x2 == n)[:, :]] = 0
1461
+
1462
+ tmp_log_values_arr = (
1463
+ x1_sum_x2_log_comb
1464
+ + nuisance_power_x1_x2
1465
+ + nuisance_power_n_minus_x1_x2
1466
+ )
1467
+
1468
+ tmp_values_from_index = tmp_log_values_arr[index_arr]
1469
+
1470
+ # To avoid dividing by zero in log function and getting inf value,
1471
+ # values are centered according to the max
1472
+ max_value = tmp_values_from_index.max()
1473
+
1474
+ # To have better result's precision, the log pvalue is taken here.
1475
+ # Indeed, pvalue is included inside [0, 1] interval. Passing the
1476
+ # pvalue to log makes the interval a lot bigger ([-inf, 0]), and thus
1477
+ # help us to achieve better precision
1478
+ with np.errstate(divide="ignore", invalid="ignore"):
1479
+ log_probs = np.exp(tmp_values_from_index - max_value).sum()
1480
+ log_pvalue = max_value + np.log(
1481
+ log_probs,
1482
+ out=np.full_like(log_probs, -np.inf),
1483
+ where=log_probs > 0,
1484
+ )
1485
+
1486
+ # Since shgo find the minima, minus log pvalue is returned
1487
+ return -log_pvalue
1488
+
1489
+
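To make the "maximize the p-value over the nuisance parameter by minimizing its negative log" idea concrete, here is an illustrative sketch with a smooth stand-in profile instead of the real tables (the real callers pass the precomputed arrays above through `args`):

```python
import numpy as np
from scipy.optimize import shgo

def neg_log_profile(v):
    # stand-in for the p-value profile over the nuisance parameter pi in [0, 1];
    # not the real Barnard/Boschloo objective, just the same optimization shape
    pi = v[0]
    return -np.log(pi * (1 - pi) + 1e-300)

res = shgo(neg_log_profile, bounds=((0, 1),), n=32, sampling_method="sobol")
p_max = np.exp(-res.fun)   # maximum of the stand-in profile (0.25 at pi = 0.5)
```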
1490
+ def _pval_cvm_2samp_exact(s, m, n):
1491
+ """
1492
+ Compute the exact p-value of the Cramer-von Mises two-sample test
1493
+ for a given value s of the test statistic.
1494
+ m and n are the sizes of the samples.
1495
+
1496
+ [1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for
1497
+ the Cramér-Von Mises Two-Sample Test", J. Stat. Soft.,
1498
+ vol. 17, no. 8, pp. 1-15, Dec. 2006.
1499
+ [2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises
1500
+ Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist.
1501
+ 33(3), 1148-1159, (September, 1962)
1502
+ """
1503
+
1504
+ # [1, p. 3]
1505
+ lcm = np.lcm(m, n)
1506
+ # [1, p. 4], below eq. 3
1507
+ a = lcm // m
1508
+ b = lcm // n
1509
+ # Combine Eq. 9 in [2] with Eq. 2 in [1] and solve for $\zeta$
1510
+ # Hint: `s` is $U$ in [2], and $T_2$ in [1] is $T$ in [2]
1511
+ mn = m * n
1512
+ zeta = lcm ** 2 * (m + n) * (6 * s - mn * (4 * mn - 1)) // (6 * mn ** 2)
1513
+
1514
+ # bound maximum value that may appear in `gs` (remember both rows!)
1515
+ zeta_bound = lcm**2 * (m + n) # bound elements in row 1
1516
+ combinations = comb(m + n, m) # sum of row 2
1517
+ max_gs = max(zeta_bound, combinations)
1518
+ dtype = np.min_scalar_type(max_gs)
1519
+
1520
+ # the frequency table of $g_{u, v}^+$ defined in [1, p. 6]
1521
+ gs = ([np.array([[0], [1]], dtype=dtype)]
1522
+ + [np.empty((2, 0), dtype=dtype) for _ in range(m)])
1523
+ for u in range(n + 1):
1524
+ next_gs = []
1525
+ tmp = np.empty((2, 0), dtype=dtype)
1526
+ for v, g in enumerate(gs):
1527
+ # Calculate g recursively with eq. 11 in [1]. Even though it
1528
+ # doesn't look like it, this also does 12/13 (all of Algorithm 1).
1529
+ vi, i0, i1 = np.intersect1d(tmp[0], g[0], return_indices=True)
1530
+ tmp = np.concatenate([
1531
+ np.stack([vi, tmp[1, i0] + g[1, i1]]),
1532
+ np.delete(tmp, i0, 1),
1533
+ np.delete(g, i1, 1)
1534
+ ], 1)
1535
+ res = (a * v - b * u) ** 2
1536
+ tmp[0] += res.astype(dtype)
1537
+ next_gs.append(tmp)
1538
+ gs = next_gs
1539
+ value, freq = gs[m]
1540
+ return np.float64(np.sum(freq[value >= zeta]) / combinations)
1541
+
1542
+
1543
+ @_axis_nan_policy_factory(CramerVonMisesResult, n_samples=2, too_small=1,
1544
+ result_to_tuple=_cvm_result_to_tuple)
1545
+ def cramervonmises_2samp(x, y, method='auto'):
1546
+ """Perform the two-sample Cramér-von Mises test for goodness of fit.
1547
+
1548
+ This is the two-sample version of the Cramér-von Mises test ([1]_):
1549
+ for two independent samples :math:`X_1, ..., X_n` and
1550
+ :math:`Y_1, ..., Y_m`, the null hypothesis is that the samples
1551
+ come from the same (unspecified) continuous distribution.
1552
+
1553
+ Parameters
1554
+ ----------
1555
+ x : array_like
1556
+ A 1-D array of observed values of the random variables :math:`X_i`.
1557
+ y : array_like
1558
+ A 1-D array of observed values of the random variables :math:`Y_i`.
1559
+ method : {'auto', 'asymptotic', 'exact'}, optional
1560
+ The method used to compute the p-value, see Notes for details.
1561
+ The default is 'auto'.
1562
+
1563
+ Returns
1564
+ -------
1565
+ res : object with attributes
1566
+ statistic : float
1567
+ Cramér-von Mises statistic.
1568
+ pvalue : float
1569
+ The p-value.
1570
+
1571
+ See Also
1572
+ --------
1573
+ cramervonmises, anderson_ksamp, epps_singleton_2samp, ks_2samp
1574
+
1575
+ Notes
1576
+ -----
1577
+ .. versionadded:: 1.7.0
1578
+
1579
+ The statistic is computed according to equation 9 in [2]_. The
1580
+ calculation of the p-value depends on the keyword `method`:
1581
+
1582
+ - ``asymptotic``: The p-value is approximated by using the limiting
1583
+ distribution of the test statistic.
1584
+ - ``exact``: The exact p-value is computed by enumerating all
1585
+ possible combinations of the test statistic, see [2]_.
1586
+
1587
+ If ``method='auto'``, the exact approach is used
1588
+ if both samples contain no more than 20 observations;
1589
+ otherwise the asymptotic distribution is used.
1590
+
1591
+ If the underlying distribution is not continuous, the p-value is likely to
1592
+ be conservative (Section 6.2 in [3]_). When ranking the data to compute
1593
+ the test statistic, midranks are used if there are ties.
1594
+
1595
+ References
1596
+ ----------
1597
+ .. [1] https://en.wikipedia.org/wiki/Cramer-von_Mises_criterion
1598
+ .. [2] Anderson, T.W. (1962). On the distribution of the two-sample
1599
+ Cramer-von-Mises criterion. The Annals of Mathematical
1600
+ Statistics, pp. 1148-1159.
1601
+ .. [3] Conover, W.J., Practical Nonparametric Statistics, 1971.
1602
+
1603
+ Examples
1604
+ --------
1605
+
1606
+ Suppose we wish to test whether two samples generated by
1607
+ ``scipy.stats.norm.rvs`` have the same distribution. We choose a
1608
+ significance level of alpha=0.05.
1609
+
1610
+ >>> import numpy as np
1611
+ >>> from scipy import stats
1612
+ >>> rng = np.random.default_rng()
1613
+ >>> x = stats.norm.rvs(size=100, random_state=rng)
1614
+ >>> y = stats.norm.rvs(size=70, random_state=rng)
1615
+ >>> res = stats.cramervonmises_2samp(x, y)
1616
+ >>> res.statistic, res.pvalue
1617
+ (0.29376470588235293, 0.1412873014573014)
1618
+
1619
+ The p-value exceeds our chosen significance level, so we do not
1620
+ reject the null hypothesis that the observed samples are drawn from the
1621
+ same distribution.
1622
+
1623
+ For small sample sizes, one can compute the exact p-values:
1624
+
1625
+ >>> x = stats.norm.rvs(size=7, random_state=rng)
1626
+ >>> y = stats.t.rvs(df=2, size=6, random_state=rng)
1627
+ >>> res = stats.cramervonmises_2samp(x, y, method='exact')
1628
+ >>> res.statistic, res.pvalue
1629
+ (0.197802197802198, 0.31643356643356646)
1630
+
1631
+ The p-value based on the asymptotic distribution is a good approximation
1632
+ even though the sample size is small.
1633
+
1634
+ >>> res = stats.cramervonmises_2samp(x, y, method='asymptotic')
1635
+ >>> res.statistic, res.pvalue
1636
+ (0.197802197802198, 0.2966041181527128)
1637
+
1638
+ Independent of the method, one would not reject the null hypothesis at the
1639
+ chosen significance level in this example.
1640
+
1641
+ """
1642
+ xa = np.sort(np.asarray(x))
1643
+ ya = np.sort(np.asarray(y))
1644
+
1645
+ if xa.size <= 1 or ya.size <= 1:
1646
+ raise ValueError('x and y must contain at least two observations.')
1647
+ if method not in ['auto', 'exact', 'asymptotic']:
1648
+ raise ValueError('method must be either auto, exact or asymptotic.')
1649
+
1650
+ nx = len(xa)
1651
+ ny = len(ya)
1652
+
1653
+ if method == 'auto':
1654
+ if max(nx, ny) > 20:
1655
+ method = 'asymptotic'
1656
+ else:
1657
+ method = 'exact'
1658
+
1659
+ # get ranks of x and y in the pooled sample
1660
+ z = np.concatenate([xa, ya])
1661
+ # in case of ties, use midrank (see [1])
1662
+ r = scipy.stats.rankdata(z, method='average')
1663
+ rx = r[:nx]
1664
+ ry = r[nx:]
1665
+
1666
+ # compute U (eq. 10 in [2])
1667
+ u = nx * np.sum((rx - np.arange(1, nx+1))**2)
1668
+ u += ny * np.sum((ry - np.arange(1, ny+1))**2)
1669
+
1670
+ # compute T (eq. 9 in [2])
1671
+ k, N = nx*ny, nx + ny
1672
+ t = u / (k*N) - (4*k - 1)/(6*N)
1673
+
1674
+ if method == 'exact':
1675
+ p = _pval_cvm_2samp_exact(u, nx, ny)
1676
+ else:
1677
+ # compute expected value and variance of T (eq. 11 and 14 in [2])
1678
+ et = (1 + 1/N)/6
1679
+ vt = (N+1) * (4*k*N - 3*(nx**2 + ny**2) - 2*k)
1680
+ vt = vt / (45 * N**2 * 4 * k)
1681
+
1682
+ # compute the normalized statistic (eq. 15 in [2])
1683
+ tn = 1/6 + (t - et) / np.sqrt(45 * vt)
1684
+
1685
+ # approximate distribution of tn with limiting distribution
1686
+ # of the one-sample test statistic
1687
+ # if tn < 0.003, the _cdf_cvm_inf(tn) < 1.28*1e-18, return 1.0 directly
1688
+ if tn < 0.003:
1689
+ p = 1.0
1690
+ else:
1691
+ p = max(0, 1. - _cdf_cvm_inf(tn))
1692
+
1693
+ return CramerVonMisesResult(statistic=t, pvalue=p)
1694
+
1695
+
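Because the statistic is computed from (mid)ranks of the pooled sample, it is invariant under strictly increasing transformations of the data; a small sketch that checks this with public APIs:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.normal(size=15)
y = rng.normal(size=12)

res1 = stats.cramervonmises_2samp(x, y)
res2 = stats.cramervonmises_2samp(np.exp(x), np.exp(y))   # monotone transform
assert np.isclose(res1.statistic, res2.statistic)          # ranks are unchanged
```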
1696
+ class TukeyHSDResult:
1697
+ """Result of `scipy.stats.tukey_hsd`.
1698
+
1699
+ Attributes
1700
+ ----------
1701
+ statistic : float ndarray
1702
+ The computed statistic of the test for each comparison. The element
1703
+ at index ``(i, j)`` is the statistic for the comparison between groups
1704
+ ``i`` and ``j``.
1705
+ pvalue : float ndarray
1706
+ The associated p-value from the studentized range distribution. The
1707
+ element at index ``(i, j)`` is the p-value for the comparison
1708
+ between groups ``i`` and ``j``.
1709
+
1710
+ Notes
1711
+ -----
1712
+ The string representation of this object displays the most recently
1713
+ calculated confidence interval, and if none have been previously
1714
+ calculated, it will evaluate ``confidence_interval()``.
1715
+
1716
+ References
1717
+ ----------
1718
+ .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
1719
+ Method."
1720
+ https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1721
+ 28 November 2020.
1722
+ """
1723
+
1724
+ def __init__(self, statistic, pvalue, _nobs, _ntreatments, _stand_err):
1725
+ self.statistic = statistic
1726
+ self.pvalue = pvalue
1727
+ self._ntreatments = _ntreatments
1728
+ self._nobs = _nobs
1729
+ self._stand_err = _stand_err
1730
+ self._ci = None
1731
+ self._ci_cl = None
1732
+
1733
+ def __str__(self):
1734
+ # Note: `__str__` prints the confidence intervals from the most
1735
+ # recent call to `confidence_interval`. If it has not been called,
1736
+ # it will be called with the default CL of .95.
1737
+ if self._ci is None:
1738
+ self.confidence_interval(confidence_level=.95)
1739
+ s = ("Tukey's HSD Pairwise Group Comparisons"
1740
+ f" ({self._ci_cl*100:.1f}% Confidence Interval)\n")
1741
+ s += "Comparison Statistic p-value Lower CI Upper CI\n"
1742
+ for i in range(self.pvalue.shape[0]):
1743
+ for j in range(self.pvalue.shape[0]):
1744
+ if i != j:
1745
+ s += (f" ({i} - {j}) {self.statistic[i, j]:>10.3f}"
1746
+ f"{self.pvalue[i, j]:>10.3f}"
1747
+ f"{self._ci.low[i, j]:>10.3f}"
1748
+ f"{self._ci.high[i, j]:>10.3f}\n")
1749
+ return s
1750
+
1751
+ def confidence_interval(self, confidence_level=.95):
1752
+ """Compute the confidence interval for the specified confidence level.
1753
+
1754
+ Parameters
1755
+ ----------
1756
+ confidence_level : float, optional
1757
+ Confidence level for the computed confidence interval
1758
+ of the estimated proportion. Default is .95.
1759
+
1760
+ Returns
1761
+ -------
1762
+ ci : ``ConfidenceInterval`` object
1763
+ The object has attributes ``low`` and ``high`` that hold the
1764
+ lower and upper bounds of the confidence intervals for each
1765
+ comparison. The high and low values are accessible for each
1766
+ comparison at index ``(i, j)`` between groups ``i`` and ``j``.
1767
+
1768
+ References
1769
+ ----------
1770
+ .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1.
1771
+ Tukey's Method."
1772
+ https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1773
+ 28 November 2020.
1774
+
1775
+ Examples
1776
+ --------
1777
+ >>> from scipy.stats import tukey_hsd
1778
+ >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1779
+ >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1780
+ >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1781
+ >>> result = tukey_hsd(group0, group1, group2)
1782
+ >>> ci = result.confidence_interval()
1783
+ >>> ci.low
1784
+ array([[-3.649159, -8.249159, -3.909159],
1785
+ [ 0.950841, -3.649159, 0.690841],
1786
+ [-3.389159, -7.989159, -3.649159]])
1787
+ >>> ci.high
1788
+ array([[ 3.649159, -0.950841, 3.389159],
1789
+ [ 8.249159, 3.649159, 7.989159],
1790
+ [ 3.909159, -0.690841, 3.649159]])
1791
+ """
1792
+ # check to see if the supplied confidence level matches that of the
1793
+ # previously computed CI.
1794
+ if (self._ci is not None and self._ci_cl is not None and
1795
+ confidence_level == self._ci_cl):
1796
+ return self._ci
1797
+
1798
+ if not 0 < confidence_level < 1:
1799
+ raise ValueError("Confidence level must be between 0 and 1.")
1800
+ # determine the critical value of the studentized range using the
1801
+ # appropriate confidence level, number of treatments, and degrees
1802
+ # of freedom as determined by the number of data less the number of
1803
+ # treatments. ("Confidence limits for Tukey's method")[1]. Note that
1804
+ # in the cases of unequal sample sizes there will be a criterion for
1805
+ # each group comparison.
1806
+ params = (confidence_level, self._nobs, self._ntreatments - self._nobs)
1807
+ srd = distributions.studentized_range.ppf(*params)
1808
+ # also called maximum critical value, the Tukey criterion is the
1809
+ # studentized range critical value * the square root of mean square
1810
+ # error over the sample size.
1811
+ tukey_criterion = srd * self._stand_err
1812
+ # the confidence levels are determined by the
1813
+ # `mean_differences` +- `tukey_criterion`
1814
+ upper_conf = self.statistic + tukey_criterion
1815
+ lower_conf = self.statistic - tukey_criterion
1816
+ self._ci = ConfidenceInterval(low=lower_conf, high=upper_conf)
1817
+ self._ci_cl = confidence_level
1818
+ return self._ci
1819
+
1820
+
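A small usage sketch (data reused from the docstring example) showing that, since the bounds are the statistic plus or minus the Tukey criterion, the returned interval is symmetric about the pairwise differences in sample means:

```python
import numpy as np
from scipy.stats import tukey_hsd

group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
res = tukey_hsd(group0, group1, group2)
ci = res.confidence_interval(confidence_level=0.99)
# midpoint of each interval equals the corresponding pairwise mean difference
assert np.allclose((ci.low + ci.high) / 2, res.statistic)
```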
1821
+ def _tukey_hsd_iv(args):
1822
+ if (len(args)) < 2:
1823
+ raise ValueError("There must be more than 1 treatment.")
1824
+ args = [np.asarray(arg) for arg in args]
1825
+ for arg in args:
1826
+ if arg.ndim != 1:
1827
+ raise ValueError("Input samples must be one-dimensional.")
1828
+ if arg.size <= 1:
1829
+ raise ValueError("Input sample size must be greater than one.")
1830
+ if np.isinf(arg).any():
1831
+ raise ValueError("Input samples must be finite.")
1832
+ return args
1833
+
1834
+
1835
+ def tukey_hsd(*args):
1836
+ """Perform Tukey's HSD test for equality of means over multiple treatments.
1837
+
1838
+ Tukey's honestly significant difference (HSD) test performs pairwise
1839
+ comparison of means for a set of samples. Whereas ANOVA (e.g. `f_oneway`)
1840
+ assesses whether the true means underlying each sample are identical,
1841
+ Tukey's HSD is a post hoc test used to compare the mean of each sample
1842
+ to the mean of each other sample.
1843
+
1844
+ The null hypothesis is that the distributions underlying the samples all
1845
+ have the same mean. The test statistic, which is computed for every
1846
+ possible pairing of samples, is simply the difference between the sample
1847
+ means. For each pair, the p-value is the probability under the null
1848
+ hypothesis (and other assumptions; see notes) of observing such an extreme
1849
+ value of the statistic, considering that many pairwise comparisons are
1850
+ being performed. Confidence intervals for the difference between each pair
1851
+ of means are also available.
1852
+
1853
+ Parameters
1854
+ ----------
1855
+ sample1, sample2, ... : array_like
1856
+ The sample measurements for each group. There must be at least
1857
+ two arguments.
1858
+
1859
+ Returns
1860
+ -------
1861
+ result : `~scipy.stats._result_classes.TukeyHSDResult` instance
1862
+ The return value is an object with the following attributes:
1863
+
1864
+ statistic : float ndarray
1865
+ The computed statistic of the test for each comparison. The element
1866
+ at index ``(i, j)`` is the statistic for the comparison between
1867
+ groups ``i`` and ``j``.
1868
+ pvalue : float ndarray
1869
+ The computed p-value of the test for each comparison. The element
1870
+ at index ``(i, j)`` is the p-value for the comparison between
1871
+ groups ``i`` and ``j``.
1872
+
1873
+ The object has the following methods:
1874
+
1875
+ confidence_interval(confidence_level=0.95):
1876
+ Compute the confidence interval for the specified confidence level.
1877
+
1878
+ See Also
1879
+ --------
1880
+ dunnett : performs comparison of means against a control group.
1881
+
1882
+ Notes
1883
+ -----
1884
+ The use of this test relies on several assumptions.
1885
+
1886
+ 1. The observations are independent within and among groups.
1887
+ 2. The observations within each group are normally distributed.
1888
+ 3. The distributions from which the samples are drawn have the same finite
1889
+ variance.
1890
+
1891
+ The original formulation of the test was for samples of equal size [6]_.
1892
+ In case of unequal sample sizes, the test uses the Tukey-Kramer method
1893
+ [4]_.
1894
+
1895
+ References
1896
+ ----------
1897
+ .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
1898
+ Method."
1899
+ https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1900
+ 28 November 2020.
1901
+ .. [2] Abdi, Herve & Williams, Lynne. (2021). "Tukey's Honestly Significant
1902
+ Difference (HSD) Test."
1903
+ https://personal.utdallas.edu/~herve/abdi-HSD2010-pretty.pdf
1904
+ .. [3] "One-Way ANOVA Using SAS PROC ANOVA & PROC GLM." SAS
1905
+ Tutorials, 2007, www.stattutorials.com/SAS/TUTORIAL-PROC-GLM.htm.
1906
+ .. [4] Kramer, Clyde Young. "Extension of Multiple Range Tests to Group
1907
+ Means with Unequal Numbers of Replications." Biometrics, vol. 12,
1908
+ no. 3, 1956, pp. 307-310. JSTOR, www.jstor.org/stable/3001469.
1909
+ Accessed 25 May 2021.
1910
+ .. [5] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.3.3.
1911
+ The ANOVA table and tests of hypotheses about means"
1912
+ https://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm,
1913
+ 2 June 2021.
1914
+ .. [6] Tukey, John W. "Comparing Individual Means in the Analysis of
1915
+ Variance." Biometrics, vol. 5, no. 2, 1949, pp. 99-114. JSTOR,
1916
+ www.jstor.org/stable/3001913. Accessed 14 June 2021.
1917
+
1918
+
1919
+ Examples
1920
+ --------
1921
+ Here are some data comparing the time to relief of three brands of
1922
+ headache medicine, reported in minutes. Data adapted from [3]_.
1923
+
1924
+ >>> import numpy as np
1925
+ >>> from scipy.stats import tukey_hsd
1926
+ >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1927
+ >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1928
+ >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1929
+
1930
+ We would like to see if the means between any of the groups are
1931
+ significantly different. First, visually examine a box and whisker plot.
1932
+
1933
+ >>> import matplotlib.pyplot as plt
1934
+ >>> fig, ax = plt.subplots(1, 1)
1935
+ >>> ax.boxplot([group0, group1, group2])
1936
+ >>> ax.set_xticklabels(["group0", "group1", "group2"]) # doctest: +SKIP
1937
+ >>> ax.set_ylabel("mean") # doctest: +SKIP
1938
+ >>> plt.show()
1939
+
1940
+ From the box and whisker plot, we can see some overlap between the
1941
+ distributions of the groups, but we can apply the ``tukey_hsd`` test to
1942
+ determine whether the differences between the means are significant. We
1943
+ set a significance level of .05 for rejecting the null hypothesis.
1944
+
1945
+ >>> res = tukey_hsd(group0, group1, group2)
1946
+ >>> print(res)
1947
+ Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
1948
+ Comparison Statistic p-value Lower CI Upper CI
1949
+ (0 - 1) -4.600 0.014 -8.249 -0.951
1950
+ (0 - 2) -0.260 0.980 -3.909 3.389
1951
+ (1 - 0) 4.600 0.014 0.951 8.249
1952
+ (1 - 2) 4.340 0.020 0.691 7.989
1953
+ (2 - 0) 0.260 0.980 -3.389 3.909
1954
+ (2 - 1) -4.340 0.020 -7.989 -0.691
1955
+
1956
+ The null hypothesis is that each group has the same mean. The p-values for
1957
+ the comparisons between ``group0`` and ``group1`` as well as ``group1`` and
1958
+ ``group2`` do not exceed .05, so we reject the null hypothesis that they
1959
+ have the same means. The p-value of the comparison between ``group0``
1960
+ and ``group2`` exceeds .05, so we do not reject the null hypothesis; there
1961
+ is no significant difference between their means at this level.
1962
+
1963
+ We can also compute the confidence interval associated with our chosen
1964
+ confidence level.
1965
+
1966
+ >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1967
+ >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1968
+ >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1969
+ >>> result = tukey_hsd(group0, group1, group2)
1970
+ >>> conf = result.confidence_interval(confidence_level=.99)
1971
+ >>> for ((i, j), l) in np.ndenumerate(conf.low):
1972
+ ... # filter out self comparisons
1973
+ ... if i != j:
1974
+ ... h = conf.high[i,j]
1975
+ ... print(f"({i} - {j}) {l:>6.3f} {h:>6.3f}")
1976
+ (0 - 1) -9.480 0.280
1977
+ (0 - 2) -5.140 4.620
1978
+ (1 - 0) -0.280 9.480
1979
+ (1 - 2) -0.540 9.220
1980
+ (2 - 0) -4.620 5.140
1981
+ (2 - 1) -9.220 0.540
1982
+ """
1983
+ args = _tukey_hsd_iv(args)
1984
+ ntreatments = len(args)
1985
+ means = np.asarray([np.mean(arg) for arg in args])
1986
+ nsamples_treatments = np.asarray([a.size for a in args])
1987
+ nobs = np.sum(nsamples_treatments)
1988
+
1989
+ # determine mean square error [5]. Note that this is sometimes called
1990
+ # mean square error within.
1991
+ mse = (np.sum([np.var(arg, ddof=1) for arg in args] *
1992
+ (nsamples_treatments - 1)) / (nobs - ntreatments))
1993
+
1994
+ # The calculation of the standard error differs when treatments differ in
1995
+ # size. See ("Unequal sample sizes")[1].
1996
+ if np.unique(nsamples_treatments).size == 1:
1997
+ # all input groups are the same length, so only one value needs to be
1998
+ # calculated [1].
1999
+ normalize = 2 / nsamples_treatments[0]
2000
+ else:
2001
+ # to compare groups of differing sizes, we must compute a variance
2002
+ # value for each individual comparison. Use broadcasting to get the
2003
+ # resulting matrix. [3], verified against [4] (page 308).
2004
+ normalize = 1 / nsamples_treatments + 1 / nsamples_treatments[None].T
2005
+
2006
+ # the standard error is used in the computation of the tukey criterion and
2007
+ # finding the p-values.
2008
+ stand_err = np.sqrt(normalize * mse / 2)
2009
+
2010
+ # the mean difference is the test statistic.
2011
+ mean_differences = means[None].T - means
2012
+
2013
+ # Calculate the t-statistic to use within the survival function of the
2014
+ # studentized range to get the p-value.
2015
+ t_stat = np.abs(mean_differences) / stand_err
2016
+
2017
+ params = t_stat, ntreatments, nobs - ntreatments
2018
+ pvalues = distributions.studentized_range.sf(*params)
2019
+
2020
+ return TukeyHSDResult(mean_differences, pvalues, ntreatments,
2021
+ nobs, stand_err)
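
The p-value calculation above can be cross-checked against the public `scipy.stats.studentized_range` distribution. The following minimal, illustrative sketch uses the group data from the docstring example; with equal group sizes the standard error reduces to ``sqrt(mse / n)``, so ``pvalue = studentized_range.sf(|mean_i - mean_j| / stand_err, k, nobs - k)``:

>>> import numpy as np
>>> from scipy.stats import tukey_hsd, studentized_range
>>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
>>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
>>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
>>> res = tukey_hsd(group0, group1, group2)
>>> k, nobs = 3, 15                       # number of treatments, total observations
>>> mse = np.mean([np.var(g, ddof=1) for g in (group0, group1, group2)])
>>> se = np.sqrt(mse / 5)                 # equal group sizes of 5
>>> t = np.abs(res.statistic) / se        # studentized mean differences
>>> bool(np.allclose(studentized_range.sf(t, k, nobs - k), res.pvalue))
True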
.venv/Lib/site-packages/scipy/stats/_kde.py ADDED
@@ -0,0 +1,728 @@
1
+ #-------------------------------------------------------------------------------
2
+ #
3
+ # Define classes for (uni/multi)-variate kernel density estimation.
4
+ #
5
+ # Currently, only Gaussian kernels are implemented.
6
+ #
7
+ # Written by: Robert Kern
8
+ #
9
+ # Date: 2004-08-09
10
+ #
11
+ # Modified: 2005-02-10 by Robert Kern.
12
+ # Contributed to SciPy
13
+ # 2005-10-07 by Robert Kern.
14
+ # Some fixes to match the new scipy_core
15
+ #
16
+ # Copyright 2004-2005 by Enthought, Inc.
17
+ #
18
+ #-------------------------------------------------------------------------------
19
+
20
+ # Standard library imports.
21
+ import warnings
22
+
23
+ # SciPy imports.
24
+ from scipy import linalg, special
25
+ from scipy._lib._util import check_random_state
26
+
27
+ from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
28
+ sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
29
+ ones, cov)
30
+ import numpy as np
31
+
32
+ # Local imports.
33
+ from . import _mvn
34
+ from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
35
+
36
+ # deprecated import to be removed in SciPy 1.13.0
37
+ from scipy.special import logsumexp # noqa: F401
38
+
39
+
40
+ __all__ = ['gaussian_kde']
41
+
42
+
43
+ class gaussian_kde:
44
+ """Representation of a kernel-density estimate using Gaussian kernels.
45
+
46
+ Kernel density estimation is a way to estimate the probability density
47
+ function (PDF) of a random variable in a non-parametric way.
48
+ `gaussian_kde` works for both uni-variate and multi-variate data. It
49
+ includes automatic bandwidth determination. The estimation works best for
50
+ a unimodal distribution; bimodal or multi-modal distributions tend to be
51
+ oversmoothed.
52
+
53
+ Parameters
54
+ ----------
55
+ dataset : array_like
56
+ Datapoints to estimate from. In case of univariate data this is a 1-D
57
+ array, otherwise a 2-D array with shape (# of dims, # of data).
58
+ bw_method : str, scalar or callable, optional
59
+ The method used to calculate the estimator bandwidth. This can be
60
+ 'scott', 'silverman', a scalar constant or a callable. If a scalar,
61
+ this will be used directly as `kde.factor`. If a callable, it should
62
+ take a `gaussian_kde` instance as only parameter and return a scalar.
63
+ If None (default), 'scott' is used. See Notes for more details.
64
+ weights : array_like, optional
65
+ weights of datapoints. This must be the same shape as dataset.
66
+ If None (default), the samples are assumed to be equally weighted
67
+
68
+ Attributes
69
+ ----------
70
+ dataset : ndarray
71
+ The dataset with which `gaussian_kde` was initialized.
72
+ d : int
73
+ Number of dimensions.
74
+ n : int
75
+ Number of datapoints.
76
+ neff : int
77
+ Effective number of datapoints.
78
+
79
+ .. versionadded:: 1.2.0
80
+ factor : float
81
+ The bandwidth factor, obtained from `kde.covariance_factor`. The square
82
+ of `kde.factor` multiplies the covariance matrix of the data in the kde
83
+ estimation.
84
+ covariance : ndarray
85
+ The covariance matrix of `dataset`, scaled by the calculated bandwidth
86
+ (`kde.factor`).
87
+ inv_cov : ndarray
88
+ The inverse of `covariance`.
89
+
90
+ Methods
91
+ -------
92
+ evaluate
93
+ __call__
94
+ integrate_gaussian
95
+ integrate_box_1d
96
+ integrate_box
97
+ integrate_kde
98
+ pdf
99
+ logpdf
100
+ resample
101
+ set_bandwidth
102
+ covariance_factor
103
+
104
+ Notes
105
+ -----
106
+ Bandwidth selection strongly influences the estimate obtained from the KDE
107
+ (much more so than the actual shape of the kernel). Bandwidth selection
108
+ can be done by a "rule of thumb", by cross-validation, by "plug-in
109
+ methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
110
+ uses a rule of thumb, the default is Scott's Rule.
111
+
112
+ Scott's Rule [1]_, implemented as `scotts_factor`, is::
113
+
114
+ n**(-1./(d+4)),
115
+
116
+ with ``n`` the number of data points and ``d`` the number of dimensions.
117
+ In the case of unequally weighted points, `scotts_factor` becomes::
118
+
119
+ neff**(-1./(d+4)),
120
+
121
+ with ``neff`` the effective number of datapoints.
122
+ Silverman's Rule [2]_, implemented as `silverman_factor`, is::
123
+
124
+ (n * (d + 2) / 4.)**(-1. / (d + 4)).
125
+
126
+ or in the case of unequally weighted points::
127
+
128
+ (neff * (d + 2) / 4.)**(-1. / (d + 4)).
129
+
130
+ Good general descriptions of kernel density estimation can be found in [1]_
131
+ and [2]_, the mathematics for this multi-dimensional implementation can be
132
+ found in [1]_.
133
+
134
+ With a set of weighted samples, the effective number of datapoints ``neff``
135
+ is defined by::
136
+
137
+ neff = sum(weights)^2 / sum(weights^2)
138
+
139
+ as detailed in [5]_.
140
+
141
+ `gaussian_kde` does not currently support data that lies in a
142
+ lower-dimensional subspace of the space in which it is expressed. For such
143
+ data, consider performing principle component analysis / dimensionality
144
+ reduction and using `gaussian_kde` with the transformed data.
145
+
146
+ References
147
+ ----------
148
+ .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
149
+ Visualization", John Wiley & Sons, New York, Chicester, 1992.
150
+ .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
151
+ Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
152
+ Chapman and Hall, London, 1986.
153
+ .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
154
+ Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
155
+ .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
156
+ conditional density estimation", Computational Statistics & Data
157
+ Analysis, Vol. 36, pp. 279-298, 2001.
158
+ .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
159
+ Series A (General), 132, 272
160
+
161
+ Examples
162
+ --------
163
+ Generate some random two-dimensional data:
164
+
165
+ >>> import numpy as np
166
+ >>> from scipy import stats
167
+ >>> def measure(n):
168
+ ... "Measurement model, return two coupled measurements."
169
+ ... m1 = np.random.normal(size=n)
170
+ ... m2 = np.random.normal(scale=0.5, size=n)
171
+ ... return m1+m2, m1-m2
172
+
173
+ >>> m1, m2 = measure(2000)
174
+ >>> xmin = m1.min()
175
+ >>> xmax = m1.max()
176
+ >>> ymin = m2.min()
177
+ >>> ymax = m2.max()
178
+
179
+ Perform a kernel density estimate on the data:
180
+
181
+ >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
182
+ >>> positions = np.vstack([X.ravel(), Y.ravel()])
183
+ >>> values = np.vstack([m1, m2])
184
+ >>> kernel = stats.gaussian_kde(values)
185
+ >>> Z = np.reshape(kernel(positions).T, X.shape)
186
+
187
+ Plot the results:
188
+
189
+ >>> import matplotlib.pyplot as plt
190
+ >>> fig, ax = plt.subplots()
191
+ >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
192
+ ... extent=[xmin, xmax, ymin, ymax])
193
+ >>> ax.plot(m1, m2, 'k.', markersize=2)
194
+ >>> ax.set_xlim([xmin, xmax])
195
+ >>> ax.set_ylim([ymin, ymax])
196
+ >>> plt.show()
197
+
198
+ """
199
+ def __init__(self, dataset, bw_method=None, weights=None):
200
+ self.dataset = atleast_2d(asarray(dataset))
201
+ if not self.dataset.size > 1:
202
+ raise ValueError("`dataset` input should have multiple elements.")
203
+
204
+ self.d, self.n = self.dataset.shape
205
+
206
+ if weights is not None:
207
+ self._weights = atleast_1d(weights).astype(float)
208
+ self._weights /= sum(self._weights)
209
+ if self.weights.ndim != 1:
210
+ raise ValueError("`weights` input should be one-dimensional.")
211
+ if len(self._weights) != self.n:
212
+ raise ValueError("`weights` input should be of length n")
213
+ self._neff = 1/sum(self._weights**2)
214
+
215
+ # This can be converted to a warning once gh-10205 is resolved
216
+ if self.d > self.n:
217
+ msg = ("Number of dimensions is greater than number of samples. "
218
+ "This results in a singular data covariance matrix, which "
219
+ "cannot be treated using the algorithms implemented in "
220
+ "`gaussian_kde`. Note that `gaussian_kde` interprets each "
221
+ "*column* of `dataset` to be a point; consider transposing "
222
+ "the input to `dataset`.")
223
+ raise ValueError(msg)
224
+
225
+ try:
226
+ self.set_bandwidth(bw_method=bw_method)
227
+ except linalg.LinAlgError as e:
228
+ msg = ("The data appears to lie in a lower-dimensional subspace "
229
+ "of the space in which it is expressed. This has resulted "
230
+ "in a singular data covariance matrix, which cannot be "
231
+ "treated using the algorithms implemented in "
232
+ "`gaussian_kde`. Consider performing principle component "
233
+ "analysis / dimensionality reduction and using "
234
+ "`gaussian_kde` with the transformed data.")
235
+ raise linalg.LinAlgError(msg) from e
236
+
237
+ def evaluate(self, points):
238
+ """Evaluate the estimated pdf on a set of points.
239
+
240
+ Parameters
241
+ ----------
242
+ points : (# of dimensions, # of points)-array
243
+ Alternatively, a (# of dimensions,) vector can be passed in and
244
+ treated as a single point.
245
+
246
+ Returns
247
+ -------
248
+ values : (# of points,)-array
249
+ The values at each point.
250
+
251
+ Raises
252
+ ------
253
+ ValueError : if the dimensionality of the input points is different than
254
+ the dimensionality of the KDE.
255
+
256
+ """
257
+ points = atleast_2d(asarray(points))
258
+
259
+ d, m = points.shape
260
+ if d != self.d:
261
+ if d == 1 and m == self.d:
262
+ # points was passed in as a row vector
263
+ points = reshape(points, (self.d, 1))
264
+ m = 1
265
+ else:
266
+ msg = (f"points have dimension {d}, "
267
+ f"dataset has dimension {self.d}")
268
+ raise ValueError(msg)
269
+
270
+ output_dtype, spec = _get_output_dtype(self.covariance, points)
271
+ result = gaussian_kernel_estimate[spec](
272
+ self.dataset.T, self.weights[:, None],
273
+ points.T, self.cho_cov, output_dtype)
274
+
275
+ return result[:, 0]
276
+
277
+ __call__ = evaluate
278
+
279
+ def integrate_gaussian(self, mean, cov):
280
+ """
281
+ Multiply estimated density by a multivariate Gaussian and integrate
282
+ over the whole space.
283
+
284
+ Parameters
285
+ ----------
286
+ mean : array_like
287
+ A 1-D array, specifying the mean of the Gaussian.
288
+ cov : array_like
289
+ A 2-D array, specifying the covariance matrix of the Gaussian.
290
+
291
+ Returns
292
+ -------
293
+ result : scalar
294
+ The value of the integral.
295
+
296
+ Raises
297
+ ------
298
+ ValueError
299
+ If the mean or covariance of the input Gaussian differs from
300
+ the KDE's dimensionality.
301
+
302
+ """
303
+ mean = atleast_1d(squeeze(mean))
304
+ cov = atleast_2d(cov)
305
+
306
+ if mean.shape != (self.d,):
307
+ raise ValueError("mean does not have dimension %s" % self.d)
308
+ if cov.shape != (self.d, self.d):
309
+ raise ValueError("covariance does not have dimension %s" % self.d)
310
+
311
+ # make mean a column vector
312
+ mean = mean[:, newaxis]
313
+
314
+ sum_cov = self.covariance + cov
315
+
316
+ # This will raise LinAlgError if the new cov matrix is not s.p.d
317
+ # cho_factor returns (ndarray, bool) where bool is a flag for whether
318
+ # or not ndarray is upper or lower triangular
319
+ sum_cov_chol = linalg.cho_factor(sum_cov)
320
+
321
+ diff = self.dataset - mean
322
+ tdiff = linalg.cho_solve(sum_cov_chol, diff)
323
+
324
+ sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
325
+ norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
326
+
327
+ energies = sum(diff * tdiff, axis=0) / 2.0
328
+ result = sum(exp(-energies)*self.weights, axis=0) / norm_const
329
+
330
+ return result
331
+
332
+ def integrate_box_1d(self, low, high):
333
+ """
334
+ Computes the integral of a 1D pdf between two bounds.
335
+
336
+ Parameters
337
+ ----------
338
+ low : scalar
339
+ Lower bound of integration.
340
+ high : scalar
341
+ Upper bound of integration.
342
+
343
+ Returns
344
+ -------
345
+ value : scalar
346
+ The result of the integral.
347
+
348
+ Raises
349
+ ------
350
+ ValueError
351
+ If the KDE is over more than one dimension.
352
+
353
+ """
354
+ if self.d != 1:
355
+ raise ValueError("integrate_box_1d() only handles 1D pdfs")
356
+
357
+ stdev = ravel(sqrt(self.covariance))[0]
358
+
359
+ normalized_low = ravel((low - self.dataset) / stdev)
360
+ normalized_high = ravel((high - self.dataset) / stdev)
361
+
362
+ value = np.sum(self.weights*(
363
+ special.ndtr(normalized_high) -
364
+ special.ndtr(normalized_low)))
365
+ return value
366
+
367
+ def integrate_box(self, low_bounds, high_bounds, maxpts=None):
368
+ """Computes the integral of a pdf over a rectangular interval.
369
+
370
+ Parameters
371
+ ----------
372
+ low_bounds : array_like
373
+ A 1-D array containing the lower bounds of integration.
374
+ high_bounds : array_like
375
+ A 1-D array containing the upper bounds of integration.
376
+ maxpts : int, optional
377
+ The maximum number of points to use for integration.
378
+
379
+ Returns
380
+ -------
381
+ value : scalar
382
+ The result of the integral.
383
+
384
+ """
385
+ if maxpts is not None:
386
+ extra_kwds = {'maxpts': maxpts}
387
+ else:
388
+ extra_kwds = {}
389
+
390
+ value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
391
+ self.dataset, self.weights,
392
+ self.covariance, **extra_kwds)
393
+ if inform:
394
+ msg = ('An integral in _mvn.mvnun requires more points than %s' %
395
+ (self.d * 1000))
396
+ warnings.warn(msg, stacklevel=2)
397
+
398
+ return value
399
+
400
+ def integrate_kde(self, other):
401
+ """
402
+ Computes the integral of the product of this kernel density estimate
403
+ with another.
404
+
405
+ Parameters
406
+ ----------
407
+ other : gaussian_kde instance
408
+ The other kde.
409
+
410
+ Returns
411
+ -------
412
+ value : scalar
413
+ The result of the integral.
414
+
415
+ Raises
416
+ ------
417
+ ValueError
418
+ If the KDEs have different dimensionality.
419
+
420
+ """
421
+ if other.d != self.d:
422
+ raise ValueError("KDEs are not the same dimensionality")
423
+
424
+ # we want to iterate over the smallest number of points
425
+ if other.n < self.n:
426
+ small = other
427
+ large = self
428
+ else:
429
+ small = self
430
+ large = other
431
+
432
+ sum_cov = small.covariance + large.covariance
433
+ sum_cov_chol = linalg.cho_factor(sum_cov)
434
+ result = 0.0
435
+ for i in range(small.n):
436
+ mean = small.dataset[:, i, newaxis]
437
+ diff = large.dataset - mean
438
+ tdiff = linalg.cho_solve(sum_cov_chol, diff)
439
+
440
+ energies = sum(diff * tdiff, axis=0) / 2.0
441
+ result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
442
+
443
+ sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
444
+ norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
445
+
446
+ result /= norm_const
447
+
448
+ return result
449
+
450
+ def resample(self, size=None, seed=None):
451
+ """Randomly sample a dataset from the estimated pdf.
452
+
453
+ Parameters
454
+ ----------
455
+ size : int, optional
456
+ The number of samples to draw. If not provided, then the size is
457
+ the same as the effective number of samples in the underlying
458
+ dataset.
459
+ seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
460
+ If `seed` is None (or `np.random`), the `numpy.random.RandomState`
461
+ singleton is used.
462
+ If `seed` is an int, a new ``RandomState`` instance is used,
463
+ seeded with `seed`.
464
+ If `seed` is already a ``Generator`` or ``RandomState`` instance then
465
+ that instance is used.
466
+
467
+ Returns
468
+ -------
469
+ resample : (self.d, `size`) ndarray
470
+ The sampled dataset.
471
+
472
+ """ # numpy/numpydoc#87 # noqa: E501
473
+ if size is None:
474
+ size = int(self.neff)
475
+
476
+ random_state = check_random_state(seed)
477
+ norm = transpose(random_state.multivariate_normal(
478
+ zeros((self.d,), float), self.covariance, size=size
479
+ ))
480
+ indices = random_state.choice(self.n, size=size, p=self.weights)
481
+ means = self.dataset[:, indices]
482
+
483
+ return means + norm
484
+
485
+ def scotts_factor(self):
486
+ """Compute Scott's factor.
487
+
488
+ Returns
489
+ -------
490
+ s : float
491
+ Scott's factor.
492
+ """
493
+ return power(self.neff, -1./(self.d+4))
494
+
495
+ def silverman_factor(self):
496
+ """Compute the Silverman factor.
497
+
498
+ Returns
499
+ -------
500
+ s : float
501
+ The silverman factor.
502
+ """
503
+ return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
504
+
505
+ # Default method to calculate bandwidth, can be overwritten by subclass
506
+ covariance_factor = scotts_factor
507
+ covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
508
+ multiplies the data covariance matrix to obtain the kernel covariance
509
+ matrix. The default is `scotts_factor`. A subclass can overwrite this
510
+ method to provide a different method, or set it through a call to
511
+ `kde.set_bandwidth`."""
512
+
513
+ def set_bandwidth(self, bw_method=None):
514
+ """Compute the estimator bandwidth with given method.
515
+
516
+ The new bandwidth calculated after a call to `set_bandwidth` is used
517
+ for subsequent evaluations of the estimated density.
518
+
519
+ Parameters
520
+ ----------
521
+ bw_method : str, scalar or callable, optional
522
+ The method used to calculate the estimator bandwidth. This can be
523
+ 'scott', 'silverman', a scalar constant or a callable. If a
524
+ scalar, this will be used directly as `kde.factor`. If a callable,
525
+ it should take a `gaussian_kde` instance as only parameter and
526
+ return a scalar. If None (default), nothing happens; the current
527
+ `kde.covariance_factor` method is kept.
528
+
529
+ Notes
530
+ -----
531
+ .. versionadded:: 0.11
532
+
533
+ Examples
534
+ --------
535
+ >>> import numpy as np
536
+ >>> import scipy.stats as stats
537
+ >>> x1 = np.array([-7, -5, 1, 4, 5.])
538
+ >>> kde = stats.gaussian_kde(x1)
539
+ >>> xs = np.linspace(-10, 10, num=50)
540
+ >>> y1 = kde(xs)
541
+ >>> kde.set_bandwidth(bw_method='silverman')
542
+ >>> y2 = kde(xs)
543
+ >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
544
+ >>> y3 = kde(xs)
545
+
546
+ >>> import matplotlib.pyplot as plt
547
+ >>> fig, ax = plt.subplots()
548
+ >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
549
+ ... label='Data points (rescaled)')
550
+ >>> ax.plot(xs, y1, label='Scott (default)')
551
+ >>> ax.plot(xs, y2, label='Silverman')
552
+ >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
553
+ >>> ax.legend()
554
+ >>> plt.show()
555
+
556
+ """
557
+ if bw_method is None:
558
+ pass
559
+ elif bw_method == 'scott':
560
+ self.covariance_factor = self.scotts_factor
561
+ elif bw_method == 'silverman':
562
+ self.covariance_factor = self.silverman_factor
563
+ elif np.isscalar(bw_method) and not isinstance(bw_method, str):
564
+ self._bw_method = 'use constant'
565
+ self.covariance_factor = lambda: bw_method
566
+ elif callable(bw_method):
567
+ self._bw_method = bw_method
568
+ self.covariance_factor = lambda: self._bw_method(self)
569
+ else:
570
+ msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
571
+ "or a callable."
572
+ raise ValueError(msg)
573
+
574
+ self._compute_covariance()
575
+
576
+ def _compute_covariance(self):
577
+ """Computes the covariance matrix for each Gaussian kernel using
578
+ covariance_factor().
579
+ """
580
+ self.factor = self.covariance_factor()
581
+ # Cache covariance and Cholesky decomp of covariance
582
+ if not hasattr(self, '_data_cho_cov'):
583
+ self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
584
+ bias=False,
585
+ aweights=self.weights))
586
+ self._data_cho_cov = linalg.cholesky(self._data_covariance,
587
+ lower=True)
588
+
589
+ self.covariance = self._data_covariance * self.factor**2
590
+ self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
591
+ self.log_det = 2*np.log(np.diag(self.cho_cov
592
+ * np.sqrt(2*pi))).sum()
593
+
594
+ @property
595
+ def inv_cov(self):
596
+ # Re-compute from scratch each time because I'm not sure how this is
597
+ # used in the wild. (Perhaps users change the `dataset`, since it's
598
+ # not a private attribute?) `_compute_covariance` used to recalculate
599
+ # all these, so we'll recalculate everything now that this is a
600
+ # a property.
601
+ self.factor = self.covariance_factor()
602
+ self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
603
+ bias=False, aweights=self.weights))
604
+ return linalg.inv(self._data_covariance) / self.factor**2
605
+
606
+ def pdf(self, x):
607
+ """
608
+ Evaluate the estimated pdf on a provided set of points.
609
+
610
+ Notes
611
+ -----
612
+ This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
613
+ docstring for more details.
614
+
615
+ """
616
+ return self.evaluate(x)
617
+
618
+ def logpdf(self, x):
619
+ """
620
+ Evaluate the log of the estimated pdf on a provided set of points.
621
+ """
622
+ points = atleast_2d(x)
623
+
624
+ d, m = points.shape
625
+ if d != self.d:
626
+ if d == 1 and m == self.d:
627
+ # points was passed in as a row vector
628
+ points = reshape(points, (self.d, 1))
629
+ m = 1
630
+ else:
631
+ msg = (f"points have dimension {d}, "
632
+ f"dataset has dimension {self.d}")
633
+ raise ValueError(msg)
634
+
635
+ output_dtype, spec = _get_output_dtype(self.covariance, points)
636
+ result = gaussian_kernel_estimate_log[spec](
637
+ self.dataset.T, self.weights[:, None],
638
+ points.T, self.cho_cov, output_dtype)
639
+
640
+ return result[:, 0]
641
+
642
+ def marginal(self, dimensions):
643
+ """Return a marginal KDE distribution
644
+
645
+ Parameters
646
+ ----------
647
+ dimensions : int or 1-d array_like
648
+ The dimensions of the multivariate distribution corresponding
649
+ with the marginal variables, that is, the indices of the dimensions
650
+ that are being retained. The other dimensions are marginalized out.
651
+
652
+ Returns
653
+ -------
654
+ marginal_kde : gaussian_kde
655
+ An object representing the marginal distribution.
656
+
657
+ Notes
658
+ -----
659
+ .. versionadded:: 1.10.0
660
+
661
+ """
662
+
663
+ dims = np.atleast_1d(dimensions)
664
+
665
+ if not np.issubdtype(dims.dtype, np.integer):
666
+ msg = ("Elements of `dimensions` must be integers - the indices "
667
+ "of the marginal variables being retained.")
668
+ raise ValueError(msg)
669
+
670
+ n = len(self.dataset) # number of dimensions
671
+ original_dims = dims.copy()
672
+
673
+ dims[dims < 0] = n + dims[dims < 0]
674
+
675
+ if len(np.unique(dims)) != len(dims):
676
+ msg = ("All elements of `dimensions` must be unique.")
677
+ raise ValueError(msg)
678
+
679
+ i_invalid = (dims < 0) | (dims >= n)
680
+ if np.any(i_invalid):
681
+ msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
682
+ f"for a distribution in {n} dimensions.")
683
+ raise ValueError(msg)
684
+
685
+ dataset = self.dataset[dims]
686
+ weights = self.weights
687
+
688
+ return gaussian_kde(dataset, bw_method=self.covariance_factor(),
689
+ weights=weights)
690
+
691
+ @property
692
+ def weights(self):
693
+ try:
694
+ return self._weights
695
+ except AttributeError:
696
+ self._weights = ones(self.n)/self.n
697
+ return self._weights
698
+
699
+ @property
700
+ def neff(self):
701
+ try:
702
+ return self._neff
703
+ except AttributeError:
704
+ self._neff = 1/sum(self.weights**2)
705
+ return self._neff
706
+
707
+
708
+ def _get_output_dtype(covariance, points):
709
+ """
710
+ Calculates the output dtype and the "spec" (=C type name).
711
+
712
+ This was necessary in order to deal with the fused types in the Cython
713
+ routine `gaussian_kernel_estimate`. See gh-10824 for details.
714
+ """
715
+ output_dtype = np.common_type(covariance, points)
716
+ itemsize = np.dtype(output_dtype).itemsize
717
+ if itemsize == 4:
718
+ spec = 'float'
719
+ elif itemsize == 8:
720
+ spec = 'double'
721
+ elif itemsize in (12, 16):
722
+ spec = 'long double'
723
+ else:
724
+ raise ValueError(
725
+ f"{output_dtype} has unexpected item size: {itemsize}"
726
+ )
727
+
728
+ return output_dtype, spec
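
The bandwidth rules described in the Notes are straightforward to verify on unweighted data, where ``neff`` equals ``n``. A minimal sketch for a 1-D sample (``d = 1``, ``n = 100``), checking Scott's rule ``n**(-1/(d+4))`` and Silverman's rule ``(n*(d+2)/4)**(-1/(d+4))``:

>>> import numpy as np
>>> from scipy.stats import gaussian_kde
>>> rng = np.random.default_rng(1234)
>>> x = rng.normal(size=100)              # d = 1, n = 100, unweighted
>>> kde = gaussian_kde(x)                 # default bandwidth: Scott's rule
>>> bool(np.isclose(kde.factor, 100 ** (-1. / 5)))
True
>>> kde.set_bandwidth('silverman')
>>> bool(np.isclose(kde.factor, (100 * 3 / 4.) ** (-1. / 5)))
True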
.venv/Lib/site-packages/scipy/stats/_ksstats.py ADDED
@@ -0,0 +1,600 @@
1
+ # Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
2
+ # D_n = sup_x{|F_n(x) - F(x)|},
3
+ # F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
4
+ # F(x) is the CDF of a probability distribution.
5
+ #
6
+ # Exact methods:
7
+ # Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
8
+ # or a recursion algorithm due to Pomeranz[2].
9
+ # Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
10
+ # the Durbin algorithm.
11
+ # D_n >= d <==> D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
12
+ # Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
13
+ # For d > 0.5, the latter intersection probability is 0.
14
+ #
15
+ # Approximate methods:
16
+ # For d close to 0.5, ignoring that intersection term may still give a
17
+ # reasonable approximation.
18
+ # Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
19
+ # Kolmogorov's initial asymptotic, suitable for large d. (See
20
+ # scipy.special.kolmogorov for that asymptotic)
21
+ # Pelz-Good[6] used the functional equation for Jacobi theta functions to
22
+ # transform the Li-Chien/Korolyuk formula to produce a computational formula
23
+ # suitable for small d.
24
+ #
25
+ # Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
26
+ # the above approaches; that scheme is the one used here.
27
+ #
28
+ # Other approaches:
29
+ # Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
30
+ # Moscovich and Nadler[9] use FFTs to compute the convolutions.
31
+
32
+ # References:
33
+ # [1] Durbin J (1968).
34
+ # "The Probability that the Sample Distribution Function Lies Between Two
35
+ # Parallel Straight Lines."
36
+ # Annals of Mathematical Statistics, 39, 398-411.
37
+ # [2] Pomeranz J (1974).
38
+ # "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
39
+ # Small Samples (Algorithm 487)."
40
+ # Communications of the ACM, 17(12), 703-704.
41
+ # [3] Marsaglia G, Tsang WW, Wang J (2003).
42
+ # "Evaluating Kolmogorov's Distribution."
43
+ # Journal of Statistical Software, 8(18), 1-4.
44
+ # [4] LI-CHIEN, C. (1956).
45
+ # "On the exact distribution of the statistics of A. N. Kolmogorov and
46
+ # their asymptotic expansion."
47
+ # Acta Matematica Sinica, 6, 55-81.
48
+ # [5] KOROLYUK, V. S. (1960).
49
+ # "Asymptotic analysis of the distribution of the maximum deviation in
50
+ # the Bernoulli scheme."
51
+ # Theor. Probability Appl., 4, 339-366.
52
+ # [6] Pelz W, Good IJ (1976).
53
+ # "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
54
+ # Statistic."
55
+ # Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
56
+ # [7] Simard, R., L'Ecuyer, P. (2011)
57
+ # "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
58
+ # Journal of Statistical Software, Vol 39, 11, 1-18.
59
+ # [8] Carvalho, Luis (2015)
60
+ # "An Improved Evaluation of Kolmogorov's Distribution"
61
+ # Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
62
+ # [9] Amit Moscovich, Boaz Nadler (2017)
63
+ # "Fast calculation of boundary crossing probabilities for Poisson
64
+ # processes",
65
+ # Statistics & Probability Letters, Vol 123, 177-182.
66
+
67
+
68
+ import numpy as np
69
+ import scipy.special
70
+ import scipy.special._ufuncs as scu
71
+ from scipy._lib._finite_differences import _derivative
72
+
73
+ _E128 = 128
74
+ _EP128 = np.ldexp(np.longdouble(1), _E128)
75
+ _EM128 = np.ldexp(np.longdouble(1), -_E128)
76
+
77
+ _SQRT2PI = np.sqrt(2 * np.pi)
78
+ _LOG_2PI = np.log(2 * np.pi)
79
+ _MIN_LOG = -708
80
+ _SQRT3 = np.sqrt(3)
81
+ _PI_SQUARED = np.pi ** 2
82
+ _PI_FOUR = np.pi ** 4
83
+ _PI_SIX = np.pi ** 6
84
+
85
+ # [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
86
+ # then Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...1.
87
+ _STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
88
+ -1.9175269175269175269e-3, 8.4175084175084175084e-4,
89
+ -5.952380952380952381e-4, 7.9365079365079365079e-4,
90
+ -2.7777777777777777778e-3, 8.3333333333333333333e-2]
91
+
92
+
93
+ def _log_nfactorial_div_n_pow_n(n):
94
+ # Computes n! / n**n
95
+ # = (n-1)! / n**(n-1)
96
+ # Uses Stirling's approximation, but removes n*log(n) up-front to
97
+ # avoid subtractive cancellation.
98
+ # = log(n)/2 - n + log(sqrt(2pi)) + sum B_{2j}/(2j)/(2j-1)/n**(2j-1)
99
+ rn = 1.0/n
100
+ return np.log(n)/2 - n + _LOG_2PI/2 + rn * np.polyval(_STIRLING_COEFFS, rn/n)
101
+
102
+
103
+ def _clip_prob(p):
104
+ """clips a probability to range 0<=p<=1."""
105
+ return np.clip(p, 0.0, 1.0)
106
+
107
+
108
+ def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
109
+ """Selects either the CDF or SF, and then clips to range 0<=p<=1."""
110
+ p = np.where(cdf, cdfprob, sfprob)
111
+ return _clip_prob(p)
112
+
113
+
114
+ def _kolmogn_DMTW(n, d, cdf=True):
115
+ r"""Computes the Kolmogorov CDF: Pr(D_n <= d) using the MTW approach to
116
+ the Durbin matrix algorithm.
117
+
118
+ Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].
119
+ """
120
+ # Write d = (k-h)/n, where k is positive integer and 0 <= h < 1
121
+ # Generate initial matrix H of size m*m where m=(2k-1)
122
+ # Compute k-th row of (n!/n^n) * H^n, scaling intermediate results.
123
+ # Requires memory O(m^2) and computation O(m^2 log(n)).
124
+ # Most suitable for small m.
125
+
126
+ if d >= 1.0:
127
+ return _select_and_clip_prob(1.0, 0.0, cdf)
128
+ nd = n * d
129
+ if nd <= 0.5:
130
+ return _select_and_clip_prob(0.0, 1.0, cdf)
131
+ k = int(np.ceil(nd))
132
+ h = k - nd
133
+ m = 2 * k - 1
134
+
135
+ H = np.zeros([m, m])
136
+
137
+ # Initialize: v is first column (and last row) of H
138
+ # v[j] = (1-h^(j+1))/(j+1)! (except for v[-1])
139
+ # w[j] = 1/(j)!
140
+ # q = k-th row of H (actually i!/n^i*H^i)
141
+ intm = np.arange(1, m + 1)
142
+ v = 1.0 - h ** intm
143
+ w = np.empty(m)
144
+ fac = 1.0
145
+ for j in intm:
146
+ w[j - 1] = fac
147
+ fac /= j # This might underflow. Isn't a problem.
148
+ v[j - 1] *= fac
149
+ tt = max(2 * h - 1.0, 0)**m - 2*h**m
150
+ v[-1] = (1.0 + tt) * fac
151
+
152
+ for i in range(1, m):
153
+ H[i - 1:, i] = w[:m - i + 1]
154
+ H[:, 0] = v
155
+ H[-1, :] = np.flip(v, axis=0)
156
+
157
+ Hpwr = np.eye(np.shape(H)[0]) # Holds intermediate powers of H
158
+ nn = n
159
+ expnt = 0 # Scaling of Hpwr
160
+ Hexpnt = 0 # Scaling of H
161
+ while nn > 0:
162
+ if nn % 2:
163
+ Hpwr = np.matmul(Hpwr, H)
164
+ expnt += Hexpnt
165
+ H = np.matmul(H, H)
166
+ Hexpnt *= 2
167
+ # Scale as needed.
168
+ if np.abs(H[k - 1, k - 1]) > _EP128:
169
+ H /= _EP128
170
+ Hexpnt += _E128
171
+ nn = nn // 2
172
+
173
+ p = Hpwr[k - 1, k - 1]
174
+
175
+ # Multiply by n!/n^n
176
+ for i in range(1, n + 1):
177
+ p = i * p / n
178
+ if np.abs(p) < _EM128:
179
+ p *= _EP128
180
+ expnt -= _E128
181
+
182
+ # unscale
183
+ if expnt != 0:
184
+ p = np.ldexp(p, expnt)
185
+
186
+ return _select_and_clip_prob(p, 1.0-p, cdf)
187
+
188
+
189
+ def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
190
+ """Compute the endpoints of the interval for row i."""
191
+ if i == 0:
192
+ j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
193
+ else:
194
+ # i + 1 = 2*ip1div2 + ip1mod2
195
+ ip1div2, ip1mod2 = divmod(i + 1, 2)
196
+ if ip1mod2 == 0: # i is odd
197
+ if ip1div2 == n + 1:
198
+ j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
199
+ else:
200
+ j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
201
+ else:
202
+ j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1
203
+
204
+ return max(j1 + 2, 0), min(j2, n)
205
+
206
+
207
+ def _kolmogn_Pomeranz(n, x, cdf=True):
208
+ r"""Computes Pr(D_n <= d) using the Pomeranz recursion algorithm.
209
+
210
+ Pomeranz (1974) [2]
211
+ """
212
+
213
+ # V is n*(2n+2) matrix.
214
+ # Each row is convolution of the previous row and probabilities from a
215
+ # Poisson distribution.
216
+ # Desired CDF probability is n! V[n-1, 2n+1] (final entry in final row).
217
+ # Only two rows are needed at any given stage:
218
+ # - Call them V0 and V1.
219
+ # - Swap each iteration
220
+ # Only a few (contiguous) entries in each row can be non-zero.
221
+ # - Keep track of start and end (j1 and j2 below)
222
+ # - V0s and V1s track the start in the two rows
223
+ # Scale intermediate results as needed.
224
+ # Only a few different Poisson distributions can occur
225
+ t = n * x
226
+ ll = int(np.floor(t))
227
+ f = 1.0 * (t - ll) # fractional part of t
228
+ g = min(f, 1.0 - f)
229
+ ceilf = (1 if f > 0 else 0)
230
+ roundf = (1 if f > 0.5 else 0)
231
+ npwrs = 2 * (ll + 1) # Maximum number of powers needed in convolutions
232
+ gpower = np.empty(npwrs) # gpower = (g/n)^m/m!
233
+ twogpower = np.empty(npwrs) # twogpower = (2g/n)^m/m!
234
+ onem2gpower = np.empty(npwrs) # onem2gpower = ((1-2g)/n)^m/m!
235
+ # gpower etc are *almost* Poisson probs, just missing normalizing factor.
236
+
237
+ gpower[0] = 1.0
238
+ twogpower[0] = 1.0
239
+ onem2gpower[0] = 1.0
240
+ expnt = 0
241
+ g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
242
+ for m in range(1, npwrs):
243
+ gpower[m] = gpower[m - 1] * g_over_n / m
244
+ twogpower[m] = twogpower[m - 1] * two_g_over_n / m
245
+ onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m
246
+
247
+ V0 = np.zeros([npwrs])
248
+ V1 = np.zeros([npwrs])
249
+ V1[0] = 1 # first row
250
+ V0s, V1s = 0, 0 # start indices of the two rows
251
+
252
+ j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
253
+ for i in range(1, 2 * n + 2):
254
+ # Preserve j1, V1, V1s, V0s from last iteration
255
+ k1 = j1
256
+ V0, V1 = V1, V0
257
+ V0s, V1s = V1s, V0s
258
+ V1.fill(0.0)
259
+ j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
260
+ if i == 1 or i == 2 * n + 1:
261
+ pwrs = gpower
262
+ else:
263
+ pwrs = (twogpower if i % 2 else onem2gpower)
264
+ ln2 = j2 - k1 + 1
265
+ if ln2 > 0:
266
+ conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
267
+ conv_start = j1 - k1 # First index to use from conv
268
+ conv_len = j2 - j1 + 1 # Number of entries to use from conv
269
+ V1[:conv_len] = conv[conv_start:conv_start + conv_len]
270
+ # Scale to avoid underflow.
271
+ if 0 < np.max(V1) < _EM128:
272
+ V1 *= _EP128
273
+ expnt -= _E128
274
+ V1s = V0s + j1 - k1
275
+
276
+ # multiply by n!
277
+ ans = V1[n - V1s]
278
+ for m in range(1, n + 1):
279
+ if np.abs(ans) > _EP128:
280
+ ans *= _EM128
281
+ expnt += _E128
282
+ ans *= m
283
+
284
+ # Undo any intermediate scaling
285
+ if expnt != 0:
286
+ ans = np.ldexp(ans, expnt)
287
+ ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
288
+ return ans
289
+
290
+
291
+ def _kolmogn_PelzGood(n, x, cdf=True):
292
+ """Computes the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.
293
+
294
+ Start with Li-Chien, Korolyuk approximation:
295
+ Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
296
+ where z = x*sqrt(n).
297
+ Transform each K_(z) using Jacobi theta functions into a form suitable
298
+ for small z.
299
+ Pelz-Good (1976). [6]
300
+ """
301
+ if x <= 0.0:
302
+ return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
303
+ if x >= 1.0:
304
+ return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
305
+
306
+ z = np.sqrt(n) * x
307
+ zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6
308
+
309
+ qlog = -_PI_SQUARED / 8 / zsquared
310
+ if qlog < _MIN_LOG: # z ~ 0.041743441416853426
311
+ return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
312
+
313
+ q = np.exp(qlog)
314
+
315
+ # Coefficients of terms in the sums for K1, K2 and K3
316
+ k1a = -zsquared
317
+ k1b = _PI_SQUARED / 4
318
+
319
+ k2a = 6 * zsix + 2 * zfour
320
+ k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
321
+ k2c = _PI_FOUR * (1 - 2 * zsquared) / 16
322
+
323
+ k3d = _PI_SIX * (5 - 30 * zsquared) / 64
324
+ k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
325
+ k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
326
+ k3a = -30 * zsix - 90 * z**8
327
+
328
+ K0to3 = np.zeros(4)
329
+ # Use a Horner scheme to evaluate sum c_i q^(i^2)
330
+ # Reduces to a sum over odd integers.
331
+ maxk = int(np.ceil(16 * z / np.pi))
332
+ for k in range(maxk, 0, -1):
333
+ m = 2 * k - 1
334
+ msquared, mfour, msix = m**2, m**4, m**6
335
+ qpower = np.power(q, 8 * k)
336
+ coeffs = np.array([1.0,
337
+ k1a + k1b*msquared,
338
+ k2a + k2b*msquared + k2c*mfour,
339
+ k3a + k3b*msquared + k3c*mfour + k3d*msix])
340
+ K0to3 *= qpower
341
+ K0to3 += coeffs
342
+ K0to3 *= q
343
+ K0to3 *= _SQRT2PI
344
+ # z**10 > 0 as z > 0.04
345
+ K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])
346
+
347
+ # Now do the other sum over the other terms, all integers k
348
+ # K_2: (pi^2 k^2) q^(k^2),
349
+ # K_3: (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
350
+ # Don't expect much subtractive cancellation so use direct calculation
351
+ q = np.exp(-_PI_SQUARED / 2 / zsquared)
352
+ ks = np.arange(maxk, 0, -1)
353
+ ksquared = ks ** 2
354
+ sqrt3z = _SQRT3 * z
355
+ kspi = np.pi * ks
356
+ qpwers = q ** ksquared
357
+ k2extra = np.sum(ksquared * qpwers)
358
+ k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
359
+ K0to3[2] += k2extra
360
+ k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
361
+ k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
362
+ K0to3[3] += k3extra
363
+ powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)
364
+ K0to3 /= powers_of_n
365
+
366
+ if not cdf:
367
+ K0to3 *= -1
368
+ K0to3[0] += 1
369
+
370
+ Ksum = sum(K0to3)
371
+ return Ksum
372
+
373
+
374
+ def _kolmogn(n, x, cdf=True):
375
+ """Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.
376
+
377
+ x must be of type float, n of type integer.
378
+
379
+ Simard & L'Ecuyer (2011) [7].
380
+ """
381
+ if np.isnan(n):
382
+ return n # Keep the same type of nan
383
+ if int(n) != n or n <= 0:
384
+ return np.nan
385
+ if x >= 1.0:
386
+ return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
387
+ if x <= 0.0:
388
+ return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
389
+ t = n * x
390
+ if t <= 1.0: # Ruben-Gambino: 1/2n <= x <= 1/n
391
+ if t <= 0.5:
392
+ return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
393
+ if n <= 140:
394
+ prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
395
+ else:
396
+ prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
397
+ return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
398
+ if t >= n - 1: # Ruben-Gambino
399
+ prob = 2 * (1.0 - x)**n
400
+ return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
401
+ if x >= 0.5: # Exact: 2 * smirnov
402
+ prob = 2 * scipy.special.smirnov(n, x)
403
+ return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
404
+
405
+ nxsquared = t * x
406
+ if n <= 140:
407
+ if nxsquared <= 0.754693:
408
+ prob = _kolmogn_DMTW(n, x, cdf=True)
409
+ return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
410
+ if nxsquared <= 4:
411
+ prob = _kolmogn_Pomeranz(n, x, cdf=True)
412
+ return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
413
+ # Now use Miller approximation of 2*smirnov
414
+ prob = 2 * scipy.special.smirnov(n, x)
415
+ return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
416
+
417
+ # Split CDF and SF as they have different cutoffs on nxsquared.
418
+ if not cdf:
419
+ if nxsquared >= 370.0:
420
+ return 0.0
421
+ if nxsquared >= 2.2:
422
+ prob = 2 * scipy.special.smirnov(n, x)
423
+ return _clip_prob(prob)
424
+ # Fall through and compute the SF as 1.0-CDF
425
+ if nxsquared >= 18.0:
426
+ cdfprob = 1.0
427
+ elif n <= 100000 and n * x**1.5 <= 1.4:
428
+ cdfprob = _kolmogn_DMTW(n, x, cdf=True)
429
+ else:
430
+ cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
431
+ return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
432
+
433
+
434
+ def _kolmogn_p(n, x):
435
+ """Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.
436
+
437
+ x must be of type float, n of type integer.
438
+ """
439
+ if np.isnan(n):
440
+ return n # Keep the same type of nan
441
+ if int(n) != n or n <= 0:
442
+ return np.nan
443
+ if x >= 1.0 or x <= 0:
444
+ return 0
445
+ t = n * x
446
+ if t <= 1.0:
447
+ # Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
448
+ if t <= 0.5:
449
+ return 0.0
450
+ if n <= 140:
451
+ prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
452
+ else:
453
+ prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
454
+ return prd * 2 * n**2
455
+ if t >= n - 1:
456
+ # Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
457
+ return 2 * (1.0 - x) ** (n-1) * n
458
+ if x >= 0.5:
459
+ return 2 * scipy.stats.ksone.pdf(x, n)
460
+
461
+ # Just take a small delta.
462
+ # Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer i,
463
+ # as the CDF is a piecewise degree n polynomial.
464
+ # It has knots at 1/n, 2/n, ... (n-1)/n
465
+ # and is not a C-infinity function at the knots
466
+ delta = x / 2.0**16
467
+ delta = min(delta, x - 1.0/n)
468
+ delta = min(delta, 0.5 - x)
469
+
470
+ def _kk(_x):
471
+ return kolmogn(n, _x)
472
+
473
+ return _derivative(_kk, x, dx=delta, order=5)
474
+
475
+
476
+ def _kolmogni(n, p, q):
477
+ """Computes the PPF/ISF of kolmogn.
478
+
479
+ n of type integer, n>= 1
480
+ p is the CDF, q the SF, p+q=1
481
+ """
482
+ if np.isnan(n):
483
+ return n # Keep the same type of nan
484
+ if int(n) != n or n <= 0:
485
+ return np.nan
486
+ if p <= 0:
487
+ return 1.0/n
488
+ if q <= 0:
489
+ return 1.0
490
+ delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
491
+ if delta <= 1.0/n:
492
+ return (delta + 1.0 / n) / 2
493
+ x = -np.expm1(np.log(q/2.0)/n)
494
+ if x >= 1 - 1.0/n:
495
+ return x
496
+ x1 = scu._kolmogci(p)/np.sqrt(n)
497
+ x1 = min(x1, 1.0 - 1.0/n)
498
+
499
+ def _f(x):
500
+ return _kolmogn(n, x) - p
501
+
502
+ return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
503
+
504
+
505
+ def kolmogn(n, x, cdf=True):
506
+ """Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.
507
+
508
+ The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
509
+ for a sample of size n drawn from a distribution with CDF F(t), where
510
+ :math:`D_n &= sup_t |F_n(t) - F(t)|`, and
511
+ :math:`F_n(t)` is the Empirical Cumulative Distribution Function of the sample.
512
+
513
+ Parameters
514
+ ----------
515
+ n : integer, array_like
516
+ the number of samples
517
+ x : float, array_like
518
+ The K-S statistic, float between 0 and 1
519
+ cdf : bool, optional
520
+ Whether to compute the CDF (default=True) or the SF.
521
+
522
+ Returns
523
+ -------
524
+ cdf : ndarray
525
+ CDF (or SF if cdf is False) at the specified locations.
526
+
527
+ The return value has shape the result of numpy broadcasting n and x.
528
+ """
529
+ it = np.nditer([n, x, cdf, None],
530
+ op_dtypes=[None, np.float64, np.bool_, np.float64])
531
+ for _n, _x, _cdf, z in it:
532
+ if np.isnan(_n):
533
+ z[...] = _n
534
+ continue
535
+ if int(_n) != _n:
536
+ raise ValueError(f'n is not integral: {_n}')
537
+ z[...] = _kolmogn(int(_n), _x, cdf=_cdf)
538
+ result = it.operands[-1]
539
+ return result
540
+
541
+
542
+ def kolmognp(n, x):
543
+ """Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.
544
+
545
+ Parameters
546
+ ----------
547
+ n : integer, array_like
548
+ the number of samples
549
+ x : float, array_like
550
+ The K-S statistic, float between 0 and 1
551
+
552
+ Returns
553
+ -------
554
+ pdf : ndarray
555
+ The PDF at the specified locations
556
+
557
+ The return value has the shape produced by numpy broadcasting of n and x.
558
+ """
559
+ it = np.nditer([n, x, None])
560
+ for _n, _x, z in it:
561
+ if np.isnan(_n):
562
+ z[...] = _n
563
+ continue
564
+ if int(_n) != _n:
565
+ raise ValueError(f'n is not integral: {_n}')
566
+ z[...] = _kolmogn_p(int(_n), _x)
567
+ result = it.operands[-1]
568
+ return result
569
+
570
+
571
+ def kolmogni(n, q, cdf=True):
572
+ """Computes the PPF(or ISF) for the two-sided Kolmogorov-Smirnov distribution.
573
+
574
+ Parameters
575
+ ----------
576
+ n : integer, array_like
577
+ the number of samples
578
+ q : float, array_like
579
+ Probabilities, float between 0 and 1
580
+ cdf : bool, optional
581
+ Whether to compute the PPF (default=True) or the ISF.
582
+
583
+ Returns
584
+ -------
585
+ ppf : ndarray
586
+ PPF (or ISF if cdf is False) at the specified locations
587
+
588
+ The return value has the shape produced by numpy broadcasting of n and q.
589
+ """
590
+ it = np.nditer([n, q, cdf, None])
591
+ for _n, _q, _cdf, z in it:
592
+ if np.isnan(_n):
593
+ z[...] = _n
594
+ continue
595
+ if int(_n) != _n:
596
+ raise ValueError(f'n is not integral: {_n}')
597
+ _pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
598
+ z[...] = _kolmogni(int(_n), _pcdf, _psf)
599
+ result = it.operands[-1]
600
+ return result
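
The identity noted in the header comments (for d > 0.5 the intersection term vanishes, so the two-sided survival function is exactly twice the one-sided Smirnov survival function) can be spot-checked through the public `scipy.stats.kstwo` distribution, which is assumed here to be backed by `kolmogn`; a minimal sketch:

>>> import numpy as np
>>> from scipy.stats import kstwo
>>> from scipy.special import smirnov
>>> n, x = 10, 0.6                        # x >= 0.5, so the 2*smirnov branch applies
>>> bool(np.isclose(kstwo.sf(x, n), 2 * smirnov(n, x)))
True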
.venv/Lib/site-packages/scipy/stats/_mannwhitneyu.py ADDED
@@ -0,0 +1,519 @@
1
+ import numpy as np
2
+ from collections import namedtuple
3
+ from scipy import special
4
+ from scipy import stats
5
+ from scipy.stats._stats_py import _rankdata
6
+ from ._axis_nan_policy import _axis_nan_policy_factory
7
+
8
+
9
+ def _broadcast_concatenate(x, y, axis):
10
+ '''Broadcast then concatenate arrays, leaving concatenation axis last'''
11
+ x = np.moveaxis(x, axis, -1)
12
+ y = np.moveaxis(y, axis, -1)
13
+ z = np.broadcast(x[..., 0], y[..., 0])
14
+ x = np.broadcast_to(x, z.shape + (x.shape[-1],))
15
+ y = np.broadcast_to(y, z.shape + (y.shape[-1],))
16
+ z = np.concatenate((x, y), axis=-1)
17
+ return x, y, z
18
+
19
+
20
+ class _MWU:
21
+ '''Distribution of MWU statistic under the null hypothesis'''
22
+ # Possible improvement: if m and n are small enough, use integer arithmetic
23
+
24
+ def __init__(self):
25
+ '''Minimal initializer'''
26
+ self._fmnks = -np.ones((1, 1, 1))
27
+ self._recursive = None
28
+
29
+ def pmf(self, k, m, n):
30
+
31
+ # In practice, `pmf` is never called with k > m*n/2.
32
+ # If it were, we'd exploit symmetry here:
33
+ # k = np.array(k, copy=True)
34
+ # k2 = m*n - k
35
+ # i = k2 < k
36
+ # k[i] = k2[i]
37
+
38
+ if (self._recursive is None and m <= 500 and n <= 500
39
+ or self._recursive):
40
+ return self.pmf_recursive(k, m, n)
41
+ else:
42
+ return self.pmf_iterative(k, m, n)
43
+
44
+ def pmf_recursive(self, k, m, n):
45
+ '''Probability mass function, recursive version'''
46
+ self._resize_fmnks(m, n, np.max(k))
47
+ # could loop over just the unique elements, but probably not worth
48
+ # the time to find them
49
+ for i in np.ravel(k):
50
+ self._f(m, n, i)
51
+ return self._fmnks[m, n, k] / special.binom(m + n, m)
52
+
53
+ def pmf_iterative(self, k, m, n):
54
+ '''Probability mass function, iterative version'''
55
+ fmnks = {}
56
+ for i in np.ravel(k):
57
+ fmnks = _mwu_f_iterative(m, n, i, fmnks)
58
+ return (np.array([fmnks[(m, n, ki)] for ki in k])
59
+ / special.binom(m + n, m))
60
+
61
+ def cdf(self, k, m, n):
62
+ '''Cumulative distribution function'''
63
+
64
+ # In practice, `cdf` is never called with k > m*n/2.
65
+ # If it were, we'd exploit symmetry here rather than in `sf`
66
+ pmfs = self.pmf(np.arange(0, np.max(k) + 1), m, n)
67
+ cdfs = np.cumsum(pmfs)
68
+ return cdfs[k]
69
+
70
+ def sf(self, k, m, n):
71
+ '''Survival function'''
72
+ # Note that both CDF and SF include the PMF at k. The p-value is
73
+ # calculated from the SF and should include the mass at k, so this
74
+ # is desirable
75
+
76
+ # Use the fact that the distribution is symmetric; i.e.
77
+ # _f(m, n, m*n-k) = _f(m, n, k), and sum from the left
78
+ kc = np.asarray(m*n - k) # complement of k
79
+ i = k < kc
80
+ if np.any(i):
81
+ kc[i] = k[i]
82
+ cdfs = np.asarray(self.cdf(kc, m, n))
83
+ cdfs[i] = 1. - cdfs[i] + self.pmf(kc[i], m, n)
84
+ else:
85
+ cdfs = np.asarray(self.cdf(kc, m, n))
86
+ return cdfs[()]
87
+
88
+ def _resize_fmnks(self, m, n, k):
89
+ '''If necessary, expand the array that remembers PMF values'''
90
+ # could probably use `np.pad` but I'm not sure it would save code
91
+ shape_old = np.array(self._fmnks.shape)
92
+ shape_new = np.array((m+1, n+1, k+1))
93
+ if np.any(shape_new > shape_old):
94
+ shape = np.maximum(shape_old, shape_new)
95
+ fmnks = -np.ones(shape) # create the new array
96
+ m0, n0, k0 = shape_old
97
+ fmnks[:m0, :n0, :k0] = self._fmnks # copy remembered values
98
+ self._fmnks = fmnks
99
+
100
+ def _f(self, m, n, k):
101
+ '''Recursive implementation of function of [3] Theorem 2.5'''
102
+
103
+ # [3] Theorem 2.5 Line 1
104
+ if k < 0 or m < 0 or n < 0 or k > m*n:
105
+ return 0
106
+
107
+ # if already calculated, return the value
108
+ if self._fmnks[m, n, k] >= 0:
109
+ return self._fmnks[m, n, k]
110
+
111
+ if k == 0 and m >= 0 and n >= 0: # [3] Theorem 2.5 Line 2
112
+ fmnk = 1
113
+ else: # [3] Theorem 2.5 Line 3 / Equation 3
114
+ fmnk = self._f(m-1, n, k-n) + self._f(m, n-1, k)
115
+
116
+ self._fmnks[m, n, k] = fmnk # remember result
117
+
118
+ return fmnk
119
+
120
+
121
+ # Maintain state for faster repeat calls to mannwhitneyu w/ method='exact'
122
+ _mwu_state = _MWU()
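As a hedged sanity check (not scipy code), the Theorem 2.5 recurrence that `_MWU._f` implements can be reproduced directly and compared with the total number of rank arrangements, C(m+n, m):

    from math import comb
    from functools import lru_cache

    @lru_cache(maxsize=None)
    def f(m, n, k):
        # number of arrangements of m + n ranks whose U statistic equals k
        if k < 0 or m < 0 or n < 0 or k > m * n:
            return 0
        if k == 0:
            return 1
        return f(m - 1, n, k - n) + f(m, n - 1, k)

    m, n = 3, 4
    counts = [f(m, n, k) for k in range(m * n + 1)]
    print(counts)                         # symmetric about m*n/2, as expected
    print(sum(counts) == comb(m + n, m))  # True, so counts/comb(m+n, m) is a PMF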
123
+
124
+
125
+ def _mwu_f_iterative(m, n, k, fmnks):
126
+ '''Iterative implementation of function of [3] Theorem 2.5'''
127
+
128
+ def _base_case(m, n, k):
129
+ '''Base cases from recursive version'''
130
+
131
+ # if already calculated, return the value
132
+ if fmnks.get((m, n, k), -1) >= 0:
133
+ return fmnks[(m, n, k)]
134
+
135
+ # [3] Theorem 2.5 Line 1
136
+ elif k < 0 or m < 0 or n < 0 or k > m*n:
137
+ return 0
138
+
139
+ # [3] Theorem 2.5 Line 2
140
+ elif k == 0 and m >= 0 and n >= 0:
141
+ return 1
142
+
143
+ return None
144
+
145
+ stack = [(m, n, k)]
146
+ fmnk = None
147
+
148
+ while stack:
149
+ # Popping only if necessary would save a tiny bit of time, but it's not worth it.
150
+ m, n, k = stack.pop()
151
+
152
+ # If we're at a base case, continue (stack unwinds)
153
+ fmnk = _base_case(m, n, k)
154
+ if fmnk is not None:
155
+ fmnks[(m, n, k)] = fmnk
156
+ continue
157
+
158
+ # If both terms are base cases, continue (stack unwinds)
159
+ f1 = _base_case(m-1, n, k-n)
160
+ f2 = _base_case(m, n-1, k)
161
+ if f1 is not None and f2 is not None:
162
+ # [3] Theorem 2.5 Line 3 / Equation 3
163
+ fmnk = f1 + f2
164
+ fmnks[(m, n, k)] = fmnk
165
+ continue
166
+
167
+ # recurse deeper
168
+ stack.append((m, n, k))
169
+ if f1 is None:
170
+ stack.append((m-1, n, k-n))
171
+ if f2 is None:
172
+ stack.append((m, n-1, k))
173
+
174
+ return fmnks
175
+
176
+
177
+ def _get_mwu_z(U, n1, n2, t, axis=0, continuity=True):
178
+ '''Standardized MWU statistic'''
179
+ # Follows mannwhitneyu [2]
180
+ mu = n1 * n2 / 2
181
+ n = n1 + n2
182
+
183
+ # Tie correction according to [2], "Normal approximation and tie correction"
184
+ # "A more computationally-efficient form..."
185
+ tie_term = (t**3 - t).sum(axis=-1)
186
+ s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))
187
+
188
+ numerator = U - mu
189
+
190
+ # Continuity correction.
191
+ # Because SF is always used to calculate the p-value, we can always
192
+ # _subtract_ 0.5 for the continuity correction. This always increases the
193
+ # p-value to account for the rest of the probability mass _at_ q = U.
194
+ if continuity:
195
+ numerator -= 0.5
196
+
197
+ # no problem evaluating the norm SF at an infinity
198
+ with np.errstate(divide='ignore', invalid='ignore'):
199
+ z = numerator / s
200
+ return z
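A small hedged numerical illustration (the tie counts are made up, not taken from the diff) of how the tie term above shrinks the null standard deviation used for the z statistic:

    import numpy as np

    n1, n2 = 5, 4
    n = n1 + n2
    t = np.array([2, 3])              # one tie of size 2, one of size 3
    tie_term = (t**3 - t).sum()
    s_no_ties = np.sqrt(n1*n2/12 * (n + 1))
    s_ties = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n - 1))))
    print(s_no_ties, s_ties)          # ties reduce the standard deviation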
201
+
202
+
203
+ def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
204
+ ''' Input validation and standardization for mannwhitneyu '''
205
+ # Would use np.asarray_chkfinite, but infs are OK
206
+ x, y = np.atleast_1d(x), np.atleast_1d(y)
207
+ if np.isnan(x).any() or np.isnan(y).any():
208
+ raise ValueError('`x` and `y` must not contain NaNs.')
209
+ if np.size(x) == 0 or np.size(y) == 0:
210
+ raise ValueError('`x` and `y` must be of nonzero size.')
211
+
212
+ bools = {True, False}
213
+ if use_continuity not in bools:
214
+ raise ValueError(f'`use_continuity` must be one of {bools}.')
215
+
216
+ alternatives = {"two-sided", "less", "greater"}
217
+ alternative = alternative.lower()
218
+ if alternative not in alternatives:
219
+ raise ValueError(f'`alternative` must be one of {alternatives}.')
220
+
221
+ axis_int = int(axis)
222
+ if axis != axis_int:
223
+ raise ValueError('`axis` must be an integer.')
224
+
225
+ if not isinstance(method, stats.PermutationMethod):
226
+ methods = {"asymptotic", "exact", "auto"}
227
+ method = method.lower()
228
+ if method not in methods:
229
+ raise ValueError(f'`method` must be one of {methods}.')
230
+
231
+ return x, y, use_continuity, alternative, axis_int, method
232
+
233
+
234
+ def _mwu_choose_method(n1, n2, ties):
235
+ """Choose method 'asymptotic' or 'exact' depending on input size, ties"""
236
+
237
+ # if both inputs are large, asymptotic is OK
238
+ if n1 > 8 and n2 > 8:
239
+ return "asymptotic"
240
+
241
+ # if there are any ties, asymptotic is preferred
242
+ if ties:
243
+ return "asymptotic"
244
+
245
+ return "exact"
246
+
247
+
248
+ MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))
249
+
250
+
251
+ @_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
252
+ def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
253
+ axis=0, method="auto"):
254
+ r'''Perform the Mann-Whitney U rank test on two independent samples.
255
+
256
+ The Mann-Whitney U test is a nonparametric test of the null hypothesis
257
+ that the distribution underlying sample `x` is the same as the
258
+ distribution underlying sample `y`. It is often used as a test of
259
+ difference in location between distributions.
260
+
261
+ Parameters
262
+ ----------
263
+ x, y : array-like
264
+ N-d arrays of samples. The arrays must be broadcastable except along
265
+ the dimension given by `axis`.
266
+ use_continuity : bool, optional
267
+ Whether a continuity correction (1/2) should be applied.
268
+ Default is True when `method` is ``'asymptotic'``; has no effect
269
+ otherwise.
270
+ alternative : {'two-sided', 'less', 'greater'}, optional
271
+ Defines the alternative hypothesis. Default is 'two-sided'.
272
+ Let *F(u)* and *G(u)* be the cumulative distribution functions of the
273
+ distributions underlying `x` and `y`, respectively. Then the following
274
+ alternative hypotheses are available:
275
+
276
+ * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
277
+ at least one *u*.
278
+ * 'less': the distribution underlying `x` is stochastically less
279
+ than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
280
+ * 'greater': the distribution underlying `x` is stochastically greater
281
+ than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.
282
+
283
+ Note that the mathematical expressions in the alternative hypotheses
284
+ above describe the CDFs of the underlying distributions. The directions
285
+ of the inequalities appear inconsistent with the natural language
286
+ description at first glance, but they are not. For example, suppose
287
+ *X* and *Y* are random variables that follow distributions with CDFs
288
+ *F* and *G*, respectively. If *F(u) > G(u)* for all *u*, samples drawn
289
+ from *X* tend to be less than those drawn from *Y*.
290
+
291
+ Under a more restrictive set of assumptions, the alternative hypotheses
292
+ can be expressed in terms of the locations of the distributions;
293
+ see [5] section 5.1.
294
+ axis : int, optional
295
+ Axis along which to perform the test. Default is 0.
296
+ method : {'auto', 'asymptotic', 'exact'} or `PermutationMethod` instance, optional
297
+ Selects the method used to calculate the *p*-value.
298
+ Default is 'auto'. The following options are available.
299
+
300
+ * ``'asymptotic'``: compares the standardized test statistic
301
+ against the normal distribution, correcting for ties.
302
+ * ``'exact'``: computes the exact *p*-value by comparing the observed
303
+ :math:`U` statistic against the exact distribution of the :math:`U`
304
+ statistic under the null hypothesis. No correction is made for ties.
305
+ * ``'auto'``: chooses ``'exact'`` when the size of one of the samples
306
+ is less than or equal to 8 and there are no ties;
307
+ chooses ``'asymptotic'`` otherwise.
308
+ * `PermutationMethod` instance. In this case, the p-value
309
+ is computed using `permutation_test` with the provided
310
+ configuration options and other appropriate settings.
311
+
312
+ Returns
313
+ -------
314
+ res : MannwhitneyuResult
315
+ An object containing attributes:
316
+
317
+ statistic : float
318
+ The Mann-Whitney U statistic corresponding with sample `x`. See
319
+ Notes for the test statistic corresponding with sample `y`.
320
+ pvalue : float
321
+ The associated *p*-value for the chosen `alternative`.
322
+
323
+ Notes
324
+ -----
325
+ If ``U1`` is the statistic corresponding with sample `x`, then the
326
+ statistic corresponding with sample `y` is
327
+ ``U2 = x.shape[axis] * y.shape[axis] - U1``.
328
+
329
+ `mannwhitneyu` is for independent samples. For related / paired samples,
330
+ consider `scipy.stats.wilcoxon`.
331
+
332
+ `method` ``'exact'`` is recommended when there are no ties and when either
333
+ sample size is less than 8 [1]_. The implementation follows the recurrence
334
+ relation originally proposed in [1]_ as it is described in [3]_.
335
+ Note that the exact method is *not* corrected for ties, but
336
+ `mannwhitneyu` will not raise errors or warnings if there are ties in the
337
+ data. If there are ties and either sample is small (fewer than ~10
338
+ observations), consider passing an instance of `PermutationMethod`
339
+ as the `method` to perform a permutation test.
340
+
341
+ The Mann-Whitney U test is a non-parametric version of the t-test for
342
+ independent samples. When the means of samples from the populations
343
+ are normally distributed, consider `scipy.stats.ttest_ind`.
344
+
345
+ See Also
346
+ --------
347
+ scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind
348
+
349
+ References
350
+ ----------
351
+ .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
352
+ variables is stochastically larger than the other", The Annals of
353
+ Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
354
+ .. [2] Mann-Whitney U Test, Wikipedia,
355
+ http://en.wikipedia.org/wiki/Mann-Whitney_U_test
356
+ .. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the
357
+ Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and
358
+ Inference, Vol. 79, pp. 349-364, 1999.
359
+ .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
360
+ Learning Support Centre, 2004.
361
+ .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
362
+ or t-test? On assumptions for hypothesis tests and multiple
363
+ interpretations of decision rules." Statistics surveys, Vol. 4, pp.
364
+ 1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/
365
+
366
+ Examples
367
+ --------
368
+ We follow the example from [4]_: nine randomly sampled young adults were
369
+ diagnosed with type II diabetes at the ages below.
370
+
371
+ >>> males = [19, 22, 16, 29, 24]
372
+ >>> females = [20, 11, 17, 12]
373
+
374
+ We use the Mann-Whitney U test to assess whether there is a statistically
375
+ significant difference in the diagnosis age of males and females.
376
+ The null hypothesis is that the distribution of male diagnosis ages is
377
+ the same as the distribution of female diagnosis ages. We decide
378
+ that a confidence level of 95% is required to reject the null hypothesis
379
+ in favor of the alternative that the distributions are different.
380
+ Since the number of samples is very small and there are no ties in the
381
+ data, we can compare the observed test statistic against the *exact*
382
+ distribution of the test statistic under the null hypothesis.
383
+
384
+ >>> from scipy.stats import mannwhitneyu
385
+ >>> U1, p = mannwhitneyu(males, females, method="exact")
386
+ >>> print(U1)
387
+ 17.0
388
+
389
+ `mannwhitneyu` always reports the statistic associated with the first
390
+ sample, which, in this case, is males. This agrees with :math:`U_M = 17`
391
+ reported in [4]_. The statistic associated with the second sample
392
+ can be calculated:
393
+
394
+ >>> nx, ny = len(males), len(females)
395
+ >>> U2 = nx*ny - U1
396
+ >>> print(U2)
397
+ 3.0
398
+
399
+ This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
400
+ *p*-value can be calculated from either statistic, and the value produced
401
+ by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.
402
+
403
+ >>> print(p)
404
+ 0.1111111111111111
405
+
406
+ The exact distribution of the test statistic is asymptotically normal, so
407
+ the example continues by comparing the exact *p*-value against the
408
+ *p*-value produced using the normal approximation.
409
+
410
+ >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
411
+ >>> print(pnorm)
412
+ 0.11134688653314041
413
+
414
+ Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
415
+ value :math:`p = 0.09` given in [4]_. The reason is that [4]_
416
+ does not apply the continuity correction performed by `mannwhitneyu`;
417
+ `mannwhitneyu` reduces the distance between the test statistic and the
418
+ mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
419
+ discrete statistic is being compared against a continuous distribution.
420
+ Here, the :math:`U` statistic used is less than the mean, so we reduce
421
+ the distance by adding 0.5 in the numerator.
422
+
423
+ >>> import numpy as np
424
+ >>> from scipy.stats import norm
425
+ >>> U = min(U1, U2)
426
+ >>> N = nx + ny
427
+ >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
428
+ >>> p = 2 * norm.cdf(z) # use CDF to get p-value from smaller statistic
429
+ >>> print(p)
430
+ 0.11134688653314041
431
+
432
+ If desired, we can disable the continuity correction to get a result
433
+ that agrees with that reported in [4]_.
434
+
435
+ >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
436
+ ... method="asymptotic")
437
+ >>> print(pnorm)
438
+ 0.0864107329737
439
+
440
+ Regardless of whether we perform an exact or asymptotic test, the
441
+ probability of the test statistic being as extreme or more extreme by
442
+ chance exceeds 5%, so we do not consider the results statistically
443
+ significant.
444
+
445
+ Suppose that, before seeing the data, we had hypothesized that females
446
+ would tend to be diagnosed at a younger age than males.
447
+ In that case, it would be natural to provide the female ages as the
448
+ first input, and we would have performed a one-sided test using
449
+ ``alternative = 'less'``: females are diagnosed at an age that is
450
+ stochastically less than that of males.
451
+
452
+ >>> res = mannwhitneyu(females, males, alternative="less", method="exact")
453
+ >>> print(res)
454
+ MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)
455
+
456
+ Again, the probability of getting a sufficiently low value of the
457
+ test statistic by chance under the null hypothesis is greater than 5%,
458
+ so we do not reject the null hypothesis in favor of our alternative.
459
+
460
+ If it is reasonable to assume that the means of samples from the
461
+ populations are normally distributed, we could have used a t-test to
462
+ perform the analysis.
463
+
464
+ >>> from scipy.stats import ttest_ind
465
+ >>> res = ttest_ind(females, males, alternative="less")
466
+ >>> print(res)
467
+ Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924)
468
+
469
+ Under this assumption, the *p*-value would be low enough to reject the
470
+ null hypothesis in favor of the alternative.
471
+
472
+ '''
473
+
474
+ x, y, use_continuity, alternative, axis_int, method = (
475
+ _mwu_input_validation(x, y, use_continuity, alternative, axis, method))
476
+
477
+ x, y, xy = _broadcast_concatenate(x, y, axis)
478
+
479
+ n1, n2 = x.shape[-1], y.shape[-1]
480
+
481
+ # Follows [2]
482
+ ranks, t = _rankdata(xy, 'average', return_ties=True) # method 2, step 1
483
+ R1 = ranks[..., :n1].sum(axis=-1) # method 2, step 2
484
+ U1 = R1 - n1*(n1+1)/2 # method 2, step 3
485
+ U2 = n1 * n2 - U1 # as U1 + U2 = n1 * n2
486
+
487
+ if alternative == "greater":
488
+ U, f = U1, 1 # U is the statistic to use for p-value, f is a factor
489
+ elif alternative == "less":
490
+ U, f = U2, 1 # Due to symmetry, use SF of U2 rather than CDF of U1
491
+ else:
492
+ U, f = np.maximum(U1, U2), 2 # multiply SF by two for two-sided test
493
+
494
+ if method == "auto":
495
+ method = _mwu_choose_method(n1, n2, np.any(t > 1))
496
+
497
+ if method == "exact":
498
+ p = _mwu_state.sf(U.astype(int), min(n1, n2), max(n1, n2))
499
+ elif method == "asymptotic":
500
+ z = _get_mwu_z(U, n1, n2, t, continuity=use_continuity)
501
+ p = stats.norm.sf(z)
502
+ else: # `PermutationMethod` instance (already validated)
503
+ def statistic(x, y, axis):
504
+ return mannwhitneyu(x, y, use_continuity=use_continuity,
505
+ alternative=alternative, axis=axis,
506
+ method="asymptotic").statistic
507
+
508
+ res = stats.permutation_test((x, y), statistic, axis=axis,
509
+ **method._asdict(), alternative=alternative)
510
+ p = res.pvalue
511
+ f = 1
512
+
513
+ p *= f
514
+
515
+ # Ensure that the p-value is not greater than 1
516
+ # This could happen for exact test when U = m*n/2
517
+ p = np.clip(p, 0, 1)
518
+
519
+ return MannwhitneyuResult(U1, p)
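A hedged usage sketch (not part of the diff) of the vectorisation that the axis handling above enables, running several independent tests in one call:

    import numpy as np
    from scipy.stats import mannwhitneyu

    rng = np.random.default_rng(12345)
    x = rng.normal(size=(3, 20))             # three samples of 20 observations
    y = rng.normal(loc=0.5, size=(3, 25))    # three samples of 25 observations
    res = mannwhitneyu(x, y, axis=-1)
    print(res.statistic.shape, res.pvalue.shape)   # (3,) (3,)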
.venv/Lib/site-packages/scipy/stats/_morestats.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_mstats_basic.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_mstats_extras.py ADDED
@@ -0,0 +1,521 @@
1
+ """
2
+ Additional statistics functions with support for masked arrays.
3
+
4
+ """
5
+
6
+ # Original author (2007): Pierre GF Gerard-Marchant
7
+
8
+
9
+ __all__ = ['compare_medians_ms',
10
+ 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
11
+ 'idealfourths',
12
+ 'median_cihs','mjci','mquantiles_cimj',
13
+ 'rsh',
14
+ 'trimmed_mean_ci',]
15
+
16
+
17
+ import numpy as np
18
+ from numpy import float64, ndarray
19
+
20
+ import numpy.ma as ma
21
+ from numpy.ma import MaskedArray
22
+
23
+ from . import _mstats_basic as mstats
24
+
25
+ from scipy.stats.distributions import norm, beta, t, binom
26
+
27
+
28
+ def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
29
+ """
30
+ Computes quantile estimates with the Harrell-Davis method.
31
+
32
+ The quantile estimates are calculated as a weighted linear combination
33
+ of order statistics.
34
+
35
+ Parameters
36
+ ----------
37
+ data : array_like
38
+ Data array.
39
+ prob : sequence, optional
40
+ Sequence of probabilities at which to compute the quantiles.
41
+ axis : int or None, optional
42
+ Axis along which to compute the quantiles. If None, use a flattened
43
+ array.
44
+ var : bool, optional
45
+ Whether to return the variance of the estimate.
46
+
47
+ Returns
48
+ -------
49
+ hdquantiles : MaskedArray
50
+ A (p,) array of quantiles (if `var` is False), or a (2,p) array of
51
+ quantiles and variances (if `var` is True), where ``p`` is the
52
+ number of quantiles.
53
+
54
+ See Also
55
+ --------
56
+ hdquantiles_sd
57
+
58
+ Examples
59
+ --------
60
+ >>> import numpy as np
61
+ >>> from scipy.stats.mstats import hdquantiles
62
+ >>>
63
+ >>> # Sample data
64
+ >>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
65
+ >>>
66
+ >>> # Probabilities at which to compute quantiles
67
+ >>> probabilities = [0.25, 0.5, 0.75]
68
+ >>>
69
+ >>> # Compute Harrell-Davis quantile estimates
70
+ >>> quantile_estimates = hdquantiles(data, prob=probabilities)
71
+ >>>
72
+ >>> # Display the quantile estimates
73
+ >>> for i, quantile in enumerate(probabilities):
74
+ ... print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}")
75
+ 25th percentile: 3.1505820231763066 # may vary
76
+ 50th percentile: 5.194344084883956
77
+ 75th percentile: 7.430626414674935
78
+
79
+ """
80
+ def _hd_1D(data,prob,var):
81
+ "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
82
+ xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
83
+ # Don't use length here, in case we have a numpy scalar
84
+ n = xsorted.size
85
+
86
+ hd = np.empty((2,len(prob)), float64)
87
+ if n < 2:
88
+ hd.flat = np.nan
89
+ if var:
90
+ return hd
91
+ return hd[0]
92
+
93
+ v = np.arange(n+1) / float(n)
94
+ betacdf = beta.cdf
95
+ for (i,p) in enumerate(prob):
96
+ _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
97
+ w = _w[1:] - _w[:-1]
98
+ hd_mean = np.dot(w, xsorted)
99
+ hd[0,i] = hd_mean
100
+ #
101
+ hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
102
+ #
103
+ hd[0, prob == 0] = xsorted[0]
104
+ hd[0, prob == 1] = xsorted[-1]
105
+ if var:
106
+ hd[1, prob == 0] = hd[1, prob == 1] = np.nan
107
+ return hd
108
+ return hd[0]
109
+ # Initialization & checks
110
+ data = ma.array(data, copy=False, dtype=float64)
111
+ p = np.atleast_1d(np.asarray(prob))
112
+ # Computes quantiles along axis (or globally)
113
+ if (axis is None) or (data.ndim == 1):
114
+ result = _hd_1D(data, p, var)
115
+ else:
116
+ if data.ndim > 2:
117
+ raise ValueError("Array 'data' must be at most two dimensional, "
118
+ "but got data.ndim = %d" % data.ndim)
119
+ result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
120
+
121
+ return ma.fix_invalid(result, copy=False)
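A minimal sketch (not part of the diff) of the Harrell-Davis weights used inside `_hd_1D`: for a probability p they are increments of a Beta((n+1)p, (n+1)(1-p)) CDF over the grid i/n, and the estimate is their dot product with the sorted data:

    import numpy as np
    from scipy.stats import beta

    x = np.sort([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
    n, p = x.size, 0.5
    v = np.arange(n + 1) / n
    w = np.diff(beta.cdf(v, (n + 1) * p, (n + 1) * (1 - p)))
    print(w.sum())       # 1.0 -- a convex combination of order statistics
    print(np.dot(w, x))  # agrees with hdquantiles(x, prob=[0.5])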
122
+
123
+
124
+ def hdmedian(data, axis=-1, var=False):
125
+ """
126
+ Returns the Harrell-Davis estimate of the median along the given axis.
127
+
128
+ Parameters
129
+ ----------
130
+ data : ndarray
131
+ Data array.
132
+ axis : int, optional
133
+ Axis along which to compute the quantiles. If None, use a flattened
134
+ array.
135
+ var : bool, optional
136
+ Whether to return the variance of the estimate.
137
+
138
+ Returns
139
+ -------
140
+ hdmedian : MaskedArray
141
+ The median values. If ``var=True``, the variance is returned inside
142
+ the masked array. E.g. for a 1-D array the shape change from (1,) to
143
+ (2,).
144
+
145
+ """
146
+ result = hdquantiles(data,[0.5], axis=axis, var=var)
147
+ return result.squeeze()
148
+
149
+
150
+ def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
151
+ """
152
+ The standard error of the Harrell-Davis quantile estimates by jackknife.
153
+
154
+ Parameters
155
+ ----------
156
+ data : array_like
157
+ Data array.
158
+ prob : sequence, optional
159
+ Sequence of quantiles to compute.
160
+ axis : int, optional
161
+ Axis along which to compute the quantiles. If None, use a flattened
162
+ array.
163
+
164
+ Returns
165
+ -------
166
+ hdquantiles_sd : MaskedArray
167
+ Standard error of the Harrell-Davis quantile estimates.
168
+
169
+ See Also
170
+ --------
171
+ hdquantiles
172
+
173
+ """
174
+ def _hdsd_1D(data, prob):
175
+ "Computes the std error for 1D arrays."
176
+ xsorted = np.sort(data.compressed())
177
+ n = len(xsorted)
178
+
179
+ hdsd = np.empty(len(prob), float64)
180
+ if n < 2:
181
+ hdsd.flat = np.nan
182
+
183
+ vv = np.arange(n) / float(n-1)
184
+ betacdf = beta.cdf
185
+
186
+ for (i,p) in enumerate(prob):
187
+ _w = betacdf(vv, n*p, n*(1-p))
188
+ w = _w[1:] - _w[:-1]
189
+ # cumulative sum of weights and data points if
190
+ # ith point is left out for jackknife
191
+ mx_ = np.zeros_like(xsorted)
192
+ mx_[1:] = np.cumsum(w * xsorted[:-1])
193
+ # similar but from the right
194
+ mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
195
+ hdsd[i] = np.sqrt(mx_.var() * (n - 1))
196
+ return hdsd
197
+
198
+ # Initialization & checks
199
+ data = ma.array(data, copy=False, dtype=float64)
200
+ p = np.atleast_1d(np.asarray(prob))
201
+ # Computes quantiles along axis (or globally)
202
+ if (axis is None):
203
+ result = _hdsd_1D(data, p)
204
+ else:
205
+ if data.ndim > 2:
206
+ raise ValueError("Array 'data' must be at most two dimensional, "
207
+ "but got data.ndim = %d" % data.ndim)
208
+ result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
209
+
210
+ return ma.fix_invalid(result, copy=False).ravel()
211
+
212
+
213
+ def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
214
+ alpha=0.05, axis=None):
215
+ """
216
+ Selected confidence interval of the trimmed mean along the given axis.
217
+
218
+ Parameters
219
+ ----------
220
+ data : array_like
221
+ Input data.
222
+ limits : {None, tuple}, optional
223
+ None or a two item tuple.
224
+ Tuple of the percentages to cut on each side of the array, with respect
225
+ to the number of unmasked data, as floats between 0. and 1. If ``n``
226
+ is the number of unmasked data before trimming, then
227
+ (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
228
+ largest data are masked. The total number of unmasked data after
229
+ trimming is ``n * (1. - sum(limits))``.
230
+ The value of one limit can be set to None to indicate an open interval.
231
+
232
+ Defaults to (0.2, 0.2).
233
+ inclusive : (2,) tuple of boolean, optional
234
+ If relative==False, tuple indicating whether values exactly equal to
235
+ the absolute limits are allowed.
236
+ If relative==True, tuple indicating whether the number of data being
237
+ masked on each side should be rounded (True) or truncated (False).
238
+
239
+ Defaults to (True, True).
240
+ alpha : float, optional
241
+ Confidence level of the intervals.
242
+
243
+ Defaults to 0.05.
244
+ axis : int, optional
245
+ Axis along which to cut. If None, uses a flattened version of `data`.
246
+
247
+ Defaults to None.
248
+
249
+ Returns
250
+ -------
251
+ trimmed_mean_ci : (2,) ndarray
252
+ The lower and upper confidence intervals of the trimmed data.
253
+
254
+ """
255
+ data = ma.array(data, copy=False)
256
+ trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
257
+ tmean = trimmed.mean(axis)
258
+ tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
259
+ df = trimmed.count(axis) - 1
260
+ tppf = t.ppf(1-alpha/2.,df)
261
+ return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
262
+
263
+
264
+ def mjci(data, prob=[0.25,0.5,0.75], axis=None):
265
+ """
266
+ Returns the Maritz-Jarrett estimators of the standard error of selected
267
+ experimental quantiles of the data.
268
+
269
+ Parameters
270
+ ----------
271
+ data : ndarray
272
+ Data array.
273
+ prob : sequence, optional
274
+ Sequence of quantiles to compute.
275
+ axis : int or None, optional
276
+ Axis along which to compute the quantiles. If None, use a flattened
277
+ array.
278
+
279
+ """
280
+ def _mjci_1D(data, p):
281
+ data = np.sort(data.compressed())
282
+ n = data.size
283
+ prob = (np.array(p) * n + 0.5).astype(int)
284
+ betacdf = beta.cdf
285
+
286
+ mj = np.empty(len(prob), float64)
287
+ x = np.arange(1,n+1, dtype=float64) / n
288
+ y = x - 1./n
289
+ for (i,m) in enumerate(prob):
290
+ W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
291
+ C1 = np.dot(W,data)
292
+ C2 = np.dot(W,data**2)
293
+ mj[i] = np.sqrt(C2 - C1**2)
294
+ return mj
295
+
296
+ data = ma.array(data, copy=False)
297
+ if data.ndim > 2:
298
+ raise ValueError("Array 'data' must be at most two dimensional, "
299
+ "but got data.ndim = %d" % data.ndim)
300
+
301
+ p = np.atleast_1d(np.asarray(prob))
302
+ # Computes quantiles along axis (or globally)
303
+ if (axis is None):
304
+ return _mjci_1D(data, p)
305
+ else:
306
+ return ma.apply_along_axis(_mjci_1D, axis, data, p)
307
+
308
+
309
+ def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
310
+ """
311
+ Computes the alpha confidence interval for the selected quantiles of the
312
+ data, with Maritz-Jarrett estimators.
313
+
314
+ Parameters
315
+ ----------
316
+ data : ndarray
317
+ Data array.
318
+ prob : sequence, optional
319
+ Sequence of quantiles to compute.
320
+ alpha : float, optional
321
+ Confidence level of the intervals.
322
+ axis : int or None, optional
323
+ Axis along which to compute the quantiles.
324
+ If None, use a flattened array.
325
+
326
+ Returns
327
+ -------
328
+ ci_lower : ndarray
329
+ The lower boundaries of the confidence interval. Of the same length as
330
+ `prob`.
331
+ ci_upper : ndarray
332
+ The upper boundaries of the confidence interval. Of the same length as
333
+ `prob`.
334
+
335
+ """
336
+ alpha = min(alpha, 1 - alpha)
337
+ z = norm.ppf(1 - alpha/2.)
338
+ xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
339
+ smj = mjci(data, prob, axis=axis)
340
+ return (xq - z * smj, xq + z * smj)
341
+
342
+
343
+ def median_cihs(data, alpha=0.05, axis=None):
344
+ """
345
+ Computes the alpha-level confidence interval for the median of the data.
346
+
347
+ Uses the Hettmansperger-Sheather method.
348
+
349
+ Parameters
350
+ ----------
351
+ data : array_like
352
+ Input data. Masked values are discarded. The input should be 1D only,
353
+ or `axis` should be set to None.
354
+ alpha : float, optional
355
+ Confidence level of the intervals.
356
+ axis : int or None, optional
357
+ Axis along which to compute the quantiles. If None, use a flattened
358
+ array.
359
+
360
+ Returns
361
+ -------
362
+ median_cihs
363
+ Alpha level confidence interval.
364
+
365
+ """
366
+ def _cihs_1D(data, alpha):
367
+ data = np.sort(data.compressed())
368
+ n = len(data)
369
+ alpha = min(alpha, 1-alpha)
370
+ k = int(binom._ppf(alpha/2., n, 0.5))
371
+ gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
372
+ if gk < 1-alpha:
373
+ k -= 1
374
+ gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
375
+ gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
376
+ I = (gk - 1 + alpha)/(gk - gkk)
377
+ lambd = (n-k) * I / float(k + (n-2*k)*I)
378
+ lims = (lambd*data[k] + (1-lambd)*data[k-1],
379
+ lambd*data[n-k-1] + (1-lambd)*data[n-k])
380
+ return lims
381
+ data = ma.array(data, copy=False)
382
+ # Computes quantiles along axis (or globally)
383
+ if (axis is None):
384
+ result = _cihs_1D(data, alpha)
385
+ else:
386
+ if data.ndim > 2:
387
+ raise ValueError("Array 'data' must be at most two dimensional, "
388
+ "but got data.ndim = %d" % data.ndim)
389
+ result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
390
+
391
+ return result
392
+
393
+
394
+ def compare_medians_ms(group_1, group_2, axis=None):
395
+ """
396
+ Compares the medians from two independent groups along the given axis.
397
+
398
+ The comparison is performed using the McKean-Schrader estimate of the
399
+ standard error of the medians.
400
+
401
+ Parameters
402
+ ----------
403
+ group_1 : array_like
404
+ First dataset. Has to be of size >=7.
405
+ group_2 : array_like
406
+ Second dataset. Has to be of size >=7.
407
+ axis : int, optional
408
+ Axis along which the medians are estimated. If None, the arrays are
409
+ flattened. If `axis` is not None, then `group_1` and `group_2`
410
+ should have the same shape.
411
+
412
+ Returns
413
+ -------
414
+ compare_medians_ms : {float, ndarray}
415
+ If `axis` is None, then returns a float, otherwise returns a 1-D
416
+ ndarray of floats with a length equal to the length of `group_1`
417
+ along `axis`.
418
+
419
+ Examples
420
+ --------
421
+
422
+ >>> from scipy import stats
423
+ >>> a = [1, 2, 3, 4, 5, 6, 7]
424
+ >>> b = [8, 9, 10, 11, 12, 13, 14]
425
+ >>> stats.mstats.compare_medians_ms(a, b, axis=None)
426
+ 1.0693225866553746e-05
427
+
428
+ The function is vectorized to compute along a given axis.
429
+
430
+ >>> import numpy as np
431
+ >>> rng = np.random.default_rng()
432
+ >>> x = rng.random(size=(3, 7))
433
+ >>> y = rng.random(size=(3, 8))
434
+ >>> stats.mstats.compare_medians_ms(x, y, axis=1)
435
+ array([0.36908985, 0.36092538, 0.2765313 ])
436
+
437
+ References
438
+ ----------
439
+ .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
440
+ for studentizing the sample median." Communications in
441
+ Statistics-Simulation and Computation 13.6 (1984): 751-773.
442
+
443
+ """
444
+ (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
445
+ (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
446
+ mstats.stde_median(group_2, axis=axis))
447
+ W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
448
+ return 1 - norm.cdf(W)
449
+
450
+
451
+ def idealfourths(data, axis=None):
452
+ """
453
+ Returns an estimate of the lower and upper quartiles.
454
+
455
+ Uses the ideal fourths algorithm.
456
+
457
+ Parameters
458
+ ----------
459
+ data : array_like
460
+ Input array.
461
+ axis : int, optional
462
+ Axis along which the quartiles are estimated. If None, the arrays are
463
+ flattened.
464
+
465
+ Returns
466
+ -------
467
+ idealfourths : {list of floats, masked array}
468
+ Returns the two internal values that divide `data` into four parts
469
+ using the ideal fourths algorithm either along the flattened array
470
+ (if `axis` is None) or along `axis` of `data`.
471
+
472
+ """
473
+ def _idf(data):
474
+ x = data.compressed()
475
+ n = len(x)
476
+ if n < 3:
477
+ return [np.nan,np.nan]
478
+ (j,h) = divmod(n/4. + 5/12.,1)
479
+ j = int(j)
480
+ qlo = (1-h)*x[j-1] + h*x[j]
481
+ k = n - j
482
+ qup = (1-h)*x[k] + h*x[k-1]
483
+ return [qlo, qup]
484
+ data = ma.sort(data, axis=axis).view(MaskedArray)
485
+ if (axis is None):
486
+ return _idf(data)
487
+ else:
488
+ return ma.apply_along_axis(_idf, axis, data)
489
+
490
+
491
+ def rsh(data, points=None):
492
+ """
493
+ Evaluates Rosenblatt's shifted histogram estimators for each data point.
494
+
495
+ Rosenblatt's estimator is a centered finite-difference approximation to the
496
+ derivative of the empirical cumulative distribution function.
497
+
498
+ Parameters
499
+ ----------
500
+ data : sequence
501
+ Input data, should be 1-D. Masked values are ignored.
502
+ points : sequence or None, optional
503
+ Sequence of points where to evaluate Rosenblatt shifted histogram.
504
+ If None, use the data.
505
+
506
+ """
507
+ data = ma.array(data, copy=False)
508
+ if points is None:
509
+ points = data
510
+ else:
511
+ points = np.atleast_1d(np.asarray(points))
512
+
513
+ if data.ndim != 1:
514
+ raise AttributeError("The input array should be 1D only !")
515
+
516
+ n = data.count()
517
+ r = idealfourths(data, axis=None)
518
+ h = 1.2 * (r[-1]-r[0]) / n**(1./5)
519
+ nhi = (data[:,None] <= points[None,:] + h).sum(0)
520
+ nlo = (data[:,None] < points[None,:] - h).sum(0)
521
+ return (nhi-nlo) / (2.*n*h)
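A hedged toy example (fixed bandwidth chosen for illustration only) of the centered-difference density estimate that `rsh` computes:

    import numpy as np

    rng = np.random.default_rng(0)
    data = np.sort(rng.normal(size=200))
    h = 0.4                                  # illustrative bandwidth
    points = np.linspace(-3, 3, 7)
    nhi = (data[:, None] <= points[None, :] + h).sum(0)
    nlo = (data[:, None] < points[None, :] - h).sum(0)
    print((nhi - nlo) / (2.0 * data.size * h))   # roughly the N(0, 1) density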
.venv/Lib/site-packages/scipy/stats/_multicomp.py ADDED
@@ -0,0 +1,459 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from dataclasses import dataclass, field
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+
9
+ from scipy import stats
10
+ from scipy.optimize import minimize_scalar
11
+ from scipy.stats._common import ConfidenceInterval
12
+ from scipy.stats._qmc import check_random_state
13
+ from scipy.stats._stats_py import _var
14
+
15
+ if TYPE_CHECKING:
16
+ import numpy.typing as npt
17
+ from scipy._lib._util import DecimalNumber, SeedType
18
+ from typing import Literal, Sequence # noqa: UP035
19
+
20
+
21
+ __all__ = [
22
+ 'dunnett'
23
+ ]
24
+
25
+
26
+ @dataclass
27
+ class DunnettResult:
28
+ """Result object returned by `scipy.stats.dunnett`.
29
+
30
+ Attributes
31
+ ----------
32
+ statistic : float ndarray
33
+ The computed statistic of the test for each comparison. The element
34
+ at index ``i`` is the statistic for the comparison between
35
+ groups ``i`` and the control.
36
+ pvalue : float ndarray
37
+ The computed p-value of the test for each comparison. The element
38
+ at index ``i`` is the p-value for the comparison between
39
+ group ``i`` and the control.
40
+ """
41
+ statistic: np.ndarray
42
+ pvalue: np.ndarray
43
+ _alternative: Literal['two-sided', 'less', 'greater'] = field(repr=False)
44
+ _rho: np.ndarray = field(repr=False)
45
+ _df: int = field(repr=False)
46
+ _std: float = field(repr=False)
47
+ _mean_samples: np.ndarray = field(repr=False)
48
+ _mean_control: np.ndarray = field(repr=False)
49
+ _n_samples: np.ndarray = field(repr=False)
50
+ _n_control: int = field(repr=False)
51
+ _rng: SeedType = field(repr=False)
52
+ _ci: ConfidenceInterval | None = field(default=None, repr=False)
53
+ _ci_cl: DecimalNumber | None = field(default=None, repr=False)
54
+
55
+ def __str__(self):
56
+ # Note: `__str__` prints the confidence intervals from the most
57
+ # recent call to `confidence_interval`. If it has not been called,
58
+ # it will be called with the default CL of .95.
59
+ if self._ci is None:
60
+ self.confidence_interval(confidence_level=.95)
61
+ s = (
62
+ "Dunnett's test"
63
+ f" ({self._ci_cl*100:.1f}% Confidence Interval)\n"
64
+ "Comparison Statistic p-value Lower CI Upper CI\n"
65
+ )
66
+ for i in range(self.pvalue.size):
67
+ s += (f" (Sample {i} - Control) {self.statistic[i]:>10.3f}"
68
+ f"{self.pvalue[i]:>10.3f}"
69
+ f"{self._ci.low[i]:>10.3f}"
70
+ f"{self._ci.high[i]:>10.3f}\n")
71
+
72
+ return s
73
+
74
+ def _allowance(
75
+ self, confidence_level: DecimalNumber = 0.95, tol: DecimalNumber = 1e-3
76
+ ) -> float:
77
+ """Allowance.
78
+
79
+ It is the quantity to add/subtract from the observed difference
80
+ between the means of observed groups and the mean of the control
81
+ group. The result gives confidence limits.
82
+
83
+ Parameters
84
+ ----------
85
+ confidence_level : float, optional
86
+ Confidence level for the computed confidence interval.
87
+ Default is .95.
88
+ tol : float, optional
89
+ A tolerance for numerical optimization: the allowance will produce
90
+ a confidence within ``10*tol*(1 - confidence_level)`` of the
91
+ specified level, or a warning will be emitted. Tight tolerances
92
+ may be impractical due to noisy evaluation of the objective.
93
+ Default is 1e-3.
94
+
95
+ Returns
96
+ -------
97
+ allowance : float
98
+ Allowance around the mean.
99
+ """
100
+ alpha = 1 - confidence_level
101
+
102
+ def pvalue_from_stat(statistic):
103
+ statistic = np.array(statistic)
104
+ sf = _pvalue_dunnett(
105
+ rho=self._rho, df=self._df,
106
+ statistic=statistic, alternative=self._alternative,
107
+ rng=self._rng
108
+ )
109
+ return abs(sf - alpha)/alpha
110
+
111
+ # Evaluation of `pvalue_from_stat` is noisy due to the use of RQMC to
112
+ # evaluate `multivariate_t.cdf`. `minimize_scalar` is not designed
113
+ # to tolerate a noisy objective function and may fail to find the
114
+ # minimum accurately. We mitigate this possibility with the validation
115
+ # step below, but implementation of a noise-tolerant root finder or
116
+ # minimizer would be a welcome enhancement. See gh-18150.
117
+ res = minimize_scalar(pvalue_from_stat, method='brent', tol=tol)
118
+ critical_value = res.x
119
+
120
+ # validation
121
+ # tol*10 because tol=1e-3 means we tolerate a 1% change at most
122
+ if res.success is False or res.fun >= tol*10:
123
+ warnings.warn(
124
+ "Computation of the confidence interval did not converge to "
125
+ "the desired level. The confidence level corresponding with "
126
+ f"the returned interval is approximately {alpha*(1+res.fun)}.",
127
+ stacklevel=3
128
+ )
129
+
130
+ # From [1] p. 1101 between (1) and (3)
131
+ allowance = critical_value*self._std*np.sqrt(
132
+ 1/self._n_samples + 1/self._n_control
133
+ )
134
+ return abs(allowance)
135
+
136
+ def confidence_interval(
137
+ self, confidence_level: DecimalNumber = 0.95
138
+ ) -> ConfidenceInterval:
139
+ """Compute the confidence interval for the specified confidence level.
140
+
141
+ Parameters
142
+ ----------
143
+ confidence_level : float, optional
144
+ Confidence level for the computed confidence interval.
145
+ Default is .95.
146
+
147
+ Returns
148
+ -------
149
+ ci : ``ConfidenceInterval`` object
150
+ The object has attributes ``low`` and ``high`` that hold the
151
+ lower and upper bounds of the confidence intervals for each
152
+ comparison. The high and low values are accessible for each
153
+ comparison at index ``i`` for each group ``i``.
154
+
155
+ """
156
+ # check to see if the supplied confidence level matches that of the
157
+ # previously computed CI.
158
+ if (self._ci is not None) and (confidence_level == self._ci_cl):
159
+ return self._ci
160
+
161
+ if not (0 < confidence_level < 1):
162
+ raise ValueError("Confidence level must be between 0 and 1.")
163
+
164
+ allowance = self._allowance(confidence_level=confidence_level)
165
+ diff_means = self._mean_samples - self._mean_control
166
+
167
+ low = diff_means-allowance
168
+ high = diff_means+allowance
169
+
170
+ if self._alternative == 'greater':
171
+ high = [np.inf] * len(diff_means)
172
+ elif self._alternative == 'less':
173
+ low = [-np.inf] * len(diff_means)
174
+
175
+ self._ci_cl = confidence_level
176
+ self._ci = ConfidenceInterval(
177
+ low=low,
178
+ high=high
179
+ )
180
+ return self._ci
181
+
182
+
183
+ def dunnett(
184
+ *samples: npt.ArrayLike, # noqa: D417
185
+ control: npt.ArrayLike,
186
+ alternative: Literal['two-sided', 'less', 'greater'] = "two-sided",
187
+ random_state: SeedType = None
188
+ ) -> DunnettResult:
189
+ """Dunnett's test: multiple comparisons of means against a control group.
190
+
191
+ This is an implementation of Dunnett's original, single-step test as
192
+ described in [1]_.
193
+
194
+ Parameters
195
+ ----------
196
+ sample1, sample2, ... : 1D array_like
197
+ The sample measurements for each experimental group.
198
+ control : 1D array_like
199
+ The sample measurements for the control group.
200
+ alternative : {'two-sided', 'less', 'greater'}, optional
201
+ Defines the alternative hypothesis.
202
+
203
+ The null hypothesis is that the means of the distributions underlying
204
+ the samples and control are equal. The following alternative
205
+ hypotheses are available (default is 'two-sided'):
206
+
207
+ * 'two-sided': the means of the distributions underlying the samples
208
+ and control are unequal.
209
+ * 'less': the means of the distributions underlying the samples
210
+ are less than the mean of the distribution underlying the control.
211
+ * 'greater': the means of the distributions underlying the
212
+ samples are greater than the mean of the distribution underlying
213
+ the control.
214
+ random_state : {None, int, `numpy.random.Generator`}, optional
215
+ If `random_state` is an int or None, a new `numpy.random.Generator` is
216
+ created using ``np.random.default_rng(random_state)``.
217
+ If `random_state` is already a ``Generator`` instance, then the
218
+ provided instance is used.
219
+
220
+ The random number generator is used to control the randomized
221
+ Quasi-Monte Carlo integration of the multivariate-t distribution.
222
+
223
+ Returns
224
+ -------
225
+ res : `~scipy.stats._result_classes.DunnettResult`
226
+ An object containing attributes:
227
+
228
+ statistic : float ndarray
229
+ The computed statistic of the test for each comparison. The element
230
+ at index ``i`` is the statistic for the comparison between
231
+ groups ``i`` and the control.
232
+ pvalue : float ndarray
233
+ The computed p-value of the test for each comparison. The element
234
+ at index ``i`` is the p-value for the comparison between
235
+ group ``i`` and the control.
236
+
237
+ And the following method:
238
+
239
+ confidence_interval(confidence_level=0.95) :
240
+ Compute the difference in means of the groups
241
+ with the control +- the allowance.
242
+
243
+ See Also
244
+ --------
245
+ tukey_hsd : performs pairwise comparison of means.
246
+
247
+ Notes
248
+ -----
249
+ Like the independent-sample t-test, Dunnett's test [1]_ is used to make
250
+ inferences about the means of distributions from which samples were drawn.
251
+ However, when multiple t-tests are performed at a fixed significance level,
252
+ the "family-wise error rate" - the probability of incorrectly rejecting the
253
+ null hypothesis in at least one test - will exceed the significance level.
254
+ Dunnett's test is designed to perform multiple comparisons while
255
+ controlling the family-wise error rate.
256
+
257
+ Dunnett's test compares the means of multiple experimental groups
258
+ against a single control group. Tukey's Honestly Significant Difference Test
259
+ is another multiple-comparison test that controls the family-wise error
260
+ rate, but `tukey_hsd` performs *all* pairwise comparisons between groups.
261
+ When pairwise comparisons between experimental groups are not needed,
262
+ Dunnett's test is preferable due to its higher power.
263
+
264
+
265
+ The use of this test relies on several assumptions.
266
+
267
+ 1. The observations are independent within and among groups.
268
+ 2. The observations within each group are normally distributed.
269
+ 3. The distributions from which the samples are drawn have the same finite
270
+ variance.
271
+
272
+ References
273
+ ----------
274
+ .. [1] Charles W. Dunnett. "A Multiple Comparison Procedure for Comparing
275
+ Several Treatments with a Control."
276
+ Journal of the American Statistical Association, 50:272, 1096-1121,
277
+ :doi:`10.1080/01621459.1955.10501294`, 1955.
278
+
279
+ Examples
280
+ --------
281
+ In [1]_, the influence of drugs on blood count measurements on three groups
282
+ of animals is investigated.
283
+
284
+ The following table summarizes the results of the experiment in which
285
+ two groups received different drugs, and one group acted as a control.
286
+ Blood counts (in millions of cells per cubic millimeter) were recorded::
287
+
288
+ >>> import numpy as np
289
+ >>> control = np.array([7.40, 8.50, 7.20, 8.24, 9.84, 8.32])
290
+ >>> drug_a = np.array([9.76, 8.80, 7.68, 9.36])
291
+ >>> drug_b = np.array([12.80, 9.68, 12.16, 9.20, 10.55])
292
+
293
+ We would like to see if the means between any of the groups are
294
+ significantly different. First, visually examine a box and whisker plot.
295
+
296
+ >>> import matplotlib.pyplot as plt
297
+ >>> fig, ax = plt.subplots(1, 1)
298
+ >>> ax.boxplot([control, drug_a, drug_b])
299
+ >>> ax.set_xticklabels(["Control", "Drug A", "Drug B"]) # doctest: +SKIP
300
+ >>> ax.set_ylabel("mean") # doctest: +SKIP
301
+ >>> plt.show()
302
+
303
+ Note the overlapping interquartile ranges of the drug A group and control
304
+ group and the apparent separation between the drug B group and control
305
+ group.
306
+
307
+ Next, we will use Dunnett's test to assess whether the difference
308
+ between group means is significant while controlling the family-wise error
309
+ rate: the probability of making any false discoveries.
310
+ Let the null hypothesis be that the experimental groups have the same
311
+ mean as the control and the alternative be that an experimental group does
312
+ not have the same mean as the control. We will consider a 5% family-wise
313
+ error rate to be acceptable, and therefore we choose 0.05 as the threshold
314
+ for significance.
315
+
316
+ >>> from scipy.stats import dunnett
317
+ >>> res = dunnett(drug_a, drug_b, control=control)
318
+ >>> res.pvalue
319
+ array([0.62004941, 0.0059035 ]) # may vary
320
+
321
+ The p-value corresponding with the comparison between group A and control
322
+ exceeds 0.05, so we do not reject the null hypothesis for that comparison.
323
+ However, the p-value corresponding with the comparison between group B
324
+ and control is less than 0.05, so we consider the experimental results
325
+ to be evidence against the null hypothesis in favor of the alternative:
326
+ group B has a different mean than the control group.
327
+
328
+ """
329
+ samples_, control_, rng = _iv_dunnett(
330
+ samples=samples, control=control,
331
+ alternative=alternative, random_state=random_state
332
+ )
333
+
334
+ rho, df, n_group, n_samples, n_control = _params_dunnett(
335
+ samples=samples_, control=control_
336
+ )
337
+
338
+ statistic, std, mean_control, mean_samples = _statistic_dunnett(
339
+ samples_, control_, df, n_samples, n_control
340
+ )
341
+
342
+ pvalue = _pvalue_dunnett(
343
+ rho=rho, df=df, statistic=statistic, alternative=alternative, rng=rng
344
+ )
345
+
346
+ return DunnettResult(
347
+ statistic=statistic, pvalue=pvalue,
348
+ _alternative=alternative,
349
+ _rho=rho, _df=df, _std=std,
350
+ _mean_samples=mean_samples,
351
+ _mean_control=mean_control,
352
+ _n_samples=n_samples,
353
+ _n_control=n_control,
354
+ _rng=rng
355
+ )
356
+
357
+
358
+ def _iv_dunnett(
359
+ samples: Sequence[npt.ArrayLike],
360
+ control: npt.ArrayLike,
361
+ alternative: Literal['two-sided', 'less', 'greater'],
362
+ random_state: SeedType
363
+ ) -> tuple[list[np.ndarray], np.ndarray, SeedType]:
364
+ """Input validation for Dunnett's test."""
365
+ rng = check_random_state(random_state)
366
+
367
+ if alternative not in {'two-sided', 'less', 'greater'}:
368
+ raise ValueError(
369
+ "alternative must be 'less', 'greater' or 'two-sided'"
370
+ )
371
+
372
+ ndim_msg = "Control and samples groups must be 1D arrays"
373
+ n_obs_msg = "Control and samples groups must have at least 1 observation"
374
+
375
+ control = np.asarray(control)
376
+ samples_ = [np.asarray(sample) for sample in samples]
377
+
378
+ # samples checks
379
+ samples_control: list[np.ndarray] = samples_ + [control]
380
+ for sample in samples_control:
381
+ if sample.ndim > 1:
382
+ raise ValueError(ndim_msg)
383
+
384
+ if sample.size < 1:
385
+ raise ValueError(n_obs_msg)
386
+
387
+ return samples_, control, rng
388
+
389
+
390
+ def _params_dunnett(
391
+ samples: list[np.ndarray], control: np.ndarray
392
+ ) -> tuple[np.ndarray, int, int, np.ndarray, int]:
393
+ """Specific parameters for Dunnett's test.
394
+
395
+ Degree of freedom is the number of observations minus the number of groups
396
+ including the control.
397
+ """
398
+ n_samples = np.array([sample.size for sample in samples])
399
+
400
+ # From [1] p. 1100 d.f. = (sum N)-(p+1)
401
+ n_sample = n_samples.sum()
402
+ n_control = control.size
403
+ n = n_sample + n_control
404
+ n_groups = len(samples)
405
+ df = n - n_groups - 1
406
+
407
+ # From [1] p. 1103 rho_ij = 1/sqrt((N0/Ni+1)(N0/Nj+1))
408
+ rho = n_control/n_samples + 1
409
+ rho = 1/np.sqrt(rho[:, None] * rho[None, :])
410
+ np.fill_diagonal(rho, 1)
411
+
412
+ return rho, df, n_groups, n_samples, n_control
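A hedged numerical sketch (group sizes borrowed from the docstring example) of the correlation matrix rho_ij = 1/sqrt((N0/Ni + 1)(N0/Nj + 1)) constructed above:

    import numpy as np

    n_control = 6
    n_samples = np.array([4, 5])
    r = n_control / n_samples + 1
    rho = 1 / np.sqrt(r[:, None] * r[None, :])
    np.fill_diagonal(rho, 1)
    print(rho)   # off-diagonal entries ~0.426, ones on the diagonal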
413
+
414
+
415
+ def _statistic_dunnett(
416
+ samples: list[np.ndarray], control: np.ndarray, df: int,
417
+ n_samples: np.ndarray, n_control: int
418
+ ) -> tuple[np.ndarray, float, np.ndarray, np.ndarray]:
419
+ """Statistic of Dunnett's test.
420
+
421
+ Computation based on the original single-step test from [1].
422
+ """
423
+ mean_control = np.mean(control)
424
+ mean_samples = np.array([np.mean(sample) for sample in samples])
425
+ all_samples = [control] + samples
426
+ all_means = np.concatenate([[mean_control], mean_samples])
427
+
428
+ # Variance estimate s^2 from [1] Eq. 1
429
+ s2 = np.sum([_var(sample, mean=mean)*sample.size
430
+ for sample, mean in zip(all_samples, all_means)]) / df
431
+ std = np.sqrt(s2)
432
+
433
+ # z score inferred from [1] unlabeled equation after Eq. 1
434
+ z = (mean_samples - mean_control) / np.sqrt(1/n_samples + 1/n_control)
435
+
436
+ return z / std, std, mean_control, mean_samples
437
+
438
+
439
+ def _pvalue_dunnett(
440
+ rho: np.ndarray, df: int, statistic: np.ndarray,
441
+ alternative: Literal['two-sided', 'less', 'greater'],
442
+ rng: SeedType = None
443
+ ) -> np.ndarray:
444
+ """pvalue from the multivariate t-distribution.
445
+
446
+ Critical values come from the multivariate student-t distribution.
447
+ """
448
+ statistic = statistic.reshape(-1, 1)
449
+
450
+ mvt = stats.multivariate_t(shape=rho, df=df, seed=rng)
451
+ if alternative == "two-sided":
452
+ statistic = abs(statistic)
453
+ pvalue = 1 - mvt.cdf(statistic, lower_limit=-statistic)
454
+ elif alternative == "greater":
455
+ pvalue = 1 - mvt.cdf(statistic, lower_limit=-np.inf)
456
+ else:
457
+ pvalue = 1 - mvt.cdf(np.inf, lower_limit=statistic)
458
+
459
+ return np.atleast_1d(pvalue)
.venv/Lib/site-packages/scipy/stats/_multivariate.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_mvn.cp39-win_amd64.dll.a ADDED
Binary file (1.5 kB). View file
 
.venv/Lib/site-packages/scipy/stats/_mvn.cp39-win_amd64.pyd ADDED
Binary file (106 kB). View file
 
.venv/Lib/site-packages/scipy/stats/_odds_ratio.py ADDED
@@ -0,0 +1,482 @@
1
+ import numpy as np
2
+
3
+ from scipy.special import ndtri
4
+ from scipy.optimize import brentq
5
+ from ._discrete_distns import nchypergeom_fisher
6
+ from ._common import ConfidenceInterval
7
+
8
+
9
+ def _sample_odds_ratio(table):
10
+ """
11
+ Given a table [[a, b], [c, d]], compute a*d/(b*c).
12
+
13
+ Return nan if the numerator and denominator are 0.
14
+ Return inf if just the denominator is 0.
15
+ """
16
+ # table must be a 2x2 numpy array.
17
+ if table[1, 0] > 0 and table[0, 1] > 0:
18
+ oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])
19
+ elif table[0, 0] == 0 or table[1, 1] == 0:
20
+ oddsratio = np.nan
21
+ else:
22
+ oddsratio = np.inf
23
+ return oddsratio
24
+
25
+
26
+ def _solve(func):
27
+ """
28
+ Solve func(nc) = 0. func must be an increasing function.
29
+ """
30
+ # We could just as well call the variable `x` instead of `nc`, but we
31
+ # always call this function with functions for which nc (the noncentrality
32
+ # parameter) is the variable for which we are solving.
33
+ nc = 1.0
34
+ value = func(nc)
35
+ if value == 0:
36
+ return nc
37
+
38
+ # Multiplicative factor by which to increase or decrease nc when
39
+ # searching for a bracketing interval.
40
+ factor = 2.0
41
+ # Find a bracketing interval.
42
+ if value > 0:
43
+ nc /= factor
44
+ while func(nc) > 0:
45
+ nc /= factor
46
+ lo = nc
47
+ hi = factor*nc
48
+ else:
49
+ nc *= factor
50
+ while func(nc) < 0:
51
+ nc *= factor
52
+ lo = nc/factor
53
+ hi = nc
54
+
55
+ # lo and hi bracket the solution for nc.
56
+ nc = brentq(func, lo, hi, xtol=1e-13)
57
+ return nc
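The search above walks the initial guess up or down by a constant factor until the sign of `func` flips, then hands the resulting bracket to `brentq`. Here is a toy sketch of the same pattern on an increasing function whose root is known; the function and starting point are illustrative only.

# Illustrative bracket-then-brentq search, mirroring _solve's strategy.
import numpy as np
from scipy.optimize import brentq

def f(x):
    # toy increasing function with its root at x = 3.5
    return np.log(x) - np.log(3.5)

x, factor = 1.0, 2.0
while f(x) < 0:          # grow x until the sign flips
    x *= factor
lo, hi = x / factor, x   # [lo, hi] now brackets the root
print(brentq(f, lo, hi, xtol=1e-13))  # ~3.5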
58
+
59
+
60
+ def _nc_hypergeom_mean_inverse(x, M, n, N):
61
+ """
62
+ For the given noncentral hypergeometric parameters x, M, n, and N
63
+ (table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2
64
+ contingency table), find the noncentrality parameter of Fisher's
65
+ noncentral hypergeometric distribution whose mean is x.
66
+ """
67
+ nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
68
+ return nc
69
+
70
+
71
+ def _hypergeom_params_from_table(table):
72
+ # The notation M, n and N is consistent with stats.hypergeom and
73
+ # stats.nchypergeom_fisher.
74
+ x = table[0, 0]
75
+ M = table.sum()
76
+ n = table[0].sum()
77
+ N = table[:, 0].sum()
78
+ return x, M, n, N
79
+
80
+
81
+ def _ci_upper(table, alpha):
82
+ """
83
+ Compute the upper end of the confidence interval.
84
+ """
85
+ if _sample_odds_ratio(table) == np.inf:
86
+ return np.inf
87
+
88
+ x, M, n, N = _hypergeom_params_from_table(table)
89
+
90
+ # nchypergeom_fisher.cdf is a decreasing function of nc, so we negate
91
+ # it in the lambda expression.
92
+ nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha)
93
+ return nc
94
+
95
+
96
+ def _ci_lower(table, alpha):
97
+ """
98
+ Compute the lower end of the confidence interval.
99
+ """
100
+ if _sample_odds_ratio(table) == 0:
101
+ return 0
102
+
103
+ x, M, n, N = _hypergeom_params_from_table(table)
104
+
105
+ nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
106
+ return nc
107
+
108
+
109
+ def _conditional_oddsratio(table):
110
+ """
111
+ Conditional MLE of the odds ratio for the 2x2 contingency table.
112
+ """
113
+ x, M, n, N = _hypergeom_params_from_table(table)
114
+ # Get the bounds of the support. The support of the noncentral
115
+ # hypergeometric distribution with parameters M, n, and N is the same
116
+ # for all values of the noncentrality parameter, so we can use 1 here.
117
+ lo, hi = nchypergeom_fisher.support(M, n, N, 1)
118
+
119
+ # Check if x is at one of the extremes of the support. If so, we know
120
+ # the odds ratio is either 0 or inf.
121
+ if x == lo:
122
+ # x is at the low end of the support.
123
+ return 0
124
+ if x == hi:
125
+ # x is at the high end of the support.
126
+ return np.inf
127
+
128
+ nc = _nc_hypergeom_mean_inverse(x, M, n, N)
129
+ return nc
130
+
131
+
132
+ def _conditional_oddsratio_ci(table, confidence_level=0.95,
133
+ alternative='two-sided'):
134
+ """
135
+ Conditional exact confidence interval for the odds ratio.
136
+ """
137
+ if alternative == 'two-sided':
138
+ alpha = 0.5*(1 - confidence_level)
139
+ lower = _ci_lower(table, alpha)
140
+ upper = _ci_upper(table, alpha)
141
+ elif alternative == 'less':
142
+ lower = 0.0
143
+ upper = _ci_upper(table, 1 - confidence_level)
144
+ else:
145
+ # alternative == 'greater'
146
+ lower = _ci_lower(table, 1 - confidence_level)
147
+ upper = np.inf
148
+
149
+ return lower, upper
150
+
151
+
152
+ def _sample_odds_ratio_ci(table, confidence_level=0.95,
153
+ alternative='two-sided'):
154
+ oddsratio = _sample_odds_ratio(table)
155
+ log_or = np.log(oddsratio)
156
+ se = np.sqrt((1/table).sum())
157
+ if alternative == 'less':
158
+ z = ndtri(confidence_level)
159
+ loglow = -np.inf
160
+ loghigh = log_or + z*se
161
+ elif alternative == 'greater':
162
+ z = ndtri(confidence_level)
163
+ loglow = log_or - z*se
164
+ loghigh = np.inf
165
+ else:
166
+ # alternative is 'two-sided'
167
+ z = ndtri(0.5*confidence_level + 0.5)
168
+ loglow = log_or - z*se
169
+ loghigh = log_or + z*se
170
+
171
+ return np.exp(loglow), np.exp(loghigh)
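The interval above exponentiates log(OR) plus or minus z*se, with se = sqrt(1/a + 1/b + 1/c + 1/d). A quick check of that arithmetic on the aspirin table used later in the `odds_ratio` docstring; the printed bounds are approximate and apply to the sample (not conditional) odds ratio.

# Normal-approximation 95% CI for the sample odds ratio of the aspirin table.
import numpy as np
from scipy.special import ndtri

table = np.array([[176, 230], [21035, 21018]])
log_or = np.log(table[0, 0] * table[1, 1] / (table[0, 1] * table[1, 0]))
se = np.sqrt((1 / table).sum())        # sqrt(1/a + 1/b + 1/c + 1/d)
z = ndtri(0.5 * 0.95 + 0.5)            # two-sided 95% quantile, about 1.96
low, high = np.exp(log_or - z * se), np.exp(log_or + z * se)
print(low, high)                       # roughly (0.63, 0.93)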
172
+
173
+
174
+ class OddsRatioResult:
175
+ """
176
+ Result of `scipy.stats.contingency.odds_ratio`. See the
177
+ docstring for `odds_ratio` for more details.
178
+
179
+ Attributes
180
+ ----------
181
+ statistic : float
182
+ The computed odds ratio.
183
+
184
+ * If `kind` is ``'sample'``, this is the sample (or unconditional)
185
+ estimate, given by
186
+ ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
187
+ * If `kind` is ``'conditional'``, this is the conditional
188
+ maximum likelihood estimate for the odds ratio. It is
189
+ the noncentrality parameter of Fisher's noncentral
190
+ hypergeometric distribution with the same hypergeometric
191
+ parameters as `table` and whose mean is ``table[0, 0]``.
192
+
193
+ Methods
194
+ -------
195
+ confidence_interval :
196
+ Confidence interval for the odds ratio.
197
+ """
198
+
199
+ def __init__(self, _table, _kind, statistic):
200
+ # for now, no need to make _table and _kind public, since this sort of
201
+ # information is returned in very few `scipy.stats` results
202
+ self._table = _table
203
+ self._kind = _kind
204
+ self.statistic = statistic
205
+
206
+ def __repr__(self):
207
+ return f"OddsRatioResult(statistic={self.statistic})"
208
+
209
+ def confidence_interval(self, confidence_level=0.95,
210
+ alternative='two-sided'):
211
+ """
212
+ Confidence interval for the odds ratio.
213
+
214
+ Parameters
215
+ ----------
216
+ confidence_level: float
217
+ Desired confidence level for the confidence interval.
218
+ The value must be given as a fraction between 0 and 1.
219
+ Default is 0.95 (meaning 95%).
220
+
221
+ alternative : {'two-sided', 'less', 'greater'}, optional
222
+ The alternative hypothesis of the hypothesis test to which the
223
+ confidence interval corresponds. That is, suppose the null
224
+ hypothesis is that the true odds ratio equals ``OR`` and the
225
+ confidence interval is ``(low, high)``. Then the following options
226
+ for `alternative` are available (default is 'two-sided'):
227
+
228
+ * 'two-sided': the true odds ratio is not equal to ``OR``. There
229
+ is evidence against the null hypothesis at the chosen
230
+ `confidence_level` if ``high < OR`` or ``low > OR``.
231
+ * 'less': the true odds ratio is less than ``OR``. The ``low`` end
232
+ of the confidence interval is 0, and there is evidence against
233
+ the null hypothesis at the chosen `confidence_level` if
234
+ ``high < OR``.
235
+ * 'greater': the true odds ratio is greater than ``OR``. The
236
+ ``high`` end of the confidence interval is ``np.inf``, and there
237
+ is evidence against the null hypothesis at the chosen
238
+ `confidence_level` if ``low > OR``.
239
+
240
+ Returns
241
+ -------
242
+ ci : ``ConfidenceInterval`` instance
243
+ The confidence interval, represented as an object with
244
+ attributes ``low`` and ``high``.
245
+
246
+ Notes
247
+ -----
248
+ When `kind` is ``'conditional'``, the limits of the confidence
249
+ interval are the conditional "exact confidence limits" as described
250
+ by Fisher [1]_. The conditional odds ratio and confidence interval are
251
+ also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.
252
+
253
+ When `kind` is ``'sample'``, the confidence interval is computed
254
+ under the assumption that the logarithm of the odds ratio is normally
255
+ distributed with standard error given by::
256
+
257
+ se = sqrt(1/a + 1/b + 1/c + 1/d)
258
+
259
+ where ``a``, ``b``, ``c`` and ``d`` are the elements of the
260
+ contingency table. (See, for example, [2]_, section 3.1.3.2,
261
+ or [3]_, section 2.3.3).
262
+
263
+ References
264
+ ----------
265
+ .. [1] R. A. Fisher (1935), The logic of inductive inference,
266
+ Journal of the Royal Statistical Society, Vol. 98, No. 1,
267
+ pp. 39-82.
268
+ .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
269
+ Methods, Techniques, and Applications, CRC Press LLC, Boca
270
+ Raton, Florida.
271
+ .. [3] Alan Agresti, An Introduction to Categorical Data Analysis
272
+ (second edition), Wiley, Hoboken, NJ, USA (2007).
273
+ """
274
+ if alternative not in ['two-sided', 'less', 'greater']:
275
+ raise ValueError("`alternative` must be 'two-sided', 'less' or "
276
+ "'greater'.")
277
+
278
+ if confidence_level < 0 or confidence_level > 1:
279
+ raise ValueError('confidence_level must be between 0 and 1')
280
+
281
+ if self._kind == 'conditional':
282
+ ci = self._conditional_odds_ratio_ci(confidence_level, alternative)
283
+ else:
284
+ ci = self._sample_odds_ratio_ci(confidence_level, alternative)
285
+ return ci
286
+
287
+ def _conditional_odds_ratio_ci(self, confidence_level=0.95,
288
+ alternative='two-sided'):
289
+ """
290
+ Confidence interval for the conditional odds ratio.
291
+ """
292
+
293
+ table = self._table
294
+ if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
295
+ # If both values in a row or column are zero, the p-value is 1,
296
+ # the odds ratio is NaN and the confidence interval is (0, inf).
297
+ ci = (0, np.inf)
298
+ else:
299
+ ci = _conditional_oddsratio_ci(table,
300
+ confidence_level=confidence_level,
301
+ alternative=alternative)
302
+ return ConfidenceInterval(low=ci[0], high=ci[1])
303
+
304
+ def _sample_odds_ratio_ci(self, confidence_level=0.95,
305
+ alternative='two-sided'):
306
+ """
307
+ Confidence interval for the sample odds ratio.
308
+ """
309
+ if confidence_level < 0 or confidence_level > 1:
310
+ raise ValueError('confidence_level must be between 0 and 1')
311
+
312
+ table = self._table
313
+ if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
314
+ # If both values in a row or column are zero, the p-value is 1,
315
+ # the odds ratio is NaN and the confidence interval is (0, inf).
316
+ ci = (0, np.inf)
317
+ else:
318
+ ci = _sample_odds_ratio_ci(table,
319
+ confidence_level=confidence_level,
320
+ alternative=alternative)
321
+ return ConfidenceInterval(low=ci[0], high=ci[1])
322
+
323
+
324
+ def odds_ratio(table, *, kind='conditional'):
325
+ r"""
326
+ Compute the odds ratio for a 2x2 contingency table.
327
+
328
+ Parameters
329
+ ----------
330
+ table : array_like of ints
331
+ A 2x2 contingency table. Elements must be non-negative integers.
332
+ kind : str, optional
333
+ Which kind of odds ratio to compute, either the sample
334
+ odds ratio (``kind='sample'``) or the conditional odds ratio
335
+ (``kind='conditional'``). Default is ``'conditional'``.
336
+
337
+ Returns
338
+ -------
339
+ result : `~scipy.stats._result_classes.OddsRatioResult` instance
340
+ The returned object has two computed attributes:
341
+
342
+ statistic : float
343
+ * If `kind` is ``'sample'``, this is the sample (or unconditional)
344
+ estimate, given by
345
+ ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
346
+ * If `kind` is ``'conditional'``, this is the conditional
347
+ maximum likelihood estimate for the odds ratio. It is
348
+ the noncentrality parameter of Fisher's noncentral
349
+ hypergeometric distribution with the same hypergeometric
350
+ parameters as `table` and whose mean is ``table[0, 0]``.
351
+
352
+ The object has the method `confidence_interval` that computes
353
+ the confidence interval of the odds ratio.
354
+
355
+ See Also
356
+ --------
357
+ scipy.stats.fisher_exact
358
+ relative_risk
359
+
360
+ Notes
361
+ -----
362
+ The conditional odds ratio was discussed by Fisher (see "Example 1"
363
+ of [1]_). Texts that cover the odds ratio include [2]_ and [3]_.
364
+
365
+ .. versionadded:: 1.10.0
366
+
367
+ References
368
+ ----------
369
+ .. [1] R. A. Fisher (1935), The logic of inductive inference,
370
+ Journal of the Royal Statistical Society, Vol. 98, No. 1,
371
+ pp. 39-82.
372
+ .. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.
373
+ Volume I - The analysis of case-control studies. IARC Sci Publ.
374
+ (32):5-338. PMID: 7216345. (See section 4.2.)
375
+ .. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
376
+ Methods, Techniques, and Applications, CRC Press LLC, Boca
377
+ Raton, Florida.
378
+ .. [4] Berger, Jeffrey S. et al. "Aspirin for the Primary Prevention of
379
+ Cardiovascular Events in Women and Men: A Sex-Specific
380
+ Meta-analysis of Randomized Controlled Trials."
381
+ JAMA, 295(3):306-313, :doi:`10.1001/jama.295.3.306`, 2006.
382
+
383
+ Examples
384
+ --------
385
+ In epidemiology, individuals are classified as "exposed" or
386
+ "unexposed" to some factor or treatment. If the occurrence of some
387
+ illness is under study, those who have the illness are often
388
+ classified as "cases", and those without it are "noncases". The
389
+ counts of the occurrences of these classes gives a contingency
390
+ table::
391
+
392
+ exposed unexposed
393
+ cases a b
394
+ noncases c d
395
+
396
+ The sample odds ratio may be written ``(a/c) / (b/d)``. ``a/c`` can
397
+ be interpreted as the odds of a case occurring in the exposed group,
398
+ and ``b/d`` as the odds of a case occurring in the unexposed group.
399
+ The sample odds ratio is the ratio of these odds. If the odds ratio
400
+ is greater than 1, it suggests that there is a positive association
401
+ between being exposed and being a case.
402
+
403
+ Interchanging the rows or columns of the contingency table inverts
404
+ the odds ratio, so it is important to understand the meaning of labels
405
+ given to the rows and columns of the table when interpreting the
406
+ odds ratio.
407
+
408
+ In [4]_, the use of aspirin to prevent cardiovascular events in women
409
+ and men was investigated. The study notably concluded:
410
+
411
+ ...aspirin therapy reduced the risk of a composite of
412
+ cardiovascular events due to its effect on reducing the risk of
413
+ ischemic stroke in women [...]
414
+
415
+ The article lists studies of various cardiovascular events. Let's
416
+ focus on the ischemic stroke in women.
417
+
418
+ The following table summarizes the results of the experiment in which
419
+ participants took aspirin or a placebo on a regular basis for several
420
+ years. Cases of ischemic stroke were recorded::
421
+
422
+ Aspirin Control/Placebo
423
+ Ischemic stroke 176 230
424
+ No stroke 21035 21018
425
+
426
+ The question we ask is "Is there evidence that the aspirin reduces the
427
+ risk of ischemic stroke?"
428
+
429
+ Compute the odds ratio:
430
+
431
+ >>> from scipy.stats.contingency import odds_ratio
432
+ >>> res = odds_ratio([[176, 230], [21035, 21018]])
433
+ >>> res.statistic
434
+ 0.7646037659999126
435
+
436
+ For this sample, the odds of getting an ischemic stroke for those who have
437
+ been taking aspirin are 0.76 times that of those
438
+ who have received the placebo.
439
+
440
+ To make statistical inferences about the population under study,
441
+ we can compute the 95% confidence interval for the odds ratio:
442
+
443
+ >>> res.confidence_interval(confidence_level=0.95)
444
+ ConfidenceInterval(low=0.6241234078749812, high=0.9354102892100372)
445
+
446
+ The 95% confidence interval for the conditional odds ratio is
447
+ approximately (0.62, 0.94).
448
+
449
+ The fact that the entire 95% confidence interval falls below 1 supports
450
+ the authors' conclusion that the aspirin was associated with a
451
+ statistically significant reduction in ischemic stroke.
452
+ """
453
+ if kind not in ['conditional', 'sample']:
454
+ raise ValueError("`kind` must be 'conditional' or 'sample'.")
455
+
456
+ c = np.asarray(table)
457
+
458
+ if c.shape != (2, 2):
459
+ raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "
460
+ "of shape (2, 2).")
461
+
462
+ if not np.issubdtype(c.dtype, np.integer):
463
+ raise ValueError("`table` must be an array of integers, but got "
464
+ f"type {c.dtype}")
465
+ c = c.astype(np.int64)
466
+
467
+ if np.any(c < 0):
468
+ raise ValueError("All values in `table` must be nonnegative.")
469
+
470
+ if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
471
+ # If both values in a row or column are zero, the p-value is NaN and
472
+ # the odds ratio is NaN.
473
+ result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)
474
+ return result
475
+
476
+ if kind == 'sample':
477
+ oddsratio = _sample_odds_ratio(c)
478
+ else: # kind is 'conditional'
479
+ oddsratio = _conditional_oddsratio(c)
480
+
481
+ result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio)
482
+ return result
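The docstring example above uses the default conditional estimate; the short usage note below contrasts it with ``kind='sample'`` on the same table. For a table this large the two estimates are expected to be close, so the printed values are indicative only.

# Contrast the conditional MLE with the sample estimate a*d/(b*c).
from scipy.stats.contingency import odds_ratio

table = [[176, 230], [21035, 21018]]
res_cond = odds_ratio(table)                  # conditional MLE (default)
res_samp = odds_ratio(table, kind='sample')   # sample odds ratio
print(res_cond.statistic, res_samp.statistic)
print(res_samp.confidence_interval(confidence_level=0.95))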
.venv/Lib/site-packages/scipy/stats/_page_trend_test.py ADDED
@@ -0,0 +1,479 @@
1
+ from itertools import permutations
2
+ import numpy as np
3
+ import math
4
+ from ._continuous_distns import norm
5
+ import scipy.stats
6
+ from dataclasses import dataclass
7
+
8
+
9
+ @dataclass
10
+ class PageTrendTestResult:
11
+ statistic: float
12
+ pvalue: float
13
+ method: str
14
+
15
+
16
+ def page_trend_test(data, ranked=False, predicted_ranks=None, method='auto'):
17
+ r"""
18
+ Perform Page's Test, a measure of trend in observations between treatments.
19
+
20
+ Page's Test (also known as Page's :math:`L` test) is useful when:
21
+
22
+ * there are :math:`n \geq 3` treatments,
23
+ * :math:`m \geq 2` subjects are observed for each treatment, and
24
+ * the observations are hypothesized to have a particular order.
25
+
26
+ Specifically, the test considers the null hypothesis that
27
+
28
+ .. math::
29
+
30
+ m_1 = m_2 = m_3 \cdots = m_n,
31
+
32
+ where :math:`m_j` is the mean of the observed quantity under treatment
33
+ :math:`j`, against the alternative hypothesis that
34
+
35
+ .. math::
36
+
37
+ m_1 \leq m_2 \leq m_3 \leq \cdots \leq m_n,
38
+
39
+ where at least one inequality is strict.
40
+
41
+ As noted by [4]_, Page's :math:`L` test has greater statistical power than
42
+ the Friedman test against the alternative that there is a difference in
43
+ trend, as Friedman's test only considers a difference in the means of the
44
+ observations without considering their order. Whereas Spearman :math:`\rho`
45
+ considers the correlation between the ranked observations of two variables
46
+ (e.g. the airspeed velocity of a swallow vs. the weight of the coconut it
47
+ carries), Page's :math:`L` is concerned with a trend in an observation
48
+ (e.g. the airspeed velocity of a swallow) across several distinct
49
+ treatments (e.g. carrying each of five coconuts of different weight) even
50
+ as the observation is repeated with multiple subjects (e.g. one European
51
+ swallow and one African swallow).
52
+
53
+ Parameters
54
+ ----------
55
+ data : array-like
56
+ A :math:`m \times n` array; the element in row :math:`i` and
57
+ column :math:`j` is the observation corresponding with subject
58
+ :math:`i` and treatment :math:`j`. By default, the columns are
59
+ assumed to be arranged in order of increasing predicted mean.
60
+
61
+ ranked : boolean, optional
62
+ By default, `data` is assumed to be observations rather than ranks;
63
+ it will be ranked with `scipy.stats.rankdata` along ``axis=1``. If
64
+ `data` is provided in the form of ranks, pass argument ``True``.
65
+
66
+ predicted_ranks : array-like, optional
67
+ The predicted ranks of the column means. If not specified,
68
+ the columns are assumed to be arranged in order of increasing
69
+ predicted mean, so the default `predicted_ranks` are
70
+ :math:`[1, 2, \dots, n-1, n]`.
71
+
72
+ method : {'auto', 'asymptotic', 'exact'}, optional
73
+ Selects the method used to calculate the *p*-value. The following
74
+ options are available.
75
+
76
+ * 'auto': selects between 'exact' and 'asymptotic' to
77
+ achieve reasonably accurate results in reasonable time (default)
78
+ * 'asymptotic': compares the standardized test statistic against
79
+ the normal distribution
80
+ * 'exact': computes the exact *p*-value by comparing the observed
81
+ :math:`L` statistic against those realized by all possible
82
+ permutations of ranks (under the null hypothesis that each
83
+ permutation is equally likely)
84
+
85
+ Returns
86
+ -------
87
+ res : PageTrendTestResult
88
+ An object containing attributes:
89
+
90
+ statistic : float
91
+ Page's :math:`L` test statistic.
92
+ pvalue : float
93
+ The associated *p*-value
94
+ method : {'asymptotic', 'exact'}
95
+ The method used to compute the *p*-value
96
+
97
+ See Also
98
+ --------
99
+ rankdata, friedmanchisquare, spearmanr
100
+
101
+ Notes
102
+ -----
103
+ As noted in [1]_, "the :math:`n` 'treatments' could just as well represent
104
+ :math:`n` objects or events or performances or persons or trials ranked."
105
+ Similarly, the :math:`m` 'subjects' could equally stand for :math:`m`
106
+ "groupings by ability or some other control variable, or judges doing
107
+ the ranking, or random replications of some other sort."
108
+
109
+ The procedure for calculating the :math:`L` statistic, adapted from
110
+ [1]_, is:
111
+
112
+ 1. "Predetermine with careful logic the appropriate hypotheses
113
+ concerning the predicted ordering of the experimental results.
114
+ If no reasonable basis for ordering any treatments is known, the
115
+ :math:`L` test is not appropriate."
116
+ 2. "As in other experiments, determine at what level of confidence
117
+ you will reject the null hypothesis that there is no agreement of
118
+ experimental results with the monotonic hypothesis."
119
+ 3. "Cast the experimental material into a two-way table of :math:`n`
120
+ columns (treatments, objects ranked, conditions) and :math:`m`
121
+ rows (subjects, replication groups, levels of control variables)."
122
+ 4. "When experimental observations are recorded, rank them across each
123
+ row", e.g. ``ranks = scipy.stats.rankdata(data, axis=1)``.
124
+ 5. "Add the ranks in each column", e.g.
125
+ ``colsums = np.sum(ranks, axis=0)``.
126
+ 6. "Multiply each sum of ranks by the predicted rank for that same
127
+ column", e.g. ``products = predicted_ranks * colsums``.
128
+ 7. "Sum all such products", e.g. ``L = products.sum()``.
129
+
130
+ [1]_ continues by suggesting use of the standardized statistic
131
+
132
+ .. math::
133
+
134
+ \chi_L^2 = \frac{\left[12L-3mn(n+1)^2\right]^2}{mn^2(n^2-1)(n+1)}
135
+
136
+ "which is distributed approximately as chi-square with 1 degree of
137
+ freedom. The ordinary use of :math:`\chi^2` tables would be
138
+ equivalent to a two-sided test of agreement. If a one-sided test
139
+ is desired, *as will almost always be the case*, the probability
140
+ discovered in the chi-square table should be *halved*."
141
+
142
+ However, this standardized statistic does not distinguish between the
143
+ observed values being well correlated with the predicted ranks and being
144
+ _anti_-correlated with the predicted ranks. Instead, we follow [2]_
145
+ and calculate the standardized statistic
146
+
147
+ .. math::
148
+
149
+ \Lambda = \frac{L - E_0}{\sqrt{V_0}},
150
+
151
+ where :math:`E_0 = \frac{1}{4} mn(n+1)^2` and
152
+ :math:`V_0 = \frac{1}{144} mn^2(n+1)(n^2-1)`, "which is asymptotically
153
+ normal under the null hypothesis".
154
+
155
+ The *p*-value for ``method='exact'`` is generated by comparing the observed
156
+ value of :math:`L` against the :math:`L` values generated for all
157
+ :math:`(n!)^m` possible permutations of ranks. The calculation is performed
158
+ using the recursive method of [5]_.
159
+
160
+ The *p*-values are not adjusted for the possibility of ties. When
161
+ ties are present, the reported ``'exact'`` *p*-values may be somewhat
162
+ larger (i.e. more conservative) than the true *p*-value [2]_. The
163
+ ``'asymptotic'`` *p*-values, however, tend to be smaller (i.e. less
164
+ conservative) than the ``'exact'`` *p*-values.
165
+
166
+ References
167
+ ----------
168
+ .. [1] Ellis Batten Page, "Ordered hypotheses for multiple treatments:
169
+ a significant test for linear ranks", *Journal of the American
170
+ Statistical Association* 58(301), p. 216--230, 1963.
171
+
172
+ .. [2] Markus Neuhauser, *Nonparametric Statistical Test: A computational
173
+ approach*, CRC Press, p. 150--152, 2012.
174
+
175
+ .. [3] Statext LLC, "Page's L Trend Test - Easy Statistics", *Statext -
176
+ Statistics Study*, https://www.statext.com/practice/PageTrendTest03.php,
177
+ Accessed July 12, 2020.
178
+
179
+ .. [4] "Page's Trend Test", *Wikipedia*, Wikimedia Foundation,
180
+ https://en.wikipedia.org/wiki/Page%27s_trend_test,
181
+ Accessed July 12, 2020.
182
+
183
+ .. [5] Robert E. Odeh, "The exact distribution of Page's L-statistic in
184
+ the two-way layout", *Communications in Statistics - Simulation and
185
+ Computation*, 6(1), p. 49--61, 1977.
186
+
187
+ Examples
188
+ --------
189
+ We use the example from [3]_: 10 students are asked to rate three
190
+ teaching methods - tutorial, lecture, and seminar - on a scale of 1-5,
191
+ with 1 being the lowest and 5 being the highest. We have decided that
192
+ a confidence level of 99% is required to reject the null hypothesis in
193
+ favor of our alternative: that the seminar will have the highest ratings
194
+ and the tutorial will have the lowest. Initially, the data have been
195
+ tabulated with each row representing an individual student's ratings of
196
+ the three methods in the following order: tutorial, lecture, seminar.
197
+
198
+ >>> table = [[3, 4, 3],
199
+ ... [2, 2, 4],
200
+ ... [3, 3, 5],
201
+ ... [1, 3, 2],
202
+ ... [2, 3, 2],
203
+ ... [2, 4, 5],
204
+ ... [1, 2, 4],
205
+ ... [3, 4, 4],
206
+ ... [2, 4, 5],
207
+ ... [1, 3, 4]]
208
+
209
+ Because the tutorial is hypothesized to have the lowest ratings, the
210
+ column corresponding with tutorial rankings should be first; the seminar
211
+ is hypothesized to have the highest ratings, so its column should be last.
212
+ Since the columns are already arranged in this order of increasing
213
+ predicted mean, we can pass the table directly into `page_trend_test`.
214
+
215
+ >>> from scipy.stats import page_trend_test
216
+ >>> res = page_trend_test(table)
217
+ >>> res
218
+ PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
219
+ method='exact')
220
+
221
+ This *p*-value indicates that there is a 0.1819% chance that
222
+ the :math:`L` statistic would reach such an extreme value under the null
223
+ hypothesis. Because 0.1819% is less than 1%, we have evidence to reject
224
+ the null hypothesis in favor of our alternative at a 99% confidence level.
225
+
226
+ The value of the :math:`L` statistic is 133.5. To check this manually,
227
+ we rank the data such that high scores correspond with high ranks, settling
228
+ ties with an average rank:
229
+
230
+ >>> from scipy.stats import rankdata
231
+ >>> ranks = rankdata(table, axis=1)
232
+ >>> ranks
233
+ array([[1.5, 3. , 1.5],
234
+ [1.5, 1.5, 3. ],
235
+ [1.5, 1.5, 3. ],
236
+ [1. , 3. , 2. ],
237
+ [1.5, 3. , 1.5],
238
+ [1. , 2. , 3. ],
239
+ [1. , 2. , 3. ],
240
+ [1. , 2.5, 2.5],
241
+ [1. , 2. , 3. ],
242
+ [1. , 2. , 3. ]])
243
+
244
+ We add the ranks within each column, multiply the sums by the
245
+ predicted ranks, and sum the products.
246
+
247
+ >>> import numpy as np
248
+ >>> m, n = ranks.shape
249
+ >>> predicted_ranks = np.arange(1, n+1)
250
+ >>> L = (predicted_ranks * np.sum(ranks, axis=0)).sum()
251
+ >>> res.statistic == L
252
+ True
253
+
254
+ As presented in [3]_, the asymptotic approximation of the *p*-value is the
255
+ survival function of the normal distribution evaluated at the standardized
256
+ test statistic:
257
+
258
+ >>> from scipy.stats import norm
259
+ >>> E0 = (m*n*(n+1)**2)/4
260
+ >>> V0 = (m*n**2*(n+1)*(n**2-1))/144
261
+ >>> Lambda = (L-E0)/np.sqrt(V0)
262
+ >>> p = norm.sf(Lambda)
263
+ >>> p
264
+ 0.0012693433690751756
265
+
266
+ This does not precisely match the *p*-value reported by `page_trend_test`
267
+ above. The asymptotic distribution is not very accurate, nor conservative,
268
+ for :math:`m \leq 12` and :math:`n \leq 8`, so `page_trend_test` chose to
269
+ use ``method='exact'`` based on the dimensions of the table and the
270
+ recommendations in Page's original paper [1]_. To override
271
+ `page_trend_test`'s choice, provide the `method` argument.
272
+
273
+ >>> res = page_trend_test(table, method="asymptotic")
274
+ >>> res
275
+ PageTrendTestResult(statistic=133.5, pvalue=0.0012693433690751756,
276
+ method='asymptotic')
277
+
278
+ If the data are already ranked, we can pass in the ``ranks`` instead of
279
+ the ``table`` to save computation time.
280
+
281
+ >>> res = page_trend_test(ranks, # ranks of data
282
+ ... ranked=True, # data is already ranked
283
+ ... )
284
+ >>> res
285
+ PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
286
+ method='exact')
287
+
288
+ Suppose the raw data had been tabulated in an order different from the
289
+ order of predicted means, say lecture, seminar, tutorial.
290
+
291
+ >>> table = np.asarray(table)[:, [1, 2, 0]]
292
+
293
+ Since the arrangement of this table is not consistent with the assumed
294
+ ordering, we can either rearrange the table or provide the
295
+ `predicted_ranks`. Remembering that the lecture is predicted
296
+ to have the middle rank, the seminar the highest, and tutorial the lowest,
297
+ we pass:
298
+
299
+ >>> res = page_trend_test(table, # data as originally tabulated
300
+ ... predicted_ranks=[2, 3, 1], # our predicted order
301
+ ... )
302
+ >>> res
303
+ PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
304
+ method='exact')
305
+
306
+ """
307
+
308
+ # Possible values of the method parameter and the corresponding function
309
+ # used to evaluate the p value
310
+ methods = {"asymptotic": _l_p_asymptotic,
311
+ "exact": _l_p_exact,
312
+ "auto": None}
313
+ if method not in methods:
314
+ raise ValueError(f"`method` must be in {set(methods)}")
315
+
316
+ ranks = np.asarray(data)
317
+ if ranks.ndim != 2: # TODO: relax this to accept 3d arrays?
318
+ raise ValueError("`data` must be a 2d array.")
319
+
320
+ m, n = ranks.shape
321
+ if m < 2 or n < 3:
322
+ raise ValueError("Page's L is only appropriate for data with two "
323
+ "or more rows and three or more columns.")
324
+
325
+ if np.any(np.isnan(data)):
326
+ raise ValueError("`data` contains NaNs, which cannot be ranked "
327
+ "meaningfully")
328
+
329
+ # ensure NumPy array and rank the data if it's not already ranked
330
+ if ranked:
331
+ # Only a basic check on whether data is ranked. Checking that the data
332
+ # is properly ranked could take as much time as ranking it.
333
+ if not (ranks.min() >= 1 and ranks.max() <= ranks.shape[1]):
334
+ raise ValueError("`data` is not properly ranked. Rank the data or "
335
+ "pass `ranked=False`.")
336
+ else:
337
+ ranks = scipy.stats.rankdata(data, axis=-1)
338
+
339
+ # generate predicted ranks if not provided, ensure valid NumPy array
340
+ if predicted_ranks is None:
341
+ predicted_ranks = np.arange(1, n+1)
342
+ else:
343
+ predicted_ranks = np.asarray(predicted_ranks)
344
+ if (predicted_ranks.ndim < 1 or
345
+ (set(predicted_ranks) != set(range(1, n+1)) or
346
+ len(predicted_ranks) != n)):
347
+ raise ValueError(f"`predicted_ranks` must include each integer "
348
+ f"from 1 to {n} (the number of columns in "
349
+ f"`data`) exactly once.")
350
+
351
+ if not isinstance(ranked, bool):
352
+ raise TypeError("`ranked` must be boolean.")
353
+
354
+ # Calculate the L statistic
355
+ L = _l_vectorized(ranks, predicted_ranks)
356
+
357
+ # Calculate the p-value
358
+ if method == "auto":
359
+ method = _choose_method(ranks)
360
+ p_fun = methods[method] # get the function corresponding with the method
361
+ p = p_fun(L, m, n)
362
+
363
+ page_result = PageTrendTestResult(statistic=L, pvalue=p, method=method)
364
+ return page_result
365
+
366
+
367
+ def _choose_method(ranks):
368
+ '''Choose method for computing p-value automatically'''
369
+ m, n = ranks.shape
370
+ if n > 8 or (m > 12 and n > 3) or m > 20: # as in [1], [4]
371
+ method = "asymptotic"
372
+ else:
373
+ method = "exact"
374
+ return method
375
+
376
+
377
+ def _l_vectorized(ranks, predicted_ranks):
378
+ '''Calculate Page's L statistic for each page of a 3d array'''
379
+ colsums = ranks.sum(axis=-2, keepdims=True)
380
+ products = predicted_ranks * colsums
381
+ Ls = products.sum(axis=-1)
382
+ Ls = Ls[0] if Ls.size == 1 else Ls.ravel()
383
+ return Ls
384
+
385
+
386
+ def _l_p_asymptotic(L, m, n):
387
+ '''Calculate the p-value of Page's L from the asymptotic distribution'''
388
+ # Using [1] as a reference, the asymptotic p-value would be calculated as:
389
+ # chi_L = (12*L - 3*m*n*(n+1)**2)**2/(m*n**2*(n**2-1)*(n+1))
390
+ # p = chi2.sf(chi_L, df=1, loc=0, scale=1)/2
391
+ # but this is insensitive to the direction of the hypothesized ranking
392
+
393
+ # See [2] page 151
394
+ E0 = (m*n*(n+1)**2)/4
395
+ V0 = (m*n**2*(n+1)*(n**2-1))/144
396
+ Lambda = (L-E0)/np.sqrt(V0)
397
+ # This is a one-sided "greater" test - calculate the probability that the
398
+ # L statistic under H0 would be greater than the observed L statistic
399
+ p = norm.sf(Lambda)
400
+ return p
401
+
402
+
403
+ def _l_p_exact(L, m, n):
404
+ '''Calculate the p-value of Page's L exactly'''
405
+ # [1] uses m, n; [5] uses n, k.
406
+ # Switch convention here because exact calculation code references [5].
407
+ L, n, k = int(L), int(m), int(n)
408
+ _pagel_state.set_k(k)
409
+ return _pagel_state.sf(L, n)
410
+
411
+
412
+ class _PageL:
413
+ '''Maintains state between `page_trend_test` executions'''
414
+
415
+ def __init__(self):
416
+ '''Lightweight initialization'''
417
+ self.all_pmfs = {}
418
+
419
+ def set_k(self, k):
420
+ '''Calculate lower and upper limits of L for single row'''
421
+ self.k = k
422
+ # See [5] top of page 52
423
+ self.a, self.b = (k*(k+1)*(k+2))//6, (k*(k+1)*(2*k+1))//6
424
+
425
+ def sf(self, l, n):
426
+ '''Survival function of Page's L statistic'''
427
+ ps = [self.pmf(l, n) for l in range(l, n*self.b + 1)]
428
+ return np.sum(ps)
429
+
430
+ def p_l_k_1(self):
431
+ '''Relative frequency of each L value over all possible single rows'''
432
+
433
+ # See [5] Equation (6)
434
+ ranks = range(1, self.k+1)
435
+ # generate all possible rows of length k
436
+ rank_perms = np.array(list(permutations(ranks)))
437
+ # compute Page's L for all possible rows
438
+ Ls = (ranks*rank_perms).sum(axis=1)
439
+ # count occurrences of each L value
440
+ counts = np.histogram(Ls, np.arange(self.a-0.5, self.b+1.5))[0]
441
+ # factorial(k) is number of possible permutations
442
+ return counts/math.factorial(self.k)
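`p_l_k_1` enumerates all k! orderings of a single row to obtain the exact single-row distribution of L. The same enumeration for k = 3 is written out directly below so the histogram trick is easier to follow; this is an illustration, not a replacement for the class method.

# Exact distribution of Page's L for one row of k = 3 treatments.
import math
from itertools import permutations
import numpy as np

k = 3
ranks = range(1, k + 1)
Ls = [sum(r * p for r, p in zip(ranks, perm)) for perm in permutations(ranks)]
a, b = k*(k+1)*(k+2)//6, k*(k+1)*(2*k+1)//6   # support of L is [a, b] = [10, 14]
counts = np.histogram(Ls, np.arange(a - 0.5, b + 1.5))[0]
print(dict(zip(range(a, b + 1), counts / math.factorial(k))))
# {10: 1/6, 11: 1/3, 12: 0, 13: 1/3, 14: 1/6}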
443
+
444
+ def pmf(self, l, n):
445
+ '''Recursive function to evaluate p(l, k, n); see [5] Equation 1'''
446
+
447
+ if n not in self.all_pmfs:
448
+ self.all_pmfs[n] = {}
449
+ if self.k not in self.all_pmfs[n]:
450
+ self.all_pmfs[n][self.k] = {}
451
+
452
+ # Cache results to avoid repeating calculation. Initially this was
453
+ # written with lru_cache, but this seems faster? Also, we could add
454
+ # an option to save this for future lookup.
455
+ if l in self.all_pmfs[n][self.k]:
456
+ return self.all_pmfs[n][self.k][l]
457
+
458
+ if n == 1:
459
+ ps = self.p_l_k_1() # [5] Equation 6
460
+ ls = range(self.a, self.b+1)
461
+ # not fast, but we'll only be here once
462
+ self.all_pmfs[n][self.k] = {l: p for l, p in zip(ls, ps)}
463
+ return self.all_pmfs[n][self.k][l]
464
+
465
+ p = 0
466
+ low = max(l-(n-1)*self.b, self.a) # [5] Equation 2
467
+ high = min(l-(n-1)*self.a, self.b)
468
+
469
+ # [5] Equation 1
470
+ for t in range(low, high+1):
471
+ p1 = self.pmf(l-t, n-1)
472
+ p2 = self.pmf(t, 1)
473
+ p += p1*p2
474
+ self.all_pmfs[n][self.k][l] = p
475
+ return p
476
+
477
+
478
+ # Maintain state for faster repeat calls to page_trend_test w/ method='exact'
479
+ _pagel_state = _PageL()
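Because `_pagel_state` is module-level, repeated exact-mode calls reuse the cached pmf values. The hedged sketch below queries the private helper directly to reproduce the exact p-value from the docstring example (L = 133.5 truncates to 133, with 10 rows and 3 treatments); the public entry point remains `scipy.stats.page_trend_test`.

# Illustration only: use the private exact-distribution helper directly.
from scipy.stats._page_trend_test import _PageL

state = _PageL()
state.set_k(3)            # three treatments per row
print(state.sf(133, 10))  # P(L >= 133) for 10 rows, about 0.0018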
.venv/Lib/site-packages/scipy/stats/_qmc.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_qmc_cy.cp39-win_amd64.dll.a ADDED
Binary file (1.54 kB). View file
 
.venv/Lib/site-packages/scipy/stats/_qmc_cy.cp39-win_amd64.pyd ADDED
Binary file (409 kB). View file
 
.venv/Lib/site-packages/scipy/stats/_qmc_cy.pyi ADDED
@@ -0,0 +1,54 @@
1
+ import numpy as np
2
+ from scipy._lib._util import DecimalNumber, IntNumber
3
+
4
+
5
+ def _cy_wrapper_centered_discrepancy(
6
+ sample: np.ndarray,
7
+ iterative: bool,
8
+ workers: IntNumber,
9
+ ) -> float: ...
10
+
11
+
12
+ def _cy_wrapper_wrap_around_discrepancy(
13
+ sample: np.ndarray,
14
+ iterative: bool,
15
+ workers: IntNumber,
16
+ ) -> float: ...
17
+
18
+
19
+ def _cy_wrapper_mixture_discrepancy(
20
+ sample: np.ndarray,
21
+ iterative: bool,
22
+ workers: IntNumber,
23
+ ) -> float: ...
24
+
25
+
26
+ def _cy_wrapper_l2_star_discrepancy(
27
+ sample: np.ndarray,
28
+ iterative: bool,
29
+ workers: IntNumber,
30
+ ) -> float: ...
31
+
32
+
33
+ def _cy_wrapper_update_discrepancy(
34
+ x_new_view: np.ndarray,
35
+ sample_view: np.ndarray,
36
+ initial_disc: DecimalNumber,
37
+ ) -> float: ...
38
+
39
+
40
+ def _cy_van_der_corput(
41
+ n: IntNumber,
42
+ base: IntNumber,
43
+ start_index: IntNumber,
44
+ workers: IntNumber,
45
+ ) -> np.ndarray: ...
46
+
47
+
48
+ def _cy_van_der_corput_scrambled(
49
+ n: IntNumber,
50
+ base: IntNumber,
51
+ start_index: IntNumber,
52
+ permutations: np.ndarray,
53
+ workers: IntNumber,
54
+ ) -> np.ndarray: ...
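These stubs annotate the Cython routines behind the public QMC discrepancy helpers. A brief usage sketch through the public `scipy.stats.qmc.discrepancy` front end follows, where the `method` argument selects among the wrapped discrepancy measures.

# Public-API usage; `method` picks the underlying discrepancy routine.
from scipy.stats import qmc

sample = qmc.Sobol(d=2, seed=1234).random(64)   # 64 points in the unit square
for method in ("CD", "WD", "MD", "L2-star"):
    print(method, qmc.discrepancy(sample, method=method))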
.venv/Lib/site-packages/scipy/stats/_qmvnt.py ADDED
@@ -0,0 +1,533 @@
1
+ # Integration of multivariate normal and t distributions.
2
+
3
+ # Adapted from the MATLAB original implementations by Dr. Alan Genz.
4
+
5
+ # http://www.math.wsu.edu/faculty/genz/software/software.html
6
+
7
+ # Copyright (C) 2013, Alan Genz, All rights reserved.
8
+ # Python implementation is copyright (C) 2022, Robert Kern, All rights
9
+ # reserved.
10
+
11
+ # Redistribution and use in source and binary forms, with or without
12
+ # modification, are permitted provided the following conditions are met:
13
+ # 1. Redistributions of source code must retain the above copyright
14
+ # notice, this list of conditions and the following disclaimer.
15
+ # 2. Redistributions in binary form must reproduce the above copyright
16
+ # notice, this list of conditions and the following disclaimer in
17
+ # the documentation and/or other materials provided with the
18
+ # distribution.
19
+ # 3. The contributor name(s) may not be used to endorse or promote
20
+ # products derived from this software without specific prior
21
+ # written permission.
22
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25
+ # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26
+ # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27
+ # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28
+ # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
29
+ # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
30
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
31
+ # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE
32
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
+
34
+
35
+ import numpy as np
36
+
37
+ from scipy.fft import fft, ifft
38
+ from scipy.special import gammaincinv, ndtr, ndtri
39
+ from scipy.stats._qmc import primes_from_2_to
40
+
41
+
42
+ phi = ndtr
43
+ phinv = ndtri
44
+
45
+
46
+ def _factorize_int(n):
47
+ """Return a sorted list of the unique prime factors of a positive integer.
48
+ """
49
+ # NOTE: There are lots of faster ways to do this, but this isn't terrible.
50
+ factors = set()
51
+ for p in primes_from_2_to(int(np.sqrt(n)) + 1):
52
+ while not (n % p):
53
+ factors.add(p)
54
+ n //= p
55
+ if n == 1:
56
+ break
57
+ if n != 1:
58
+ factors.add(n)
59
+ return sorted(factors)
60
+
61
+
62
+ def _primitive_root(p):
63
+ """Compute a primitive root of the prime number `p`.
64
+
65
+ Used in the CBC lattice construction.
66
+
67
+ References
68
+ ----------
69
+ .. [1] https://en.wikipedia.org/wiki/Primitive_root_modulo_n
70
+ """
71
+ # p is prime
72
+ pm = p - 1
73
+ factors = _factorize_int(pm)
74
+ n = len(factors)
75
+ r = 2
76
+ k = 0
77
+ while k < n:
78
+ d = pm // factors[k]
79
+ # pow() doesn't like numpy scalar types.
80
+ rd = pow(int(r), int(d), int(p))
81
+ if rd == 1:
82
+ r += 1
83
+ k = 0
84
+ else:
85
+ k += 1
86
+ return r
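A primitive root r of a prime p generates every nonzero residue mod p, which is exactly the property the loop above verifies through the prime factors of p - 1. Below is a small check for p = 13 using the private helpers (illustration only).

# Sanity check of the primitive-root search for a small prime.
from scipy.stats._qmvnt import _factorize_int, _primitive_root

p = 13
r = _primitive_root(p)
print(_factorize_int(p - 1))   # prime factors of 12: [2, 3]
print(r, {pow(int(r), e, p) for e in range(1, p)} == set(range(1, p)))  # True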
87
+
88
+
89
+ def _cbc_lattice(n_dim, n_qmc_samples):
90
+ """Compute a QMC lattice generator using a Fast CBC construction.
91
+
92
+ Parameters
93
+ ----------
94
+ n_dim : int > 0
95
+ The number of dimensions for the lattice.
96
+ n_qmc_samples : int > 0
97
+ The desired number of QMC samples. This will be rounded down to the
98
+ nearest prime to enable the CBC construction.
99
+
100
+ Returns
101
+ -------
102
+ q : float array : shape=(n_dim,)
103
+ The lattice generator vector. All values are in the open interval
104
+ `(0, 1)`.
105
+ actual_n_qmc_samples : int
106
+ The prime number of QMC samples that must be used with this lattice,
107
+ no more, no less.
108
+
109
+ References
110
+ ----------
111
+ .. [1] Nuyens, D. and Cools, R. "Fast Component-by-Component Construction,
112
+ a Reprise for Different Kernels", In H. Niederreiter and D. Talay,
113
+ editors, Monte-Carlo and Quasi-Monte Carlo Methods 2004,
114
+ Springer-Verlag, 2006, 371-385.
115
+ """
116
+ # Round down to the nearest prime number.
117
+ primes = primes_from_2_to(n_qmc_samples + 1)
118
+ n_qmc_samples = primes[-1]
119
+
120
+ bt = np.ones(n_dim)
121
+ gm = np.hstack([1.0, 0.8 ** np.arange(n_dim - 1)])
122
+ q = 1
123
+ w = 0
124
+ z = np.arange(1, n_dim + 1)
125
+ m = (n_qmc_samples - 1) // 2
126
+ g = _primitive_root(n_qmc_samples)
127
+ # Slightly faster way to compute perm[j] = pow(g, j, n_qmc_samples)
128
+ # Shame that we don't have modulo pow() implemented as a ufunc.
129
+ perm = np.ones(m, dtype=int)
130
+ for j in range(m - 1):
131
+ perm[j + 1] = (g * perm[j]) % n_qmc_samples
132
+ perm = np.minimum(n_qmc_samples - perm, perm)
133
+ pn = perm / n_qmc_samples
134
+ c = pn * pn - pn + 1.0 / 6
135
+ fc = fft(c)
136
+ for s in range(1, n_dim):
137
+ reordered = np.hstack([
138
+ c[:w+1][::-1],
139
+ c[w+1:m][::-1],
140
+ ])
141
+ q = q * (bt[s-1] + gm[s-1] * reordered)
142
+ w = ifft(fc * fft(q)).real.argmin()
143
+ z[s] = perm[w]
144
+ q = z / n_qmc_samples
145
+ return q, n_qmc_samples
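The generator vector `q` defines a rank-1 lattice whose points are the fractional parts of i*q; downstream, `_qmvn` and `_qmvt` add a random shift per batch before use. The hedged sketch below materializes such a shifted point set from the private helper; the dimension and sample count are arbitrary.

# Build a randomly shifted rank-1 lattice from the CBC generator vector.
import numpy as np
from scipy.stats._qmvnt import _cbc_lattice

rng = np.random.default_rng(1234)
q, n = _cbc_lattice(3, 101)               # n is rounded down to a prime (101)
i = np.arange(1, n + 1)[:, np.newaxis]
points = (i * q + rng.random(3)) % 1.0    # one random shift per dimension
print(points.shape)                       # (101, 3), all entries in [0, 1)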
146
+
147
+
148
+ # Note: this function is not currently used or tested by any SciPy code. It is
149
+ # included in this file to facilitate the development of a parameter for users
150
+ # to set the desired CDF accuracy, but must be reviewed and tested before use.
151
+ def _qauto(func, covar, low, high, rng, error=1e-3, limit=10_000, **kwds):
152
+ """Automatically rerun the integration to get the required error bound.
153
+
154
+ Parameters
155
+ ----------
156
+ func : callable
157
+ Either :func:`_qmvn` or :func:`_qmvt`.
158
+ covar, low, high : array
159
+ As specified in :func:`_qmvn` and :func:`_qmvt`.
160
+ rng : Generator, optional
161
+ A NumPy random ``Generator`` passed through to `func` for its randomized QMC batches.
162
+ error : float > 0
163
+ The desired error bound.
164
+ limit : int > 0
165
+ The rough limit of the number of integration points to consider. The
166
+ integration will stop looping once this limit has been *exceeded*.
167
+ **kwds :
168
+ Other keyword arguments to pass to `func`. When using :func:`_qmvt`, be
169
+ sure to include ``nu=`` as one of these.
170
+
171
+ Returns
172
+ -------
173
+ prob : float
174
+ The estimated probability mass within the bounds.
175
+ est_error : float
176
+ 3 times the standard error of the batch estimates.
177
+ n_samples : int
178
+ The number of integration points actually used.
179
+ """
180
+ n = len(covar)
181
+ n_samples = 0
182
+ if n == 1:
183
+ prob = phi(high) - phi(low)
184
+ # More or less
185
+ est_error = 1e-15
186
+ else:
187
+ mi = min(limit, n * 1000)
188
+ prob = 0.0
189
+ est_error = 1.0
190
+ ei = 0.0
191
+ while est_error > error and n_samples < limit:
192
+ mi = round(np.sqrt(2) * mi)
193
+ pi, ei, ni = func(mi, covar, low, high, rng=rng, **kwds)
194
+ n_samples += ni
195
+ wt = 1.0 / (1 + (ei / est_error)**2)
196
+ prob += wt * (pi - prob)
197
+ est_error = np.sqrt(wt) * ei
198
+ return prob, est_error, n_samples
199
+
200
+
201
+ # Note: this function is not currently used or tested by any SciPy code. It is
202
+ # included in this file to facilitate the resolution of gh-8367, gh-16142, and
203
+ # possibly gh-14286, but must be reviewed and tested before use.
204
+ def _qmvn(m, covar, low, high, rng, lattice='cbc', n_batches=10):
205
+ """Multivariate normal integration over box bounds.
206
+
207
+ Parameters
208
+ ----------
209
+ m : int > n_batches
210
+ The number of points to sample. This number will be divided into
211
+ `n_batches` batches that apply random offsets of the sampling lattice
212
+ for each batch in order to estimate the error.
213
+ covar : (n, n) float array
214
+ Possibly singular, positive semidefinite symmetric covariance matrix.
215
+ low, high : (n,) float array
216
+ The low and high integration bounds.
217
+ rng : Generator, optional
218
+ A NumPy random ``Generator`` used to draw the random shifts applied to the QMC lattice.
219
+ lattice : 'cbc' or callable
220
+ The type of lattice rule to use to construct the integration points.
221
+ n_batches : int > 0, optional
222
+ The number of QMC batches to apply.
223
+
224
+ Returns
225
+ -------
226
+ prob : float
227
+ The estimated probability mass within the bounds.
228
+ est_error : float
229
+ 3 times the standard error of the batch estimates.
230
+ """
231
+ cho, lo, hi = _permuted_cholesky(covar, low, high)
232
+ n = cho.shape[0]
233
+ ct = cho[0, 0]
234
+ c = phi(lo[0] / ct)
235
+ d = phi(hi[0] / ct)
236
+ ci = c
237
+ dci = d - ci
238
+ prob = 0.0
239
+ error_var = 0.0
240
+ q, n_qmc_samples = _cbc_lattice(n - 1, max(m // n_batches, 1))
241
+ y = np.zeros((n - 1, n_qmc_samples))
242
+ i_samples = np.arange(n_qmc_samples) + 1
243
+ for j in range(n_batches):
244
+ c = np.full(n_qmc_samples, ci)
245
+ dc = np.full(n_qmc_samples, dci)
246
+ pv = dc.copy()
247
+ for i in range(1, n):
248
+ # Pseudorandomly-shifted lattice coordinate.
249
+ z = q[i - 1] * i_samples + rng.random()
250
+ # Fast remainder(z, 1.0)
251
+ z -= z.astype(int)
252
+ # Tent periodization transform.
253
+ x = abs(2 * z - 1)
254
+ y[i - 1, :] = phinv(c + x * dc)
255
+ s = cho[i, :i] @ y[:i, :]
256
+ ct = cho[i, i]
257
+ c = phi((lo[i] - s) / ct)
258
+ d = phi((hi[i] - s) / ct)
259
+ dc = d - c
260
+ pv = pv * dc
261
+ # Accumulate the mean and error variances with online formulations.
262
+ d = (pv.mean() - prob) / (j + 1)
263
+ prob += d
264
+ error_var = (j - 1) * error_var / (j + 1) + d * d
265
+ # Error bounds are 3 times the standard error of the estimates.
266
+ est_error = 3 * np.sqrt(error_var)
267
+ n_samples = n_qmc_samples * n_batches
268
+ return prob, est_error, n_samples
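`_qmvn` is marked as not yet used or tested by SciPy, but its estimate can still be sanity-checked against the public `multivariate_normal.cdf` on a small semi-infinite box; both numbers are Monte Carlo estimates, so agreement is only expected to within the reported error.

# Hedged comparison of the private _qmvn estimate with the public CDF.
import numpy as np
from scipy.stats import multivariate_normal
from scipy.stats._qmvnt import _qmvn

cov = np.array([[1.0, 0.6], [0.6, 1.0]])
low = np.array([-np.inf, -np.inf])
high = np.array([0.5, 1.0])
rng = np.random.default_rng(1234)

prob, err, n_used = _qmvn(4000, cov, low, high, rng=rng)
ref = multivariate_normal(cov=cov).cdf(high)
print(prob, ref, err)   # the two probabilities should agree to roughly err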
269
+
270
+
271
+ # Note: this function is not currently used or tested by any SciPy code. It is
272
+ # included in this file to facilitate the resolution of gh-8367, gh-16142, and
273
+ # possibly gh-14286, but must be reviewed and tested before use.
274
+ def _mvn_qmc_integrand(covar, low, high, use_tent=False):
275
+ """Transform the multivariate normal integration into a QMC integrand over
276
+ a unit hypercube.
277
+
278
+ The dimensionality of the resulting hypercube integration domain is one
279
+ less than the dimensionality of the original integrand. Note that this
280
+ transformation subsumes the integration bounds in order to account for
281
+ infinite bounds. The QMC integration one does with the returned integrand
282
+ should be on the unit hypercube.
283
+
284
+ Parameters
285
+ ----------
286
+ covar : (n, n) float array
287
+ Possibly singular, positive semidefinite symmetric covariance matrix.
288
+ low, high : (n,) float array
289
+ The low and high integration bounds.
290
+ use_tent : bool, optional
291
+ If True, then use tent periodization. Only helpful for lattice rules.
292
+
293
+ Returns
294
+ -------
295
+ integrand : Callable[[NDArray], NDArray]
296
+ The QMC-integrable integrand. It takes an
297
+ ``(n_qmc_samples, ndim_integrand)`` array of QMC samples in the unit
298
+ hypercube and returns the ``(n_qmc_samples,)`` evaluations of the integrand at these
299
+ QMC points.
300
+ ndim_integrand : int
301
+ The dimensionality of the integrand. Equal to ``n-1``.
302
+ """
303
+ cho, lo, hi = _permuted_cholesky(covar, low, high)
304
+ n = cho.shape[0]
305
+ ndim_integrand = n - 1
306
+ ct = cho[0, 0]
307
+ c = phi(lo[0] / ct)
308
+ d = phi(hi[0] / ct)
309
+ ci = c
310
+ dci = d - ci
311
+
312
+ def integrand(*zs):
313
+ ndim_qmc = len(zs)
314
+ n_qmc_samples = len(np.atleast_1d(zs[0]))
315
+ assert ndim_qmc == ndim_integrand
316
+ y = np.zeros((ndim_qmc, n_qmc_samples))
317
+ c = np.full(n_qmc_samples, ci)
318
+ dc = np.full(n_qmc_samples, dci)
319
+ pv = dc.copy()
320
+ for i in range(1, n):
321
+ if use_tent:
322
+ # Tent periodization transform.
323
+ x = abs(2 * zs[i-1] - 1)
324
+ else:
325
+ x = zs[i-1]
326
+ y[i - 1, :] = phinv(c + x * dc)
327
+ s = cho[i, :i] @ y[:i, :]
328
+ ct = cho[i, i]
329
+ c = phi((lo[i] - s) / ct)
330
+ d = phi((hi[i] - s) / ct)
331
+ dc = d - c
332
+ pv = pv * dc
333
+ return pv
334
+
335
+ return integrand, ndim_integrand
336
+
337
+
338
+ def _qmvt(m, nu, covar, low, high, rng, lattice='cbc', n_batches=10):
339
+ """Multivariate t integration over box bounds.
340
+
341
+ Parameters
342
+ ----------
343
+ m : int > n_batches
344
+ The number of points to sample. This number will be divided into
345
+ `n_batches` batches that apply random offsets of the sampling lattice
346
+ for each batch in order to estimate the error.
347
+ nu : float >= 0
348
+ The shape parameter of the multivariate t distribution.
349
+ covar : (n, n) float array
350
+ Possibly singular, positive semidefinite symmetric covariance matrix.
351
+ low, high : (n,) float array
352
+ The low and high integration bounds.
353
+ rng : Generator, optional
354
+ A NumPy random ``Generator`` used to draw the random shifts applied to the QMC lattice.
355
+ lattice : 'cbc' or callable
356
+ The type of lattice rule to use to construct the integration points.
357
+ n_batches : int > 0, optional
358
+ The number of QMC batches to apply.
359
+
360
+ Returns
361
+ -------
362
+ prob : float
363
+ The estimated probability mass within the bounds.
364
+ est_error : float
365
+ 3 times the standard error of the batch estimates.
366
+ n_samples : int
367
+ The number of samples actually used.
368
+ """
369
+ sn = max(1.0, np.sqrt(nu))
370
+ low = np.asarray(low, dtype=np.float64)
371
+ high = np.asarray(high, dtype=np.float64)
372
+ cho, lo, hi = _permuted_cholesky(covar, low / sn, high / sn)
373
+ n = cho.shape[0]
374
+ prob = 0.0
375
+ error_var = 0.0
376
+ q, n_qmc_samples = _cbc_lattice(n, max(m // n_batches, 1))
377
+ i_samples = np.arange(n_qmc_samples) + 1
378
+ for j in range(n_batches):
379
+ pv = np.ones(n_qmc_samples)
380
+ s = np.zeros((n, n_qmc_samples))
381
+ for i in range(n):
382
+ # Pseudorandomly-shifted lattice coordinate.
383
+ z = q[i] * i_samples + rng.random()
384
+ # Fast remainder(z, 1.0)
385
+ z -= z.astype(int)
386
+ # Tent periodization transform.
387
+ x = abs(2 * z - 1)
388
+ # FIXME: Lift the i==0 case out of the loop to make the logic
389
+ # easier to follow.
390
+ if i == 0:
391
+ # We'll use one of the QR variates to pull out the
392
+ # t-distribution scaling.
393
+ if nu > 0:
394
+ r = np.sqrt(2 * gammaincinv(nu / 2, x))
395
+ else:
396
+ r = np.ones_like(x)
397
+ else:
398
+ y = phinv(c + x * dc) # noqa: F821
399
+ with np.errstate(invalid='ignore'):
400
+ s[i:, :] += cho[i:, i - 1][:, np.newaxis] * y
401
+ si = s[i, :]
402
+
403
+ c = np.ones(n_qmc_samples)
404
+ d = np.ones(n_qmc_samples)
405
+ with np.errstate(invalid='ignore'):
406
+ lois = lo[i] * r - si
407
+ hiis = hi[i] * r - si
408
+ c[lois < -9] = 0.0
409
+ d[hiis < -9] = 0.0
410
+ lo_mask = abs(lois) < 9
411
+ hi_mask = abs(hiis) < 9
412
+ c[lo_mask] = phi(lois[lo_mask])
413
+ d[hi_mask] = phi(hiis[hi_mask])
414
+
415
+ dc = d - c
416
+ pv *= dc
417
+
418
+ # Accumulate the mean and error variances with online formulations.
419
+ d = (pv.mean() - prob) / (j + 1)
420
+ prob += d
421
+ error_var = (j - 1) * error_var / (j + 1) + d * d
422
+ # Error bounds are 3 times the standard error of the estimates.
423
+ est_error = 3 * np.sqrt(error_var)
424
+ n_samples = n_qmc_samples * n_batches
425
+ return prob, est_error, n_samples
426
+
427
+
+ def _permuted_cholesky(covar, low, high, tol=1e-10):
+     """Compute a scaled, permuted Cholesky factor, with integration bounds.
+
+     The scaling and permuting of the dimensions accomplishes part of the
+     transformation of the original integration problem into a more numerically
+     tractable form. The lower-triangular Cholesky factor will then be used in
+     the subsequent integration. The integration bounds will be scaled and
+     permuted as well.
+
+     Parameters
+     ----------
+     covar : (n, n) float array
+         Possibly singular, positive semidefinite symmetric covariance matrix.
+     low, high : (n,) float array
+         The low and high integration bounds.
+     tol : float, optional
+         The singularity tolerance.
+
+     Returns
+     -------
+     cho : (n, n) float array
+         Lower Cholesky factor, scaled and permuted.
+     new_low, new_high : (n,) float array
+         The scaled and permuted low and high integration bounds.
+     """
+     # Make copies for outputting.
+     cho = np.array(covar, dtype=np.float64)
+     new_lo = np.array(low, dtype=np.float64)
+     new_hi = np.array(high, dtype=np.float64)
+     n = cho.shape[0]
+     if cho.shape != (n, n):
+         raise ValueError("expected a square symmetric array")
+     if new_lo.shape != (n,) or new_hi.shape != (n,):
+         raise ValueError(
+             "expected integration boundaries the same dimensions "
+             "as the covariance matrix"
+         )
+     # Scale by the sqrt of the diagonal.
+     dc = np.sqrt(np.maximum(np.diag(cho), 0.0))
+     # But don't divide by 0.
+     dc[dc == 0.0] = 1.0
+     new_lo /= dc
+     new_hi /= dc
+     cho /= dc
+     cho /= dc[:, np.newaxis]
+
+     y = np.zeros(n)
+     sqtp = np.sqrt(2 * np.pi)
+     for k in range(n):
+         epk = (k + 1) * tol
+         im = k
+         ck = 0.0
+         dem = 1.0
+         s = 0.0
+         lo_m = 0.0
+         hi_m = 0.0
+         for i in range(k, n):
+             if cho[i, i] > tol:
+                 ci = np.sqrt(cho[i, i])
+                 if i > 0:
+                     s = cho[i, :k] @ y[:k]
+                 lo_i = (new_lo[i] - s) / ci
+                 hi_i = (new_hi[i] - s) / ci
+                 de = phi(hi_i) - phi(lo_i)
+                 if de <= dem:
+                     ck = ci
+                     dem = de
+                     lo_m = lo_i
+                     hi_m = hi_i
+                     im = i
+         if im > k:
+             # Swap im and k
+             cho[im, im] = cho[k, k]
+             _swap_slices(cho, np.s_[im, :k], np.s_[k, :k])
+             _swap_slices(cho, np.s_[im + 1:, im], np.s_[im + 1:, k])
+             _swap_slices(cho, np.s_[k + 1:im, k], np.s_[im, k + 1:im])
+             _swap_slices(new_lo, k, im)
+             _swap_slices(new_hi, k, im)
+         if ck > epk:
+             cho[k, k] = ck
+             cho[k, k + 1:] = 0.0
+             for i in range(k + 1, n):
+                 cho[i, k] /= ck
+                 cho[i, k + 1:i + 1] -= cho[i, k] * cho[k + 1:i + 1, k]
+             if abs(dem) > tol:
+                 y[k] = ((np.exp(-lo_m * lo_m / 2) - np.exp(-hi_m * hi_m / 2)) /
+                         (sqtp * dem))
+             else:
+                 y[k] = (lo_m + hi_m) / 2
+                 if lo_m < -10:
+                     y[k] = hi_m
+                 elif hi_m > 10:
+                     y[k] = lo_m
+             cho[k, :k + 1] /= ck
+             new_lo[k] /= ck
+             new_hi[k] /= ck
+         else:
+             cho[k:, k] = 0.0
+             y[k] = (new_lo[k] + new_hi[k]) / 2
+     return cho, new_lo, new_hi
+
+
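The scaling step at the top of ``_permuted_cholesky`` (divide the rows, columns, and bounds by the square roots of the diagonal) turns the covariance into a correlation matrix before factoring. A minimal NumPy sketch of just that step, on an arbitrary well-conditioned matrix; the bound-driven pivoting and the handling of singular diagonals above are exactly what a plain ``np.linalg.cholesky`` call does not provide::

    import numpy as np

    # Arbitrary symmetric positive definite covariance and bounds.
    covar = np.array([[4.0, 1.2, 0.6],
                      [1.2, 1.0, 0.3],
                      [0.6, 0.3, 2.25]])
    low = np.array([-1.0, -2.0, -0.5])
    high = np.array([2.0, 1.0, 3.0])

    # Scale by the sqrt of the diagonal, as in the routine above.
    d = np.sqrt(np.diag(covar))
    corr = covar / d / d[:, np.newaxis]
    scaled_low, scaled_high = low / d, high / d

    # With no singular diagonal and no reordering, the factor built above
    # plays the role of the ordinary lower Cholesky factor of ``corr``.
    L = np.linalg.cholesky(corr)
    assert np.allclose(L @ L.T, corr)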
+ def _swap_slices(x, slc1, slc2):
+     t = x[slc1].copy()
+     x[slc1] = x[slc2].copy()
+     x[slc2] = t
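``np.s_`` simply builds slice objects, so the helper above can swap arbitrary sub-blocks of an array in place, which is how the pivoting loop exchanges rows of the partially built factor. A standalone copy of the helper for illustration, with arbitrary example values::

    import numpy as np

    def _swap_slices(x, slc1, slc2):
        t = x[slc1].copy()
        x[slc1] = x[slc2].copy()
        x[slc2] = t

    a = np.arange(16.0).reshape(4, 4)
    # Exchange the first two entries of row 3 with those of row 0, in place.
    _swap_slices(a, np.s_[3, :2], np.s_[0, :2])
    # a[0, :2] is now [12., 13.] and a[3, :2] is [0., 1.].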
.venv/Lib/site-packages/scipy/stats/_relative_risk.py ADDED
@@ -0,0 +1,263 @@
+ import operator
+ from dataclasses import dataclass
+ import numpy as np
+ from scipy.special import ndtri
+ from ._common import ConfidenceInterval
+
+
+ def _validate_int(n, bound, name):
+     msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
+     try:
+         n = operator.index(n)
+     except TypeError:
+         raise TypeError(msg) from None
+     if n < bound:
+         raise ValueError(msg)
+     return n
+
+
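``operator.index`` accepts only objects that are losslessly integers (including NumPy integer scalars), so the validator rejects floats with a ``TypeError`` and below-bound values with a ``ValueError``. A sketch of the expected behavior, read off from the code above and assuming ``_validate_int`` is in scope::

    >>> _validate_int(5, 1, "exposed_total")
    5
    >>> _validate_int(5.0, 1, "exposed_total")
    Traceback (most recent call last):
        ...
    TypeError: exposed_total must be an integer not less than 1, but got 5.0
    >>> _validate_int(0, 1, "exposed_total")
    Traceback (most recent call last):
        ...
    ValueError: exposed_total must be an integer not less than 1, but got 0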
+ @dataclass
+ class RelativeRiskResult:
+     """
+     Result of `scipy.stats.contingency.relative_risk`.
+
+     Attributes
+     ----------
+     relative_risk : float
+         This is::
+
+             (exposed_cases/exposed_total) / (control_cases/control_total)
+
+     exposed_cases : int
+         The number of "cases" (i.e. occurrence of disease or other event
+         of interest) among the sample of "exposed" individuals.
+     exposed_total : int
+         The total number of "exposed" individuals in the sample.
+     control_cases : int
+         The number of "cases" among the sample of "control" or non-exposed
+         individuals.
+     control_total : int
+         The total number of "control" individuals in the sample.
+
+     Methods
+     -------
+     confidence_interval :
+         Compute the confidence interval for the relative risk estimate.
+     """
+
+     relative_risk: float
+     exposed_cases: int
+     exposed_total: int
+     control_cases: int
+     control_total: int
+
+     def confidence_interval(self, confidence_level=0.95):
+         """
+         Compute the confidence interval for the relative risk.
+
+         The confidence interval is computed using the Katz method
+         (i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).
+
+         Parameters
+         ----------
+         confidence_level : float, optional
+             The confidence level to use for the confidence interval.
+             Default is 0.95.
+
+         Returns
+         -------
+         ci : ConfidenceInterval instance
+             The return value is an object with attributes ``low`` and
+             ``high`` that hold the confidence interval.
+
+         References
+         ----------
+         .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
+                confidence intervals for the risk ratio in cohort studies",
+                Biometrics, 34, 469-474 (1978).
+         .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
+                CRC Press LLC, Boca Raton, FL, USA (1996).
+
+         Examples
+         --------
+         >>> from scipy.stats.contingency import relative_risk
+         >>> result = relative_risk(exposed_cases=10, exposed_total=75,
+         ...                        control_cases=12, control_total=225)
+         >>> result.relative_risk
+         2.5
+         >>> result.confidence_interval()
+         ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
+         """
+         if not 0 <= confidence_level <= 1:
+             raise ValueError('confidence_level must be in the interval '
+                              '[0, 1].')
+
+         # Handle edge cases where either exposed_cases or control_cases
+         # is zero. We follow the convention of the R function riskratio
+         # from the epitools library.
+         if self.exposed_cases == 0 and self.control_cases == 0:
+             # relative risk is nan.
+             return ConfidenceInterval(low=np.nan, high=np.nan)
+         elif self.exposed_cases == 0:
+             # relative risk is 0.
+             return ConfidenceInterval(low=0.0, high=np.nan)
+         elif self.control_cases == 0:
+             # relative risk is inf
+             return ConfidenceInterval(low=np.nan, high=np.inf)
+
+         alpha = 1 - confidence_level
+         z = ndtri(1 - alpha/2)
+         rr = self.relative_risk
+
+         # Estimate of the variance of log(rr) is
+         #     var(log(rr)) = 1/exposed_cases - 1/exposed_total +
+         #                    1/control_cases - 1/control_total
+         # and the standard error is the square root of that.
+         se = np.sqrt(1/self.exposed_cases - 1/self.exposed_total +
+                      1/self.control_cases - 1/self.control_total)
+         delta = z*se
+         katz_lo = rr*np.exp(-delta)
+         katz_hi = rr*np.exp(delta)
+         return ConfidenceInterval(low=katz_lo, high=katz_hi)
+
+
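The Katz interval computed above is just ``exp(log(rr) ± z*se)`` with the standard error given in the comment. A short check by hand, using the counts from the docstring example (10 of 75 exposed, 12 of 225 controls), reproduces the interval reported there, roughly (1.126, 5.550)::

    import numpy as np
    from scipy.special import ndtri

    exposed_cases, exposed_total = 10, 75
    control_cases, control_total = 12, 225

    rr = (exposed_cases / exposed_total) / (control_cases / control_total)
    se = np.sqrt(1/exposed_cases - 1/exposed_total
                 + 1/control_cases - 1/control_total)
    z = ndtri(1 - 0.05/2)              # about 1.96 for a 95% interval
    low, high = rr * np.exp(-z * se), rr * np.exp(z * se)
    # rr == 2.5; (low, high) is approximately (1.126, 5.550).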
+ def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
+     """
+     Compute the relative risk (also known as the risk ratio).
+
+     This function computes the relative risk associated with a 2x2
+     contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Instead
+     of accepting a table as an argument, the individual numbers that are
+     used to compute the relative risk are given as separate parameters.
+     This is to avoid the ambiguity of which row or column of the contingency
+     table corresponds to the "exposed" cases and which corresponds to the
+     "control" cases. Unlike, say, the odds ratio, the relative risk is not
+     invariant under an interchange of the rows or columns.
+
+     Parameters
+     ----------
+     exposed_cases : nonnegative int
+         The number of "cases" (i.e. occurrence of disease or other event
+         of interest) among the sample of "exposed" individuals.
+     exposed_total : positive int
+         The total number of "exposed" individuals in the sample.
+     control_cases : nonnegative int
+         The number of "cases" among the sample of "control" or non-exposed
+         individuals.
+     control_total : positive int
+         The total number of "control" individuals in the sample.
+
+     Returns
+     -------
+     result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
+         The object has the float attribute ``relative_risk``, which is::
+
+             rr = (exposed_cases/exposed_total) / (control_cases/control_total)
+
+         The object also has the method ``confidence_interval`` to compute
+         the confidence interval of the relative risk for a given confidence
+         level.
+
+     See Also
+     --------
+     odds_ratio
+
+     Notes
+     -----
+     The R package epitools has the function `riskratio`, which accepts
+     a table with the following layout::
+
+                         disease=0   disease=1
+         exposed=0 (ref)    n00         n01
+         exposed=1          n10         n11
+
+     With a 2x2 table in the above format, the estimate of the CI is
+     computed by `riskratio` when the argument method="wald" is given,
+     or with the function `riskratio.wald`.
+
+     For example, in a test of the incidence of lung cancer among a
+     sample of smokers and nonsmokers, the "exposed" category would
+     correspond to "is a smoker" and the "disease" category would
+     correspond to "has or had lung cancer".
+
+     To pass the same data to ``relative_risk``, use::
+
+         relative_risk(n11, n10 + n11, n01, n00 + n01)
+
+     .. versionadded:: 1.7.0
+
+     References
+     ----------
+     .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
+            (second edition), Wiley, Hoboken, NJ, USA (2007).
+     .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
+            CRC Press LLC, Boca Raton, FL, USA (1996).
+
+     Examples
+     --------
+     >>> from scipy.stats.contingency import relative_risk
+
+     This example is from Example 3.1 of [2]_. The results of a heart
+     disease study are summarized in the following table::
+
+                  High CAT  Low CAT  Total
+                  --------  -------  -----
+         CHD         27       44       71
+         No CHD      95      443      538
+
+         Total      122      487      609
+
+     CHD is coronary heart disease, and CAT refers to the level of
+     circulating catecholamine. CAT is the "exposure" variable, and
+     high CAT is the "exposed" category. So the data from the table
+     to be passed to ``relative_risk`` is::
+
+         exposed_cases = 27
+         exposed_total = 122
+         control_cases = 44
+         control_total = 487
+
+     >>> result = relative_risk(27, 122, 44, 487)
+     >>> result.relative_risk
+     2.4495156482861398
+
+     Find the confidence interval for the relative risk.
+
+     >>> result.confidence_interval(confidence_level=0.95)
+     ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)
+
+     The interval does not contain 1, so the data supports the statement
+     that high CAT is associated with greater risk of CHD.
+     """
+     # Relative risk is a trivial calculation. The nontrivial part is in the
+     # `confidence_interval` method of the RelativeRiskResult class.
+
+     exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
+     exposed_total = _validate_int(exposed_total, 1, "exposed_total")
+     control_cases = _validate_int(control_cases, 0, "control_cases")
+     control_total = _validate_int(control_total, 1, "control_total")
+
+     if exposed_cases > exposed_total:
+         raise ValueError('exposed_cases must not exceed exposed_total.')
+     if control_cases > control_total:
+         raise ValueError('control_cases must not exceed control_total.')
+
+     if exposed_cases == 0 and control_cases == 0:
+         # relative risk is 0/0.
+         rr = np.nan
+     elif exposed_cases == 0:
+         # relative risk is 0/nonzero
+         rr = 0.0
+     elif control_cases == 0:
+         # relative risk is nonzero/0.
+         rr = np.inf
+     else:
+         p1 = exposed_cases / exposed_total
+         p2 = control_cases / control_total
+         rr = p1 / p2
+     return RelativeRiskResult(relative_risk=rr,
+                               exposed_cases=exposed_cases,
+                               exposed_total=exposed_total,
+                               control_cases=control_cases,
+                               control_total=control_total)
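To make the mapping in the Notes section concrete, the sketch below arranges the heart-disease counts from the Examples section in the epitools layout (rows ``exposed=0`` then ``exposed=1``, columns ``disease=0`` then ``disease=1``) and applies the ``relative_risk(n11, n10 + n11, n01, n00 + n01)`` recipe; the counts are taken from the table above and the call reproduces the 2.4495... estimate shown there::

    import numpy as np
    from scipy.stats.contingency import relative_risk

    # epitools layout:       disease=0  disease=1
    table = np.array([[443, 44],    # exposed=0 (ref): low CAT
                      [ 95, 27]])   # exposed=1:       high CAT
    (n00, n01), (n10, n11) = table

    result = relative_risk(n11, n10 + n11, n01, n00 + n01)
    # Equivalent to relative_risk(27, 122, 44, 487);
    # result.relative_risk is approximately 2.4495.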