ayousanz committed
Commit 45b0b28 · verified · 1 Parent(s): b35b196

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. .venv/Lib/site-packages/scipy/sparse/linalg/tests/propack_test_data.npz +3 -0
  3. .venv/Lib/site-packages/scipy/spatial/_distance_pybind.cp39-win_amd64.pyd +3 -0
  4. .venv/Lib/site-packages/scipy/special/__pycache__/__init__.cpython-39.pyc +0 -0
  5. .venv/Lib/site-packages/scipy/special/__pycache__/_orthogonal.cpython-39.pyc +0 -0
  6. .venv/Lib/site-packages/scipy/special/__pycache__/_sf_error.cpython-39.pyc +0 -0
  7. .venv/Lib/site-packages/scipy/special/__pycache__/_spfun_stats.cpython-39.pyc +0 -0
  8. .venv/Lib/site-packages/scipy/special/__pycache__/_spherical_bessel.cpython-39.pyc +0 -0
  9. .venv/Lib/site-packages/scipy/special/__pycache__/_support_alternative_backends.cpython-39.pyc +0 -0
  10. .venv/Lib/site-packages/scipy/stats/__init__.py +643 -0
  11. .venv/Lib/site-packages/scipy/stats/_ansari_swilk_statistics.cp39-win_amd64.dll.a +0 -0
  12. .venv/Lib/site-packages/scipy/stats/_ansari_swilk_statistics.cp39-win_amd64.pyd +0 -0
  13. .venv/Lib/site-packages/scipy/stats/_axis_nan_policy.py +642 -0
  14. .venv/Lib/site-packages/scipy/stats/_biasedurn.cp39-win_amd64.dll.a +0 -0
  15. .venv/Lib/site-packages/scipy/stats/_biasedurn.cp39-win_amd64.pyd +0 -0
  16. .venv/Lib/site-packages/scipy/stats/_biasedurn.pxd +27 -0
  17. .venv/Lib/site-packages/scipy/stats/_binned_statistic.py +795 -0
  18. .venv/Lib/site-packages/scipy/stats/_binomtest.py +375 -0
  19. .venv/Lib/site-packages/scipy/stats/_bws_test.py +177 -0
  20. .venv/Lib/site-packages/scipy/stats/_censored_data.py +459 -0
  21. .venv/Lib/site-packages/scipy/stats/_common.py +5 -0
  22. .venv/Lib/site-packages/scipy/stats/_constants.py +39 -0
  23. .venv/Lib/site-packages/scipy/stats/_continuous_distns.py +0 -0
  24. .venv/Lib/site-packages/scipy/stats/_covariance.py +633 -0
  25. .venv/Lib/site-packages/scipy/stats/_crosstab.py +204 -0
  26. .venv/Lib/site-packages/scipy/stats/_discrete_distns.py +1954 -0
  27. .venv/Lib/site-packages/scipy/stats/_distn_infrastructure.py +0 -0
  28. .venv/Lib/site-packages/scipy/stats/_distr_params.py +288 -0
  29. .venv/Lib/site-packages/scipy/stats/_entropy.py +423 -0
  30. .venv/Lib/site-packages/scipy/stats/_fit.py +1351 -0
  31. .venv/Lib/site-packages/scipy/stats/_generate_pyx.py +27 -0
  32. .venv/Lib/site-packages/scipy/stats/_hypotests.py +2021 -0
  33. .venv/Lib/site-packages/scipy/stats/_kde.py +728 -0
  34. .venv/Lib/site-packages/scipy/stats/_ksstats.py +600 -0
  35. .venv/Lib/site-packages/scipy/stats/_mannwhitneyu.py +519 -0
  36. .venv/Lib/site-packages/scipy/stats/_morestats.py +0 -0
  37. .venv/Lib/site-packages/scipy/stats/_mstats_basic.py +0 -0
  38. .venv/Lib/site-packages/scipy/stats/_mstats_extras.py +521 -0
  39. .venv/Lib/site-packages/scipy/stats/_multicomp.py +459 -0
  40. .venv/Lib/site-packages/scipy/stats/_multivariate.py +0 -0
  41. .venv/Lib/site-packages/scipy/stats/_mvn.cp39-win_amd64.dll.a +0 -0
  42. .venv/Lib/site-packages/scipy/stats/_mvn.cp39-win_amd64.pyd +0 -0
  43. .venv/Lib/site-packages/scipy/stats/_odds_ratio.py +482 -0
  44. .venv/Lib/site-packages/scipy/stats/_page_trend_test.py +479 -0
  45. .venv/Lib/site-packages/scipy/stats/_qmc.py +0 -0
  46. .venv/Lib/site-packages/scipy/stats/_qmc_cy.cp39-win_amd64.dll.a +0 -0
  47. .venv/Lib/site-packages/scipy/stats/_qmc_cy.cp39-win_amd64.pyd +0 -0
  48. .venv/Lib/site-packages/scipy/stats/_qmc_cy.pyi +54 -0
  49. .venv/Lib/site-packages/scipy/stats/_qmvnt.py +533 -0
  50. .venv/Lib/site-packages/scipy/stats/_relative_risk.py +263 -0
.gitattributes CHANGED
@@ -90,3 +90,4 @@ reference_sample_wavs/syuukovoice_200918_3_01.wav filter=lfs diff=lfs merge=lfs
  .venv/Lib/site-packages/torio/lib/_torio_ffmpeg5.pyd filter=lfs diff=lfs merge=lfs -text
  .venv/Lib/site-packages/torio/lib/_torio_ffmpeg6.pyd filter=lfs diff=lfs merge=lfs -text
  .venv/Lib/site-packages/torch/lib/cudnn_adv64_9.dll filter=lfs diff=lfs merge=lfs -text
+ .venv/Lib/site-packages/scipy/spatial/_distance_pybind.cp39-win_amd64.pyd filter=lfs diff=lfs merge=lfs -text
.venv/Lib/site-packages/scipy/sparse/linalg/tests/propack_test_data.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bfe34d9a92353e08f400f3837136e553a8e91d441186913d39b59bf8a627bba3
+ size 600350
.venv/Lib/site-packages/scipy/spatial/_distance_pybind.cp39-win_amd64.pyd ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7bdc657c7357110d74977999bce28b06bdcc7dedf675339d5462ec030c6da0ac
+ size 1372160
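
Note: both files above are stored with Git LFS, so the commit only adds small pointer files in the `version` / `oid` / `size` format shown. As a minimal sketch (not part of this commit), such a pointer can be parsed with plain Python; the path in the comment is only an example:

def read_lfs_pointer(path):
    """Parse a Git LFS pointer file into a {key: value} dict."""
    fields = {}
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            if key:
                fields[key] = value
    return fields

# e.g. read_lfs_pointer("path/to/propack_test_data.npz") ->
# {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:...', 'size': '600350'}
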
.venv/Lib/site-packages/scipy/special/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (31.8 kB).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_orthogonal.cpython-39.pyc ADDED
Binary file (74.5 kB).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_sf_error.cpython-39.pyc ADDED
Binary file (784 Bytes).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_spfun_stats.cpython-39.pyc ADDED
Binary file (2.6 kB).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_spherical_bessel.cpython-39.pyc ADDED
Binary file (10.7 kB).
 
.venv/Lib/site-packages/scipy/special/__pycache__/_support_alternative_backends.cpython-39.pyc ADDED
Binary file (2.24 kB).
 
.venv/Lib/site-packages/scipy/stats/__init__.py ADDED
@@ -0,0 +1,643 @@
1
+ """
2
+ .. _statsrefmanual:
3
+
4
+ ==========================================
5
+ Statistical functions (:mod:`scipy.stats`)
6
+ ==========================================
7
+
8
+ .. currentmodule:: scipy.stats
9
+
10
+ This module contains a large number of probability distributions,
11
+ summary and frequency statistics, correlation functions and statistical
12
+ tests, masked statistics, kernel density estimation, quasi-Monte Carlo
13
+ functionality, and more.
14
+
15
+ Statistics is a very large area, and there are topics that are out of scope
16
+ for SciPy and are covered by other packages. Some of the most important ones
17
+ are:
18
+
19
+ - `statsmodels <https://www.statsmodels.org/stable/index.html>`__:
20
+ regression, linear models, time series analysis, extensions to topics
21
+ also covered by ``scipy.stats``.
22
+ - `Pandas <https://pandas.pydata.org/>`__: tabular data, time series
23
+ functionality, interfaces to other statistical languages.
24
+ - `PyMC <https://docs.pymc.io/>`__: Bayesian statistical
25
+ modeling, probabilistic machine learning.
26
+ - `scikit-learn <https://scikit-learn.org/>`__: classification, regression,
27
+ model selection.
28
+ - `Seaborn <https://seaborn.pydata.org/>`__: statistical data visualization.
29
+ - `rpy2 <https://rpy2.github.io/>`__: Python to R bridge.
30
+
31
+
32
+ Probability distributions
33
+ =========================
34
+
35
+ Each univariate distribution is an instance of a subclass of `rv_continuous`
36
+ (`rv_discrete` for discrete distributions):
37
+
38
+ .. autosummary::
39
+ :toctree: generated/
40
+
41
+ rv_continuous
42
+ rv_discrete
43
+ rv_histogram
44
+
45
+ Continuous distributions
46
+ ------------------------
47
+
48
+ .. autosummary::
49
+ :toctree: generated/
50
+
51
+ alpha -- Alpha
52
+ anglit -- Anglit
53
+ arcsine -- Arcsine
54
+ argus -- Argus
55
+ beta -- Beta
56
+ betaprime -- Beta Prime
57
+ bradford -- Bradford
58
+ burr -- Burr (Type III)
59
+ burr12 -- Burr (Type XII)
60
+ cauchy -- Cauchy
61
+ chi -- Chi
62
+ chi2 -- Chi-squared
63
+ cosine -- Cosine
64
+ crystalball -- Crystalball
65
+ dgamma -- Double Gamma
66
+ dweibull -- Double Weibull
67
+ erlang -- Erlang
68
+ expon -- Exponential
69
+ exponnorm -- Exponentially Modified Normal
70
+ exponweib -- Exponentiated Weibull
71
+ exponpow -- Exponential Power
72
+ f -- F (Snedecor F)
73
+ fatiguelife -- Fatigue Life (Birnbaum-Saunders)
74
+ fisk -- Fisk
75
+ foldcauchy -- Folded Cauchy
76
+ foldnorm -- Folded Normal
77
+ genlogistic -- Generalized Logistic
78
+ gennorm -- Generalized normal
79
+ genpareto -- Generalized Pareto
80
+ genexpon -- Generalized Exponential
81
+ genextreme -- Generalized Extreme Value
82
+ gausshyper -- Gauss Hypergeometric
83
+ gamma -- Gamma
84
+ gengamma -- Generalized gamma
85
+ genhalflogistic -- Generalized Half Logistic
86
+ genhyperbolic -- Generalized Hyperbolic
87
+ geninvgauss -- Generalized Inverse Gaussian
88
+ gibrat -- Gibrat
89
+ gompertz -- Gompertz (Truncated Gumbel)
90
+ gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
91
+ gumbel_l -- Left Sided Gumbel, etc.
92
+ halfcauchy -- Half Cauchy
93
+ halflogistic -- Half Logistic
94
+ halfnorm -- Half Normal
95
+ halfgennorm -- Generalized Half Normal
96
+ hypsecant -- Hyperbolic Secant
97
+ invgamma -- Inverse Gamma
98
+ invgauss -- Inverse Gaussian
99
+ invweibull -- Inverse Weibull
100
+ jf_skew_t -- Jones and Faddy Skew-T
101
+ johnsonsb -- Johnson SB
102
+ johnsonsu -- Johnson SU
103
+ kappa4 -- Kappa 4 parameter
104
+ kappa3 -- Kappa 3 parameter
105
+ ksone -- Distribution of Kolmogorov-Smirnov one-sided test statistic
106
+ kstwo -- Distribution of Kolmogorov-Smirnov two-sided test statistic
107
+ kstwobign -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
108
+ laplace -- Laplace
109
+ laplace_asymmetric -- Asymmetric Laplace
110
+ levy -- Levy
111
+ levy_l
112
+ levy_stable
113
+ logistic -- Logistic
114
+ loggamma -- Log-Gamma
115
+ loglaplace -- Log-Laplace (Log Double Exponential)
116
+ lognorm -- Log-Normal
117
+ loguniform -- Log-Uniform
118
+ lomax -- Lomax (Pareto of the second kind)
119
+ maxwell -- Maxwell
120
+ mielke -- Mielke's Beta-Kappa
121
+ moyal -- Moyal
122
+ nakagami -- Nakagami
123
+ ncx2 -- Non-central chi-squared
124
+ ncf -- Non-central F
125
+ nct -- Non-central Student's T
126
+ norm -- Normal (Gaussian)
127
+ norminvgauss -- Normal Inverse Gaussian
128
+ pareto -- Pareto
129
+ pearson3 -- Pearson type III
130
+ powerlaw -- Power-function
131
+ powerlognorm -- Power log normal
132
+ powernorm -- Power normal
133
+ rdist -- R-distribution
134
+ rayleigh -- Rayleigh
135
+ rel_breitwigner -- Relativistic Breit-Wigner
136
+ rice -- Rice
137
+ recipinvgauss -- Reciprocal Inverse Gaussian
138
+ semicircular -- Semicircular
139
+ skewcauchy -- Skew Cauchy
140
+ skewnorm -- Skew normal
141
+ studentized_range -- Studentized Range
142
+ t -- Student's T
143
+ trapezoid -- Trapezoidal
144
+ triang -- Triangular
145
+ truncexpon -- Truncated Exponential
146
+ truncnorm -- Truncated Normal
147
+ truncpareto -- Truncated Pareto
148
+ truncweibull_min -- Truncated minimum Weibull distribution
149
+ tukeylambda -- Tukey-Lambda
150
+ uniform -- Uniform
151
+ vonmises -- Von-Mises (Circular)
152
+ vonmises_line -- Von-Mises (Line)
153
+ wald -- Wald
154
+ weibull_min -- Minimum Weibull (see Frechet)
155
+ weibull_max -- Maximum Weibull (see Frechet)
156
+ wrapcauchy -- Wrapped Cauchy
157
+
158
+ The ``fit`` method of the univariate continuous distributions uses
159
+ maximum likelihood estimation to fit the distribution to a data set.
160
+ The ``fit`` method can accept regular data or *censored data*.
161
+ Censored data is represented with instances of the `CensoredData`
162
+ class.
163
+
164
+ .. autosummary::
165
+ :toctree: generated/
166
+
167
+ CensoredData
168
+
169
+
170
+ Multivariate distributions
171
+ --------------------------
172
+
173
+ .. autosummary::
174
+ :toctree: generated/
175
+
176
+ multivariate_normal -- Multivariate normal distribution
177
+ matrix_normal -- Matrix normal distribution
178
+ dirichlet -- Dirichlet
179
+ dirichlet_multinomial -- Dirichlet multinomial distribution
180
+ wishart -- Wishart
181
+ invwishart -- Inverse Wishart
182
+ multinomial -- Multinomial distribution
183
+ special_ortho_group -- SO(N) group
184
+ ortho_group -- O(N) group
185
+ unitary_group -- U(N) group
186
+ random_correlation -- random correlation matrices
187
+ multivariate_t -- Multivariate t-distribution
188
+ multivariate_hypergeom -- Multivariate hypergeometric distribution
189
+ random_table -- Distribution of random tables with given marginals
190
+ uniform_direction -- Uniform distribution on S(N-1)
191
+ vonmises_fisher -- Von Mises-Fisher distribution
192
+
193
+ `scipy.stats.multivariate_normal` methods accept instances
194
+ of the following class to represent the covariance.
195
+
196
+ .. autosummary::
197
+ :toctree: generated/
198
+
199
+ Covariance -- Representation of a covariance matrix
200
+
201
+
202
+ Discrete distributions
203
+ ----------------------
204
+
205
+ .. autosummary::
206
+ :toctree: generated/
207
+
208
+ bernoulli -- Bernoulli
209
+ betabinom -- Beta-Binomial
210
+ betanbinom -- Beta-Negative Binomial
211
+ binom -- Binomial
212
+ boltzmann -- Boltzmann (Truncated Discrete Exponential)
213
+ dlaplace -- Discrete Laplacian
214
+ geom -- Geometric
215
+ hypergeom -- Hypergeometric
216
+ logser -- Logarithmic (Log-Series, Series)
217
+ nbinom -- Negative Binomial
218
+ nchypergeom_fisher -- Fisher's Noncentral Hypergeometric
219
+ nchypergeom_wallenius -- Wallenius's Noncentral Hypergeometric
220
+ nhypergeom -- Negative Hypergeometric
221
+ planck -- Planck (Discrete Exponential)
222
+ poisson -- Poisson
223
+ randint -- Discrete Uniform
224
+ skellam -- Skellam
225
+ yulesimon -- Yule-Simon
226
+ zipf -- Zipf (Zeta)
227
+ zipfian -- Zipfian
228
+
229
+
230
+ An overview of statistical functions is given below. Many of these functions
231
+ have a similar version in `scipy.stats.mstats` which works for masked arrays.
232
+
233
+ Summary statistics
234
+ ==================
235
+
236
+ .. autosummary::
237
+ :toctree: generated/
238
+
239
+ describe -- Descriptive statistics
240
+ gmean -- Geometric mean
241
+ hmean -- Harmonic mean
242
+ pmean -- Power mean
243
+ kurtosis -- Fisher or Pearson kurtosis
244
+ mode -- Modal value
245
+ moment -- Central moment
246
+ expectile -- Expectile
247
+ skew -- Skewness
248
+ kstat --
249
+ kstatvar --
250
+ tmean -- Truncated arithmetic mean
251
+ tvar -- Truncated variance
252
+ tmin --
253
+ tmax --
254
+ tstd --
255
+ tsem --
256
+ variation -- Coefficient of variation
257
+ find_repeats
258
+ rankdata
259
+ tiecorrect
260
+ trim_mean
261
+ gstd -- Geometric Standard Deviation
262
+ iqr
263
+ sem
264
+ bayes_mvs
265
+ mvsdist
266
+ entropy
267
+ differential_entropy
268
+ median_abs_deviation
269
+
270
+ Frequency statistics
271
+ ====================
272
+
273
+ .. autosummary::
274
+ :toctree: generated/
275
+
276
+ cumfreq
277
+ percentileofscore
278
+ scoreatpercentile
279
+ relfreq
280
+
281
+ .. autosummary::
282
+ :toctree: generated/
283
+
284
+ binned_statistic -- Compute a binned statistic for a set of data.
285
+ binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
286
+ binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
287
+
288
+ Hypothesis Tests and related functions
289
+ ======================================
290
+ SciPy has many functions for performing hypothesis tests that return a
291
+ test statistic and a p-value, and several of them return confidence intervals
292
+ and/or other related information.
293
+
294
+ The headings below are based on common uses of the functions within, but due to
295
+ the wide variety of statistical procedures, any attempt at coarse-grained
296
+ categorization will be imperfect. Also, note that tests within the same heading
297
+ are not interchangeable in general (e.g. many have different distributional
298
+ assumptions).
299
+
300
+ One Sample Tests / Paired Sample Tests
301
+ --------------------------------------
302
+ One sample tests are typically used to assess whether a single sample was
303
+ drawn from a specified distribution or a distribution with specified properties
304
+ (e.g. zero mean).
305
+
306
+ .. autosummary::
307
+ :toctree: generated/
308
+
309
+ ttest_1samp
310
+ binomtest
311
+ quantile_test
312
+ skewtest
313
+ kurtosistest
314
+ normaltest
315
+ jarque_bera
316
+ shapiro
317
+ anderson
318
+ cramervonmises
319
+ ks_1samp
320
+ goodness_of_fit
321
+ chisquare
322
+ power_divergence
323
+
324
+ Paired sample tests are often used to assess whether two samples were drawn
325
+ from the same distribution; they differ from the independent sample tests below
326
+ in that each observation in one sample is treated as paired with a
327
+ closely-related observation in the other sample (e.g. when environmental
328
+ factors are controlled between observations within a pair but not among pairs).
329
+ They can also be interpreted or used as one-sample tests (e.g. tests on the
330
+ mean or median of *differences* between paired observations).
331
+
332
+ .. autosummary::
333
+ :toctree: generated/
334
+
335
+ ttest_rel
336
+ wilcoxon
337
+
338
+ Association/Correlation Tests
339
+ -----------------------------
340
+
341
+ These tests are often used to assess whether there is a relationship (e.g.
342
+ linear) between paired observations in multiple samples or among the
343
+ coordinates of multivariate observations.
344
+
345
+ .. autosummary::
346
+ :toctree: generated/
347
+
348
+ linregress
349
+ pearsonr
350
+ spearmanr
351
+ pointbiserialr
352
+ kendalltau
353
+ weightedtau
354
+ somersd
355
+ siegelslopes
356
+ theilslopes
357
+ page_trend_test
358
+ multiscale_graphcorr
359
+
360
+ These association tests are designed to work with samples in the form of contingency
361
+ tables. Supporting functions are available in `scipy.stats.contingency`.
362
+
363
+ .. autosummary::
364
+ :toctree: generated/
365
+
366
+ chi2_contingency
367
+ fisher_exact
368
+ barnard_exact
369
+ boschloo_exact
370
+
371
+ Independent Sample Tests
372
+ ------------------------
373
+ Independent sample tests are typically used to assess whether multiple samples
374
+ were independently drawn from the same distribution or different distributions
375
+ with a shared property (e.g. equal means).
376
+
377
+ Some tests are specifically for comparing two samples.
378
+
379
+ .. autosummary::
380
+ :toctree: generated/
381
+
382
+ ttest_ind_from_stats
383
+ poisson_means_test
384
+ ttest_ind
385
+ mannwhitneyu
386
+ bws_test
387
+ ranksums
388
+ brunnermunzel
389
+ mood
390
+ ansari
391
+ cramervonmises_2samp
392
+ epps_singleton_2samp
393
+ ks_2samp
394
+ kstest
395
+
396
+ Others are generalized to multiple samples.
397
+
398
+ .. autosummary::
399
+ :toctree: generated/
400
+
401
+ f_oneway
402
+ tukey_hsd
403
+ dunnett
404
+ kruskal
405
+ alexandergovern
406
+ fligner
407
+ levene
408
+ bartlett
409
+ median_test
410
+ friedmanchisquare
411
+ anderson_ksamp
412
+
413
+ Resampling and Monte Carlo Methods
414
+ ----------------------------------
415
+ The following functions can reproduce the p-value and confidence interval
416
+ results of most of the functions above, and often produce accurate results in a
417
+ wider variety of conditions. They can also be used to perform hypothesis tests
418
+ and generate confidence intervals for custom statistics. This flexibility comes
419
+ at the cost of greater computational requirements and stochastic results.
420
+
421
+ .. autosummary::
422
+ :toctree: generated/
423
+
424
+ monte_carlo_test
425
+ permutation_test
426
+ bootstrap
427
+
428
+ Instances of the following object can be passed into some hypothesis test
429
+ functions to perform a resampling or Monte Carlo version of the hypothesis
430
+ test.
431
+
432
+ .. autosummary::
433
+ :toctree: generated/
434
+
435
+ MonteCarloMethod
436
+ PermutationMethod
437
+ BootstrapMethod
438
+
439
+ Multiple Hypothesis Testing and Meta-Analysis
440
+ ---------------------------------------------
441
+ These functions are for assessing the results of individual tests as a whole.
442
+ Functions for performing specific multiple hypothesis tests (e.g. post hoc
443
+ tests) are listed above.
444
+
445
+ .. autosummary::
446
+ :toctree: generated/
447
+
448
+ combine_pvalues
449
+ false_discovery_control
450
+
451
+
452
+ The following functions are related to the tests above but do not belong in the
453
+ above categories.
454
+
455
+ Quasi-Monte Carlo
456
+ =================
457
+
458
+ .. toctree::
459
+ :maxdepth: 4
460
+
461
+ stats.qmc
462
+
463
+ Contingency Tables
464
+ ==================
465
+
466
+ .. toctree::
467
+ :maxdepth: 4
468
+
469
+ stats.contingency
470
+
471
+ Masked statistics functions
472
+ ===========================
473
+
474
+ .. toctree::
475
+
476
+ stats.mstats
477
+
478
+
479
+ Other statistical functionality
480
+ ===============================
481
+
482
+ Transformations
483
+ ---------------
484
+
485
+ .. autosummary::
486
+ :toctree: generated/
487
+
488
+ boxcox
489
+ boxcox_normmax
490
+ boxcox_llf
491
+ yeojohnson
492
+ yeojohnson_normmax
493
+ yeojohnson_llf
494
+ obrientransform
495
+ sigmaclip
496
+ trimboth
497
+ trim1
498
+ zmap
499
+ zscore
500
+ gzscore
501
+
502
+ Statistical distances
503
+ ---------------------
504
+
505
+ .. autosummary::
506
+ :toctree: generated/
507
+
508
+ wasserstein_distance
509
+ wasserstein_distance_nd
510
+ energy_distance
511
+
512
+ Sampling
513
+ --------
514
+
515
+ .. toctree::
516
+ :maxdepth: 4
517
+
518
+ stats.sampling
519
+
520
+ Random variate generation / CDF Inversion
521
+ -----------------------------------------
522
+
523
+ .. autosummary::
524
+ :toctree: generated/
525
+
526
+ rvs_ratio_uniforms
527
+
528
+ Fitting / Survival Analysis
529
+ ---------------------------
530
+
531
+ .. autosummary::
532
+ :toctree: generated/
533
+
534
+ fit
535
+ ecdf
536
+ logrank
537
+
538
+ Directional statistical functions
539
+ ---------------------------------
540
+
541
+ .. autosummary::
542
+ :toctree: generated/
543
+
544
+ directional_stats
545
+ circmean
546
+ circvar
547
+ circstd
548
+
549
+ Sensitivity Analysis
550
+ --------------------
551
+
552
+ .. autosummary::
553
+ :toctree: generated/
554
+
555
+ sobol_indices
556
+
557
+ Plot-tests
558
+ ----------
559
+
560
+ .. autosummary::
561
+ :toctree: generated/
562
+
563
+ ppcc_max
564
+ ppcc_plot
565
+ probplot
566
+ boxcox_normplot
567
+ yeojohnson_normplot
568
+
569
+ Univariate and multivariate kernel density estimation
570
+ -----------------------------------------------------
571
+
572
+ .. autosummary::
573
+ :toctree: generated/
574
+
575
+ gaussian_kde
576
+
577
+ Warnings / Errors used in :mod:`scipy.stats`
578
+ --------------------------------------------
579
+
580
+ .. autosummary::
581
+ :toctree: generated/
582
+
583
+ DegenerateDataWarning
584
+ ConstantInputWarning
585
+ NearConstantInputWarning
586
+ FitError
587
+
588
+ Result classes used in :mod:`scipy.stats`
589
+ -----------------------------------------
590
+
591
+ .. warning::
592
+
593
+ These classes are private, but they are included here because instances
594
+ of them are returned by other statistical functions. User import and
595
+ instantiation is not supported.
596
+
597
+ .. toctree::
598
+ :maxdepth: 2
599
+
600
+ stats._result_classes
601
+
602
+ """ # noqa: E501
603
+
604
+ from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
605
+ DegenerateDataWarning, FitError)
606
+ from ._stats_py import *
607
+ from ._variation import variation
608
+ from .distributions import *
609
+ from ._morestats import *
610
+ from ._multicomp import *
611
+ from ._binomtest import binomtest
612
+ from ._binned_statistic import *
613
+ from ._kde import gaussian_kde
614
+ from . import mstats
615
+ from . import qmc
616
+ from ._multivariate import *
617
+ from . import contingency
618
+ from .contingency import chi2_contingency
619
+ from ._censored_data import CensoredData
620
+ from ._resampling import (bootstrap, monte_carlo_test, permutation_test,
621
+ MonteCarloMethod, PermutationMethod, BootstrapMethod)
622
+ from ._entropy import *
623
+ from ._hypotests import *
624
+ from ._rvs_sampling import rvs_ratio_uniforms
625
+ from ._page_trend_test import page_trend_test
626
+ from ._mannwhitneyu import mannwhitneyu
627
+ from ._bws_test import bws_test
628
+ from ._fit import fit, goodness_of_fit
629
+ from ._covariance import Covariance
630
+ from ._sensitivity_analysis import *
631
+ from ._survival import *
632
+
633
+ # Deprecated namespaces, to be removed in v2.0.0
634
+ from . import (
635
+ biasedurn, kde, morestats, mstats_basic, mstats_extras, mvn, stats
636
+ )
637
+
638
+
639
+ __all__ = [s for s in dir() if not s.startswith("_")] # Remove dunders.
640
+
641
+ from scipy._lib._testutils import PytestTester
642
+ test = PytestTester(__name__)
643
+ del PytestTester
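
For orientation only (not part of the committed file): the docstring above catalogues the public `scipy.stats` API. A minimal usage sketch of a few of the listed entry points, assuming SciPy and NumPy are installed:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
data = stats.norm.rvs(loc=2.0, scale=0.5, size=500, random_state=rng)

# Maximum likelihood fit of a continuous distribution (the `fit` method
# described in the "Continuous distributions" section above).
loc, scale = stats.norm.fit(data)

# A one-sample hypothesis test returning a statistic and a p-value.
res = stats.ttest_1samp(data, popmean=2.0)
print(loc, scale, res.statistic, res.pvalue)
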
.venv/Lib/site-packages/scipy/stats/_ansari_swilk_statistics.cp39-win_amd64.dll.a ADDED
Binary file (1.74 kB).
 
.venv/Lib/site-packages/scipy/stats/_ansari_swilk_statistics.cp39-win_amd64.pyd ADDED
Binary file (259 kB).
 
.venv/Lib/site-packages/scipy/stats/_axis_nan_policy.py ADDED
@@ -0,0 +1,642 @@
1
+ # Many scipy.stats functions support `axis` and `nan_policy` parameters.
2
+ # When the two are combined, it can be tricky to get all the behavior just
3
+ # right. This file contains utility functions useful for scipy.stats functions
4
+ # that support `axis` and `nan_policy`, including a decorator that
5
+ # automatically adds `axis` and `nan_policy` arguments to a function.
6
+
7
+ import numpy as np
8
+ from functools import wraps
9
+ from scipy._lib._docscrape import FunctionDoc, Parameter
10
+ from scipy._lib._util import _contains_nan, AxisError, _get_nan
11
+ import inspect
12
+
13
+
14
+ def _broadcast_arrays(arrays, axis=None):
15
+ """
16
+ Broadcast shapes of arrays, ignoring incompatibility of specified axes
17
+ """
18
+ new_shapes = _broadcast_array_shapes(arrays, axis=axis)
19
+ if axis is None:
20
+ new_shapes = [new_shapes]*len(arrays)
21
+ return [np.broadcast_to(array, new_shape)
22
+ for array, new_shape in zip(arrays, new_shapes)]
23
+
24
+
25
+ def _broadcast_array_shapes(arrays, axis=None):
26
+ """
27
+ Broadcast shapes of arrays, ignoring incompatibility of specified axes
28
+ """
29
+ shapes = [np.asarray(arr).shape for arr in arrays]
30
+ return _broadcast_shapes(shapes, axis)
31
+
32
+
33
+ def _broadcast_shapes(shapes, axis=None):
34
+ """
35
+ Broadcast shapes, ignoring incompatibility of specified axes
36
+ """
37
+ if not shapes:
38
+ return shapes
39
+
40
+ # input validation
41
+ if axis is not None:
42
+ axis = np.atleast_1d(axis)
43
+ axis_int = axis.astype(int)
44
+ if not np.array_equal(axis_int, axis):
45
+ raise AxisError('`axis` must be an integer, a '
46
+ 'tuple of integers, or `None`.')
47
+ axis = axis_int
48
+
49
+ # First, ensure all shapes have same number of dimensions by prepending 1s.
50
+ n_dims = max([len(shape) for shape in shapes])
51
+ new_shapes = np.ones((len(shapes), n_dims), dtype=int)
52
+ for row, shape in zip(new_shapes, shapes):
53
+ row[len(row)-len(shape):] = shape # can't use negative indices (-0:)
54
+
55
+ # Remove the shape elements of the axes to be ignored, but remember them.
56
+ if axis is not None:
57
+ axis[axis < 0] = n_dims + axis[axis < 0]
58
+ axis = np.sort(axis)
59
+ if axis[-1] >= n_dims or axis[0] < 0:
60
+ message = (f"`axis` is out of bounds "
61
+ f"for array of dimension {n_dims}")
62
+ raise AxisError(message)
63
+
64
+ if len(np.unique(axis)) != len(axis):
65
+ raise AxisError("`axis` must contain only distinct elements")
66
+
67
+ removed_shapes = new_shapes[:, axis]
68
+ new_shapes = np.delete(new_shapes, axis, axis=1)
69
+
70
+ # If arrays are broadcastable, shape elements that are 1 may be replaced
71
+ # with a corresponding non-1 shape element. Assuming arrays are
72
+ # broadcastable, that final shape element can be found with:
73
+ new_shape = np.max(new_shapes, axis=0)
74
+ # except in case of an empty array:
75
+ new_shape *= new_shapes.all(axis=0)
76
+
77
+ # Among all arrays, there can only be one unique non-1 shape element.
78
+ # Therefore, if any non-1 shape element does not match what we found
79
+ # above, the arrays must not be broadcastable after all.
80
+ if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
81
+ raise ValueError("Array shapes are incompatible for broadcasting.")
82
+
83
+ if axis is not None:
84
+ # Add back the shape elements that were ignored
85
+ new_axis = axis - np.arange(len(axis))
86
+ new_shapes = [tuple(np.insert(new_shape, new_axis, removed_shape))
87
+ for removed_shape in removed_shapes]
88
+ return new_shapes
89
+ else:
90
+ return tuple(new_shape)
91
+
92
+
93
+ def _broadcast_array_shapes_remove_axis(arrays, axis=None):
94
+ """
95
+ Broadcast shapes of arrays, dropping specified axes
96
+
97
+ Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
98
+ the shape of the broadcast result after consuming/dropping `axis`.
99
+ In other words, return output shape of a typical hypothesis test on
100
+ `arrays` vectorized along `axis`.
101
+
102
+ Examples
103
+ --------
104
+ >>> import numpy as np
105
+ >>> from scipy.stats._axis_nan_policy import _broadcast_array_shapes_remove_axis
106
+ >>> a = np.zeros((5, 2, 1))
107
+ >>> b = np.zeros((9, 3))
108
+ >>> _broadcast_array_shapes_remove_axis((a, b), 1)
109
+ (5, 3)
110
+ """
111
+ # Note that here, `axis=None` means do not consume/drop any axes - _not_
112
+ # ravel arrays before broadcasting.
113
+ shapes = [arr.shape for arr in arrays]
114
+ return _broadcast_shapes_remove_axis(shapes, axis)
115
+
116
+
117
+ def _broadcast_shapes_remove_axis(shapes, axis=None):
118
+ """
119
+ Broadcast shapes, dropping specified axes
120
+
121
+ Same as _broadcast_array_shapes, but given a sequence
122
+ of array shapes `shapes` instead of the arrays themselves.
123
+ """
124
+ shapes = _broadcast_shapes(shapes, axis)
125
+ shape = shapes[0]
126
+ if axis is not None:
127
+ shape = np.delete(shape, axis)
128
+ return tuple(shape)
129
+
130
+
131
+ def _broadcast_concatenate(arrays, axis, paired=False):
132
+ """Concatenate arrays along an axis with broadcasting."""
133
+ arrays = _broadcast_arrays(arrays, axis if not paired else None)
134
+ res = np.concatenate(arrays, axis=axis)
135
+ return res
136
+
137
+
138
+ # TODO: add support for `axis` tuples
139
+ def _remove_nans(samples, paired):
140
+ "Remove nans from paired or unpaired 1D samples"
141
+ # potential optimization: don't copy arrays that don't contain nans
142
+ if not paired:
143
+ return [sample[~np.isnan(sample)] for sample in samples]
144
+
145
+ # for paired samples, we need to remove the whole pair when any part
146
+ # has a nan
147
+ nans = np.isnan(samples[0])
148
+ for sample in samples[1:]:
149
+ nans = nans | np.isnan(sample)
150
+ not_nans = ~nans
151
+ return [sample[not_nans] for sample in samples]
152
+
153
+
154
+ def _remove_sentinel(samples, paired, sentinel):
155
+ "Remove sentinel values from paired or unpaired 1D samples"
156
+ # could consolidate with `_remove_nans`, but it's not quite as simple as
157
+ # passing `sentinel=np.nan` because `(np.nan == np.nan) is False`
158
+
159
+ # potential optimization: don't copy arrays that don't contain sentinel
160
+ if not paired:
161
+ return [sample[sample != sentinel] for sample in samples]
162
+
163
+ # for paired samples, we need to remove the whole pair when any part
164
+ # has a nan
165
+ sentinels = (samples[0] == sentinel)
166
+ for sample in samples[1:]:
167
+ sentinels = sentinels | (sample == sentinel)
168
+ not_sentinels = ~sentinels
169
+ return [sample[not_sentinels] for sample in samples]
170
+
171
+
172
+ def _masked_arrays_2_sentinel_arrays(samples):
173
+ # masked arrays in `samples` are converted to regular arrays, and values
174
+ # corresponding with masked elements are replaced with a sentinel value
175
+
176
+ # return without modifying arrays if none have a mask
177
+ has_mask = False
178
+ for sample in samples:
179
+ mask = getattr(sample, 'mask', False)
180
+ has_mask = has_mask or np.any(mask)
181
+ if not has_mask:
182
+ return samples, None # None means there is no sentinel value
183
+
184
+ # Choose a sentinel value. We can't use `np.nan`, because sentinel (masked)
185
+ # values are always omitted, but there are different nan policies.
186
+ dtype = np.result_type(*samples)
187
+ dtype = dtype if np.issubdtype(dtype, np.number) else np.float64
188
+ for i in range(len(samples)):
189
+ # Things get more complicated if the arrays are of different types.
190
+ # We could have different sentinel values for each array, but
191
+ # the purpose of this code is convenience, not efficiency.
192
+ samples[i] = samples[i].astype(dtype, copy=False)
193
+
194
+ inexact = np.issubdtype(dtype, np.inexact)
195
+ info = np.finfo if inexact else np.iinfo
196
+ max_possible, min_possible = info(dtype).max, info(dtype).min
197
+ nextafter = np.nextafter if inexact else (lambda x, _: x - 1)
198
+
199
+ sentinel = max_possible
200
+ # For simplicity, min_possible/np.infs are not candidate sentinel values
201
+ while sentinel > min_possible:
202
+ for sample in samples:
203
+ if np.any(sample == sentinel): # choose a new sentinel value
204
+ sentinel = nextafter(sentinel, -np.inf)
205
+ break
206
+ else: # when sentinel value is OK, break the while loop
207
+ break
208
+ else:
209
+ message = ("This function replaces masked elements with sentinel "
210
+ "values, but the data contains all distinct values of this "
211
+ "data type. Consider promoting the dtype to `np.float64`.")
212
+ raise ValueError(message)
213
+
214
+ # replace masked elements with sentinel value
215
+ out_samples = []
216
+ for sample in samples:
217
+ mask = getattr(sample, 'mask', None)
218
+ if mask is not None: # turn all masked arrays into sentinel arrays
219
+ mask = np.broadcast_to(mask, sample.shape)
220
+ sample = sample.data.copy() if np.any(mask) else sample.data
221
+ sample = np.asarray(sample) # `sample.data` could be a memoryview?
222
+ sample[mask] = sentinel
223
+ out_samples.append(sample)
224
+
225
+ return out_samples, sentinel
226
+
227
+
228
+ def _check_empty_inputs(samples, axis):
229
+ """
230
+ Check for empty sample; return appropriate output for a vectorized hypotest
231
+ """
232
+ # if none of the samples are empty, we need to perform the test
233
+ if not any(sample.size == 0 for sample in samples):
234
+ return None
235
+ # otherwise, the statistic and p-value will be either empty arrays or
236
+ # arrays with NaNs. Produce the appropriate array and return it.
237
+ output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
238
+ output = np.ones(output_shape) * _get_nan(*samples)
239
+ return output
240
+
241
+
242
+ def _add_reduced_axes(res, reduced_axes, keepdims):
243
+ """
244
+ Add reduced axes back to all the arrays in the result object
245
+ if keepdims = True.
246
+ """
247
+ return ([np.expand_dims(output, reduced_axes) for output in res]
248
+ if keepdims else res)
249
+
250
+
251
+ # Standard docstring / signature entries for `axis`, `nan_policy`, `keepdims`
252
+ _name = 'axis'
253
+ _desc = (
254
+ """If an int, the axis of the input along which to compute the statistic.
255
+ The statistic of each axis-slice (e.g. row) of the input will appear in a
256
+ corresponding element of the output.
257
+ If ``None``, the input will be raveled before computing the statistic."""
258
+ .split('\n'))
259
+
260
+
261
+ def _get_axis_params(default_axis=0, _name=_name, _desc=_desc): # bind NOW
262
+ _type = f"int or None, default: {default_axis}"
263
+ _axis_parameter_doc = Parameter(_name, _type, _desc)
264
+ _axis_parameter = inspect.Parameter(_name,
265
+ inspect.Parameter.KEYWORD_ONLY,
266
+ default=default_axis)
267
+ return _axis_parameter_doc, _axis_parameter
268
+
269
+
270
+ _name = 'nan_policy'
271
+ _type = "{'propagate', 'omit', 'raise'}"
272
+ _desc = (
273
+ """Defines how to handle input NaNs.
274
+
275
+ - ``propagate``: if a NaN is present in the axis slice (e.g. row) along
276
+ which the statistic is computed, the corresponding entry of the output
277
+ will be NaN.
278
+ - ``omit``: NaNs will be omitted when performing the calculation.
279
+ If insufficient data remains in the axis slice along which the
280
+ statistic is computed, the corresponding entry of the output will be
281
+ NaN.
282
+ - ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
283
+ .split('\n'))
284
+ _nan_policy_parameter_doc = Parameter(_name, _type, _desc)
285
+ _nan_policy_parameter = inspect.Parameter(_name,
286
+ inspect.Parameter.KEYWORD_ONLY,
287
+ default='propagate')
288
+
289
+ _name = 'keepdims'
290
+ _type = "bool, default: False"
291
+ _desc = (
292
+ """If this is set to True, the axes which are reduced are left
293
+ in the result as dimensions with size one. With this option,
294
+ the result will broadcast correctly against the input array."""
295
+ .split('\n'))
296
+ _keepdims_parameter_doc = Parameter(_name, _type, _desc)
297
+ _keepdims_parameter = inspect.Parameter(_name,
298
+ inspect.Parameter.KEYWORD_ONLY,
299
+ default=False)
300
+
301
+ _standard_note_addition = (
302
+ """\nBeginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
303
+ code) are converted to ``np.ndarray`` before the calculation is performed. In
304
+ this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
305
+ rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
306
+ arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
307
+ masked array with ``mask=False``.""").split('\n')
308
+
309
+
310
+ def _axis_nan_policy_factory(tuple_to_result, default_axis=0,
311
+ n_samples=1, paired=False,
312
+ result_to_tuple=None, too_small=0,
313
+ n_outputs=2, kwd_samples=[], override=None):
314
+ """Factory for a wrapper that adds axis/nan_policy params to a function.
315
+
316
+ Parameters
317
+ ----------
318
+ tuple_to_result : callable
319
+ Callable that returns an object of the type returned by the function
320
+ being wrapped (e.g. the namedtuple or dataclass returned by a
321
+ statistical test) provided the separate components (e.g. statistic,
322
+ pvalue).
323
+ default_axis : int, default: 0
324
+ The default value of the axis argument. Standard is 0 except when
325
+ backwards compatibility demands otherwise (e.g. `None`).
326
+ n_samples : int or callable, default: 1
327
+ The number of data samples accepted by the function
328
+ (e.g. `mannwhitneyu`), a callable that accepts a dictionary of
329
+ parameters passed into the function and returns the number of data
330
+ samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
331
+ of samples (e.g. `kruskal`).
332
+ paired : {False, True}
333
+ Whether the function being wrapped treats the samples as paired (i.e.
334
+ corresponding elements of each sample should be considered as different
335
+ components of the same sample.)
336
+ result_to_tuple : callable, optional
337
+ Function that unpacks the results of the function being wrapped into
338
+ a tuple. This is essentially the inverse of `tuple_to_result`. Default
339
+ is `None`, which is appropriate for statistical tests that return a
340
+ statistic, pvalue tuple (rather than, e.g., a non-iterable dataclass).
341
+ too_small : int or callable, default: 0
342
+ The largest unacceptably small sample for the function being wrapped.
343
+ For example, some functions require samples of size two or more or they
344
+ raise an error. This argument prevents the error from being raised when
345
+ input is not 1D and instead places a NaN in the corresponding element
346
+ of the result. If callable, it must accept a list of samples, axis,
347
+ and a dictionary of keyword arguments passed to the wrapper function as
348
+ arguments and return a bool indicating whether the samples passed are
349
+ too small.
350
+ n_outputs : int or callable, default: 2
351
+ The number of outputs produced by the function given 1d sample(s). For
352
+ example, hypothesis tests that return a namedtuple or result object
353
+ with attributes ``statistic`` and ``pvalue`` use the default
354
+ ``n_outputs=2``; summary statistics with scalar output use
355
+ ``n_outputs=1``. Alternatively, may be a callable that accepts a
356
+ dictionary of arguments passed into the wrapped function and returns
357
+ the number of outputs corresponding with those arguments.
358
+ kwd_samples : sequence, default: []
359
+ The names of keyword parameters that should be treated as samples. For
360
+ example, `gmean` accepts as its first argument a sample `a` but
361
+ also `weights` as a fourth, optional keyword argument. In this case, we
362
+ use `n_samples=1` and kwd_samples=['weights'].
363
+ override : dict, default: {'vectorization': False, 'nan_propagation': True}
364
+ Pass a dictionary with ``'vectorization': True`` to ensure that the
365
+ decorator overrides the function's behavior for multidimensional input.
366
+ Use ``'nan_propagation': False`` to ensure that the decorator does not
367
+ override the function's behavior for ``nan_policy='propagate'``.
368
+ (See `scipy.stats.mode`, for example.)
369
+ """
370
+ # Specify which existing behaviors the decorator must override
371
+ temp = override or {}
372
+ override = {'vectorization': False,
373
+ 'nan_propagation': True}
374
+ override.update(temp)
375
+
376
+ if result_to_tuple is None:
377
+ def result_to_tuple(res):
378
+ return res
379
+
380
+ if not callable(too_small):
381
+ def is_too_small(samples, *ts_args, axis=-1, **ts_kwargs):
382
+ for sample in samples:
383
+ if sample.shape[axis] <= too_small:
384
+ return True
385
+ return False
386
+ else:
387
+ is_too_small = too_small
388
+
389
+ def axis_nan_policy_decorator(hypotest_fun_in):
390
+ @wraps(hypotest_fun_in)
391
+ def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):
392
+
393
+ if _no_deco: # for testing, decorator does nothing
394
+ return hypotest_fun_in(*args, **kwds)
395
+
396
+ # We need to be flexible about whether position or keyword
397
+ # arguments are used, but we need to make sure users don't pass
398
+ # both for the same parameter. To complicate matters, some
399
+ # functions accept samples with *args, and some functions already
400
+ # accept `axis` and `nan_policy` as positional arguments.
401
+ # The strategy is to make sure that there is no duplication
402
+ # between `args` and `kwds`, combine the two into `kwds`, then extract
403
+ # the samples, `nan_policy`, and `axis` from `kwds`, as they are
404
+ # dealt with separately.
405
+
406
+ # Check for intersection between positional and keyword args
407
+ params = list(inspect.signature(hypotest_fun_in).parameters)
408
+ if n_samples is None:
409
+ # Give unique names to each positional sample argument
410
+ # Note that *args can't be provided as a keyword argument
411
+ params = [f"arg{i}" for i in range(len(args))] + params[1:]
412
+
413
+ # raise if there are too many positional args
414
+ maxarg = (np.inf if inspect.getfullargspec(hypotest_fun_in).varargs
415
+ else len(inspect.getfullargspec(hypotest_fun_in).args))
416
+ if len(args) > maxarg: # let the function raise the right error
417
+ hypotest_fun_in(*args, **kwds)
418
+
419
+ # raise if multiple values passed for same parameter
420
+ d_args = dict(zip(params, args))
421
+ intersection = set(d_args) & set(kwds)
422
+ if intersection: # let the function raise the right error
423
+ hypotest_fun_in(*args, **kwds)
424
+
425
+ # Consolidate other positional and keyword args into `kwds`
426
+ kwds.update(d_args)
427
+
428
+ # rename avoids UnboundLocalError
429
+ if callable(n_samples):
430
+ # Future refactoring idea: no need for callable n_samples.
431
+ # Just replace `n_samples` and `kwd_samples` with a single
432
+ # list of the names of all samples, and treat all of them
433
+ # as `kwd_samples` are treated below.
434
+ n_samp = n_samples(kwds)
435
+ else:
436
+ n_samp = n_samples or len(args)
437
+
438
+ # get the number of outputs
439
+ n_out = n_outputs # rename to avoid UnboundLocalError
440
+ if callable(n_out):
441
+ n_out = n_out(kwds)
442
+
443
+ # If necessary, rearrange function signature: accept other samples
444
+ # as positional args right after the first n_samp args
445
+ kwd_samp = [name for name in kwd_samples
446
+ if kwds.get(name, None) is not None]
447
+ n_kwd_samp = len(kwd_samp)
448
+ if not kwd_samp:
449
+ hypotest_fun_out = hypotest_fun_in
450
+ else:
451
+ def hypotest_fun_out(*samples, **kwds):
452
+ new_kwds = dict(zip(kwd_samp, samples[n_samp:]))
453
+ kwds.update(new_kwds)
454
+ return hypotest_fun_in(*samples[:n_samp], **kwds)
455
+
456
+ # Extract the things we need here
457
+ try: # if something is missing
458
+ samples = [np.atleast_1d(kwds.pop(param))
459
+ for param in (params[:n_samp] + kwd_samp)]
460
+ except KeyError: # let the function raise the right error
461
+ # might need to revisit this if required arg is not a "sample"
462
+ hypotest_fun_in(*args, **kwds)
463
+ vectorized = True if 'axis' in params else False
464
+ vectorized = vectorized and not override['vectorization']
465
+ axis = kwds.pop('axis', default_axis)
466
+ nan_policy = kwds.pop('nan_policy', 'propagate')
467
+ keepdims = kwds.pop("keepdims", False)
468
+ del args # avoid the possibility of passing both `args` and `kwds`
469
+
470
+ # convert masked arrays to regular arrays with sentinel values
471
+ samples, sentinel = _masked_arrays_2_sentinel_arrays(samples)
472
+
473
+ # standardize to always work along last axis
474
+ reduced_axes = axis
475
+ if axis is None:
476
+ if samples:
477
+ # when axis=None, take the maximum of all dimensions since
478
+ # all the dimensions are reduced.
479
+ n_dims = np.max([sample.ndim for sample in samples])
480
+ reduced_axes = tuple(range(n_dims))
481
+ samples = [np.asarray(sample.ravel()) for sample in samples]
482
+ else:
483
+ samples = _broadcast_arrays(samples, axis=axis)
484
+ axis = np.atleast_1d(axis)
485
+ n_axes = len(axis)
486
+ # move all axes in `axis` to the end to be raveled
487
+ samples = [np.moveaxis(sample, axis, range(-len(axis), 0))
488
+ for sample in samples]
489
+ shapes = [sample.shape for sample in samples]
490
+ # New shape is unchanged for all axes _not_ in `axis`
491
+ # At the end, we append the product of the shapes of the axes
492
+ # in `axis`. Appending -1 doesn't work for zero-size arrays!
493
+ new_shapes = [shape[:-n_axes] + (np.prod(shape[-n_axes:]),)
494
+ for shape in shapes]
495
+ samples = [sample.reshape(new_shape)
496
+ for sample, new_shape in zip(samples, new_shapes)]
497
+ axis = -1 # work over the last axis
498
+ NaN = _get_nan(*samples)
499
+
500
+ # if axis is not needed, just handle nan_policy and return
501
+ ndims = np.array([sample.ndim for sample in samples])
502
+ if np.all(ndims <= 1):
503
+ # Addresses nan_policy == "raise"
504
+ if nan_policy != 'propagate' or override['nan_propagation']:
505
+ contains_nan = [_contains_nan(sample, nan_policy)[0]
506
+ for sample in samples]
507
+ else:
508
+ # Behave as though there are no NaNs (even if there are)
509
+ contains_nan = [False]*len(samples)
510
+
511
+ # Addresses nan_policy == "propagate"
512
+ if any(contains_nan) and (nan_policy == 'propagate'
513
+ and override['nan_propagation']):
514
+ res = np.full(n_out, NaN)
515
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
516
+ return tuple_to_result(*res)
517
+
518
+ # Addresses nan_policy == "omit"
519
+ if any(contains_nan) and nan_policy == 'omit':
520
+ # consider passing in contains_nan
521
+ samples = _remove_nans(samples, paired)
522
+
523
+ # ideally, this is what the behavior would be:
524
+ # if is_too_small(samples):
525
+ # return tuple_to_result(NaN, NaN)
526
+ # but some existing functions raise exceptions, and changing
527
+ # behavior of those would break backward compatibility.
528
+
529
+ if sentinel:
530
+ samples = _remove_sentinel(samples, paired, sentinel)
531
+ res = hypotest_fun_out(*samples, **kwds)
532
+ res = result_to_tuple(res)
533
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
534
+ return tuple_to_result(*res)
535
+
536
+ # check for empty input
537
+ # ideally, move this to the top, but some existing functions raise
538
+ # exceptions for empty input, so overriding it would break
539
+ # backward compatibility.
540
+ empty_output = _check_empty_inputs(samples, axis)
541
+ # only return empty output if zero sized input is too small.
542
+ if (
543
+ empty_output is not None
544
+ and (is_too_small(samples, kwds) or empty_output.size == 0)
545
+ ):
546
+ res = [empty_output.copy() for i in range(n_out)]
547
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
548
+ return tuple_to_result(*res)
549
+
550
+ # otherwise, concatenate all samples along axis, remembering where
551
+ # each separate sample begins
552
+ lengths = np.array([sample.shape[axis] for sample in samples])
553
+ split_indices = np.cumsum(lengths)
554
+ x = _broadcast_concatenate(samples, axis)
555
+
556
+ # Addresses nan_policy == "raise"
557
+ if nan_policy != 'propagate' or override['nan_propagation']:
558
+ contains_nan, _ = _contains_nan(x, nan_policy)
559
+ else:
560
+ contains_nan = False # behave like there are no NaNs
561
+
562
+ if vectorized and not contains_nan and not sentinel:
563
+ res = hypotest_fun_out(*samples, axis=axis, **kwds)
564
+ res = result_to_tuple(res)
565
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
566
+ return tuple_to_result(*res)
567
+
568
+ # Addresses nan_policy == "omit"
569
+ if contains_nan and nan_policy == 'omit':
570
+ def hypotest_fun(x):
571
+ samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
572
+ samples = _remove_nans(samples, paired)
573
+ if sentinel:
574
+ samples = _remove_sentinel(samples, paired, sentinel)
575
+ if is_too_small(samples, kwds):
576
+ return np.full(n_out, NaN)
577
+ return result_to_tuple(hypotest_fun_out(*samples, **kwds))
578
+
579
+ # Addresses nan_policy == "propagate"
580
+ elif (contains_nan and nan_policy == 'propagate'
581
+ and override['nan_propagation']):
582
+ def hypotest_fun(x):
583
+ if np.isnan(x).any():
584
+ return np.full(n_out, NaN)
585
+
586
+ samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
587
+ if sentinel:
588
+ samples = _remove_sentinel(samples, paired, sentinel)
589
+ if is_too_small(samples, kwds):
590
+ return np.full(n_out, NaN)
591
+ return result_to_tuple(hypotest_fun_out(*samples, **kwds))
592
+
593
+ else:
594
+ def hypotest_fun(x):
595
+ samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
596
+ if sentinel:
597
+ samples = _remove_sentinel(samples, paired, sentinel)
598
+ if is_too_small(samples, kwds):
599
+ return np.full(n_out, NaN)
600
+ return result_to_tuple(hypotest_fun_out(*samples, **kwds))
601
+
602
+ x = np.moveaxis(x, axis, 0)
603
+ res = np.apply_along_axis(hypotest_fun, axis=0, arr=x)
604
+ res = _add_reduced_axes(res, reduced_axes, keepdims)
605
+ return tuple_to_result(*res)
606
+
607
+ _axis_parameter_doc, _axis_parameter = _get_axis_params(default_axis)
608
+ doc = FunctionDoc(axis_nan_policy_wrapper)
609
+ parameter_names = [param.name for param in doc['Parameters']]
610
+ if 'axis' in parameter_names:
611
+ doc['Parameters'][parameter_names.index('axis')] = (
612
+ _axis_parameter_doc)
613
+ else:
614
+ doc['Parameters'].append(_axis_parameter_doc)
615
+ if 'nan_policy' in parameter_names:
616
+ doc['Parameters'][parameter_names.index('nan_policy')] = (
617
+ _nan_policy_parameter_doc)
618
+ else:
619
+ doc['Parameters'].append(_nan_policy_parameter_doc)
620
+ if 'keepdims' in parameter_names:
621
+ doc['Parameters'][parameter_names.index('keepdims')] = (
622
+ _keepdims_parameter_doc)
623
+ else:
624
+ doc['Parameters'].append(_keepdims_parameter_doc)
625
+ doc['Notes'] += _standard_note_addition
626
+ doc = str(doc).split("\n", 1)[1] # remove signature
627
+ axis_nan_policy_wrapper.__doc__ = str(doc)
628
+
629
+ sig = inspect.signature(axis_nan_policy_wrapper)
630
+ parameters = sig.parameters
631
+ parameter_list = list(parameters.values())
632
+ if 'axis' not in parameters:
633
+ parameter_list.append(_axis_parameter)
634
+ if 'nan_policy' not in parameters:
635
+ parameter_list.append(_nan_policy_parameter)
636
+ if 'keepdims' not in parameters:
637
+ parameter_list.append(_keepdims_parameter)
638
+ sig = sig.replace(parameters=parameter_list)
639
+ axis_nan_policy_wrapper.__signature__ = sig
640
+
641
+ return axis_nan_policy_wrapper
642
+ return axis_nan_policy_decorator
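
Illustrative sketch (not part of the committed file): how the private `_axis_nan_policy_factory` defined above is typically applied so that a plain 1-D statistic gains `axis`, `nan_policy`, and `keepdims` handling. This is internal SciPy machinery, so treat the example as an assumption about usage rather than a supported API:

import numpy as np
from scipy.stats._axis_nan_policy import _axis_nan_policy_factory

@_axis_nan_policy_factory(
    lambda x: x,                     # tuple_to_result: rebuild the output from its components
    result_to_tuple=lambda x: (x,),  # unpack the scalar result into a tuple
    n_outputs=1,                     # a summary statistic returns a single value
    n_samples=1,
)
def peak_to_peak(sample):
    """Range of a 1-D sample; the decorator adds axis/nan_policy/keepdims."""
    return np.max(sample) - np.min(sample)

x = np.array([[1.0, 2.0, np.nan],
              [4.0, 5.0, 6.0]])
print(peak_to_peak(x, axis=1, nan_policy='omit'))  # NaNs dropped per row-slice
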
.venv/Lib/site-packages/scipy/stats/_biasedurn.cp39-win_amd64.dll.a ADDED
Binary file (1.57 kB).
 
.venv/Lib/site-packages/scipy/stats/_biasedurn.cp39-win_amd64.pyd ADDED
Binary file (399 kB).
 
.venv/Lib/site-packages/scipy/stats/_biasedurn.pxd ADDED
@@ -0,0 +1,27 @@
1
+ # Declare the class with cdef
2
+ cdef extern from "biasedurn/stocc.h" nogil:
3
+ cdef cppclass CFishersNCHypergeometric:
4
+ CFishersNCHypergeometric(int, int, int, double, double) except +
5
+ int mode()
6
+ double mean()
7
+ double variance()
8
+ double probability(int x)
9
+ double moments(double * mean, double * var)
10
+
11
+ cdef cppclass CWalleniusNCHypergeometric:
12
+ CWalleniusNCHypergeometric() except +
13
+ CWalleniusNCHypergeometric(int, int, int, double, double) except +
14
+ int mode()
15
+ double mean()
16
+ double variance()
17
+ double probability(int x)
18
+ double moments(double * mean, double * var)
19
+
20
+ cdef cppclass StochasticLib3:
21
+ StochasticLib3(int seed) except +
22
+ double Random() except +
23
+ void SetAccuracy(double accur)
24
+ int FishersNCHyp (int n, int m, int N, double odds) except +
25
+ int WalleniusNCHyp (int n, int m, int N, double odds) except +
26
+ double(*next_double)()
27
+ double(*next_normal)(const double m, const double s)
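
For context (not part of the committed file): the C++ classes declared above back Fisher's and Wallenius' noncentral hypergeometric distributions; the public Python-level wrappers are `scipy.stats.nchypergeom_fisher` and `scipy.stats.nchypergeom_wallenius`. A quick sanity-check sketch:

from scipy.stats import nchypergeom_fisher, nchypergeom_wallenius

# Shape parameters: M = population size, n = number of "success" items,
# N = number of draws, odds = odds ratio favouring the success items.
M, n, N, odds = 20, 7, 12, 2.5
print(nchypergeom_fisher.pmf(5, M, n, N, odds))
print(nchypergeom_wallenius.mean(M, n, N, odds))
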
.venv/Lib/site-packages/scipy/stats/_binned_statistic.py ADDED
@@ -0,0 +1,795 @@
1
+ import builtins
2
+ from warnings import catch_warnings, simplefilter
3
+ import numpy as np
4
+ from operator import index
5
+ from collections import namedtuple
6
+
7
+ __all__ = ['binned_statistic',
8
+ 'binned_statistic_2d',
9
+ 'binned_statistic_dd']
10
+
11
+
12
+ BinnedStatisticResult = namedtuple('BinnedStatisticResult',
13
+ ('statistic', 'bin_edges', 'binnumber'))
14
+
15
+
16
+ def binned_statistic(x, values, statistic='mean',
17
+ bins=10, range=None):
18
+ """
19
+ Compute a binned statistic for one or more sets of data.
20
+
21
+ This is a generalization of a histogram function. A histogram divides
22
+ the space into bins, and returns the count of the number of points in
23
+ each bin. This function allows the computation of the sum, mean, median,
24
+ or other statistic of the values (or set of values) within each bin.
25
+
26
+ Parameters
27
+ ----------
28
+ x : (N,) array_like
29
+ A sequence of values to be binned.
30
+ values : (N,) array_like or list of (N,) array_like
31
+ The data on which the statistic will be computed. This must be
32
+ the same shape as `x`, or a set of sequences - each the same shape as
33
+ `x`. If `values` is a set of sequences, the statistic will be computed
34
+ on each independently.
35
+ statistic : string or callable, optional
36
+ The statistic to compute (default is 'mean').
37
+ The following statistics are available:
38
+
39
+ * 'mean' : compute the mean of values for points within each bin.
40
+ Empty bins will be represented by NaN.
41
+ * 'std' : compute the standard deviation within each bin. This
42
+ is implicitly calculated with ddof=0.
43
+ * 'median' : compute the median of values for points within each
44
+ bin. Empty bins will be represented by NaN.
45
+ * 'count' : compute the count of points within each bin. This is
46
+ identical to an unweighted histogram. `values` array is not
47
+ referenced.
48
+ * 'sum' : compute the sum of values for points within each bin.
49
+ This is identical to a weighted histogram.
50
+ * 'min' : compute the minimum of values for points within each bin.
51
+ Empty bins will be represented by NaN.
52
+ * 'max' : compute the maximum of values for points within each bin.
53
+ Empty bins will be represented by NaN.
54
+ * function : a user-defined function which takes a 1D array of
55
+ values, and outputs a single numerical statistic. This function
56
+ will be called on the values in each bin. Empty bins will be
57
+ represented by function([]), or NaN if this returns an error.
58
+
59
+ bins : int or sequence of scalars, optional
60
+ If `bins` is an int, it defines the number of equal-width bins in the
61
+ given range (10 by default). If `bins` is a sequence, it defines the
62
+ bin edges, including the rightmost edge, allowing for non-uniform bin
63
+ widths. Values in `x` that are smaller than lowest bin edge are
64
+ assigned to bin number 0, values beyond the highest bin are assigned to
65
+ ``bins[-1]``. If the bin edges are specified, the number of bins will
66
+ be (nx = len(bins)-1).
67
+ range : (float, float) or [(float, float)], optional
68
+ The lower and upper range of the bins. If not provided, range
69
+ is simply ``(x.min(), x.max())``. Values outside the range are
70
+ ignored.
71
+
72
+ Returns
73
+ -------
74
+ statistic : array
75
+ The values of the selected statistic in each bin.
76
+ bin_edges : array of dtype float
77
+ Return the bin edges ``(length(statistic)+1)``.
78
+ binnumber: 1-D ndarray of ints
79
+ Indices of the bins (corresponding to `bin_edges`) in which each value
80
+ of `x` belongs. Same length as `values`. A binnumber of `i` means the
81
+ corresponding value is between (bin_edges[i-1], bin_edges[i]).
82
+
83
+ See Also
84
+ --------
85
+ numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
86
+
87
+ Notes
88
+ -----
89
+ All but the last (righthand-most) bin is half-open. In other words, if
90
+ `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
91
+ but excluding 2) and the second ``[2, 3)``. The last bin, however, is
92
+ ``[3, 4]``, which *includes* 4.
93
+
94
+ .. versionadded:: 0.11.0
95
+
96
+ Examples
97
+ --------
98
+ >>> import numpy as np
99
+ >>> from scipy import stats
100
+ >>> import matplotlib.pyplot as plt
101
+
102
+ First some basic examples:
103
+
104
+ Create two evenly spaced bins in the range of the given sample, and sum the
105
+ corresponding values in each of those bins:
106
+
107
+ >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
108
+ >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
109
+ BinnedStatisticResult(statistic=array([4. , 4.5]),
110
+ bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
111
+
112
+ Multiple arrays of values can also be passed. The statistic is calculated
113
+ on each set independently:
114
+
115
+ >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
116
+ >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
117
+ BinnedStatisticResult(statistic=array([[4. , 4.5],
118
+ [8. , 9. ]]), bin_edges=array([1., 4., 7.]),
119
+ binnumber=array([1, 1, 1, 2, 2]))
120
+
121
+ >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
122
+ ... bins=3)
123
+ BinnedStatisticResult(statistic=array([1., 2., 4.]),
124
+ bin_edges=array([1., 2., 3., 4.]),
125
+ binnumber=array([1, 2, 1, 2, 3]))
126
+
127
+ As a second example, we now generate some random data of sailing boat speed
128
+ as a function of wind speed, and then determine how fast our boat is for
129
+ certain wind speeds:
130
+
131
+ >>> rng = np.random.default_rng()
132
+ >>> windspeed = 8 * rng.random(500)
133
+ >>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
134
+ >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
135
+ ... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
136
+ >>> plt.figure()
137
+ >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
138
+ >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
139
+ ... label='binned statistic of data')
140
+ >>> plt.legend()
141
+
142
+ Now we can use ``binnumber`` to select all datapoints with a windspeed
143
+ below 1:
144
+
145
+ >>> low_boatspeed = boatspeed[binnumber == 0]
146
+
147
+ As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
148
+ plot of a distribution that shows the mean and distribution around that
149
+ mean per bin, on top of a regular histogram and the probability
150
+ distribution function:
151
+
152
+ >>> x = np.linspace(0, 5, num=500)
153
+ >>> x_pdf = stats.maxwell.pdf(x)
154
+ >>> samples = stats.maxwell.rvs(size=10000)
155
+
156
+ >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
157
+ ... statistic='mean', bins=25)
158
+ >>> bin_width = (bin_edges[1] - bin_edges[0])
159
+ >>> bin_centers = bin_edges[1:] - bin_width/2
160
+
161
+ >>> plt.figure()
162
+ >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
163
+ ... alpha=0.2, label='histogram of data')
164
+ >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
165
+ >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
166
+ ... label='binned statistic of data')
167
+ >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
168
+ >>> plt.legend(fontsize=10)
169
+ >>> plt.show()
170
+
171
+ """
172
+ try:
173
+ N = len(bins)
174
+ except TypeError:
175
+ N = 1
176
+
177
+ if N != 1:
178
+ bins = [np.asarray(bins, float)]
179
+
180
+ if range is not None:
181
+ if len(range) == 2:
182
+ range = [range]
183
+
184
+ medians, edges, binnumbers = binned_statistic_dd(
185
+ [x], values, statistic, bins, range)
186
+
187
+ return BinnedStatisticResult(medians, edges[0], binnumbers)
188
+
189
+
190
+ BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
191
+ ('statistic', 'x_edge', 'y_edge',
192
+ 'binnumber'))
193
+
194
+
195
+ def binned_statistic_2d(x, y, values, statistic='mean',
196
+ bins=10, range=None, expand_binnumbers=False):
197
+ """
198
+ Compute a bidimensional binned statistic for one or more sets of data.
199
+
200
+ This is a generalization of a histogram2d function. A histogram divides
201
+ the space into bins, and returns the count of the number of points in
202
+ each bin. This function allows the computation of the sum, mean, median,
203
+ or other statistic of the values (or set of values) within each bin.
204
+
205
+ Parameters
206
+ ----------
207
+ x : (N,) array_like
208
+ A sequence of values to be binned along the first dimension.
209
+ y : (N,) array_like
210
+ A sequence of values to be binned along the second dimension.
211
+ values : (N,) array_like or list of (N,) array_like
212
+ The data on which the statistic will be computed. This must be
213
+ the same shape as `x`, or a list of sequences - each with the same
214
+ shape as `x`. If `values` is such a list, the statistic will be
215
+ computed on each independently.
216
+ statistic : string or callable, optional
217
+ The statistic to compute (default is 'mean').
218
+ The following statistics are available:
219
+
220
+ * 'mean' : compute the mean of values for points within each bin.
221
+ Empty bins will be represented by NaN.
222
+ * 'std' : compute the standard deviation within each bin. This
223
+ is implicitly calculated with ddof=0.
224
+ * 'median' : compute the median of values for points within each
225
+ bin. Empty bins will be represented by NaN.
226
+ * 'count' : compute the count of points within each bin. This is
227
+ identical to an unweighted histogram. `values` array is not
228
+ referenced.
229
+ * 'sum' : compute the sum of values for points within each bin.
230
+ This is identical to a weighted histogram.
231
+ * 'min' : compute the minimum of values for points within each bin.
232
+ Empty bins will be represented by NaN.
233
+ * 'max' : compute the maximum of values for points within each bin.
234
+ Empty bins will be represented by NaN.
235
+ * function : a user-defined function which takes a 1D array of
236
+ values, and outputs a single numerical statistic. This function
237
+ will be called on the values in each bin. Empty bins will be
238
+ represented by function([]), or NaN if this returns an error.
239
+
240
+ bins : int or [int, int] or array_like or [array, array], optional
241
+ The bin specification:
242
+
243
+ * the number of bins for the two dimensions (nx = ny = bins),
244
+ * the number of bins in each dimension (nx, ny = bins),
245
+ * the bin edges for the two dimensions (x_edge = y_edge = bins),
246
+ * the bin edges in each dimension (x_edge, y_edge = bins).
247
+
248
+ If the bin edges are specified, the number of bins will be
249
+ (nx = len(x_edge)-1, ny = len(y_edge)-1).
250
+
251
+ range : (2,2) array_like, optional
252
+ The leftmost and rightmost edges of the bins along each dimension
253
+ (if not specified explicitly in the `bins` parameters):
254
+ [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
255
+ considered outliers and not tallied in the histogram.
256
+ expand_binnumbers : bool, optional
257
+ 'False' (default): the returned `binnumber` is a shape (N,) array of
258
+ linearized bin indices.
259
+ 'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
260
+ ndarray, where each row gives the bin numbers in the corresponding
261
+ dimension.
262
+ See the `binnumber` returned value, and the `Examples` section.
263
+
264
+ .. versionadded:: 0.17.0
265
+
266
+ Returns
267
+ -------
268
+ statistic : (nx, ny) ndarray
269
+ The values of the selected statistic in each two-dimensional bin.
270
+ x_edge : (nx + 1) ndarray
271
+ The bin edges along the first dimension.
272
+ y_edge : (ny + 1) ndarray
273
+ The bin edges along the second dimension.
274
+ binnumber : (N,) array of ints or (2,N) ndarray of ints
275
+ This assigns to each element of `sample` an integer that represents the
276
+ bin in which this observation falls. The representation depends on the
277
+ `expand_binnumbers` argument. See `Notes` for details.
278
+
279
+
280
+ See Also
281
+ --------
282
+ numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
283
+
284
+ Notes
285
+ -----
286
+ Binedges:
287
+ All but the last (righthand-most) bin is half-open. In other words, if
288
+ `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
289
+ but excluding 2) and the second ``[2, 3)``. The last bin, however, is
290
+ ``[3, 4]``, which *includes* 4.
291
+
292
+ `binnumber`:
293
+ This returned argument assigns to each element of `sample` an integer that
294
+ represents the bin in which it belongs. The representation depends on the
295
+ `expand_binnumbers` argument. If 'False' (default): The returned
296
+ `binnumber` is a shape (N,) array of linearized indices mapping each
297
+ element of `sample` to its corresponding bin (using row-major ordering).
298
+ Note that the returned linearized bin indices are used for an array with
299
+ extra bins on the outer binedges to capture values outside of the defined
300
+ bin bounds.
301
+ If 'True': The returned `binnumber` is a shape (2,N) ndarray where
302
+ each row indicates bin placements for each dimension respectively. In each
303
+ dimension, a binnumber of `i` means the corresponding value is between
304
+ (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
305
+
306
+ .. versionadded:: 0.11.0
307
+
308
+ Examples
309
+ --------
310
+ >>> from scipy import stats
311
+
312
+ Calculate the counts with explicit bin-edges:
313
+
314
+ >>> x = [0.1, 0.1, 0.1, 0.6]
315
+ >>> y = [2.1, 2.6, 2.1, 2.1]
316
+ >>> binx = [0.0, 0.5, 1.0]
317
+ >>> biny = [2.0, 2.5, 3.0]
318
+ >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
319
+ >>> ret.statistic
320
+ array([[2., 1.],
321
+ [1., 0.]])
322
+
323
+ The bin in which each sample is placed is given by the `binnumber`
324
+ returned parameter. By default, these are the linearized bin indices:
325
+
326
+ >>> ret.binnumber
327
+ array([5, 6, 5, 9])
328
+
329
+ The bin indices can also be expanded into separate entries for each
330
+ dimension using the `expand_binnumbers` parameter:
331
+
332
+ >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
333
+ ... expand_binnumbers=True)
334
+ >>> ret.binnumber
335
+ array([[1, 1, 1, 2],
336
+ [1, 2, 1, 1]])
337
+
338
+ Which shows that the first three elements belong in the xbin 1, and the
339
+ fourth into xbin 2; and so on for y.
340
+
341
+ """
342
+
343
+ # This code is based on np.histogram2d
344
+ try:
345
+ N = len(bins)
346
+ except TypeError:
347
+ N = 1
348
+
349
+ if N != 1 and N != 2:
350
+ xedges = yedges = np.asarray(bins, float)
351
+ bins = [xedges, yedges]
352
+
353
+ medians, edges, binnumbers = binned_statistic_dd(
354
+ [x, y], values, statistic, bins, range,
355
+ expand_binnumbers=expand_binnumbers)
356
+
357
+ return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
358
+
359
+
360
+ BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
361
+ ('statistic', 'bin_edges',
362
+ 'binnumber'))
363
+
364
+
365
+ def _bincount(x, weights):
366
+ if np.iscomplexobj(weights):
367
+ a = np.bincount(x, np.real(weights))
368
+ b = np.bincount(x, np.imag(weights))
369
+ z = a + b*1j
370
+
371
+ else:
372
+ z = np.bincount(x, weights)
373
+ return z
374
+
375
+
376
+ def binned_statistic_dd(sample, values, statistic='mean',
377
+ bins=10, range=None, expand_binnumbers=False,
378
+ binned_statistic_result=None):
379
+ """
380
+ Compute a multidimensional binned statistic for a set of data.
381
+
382
+ This is a generalization of a histogramdd function. A histogram divides
383
+ the space into bins, and returns the count of the number of points in
384
+ each bin. This function allows the computation of the sum, mean, median,
385
+ or other statistic of the values within each bin.
386
+
387
+ Parameters
388
+ ----------
389
+ sample : array_like
390
+ Data to histogram passed as a sequence of N arrays of length D, or
391
+ as an (N,D) array.
392
+ values : (N,) array_like or list of (N,) array_like
393
+ The data on which the statistic will be computed. This must be
394
+ the same shape as `sample`, or a list of sequences - each with the
395
+ same shape as `sample`. If `values` is such a list, the statistic
396
+ will be computed on each independently.
397
+ statistic : string or callable, optional
398
+ The statistic to compute (default is 'mean').
399
+ The following statistics are available:
400
+
401
+ * 'mean' : compute the mean of values for points within each bin.
402
+ Empty bins will be represented by NaN.
403
+ * 'median' : compute the median of values for points within each
404
+ bin. Empty bins will be represented by NaN.
405
+ * 'count' : compute the count of points within each bin. This is
406
+ identical to an unweighted histogram. `values` array is not
407
+ referenced.
408
+ * 'sum' : compute the sum of values for points within each bin.
409
+ This is identical to a weighted histogram.
410
+ * 'std' : compute the standard deviation within each bin. This
411
+ is implicitly calculated with ddof=0. If the number of values
412
+ within a given bin is 0 or 1, the computed standard deviation value
413
+ will be 0 for the bin.
414
+ * 'min' : compute the minimum of values for points within each bin.
415
+ Empty bins will be represented by NaN.
416
+ * 'max' : compute the maximum of values for points within each bin.
417
+ Empty bins will be represented by NaN.
418
+ * function : a user-defined function which takes a 1D array of
419
+ values, and outputs a single numerical statistic. This function
420
+ will be called on the values in each bin. Empty bins will be
421
+ represented by function([]), or NaN if this returns an error.
422
+
423
+ bins : sequence or positive int, optional
424
+ The bin specification must be in one of the following forms:
425
+
426
+ * A sequence of arrays describing the bin edges along each dimension.
427
+ * The number of bins for each dimension (nx, ny, ... = bins).
428
+ * The number of bins for all dimensions (nx = ny = ... = bins).
429
+ range : sequence, optional
430
+ A sequence of lower and upper bin edges to be used if the edges are
431
+ not given explicitly in `bins`. Defaults to the minimum and maximum
432
+ values along each dimension.
433
+ expand_binnumbers : bool, optional
434
+ 'False' (default): the returned `binnumber` is a shape (N,) array of
435
+ linearized bin indices.
436
+ 'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
437
+ ndarray, where each row gives the bin numbers in the corresponding
438
+ dimension.
439
+ See the `binnumber` returned value, and the `Examples` section of
440
+ `binned_statistic_2d`.
441
+ binned_statistic_result : BinnedStatisticddResult
442
+ Result of a previous call to the function in order to reuse bin edges
443
+ and bin numbers with new values and/or a different statistic.
444
+ To reuse bin numbers, `expand_binnumbers` must have been set to False
445
+ (the default).
446
+
447
+ .. versionadded:: 0.17.0
448
+
449
+ Returns
450
+ -------
451
+ statistic : ndarray, shape(nx1, nx2, nx3,...)
452
+ The values of the selected statistic in each two-dimensional bin.
453
+ bin_edges : list of ndarrays
454
+ A list of D arrays describing the (nxi + 1) bin edges for each
455
+ dimension.
456
+ binnumber : (N,) array of ints or (D,N) ndarray of ints
457
+ This assigns to each element of `sample` an integer that represents the
458
+ bin in which this observation falls. The representation depends on the
459
+ `expand_binnumbers` argument. See `Notes` for details.
460
+
461
+
462
+ See Also
463
+ --------
464
+ numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
465
+
466
+ Notes
467
+ -----
468
+ Binedges:
469
+ All but the last (righthand-most) bin is half-open in each dimension. In
470
+ other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
471
+ ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
472
+ last bin, however, is ``[3, 4]``, which *includes* 4.
473
+
474
+ `binnumber`:
475
+ This returned argument assigns to each element of `sample` an integer that
476
+ represents the bin in which it belongs. The representation depends on the
477
+ `expand_binnumbers` argument. If 'False' (default): The returned
478
+ `binnumber` is a shape (N,) array of linearized indices mapping each
479
+ element of `sample` to its corresponding bin (using row-major ordering).
480
+ If 'True': The returned `binnumber` is a shape (D,N) ndarray where
481
+ each row indicates bin placements for each dimension respectively. In each
482
+ dimension, a binnumber of `i` means the corresponding value is between
483
+ (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
484
+
485
+ .. versionadded:: 0.11.0
486
+
487
+ Examples
488
+ --------
489
+ >>> import numpy as np
490
+ >>> from scipy import stats
491
+ >>> import matplotlib.pyplot as plt
492
+ >>> from mpl_toolkits.mplot3d import Axes3D
493
+
494
+ Take an array of 600 (x, y) coordinates as an example.
495
+ `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
496
+ of dimension `D+1` is required.
497
+
498
+ >>> mu = np.array([0., 1.])
499
+ >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
500
+ >>> multinormal = stats.multivariate_normal(mu, sigma)
501
+ >>> data = multinormal.rvs(size=600, random_state=235412)
502
+ >>> data.shape
503
+ (600, 2)
504
+
505
+ Create bins and count how many arrays fall in each bin:
506
+
507
+ >>> N = 60
508
+ >>> x = np.linspace(-3, 3, N)
509
+ >>> y = np.linspace(-3, 4, N)
510
+ >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
511
+ ... statistic='count')
512
+ >>> bincounts = ret.statistic
513
+
514
+ Set the volume and the location of bars:
515
+
516
+ >>> dx = x[1] - x[0]
517
+ >>> dy = y[1] - y[0]
518
+ >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
519
+ >>> z = 0
520
+
521
+ >>> bincounts = bincounts.ravel()
522
+ >>> x = x.ravel()
523
+ >>> y = y.ravel()
524
+
525
+ >>> fig = plt.figure()
526
+ >>> ax = fig.add_subplot(111, projection='3d')
527
+ >>> with np.errstate(divide='ignore'): # silence random axes3d warning
528
+ ... ax.bar3d(x, y, z, dx, dy, bincounts)
529
+
530
+ Reuse bin numbers and bin edges with new values:
531
+
532
+ >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
533
+ ... binned_statistic_result=ret,
534
+ ... statistic='mean')
535
+ """
536
+ known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
537
+ if not callable(statistic) and statistic not in known_stats:
538
+ raise ValueError(f'invalid statistic {statistic!r}')
539
+
540
+ try:
541
+ bins = index(bins)
542
+ except TypeError:
543
+ # bins is not an integer
544
+ pass
545
+ # If bins was an integer-like object, now it is an actual Python int.
546
+
547
+ # NOTE: for _bin_edges(), see e.g. gh-11365
548
+ if isinstance(bins, int) and not np.isfinite(sample).all():
549
+ raise ValueError(f'{sample!r} contains non-finite values.')
550
+
551
+ # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
552
+ # `Dlen` is the length of elements along each dimension.
553
+ # This code is based on np.histogramdd
554
+ try:
555
+ # `sample` is an ND-array.
556
+ Dlen, Ndim = sample.shape
557
+ except (AttributeError, ValueError):
558
+ # `sample` is a sequence of 1D arrays.
559
+ sample = np.atleast_2d(sample).T
560
+ Dlen, Ndim = sample.shape
561
+
562
+ # Store initial shape of `values` to preserve it in the output
563
+ values = np.asarray(values)
564
+ input_shape = list(values.shape)
565
+ # Make sure that `values` is 2D to iterate over rows
566
+ values = np.atleast_2d(values)
567
+ Vdim, Vlen = values.shape
568
+
569
+ # Make sure `values` match `sample`
570
+ if statistic != 'count' and Vlen != Dlen:
571
+ raise AttributeError('The number of `values` elements must match the '
572
+ 'length of each `sample` dimension.')
573
+
574
+ try:
575
+ M = len(bins)
576
+ if M != Ndim:
577
+ raise AttributeError('The dimension of bins must be equal '
578
+ 'to the dimension of the sample x.')
579
+ except TypeError:
580
+ bins = Ndim * [bins]
581
+
582
+ if binned_statistic_result is None:
583
+ nbin, edges, dedges = _bin_edges(sample, bins, range)
584
+ binnumbers = _bin_numbers(sample, nbin, edges, dedges)
585
+ else:
586
+ edges = binned_statistic_result.bin_edges
587
+ nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
588
+ # +1 for outlier bins
589
+ dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
590
+ binnumbers = binned_statistic_result.binnumber
591
+
592
+ # Avoid overflow with double precision. Complex `values` -> `complex128`.
593
+ result_type = np.result_type(values, np.float64)
594
+ result = np.empty([Vdim, nbin.prod()], dtype=result_type)
595
+
596
+ if statistic in {'mean', np.mean}:
597
+ result.fill(np.nan)
598
+ flatcount = _bincount(binnumbers, None)
599
+ a = flatcount.nonzero()
600
+ for vv in builtins.range(Vdim):
601
+ flatsum = _bincount(binnumbers, values[vv])
602
+ result[vv, a] = flatsum[a] / flatcount[a]
603
+ elif statistic in {'std', np.std}:
604
+ result.fill(np.nan)
605
+ flatcount = _bincount(binnumbers, None)
606
+ a = flatcount.nonzero()
607
+ for vv in builtins.range(Vdim):
608
+ flatsum = _bincount(binnumbers, values[vv])
609
+ delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
610
+ std = np.sqrt(
611
+ _bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
612
+ )
613
+ result[vv, a] = std
614
+ result = np.real(result)
615
+ elif statistic == 'count':
616
+ result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
617
+ result.fill(0)
618
+ flatcount = _bincount(binnumbers, None)
619
+ a = np.arange(len(flatcount))
620
+ result[:, a] = flatcount[np.newaxis, :]
621
+ elif statistic in {'sum', np.sum}:
622
+ result.fill(0)
623
+ for vv in builtins.range(Vdim):
624
+ flatsum = _bincount(binnumbers, values[vv])
625
+ a = np.arange(len(flatsum))
626
+ result[vv, a] = flatsum
627
+ elif statistic in {'median', np.median}:
628
+ result.fill(np.nan)
629
+ for vv in builtins.range(Vdim):
630
+ i = np.lexsort((values[vv], binnumbers))
631
+ _, j, counts = np.unique(binnumbers[i],
632
+ return_index=True, return_counts=True)
633
+ mid = j + (counts - 1) / 2
634
+ mid_a = values[vv, i][np.floor(mid).astype(int)]
635
+ mid_b = values[vv, i][np.ceil(mid).astype(int)]
636
+ medians = (mid_a + mid_b) / 2
637
+ result[vv, binnumbers[i][j]] = medians
638
+ elif statistic in {'min', np.min}:
639
+ result.fill(np.nan)
640
+ for vv in builtins.range(Vdim):
641
+ i = np.argsort(values[vv])[::-1] # Reversed so the min is last
642
+ result[vv, binnumbers[i]] = values[vv, i]
643
+ elif statistic in {'max', np.max}:
644
+ result.fill(np.nan)
645
+ for vv in builtins.range(Vdim):
646
+ i = np.argsort(values[vv])
647
+ result[vv, binnumbers[i]] = values[vv, i]
648
+ elif callable(statistic):
649
+ with np.errstate(invalid='ignore'), catch_warnings():
650
+ simplefilter("ignore", RuntimeWarning)
651
+ try:
652
+ null = statistic([])
653
+ except Exception:
654
+ null = np.nan
655
+ if np.iscomplexobj(null):
656
+ result = result.astype(np.complex128)
657
+ result.fill(null)
658
+ try:
659
+ _calc_binned_statistic(
660
+ Vdim, binnumbers, result, values, statistic
661
+ )
662
+ except ValueError:
663
+ result = result.astype(np.complex128)
664
+ _calc_binned_statistic(
665
+ Vdim, binnumbers, result, values, statistic
666
+ )
667
+
668
+ # Shape into a proper matrix
669
+ result = result.reshape(np.append(Vdim, nbin))
670
+
671
+ # Remove outliers (indices 0 and -1 for each bin-dimension).
672
+ core = tuple([slice(None)] + Ndim * [slice(1, -1)])
673
+ result = result[core]
674
+
675
+ # Unravel binnumbers into an ndarray, each row the bins for each dimension
676
+ if expand_binnumbers and Ndim > 1:
677
+ binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
678
+
679
+ if np.any(result.shape[1:] != nbin - 2):
680
+ raise RuntimeError('Internal Shape Error')
681
+
682
+ # Reshape to have output (`result`) match input (`values`) shape
683
+ result = result.reshape(input_shape[:-1] + list(nbin-2))
684
+
685
+ return BinnedStatisticddResult(result, edges, binnumbers)
686
+
687
+
688
+ def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
689
+ unique_bin_numbers = np.unique(bin_numbers)
690
+ for vv in builtins.range(Vdim):
691
+ bin_map = _create_binned_data(bin_numbers, unique_bin_numbers,
692
+ values, vv)
693
+ for i in unique_bin_numbers:
694
+ stat = stat_func(np.array(bin_map[i]))
695
+ if np.iscomplexobj(stat) and not np.iscomplexobj(result):
696
+ raise ValueError("The statistic function returns complex ")
697
+ result[vv, i] = stat
698
+
699
+
700
+ def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
701
+ """ Create hashmap of bin ids to values in bins
702
+ key: bin number
703
+ value: list of binned data
704
+ """
705
+ bin_map = dict()
706
+ for i in unique_bin_numbers:
707
+ bin_map[i] = []
708
+ for i in builtins.range(len(bin_numbers)):
709
+ bin_map[bin_numbers[i]].append(values[vv, i])
710
+ return bin_map
711
+
712
+
713
+ def _bin_edges(sample, bins=None, range=None):
714
+ """ Create edge arrays
715
+ """
716
+ Dlen, Ndim = sample.shape
717
+
718
+ nbin = np.empty(Ndim, int) # Number of bins in each dimension
719
+ edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
720
+ dedges = Ndim * [None] # Spacing between edges (will be 2D array)
721
+
722
+ # Select range for each dimension
723
+ # Used only if number of bins is given.
724
+ if range is None:
725
+ smin = np.atleast_1d(np.array(sample.min(axis=0), float))
726
+ smax = np.atleast_1d(np.array(sample.max(axis=0), float))
727
+ else:
728
+ if len(range) != Ndim:
729
+ raise ValueError(
730
+ f"range given for {len(range)} dimensions; {Ndim} required")
731
+ smin = np.empty(Ndim)
732
+ smax = np.empty(Ndim)
733
+ for i in builtins.range(Ndim):
734
+ if range[i][1] < range[i][0]:
735
+ raise ValueError(
736
+ "In {}range, start must be <= stop".format(
737
+ f"dimension {i + 1} of " if Ndim > 1 else ""))
738
+ smin[i], smax[i] = range[i]
739
+
740
+ # Make sure the bins have a finite width.
741
+ for i in builtins.range(len(smin)):
742
+ if smin[i] == smax[i]:
743
+ smin[i] = smin[i] - .5
744
+ smax[i] = smax[i] + .5
745
+
746
+ # Preserve sample floating point precision in bin edges
747
+ edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
748
+ else float)
749
+
750
+ # Create edge arrays
751
+ for i in builtins.range(Ndim):
752
+ if np.isscalar(bins[i]):
753
+ nbin[i] = bins[i] + 2 # +2 for outlier bins
754
+ edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
755
+ dtype=edges_dtype)
756
+ else:
757
+ edges[i] = np.asarray(bins[i], edges_dtype)
758
+ nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
759
+ dedges[i] = np.diff(edges[i])
760
+
761
+ nbin = np.asarray(nbin)
762
+
763
+ return nbin, edges, dedges
764
+
765
+
766
+ def _bin_numbers(sample, nbin, edges, dedges):
767
+ """Compute the bin number each sample falls into, in each dimension
768
+ """
769
+ Dlen, Ndim = sample.shape
770
+
771
+ sampBin = [
772
+ np.digitize(sample[:, i], edges[i])
773
+ for i in range(Ndim)
774
+ ]
775
+
776
+ # Using `digitize`, values that fall on an edge are put in the right bin.
777
+ # For the rightmost bin, we want values equal to the right
778
+ # edge to be counted in the last bin, and not as an outlier.
779
+ for i in range(Ndim):
780
+ # Find the rounding precision
781
+ dedges_min = dedges[i].min()
782
+ if dedges_min == 0:
783
+ raise ValueError('The smallest edge difference is numerically 0.')
784
+ decimal = int(-np.log10(dedges_min)) + 6
785
+ # Find which points are on the rightmost edge.
786
+ on_edge = np.where((sample[:, i] >= edges[i][-1]) &
787
+ (np.around(sample[:, i], decimal) ==
788
+ np.around(edges[i][-1], decimal)))[0]
789
+ # Shift these points one bin to the left.
790
+ sampBin[i][on_edge] -= 1
791
+
792
+ # Compute the sample indices in the flattened statistic matrix.
793
+ binnumbers = np.ravel_multi_index(sampBin, nbin)
794
+
795
+ return binnumbers
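
The linearized bin numbers produced by `_bin_numbers` can be decoded by hand with `np.unravel_index`, since each dimension carries one extra outlier bin on each side. A short sketch reusing the docstring example above (expected output values are the ones quoted in that docstring):

    import numpy as np
    from scipy import stats

    x = [0.1, 0.1, 0.1, 0.6]
    y = [2.1, 2.6, 2.1, 2.1]
    edges = [[0.0, 0.5, 1.0], [2.0, 2.5, 3.0]]
    res = stats.binned_statistic_2d(x, y, None, 'count', bins=edges)

    # Each dimension has len(edges)-1 real bins plus two outlier bins,
    # i.e. len(edges)+1 bins in total, and the indices are row-major.
    nbin = [len(e) + 1 for e in edges]               # [4, 4]
    print(res.binnumber)                             # [5 6 5 9]
    print(np.vstack(np.unravel_index(res.binnumber, nbin)))
    # [[1 1 1 2]
    #  [1 2 1 1]]   -- same result as expand_binnumbers=True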
.venv/Lib/site-packages/scipy/stats/_binomtest.py ADDED
@@ -0,0 +1,375 @@
1
+ from math import sqrt
2
+ import numpy as np
3
+ from scipy._lib._util import _validate_int
4
+ from scipy.optimize import brentq
5
+ from scipy.special import ndtri
6
+ from ._discrete_distns import binom
7
+ from ._common import ConfidenceInterval
8
+
9
+
10
+ class BinomTestResult:
11
+ """
12
+ Result of `scipy.stats.binomtest`.
13
+
14
+ Attributes
15
+ ----------
16
+ k : int
17
+ The number of successes (copied from `binomtest` input).
18
+ n : int
19
+ The number of trials (copied from `binomtest` input).
20
+ alternative : str
21
+ Indicates the alternative hypothesis specified in the input
22
+ to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
23
+ or ``'less'``.
24
+ statistic: float
25
+ The estimate of the proportion of successes.
26
+ pvalue : float
27
+ The p-value of the hypothesis test.
28
+
29
+ """
30
+ def __init__(self, k, n, alternative, statistic, pvalue):
31
+ self.k = k
32
+ self.n = n
33
+ self.alternative = alternative
34
+ self.statistic = statistic
35
+ self.pvalue = pvalue
36
+
37
+ # add alias for backward compatibility
38
+ self.proportion_estimate = statistic
39
+
40
+ def __repr__(self):
41
+ s = ("BinomTestResult("
42
+ f"k={self.k}, "
43
+ f"n={self.n}, "
44
+ f"alternative={self.alternative!r}, "
45
+ f"statistic={self.statistic}, "
46
+ f"pvalue={self.pvalue})")
47
+ return s
48
+
49
+ def proportion_ci(self, confidence_level=0.95, method='exact'):
50
+ """
51
+ Compute the confidence interval for ``statistic``.
52
+
53
+ Parameters
54
+ ----------
55
+ confidence_level : float, optional
56
+ Confidence level for the computed confidence interval
57
+ of the estimated proportion. Default is 0.95.
58
+ method : {'exact', 'wilson', 'wilsoncc'}, optional
59
+ Selects the method used to compute the confidence interval
60
+ for the estimate of the proportion:
61
+
62
+ 'exact' :
63
+ Use the Clopper-Pearson exact method [1]_.
64
+ 'wilson' :
65
+ Wilson's method, without continuity correction ([2]_, [3]_).
66
+ 'wilsoncc' :
67
+ Wilson's method, with continuity correction ([2]_, [3]_).
68
+
69
+ Default is ``'exact'``.
70
+
71
+ Returns
72
+ -------
73
+ ci : ``ConfidenceInterval`` object
74
+ The object has attributes ``low`` and ``high`` that hold the
75
+ lower and upper bounds of the confidence interval.
76
+
77
+ References
78
+ ----------
79
+ .. [1] C. J. Clopper and E. S. Pearson, The use of confidence or
80
+ fiducial limits illustrated in the case of the binomial,
81
+ Biometrika, Vol. 26, No. 4, pp 404-413 (Dec. 1934).
82
+ .. [2] E. B. Wilson, Probable inference, the law of succession, and
83
+ statistical inference, J. Amer. Stat. Assoc., 22, pp 209-212
84
+ (1927).
85
+ .. [3] Robert G. Newcombe, Two-sided confidence intervals for the
86
+ single proportion: comparison of seven methods, Statistics
87
+ in Medicine, 17, pp 857-872 (1998).
88
+
89
+ Examples
90
+ --------
91
+ >>> from scipy.stats import binomtest
92
+ >>> result = binomtest(k=7, n=50, p=0.1)
93
+ >>> result.statistic
94
+ 0.14
95
+ >>> result.proportion_ci()
96
+ ConfidenceInterval(low=0.05819170033997342, high=0.26739600249700846)
97
+ """
98
+ if method not in ('exact', 'wilson', 'wilsoncc'):
99
+ raise ValueError(f"method ('{method}') must be one of 'exact', "
100
+ "'wilson' or 'wilsoncc'.")
101
+ if not (0 <= confidence_level <= 1):
102
+ raise ValueError(f'confidence_level ({confidence_level}) must be in '
103
+ 'the interval [0, 1].')
104
+ if method == 'exact':
105
+ low, high = _binom_exact_conf_int(self.k, self.n,
106
+ confidence_level,
107
+ self.alternative)
108
+ else:
109
+ # method is 'wilson' or 'wilsoncc'
110
+ low, high = _binom_wilson_conf_int(self.k, self.n,
111
+ confidence_level,
112
+ self.alternative,
113
+ correction=method == 'wilsoncc')
114
+ return ConfidenceInterval(low=low, high=high)
115
+
116
+
117
+ def _findp(func):
118
+ try:
119
+ p = brentq(func, 0, 1)
120
+ except RuntimeError:
121
+ raise RuntimeError('numerical solver failed to converge when '
122
+ 'computing the confidence limits') from None
123
+ except ValueError as exc:
124
+ raise ValueError('brentq raised a ValueError; report this to the '
125
+ 'SciPy developers') from exc
126
+ return p
127
+
128
+
129
+ def _binom_exact_conf_int(k, n, confidence_level, alternative):
130
+ """
131
+ Compute the estimate and confidence interval for the binomial test.
132
+
133
+ Returns proportion, prop_low, prop_high
134
+ """
135
+ if alternative == 'two-sided':
136
+ alpha = (1 - confidence_level) / 2
137
+ if k == 0:
138
+ plow = 0.0
139
+ else:
140
+ plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
141
+ if k == n:
142
+ phigh = 1.0
143
+ else:
144
+ phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
145
+ elif alternative == 'less':
146
+ alpha = 1 - confidence_level
147
+ plow = 0.0
148
+ if k == n:
149
+ phigh = 1.0
150
+ else:
151
+ phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
152
+ elif alternative == 'greater':
153
+ alpha = 1 - confidence_level
154
+ if k == 0:
155
+ plow = 0.0
156
+ else:
157
+ plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
158
+ phigh = 1.0
159
+ return plow, phigh
160
+
161
+
162
+ def _binom_wilson_conf_int(k, n, confidence_level, alternative, correction):
163
+ # This function assumes that the arguments have already been validated.
164
+ # In particular, `alternative` must be one of 'two-sided', 'less' or
165
+ # 'greater'.
166
+ p = k / n
167
+ if alternative == 'two-sided':
168
+ z = ndtri(0.5 + 0.5*confidence_level)
169
+ else:
170
+ z = ndtri(confidence_level)
171
+
172
+ # For reference, the formulas implemented here are from
173
+ # Newcombe (1998) (ref. [3] in the proportion_ci docstring).
174
+ denom = 2*(n + z**2)
175
+ center = (2*n*p + z**2)/denom
176
+ q = 1 - p
177
+ if correction:
178
+ if alternative == 'less' or k == 0:
179
+ lo = 0.0
180
+ else:
181
+ dlo = (1 + z*sqrt(z**2 - 2 - 1/n + 4*p*(n*q + 1))) / denom
182
+ lo = center - dlo
183
+ if alternative == 'greater' or k == n:
184
+ hi = 1.0
185
+ else:
186
+ dhi = (1 + z*sqrt(z**2 + 2 - 1/n + 4*p*(n*q - 1))) / denom
187
+ hi = center + dhi
188
+ else:
189
+ delta = z/denom * sqrt(4*n*p*q + z**2)
190
+ if alternative == 'less' or k == 0:
191
+ lo = 0.0
192
+ else:
193
+ lo = center - delta
194
+ if alternative == 'greater' or k == n:
195
+ hi = 1.0
196
+ else:
197
+ hi = center + delta
198
+
199
+ return lo, hi
200
+
201
+
202
+ def binomtest(k, n, p=0.5, alternative='two-sided'):
203
+ """
204
+ Perform a test that the probability of success is p.
205
+
206
+ The binomial test [1]_ is a test of the null hypothesis that the
207
+ probability of success in a Bernoulli experiment is `p`.
208
+
209
+ Details of the test can be found in many texts on statistics, such
210
+ as section 24.5 of [2]_.
211
+
212
+ Parameters
213
+ ----------
214
+ k : int
215
+ The number of successes.
216
+ n : int
217
+ The number of trials.
218
+ p : float, optional
219
+ The hypothesized probability of success, i.e. the expected
220
+ proportion of successes. The value must be in the interval
221
+ ``0 <= p <= 1``. The default value is ``p = 0.5``.
222
+ alternative : {'two-sided', 'greater', 'less'}, optional
223
+ Indicates the alternative hypothesis. The default value is
224
+ 'two-sided'.
225
+
226
+ Returns
227
+ -------
228
+ result : `~scipy.stats._result_classes.BinomTestResult` instance
229
+ The return value is an object with the following attributes:
230
+
231
+ k : int
232
+ The number of successes (copied from `binomtest` input).
233
+ n : int
234
+ The number of trials (copied from `binomtest` input).
235
+ alternative : str
236
+ Indicates the alternative hypothesis specified in the input
237
+ to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
238
+ or ``'less'``.
239
+ statistic : float
240
+ The estimate of the proportion of successes.
241
+ pvalue : float
242
+ The p-value of the hypothesis test.
243
+
244
+ The object has the following methods:
245
+
246
+ proportion_ci(confidence_level=0.95, method='exact') :
247
+ Compute the confidence interval for ``statistic``.
248
+
249
+ Notes
250
+ -----
251
+ .. versionadded:: 1.7.0
252
+
253
+ References
254
+ ----------
255
+ .. [1] Binomial test, https://en.wikipedia.org/wiki/Binomial_test
256
+ .. [2] Jerrold H. Zar, Biostatistical Analysis (fifth edition),
257
+ Prentice Hall, Upper Saddle River, New Jersey USA (2010)
258
+
259
+ Examples
260
+ --------
261
+ >>> from scipy.stats import binomtest
262
+
263
+ A car manufacturer claims that no more than 10% of their cars are unsafe.
264
+ 15 cars are inspected for safety, 3 were found to be unsafe. Test the
265
+ manufacturer's claim:
266
+
267
+ >>> result = binomtest(3, n=15, p=0.1, alternative='greater')
268
+ >>> result.pvalue
269
+ 0.18406106910639114
270
+
271
+ The null hypothesis cannot be rejected at the 5% level of significance
272
+ because the returned p-value is greater than the critical value of 5%.
273
+
274
+ The test statistic is equal to the estimated proportion, which is simply
275
+ ``3/15``:
276
+
277
+ >>> result.statistic
278
+ 0.2
279
+
280
+ We can use the `proportion_ci()` method of the result to compute the
281
+ confidence interval of the estimate:
282
+
283
+ >>> result.proportion_ci(confidence_level=0.95)
284
+ ConfidenceInterval(low=0.05684686759024681, high=1.0)
285
+
286
+ """
287
+ k = _validate_int(k, 'k', minimum=0)
288
+ n = _validate_int(n, 'n', minimum=1)
289
+ if k > n:
290
+ raise ValueError(f'k ({k}) must not be greater than n ({n}).')
291
+
292
+ if not (0 <= p <= 1):
293
+ raise ValueError(f"p ({p}) must be in range [0,1]")
294
+
295
+ if alternative not in ('two-sided', 'less', 'greater'):
296
+ raise ValueError(f"alternative ('{alternative}') not recognized; \n"
297
+ "must be 'two-sided', 'less' or 'greater'")
298
+ if alternative == 'less':
299
+ pval = binom.cdf(k, n, p)
300
+ elif alternative == 'greater':
301
+ pval = binom.sf(k-1, n, p)
302
+ else:
303
+ # alternative is 'two-sided'
304
+ d = binom.pmf(k, n, p)
305
+ rerr = 1 + 1e-7
306
+ if k == p * n:
307
+ # special case as shortcut, would also be handled by `else` below
308
+ pval = 1.
309
+ elif k < p * n:
310
+ ix = _binary_search_for_binom_tst(lambda x1: -binom.pmf(x1, n, p),
311
+ -d*rerr, np.ceil(p * n), n)
312
+ # y is the number of terms between mode and n that are <= d*rerr.
313
+ # ix gave us the first term where a(ix) <= d*rerr < a(ix-1)
314
+ # if the first equality doesn't hold, y=n-ix. Otherwise, we
315
+ # need to include ix as well as the equality holds. Note that
316
+ # the equality will hold in very very rare situations due to rerr.
317
+ y = n - ix + int(d*rerr == binom.pmf(ix, n, p))
318
+ pval = binom.cdf(k, n, p) + binom.sf(n - y, n, p)
319
+ else:
320
+ ix = _binary_search_for_binom_tst(lambda x1: binom.pmf(x1, n, p),
321
+ d*rerr, 0, np.floor(p * n))
322
+ # y is the number of terms between 0 and mode that are <= d*rerr.
323
+ # we need to add a 1 to account for the 0 index.
324
+ # For comparing this with old behavior, see
325
+ # tst_binary_srch_for_binom_tst method in test_morestats.
326
+ y = ix + 1
327
+ pval = binom.cdf(y-1, n, p) + binom.sf(k-1, n, p)
328
+
329
+ pval = min(1.0, pval)
330
+
331
+ result = BinomTestResult(k=k, n=n, alternative=alternative,
332
+ statistic=k/n, pvalue=pval)
333
+ return result
334
+
335
+
336
+ def _binary_search_for_binom_tst(a, d, lo, hi):
337
+ """
338
+ Conducts an implicit binary search on a function specified by `a`.
339
+
340
+ Meant to be used on the binomial PMF for the case of two-sided tests
341
+ to obtain the value on the other side of the mode where the tail
342
+ probability should be computed. The values on either side of
343
+ the mode are always in order, meaning binary search is applicable.
344
+
345
+ Parameters
346
+ ----------
347
+ a : callable
348
+ The function over which to perform binary search. Its values
349
+ for inputs lo and hi should be in ascending order.
350
+ d : float
351
+ The value to search.
352
+ lo : int
353
+ The lower end of range to search.
354
+ hi : int
355
+ The higher end of the range to search.
356
+
357
+ Returns
358
+ -------
359
+ int
360
+ The index, i between lo and hi
361
+ such that a(i)<=d<a(i+1)
362
+ """
363
+ while lo < hi:
364
+ mid = lo + (hi-lo)//2
365
+ midval = a(mid)
366
+ if midval < d:
367
+ lo = mid+1
368
+ elif midval > d:
369
+ hi = mid-1
370
+ else:
371
+ return mid
372
+ if a(lo) <= d:
373
+ return lo
374
+ else:
375
+ return lo-1
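
A short usage sketch of the public entry point defined above, comparing the exact (Clopper-Pearson) and Wilson confidence intervals for the same estimate; the inputs are the ones used in the docstring examples:

    from scipy.stats import binomtest

    res = binomtest(k=7, n=50, p=0.1)
    print(res.statistic)                         # 0.14, i.e. k/n
    print(res.pvalue)
    print(res.proportion_ci(method='exact'))     # Clopper-Pearson interval
    print(res.proportion_ci(method='wilsoncc'))  # Wilson, continuity-corrected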
.venv/Lib/site-packages/scipy/stats/_bws_test.py ADDED
@@ -0,0 +1,177 @@
1
+ import numpy as np
2
+ from functools import partial
3
+ from scipy import stats
4
+
5
+
6
+ def _bws_input_validation(x, y, alternative, method):
7
+ ''' Input validation and standardization for bws test'''
8
+ x, y = np.atleast_1d(x, y)
9
+ if x.ndim > 1 or y.ndim > 1:
10
+ raise ValueError('`x` and `y` must be exactly one-dimensional.')
11
+ if np.isnan(x).any() or np.isnan(y).any():
12
+ raise ValueError('`x` and `y` must not contain NaNs.')
13
+ if np.size(x) == 0 or np.size(y) == 0:
14
+ raise ValueError('`x` and `y` must be of nonzero size.')
15
+
16
+ z = stats.rankdata(np.concatenate((x, y)))
17
+ x, y = z[:len(x)], z[len(x):]
18
+
19
+ alternatives = {'two-sided', 'less', 'greater'}
20
+ alternative = alternative.lower()
21
+ if alternative not in alternatives:
22
+ raise ValueError(f'`alternative` must be one of {alternatives}.')
23
+
24
+ method = stats.PermutationMethod() if method is None else method
25
+ if not isinstance(method, stats.PermutationMethod):
26
+ raise ValueError('`method` must be an instance of '
27
+ '`scipy.stats.PermutationMethod`')
28
+
29
+ return x, y, alternative, method
30
+
31
+
32
+ def _bws_statistic(x, y, alternative, axis):
33
+ '''Compute the BWS test statistic for two independent samples'''
34
+ # Public function currently does not accept `axis`, but `permutation_test`
35
+ # uses `axis` to make vectorized call.
36
+
37
+ Ri, Hj = np.sort(x, axis=axis), np.sort(y, axis=axis)
38
+ n, m = Ri.shape[axis], Hj.shape[axis]
39
+ i, j = np.arange(1, n+1), np.arange(1, m+1)
40
+
41
+ Bx_num = Ri - (m + n)/n * i
42
+ By_num = Hj - (m + n)/m * j
43
+
44
+ if alternative == 'two-sided':
45
+ Bx_num *= Bx_num
46
+ By_num *= By_num
47
+ else:
48
+ Bx_num *= np.abs(Bx_num)
49
+ By_num *= np.abs(By_num)
50
+
51
+ Bx_den = i/(n+1) * (1 - i/(n+1)) * m*(m+n)/n
52
+ By_den = j/(m+1) * (1 - j/(m+1)) * n*(m+n)/m
53
+
54
+ Bx = 1/n * np.sum(Bx_num/Bx_den, axis=axis)
55
+ By = 1/m * np.sum(By_num/By_den, axis=axis)
56
+
57
+ B = (Bx + By) / 2 if alternative == 'two-sided' else (Bx - By) / 2
58
+
59
+ return B
60
+
61
+
62
+ def bws_test(x, y, *, alternative="two-sided", method=None):
63
+ r'''Perform the Baumgartner-Weiss-Schindler test on two independent samples.
64
+
65
+ The Baumgartner-Weiss-Schindler (BWS) test is a nonparametric test of
66
+ the null hypothesis that the distribution underlying sample `x`
67
+ is the same as the distribution underlying sample `y`. Unlike
68
+ the Kolmogorov-Smirnov, Wilcoxon, and Cramer-Von Mises tests,
69
+ the BWS test weights the integral by the variance of the difference
70
+ in cumulative distribution functions (CDFs), emphasizing the tails of the
71
+ distributions, which increases the power of the test in many applications.
72
+
73
+ Parameters
74
+ ----------
75
+ x, y : array-like
76
+ 1-d arrays of samples.
77
+ alternative : {'two-sided', 'less', 'greater'}, optional
78
+ Defines the alternative hypothesis. Default is 'two-sided'.
79
+ Let *F(u)* and *G(u)* be the cumulative distribution functions of the
80
+ distributions underlying `x` and `y`, respectively. Then the following
81
+ alternative hypotheses are available:
82
+
83
+ * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
84
+ at least one *u*.
85
+ * 'less': the distribution underlying `x` is stochastically less than
86
+ the distribution underlying `y`, i.e. *F(u) >= G(u)* for all *u*.
87
+ * 'greater': the distribution underlying `x` is stochastically greater
88
+ than the distribution underlying `y`, i.e. *F(u) <= G(u)* for all
89
+ *u*.
90
+
91
+ Under a more restrictive set of assumptions, the alternative hypotheses
92
+ can be expressed in terms of the locations of the distributions;
93
+ see [2]_ section 5.1.
94
+ method : PermutationMethod, optional
95
+ Configures the method used to compute the p-value. The default is
96
+ the default `PermutationMethod` object.
97
+
98
+ Returns
99
+ -------
100
+ res : PermutationTestResult
101
+ An object with attributes:
102
+
103
+ statistic : float
104
+ The observed test statistic of the data.
105
+ pvalue : float
106
+ The p-value for the given alternative.
107
+ null_distribution : ndarray
108
+ The values of the test statistic generated under the null hypothesis.
109
+
110
+ See also
111
+ --------
112
+ scipy.stats.wilcoxon, scipy.stats.mannwhitneyu, scipy.stats.ttest_ind
113
+
114
+ Notes
115
+ -----
116
+ When ``alternative=='two-sided'``, the statistic is defined by the
117
+ equations given in [1]_ Section 2. This statistic is not appropriate for
118
+ one-sided alternatives; in that case, the statistic is the *negative* of
119
+ that given by the equations in [1]_ Section 2. Consequently, when the
120
+ distribution of the first sample is stochastically greater than that of the
121
+ second sample, the statistic will tend to be positive.
122
+
123
+ References
124
+ ----------
125
+ .. [1] Neuhäuser, M. (2005). Exact Tests Based on the
126
+ Baumgartner-Weiss-Schindler Statistic: A Survey. Statistical Papers,
127
+ 46(1), 1-29.
128
+ .. [2] Fay, M. P., & Proschan, M. A. (2010). Wilcoxon-Mann-Whitney or t-test?
129
+ On assumptions for hypothesis tests and multiple interpretations of
130
+ decision rules. Statistics surveys, 4, 1.
131
+
132
+ Examples
133
+ --------
134
+ We follow the example of table 3 in [1]_: Fourteen children were divided
135
+ randomly into two groups. Their ranks at performing a specific tests are
136
+ as follows.
137
+
138
+ >>> import numpy as np
139
+ >>> x = [1, 2, 3, 4, 6, 7, 8]
140
+ >>> y = [5, 9, 10, 11, 12, 13, 14]
141
+
142
+ We use the BWS test to assess whether there is a statistically significant
143
+ difference between the two groups.
144
+ The null hypothesis is that there is no difference in the distributions of
145
+ performance between the two groups. We decide that a significance level of
146
+ 1% is required to reject the null hypothesis in favor of the alternative
147
+ that the distributions are different.
148
+ Since the number of samples is very small, we can compare the observed test
149
+ statistic against the *exact* distribution of the test statistic under the
150
+ null hypothesis.
151
+
152
+ >>> from scipy.stats import bws_test
153
+ >>> res = bws_test(x, y)
154
+ >>> print(res.statistic)
155
+ 5.132167152575315
156
+
157
+ This agrees with :math:`B = 5.132` reported in [1]_. The *p*-value produced
158
+ by `bws_test` also agrees with :math:`p = 0.0029` reported in [1]_.
159
+
160
+ >>> print(res.pvalue)
161
+ 0.002913752913752914
162
+
163
+ Because the p-value is below our threshold of 1%, we take this as evidence
164
+ against the null hypothesis in favor of the alternative that there is a
165
+ difference in performance between the two groups.
166
+ '''
167
+
168
+ x, y, alternative, method = _bws_input_validation(x, y, alternative,
169
+ method)
170
+ bws_statistic = partial(_bws_statistic, alternative=alternative)
171
+
172
+ permutation_alternative = 'less' if alternative == 'less' else 'greater'
173
+ res = stats.permutation_test((x, y), bws_statistic,
174
+ alternative=permutation_alternative,
175
+ **method._asdict())
176
+
177
+ return res
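
A brief usage sketch of the test defined above with an explicit permutation budget; the sample data are made up, and `n_resamples` is a standard `PermutationMethod` option:

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(12345)
    x = rng.normal(size=30)
    y = rng.normal(loc=0.5, size=30)

    # Randomized permutation p-value with a fixed number of resamples.
    method = stats.PermutationMethod(n_resamples=4999)
    res = stats.bws_test(x, y, alternative='two-sided', method=method)
    print(res.statistic, res.pvalue)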
.venv/Lib/site-packages/scipy/stats/_censored_data.py ADDED
@@ -0,0 +1,459 @@
1
+ import numpy as np
2
+
3
+
4
+ def _validate_1d(a, name, allow_inf=False):
5
+ if np.ndim(a) != 1:
6
+ raise ValueError(f'`{name}` must be a one-dimensional sequence.')
7
+ if np.isnan(a).any():
8
+ raise ValueError(f'`{name}` must not contain nan.')
9
+ if not allow_inf and np.isinf(a).any():
10
+ raise ValueError(f'`{name}` must contain only finite values.')
11
+
12
+
13
+ def _validate_interval(interval):
14
+ interval = np.asarray(interval)
15
+ if interval.shape == (0,):
16
+ # The input was a sequence with length 0.
17
+ interval = interval.reshape((0, 2))
18
+ if interval.ndim != 2 or interval.shape[-1] != 2:
19
+ raise ValueError('`interval` must be a two-dimensional array with '
20
+ 'shape (m, 2), where m is the number of '
21
+ 'interval-censored values, but got shape '
22
+ f'{interval.shape}')
23
+
24
+ if np.isnan(interval).any():
25
+ raise ValueError('`interval` must not contain nan.')
26
+ if np.isinf(interval).all(axis=1).any():
27
+ raise ValueError('In each row in `interval`, both values must not'
28
+ ' be infinite.')
29
+ if (interval[:, 0] > interval[:, 1]).any():
30
+ raise ValueError('In each row of `interval`, the left value must not'
31
+ ' exceed the right value.')
32
+
33
+ uncensored_mask = interval[:, 0] == interval[:, 1]
34
+ left_mask = np.isinf(interval[:, 0])
35
+ right_mask = np.isinf(interval[:, 1])
36
+ interval_mask = np.isfinite(interval).all(axis=1) & ~uncensored_mask
37
+
38
+ uncensored2 = interval[uncensored_mask, 0]
39
+ left2 = interval[left_mask, 1]
40
+ right2 = interval[right_mask, 0]
41
+ interval2 = interval[interval_mask]
42
+
43
+ return uncensored2, left2, right2, interval2
44
+
45
+
46
+ def _validate_x_censored(x, censored):
47
+ x = np.asarray(x)
48
+ if x.ndim != 1:
49
+ raise ValueError('`x` must be one-dimensional.')
50
+ censored = np.asarray(censored)
51
+ if censored.ndim != 1:
52
+ raise ValueError('`censored` must be one-dimensional.')
53
+ if (~np.isfinite(x)).any():
54
+ raise ValueError('`x` must not contain nan or inf.')
55
+ if censored.size != x.size:
56
+ raise ValueError('`x` and `censored` must have the same length.')
57
+ return x, censored.astype(bool)
58
+
59
+
60
+ class CensoredData:
61
+ """
62
+ Instances of this class represent censored data.
63
+
64
+ Instances may be passed to the ``fit`` method of continuous
65
+ univariate SciPy distributions for maximum likelihood estimation.
66
+ The *only* method of the univariate continuous distributions that
67
+ understands `CensoredData` is the ``fit`` method. An instance of
68
+ `CensoredData` can not be passed to methods such as ``pdf`` and
69
+ ``cdf``.
70
+
71
+ An observation is said to be *censored* when the precise value is unknown,
72
+ but it has a known upper and/or lower bound. The conventional terminology
73
+ is:
74
+
75
+ * left-censored: an observation is below a certain value but it is
76
+ unknown by how much.
77
+ * right-censored: an observation is above a certain value but it is
78
+ unknown by how much.
79
+ * interval-censored: an observation lies somewhere on an interval between
80
+ two values.
81
+
82
+ Left-, right-, and interval-censored data can be represented by
83
+ `CensoredData`.
84
+
85
+ For convenience, the class methods ``left_censored`` and
86
+ ``right_censored`` are provided to create a `CensoredData`
87
+ instance from a single one-dimensional array of measurements
88
+ and a corresponding boolean array to indicate which measurements
89
+ are censored. The class method ``interval_censored`` accepts two
90
+ one-dimensional arrays that hold the lower and upper bounds of the
91
+ intervals.
92
+
93
+ Parameters
94
+ ----------
95
+ uncensored : array_like, 1D
96
+ Uncensored observations.
97
+ left : array_like, 1D
98
+ Left-censored observations.
99
+ right : array_like, 1D
100
+ Right-censored observations.
101
+ interval : array_like, 2D, with shape (m, 2)
102
+ Interval-censored observations. Each row ``interval[k, :]``
103
+ represents the interval for the kth interval-censored observation.
104
+
105
+ Notes
106
+ -----
107
+ In the input array `interval`, the lower bound of the interval may
108
+ be ``-inf``, and the upper bound may be ``inf``, but at least one must be
109
+ finite. When the lower bound is ``-inf``, the row represents a left-
110
+ censored observation, and when the upper bound is ``inf``, the row
111
+ represents a right-censored observation. If the length of an interval
112
+ is 0 (i.e. ``interval[k, 0] == interval[k, 1]``), the observation is
113
+ treated as uncensored. So one can represent all the types of censored
114
+ and uncensored data in ``interval``, but it is generally more convenient
115
+ to use `uncensored`, `left` and `right` for uncensored, left-censored and
116
+ right-censored observations, respectively.
117
+
118
+ Examples
119
+ --------
120
+ In the most general case, a censored data set may contain values that
121
+ are left-censored, right-censored, interval-censored, and uncensored.
122
+ For example, here we create a data set with five observations. Two
123
+ are uncensored (values 1 and 1.5), one is a left-censored observation
124
+ of 0, one is a right-censored observation of 10 and one is
125
+ interval-censored in the interval [2, 3].
126
+
127
+ >>> import numpy as np
128
+ >>> from scipy.stats import CensoredData
129
+ >>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10],
130
+ ... interval=[[2, 3]])
131
+ >>> print(data)
132
+ CensoredData(5 values: 2 not censored, 1 left-censored,
133
+ 1 right-censored, 1 interval-censored)
134
+
135
+ Equivalently,
136
+
137
+ >>> data = CensoredData(interval=[[1, 1],
138
+ ... [1.5, 1.5],
139
+ ... [-np.inf, 0],
140
+ ... [10, np.inf],
141
+ ... [2, 3]])
142
+ >>> print(data)
143
+ CensoredData(5 values: 2 not censored, 1 left-censored,
144
+ 1 right-censored, 1 interval-censored)
145
+
146
+ A common case is to have a mix of uncensored observations and censored
147
+ observations that are all right-censored (or all left-censored). For
148
+ example, consider an experiment in which six devices are started at
149
+ various times and left running until they fail. Assume that time is
150
+ measured in hours, and the experiment is stopped after 30 hours, even
151
+ if all the devices have not failed by that time. We might end up with
152
+ data such as this::
153
+
154
+ Device   Start-time   Fail-time   Time-to-failure
155
+    1         0           13             13
156
+    2         2           24             22
157
+    3         5           22             17
158
+    4         8           23             15
159
+    5        10          ***            >20
160
+    6        12          ***            >18
161
+
162
+ Two of the devices had not failed when the experiment was stopped;
163
+ the observations of the time-to-failure for these two devices are
164
+ right-censored. We can represent this data with
165
+
166
+ >>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
167
+ >>> print(data)
168
+ CensoredData(6 values: 4 not censored, 2 right-censored)
169
+
170
+ Alternatively, we can use the method `CensoredData.right_censored` to
171
+ create a representation of this data. The time-to-failure observations
172
+ are put in the list ``ttf``. The ``censored`` list indicates which values
173
+ in ``ttf`` are censored.
174
+
175
+ >>> ttf = [13, 22, 17, 15, 20, 18]
176
+ >>> censored = [False, False, False, False, True, True]
177
+
178
+ Pass these lists to `CensoredData.right_censored` to create an
179
+ instance of `CensoredData`.
180
+
181
+ >>> data = CensoredData.right_censored(ttf, censored)
182
+ >>> print(data)
183
+ CensoredData(6 values: 4 not censored, 2 right-censored)
184
+
185
+ If the input data is interval censored and already stored in two
186
+ arrays, one holding the low end of the intervals and another
187
+ holding the high ends, the class method ``interval_censored`` can
188
+ be used to create the `CensoredData` instance.
189
+
190
+ This example creates an instance with four interval-censored values.
191
+ The intervals are [10, 11], [0.5, 1], [2, 3], and [12.5, 13.5].
192
+
193
+ >>> a = [10, 0.5, 2, 12.5] # Low ends of the intervals
194
+ >>> b = [11, 1.0, 3, 13.5] # High ends of the intervals
195
+ >>> data = CensoredData.interval_censored(low=a, high=b)
196
+ >>> print(data)
197
+ CensoredData(4 values: 0 not censored, 4 interval-censored)
198
+
199
+ Finally, we create and censor some data from the `weibull_min`
200
+ distribution, and then fit `weibull_min` to that data. We'll assume
201
+ that the location parameter is known to be 0.
202
+
203
+ >>> from scipy.stats import weibull_min
204
+ >>> rng = np.random.default_rng()
205
+
206
+ Create the random data set.
207
+
208
+ >>> x = weibull_min.rvs(2.5, loc=0, scale=30, size=250, random_state=rng)
209
+ >>> x[x > 40] = 40 # Right-censor values greater than 40.
210
+
211
+ Create the `CensoredData` instance with the `right_censored` method.
212
+ The censored values are those where the value is 40.
213
+
214
+ >>> data = CensoredData.right_censored(x, x == 40)
215
+ >>> print(data)
216
+ CensoredData(250 values: 215 not censored, 35 right-censored)
217
+
218
+ 35 values have been right-censored.
219
+
220
+ Fit `weibull_min` to the censored data. We expect the shape and scale
221
+ to be approximately 2.5 and 30, respectively.
222
+
223
+ >>> weibull_min.fit(data, floc=0)
224
+ (2.3575922823897315, 0, 30.40650074451254)
225
+
226
+ """
227
+
228
+ def __init__(self, uncensored=None, *, left=None, right=None,
229
+ interval=None):
230
+ if uncensored is None:
231
+ uncensored = []
232
+ if left is None:
233
+ left = []
234
+ if right is None:
235
+ right = []
236
+ if interval is None:
237
+ interval = np.empty((0, 2))
238
+
239
+ _validate_1d(uncensored, 'uncensored')
240
+ _validate_1d(left, 'left')
241
+ _validate_1d(right, 'right')
242
+ uncensored2, left2, right2, interval2 = _validate_interval(interval)
243
+
244
+ self._uncensored = np.concatenate((uncensored, uncensored2))
245
+ self._left = np.concatenate((left, left2))
246
+ self._right = np.concatenate((right, right2))
247
+ # Note that by construction, the private attribute _interval
248
+ # will be a 2D array that contains only finite values representing
249
+ # intervals with nonzero but finite length.
250
+ self._interval = interval2
251
+
252
+ def __repr__(self):
253
+ uncensored_str = " ".join(np.array_repr(self._uncensored).split())
254
+ left_str = " ".join(np.array_repr(self._left).split())
255
+ right_str = " ".join(np.array_repr(self._right).split())
256
+ interval_str = " ".join(np.array_repr(self._interval).split())
257
+ return (f"CensoredData(uncensored={uncensored_str}, left={left_str}, "
258
+ f"right={right_str}, interval={interval_str})")
259
+
260
+ def __str__(self):
261
+ num_nc = len(self._uncensored)
262
+ num_lc = len(self._left)
263
+ num_rc = len(self._right)
264
+ num_ic = len(self._interval)
265
+ n = num_nc + num_lc + num_rc + num_ic
266
+ parts = [f'{num_nc} not censored']
267
+ if num_lc > 0:
268
+ parts.append(f'{num_lc} left-censored')
269
+ if num_rc > 0:
270
+ parts.append(f'{num_rc} right-censored')
271
+ if num_ic > 0:
272
+ parts.append(f'{num_ic} interval-censored')
273
+ return f'CensoredData({n} values: ' + ', '.join(parts) + ')'
274
+
275
+ # This is not a complete implementation of the arithmetic operators.
276
+ # All we need is subtracting a scalar and dividing by a scalar.
277
+
278
+ def __sub__(self, other):
279
+ return CensoredData(uncensored=self._uncensored - other,
280
+ left=self._left - other,
281
+ right=self._right - other,
282
+ interval=self._interval - other)
283
+
284
+ def __truediv__(self, other):
285
+ return CensoredData(uncensored=self._uncensored / other,
286
+ left=self._left / other,
287
+ right=self._right / other,
288
+ interval=self._interval / other)
289
+
290
+ def __len__(self):
291
+ """
292
+ The number of values (censored and not censored).
293
+ """
294
+ return (len(self._uncensored) + len(self._left) + len(self._right)
295
+ + len(self._interval))
296
+
297
+ def num_censored(self):
298
+ """
299
+ Number of censored values.
300
+ """
301
+ return len(self._left) + len(self._right) + len(self._interval)
302
+
303
+ @classmethod
304
+ def right_censored(cls, x, censored):
305
+ """
306
+ Create a `CensoredData` instance of right-censored data.
307
+
308
+ Parameters
309
+ ----------
310
+ x : array_like
311
+ `x` is the array of observed data or measurements.
312
+ `x` must be a one-dimensional sequence of finite numbers.
313
+ censored : array_like of bool
314
+ `censored` must be a one-dimensional sequence of boolean
315
+ values. If ``censored[k]`` is True, the corresponding value
316
+ in `x` is right-censored. That is, the value ``x[k]``
317
+ is the lower bound of the true (but unknown) value.
318
+
319
+ Returns
320
+ -------
321
+ data : `CensoredData`
322
+ An instance of `CensoredData` that represents the
323
+ collection of uncensored and right-censored values.
324
+
325
+ Examples
326
+ --------
327
+ >>> from scipy.stats import CensoredData
328
+
329
+ Two uncensored values (4 and 10) and two right-censored values
330
+ (24 and 25).
331
+
332
+ >>> data = CensoredData.right_censored([4, 10, 24, 25],
333
+ ... [False, False, True, True])
334
+ >>> data
335
+ CensoredData(uncensored=array([ 4., 10.]),
336
+ left=array([], dtype=float64), right=array([24., 25.]),
337
+ interval=array([], shape=(0, 2), dtype=float64))
338
+ >>> print(data)
339
+ CensoredData(4 values: 2 not censored, 2 right-censored)
340
+ """
341
+ x, censored = _validate_x_censored(x, censored)
342
+ return cls(uncensored=x[~censored], right=x[censored])
343
+
344
+ @classmethod
345
+ def left_censored(cls, x, censored):
346
+ """
347
+ Create a `CensoredData` instance of left-censored data.
348
+
349
+ Parameters
350
+ ----------
351
+ x : array_like
352
+ `x` is the array of observed data or measurements.
353
+ `x` must be a one-dimensional sequence of finite numbers.
354
+ censored : array_like of bool
355
+ `censored` must be a one-dimensional sequence of boolean
356
+ values. If ``censored[k]`` is True, the corresponding value
357
+ in `x` is left-censored. That is, the value ``x[k]``
358
+ is the upper bound of the true (but unknown) value.
359
+
360
+ Returns
361
+ -------
362
+ data : `CensoredData`
363
+ An instance of `CensoredData` that represents the
364
+ collection of uncensored and left-censored values.
365
+
366
+ Examples
367
+ --------
368
+ >>> from scipy.stats import CensoredData
369
+
370
+ Two uncensored values (0.12 and 0.033) and two left-censored values
371
+ (both 1e-3).
372
+
373
+ >>> data = CensoredData.left_censored([0.12, 0.033, 1e-3, 1e-3],
374
+ ... [False, False, True, True])
375
+ >>> data
376
+ CensoredData(uncensored=array([0.12 , 0.033]),
377
+ left=array([0.001, 0.001]), right=array([], dtype=float64),
378
+ interval=array([], shape=(0, 2), dtype=float64))
379
+ >>> print(data)
380
+ CensoredData(4 values: 2 not censored, 2 left-censored)
381
+ """
382
+ x, censored = _validate_x_censored(x, censored)
383
+ return cls(uncensored=x[~censored], left=x[censored])
384
+
385
+ @classmethod
386
+ def interval_censored(cls, low, high):
387
+ """
388
+ Create a `CensoredData` instance of interval-censored data.
389
+
390
+ This method is useful when all the data is interval-censored, and
391
+ the low and high ends of the intervals are already stored in
392
+ separate one-dimensional arrays.
393
+
394
+ Parameters
395
+ ----------
396
+ low : array_like
397
+ The one-dimensional array containing the low ends of the
398
+ intervals.
399
+ high : array_like
400
+ The one-dimensional array containing the high ends of the
401
+ intervals.
402
+
403
+ Returns
404
+ -------
405
+ data : `CensoredData`
406
+ An instance of `CensoredData` that represents the
407
+ collection of censored values.
408
+
409
+ Examples
410
+ --------
411
+ >>> import numpy as np
412
+ >>> from scipy.stats import CensoredData
413
+
414
+ ``a`` and ``b`` are the low and high ends of a collection of
415
+ interval-censored values.
416
+
417
+ >>> a = [0.5, 2.0, 3.0, 5.5]
418
+ >>> b = [1.0, 2.5, 3.5, 7.0]
419
+ >>> data = CensoredData.interval_censored(low=a, high=b)
420
+ >>> print(data)
421
+ CensoredData(4 values: 0 not censored, 4 interval-censored)
422
+ """
423
+ _validate_1d(low, 'low', allow_inf=True)
424
+ _validate_1d(high, 'high', allow_inf=True)
425
+ if len(low) != len(high):
426
+ raise ValueError('`low` and `high` must have the same length.')
427
+ interval = np.column_stack((low, high))
428
+ uncensored, left, right, interval = _validate_interval(interval)
429
+ return cls(uncensored=uncensored, left=left, right=right,
430
+ interval=interval)
431
+
432
+ def _uncensor(self):
433
+ """
434
+ This function is used when a non-censored version of the data
435
+ is needed to create a rough estimate of the parameters of a
436
+ distribution via the method of moments or some similar method.
437
+ The data is "uncensored" by taking the given endpoints as the
438
+ data for the left- or right-censored data, and the mean for the
439
+ interval-censored data.
440
+ """
441
+ data = np.concatenate((self._uncensored, self._left, self._right,
442
+ self._interval.mean(axis=1)))
443
+ return data
444
+
445
+ def _supported(self, a, b):
446
+ """
447
+ Return a subset of self containing the values that are in
448
+ (or overlap with) the interval (a, b).
449
+ """
450
+ uncensored = self._uncensored
451
+ uncensored = uncensored[(a < uncensored) & (uncensored < b)]
452
+ left = self._left
453
+ left = left[a < left]
454
+ right = self._right
455
+ right = right[right < b]
456
+ interval = self._interval
457
+ interval = interval[(a < interval[:, 1]) & (interval[:, 0] < b)]
458
+ return CensoredData(uncensored, left=left, right=right,
459
+ interval=interval)
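A brief usage sketch for the class defined above: fit an exponential lifetime model to right-censored failure times. This assumes a SciPy version in which `CensoredData` is public and `fit` accepts censored data; the scale of 50 and the cutoff of 80 are arbitrary illustration values.

    import numpy as np
    from scipy.stats import CensoredData, expon

    rng = np.random.default_rng(12345)
    t = expon.rvs(scale=50, size=200, random_state=rng)  # true failure times
    observed = np.minimum(t, 80)   # observation stops at 80 hours
    censored = t > 80              # True where only a lower bound is known

    data = CensoredData.right_censored(observed, censored)
    loc, scale = expon.fit(data, floc=0)  # MLE with the location fixed at 0
    print(scale)                          # roughly 50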
.venv/Lib/site-packages/scipy/stats/_common.py ADDED
@@ -0,0 +1,5 @@
1
+ from collections import namedtuple
2
+
3
+
4
+ ConfidenceInterval = namedtuple("ConfidenceInterval", ["low", "high"])
5
+ ConfidenceInterval.__doc__ = "Class for confidence intervals."
.venv/Lib/site-packages/scipy/stats/_constants.py ADDED
@@ -0,0 +1,39 @@
1
+ """
2
+ Statistics-related constants.
3
+
4
+ """
5
+ import numpy as np
6
+
7
+
8
+ # The smallest representable positive number such that 1.0 + _EPS != 1.0.
9
+ _EPS = np.finfo(float).eps
10
+
11
+ # The largest [in magnitude] usable floating value.
12
+ _XMAX = np.finfo(float).max
13
+
14
+ # The log of the largest usable floating value; useful for knowing
15
+ # when exp(something) will overflow
16
+ _LOGXMAX = np.log(_XMAX)
17
+
18
+ # The smallest [in magnitude] usable (i.e. not subnormal) double precision
19
+ # floating value.
20
+ _XMIN = np.finfo(float).tiny
21
+
22
+ # The log of the smallest [in magnitude] usable (i.e. not subnormal)
23
+ # double precision floating value.
24
+ _LOGXMIN = np.log(_XMIN)
25
+
26
+ # -special.psi(1)
27
+ _EULER = 0.577215664901532860606512090082402431042
28
+
29
+ # special.zeta(3, 1) Apery's constant
30
+ _ZETA3 = 1.202056903159594285399738161511449990765
31
+
32
+ # sqrt(pi)
33
+ _SQRT_PI = 1.772453850905516027298167483341145182798
34
+
35
+ # sqrt(2/pi)
36
+ _SQRT_2_OVER_PI = 0.7978845608028654
37
+
38
+ # log(sqrt(2/pi))
39
+ _LOG_SQRT_2_OVER_PI = -0.22579135264472744
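A small sketch of how a constant like ``_LOGXMAX`` is typically used: arguments to ``exp`` beyond it overflow to ``inf``, so code can branch or clamp at that threshold. The check below is illustrative and not code from this module.

    import numpy as np

    _XMAX = np.finfo(float).max
    _LOGXMAX = np.log(_XMAX)          # about 709.78 for float64

    with np.errstate(over='ignore'):
        print(np.exp(_LOGXMAX + 1))   # inf: past the overflow threshold
    print(np.exp(_LOGXMAX - 1))       # very large, but still finite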
.venv/Lib/site-packages/scipy/stats/_continuous_distns.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_covariance.py ADDED
@@ -0,0 +1,633 @@
1
+ from functools import cached_property
2
+
3
+ import numpy as np
4
+ from scipy import linalg
5
+ from scipy.stats import _multivariate
6
+
7
+
8
+ __all__ = ["Covariance"]
9
+
10
+
11
+ class Covariance:
12
+ """
13
+ Representation of a covariance matrix
14
+
15
+ Calculations involving covariance matrices (e.g. data whitening,
16
+ multivariate normal function evaluation) are often performed more
17
+ efficiently using a decomposition of the covariance matrix instead of the
18
+ covariance matrix itself. This class allows the user to construct an
19
+ object representing a covariance matrix using any of several
20
+ decompositions and perform calculations using a common interface.
21
+
22
+ .. note::
23
+
24
+ The `Covariance` class cannot be instantiated directly. Instead, use
25
+ one of the factory methods (e.g. `Covariance.from_diagonal`).
26
+
27
+ Examples
28
+ --------
29
+ The `Covariance` class is is used by calling one of its
30
+ factory methods to create a `Covariance` object, then pass that
31
+ representation of the `Covariance` matrix as a shape parameter of a
32
+ multivariate distribution.
33
+
34
+ For instance, the multivariate normal distribution can accept an array
35
+ representing a covariance matrix:
36
+
37
+ >>> from scipy import stats
38
+ >>> import numpy as np
39
+ >>> d = [1, 2, 3]
40
+ >>> A = np.diag(d) # a diagonal covariance matrix
41
+ >>> x = [4, -2, 5] # a point of interest
42
+ >>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=A)
43
+ >>> dist.pdf(x)
44
+ 4.9595685102808205e-08
45
+
46
+ but the calculations are performed in a very generic way that does not
47
+ take advantage of any special properties of the covariance matrix. Because
48
+ our covariance matrix is diagonal, we can use ``Covariance.from_diagonal``
49
+ to create an object representing the covariance matrix, and
50
+ `multivariate_normal` can use this to compute the probability density
51
+ function more efficiently.
52
+
53
+ >>> cov = stats.Covariance.from_diagonal(d)
54
+ >>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=cov)
55
+ >>> dist.pdf(x)
56
+ 4.9595685102808205e-08
57
+
58
+ """
59
+ def __init__(self):
60
+ message = ("The `Covariance` class cannot be instantiated directly. "
61
+ "Please use one of the factory methods "
62
+ "(e.g. `Covariance.from_diagonal`).")
63
+ raise NotImplementedError(message)
64
+
65
+ @staticmethod
66
+ def from_diagonal(diagonal):
67
+ r"""
68
+ Return a representation of a covariance matrix from its diagonal.
69
+
70
+ Parameters
71
+ ----------
72
+ diagonal : array_like
73
+ The diagonal elements of a diagonal matrix.
74
+
75
+ Notes
76
+ -----
77
+ Let the diagonal elements of a diagonal covariance matrix :math:`D` be
78
+ stored in the vector :math:`d`.
79
+
80
+ When all elements of :math:`d` are strictly positive, whitening of a
81
+ data point :math:`x` is performed by computing
82
+ :math:`x \cdot d^{-1/2}`, where the inverse square root can be taken
83
+ element-wise.
84
+ :math:`\log\det{D}` is calculated as :math:`-2 \sum(\log{d})`,
85
+ where the :math:`\log` operation is performed element-wise.
86
+
87
+ This `Covariance` class supports singular covariance matrices. When
88
+ computing ``_log_pdet``, non-positive elements of :math:`d` are
89
+ ignored. Whitening is not well defined when the point to be whitened
90
+ does not lie in the span of the columns of the covariance matrix. The
91
+ convention taken here is to treat the inverse square root of
92
+ non-positive elements of :math:`d` as zeros.
93
+
94
+ Examples
95
+ --------
96
+ Prepare a symmetric positive definite covariance matrix ``A`` and a
97
+ data point ``x``.
98
+
99
+ >>> import numpy as np
100
+ >>> from scipy import stats
101
+ >>> rng = np.random.default_rng()
102
+ >>> n = 5
103
+ >>> A = np.diag(rng.random(n))
104
+ >>> x = rng.random(size=n)
105
+
106
+ Extract the diagonal from ``A`` and create the `Covariance` object.
107
+
108
+ >>> d = np.diag(A)
109
+ >>> cov = stats.Covariance.from_diagonal(d)
110
+
111
+ Compare the functionality of the `Covariance` object against a
112
+ reference implementations.
113
+
114
+ >>> res = cov.whiten(x)
115
+ >>> ref = np.diag(d**-0.5) @ x
116
+ >>> np.allclose(res, ref)
117
+ True
118
+ >>> res = cov.log_pdet
119
+ >>> ref = np.linalg.slogdet(A)[-1]
120
+ >>> np.allclose(res, ref)
121
+ True
122
+
123
+ """
124
+ return CovViaDiagonal(diagonal)
125
+
126
+ @staticmethod
127
+ def from_precision(precision, covariance=None):
128
+ r"""
129
+ Return a representation of a covariance from its precision matrix.
130
+
131
+ Parameters
132
+ ----------
133
+ precision : array_like
134
+ The precision matrix; that is, the inverse of a square, symmetric,
135
+ positive definite covariance matrix.
136
+ covariance : array_like, optional
137
+ The square, symmetric, positive definite covariance matrix. If not
138
+ provided, this may need to be calculated (e.g. to evaluate the
139
+ cumulative distribution function of
140
+ `scipy.stats.multivariate_normal`) by inverting `precision`.
141
+
142
+ Notes
143
+ -----
144
+ Let the covariance matrix be :math:`A`, its precision matrix be
145
+ :math:`P = A^{-1}`, and :math:`L` be the lower Cholesky factor such
146
+ that :math:`L L^T = P`.
147
+ Whitening of a data point :math:`x` is performed by computing
148
+ :math:`x^T L`. :math:`\log\det{A}` is calculated as
149
+ :math:`-2tr(\log{L})`, where the :math:`\log` operation is performed
150
+ element-wise.
151
+
152
+ This `Covariance` class does not support singular covariance matrices
153
+ because the precision matrix does not exist for a singular covariance
154
+ matrix.
155
+
156
+ Examples
157
+ --------
158
+ Prepare a symmetric positive definite precision matrix ``P`` and a
159
+ data point ``x``. (If the precision matrix is not already available,
160
+ consider the other factory methods of the `Covariance` class.)
161
+
162
+ >>> import numpy as np
163
+ >>> from scipy import stats
164
+ >>> rng = np.random.default_rng()
165
+ >>> n = 5
166
+ >>> P = rng.random(size=(n, n))
167
+ >>> P = P @ P.T # a precision matrix must be positive definite
168
+ >>> x = rng.random(size=n)
169
+
170
+ Create the `Covariance` object.
171
+
172
+ >>> cov = stats.Covariance.from_precision(P)
173
+
174
+ Compare the functionality of the `Covariance` object against
175
+ reference implementations.
176
+
177
+ >>> res = cov.whiten(x)
178
+ >>> ref = x @ np.linalg.cholesky(P)
179
+ >>> np.allclose(res, ref)
180
+ True
181
+ >>> res = cov.log_pdet
182
+ >>> ref = -np.linalg.slogdet(P)[-1]
183
+ >>> np.allclose(res, ref)
184
+ True
185
+
186
+ """
187
+ return CovViaPrecision(precision, covariance)
188
+
189
+ @staticmethod
190
+ def from_cholesky(cholesky):
191
+ r"""
192
+ Representation of a covariance provided via the (lower) Cholesky factor
193
+
194
+ Parameters
195
+ ----------
196
+ cholesky : array_like
197
+ The lower triangular Cholesky factor of the covariance matrix.
198
+
199
+ Notes
200
+ -----
201
+ Let the covariance matrix be :math:`A` and :math:`L` be the lower
202
+ Cholesky factor such that :math:`L L^T = A`.
203
+ Whitening of a data point :math:`x` is performed by computing
204
+ :math:`L^{-1} x`. :math:`\log\det{A}` is calculated as
205
+ :math:`2tr(\log{L})`, where the :math:`\log` operation is performed
206
+ element-wise.
207
+
208
+ This `Covariance` class does not support singular covariance matrices
209
+ because the Cholesky decomposition does not exist for a singular
210
+ covariance matrix.
211
+
212
+ Examples
213
+ --------
214
+ Prepare a symmetric positive definite covariance matrix ``A`` and a
215
+ data point ``x``.
216
+
217
+ >>> import numpy as np
218
+ >>> from scipy import stats
219
+ >>> rng = np.random.default_rng()
220
+ >>> n = 5
221
+ >>> A = rng.random(size=(n, n))
222
+ >>> A = A @ A.T # make the covariance symmetric positive definite
223
+ >>> x = rng.random(size=n)
224
+
225
+ Perform the Cholesky decomposition of ``A`` and create the
226
+ `Covariance` object.
227
+
228
+ >>> L = np.linalg.cholesky(A)
229
+ >>> cov = stats.Covariance.from_cholesky(L)
230
+
231
+ Compare the functionality of the `Covariance` object against
232
+ reference implementation.
233
+
234
+ >>> from scipy.linalg import solve_triangular
235
+ >>> res = cov.whiten(x)
236
+ >>> ref = solve_triangular(L, x, lower=True)
237
+ >>> np.allclose(res, ref)
238
+ True
239
+ >>> res = cov.log_pdet
240
+ >>> ref = np.linalg.slogdet(A)[-1]
241
+ >>> np.allclose(res, ref)
242
+ True
243
+
244
+ """
245
+ return CovViaCholesky(cholesky)
246
+
247
+ @staticmethod
248
+ def from_eigendecomposition(eigendecomposition):
249
+ r"""
250
+ Representation of a covariance provided via eigendecomposition
251
+
252
+ Parameters
253
+ ----------
254
+ eigendecomposition : sequence
255
+ A sequence (nominally a tuple) containing the eigenvalue and
256
+ eigenvector arrays as computed by `scipy.linalg.eigh` or
257
+ `numpy.linalg.eigh`.
258
+
259
+ Notes
260
+ -----
261
+ Let the covariance matrix be :math:`A`, let :math:`V` be matrix of
262
+ eigenvectors, and let :math:`W` be the diagonal matrix of eigenvalues
263
+ such that `V W V^T = A`.
264
+
265
+ When all of the eigenvalues are strictly positive, whitening of a
266
+ data point :math:`x` is performed by computing
267
+ :math:`x^T (V W^{-1/2})`, where the inverse square root can be taken
268
+ element-wise.
269
+ :math:`\log\det{A}` is calculated as :math:`tr(\log{W})`,
270
+ where the :math:`\log` operation is performed element-wise.
271
+
272
+ This `Covariance` class supports singular covariance matrices. When
273
+ computing ``_log_pdet``, non-positive eigenvalues are ignored.
274
+ Whitening is not well defined when the point to be whitened
275
+ does not lie in the span of the columns of the covariance matrix. The
276
+ convention taken here is to treat the inverse square root of
277
+ non-positive eigenvalues as zeros.
278
+
279
+ Examples
280
+ --------
281
+ Prepare a symmetric positive definite covariance matrix ``A`` and a
282
+ data point ``x``.
283
+
284
+ >>> import numpy as np
285
+ >>> from scipy import stats
286
+ >>> rng = np.random.default_rng()
287
+ >>> n = 5
288
+ >>> A = rng.random(size=(n, n))
289
+ >>> A = A @ A.T # make the covariance symmetric positive definite
290
+ >>> x = rng.random(size=n)
291
+
292
+ Perform the eigendecomposition of ``A`` and create the `Covariance`
293
+ object.
294
+
295
+ >>> w, v = np.linalg.eigh(A)
296
+ >>> cov = stats.Covariance.from_eigendecomposition((w, v))
297
+
298
+ Compare the functionality of the `Covariance` object against
299
+ reference implementations.
300
+
301
+ >>> res = cov.whiten(x)
302
+ >>> ref = x @ (v @ np.diag(w**-0.5))
303
+ >>> np.allclose(res, ref)
304
+ True
305
+ >>> res = cov.log_pdet
306
+ >>> ref = np.linalg.slogdet(A)[-1]
307
+ >>> np.allclose(res, ref)
308
+ True
309
+
310
+ """
311
+ return CovViaEigendecomposition(eigendecomposition)
312
+
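As the notes above point out, ``from_eigendecomposition`` (unlike the Cholesky- and precision-based factories) accepts a singular covariance matrix. A short sketch, with eigenvalues chosen by hand for illustration:

    import numpy as np
    from scipy import stats

    w = np.array([0.0, 1.0, 2.0])   # one zero eigenvalue -> rank-deficient
    v = np.eye(3)                   # eigenvectors of the diagonal matrix diag(w)
    cov = stats.Covariance.from_eigendecomposition((w, v))

    print(cov.rank)       # 2: the zero eigenvalue does not count toward the rank
    print(cov.log_pdet)   # log(1 * 2): the pseudo-determinant ignores the zero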
313
+ def whiten(self, x):
314
+ """
315
+ Perform a whitening transformation on data.
316
+
317
+ "Whitening" ("white" as in "white noise", in which each frequency has
318
+ equal magnitude) transforms a set of random variables into a new set of
319
+ random variables with unit-diagonal covariance. When a whitening
320
+ transform is applied to a sample of points distributed according to
321
+ a multivariate normal distribution with zero mean, the covariance of
322
+ the transformed sample is approximately the identity matrix.
323
+
324
+ Parameters
325
+ ----------
326
+ x : array_like
327
+ An array of points. The last dimension must correspond with the
328
+ dimensionality of the space, i.e., the number of columns in the
329
+ covariance matrix.
330
+
331
+ Returns
332
+ -------
333
+ x_ : array_like
334
+ The transformed array of points.
335
+
336
+ References
337
+ ----------
338
+ .. [1] "Whitening Transformation". Wikipedia.
339
+ https://en.wikipedia.org/wiki/Whitening_transformation
340
+ .. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
341
+ coloring linear transformation". Transactions of VSB 18.2
342
+ (2018): 31-35. :doi:`10.31490/tces-2018-0013`
343
+
344
+ Examples
345
+ --------
346
+ >>> import numpy as np
347
+ >>> from scipy import stats
348
+ >>> rng = np.random.default_rng()
349
+ >>> n = 3
350
+ >>> A = rng.random(size=(n, n))
351
+ >>> cov_array = A @ A.T # make matrix symmetric positive definite
352
+ >>> precision = np.linalg.inv(cov_array)
353
+ >>> cov_object = stats.Covariance.from_precision(precision)
354
+ >>> x = rng.multivariate_normal(np.zeros(n), cov_array, size=(10000))
355
+ >>> x_ = cov_object.whiten(x)
356
+ >>> np.cov(x_, rowvar=False) # near-identity covariance
357
+ array([[0.97862122, 0.00893147, 0.02430451],
358
+ [0.00893147, 0.96719062, 0.02201312],
359
+ [0.02430451, 0.02201312, 0.99206881]])
360
+
361
+ """
362
+ return self._whiten(np.asarray(x))
363
+
364
+ def colorize(self, x):
365
+ """
366
+ Perform a colorizing transformation on data.
367
+
368
+ "Colorizing" ("color" as in "colored noise", in which different
369
+ frequencies may have different magnitudes) transforms a set of
370
+ uncorrelated random variables into a new set of random variables with
371
+ the desired covariance. When a coloring transform is applied to a
372
+ sample of points distributed according to a multivariate normal
373
+ distribution with identity covariance and zero mean, the covariance of
374
+ the transformed sample is approximately the covariance matrix used
375
+ in the coloring transform.
376
+
377
+ Parameters
378
+ ----------
379
+ x : array_like
380
+ An array of points. The last dimension must correspond with the
381
+ dimensionality of the space, i.e., the number of columns in the
382
+ covariance matrix.
383
+
384
+ Returns
385
+ -------
386
+ x_ : array_like
387
+ The transformed array of points.
388
+
389
+ References
390
+ ----------
391
+ .. [1] "Whitening Transformation". Wikipedia.
392
+ https://en.wikipedia.org/wiki/Whitening_transformation
393
+ .. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
394
+ coloring linear transformation". Transactions of VSB 18.2
395
+ (2018): 31-35. :doi:`10.31490/tces-2018-0013`
396
+
397
+ Examples
398
+ --------
399
+ >>> import numpy as np
400
+ >>> from scipy import stats
401
+ >>> rng = np.random.default_rng(1638083107694713882823079058616272161)
402
+ >>> n = 3
403
+ >>> A = rng.random(size=(n, n))
404
+ >>> cov_array = A @ A.T # make matrix symmetric positive definite
405
+ >>> cholesky = np.linalg.cholesky(cov_array)
406
+ >>> cov_object = stats.Covariance.from_cholesky(cholesky)
407
+ >>> x = rng.multivariate_normal(np.zeros(n), np.eye(n), size=(10000))
408
+ >>> x_ = cov_object.colorize(x)
409
+ >>> cov_data = np.cov(x_, rowvar=False)
410
+ >>> np.allclose(cov_data, cov_array, rtol=3e-2)
411
+ True
412
+ """
413
+ return self._colorize(np.asarray(x))
414
+
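Taken together, the two methods documented above are inverses for a full-rank covariance: colorizing a whitened point recovers the original point. A quick sketch, using an arbitrary symmetric positive definite matrix:

    import numpy as np
    from scipy import stats

    rng = np.random.default_rng(0)
    A = rng.random((4, 4))
    A = A @ A.T + 4 * np.eye(4)      # symmetric positive definite
    cov = stats.Covariance.from_cholesky(np.linalg.cholesky(A))

    x = rng.random(4)
    print(np.allclose(cov.colorize(cov.whiten(x)), x))   # True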
415
+ @property
416
+ def log_pdet(self):
417
+ """
418
+ Log of the pseudo-determinant of the covariance matrix
419
+ """
420
+ return np.array(self._log_pdet, dtype=float)[()]
421
+
422
+ @property
423
+ def rank(self):
424
+ """
425
+ Rank of the covariance matrix
426
+ """
427
+ return np.array(self._rank, dtype=int)[()]
428
+
429
+ @property
430
+ def covariance(self):
431
+ """
432
+ Explicit representation of the covariance matrix
433
+ """
434
+ return self._covariance
435
+
436
+ @property
437
+ def shape(self):
438
+ """
439
+ Shape of the covariance array
440
+ """
441
+ return self._shape
442
+
443
+ def _validate_matrix(self, A, name):
444
+ A = np.atleast_2d(A)
445
+ m, n = A.shape[-2:]
446
+ if m != n or A.ndim != 2 or not (np.issubdtype(A.dtype, np.integer) or
447
+ np.issubdtype(A.dtype, np.floating)):
448
+ message = (f"The input `{name}` must be a square, "
449
+ "two-dimensional array of real numbers.")
450
+ raise ValueError(message)
451
+ return A
452
+
453
+ def _validate_vector(self, A, name):
454
+ A = np.atleast_1d(A)
455
+ if A.ndim != 1 or not (np.issubdtype(A.dtype, np.integer) or
456
+ np.issubdtype(A.dtype, np.floating)):
457
+ message = (f"The input `{name}` must be a one-dimensional array "
458
+ "of real numbers.")
459
+ raise ValueError(message)
460
+ return A
461
+
462
+
463
+ class CovViaPrecision(Covariance):
464
+
465
+ def __init__(self, precision, covariance=None):
466
+ precision = self._validate_matrix(precision, 'precision')
467
+ if covariance is not None:
468
+ covariance = self._validate_matrix(covariance, 'covariance')
469
+ message = "`precision.shape` must equal `covariance.shape`."
470
+ if precision.shape != covariance.shape:
471
+ raise ValueError(message)
472
+
473
+ self._chol_P = np.linalg.cholesky(precision)
474
+ self._log_pdet = -2*np.log(np.diag(self._chol_P)).sum(axis=-1)
475
+ self._rank = precision.shape[-1] # must be full rank if invertible
476
+ self._precision = precision
477
+ self._cov_matrix = covariance
478
+ self._shape = precision.shape
479
+ self._allow_singular = False
480
+
481
+ def _whiten(self, x):
482
+ return x @ self._chol_P
483
+
484
+ @cached_property
485
+ def _covariance(self):
486
+ n = self._shape[-1]
487
+ return (linalg.cho_solve((self._chol_P, True), np.eye(n))
488
+ if self._cov_matrix is None else self._cov_matrix)
489
+
490
+ def _colorize(self, x):
491
+ return linalg.solve_triangular(self._chol_P.T, x.T, lower=False).T
492
+
493
+
494
+ def _dot_diag(x, d):
495
+ # If d were a full diagonal matrix, x @ d would always do what we want.
496
+ # Special treatment is needed for n-dimensional `d` in which each row
497
+ # includes only the diagonal elements of a covariance matrix.
498
+ return x * d if x.ndim < 2 else x * np.expand_dims(d, -2)
499
+
500
+
501
+ class CovViaDiagonal(Covariance):
502
+
503
+ def __init__(self, diagonal):
504
+ diagonal = self._validate_vector(diagonal, 'diagonal')
505
+
506
+ i_zero = diagonal <= 0
507
+ positive_diagonal = np.array(diagonal, dtype=np.float64)
508
+
509
+ positive_diagonal[i_zero] = 1 # ones don't affect determinant
510
+ self._log_pdet = np.sum(np.log(positive_diagonal), axis=-1)
511
+
512
+ psuedo_reciprocals = 1 / np.sqrt(positive_diagonal)
513
+ psuedo_reciprocals[i_zero] = 0
514
+
515
+ self._sqrt_diagonal = np.sqrt(diagonal)
516
+ self._LP = psuedo_reciprocals
517
+ self._rank = positive_diagonal.shape[-1] - i_zero.sum(axis=-1)
518
+ self._covariance = np.apply_along_axis(np.diag, -1, diagonal)
519
+ self._i_zero = i_zero
520
+ self._shape = self._covariance.shape
521
+ self._allow_singular = True
522
+
523
+ def _whiten(self, x):
524
+ return _dot_diag(x, self._LP)
525
+
526
+ def _colorize(self, x):
527
+ return _dot_diag(x, self._sqrt_diagonal)
528
+
529
+ def _support_mask(self, x):
530
+ """
531
+ Check whether x lies in the support of the distribution.
532
+ """
533
+ return ~np.any(_dot_diag(x, self._i_zero), axis=-1)
534
+
535
+
536
+ class CovViaCholesky(Covariance):
537
+
538
+ def __init__(self, cholesky):
539
+ L = self._validate_matrix(cholesky, 'cholesky')
540
+
541
+ self._factor = L
542
+ self._log_pdet = 2*np.log(np.diag(self._factor)).sum(axis=-1)
543
+ self._rank = L.shape[-1] # must be full rank for cholesky
544
+ self._shape = L.shape
545
+ self._allow_singular = False
546
+
547
+ @cached_property
548
+ def _covariance(self):
549
+ return self._factor @ self._factor.T
550
+
551
+ def _whiten(self, x):
552
+ res = linalg.solve_triangular(self._factor, x.T, lower=True).T
553
+ return res
554
+
555
+ def _colorize(self, x):
556
+ return x @ self._factor.T
557
+
558
+
559
+ class CovViaEigendecomposition(Covariance):
560
+
561
+ def __init__(self, eigendecomposition):
562
+ eigenvalues, eigenvectors = eigendecomposition
563
+ eigenvalues = self._validate_vector(eigenvalues, 'eigenvalues')
564
+ eigenvectors = self._validate_matrix(eigenvectors, 'eigenvectors')
565
+ message = ("The shapes of `eigenvalues` and `eigenvectors` "
566
+ "must be compatible.")
567
+ try:
568
+ eigenvalues = np.expand_dims(eigenvalues, -2)
569
+ eigenvectors, eigenvalues = np.broadcast_arrays(eigenvectors,
570
+ eigenvalues)
571
+ eigenvalues = eigenvalues[..., 0, :]
572
+ except ValueError:
573
+ raise ValueError(message)
574
+
575
+ i_zero = eigenvalues <= 0
576
+ positive_eigenvalues = np.array(eigenvalues, dtype=np.float64)
577
+
578
+ positive_eigenvalues[i_zero] = 1 # ones don't affect determinant
579
+ self._log_pdet = np.sum(np.log(positive_eigenvalues), axis=-1)
580
+
581
+ psuedo_reciprocals = 1 / np.sqrt(positive_eigenvalues)
582
+ psuedo_reciprocals[i_zero] = 0
583
+
584
+ self._LP = eigenvectors * psuedo_reciprocals
585
+ self._LA = eigenvectors * np.sqrt(eigenvalues)
586
+ self._rank = positive_eigenvalues.shape[-1] - i_zero.sum(axis=-1)
587
+ self._w = eigenvalues
588
+ self._v = eigenvectors
589
+ self._shape = eigenvectors.shape
590
+ self._null_basis = eigenvectors * i_zero
591
+ # This is only used for `_support_mask`, not to decide whether
592
+ # the covariance is singular or not.
593
+ self._eps = _multivariate._eigvalsh_to_eps(eigenvalues) * 10**3
594
+ self._allow_singular = True
595
+
596
+ def _whiten(self, x):
597
+ return x @ self._LP
598
+
599
+ def _colorize(self, x):
600
+ return x @ self._LA.T
601
+
602
+ @cached_property
603
+ def _covariance(self):
604
+ return (self._v * self._w) @ self._v.T
605
+
606
+ def _support_mask(self, x):
607
+ """
608
+ Check whether x lies in the support of the distribution.
609
+ """
610
+ residual = np.linalg.norm(x @ self._null_basis, axis=-1)
611
+ in_support = residual < self._eps
612
+ return in_support
613
+
614
+
615
+ class CovViaPSD(Covariance):
616
+ """
617
+ Representation of a covariance provided via an instance of _PSD
618
+ """
619
+
620
+ def __init__(self, psd):
621
+ self._LP = psd.U
622
+ self._log_pdet = psd.log_pdet
623
+ self._rank = psd.rank
624
+ self._covariance = psd._M
625
+ self._shape = psd._M.shape
626
+ self._psd = psd
627
+ self._allow_singular = False # by default
628
+
629
+ def _whiten(self, x):
630
+ return x @ self._LP
631
+
632
+ def _support_mask(self, x):
633
+ return self._psd._support_mask(x)
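A closing sketch for this module: the factory methods are alternative views of the same covariance, so quantities exposed through the common interface agree across them. Shown here for a diagonal matrix, where both representations are easy to build by hand (this assumes a SciPy version in which `Covariance` objects are accepted by `multivariate_normal`).

    import numpy as np
    from scipy import stats

    d = np.array([1.0, 2.0, 3.0])
    A = np.diag(d)

    cov_diag = stats.Covariance.from_diagonal(d)
    cov_chol = stats.Covariance.from_cholesky(np.linalg.cholesky(A))

    x = np.array([0.5, -1.0, 2.0])
    print(np.allclose(cov_diag.whiten(x), cov_chol.whiten(x)))  # True
    print(np.allclose(cov_diag.log_pdet, cov_chol.log_pdet))    # True

    # Either object can serve as the covariance of a multivariate normal.
    dist = stats.multivariate_normal(mean=np.zeros(3), cov=cov_diag)
    print(dist.pdf(x))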
.venv/Lib/site-packages/scipy/stats/_crosstab.py ADDED
@@ -0,0 +1,204 @@
1
+ import numpy as np
2
+ from scipy.sparse import coo_matrix
3
+ from scipy._lib._bunch import _make_tuple_bunch
4
+
5
+
6
+ CrosstabResult = _make_tuple_bunch(
7
+ "CrosstabResult", ["elements", "count"]
8
+ )
9
+
10
+
11
+ def crosstab(*args, levels=None, sparse=False):
12
+ """
13
+ Return table of counts for each possible unique combination in ``*args``.
14
+
15
+ When ``len(args) > 1``, the array computed by this function is
16
+ often referred to as a *contingency table* [1]_.
17
+
18
+ The arguments must be sequences with the same length. The second return
19
+ value, `count`, is an integer array with ``len(args)`` dimensions. If
20
+ `levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
21
+ is the number of unique elements in ``args[k]``.
22
+
23
+ Parameters
24
+ ----------
25
+ *args : sequences
26
+ A sequence of sequences whose unique aligned elements are to be
27
+ counted. The sequences in args must all be the same length.
28
+ levels : sequence, optional
29
+ If `levels` is given, it must be a sequence that is the same length as
30
+ `args`. Each element in `levels` is either a sequence or None. If it
31
+ is a sequence, it gives the values in the corresponding sequence in
32
+ `args` that are to be counted. If any value in the sequences in `args`
33
+ does not occur in the corresponding sequence in `levels`, that value
34
+ is ignored and not counted in the returned array `count`. The default
35
+ value of `levels` for ``args[i]`` is ``np.unique(args[i])``
36
+ sparse : bool, optional
37
+ If True, return a sparse matrix. The matrix will be an instance of
38
+ the `scipy.sparse.coo_matrix` class. Because SciPy's sparse matrices
39
+ must be 2-d, only two input sequences are allowed when `sparse` is
40
+ True. Default is False.
41
+
42
+ Returns
43
+ -------
44
+ res : CrosstabResult
45
+ An object containing the following attributes:
46
+
47
+ elements : tuple of numpy.ndarrays.
48
+ Tuple of length ``len(args)`` containing the arrays of elements
49
+ that are counted in `count`. These can be interpreted as the
50
+ labels of the corresponding dimensions of `count`. If `levels` was
51
+ given, then if ``levels[i]`` is not None, ``elements[i]`` will
52
+ hold the values given in ``levels[i]``.
53
+ count : numpy.ndarray or scipy.sparse.coo_matrix
54
+ Counts of the unique elements in ``zip(*args)``, stored in an
55
+ array. Also known as a *contingency table* when ``len(args) > 1``.
56
+
57
+ See Also
58
+ --------
59
+ numpy.unique
60
+
61
+ Notes
62
+ -----
63
+ .. versionadded:: 1.7.0
64
+
65
+ References
66
+ ----------
67
+ .. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
68
+
69
+ Examples
70
+ --------
71
+ >>> from scipy.stats.contingency import crosstab
72
+
73
+ Given the lists `a` and `x`, create a contingency table that counts the
74
+ frequencies of the corresponding pairs.
75
+
76
+ >>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
77
+ >>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
78
+ >>> res = crosstab(a, x)
79
+ >>> avals, xvals = res.elements
80
+ >>> avals
81
+ array(['A', 'B'], dtype='<U1')
82
+ >>> xvals
83
+ array(['X', 'Y', 'Z'], dtype='<U1')
84
+ >>> res.count
85
+ array([[2, 3, 0],
86
+ [1, 0, 4]])
87
+
88
+ So `('A', 'X')` occurs twice, `('A', 'Y')` occurs three times, etc.
89
+
90
+ Higher dimensional contingency tables can be created.
91
+
92
+ >>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
93
+ >>> res = crosstab(a, x, p)
94
+ >>> res.count
95
+ array([[[2, 0],
96
+ [2, 1],
97
+ [0, 0]],
98
+ [[1, 0],
99
+ [0, 0],
100
+ [1, 3]]])
101
+ >>> res.count.shape
102
+ (2, 3, 2)
103
+
104
+ The values to be counted can be set by using the `levels` argument.
105
+ It allows the elements of interest in each input sequence to be
106
+ given explicitly instead of finding the unique elements of the sequence.
107
+
108
+ For example, suppose one of the arguments is an array containing the
109
+ answers to a survey question, with integer values 1 to 4. Even if the
110
+ value 1 does not occur in the data, we want an entry for it in the table.
111
+
112
+ >>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4] # 1 does not occur.
113
+ >>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4] # 3 does not occur.
114
+ >>> options = [1, 2, 3, 4]
115
+ >>> res = crosstab(q1, q2, levels=(options, options))
116
+ >>> res.count
117
+ array([[0, 0, 0, 0],
118
+ [1, 1, 0, 1],
119
+ [1, 4, 0, 1],
120
+ [0, 3, 0, 3]])
121
+
122
+ If `levels` is given, but an element of `levels` is None, the unique values
123
+ of the corresponding argument are used. For example,
124
+
125
+ >>> res = crosstab(q1, q2, levels=(None, options))
126
+ >>> res.elements
127
+ [array([2, 3, 4]), [1, 2, 3, 4]]
128
+ >>> res.count
129
+ array([[1, 1, 0, 1],
130
+ [1, 4, 0, 1],
131
+ [0, 3, 0, 3]])
132
+
133
+ If we want to ignore the pairs where 4 occurs in ``q2``, we can
134
+ give just the values [1, 2] to `levels`, and the 4 will be ignored:
135
+
136
+ >>> res = crosstab(q1, q2, levels=(None, [1, 2]))
137
+ >>> res.elements
138
+ [array([2, 3, 4]), [1, 2]]
139
+ >>> res.count
140
+ array([[1, 1],
141
+ [1, 4],
142
+ [0, 3]])
143
+
144
+ Finally, let's repeat the first example, but return a sparse matrix:
145
+
146
+ >>> res = crosstab(a, x, sparse=True)
147
+ >>> res.count
148
+ <2x3 sparse matrix of type '<class 'numpy.int64'>'
149
+ with 4 stored elements in COOrdinate format>
150
+ >>> res.count.A
151
+ array([[2, 3, 0],
152
+ [1, 0, 4]])
153
+
154
+ """
155
+ nargs = len(args)
156
+ if nargs == 0:
157
+ raise TypeError("At least one input sequence is required.")
158
+
159
+ len0 = len(args[0])
160
+ if not all(len(a) == len0 for a in args[1:]):
161
+ raise ValueError("All input sequences must have the same length.")
162
+
163
+ if sparse and nargs != 2:
164
+ raise ValueError("When `sparse` is True, only two input sequences "
165
+ "are allowed.")
166
+
167
+ if levels is None:
168
+ # Call np.unique with return_inverse=True on each argument.
169
+ actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
170
+ for a in args])
171
+ else:
172
+ # `levels` is not None...
173
+ if len(levels) != nargs:
174
+ raise ValueError('len(levels) must equal the number of input '
175
+ 'sequences')
176
+
177
+ args = [np.asarray(arg) for arg in args]
178
+ mask = np.zeros((nargs, len0), dtype=np.bool_)
179
+ inv = np.zeros((nargs, len0), dtype=np.intp)
180
+ actual_levels = []
181
+ for k, (levels_list, arg) in enumerate(zip(levels, args)):
182
+ if levels_list is None:
183
+ levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
184
+ mask[k, :] = True
185
+ else:
186
+ q = arg == np.asarray(levels_list).reshape(-1, 1)
187
+ mask[k, :] = np.any(q, axis=0)
188
+ qnz = q.T.nonzero()
189
+ inv[k, qnz[0]] = qnz[1]
190
+ actual_levels.append(levels_list)
191
+
192
+ mask_all = mask.all(axis=0)
193
+ indices = tuple(inv[:, mask_all])
194
+
195
+ if sparse:
196
+ count = coo_matrix((np.ones(len(indices[0]), dtype=int),
197
+ (indices[0], indices[1])))
198
+ count.sum_duplicates()
199
+ else:
200
+ shape = [len(u) for u in actual_levels]
201
+ count = np.zeros(shape, dtype=int)
202
+ np.add.at(count, indices, 1)
203
+
204
+ return CrosstabResult(actual_levels, count)
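A typical follow-up to the function above is a chi-squared test of independence on the resulting table. A brief sketch reusing the docstring's example data; interpreting the p-value is left to the caller.

    from scipy.stats import chi2_contingency
    from scipy.stats.contingency import crosstab

    a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
    x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']

    res = crosstab(a, x)
    chi2, p, dof, expected = chi2_contingency(res.count)
    print(chi2, p, dof)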
.venv/Lib/site-packages/scipy/stats/_discrete_distns.py ADDED
@@ -0,0 +1,1954 @@
1
+ #
2
+ # Author: Travis Oliphant 2002-2011 with contributions from
3
+ # SciPy Developers 2004-2011
4
+ #
5
+ from functools import partial
6
+
7
+ from scipy import special
8
+ from scipy.special import entr, logsumexp, betaln, gammaln as gamln, zeta
9
+ from scipy._lib._util import _lazywhere, rng_integers
10
+ from scipy.interpolate import interp1d
11
+
12
+ from numpy import floor, ceil, log, exp, sqrt, log1p, expm1, tanh, cosh, sinh
13
+
14
+ import numpy as np
15
+
16
+ from ._distn_infrastructure import (rv_discrete, get_distribution_names,
17
+ _check_shape, _ShapeInfo)
18
+ import scipy.stats._boost as _boost
19
+ from ._biasedurn import (_PyFishersNCHypergeometric,
20
+ _PyWalleniusNCHypergeometric,
21
+ _PyStochasticLib3)
22
+
23
+
24
+ def _isintegral(x):
25
+ return x == np.round(x)
26
+
27
+
28
+ class binom_gen(rv_discrete):
29
+ r"""A binomial discrete random variable.
30
+
31
+ %(before_notes)s
32
+
33
+ Notes
34
+ -----
35
+ The probability mass function for `binom` is:
36
+
37
+ .. math::
38
+
39
+ f(k) = \binom{n}{k} p^k (1-p)^{n-k}
40
+
41
+ for :math:`k \in \{0, 1, \dots, n\}`, :math:`0 \leq p \leq 1`
42
+
43
+ `binom` takes :math:`n` and :math:`p` as shape parameters,
44
+ where :math:`p` is the probability of a single success
45
+ and :math:`1-p` is the probability of a single failure.
46
+
47
+ %(after_notes)s
48
+
49
+ %(example)s
50
+
51
+ See Also
52
+ --------
53
+ hypergeom, nbinom, nhypergeom
54
+
55
+ """
56
+ def _shape_info(self):
57
+ return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
58
+ _ShapeInfo("p", False, (0, 1), (True, True))]
59
+
60
+ def _rvs(self, n, p, size=None, random_state=None):
61
+ return random_state.binomial(n, p, size)
62
+
63
+ def _argcheck(self, n, p):
64
+ return (n >= 0) & _isintegral(n) & (p >= 0) & (p <= 1)
65
+
66
+ def _get_support(self, n, p):
67
+ return self.a, n
68
+
69
+ def _logpmf(self, x, n, p):
70
+ k = floor(x)
71
+ combiln = (gamln(n+1) - (gamln(k+1) + gamln(n-k+1)))
72
+ return combiln + special.xlogy(k, p) + special.xlog1py(n-k, -p)
73
+
74
+ def _pmf(self, x, n, p):
75
+ # binom.pmf(k) = choose(n, k) * p**k * (1-p)**(n-k)
76
+ return _boost._binom_pdf(x, n, p)
77
+
78
+ def _cdf(self, x, n, p):
79
+ k = floor(x)
80
+ return _boost._binom_cdf(k, n, p)
81
+
82
+ def _sf(self, x, n, p):
83
+ k = floor(x)
84
+ return _boost._binom_sf(k, n, p)
85
+
86
+ def _isf(self, x, n, p):
87
+ return _boost._binom_isf(x, n, p)
88
+
89
+ def _ppf(self, q, n, p):
90
+ return _boost._binom_ppf(q, n, p)
91
+
92
+ def _stats(self, n, p, moments='mv'):
93
+ mu = _boost._binom_mean(n, p)
94
+ var = _boost._binom_variance(n, p)
95
+ g1, g2 = None, None
96
+ if 's' in moments:
97
+ g1 = _boost._binom_skewness(n, p)
98
+ if 'k' in moments:
99
+ g2 = _boost._binom_kurtosis_excess(n, p)
100
+ return mu, var, g1, g2
101
+
102
+ def _entropy(self, n, p):
103
+ k = np.r_[0:n + 1]
104
+ vals = self._pmf(k, n, p)
105
+ return np.sum(entr(vals), axis=0)
106
+
107
+
108
+ binom = binom_gen(name='binom')
109
+
110
+
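A quick numerical check of the pmf formula stated in the notes above; the particular values of ``n``, ``p``, and ``k`` are arbitrary.

    import numpy as np
    from scipy.stats import binom
    from scipy.special import comb

    n, p, k = 10, 0.3, 4
    manual = comb(n, k) * p**k * (1 - p)**(n - k)   # the formula from the notes
    print(np.isclose(manual, binom.pmf(k, n, p)))   # True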
111
+ class bernoulli_gen(binom_gen):
112
+ r"""A Bernoulli discrete random variable.
113
+
114
+ %(before_notes)s
115
+
116
+ Notes
117
+ -----
118
+ The probability mass function for `bernoulli` is:
119
+
120
+ .. math::
121
+
122
+ f(k) = \begin{cases}1-p &\text{if } k = 0\\
123
+ p &\text{if } k = 1\end{cases}
124
+
125
+ for :math:`k` in :math:`\{0, 1\}`, :math:`0 \leq p \leq 1`
126
+
127
+ `bernoulli` takes :math:`p` as shape parameter,
128
+ where :math:`p` is the probability of a single success
129
+ and :math:`1-p` is the probability of a single failure.
130
+
131
+ %(after_notes)s
132
+
133
+ %(example)s
134
+
135
+ """
136
+ def _shape_info(self):
137
+ return [_ShapeInfo("p", False, (0, 1), (True, True))]
138
+
139
+ def _rvs(self, p, size=None, random_state=None):
140
+ return binom_gen._rvs(self, 1, p, size=size, random_state=random_state)
141
+
142
+ def _argcheck(self, p):
143
+ return (p >= 0) & (p <= 1)
144
+
145
+ def _get_support(self, p):
146
+ # Overrides binom_gen._get_support.
147
+ return self.a, self.b
148
+
149
+ def _logpmf(self, x, p):
150
+ return binom._logpmf(x, 1, p)
151
+
152
+ def _pmf(self, x, p):
153
+ # bernoulli.pmf(k) = 1-p if k = 0
154
+ # = p if k = 1
155
+ return binom._pmf(x, 1, p)
156
+
157
+ def _cdf(self, x, p):
158
+ return binom._cdf(x, 1, p)
159
+
160
+ def _sf(self, x, p):
161
+ return binom._sf(x, 1, p)
162
+
163
+ def _isf(self, x, p):
164
+ return binom._isf(x, 1, p)
165
+
166
+ def _ppf(self, q, p):
167
+ return binom._ppf(q, 1, p)
168
+
169
+ def _stats(self, p):
170
+ return binom._stats(1, p)
171
+
172
+ def _entropy(self, p):
173
+ return entr(p) + entr(1-p)
174
+
175
+
176
+ bernoulli = bernoulli_gen(b=1, name='bernoulli')
177
+
178
+
179
+ class betabinom_gen(rv_discrete):
180
+ r"""A beta-binomial discrete random variable.
181
+
182
+ %(before_notes)s
183
+
184
+ Notes
185
+ -----
186
+ The beta-binomial distribution is a binomial distribution with a
187
+ probability of success `p` that follows a beta distribution.
188
+
189
+ The probability mass function for `betabinom` is:
190
+
191
+ .. math::
192
+
193
+ f(k) = \binom{n}{k} \frac{B(k + a, n - k + b)}{B(a, b)}
194
+
195
+ for :math:`k \in \{0, 1, \dots, n\}`, :math:`n \geq 0`, :math:`a > 0`,
196
+ :math:`b > 0`, where :math:`B(a, b)` is the beta function.
197
+
198
+ `betabinom` takes :math:`n`, :math:`a`, and :math:`b` as shape parameters.
199
+
200
+ References
201
+ ----------
202
+ .. [1] https://en.wikipedia.org/wiki/Beta-binomial_distribution
203
+
204
+ %(after_notes)s
205
+
206
+ .. versionadded:: 1.4.0
207
+
208
+ See Also
209
+ --------
210
+ beta, binom
211
+
212
+ %(example)s
213
+
214
+ """
215
+ def _shape_info(self):
216
+ return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
217
+ _ShapeInfo("a", False, (0, np.inf), (False, False)),
218
+ _ShapeInfo("b", False, (0, np.inf), (False, False))]
219
+
220
+ def _rvs(self, n, a, b, size=None, random_state=None):
221
+ p = random_state.beta(a, b, size)
222
+ return random_state.binomial(n, p, size)
223
+
224
+ def _get_support(self, n, a, b):
225
+ return 0, n
226
+
227
+ def _argcheck(self, n, a, b):
228
+ return (n >= 0) & _isintegral(n) & (a > 0) & (b > 0)
229
+
230
+ def _logpmf(self, x, n, a, b):
231
+ k = floor(x)
232
+ combiln = -log(n + 1) - betaln(n - k + 1, k + 1)
233
+ return combiln + betaln(k + a, n - k + b) - betaln(a, b)
234
+
235
+ def _pmf(self, x, n, a, b):
236
+ return exp(self._logpmf(x, n, a, b))
237
+
238
+ def _stats(self, n, a, b, moments='mv'):
239
+ e_p = a / (a + b)
240
+ e_q = 1 - e_p
241
+ mu = n * e_p
242
+ var = n * (a + b + n) * e_p * e_q / (a + b + 1)
243
+ g1, g2 = None, None
244
+ if 's' in moments:
245
+ g1 = 1.0 / sqrt(var)
246
+ g1 *= (a + b + 2 * n) * (b - a)
247
+ g1 /= (a + b + 2) * (a + b)
248
+ if 'k' in moments:
249
+ g2 = (a + b).astype(e_p.dtype)
250
+ g2 *= (a + b - 1 + 6 * n)
251
+ g2 += 3 * a * b * (n - 2)
252
+ g2 += 6 * n ** 2
253
+ g2 -= 3 * e_p * b * n * (6 - n)
254
+ g2 -= 18 * e_p * e_q * n ** 2
255
+ g2 *= (a + b) ** 2 * (1 + a + b)
256
+ g2 /= (n * a * b * (a + b + 2) * (a + b + 3) * (a + b + n))
257
+ g2 -= 3
258
+ return mu, var, g1, g2
259
+
260
+
261
+ betabinom = betabinom_gen(name='betabinom')
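+ # Editorial sketch (not part of the original module): the first two moments
+ # computed in ``_stats`` above correspond to mean = n*a/(a+b) and
+ # var = n*a*b*(a+b+n) / ((a+b)**2 * (a+b+1)); the parameter values below are
+ # arbitrary illustrative choices.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import betabinom
+ # >>> n, a, b = 12, 2.3, 0.63
+ # >>> mean, var = betabinom.stats(n, a, b, moments='mv')
+ # >>> np.allclose([mean, var],
+ # ...             [n*a/(a+b), n*a*b*(a+b+n) / ((a+b)**2 * (a+b+1))])
+ # True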
262
+
263
+
264
+ class nbinom_gen(rv_discrete):
265
+ r"""A negative binomial discrete random variable.
266
+
267
+ %(before_notes)s
268
+
269
+ Notes
270
+ -----
271
+ The negative binomial distribution describes a sequence of i.i.d. Bernoulli
272
+ trials, repeated until a predefined, non-random number of successes occurs.
273
+
274
+ The probability mass function of the number of failures for `nbinom` is:
275
+
276
+ .. math::
277
+
278
+ f(k) = \binom{k+n-1}{n-1} p^n (1-p)^k
279
+
280
+ for :math:`k \ge 0`, :math:`0 < p \leq 1`
281
+
282
+ `nbinom` takes :math:`n` and :math:`p` as shape parameters where :math:`n`
283
+ is the number of successes, :math:`p` is the probability of a single
284
+ success, and :math:`1-p` is the probability of a single failure.
285
+
286
+ Another common parameterization of the negative binomial distribution is
287
+ in terms of the mean number of failures :math:`\mu` to achieve :math:`n`
288
+ successes. The mean :math:`\mu` is related to the probability of success
289
+ as
290
+
291
+ .. math::
292
+
293
+ p = \frac{n}{n + \mu}
294
+
295
+ The number of successes :math:`n` may also be specified in terms of a
296
+ "dispersion", "heterogeneity", or "aggregation" parameter :math:`\alpha`,
297
+ which relates the mean :math:`\mu` to the variance :math:`\sigma^2`,
298
+ e.g. :math:`\sigma^2 = \mu + \alpha \mu^2`. Regardless of the convention
299
+ used for :math:`\alpha`,
300
+
301
+ .. math::
302
+
303
+ p &= \frac{\mu}{\sigma^2} \\
304
+ n &= \frac{\mu^2}{\sigma^2 - \mu}
305
+
306
+ %(after_notes)s
307
+
308
+ %(example)s
309
+
310
+ See Also
311
+ --------
312
+ hypergeom, binom, nhypergeom
313
+
314
+ """
315
+ def _shape_info(self):
316
+ return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
317
+ _ShapeInfo("p", False, (0, 1), (True, True))]
318
+
319
+ def _rvs(self, n, p, size=None, random_state=None):
320
+ return random_state.negative_binomial(n, p, size)
321
+
322
+ def _argcheck(self, n, p):
323
+ return (n > 0) & (p > 0) & (p <= 1)
324
+
325
+ def _pmf(self, x, n, p):
326
+ # nbinom.pmf(k) = choose(k+n-1, n-1) * p**n * (1-p)**k
327
+ return _boost._nbinom_pdf(x, n, p)
328
+
329
+ def _logpmf(self, x, n, p):
330
+ coeff = gamln(n+x) - gamln(x+1) - gamln(n)
331
+ return coeff + n*log(p) + special.xlog1py(x, -p)
332
+
333
+ def _cdf(self, x, n, p):
334
+ k = floor(x)
335
+ return _boost._nbinom_cdf(k, n, p)
336
+
337
+ def _logcdf(self, x, n, p):
338
+ k = floor(x)
339
+ k, n, p = np.broadcast_arrays(k, n, p)
340
+ cdf = self._cdf(k, n, p)
341
+ cond = cdf > 0.5
342
+ def f1(k, n, p):
343
+ return np.log1p(-special.betainc(k + 1, n, 1 - p))
344
+
345
+ # do calc in place
346
+ logcdf = cdf
347
+ with np.errstate(divide='ignore'):
348
+ logcdf[cond] = f1(k[cond], n[cond], p[cond])
349
+ logcdf[~cond] = np.log(cdf[~cond])
350
+ return logcdf
351
+
352
+ def _sf(self, x, n, p):
353
+ k = floor(x)
354
+ return _boost._nbinom_sf(k, n, p)
355
+
356
+ def _isf(self, x, n, p):
357
+ with np.errstate(over='ignore'): # see gh-17432
358
+ return _boost._nbinom_isf(x, n, p)
359
+
360
+ def _ppf(self, q, n, p):
361
+ with np.errstate(over='ignore'): # see gh-17432
362
+ return _boost._nbinom_ppf(q, n, p)
363
+
364
+ def _stats(self, n, p):
365
+ return (
366
+ _boost._nbinom_mean(n, p),
367
+ _boost._nbinom_variance(n, p),
368
+ _boost._nbinom_skewness(n, p),
369
+ _boost._nbinom_kurtosis_excess(n, p),
370
+ )
371
+
372
+
373
+ nbinom = nbinom_gen(name='nbinom')
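+ # Editorial sketch (not part of the original module): a worked example of the
+ # mean/dispersion parameterization described in the docstring, i.e.
+ # sigma**2 = mu + alpha*mu**2, n = mu**2/(sigma**2 - mu), p = mu/sigma**2.
+ # The numbers are arbitrary illustrative choices.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import nbinom
+ # >>> mu, alpha = 4.0, 0.5
+ # >>> var = mu + alpha * mu**2
+ # >>> n, p = mu**2 / (var - mu), mu / var
+ # >>> np.allclose(nbinom.stats(n, p, moments='mv'), (mu, var))
+ # True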
374
+
375
+
376
+ class betanbinom_gen(rv_discrete):
377
+ r"""A beta-negative-binomial discrete random variable.
378
+
379
+ %(before_notes)s
380
+
381
+ Notes
382
+ -----
383
+ The beta-negative-binomial distribution is a negative binomial
384
+ distribution with a probability of success `p` that follows a
385
+ beta distribution.
386
+
387
+ The probability mass function for `betanbinom` is:
388
+
389
+ .. math::
390
+
391
+ f(k) = \binom{n + k - 1}{k} \frac{B(a + n, b + k)}{B(a, b)}
392
+
393
+ for :math:`k \ge 0`, :math:`n \geq 0`, :math:`a > 0`,
394
+ :math:`b > 0`, where :math:`B(a, b)` is the beta function.
395
+
396
+ `betanbinom` takes :math:`n`, :math:`a`, and :math:`b` as shape parameters.
397
+
398
+ References
399
+ ----------
400
+ .. [1] https://en.wikipedia.org/wiki/Beta_negative_binomial_distribution
401
+
402
+ %(after_notes)s
403
+
404
+ .. versionadded:: 1.12.0
405
+
406
+ See Also
407
+ --------
408
+ betabinom : Beta binomial distribution
409
+
410
+ %(example)s
411
+
412
+ """
413
+ def _shape_info(self):
414
+ return [_ShapeInfo("n", True, (0, np.inf), (True, False)),
415
+ _ShapeInfo("a", False, (0, np.inf), (False, False)),
416
+ _ShapeInfo("b", False, (0, np.inf), (False, False))]
417
+
418
+ def _rvs(self, n, a, b, size=None, random_state=None):
419
+ p = random_state.beta(a, b, size)
420
+ return random_state.negative_binomial(n, p, size)
421
+
422
+ def _argcheck(self, n, a, b):
423
+ return (n >= 0) & _isintegral(n) & (a > 0) & (b > 0)
424
+
425
+ def _logpmf(self, x, n, a, b):
426
+ k = floor(x)
427
+ combiln = -np.log(n + k) - betaln(n, k + 1)
428
+ return combiln + betaln(a + n, b + k) - betaln(a, b)
429
+
430
+ def _pmf(self, x, n, a, b):
431
+ return exp(self._logpmf(x, n, a, b))
432
+
433
+ def _stats(self, n, a, b, moments='mv'):
434
+ # reference: Wolfram Alpha input
435
+ # BetaNegativeBinomialDistribution[a, b, n]
436
+ def mean(n, a, b):
437
+ return n * b / (a - 1.)
438
+ mu = _lazywhere(a > 1, (n, a, b), f=mean, fillvalue=np.inf)
439
+ def var(n, a, b):
440
+ return (n * b * (n + a - 1.) * (a + b - 1.)
441
+ / ((a - 2.) * (a - 1.)**2.))
442
+ var = _lazywhere(a > 2, (n, a, b), f=var, fillvalue=np.inf)
443
+ g1, g2 = None, None
444
+ def skew(n, a, b):
445
+ return ((2 * n + a - 1.) * (2 * b + a - 1.)
446
+ / (a - 3.) / sqrt(n * b * (n + a - 1.) * (b + a - 1.)
447
+ / (a - 2.)))
448
+ if 's' in moments:
449
+ g1 = _lazywhere(a > 3, (n, a, b), f=skew, fillvalue=np.inf)
450
+ def kurtosis(n, a, b):
451
+ term = (a - 2.)
452
+ term_2 = ((a - 1.)**2. * (a**2. + a * (6 * b - 1.)
453
+ + 6. * (b - 1.) * b)
454
+ + 3. * n**2. * ((a + 5.) * b**2. + (a + 5.)
455
+ * (a - 1.) * b + 2. * (a - 1.)**2)
456
+ + 3 * (a - 1.) * n
457
+ * ((a + 5.) * b**2. + (a + 5.) * (a - 1.) * b
458
+ + 2. * (a - 1.)**2.))
459
+ denominator = ((a - 4.) * (a - 3.) * b * n
460
+ * (a + b - 1.) * (a + n - 1.))
461
+ # Wolfram Alpha uses Pearson kurtosis, so we subtract 3 to get
462
+ # scipy's Fisher kurtosis
463
+ return term * term_2 / denominator - 3.
464
+ if 'k' in moments:
465
+ g2 = _lazywhere(a > 4, (n, a, b), f=kurtosis, fillvalue=np.inf)
466
+ return mu, var, g1, g2
467
+
468
+
469
+ betanbinom = betanbinom_gen(name='betanbinom')
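+ # Editorial sketch (not part of the original module): for a > 1 the mean reduces
+ # to n*b/(a - 1), matching the ``_lazywhere`` branch in ``_stats`` above (for
+ # a <= 1 the mean is infinite). Parameter values are arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import betanbinom
+ # >>> n, a, b = 5, 9.3, 1.0
+ # >>> np.allclose(betanbinom.mean(n, a, b), n * b / (a - 1))
+ # True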
470
+
471
+
472
+ class geom_gen(rv_discrete):
473
+ r"""A geometric discrete random variable.
474
+
475
+ %(before_notes)s
476
+
477
+ Notes
478
+ -----
479
+ The probability mass function for `geom` is:
480
+
481
+ .. math::
482
+
483
+ f(k) = (1-p)^{k-1} p
484
+
485
+ for :math:`k \ge 1`, :math:`0 < p \leq 1`
486
+
487
+ `geom` takes :math:`p` as shape parameter,
488
+ where :math:`p` is the probability of a single success
489
+ and :math:`1-p` is the probability of a single failure.
490
+
491
+ %(after_notes)s
492
+
493
+ See Also
494
+ --------
495
+ planck
496
+
497
+ %(example)s
498
+
499
+ """
500
+
501
+ def _shape_info(self):
502
+ return [_ShapeInfo("p", False, (0, 1), (True, True))]
503
+
504
+ def _rvs(self, p, size=None, random_state=None):
505
+ return random_state.geometric(p, size=size)
506
+
507
+ def _argcheck(self, p):
508
+ return (p <= 1) & (p > 0)
509
+
510
+ def _pmf(self, k, p):
511
+ return np.power(1-p, k-1) * p
512
+
513
+ def _logpmf(self, k, p):
514
+ return special.xlog1py(k - 1, -p) + log(p)
515
+
516
+ def _cdf(self, x, p):
517
+ k = floor(x)
518
+ return -expm1(log1p(-p)*k)
519
+
520
+ def _sf(self, x, p):
521
+ return np.exp(self._logsf(x, p))
522
+
523
+ def _logsf(self, x, p):
524
+ k = floor(x)
525
+ return k*log1p(-p)
526
+
527
+ def _ppf(self, q, p):
528
+ vals = ceil(log1p(-q) / log1p(-p))
529
+ temp = self._cdf(vals-1, p)
530
+ return np.where((temp >= q) & (vals > 0), vals-1, vals)
531
+
532
+ def _stats(self, p):
533
+ mu = 1.0/p
534
+ qr = 1.0-p
535
+ var = qr / p / p
536
+ g1 = (2.0-p) / sqrt(qr)
537
+ g2 = np.polyval([1, -6, 6], p)/(1.0-p)
538
+ return mu, var, g1, g2
539
+
540
+ def _entropy(self, p):
541
+ return -np.log(p) - np.log1p(-p) * (1.0-p) / p
542
+
543
+
544
+ geom = geom_gen(a=1, name='geom', longname="A geometric")
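+ # Editorial sketch (not part of the original module): ``_cdf`` above evaluates
+ # -expm1(k*log1p(-p)), which is the numerically careful form of the textbook
+ # cdf 1 - (1-p)**k. Parameter values are arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import geom
+ # >>> p, k = 0.35, np.arange(1, 8)
+ # >>> np.allclose(geom.cdf(k, p), 1 - (1 - p)**k)
+ # True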
545
+
546
+
547
+ class hypergeom_gen(rv_discrete):
548
+ r"""A hypergeometric discrete random variable.
549
+
550
+ The hypergeometric distribution models drawing objects from a bin.
551
+ `M` is the total number of objects, `n` is total number of Type I objects.
552
+ The random variate represents the number of Type I objects in `N` drawn
553
+ without replacement from the total population.
554
+
555
+ %(before_notes)s
556
+
557
+ Notes
558
+ -----
559
+ The symbols used to denote the shape parameters (`M`, `n`, and `N`) are not
560
+ universally accepted. See the Examples for a clarification of the
561
+ definitions used here.
562
+
563
+ The probability mass function is defined as,
564
+
565
+ .. math:: p(k, M, n, N) = \frac{\binom{n}{k} \binom{M - n}{N - k}}
566
+ {\binom{M}{N}}
567
+
568
+ for :math:`k \in [\max(0, N - M + n), \min(n, N)]`, where the binomial
569
+ coefficients are defined as,
570
+
571
+ .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
572
+
573
+ %(after_notes)s
574
+
575
+ Examples
576
+ --------
577
+ >>> import numpy as np
578
+ >>> from scipy.stats import hypergeom
579
+ >>> import matplotlib.pyplot as plt
580
+
581
+ Suppose we have a collection of 20 animals, of which 7 are dogs. Then if
582
+ we want to know the probability of finding a given number of dogs if we
583
+ choose at random 12 of the 20 animals, we can initialize a frozen
584
+ distribution and plot the probability mass function:
585
+
586
+ >>> [M, n, N] = [20, 7, 12]
587
+ >>> rv = hypergeom(M, n, N)
588
+ >>> x = np.arange(0, n+1)
589
+ >>> pmf_dogs = rv.pmf(x)
590
+
591
+ >>> fig = plt.figure()
592
+ >>> ax = fig.add_subplot(111)
593
+ >>> ax.plot(x, pmf_dogs, 'bo')
594
+ >>> ax.vlines(x, 0, pmf_dogs, lw=2)
595
+ >>> ax.set_xlabel('# of dogs in our group of chosen animals')
596
+ >>> ax.set_ylabel('hypergeom PMF')
597
+ >>> plt.show()
598
+
599
+ Instead of using a frozen distribution we can also use `hypergeom`
600
+ methods directly. For example, to obtain the cumulative distribution
601
+ function, use:
602
+
603
+ >>> prb = hypergeom.cdf(x, M, n, N)
604
+
605
+ And to generate random numbers:
606
+
607
+ >>> R = hypergeom.rvs(M, n, N, size=10)
608
+
609
+ See Also
610
+ --------
611
+ nhypergeom, binom, nbinom
612
+
613
+ """
614
+ def _shape_info(self):
615
+ return [_ShapeInfo("M", True, (0, np.inf), (True, False)),
616
+ _ShapeInfo("n", True, (0, np.inf), (True, False)),
617
+ _ShapeInfo("N", True, (0, np.inf), (True, False))]
618
+
619
+ def _rvs(self, M, n, N, size=None, random_state=None):
620
+ return random_state.hypergeometric(n, M-n, N, size=size)
621
+
622
+ def _get_support(self, M, n, N):
623
+ return np.maximum(N-(M-n), 0), np.minimum(n, N)
624
+
625
+ def _argcheck(self, M, n, N):
626
+ cond = (M > 0) & (n >= 0) & (N >= 0)
627
+ cond &= (n <= M) & (N <= M)
628
+ cond &= _isintegral(M) & _isintegral(n) & _isintegral(N)
629
+ return cond
630
+
631
+ def _logpmf(self, k, M, n, N):
632
+ tot, good = M, n
633
+ bad = tot - good
634
+ result = (betaln(good+1, 1) + betaln(bad+1, 1) + betaln(tot-N+1, N+1) -
635
+ betaln(k+1, good-k+1) - betaln(N-k+1, bad-N+k+1) -
636
+ betaln(tot+1, 1))
637
+ return result
638
+
639
+ def _pmf(self, k, M, n, N):
640
+ return _boost._hypergeom_pdf(k, n, N, M)
641
+
642
+ def _cdf(self, k, M, n, N):
643
+ return _boost._hypergeom_cdf(k, n, N, M)
644
+
645
+ def _stats(self, M, n, N):
646
+ M, n, N = 1. * M, 1. * n, 1. * N
647
+ m = M - n
648
+
649
+ # Boost kurtosis_excess doesn't return the same as the value
650
+ # computed here.
651
+ g2 = M * (M + 1) - 6. * N * (M - N) - 6. * n * m
652
+ g2 *= (M - 1) * M * M
653
+ g2 += 6. * n * N * (M - N) * m * (5. * M - 6)
654
+ g2 /= n * N * (M - N) * m * (M - 2.) * (M - 3.)
655
+ return (
656
+ _boost._hypergeom_mean(n, N, M),
657
+ _boost._hypergeom_variance(n, N, M),
658
+ _boost._hypergeom_skewness(n, N, M),
659
+ g2,
660
+ )
661
+
662
+ def _entropy(self, M, n, N):
663
+ k = np.r_[N - (M - n):min(n, N) + 1]
664
+ vals = self.pmf(k, M, n, N)
665
+ return np.sum(entr(vals), axis=0)
666
+
667
+ def _sf(self, k, M, n, N):
668
+ return _boost._hypergeom_sf(k, n, N, M)
669
+
670
+ def _logsf(self, k, M, n, N):
671
+ res = []
672
+ for quant, tot, good, draw in zip(*np.broadcast_arrays(k, M, n, N)):
673
+ if (quant + 0.5) * (tot + 0.5) < (good - 0.5) * (draw - 0.5):
674
+ # Fewer terms to sum if we calculate log(1-cdf)
675
+ res.append(log1p(-exp(self.logcdf(quant, tot, good, draw))))
676
+ else:
677
+ # Integration over probability mass function using logsumexp
678
+ k2 = np.arange(quant + 1, draw + 1)
679
+ res.append(logsumexp(self._logpmf(k2, tot, good, draw)))
680
+ return np.asarray(res)
681
+
682
+ def _logcdf(self, k, M, n, N):
683
+ res = []
684
+ for quant, tot, good, draw in zip(*np.broadcast_arrays(k, M, n, N)):
685
+ if (quant + 0.5) * (tot + 0.5) > (good - 0.5) * (draw - 0.5):
686
+ # Fewer terms to sum if we calculate log(1-sf)
687
+ res.append(log1p(-exp(self.logsf(quant, tot, good, draw))))
688
+ else:
689
+ # Integration over probability mass function using logsumexp
690
+ k2 = np.arange(0, quant + 1)
691
+ res.append(logsumexp(self._logpmf(k2, tot, good, draw)))
692
+ return np.asarray(res)
693
+
694
+
695
+ hypergeom = hypergeom_gen(name='hypergeom')
696
+
697
+
698
+ class nhypergeom_gen(rv_discrete):
699
+ r"""A negative hypergeometric discrete random variable.
700
+
701
+ Consider a box containing :math:`M` balls: :math:`n` red and
702
+ :math:`M-n` blue. We randomly sample balls from the box, one
703
+ at a time and *without* replacement, until we have picked :math:`r`
704
+ blue balls. `nhypergeom` is the distribution of the number of
705
+ red balls :math:`k` we have picked.
706
+
707
+ %(before_notes)s
708
+
709
+ Notes
710
+ -----
711
+ The symbols used to denote the shape parameters (`M`, `n`, and `r`) are not
712
+ universally accepted. See the Examples for a clarification of the
713
+ definitions used here.
714
+
715
+ The probability mass function is defined as,
716
+
717
+ .. math:: f(k; M, n, r) = \frac{{{k+r-1}\choose{k}}{{M-r-k}\choose{n-k}}}
718
+ {{M \choose n}}
719
+
720
+ for :math:`k \in [0, n]`, :math:`n \in [0, M]`, :math:`r \in [0, M-n]`,
721
+ and the binomial coefficient is:
722
+
723
+ .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
724
+
725
+ It is equivalent to observing :math:`k` successes in :math:`k+r-1`
726
+ samples, with the :math:`(k+r)`-th sample being a failure. The former
727
+ can be modelled as a hypergeometric distribution. The probability
728
+ of the latter is simply the number of failures remaining
729
+ :math:`M-n-(r-1)` divided by the size of the remaining population
730
+ :math:`M-(k+r-1)`. This relationship can be shown as:
731
+
732
+ .. math:: NHG(k;M,n,r) = HG(k;M,n,k+r-1)\frac{(M-n-(r-1))}{(M-(k+r-1))}
733
+
734
+ where :math:`NHG` is probability mass function (PMF) of the
735
+ negative hypergeometric distribution and :math:`HG` is the
736
+ PMF of the hypergeometric distribution.
737
+
738
+ %(after_notes)s
739
+
740
+ Examples
741
+ --------
742
+ >>> import numpy as np
743
+ >>> from scipy.stats import nhypergeom
744
+ >>> import matplotlib.pyplot as plt
745
+
746
+ Suppose we have a collection of 20 animals, of which 7 are dogs.
747
+ Then if we want to know the probability of finding a given number
748
+ of dogs (successes) in a sample with exactly 12 animals that
749
+ aren't dogs (failures), we can initialize a frozen distribution
750
+ and plot the probability mass function:
751
+
752
+ >>> M, n, r = [20, 7, 12]
753
+ >>> rv = nhypergeom(M, n, r)
754
+ >>> x = np.arange(0, n+2)
755
+ >>> pmf_dogs = rv.pmf(x)
756
+
757
+ >>> fig = plt.figure()
758
+ >>> ax = fig.add_subplot(111)
759
+ >>> ax.plot(x, pmf_dogs, 'bo')
760
+ >>> ax.vlines(x, 0, pmf_dogs, lw=2)
761
+ >>> ax.set_xlabel('# of dogs in our group with given 12 failures')
762
+ >>> ax.set_ylabel('nhypergeom PMF')
763
+ >>> plt.show()
764
+
765
+ Instead of using a frozen distribution we can also use `nhypergeom`
766
+ methods directly. For example, to obtain the probability mass
767
+ function, use:
768
+
769
+ >>> prb = nhypergeom.pmf(x, M, n, r)
770
+
771
+ And to generate random numbers:
772
+
773
+ >>> R = nhypergeom.rvs(M, n, r, size=10)
774
+
775
+ To verify the relationship between `hypergeom` and `nhypergeom`, use:
776
+
777
+ >>> from scipy.stats import hypergeom, nhypergeom
778
+ >>> M, n, r = 45, 13, 8
779
+ >>> k = 6
780
+ >>> nhypergeom.pmf(k, M, n, r)
781
+ 0.06180776620271643
782
+ >>> hypergeom.pmf(k, M, n, k+r-1) * (M - n - (r-1)) / (M - (k+r-1))
783
+ 0.06180776620271644
784
+
785
+ See Also
786
+ --------
787
+ hypergeom, binom, nbinom
788
+
789
+ References
790
+ ----------
791
+ .. [1] Negative Hypergeometric Distribution on Wikipedia
792
+ https://en.wikipedia.org/wiki/Negative_hypergeometric_distribution
793
+
794
+ .. [2] Negative Hypergeometric Distribution from
795
+ http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Negativehypergeometric.pdf
796
+
797
+ """
798
+
799
+ def _shape_info(self):
800
+ return [_ShapeInfo("M", True, (0, np.inf), (True, False)),
801
+ _ShapeInfo("n", True, (0, np.inf), (True, False)),
802
+ _ShapeInfo("r", True, (0, np.inf), (True, False))]
803
+
804
+ def _get_support(self, M, n, r):
805
+ return 0, n
806
+
807
+ def _argcheck(self, M, n, r):
808
+ cond = (n >= 0) & (n <= M) & (r >= 0) & (r <= M-n)
809
+ cond &= _isintegral(M) & _isintegral(n) & _isintegral(r)
810
+ return cond
811
+
812
+ def _rvs(self, M, n, r, size=None, random_state=None):
813
+
814
+ @_vectorize_rvs_over_shapes
815
+ def _rvs1(M, n, r, size, random_state):
816
+ # invert cdf by calculating all values in support, scalar M, n, r
817
+ a, b = self.support(M, n, r)
818
+ ks = np.arange(a, b+1)
819
+ cdf = self.cdf(ks, M, n, r)
820
+ ppf = interp1d(cdf, ks, kind='next', fill_value='extrapolate')
821
+ rvs = ppf(random_state.uniform(size=size)).astype(int)
822
+ if size is None:
823
+ return rvs.item()
824
+ return rvs
825
+
826
+ return _rvs1(M, n, r, size=size, random_state=random_state)
827
+
828
+ def _logpmf(self, k, M, n, r):
829
+ cond = ((r == 0) & (k == 0))
830
+ result = _lazywhere(~cond, (k, M, n, r),
831
+ lambda k, M, n, r:
832
+ (-betaln(k+1, r) + betaln(k+r, 1) -
833
+ betaln(n-k+1, M-r-n+1) + betaln(M-r-k+1, 1) +
834
+ betaln(n+1, M-n+1) - betaln(M+1, 1)),
835
+ fillvalue=0.0)
836
+ return result
837
+
838
+ def _pmf(self, k, M, n, r):
839
+ # same as the following but numerically more precise
840
+ # return comb(k+r-1, k) * comb(M-r-k, n-k) / comb(M, n)
841
+ return exp(self._logpmf(k, M, n, r))
842
+
843
+ def _stats(self, M, n, r):
844
+ # Promote the datatype to at least float
845
+ # mu = rn / (M-n+1)
846
+ M, n, r = 1.*M, 1.*n, 1.*r
847
+ mu = r*n / (M-n+1)
848
+
849
+ var = r*(M+1)*n / ((M-n+1)*(M-n+2)) * (1 - r / (M-n+1))
850
+
851
+ # The skew and kurtosis are mathematically
852
+ # intractable so return `None`. See [2]_.
853
+ g1, g2 = None, None
854
+ return mu, var, g1, g2
855
+
856
+
857
+ nhypergeom = nhypergeom_gen(name='nhypergeom')
858
+
859
+
860
+ # FIXME: Fails _cdfvec
861
+ class logser_gen(rv_discrete):
862
+ r"""A Logarithmic (Log-Series, Series) discrete random variable.
863
+
864
+ %(before_notes)s
865
+
866
+ Notes
867
+ -----
868
+ The probability mass function for `logser` is:
869
+
870
+ .. math::
871
+
872
+ f(k) = - \frac{p^k}{k \log(1-p)}
873
+
874
+ for :math:`k \ge 1`, :math:`0 < p < 1`
875
+
876
+ `logser` takes :math:`p` as shape parameter,
877
+ where :math:`p` is the probability of a single success
878
+ and :math:`1-p` is the probability of a single failure.
879
+
880
+ %(after_notes)s
881
+
882
+ %(example)s
883
+
884
+ """
885
+
886
+ def _shape_info(self):
887
+ return [_ShapeInfo("p", False, (0, 1), (True, True))]
888
+
889
+ def _rvs(self, p, size=None, random_state=None):
890
+ # looks wrong for p>0.5, too few k=1
891
+ # trying to use generic is worse, no k=1 at all
892
+ return random_state.logseries(p, size=size)
893
+
894
+ def _argcheck(self, p):
895
+ return (p > 0) & (p < 1)
896
+
897
+ def _pmf(self, k, p):
898
+ # logser.pmf(k) = - p**k / (k*log(1-p))
899
+ return -np.power(p, k) * 1.0 / k / special.log1p(-p)
900
+
901
+ def _stats(self, p):
902
+ r = special.log1p(-p)
903
+ mu = p / (p - 1.0) / r
904
+ mu2p = -p / r / (p - 1.0)**2
905
+ var = mu2p - mu*mu
906
+ mu3p = -p / r * (1.0+p) / (1.0 - p)**3
907
+ mu3 = mu3p - 3*mu*mu2p + 2*mu**3
908
+ g1 = mu3 / np.power(var, 1.5)
909
+
910
+ mu4p = -p / r * (
911
+ 1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4)
912
+ mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4
913
+ g2 = mu4 / var**2 - 3.0
914
+ return mu, var, g1, g2
915
+
916
+
917
+ logser = logser_gen(a=1, name='logser', longname='A logarithmic')
918
+
919
+
920
+ class poisson_gen(rv_discrete):
921
+ r"""A Poisson discrete random variable.
922
+
923
+ %(before_notes)s
924
+
925
+ Notes
926
+ -----
927
+ The probability mass function for `poisson` is:
928
+
929
+ .. math::
930
+
931
+ f(k) = \exp(-\mu) \frac{\mu^k}{k!}
932
+
933
+ for :math:`k \ge 0`.
934
+
935
+ `poisson` takes :math:`\mu \geq 0` as shape parameter.
936
+ When :math:`\mu = 0`, the ``pmf`` method
937
+ returns ``1.0`` at quantile :math:`k = 0`.
938
+
939
+ %(after_notes)s
940
+
941
+ %(example)s
942
+
943
+ """
944
+
945
+ def _shape_info(self):
946
+ return [_ShapeInfo("mu", False, (0, np.inf), (True, False))]
947
+
948
+ # Override rv_discrete._argcheck to allow mu=0.
949
+ def _argcheck(self, mu):
950
+ return mu >= 0
951
+
952
+ def _rvs(self, mu, size=None, random_state=None):
953
+ return random_state.poisson(mu, size)
954
+
955
+ def _logpmf(self, k, mu):
956
+ Pk = special.xlogy(k, mu) - gamln(k + 1) - mu
957
+ return Pk
958
+
959
+ def _pmf(self, k, mu):
960
+ # poisson.pmf(k) = exp(-mu) * mu**k / k!
961
+ return exp(self._logpmf(k, mu))
962
+
963
+ def _cdf(self, x, mu):
964
+ k = floor(x)
965
+ return special.pdtr(k, mu)
966
+
967
+ def _sf(self, x, mu):
968
+ k = floor(x)
969
+ return special.pdtrc(k, mu)
970
+
971
+ def _ppf(self, q, mu):
972
+ vals = ceil(special.pdtrik(q, mu))
973
+ vals1 = np.maximum(vals - 1, 0)
974
+ temp = special.pdtr(vals1, mu)
975
+ return np.where(temp >= q, vals1, vals)
976
+
977
+ def _stats(self, mu):
978
+ var = mu
979
+ tmp = np.asarray(mu)
980
+ mu_nonzero = tmp > 0
981
+ g1 = _lazywhere(mu_nonzero, (tmp,), lambda x: sqrt(1.0/x), np.inf)
982
+ g2 = _lazywhere(mu_nonzero, (tmp,), lambda x: 1.0/x, np.inf)
983
+ return mu, var, g1, g2
984
+
985
+
986
+ poisson = poisson_gen(name="poisson", longname='A Poisson')
987
+
988
+
989
+ class planck_gen(rv_discrete):
990
+ r"""A Planck discrete exponential random variable.
991
+
992
+ %(before_notes)s
993
+
994
+ Notes
995
+ -----
996
+ The probability mass function for `planck` is:
997
+
998
+ .. math::
999
+
1000
+ f(k) = (1-\exp(-\lambda)) \exp(-\lambda k)
1001
+
1002
+ for :math:`k \ge 0` and :math:`\lambda > 0`.
1003
+
1004
+ `planck` takes :math:`\lambda` as shape parameter. The Planck distribution
1005
+ can be written as a geometric distribution (`geom`) with
1006
+ :math:`p = 1 - \exp(-\lambda)` shifted by ``loc = -1``.
1007
+
1008
+ %(after_notes)s
1009
+
1010
+ See Also
1011
+ --------
1012
+ geom
1013
+
1014
+ %(example)s
1015
+
1016
+ """
1017
+ def _shape_info(self):
1018
+ return [_ShapeInfo("lambda", False, (0, np.inf), (False, False))]
1019
+
1020
+ def _argcheck(self, lambda_):
1021
+ return lambda_ > 0
1022
+
1023
+ def _pmf(self, k, lambda_):
1024
+ return -expm1(-lambda_)*exp(-lambda_*k)
1025
+
1026
+ def _cdf(self, x, lambda_):
1027
+ k = floor(x)
1028
+ return -expm1(-lambda_*(k+1))
1029
+
1030
+ def _sf(self, x, lambda_):
1031
+ return exp(self._logsf(x, lambda_))
1032
+
1033
+ def _logsf(self, x, lambda_):
1034
+ k = floor(x)
1035
+ return -lambda_*(k+1)
1036
+
1037
+ def _ppf(self, q, lambda_):
1038
+ vals = ceil(-1.0/lambda_ * log1p(-q)-1)
1039
+ vals1 = (vals-1).clip(*(self._get_support(lambda_)))
1040
+ temp = self._cdf(vals1, lambda_)
1041
+ return np.where(temp >= q, vals1, vals)
1042
+
1043
+ def _rvs(self, lambda_, size=None, random_state=None):
1044
+ # use relation to geometric distribution for sampling
1045
+ p = -expm1(-lambda_)
1046
+ return random_state.geometric(p, size=size) - 1.0
1047
+
1048
+ def _stats(self, lambda_):
1049
+ mu = 1/expm1(lambda_)
1050
+ var = exp(-lambda_)/(expm1(-lambda_))**2
1051
+ g1 = 2*cosh(lambda_/2.0)
1052
+ g2 = 4+2*cosh(lambda_)
1053
+ return mu, var, g1, g2
1054
+
1055
+ def _entropy(self, lambda_):
1056
+ C = -expm1(-lambda_)
1057
+ return lambda_*exp(-lambda_)/C - log(C)
1058
+
1059
+
1060
+ planck = planck_gen(a=0, name='planck', longname='A discrete exponential ')
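+ # Editorial sketch (not part of the original module): numerical check of the
+ # docstring's claim that `planck` equals `geom` with p = 1 - exp(-lambda)
+ # shifted by loc = -1. The lambda value is arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import planck, geom
+ # >>> lam = 0.7
+ # >>> k = np.arange(10)
+ # >>> np.allclose(planck.pmf(k, lam), geom.pmf(k, 1 - np.exp(-lam), loc=-1))
+ # True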
1061
+
1062
+
1063
+ class boltzmann_gen(rv_discrete):
1064
+ r"""A Boltzmann (Truncated Discrete Exponential) random variable.
1065
+
1066
+ %(before_notes)s
1067
+
1068
+ Notes
1069
+ -----
1070
+ The probability mass function for `boltzmann` is:
1071
+
1072
+ .. math::
1073
+
1074
+ f(k) = (1-\exp(-\lambda)) \exp(-\lambda k) / (1-\exp(-\lambda N))
1075
+
1076
+ for :math:`k = 0,..., N-1`.
1077
+
1078
+ `boltzmann` takes :math:`\lambda > 0` and :math:`N > 0` as shape parameters.
1079
+
1080
+ %(after_notes)s
1081
+
1082
+ %(example)s
1083
+
1084
+ """
1085
+ def _shape_info(self):
1086
+ return [_ShapeInfo("lambda_", False, (0, np.inf), (False, False)),
1087
+ _ShapeInfo("N", True, (0, np.inf), (False, False))]
1088
+
1089
+ def _argcheck(self, lambda_, N):
1090
+ return (lambda_ > 0) & (N > 0) & _isintegral(N)
1091
+
1092
+ def _get_support(self, lambda_, N):
1093
+ return self.a, N - 1
1094
+
1095
+ def _pmf(self, k, lambda_, N):
1096
+ # boltzmann.pmf(k) =
1097
+ # (1-exp(-lambda_)) * exp(-lambda_*k) / (1-exp(-lambda_*N))
1098
+ fact = (1-exp(-lambda_))/(1-exp(-lambda_*N))
1099
+ return fact*exp(-lambda_*k)
1100
+
1101
+ def _cdf(self, x, lambda_, N):
1102
+ k = floor(x)
1103
+ return (1-exp(-lambda_*(k+1)))/(1-exp(-lambda_*N))
1104
+
1105
+ def _ppf(self, q, lambda_, N):
1106
+ qnew = q*(1-exp(-lambda_*N))
1107
+ vals = ceil(-1.0/lambda_ * log(1-qnew)-1)
1108
+ vals1 = (vals-1).clip(0.0, np.inf)
1109
+ temp = self._cdf(vals1, lambda_, N)
1110
+ return np.where(temp >= q, vals1, vals)
1111
+
1112
+ def _stats(self, lambda_, N):
1113
+ z = exp(-lambda_)
1114
+ zN = exp(-lambda_*N)
1115
+ mu = z/(1.0-z)-N*zN/(1-zN)
1116
+ var = z/(1.0-z)**2 - N*N*zN/(1-zN)**2
1117
+ trm = (1-zN)/(1-z)
1118
+ trm2 = (z*trm**2 - N*N*zN)
1119
+ g1 = z*(1+z)*trm**3 - N**3*zN*(1+zN)
1120
+ g1 = g1 / trm2**(1.5)
1121
+ g2 = z*(1+4*z+z*z)*trm**4 - N**4 * zN*(1+4*zN+zN*zN)
1122
+ g2 = g2 / trm2 / trm2
1123
+ return mu, var, g1, g2
1124
+
1125
+
1126
+ boltzmann = boltzmann_gen(name='boltzmann', a=0,
1127
+ longname='A truncated discrete exponential ')
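+ # Editorial sketch (not part of the original module): the truncated pmf above
+ # is normalized over its finite support {0, ..., N-1}. Parameter values are
+ # arbitrary illustrative choices.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import boltzmann
+ # >>> lam, N = 1.4, 19
+ # >>> np.allclose(boltzmann.pmf(np.arange(N), lam, N).sum(), 1.0)
+ # True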
1128
+
1129
+
1130
+ class randint_gen(rv_discrete):
1131
+ r"""A uniform discrete random variable.
1132
+
1133
+ %(before_notes)s
1134
+
1135
+ Notes
1136
+ -----
1137
+ The probability mass function for `randint` is:
1138
+
1139
+ .. math::
1140
+
1141
+ f(k) = \frac{1}{\texttt{high} - \texttt{low}}
1142
+
1143
+ for :math:`k \in \{\texttt{low}, \dots, \texttt{high} - 1\}`.
1144
+
1145
+ `randint` takes :math:`\texttt{low}` and :math:`\texttt{high}` as shape
1146
+ parameters.
1147
+
1148
+ %(after_notes)s
1149
+
1150
+ Examples
1151
+ --------
1152
+ >>> import numpy as np
1153
+ >>> from scipy.stats import randint
1154
+ >>> import matplotlib.pyplot as plt
1155
+ >>> fig, ax = plt.subplots(1, 1)
1156
+
1157
+ Calculate the first four moments:
1158
+
1159
+ >>> low, high = 7, 31
1160
+ >>> mean, var, skew, kurt = randint.stats(low, high, moments='mvsk')
1161
+
1162
+ Display the probability mass function (``pmf``):
1163
+
1164
+ >>> x = np.arange(low - 5, high + 5)
1165
+ >>> ax.plot(x, randint.pmf(x, low, high), 'bo', ms=8, label='randint pmf')
1166
+ >>> ax.vlines(x, 0, randint.pmf(x, low, high), colors='b', lw=5, alpha=0.5)
1167
+
1168
+ Alternatively, the distribution object can be called (as a function) to
1169
+ fix the shape and location. This returns a "frozen" RV object holding the
1170
+ given parameters fixed.
1171
+
1172
+ Freeze the distribution and display the frozen ``pmf``:
1173
+
1174
+ >>> rv = randint(low, high)
1175
+ >>> ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-',
1176
+ ... lw=1, label='frozen pmf')
1177
+ >>> ax.legend(loc='lower center')
1178
+ >>> plt.show()
1179
+
1180
+ Check the relationship between the cumulative distribution function
1181
+ (``cdf``) and its inverse, the percent point function (``ppf``):
1182
+
1183
+ >>> q = np.arange(low, high)
1184
+ >>> p = randint.cdf(q, low, high)
1185
+ >>> np.allclose(q, randint.ppf(p, low, high))
1186
+ True
1187
+
1188
+ Generate random numbers:
1189
+
1190
+ >>> r = randint.rvs(low, high, size=1000)
1191
+
1192
+ """
1193
+
1194
+ def _shape_info(self):
1195
+ return [_ShapeInfo("low", True, (-np.inf, np.inf), (False, False)),
1196
+ _ShapeInfo("high", True, (-np.inf, np.inf), (False, False))]
1197
+
1198
+ def _argcheck(self, low, high):
1199
+ return (high > low) & _isintegral(low) & _isintegral(high)
1200
+
1201
+ def _get_support(self, low, high):
1202
+ return low, high-1
1203
+
1204
+ def _pmf(self, k, low, high):
1205
+ # randint.pmf(k) = 1./(high - low)
1206
+ p = np.ones_like(k) / (high - low)
1207
+ return np.where((k >= low) & (k < high), p, 0.)
1208
+
1209
+ def _cdf(self, x, low, high):
1210
+ k = floor(x)
1211
+ return (k - low + 1.) / (high - low)
1212
+
1213
+ def _ppf(self, q, low, high):
1214
+ vals = ceil(q * (high - low) + low) - 1
1215
+ vals1 = (vals - 1).clip(low, high)
1216
+ temp = self._cdf(vals1, low, high)
1217
+ return np.where(temp >= q, vals1, vals)
1218
+
1219
+ def _stats(self, low, high):
1220
+ m2, m1 = np.asarray(high), np.asarray(low)
1221
+ mu = (m2 + m1 - 1.0) / 2
1222
+ d = m2 - m1
1223
+ var = (d*d - 1) / 12.0
1224
+ g1 = 0.0
1225
+ g2 = -6.0/5.0 * (d*d + 1.0) / (d*d - 1.0)
1226
+ return mu, var, g1, g2
1227
+
1228
+ def _rvs(self, low, high, size=None, random_state=None):
1229
+ """An array of *size* random integers >= ``low`` and < ``high``."""
1230
+ if np.asarray(low).size == 1 and np.asarray(high).size == 1:
1231
+ # no need to vectorize in that case
1232
+ return rng_integers(random_state, low, high, size=size)
1233
+
1234
+ if size is not None:
1235
+ # NumPy's RandomState.randint() doesn't broadcast its arguments.
1236
+ # Use `broadcast_to()` to extend the shapes of low and high
1237
+ # up to size. Then we can use the numpy.vectorize'd
1238
+ # randint without needing to pass it a `size` argument.
1239
+ low = np.broadcast_to(low, size)
1240
+ high = np.broadcast_to(high, size)
1241
+ randint = np.vectorize(partial(rng_integers, random_state),
1242
+ otypes=[np.dtype(int)])
1243
+ return randint(low, high)
1244
+
1245
+ def _entropy(self, low, high):
1246
+ return log(high - low)
1247
+
1248
+
1249
+ randint = randint_gen(name='randint', longname='A discrete uniform '
1250
+ '(random integer)')
1251
+
1252
+
1253
+ # FIXME: problems sampling.
1254
+ class zipf_gen(rv_discrete):
1255
+ r"""A Zipf (Zeta) discrete random variable.
1256
+
1257
+ %(before_notes)s
1258
+
1259
+ See Also
1260
+ --------
1261
+ zipfian
1262
+
1263
+ Notes
1264
+ -----
1265
+ The probability mass function for `zipf` is:
1266
+
1267
+ .. math::
1268
+
1269
+ f(k, a) = \frac{1}{\zeta(a) k^a}
1270
+
1271
+ for :math:`k \ge 1`, :math:`a > 1`.
1272
+
1273
+ `zipf` takes :math:`a > 1` as shape parameter. :math:`\zeta` is the
1274
+ Riemann zeta function (`scipy.special.zeta`)
1275
+
1276
+ The Zipf distribution is also known as the zeta distribution, which is
1277
+ a special case of the Zipfian distribution (`zipfian`).
1278
+
1279
+ %(after_notes)s
1280
+
1281
+ References
1282
+ ----------
1283
+ .. [1] "Zeta Distribution", Wikipedia,
1284
+ https://en.wikipedia.org/wiki/Zeta_distribution
1285
+
1286
+ %(example)s
1287
+
1288
+ Confirm that `zipf` is the large `n` limit of `zipfian`.
1289
+
1290
+ >>> import numpy as np
1291
+ >>> from scipy.stats import zipf, zipfian
1292
+ >>> k = np.arange(11)
1293
+ >>> np.allclose(zipf.pmf(k, a), zipfian.pmf(k, a, n=10000000))
1294
+ True
1295
+
1296
+ """
1297
+
1298
+ def _shape_info(self):
1299
+ return [_ShapeInfo("a", False, (1, np.inf), (False, False))]
1300
+
1301
+ def _rvs(self, a, size=None, random_state=None):
1302
+ return random_state.zipf(a, size=size)
1303
+
1304
+ def _argcheck(self, a):
1305
+ return a > 1
1306
+
1307
+ def _pmf(self, k, a):
1308
+ k = k.astype(np.float64)
1309
+ # zipf.pmf(k, a) = 1/(zeta(a) * k**a)
1310
+ Pk = 1.0 / special.zeta(a, 1) * k**-a
1311
+ return Pk
1312
+
1313
+ def _munp(self, n, a):
1314
+ return _lazywhere(
1315
+ a > n + 1, (a, n),
1316
+ lambda a, n: special.zeta(a - n, 1) / special.zeta(a, 1),
1317
+ np.inf)
1318
+
1319
+
1320
+ zipf = zipf_gen(a=1, name='zipf', longname='A Zipf')
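+ # Editorial sketch (not part of the original module): the pmf above is
+ # 1/(zeta(a) * k**a), which can be cross-checked against
+ # ``scipy.special.zeta``. Parameter values are arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import zipf
+ # >>> from scipy.special import zeta
+ # >>> a, k = 6.6, np.arange(1, 10)
+ # >>> np.allclose(zipf.pmf(k, a), 1 / (zeta(a) * k**a))
+ # True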
1321
+
1322
+
1323
+ def _gen_harmonic_gt1(n, a):
1324
+ """Generalized harmonic number, a > 1"""
1325
+ # See https://en.wikipedia.org/wiki/Harmonic_number; search for "hurwitz"
1326
+ return zeta(a, 1) - zeta(a, n+1)
1327
+
1328
+
1329
+ def _gen_harmonic_leq1(n, a):
1330
+ """Generalized harmonic number, a <= 1"""
1331
+ if not np.size(n):
1332
+ return n
1333
+ n_max = np.max(n) # loop starts at maximum of all n
1334
+ out = np.zeros_like(a, dtype=float)
1335
+ # add terms of harmonic series; starting from smallest to avoid roundoff
1336
+ for i in np.arange(n_max, 0, -1, dtype=float):
1337
+ mask = i <= n # don't add terms after nth
1338
+ out[mask] += 1/i**a[mask]
1339
+ return out
1340
+
1341
+
1342
+ def _gen_harmonic(n, a):
1343
+ """Generalized harmonic number"""
1344
+ n, a = np.broadcast_arrays(n, a)
1345
+ return _lazywhere(a > 1, (n, a),
1346
+ f=_gen_harmonic_gt1, f2=_gen_harmonic_leq1)
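+ # Editorial sketch (not part of the original module): the identity used by
+ # ``_gen_harmonic_gt1`` above, H_{n,a} = zeta(a, 1) - zeta(a, n+1), agrees with
+ # a direct sum of the generalized harmonic series. Values are arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.special import zeta
+ # >>> n, a = 10, 2.5
+ # >>> np.allclose(zeta(a, 1) - zeta(a, n + 1),
+ # ...             sum(1 / i**a for i in range(1, n + 1)))
+ # True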
1347
+
1348
+
1349
+ class zipfian_gen(rv_discrete):
1350
+ r"""A Zipfian discrete random variable.
1351
+
1352
+ %(before_notes)s
1353
+
1354
+ See Also
1355
+ --------
1356
+ zipf
1357
+
1358
+ Notes
1359
+ -----
1360
+ The probability mass function for `zipfian` is:
1361
+
1362
+ .. math::
1363
+
1364
+ f(k, a, n) = \frac{1}{H_{n,a} k^a}
1365
+
1366
+ for :math:`k \in \{1, 2, \dots, n-1, n\}`, :math:`a \ge 0`,
1367
+ :math:`n \in \{1, 2, 3, \dots\}`.
1368
+
1369
+ `zipfian` takes :math:`a` and :math:`n` as shape parameters.
1370
+ :math:`H_{n,a}` is the :math:`n`:sup:`th` generalized harmonic
1371
+ number of order :math:`a`.
1372
+
1373
+ The Zipfian distribution reduces to the Zipf (zeta) distribution as
1374
+ :math:`n \rightarrow \infty`.
1375
+
1376
+ %(after_notes)s
1377
+
1378
+ References
1379
+ ----------
1380
+ .. [1] "Zipf's Law", Wikipedia, https://en.wikipedia.org/wiki/Zipf's_law
1381
+ .. [2] Larry Leemis, "Zipf Distribution", Univariate Distribution
1382
+ Relationships. http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
1383
+
1384
+ %(example)s
1385
+
1386
+ Confirm that `zipfian` reduces to `zipf` for large `n`, `a > 1`.
1387
+
1388
+ >>> import numpy as np
1389
+ >>> from scipy.stats import zipf, zipfian
1390
+ >>> k = np.arange(11)
1391
+ >>> np.allclose(zipfian.pmf(k, a=3.5, n=10000000), zipf.pmf(k, a=3.5))
1392
+ True
1393
+
1394
+ """
1395
+
1396
+ def _shape_info(self):
1397
+ return [_ShapeInfo("a", False, (0, np.inf), (True, False)),
1398
+ _ShapeInfo("n", True, (0, np.inf), (False, False))]
1399
+
1400
+ def _argcheck(self, a, n):
1401
+ # we need np.asarray here because moment (maybe others) don't convert
1402
+ return (a >= 0) & (n > 0) & (n == np.asarray(n, dtype=int))
1403
+
1404
+ def _get_support(self, a, n):
1405
+ return 1, n
1406
+
1407
+ def _pmf(self, k, a, n):
1408
+ k = k.astype(np.float64)
1409
+ return 1.0 / _gen_harmonic(n, a) * k**-a
1410
+
1411
+ def _cdf(self, k, a, n):
1412
+ return _gen_harmonic(k, a) / _gen_harmonic(n, a)
1413
+
1414
+ def _sf(self, k, a, n):
1415
+ k = k + 1  # to match SciPy convention
1416
+ # see http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
1417
+ return ((k**a*(_gen_harmonic(n, a) - _gen_harmonic(k, a)) + 1)
1418
+ / (k**a*_gen_harmonic(n, a)))
1419
+
1420
+ def _stats(self, a, n):
1421
+ # see http://www.math.wm.edu/~leemis/chart/UDR/PDFs/Zipf.pdf
1422
+ Hna = _gen_harmonic(n, a)
1423
+ Hna1 = _gen_harmonic(n, a-1)
1424
+ Hna2 = _gen_harmonic(n, a-2)
1425
+ Hna3 = _gen_harmonic(n, a-3)
1426
+ Hna4 = _gen_harmonic(n, a-4)
1427
+ mu1 = Hna1/Hna
1428
+ mu2n = (Hna2*Hna - Hna1**2)
1429
+ mu2d = Hna**2
1430
+ mu2 = mu2n / mu2d
1431
+ g1 = (Hna3/Hna - 3*Hna1*Hna2/Hna**2 + 2*Hna1**3/Hna**3)/mu2**(3/2)
1432
+ g2 = (Hna**3*Hna4 - 4*Hna**2*Hna1*Hna3 + 6*Hna*Hna1**2*Hna2
1433
+ - 3*Hna1**4) / mu2n**2
1434
+ g2 -= 3
1435
+ return mu1, mu2, g1, g2
1436
+
1437
+
1438
+ zipfian = zipfian_gen(a=1, name='zipfian', longname='A Zipfian')
1439
+
1440
+
1441
+ class dlaplace_gen(rv_discrete):
1442
+ r"""A Laplacian discrete random variable.
1443
+
1444
+ %(before_notes)s
1445
+
1446
+ Notes
1447
+ -----
1448
+ The probability mass function for `dlaplace` is:
1449
+
1450
+ .. math::
1451
+
1452
+ f(k) = \tanh(a/2) \exp(-a |k|)
1453
+
1454
+ for integers :math:`k` and :math:`a > 0`.
1455
+
1456
+ `dlaplace` takes :math:`a` as shape parameter.
1457
+
1458
+ %(after_notes)s
1459
+
1460
+ %(example)s
1461
+
1462
+ """
1463
+
1464
+ def _shape_info(self):
1465
+ return [_ShapeInfo("a", False, (0, np.inf), (False, False))]
1466
+
1467
+ def _pmf(self, k, a):
1468
+ # dlaplace.pmf(k) = tanh(a/2) * exp(-a*abs(k))
1469
+ return tanh(a/2.0) * exp(-a * abs(k))
1470
+
1471
+ def _cdf(self, x, a):
1472
+ k = floor(x)
1473
+
1474
+ def f(k, a):
1475
+ return 1.0 - exp(-a * k) / (exp(a) + 1)
1476
+
1477
+ def f2(k, a):
1478
+ return exp(a * (k + 1)) / (exp(a) + 1)
1479
+
1480
+ return _lazywhere(k >= 0, (k, a), f=f, f2=f2)
1481
+
1482
+ def _ppf(self, q, a):
1483
+ const = 1 + exp(a)
1484
+ vals = ceil(np.where(q < 1.0 / (1 + exp(-a)),
1485
+ log(q*const) / a - 1,
1486
+ -log((1-q) * const) / a))
1487
+ vals1 = vals - 1
1488
+ return np.where(self._cdf(vals1, a) >= q, vals1, vals)
1489
+
1490
+ def _stats(self, a):
1491
+ ea = exp(a)
1492
+ mu2 = 2.*ea/(ea-1.)**2
1493
+ mu4 = 2.*ea*(ea**2+10.*ea+1.) / (ea-1.)**4
1494
+ return 0., mu2, 0., mu4/mu2**2 - 3.
1495
+
1496
+ def _entropy(self, a):
1497
+ return a / sinh(a) - log(tanh(a/2.0))
1498
+
1499
+ def _rvs(self, a, size=None, random_state=None):
1500
+ # The discrete Laplace is equivalent to the two-sided geometric
1501
+ # distribution with PMF:
1502
+ # f(k) = (1 - alpha)/(1 + alpha) * alpha^abs(k)
1503
+ # Reference:
1504
+ # https://www.sciencedirect.com/science/
1505
+ # article/abs/pii/S0378375804003519
1506
+ # Furthermore, the two-sided geometric distribution is
1507
+ # equivalent to the difference between two iid geometric
1508
+ # distributions.
1509
+ # Reference (page 179):
1510
+ # https://pdfs.semanticscholar.org/61b3/
1511
+ # b99f466815808fd0d03f5d2791eea8b541a1.pdf
1512
+ # Thus, we can leverage the following:
1513
+ # 1) alpha = e^-a
1514
+ # 2) probability_of_success = 1 - alpha (Bernoulli trial)
1515
+ probOfSuccess = -np.expm1(-np.asarray(a))
1516
+ x = random_state.geometric(probOfSuccess, size=size)
1517
+ y = random_state.geometric(probOfSuccess, size=size)
1518
+ return x - y
1519
+
1520
+
1521
+ dlaplace = dlaplace_gen(a=-np.inf,
1522
+ name='dlaplace', longname='A discrete Laplacian')
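+ # Editorial sketch (not part of the original module): consistency check for the
+ # sampling note in ``_rvs`` above, which draws the difference of two geometric
+ # variates with success probability p = 1 - exp(-a); the variance of that
+ # difference, 2*(1-p)/p**2, matches ``_stats``. The value of a is arbitrary.
+ #
+ # >>> import numpy as np
+ # >>> from scipy.stats import dlaplace
+ # >>> a = 0.8
+ # >>> p = -np.expm1(-a)
+ # >>> np.allclose(dlaplace.var(a), 2 * (1 - p) / p**2)
+ # True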
1523
+
1524
+
1525
+ class skellam_gen(rv_discrete):
1526
+ r"""A Skellam discrete random variable.
1527
+
1528
+ %(before_notes)s
1529
+
1530
+ Notes
1531
+ -----
1532
+ Probability distribution of the difference of two correlated or
1533
+ uncorrelated Poisson random variables.
1534
+
1535
+ Let :math:`k_1` and :math:`k_2` be two Poisson-distributed r.v. with
1536
+ expected values :math:`\lambda_1` and :math:`\lambda_2`. Then,
1537
+ :math:`k_1 - k_2` follows a Skellam distribution with parameters
1538
+ :math:`\mu_1 = \lambda_1 - \rho \sqrt{\lambda_1 \lambda_2}` and
1539
+ :math:`\mu_2 = \lambda_2 - \rho \sqrt{\lambda_1 \lambda_2}`, where
1540
+ :math:`\rho` is the correlation coefficient between :math:`k_1` and
1541
+ :math:`k_2`. If the two Poisson-distributed r.v. are independent then
1542
+ :math:`\rho = 0`.
1543
+
1544
+ Parameters :math:`\mu_1` and :math:`\mu_2` must be strictly positive.
1545
+
1546
+ For details see: https://en.wikipedia.org/wiki/Skellam_distribution
1547
+
1548
+ `skellam` takes :math:`\mu_1` and :math:`\mu_2` as shape parameters.
1549
+
1550
+ %(after_notes)s
1551
+
1552
+ %(example)s
1553
+
1554
+ """
1555
+ def _shape_info(self):
1556
+ return [_ShapeInfo("mu1", False, (0, np.inf), (False, False)),
1557
+ _ShapeInfo("mu2", False, (0, np.inf), (False, False))]
1558
+
1559
+ def _rvs(self, mu1, mu2, size=None, random_state=None):
1560
+ n = size
1561
+ return (random_state.poisson(mu1, n) -
1562
+ random_state.poisson(mu2, n))
1563
+
1564
+ def _pmf(self, x, mu1, mu2):
1565
+ with np.errstate(over='ignore'): # see gh-17432
1566
+ px = np.where(x < 0,
1567
+ _boost._ncx2_pdf(2*mu2, 2*(1-x), 2*mu1)*2,
1568
+ _boost._ncx2_pdf(2*mu1, 2*(1+x), 2*mu2)*2)
1569
+ # ncx2.pdf() returns nan's for extremely low probabilities
1570
+ return px
1571
+
1572
+ def _cdf(self, x, mu1, mu2):
1573
+ x = floor(x)
1574
+ with np.errstate(over='ignore'): # see gh-17432
1575
+ px = np.where(x < 0,
1576
+ _boost._ncx2_cdf(2*mu2, -2*x, 2*mu1),
1577
+ 1 - _boost._ncx2_cdf(2*mu1, 2*(x+1), 2*mu2))
1578
+ return px
1579
+
1580
+ def _stats(self, mu1, mu2):
1581
+ mean = mu1 - mu2
1582
+ var = mu1 + mu2
1583
+ g1 = mean / sqrt((var)**3)
1584
+ g2 = 1 / var
1585
+ return mean, var, g1, g2
1586
+
1587
+
1588
+ skellam = skellam_gen(a=-np.inf, name="skellam", longname='A Skellam')
1589
+
1590
+
1591
+ class yulesimon_gen(rv_discrete):
1592
+ r"""A Yule-Simon discrete random variable.
1593
+
1594
+ %(before_notes)s
1595
+
1596
+ Notes
1597
+ -----
1598
+
1599
+ The probability mass function for `yulesimon` is:
1600
+
1601
+ .. math::
1602
+
1603
+ f(k) = \alpha B(k, \alpha+1)
1604
+
1605
+ for :math:`k=1,2,3,...`, where :math:`\alpha>0`.
1606
+ Here :math:`B` refers to the `scipy.special.beta` function.
1607
+
1608
+ The sampling of random variates is based on pg 553, Section 6.3 of [1]_.
1609
+ Our notation maps to the referenced logic via :math:`\alpha=a-1`.
1610
+
1611
+ For details see the wikipedia entry [2]_.
1612
+
1613
+ References
1614
+ ----------
1615
+ .. [1] Devroye, Luc. "Non-uniform Random Variate Generation",
1616
+ (1986) Springer, New York.
1617
+
1618
+ .. [2] https://en.wikipedia.org/wiki/Yule-Simon_distribution
1619
+
1620
+ %(after_notes)s
1621
+
1622
+ %(example)s
1623
+
1624
+ """
1625
+ def _shape_info(self):
1626
+ return [_ShapeInfo("alpha", False, (0, np.inf), (False, False))]
1627
+
1628
+ def _rvs(self, alpha, size=None, random_state=None):
1629
+ E1 = random_state.standard_exponential(size)
1630
+ E2 = random_state.standard_exponential(size)
1631
+ ans = ceil(-E1 / log1p(-exp(-E2 / alpha)))
1632
+ return ans
1633
+
1634
+ def _pmf(self, x, alpha):
1635
+ return alpha * special.beta(x, alpha + 1)
1636
+
1637
+ def _argcheck(self, alpha):
1638
+ return (alpha > 0)
1639
+
1640
+ def _logpmf(self, x, alpha):
1641
+ return log(alpha) + special.betaln(x, alpha + 1)
1642
+
1643
+ def _cdf(self, x, alpha):
1644
+ return 1 - x * special.beta(x, alpha + 1)
1645
+
1646
+ def _sf(self, x, alpha):
1647
+ return x * special.beta(x, alpha + 1)
1648
+
1649
+ def _logsf(self, x, alpha):
1650
+ return log(x) + special.betaln(x, alpha + 1)
1651
+
1652
+ def _stats(self, alpha):
1653
+ mu = np.where(alpha <= 1, np.inf, alpha / (alpha - 1))
1654
+ mu2 = np.where(alpha > 2,
1655
+ alpha**2 / ((alpha - 2.0) * (alpha - 1)**2),
1656
+ np.inf)
1657
+ mu2 = np.where(alpha <= 1, np.nan, mu2)
1658
+ g1 = np.where(alpha > 3,
1659
+ sqrt(alpha - 2) * (alpha + 1)**2 / (alpha * (alpha - 3)),
1660
+ np.inf)
1661
+ g1 = np.where(alpha <= 2, np.nan, g1)
1662
+ g2 = np.where(alpha > 4,
1663
+ alpha + 3 + ((11 * alpha**3 - 49 * alpha - 22) /
1664
+ (alpha * (alpha - 4) * (alpha - 3))),
1665
+ np.inf)
1666
+ g2 = np.where(alpha <= 2, np.nan, g2)
1667
+ return mu, mu2, g1, g2
1668
+
1669
+
1670
+ yulesimon = yulesimon_gen(name='yulesimon', a=1)
1671
+
1672
+
1673
+ def _vectorize_rvs_over_shapes(_rvs1):
1674
+ """Decorator that vectorizes _rvs method to work on ndarray shapes"""
1675
+ # _rvs1 must be a _function_ that accepts _scalar_ args as positional
1676
+ # arguments, `size` and `random_state` as keyword arguments.
1677
+ # _rvs1 must return a random variate array with shape `size`. If `size` is
1678
+ # None, _rvs1 must return a scalar.
1679
+ # When applied to _rvs1, this decorator broadcasts ndarray args
1680
+ # and loops over them, calling _rvs1 for each set of scalar args.
1681
+ # For usage example, see _nchypergeom_gen
1682
+ def _rvs(*args, size, random_state):
1683
+ _rvs1_size, _rvs1_indices = _check_shape(args[0].shape, size)
1684
+
1685
+ size = np.array(size)
1686
+ _rvs1_size = np.array(_rvs1_size)
1687
+ _rvs1_indices = np.array(_rvs1_indices)
1688
+
1689
+ if np.all(_rvs1_indices): # all args are scalars
1690
+ return _rvs1(*args, size, random_state)
1691
+
1692
+ out = np.empty(size)
1693
+
1694
+ # out.shape can mix dimensions associated with arg_shape and _rvs1_size
1695
+ # Sort them to arg_shape + _rvs1_size for easy indexing of dimensions
1696
+ # corresponding with the different sets of scalar args
1697
+ j0 = np.arange(out.ndim)
1698
+ j1 = np.hstack((j0[~_rvs1_indices], j0[_rvs1_indices]))
1699
+ out = np.moveaxis(out, j1, j0)
1700
+
1701
+ for i in np.ndindex(*size[~_rvs1_indices]):
1702
+ # arg can be squeezed because singleton dimensions will be
1703
+ # associated with _rvs1_size, not arg_shape per _check_shape
1704
+ out[i] = _rvs1(*[np.squeeze(arg)[i] for arg in args],
1705
+ _rvs1_size, random_state)
1706
+
1707
+ return np.moveaxis(out, j0, j1) # move axes back before returning
1708
+ return _rvs
1709
+
1710
+
1711
+ class _nchypergeom_gen(rv_discrete):
1712
+ r"""A noncentral hypergeometric discrete random variable.
1713
+
1714
+ For subclassing by nchypergeom_fisher_gen and nchypergeom_wallenius_gen.
1715
+
1716
+ """
1717
+
1718
+ rvs_name = None
1719
+ dist = None
1720
+
1721
+ def _shape_info(self):
1722
+ return [_ShapeInfo("M", True, (0, np.inf), (True, False)),
1723
+ _ShapeInfo("n", True, (0, np.inf), (True, False)),
1724
+ _ShapeInfo("N", True, (0, np.inf), (True, False)),
1725
+ _ShapeInfo("odds", False, (0, np.inf), (False, False))]
1726
+
1727
+ def _get_support(self, M, n, N, odds):
1728
+ N, m1, n = M, n, N # follow Wikipedia notation
1729
+ m2 = N - m1
1730
+ x_min = np.maximum(0, n - m2)
1731
+ x_max = np.minimum(n, m1)
1732
+ return x_min, x_max
1733
+
1734
+ def _argcheck(self, M, n, N, odds):
1735
+ M, n = np.asarray(M), np.asarray(n),
1736
+ N, odds = np.asarray(N), np.asarray(odds)
1737
+ cond1 = (M.astype(int) == M) & (M >= 0)
1738
+ cond2 = (n.astype(int) == n) & (n >= 0)
1739
+ cond3 = (N.astype(int) == N) & (N >= 0)
1740
+ cond4 = odds > 0
1741
+ cond5 = N <= M
1742
+ cond6 = n <= M
1743
+ return cond1 & cond2 & cond3 & cond4 & cond5 & cond6
1744
+
1745
+ def _rvs(self, M, n, N, odds, size=None, random_state=None):
1746
+
1747
+ @_vectorize_rvs_over_shapes
1748
+ def _rvs1(M, n, N, odds, size, random_state):
1749
+ length = np.prod(size)
1750
+ urn = _PyStochasticLib3()
1751
+ rv_gen = getattr(urn, self.rvs_name)
1752
+ rvs = rv_gen(N, n, M, odds, length, random_state)
1753
+ rvs = rvs.reshape(size)
1754
+ return rvs
1755
+
1756
+ return _rvs1(M, n, N, odds, size=size, random_state=random_state)
1757
+
1758
+ def _pmf(self, x, M, n, N, odds):
1759
+
1760
+ x, M, n, N, odds = np.broadcast_arrays(x, M, n, N, odds)
1761
+ if x.size == 0: # np.vectorize doesn't work with zero size input
1762
+ return np.empty_like(x)
1763
+
1764
+ @np.vectorize
1765
+ def _pmf1(x, M, n, N, odds):
1766
+ urn = self.dist(N, n, M, odds, 1e-12)
1767
+ return urn.probability(x)
1768
+
1769
+ return _pmf1(x, M, n, N, odds)
1770
+
1771
+ def _stats(self, M, n, N, odds, moments):
1772
+
1773
+ @np.vectorize
1774
+ def _moments1(M, n, N, odds):
1775
+ urn = self.dist(N, n, M, odds, 1e-12)
1776
+ return urn.moments()
1777
+
1778
+ m, v = (_moments1(M, n, N, odds) if ("m" in moments or "v" in moments)
1779
+ else (None, None))
1780
+ s, k = None, None
1781
+ return m, v, s, k
1782
+
1783
+
1784
+ class nchypergeom_fisher_gen(_nchypergeom_gen):
1785
+ r"""A Fisher's noncentral hypergeometric discrete random variable.
1786
+
1787
+ Fisher's noncentral hypergeometric distribution models drawing objects of
1788
+ two types from a bin. `M` is the total number of objects, `n` is the
1789
+ number of Type I objects, and `odds` is the odds ratio: the odds of
1790
+ selecting a Type I object rather than a Type II object when there is only
1791
+ one object of each type.
1792
+ The random variate represents the number of Type I objects drawn if we
1793
+ take a handful of objects from the bin at once and find out afterwards
1794
+ that we took `N` objects.
1795
+
1796
+ %(before_notes)s
1797
+
1798
+ See Also
1799
+ --------
1800
+ nchypergeom_wallenius, hypergeom, nhypergeom
1801
+
1802
+ Notes
1803
+ -----
1804
+ Let mathematical symbols :math:`N`, :math:`n`, and :math:`M` correspond
1805
+ with parameters `N`, `n`, and `M` (respectively) as defined above.
1806
+
1807
+ The probability mass function is defined as
1808
+
1809
+ .. math::
1810
+
1811
+ p(x; M, n, N, \omega) =
1812
+ \frac{\binom{n}{x}\binom{M - n}{N-x}\omega^x}{P_0},
1813
+
1814
+ for
1815
+ :math:`x \in [x_l, x_u]`,
1816
+ :math:`M \in {\mathbb N}`,
1817
+ :math:`n \in [0, M]`,
1818
+ :math:`N \in [0, M]`,
1819
+ :math:`\omega > 0`,
1820
+ where
1821
+ :math:`x_l = \max(0, N - (M - n))`,
1822
+ :math:`x_u = \min(N, n)`,
1823
+
1824
+ .. math::
1825
+
1826
+ P_0 = \sum_{y=x_l}^{x_u} \binom{n}{y}\binom{M - n}{N-y}\omega^y,
1827
+
1828
+ and the binomial coefficients are defined as
1829
+
1830
+ .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
1831
+
1832
+ `nchypergeom_fisher` uses the BiasedUrn package by Agner Fog with
1833
+ permission for it to be distributed under SciPy's license.
1834
+
1835
+ The symbols used to denote the shape parameters (`N`, `n`, and `M`) are not
1836
+ universally accepted; they are chosen for consistency with `hypergeom`.
1837
+
1838
+ Note that Fisher's noncentral hypergeometric distribution is distinct
1839
+ from Wallenius' noncentral hypergeometric distribution, which models
1840
+ drawing a pre-determined `N` objects from a bin one by one.
1841
+ When the odds ratio is unity, however, both distributions reduce to the
1842
+ ordinary hypergeometric distribution.
1843
+
1844
+ %(after_notes)s
1845
+
1846
+ References
1847
+ ----------
1848
+ .. [1] Agner Fog, "Biased Urn Theory".
1849
+ https://cran.r-project.org/web/packages/BiasedUrn/vignettes/UrnTheory.pdf
1850
+
1851
+ .. [2] "Fisher's noncentral hypergeometric distribution", Wikipedia,
1852
+ https://en.wikipedia.org/wiki/Fisher's_noncentral_hypergeometric_distribution
1853
+
1854
+ %(example)s
1855
+
1856
+ """
1857
+
1858
+ rvs_name = "rvs_fisher"
1859
+ dist = _PyFishersNCHypergeometric
1860
+
1861
+
1862
+ nchypergeom_fisher = nchypergeom_fisher_gen(
1863
+ name='nchypergeom_fisher',
1864
+ longname="A Fisher's noncentral hypergeometric")
1865
+
1866
+
1867
+ class nchypergeom_wallenius_gen(_nchypergeom_gen):
1868
+ r"""A Wallenius' noncentral hypergeometric discrete random variable.
1869
+
1870
+ Wallenius' noncentral hypergeometric distribution models drawing objects of
1871
+ two types from a bin. `M` is the total number of objects, `n` is the
1872
+ number of Type I objects, and `odds` is the odds ratio: the odds of
1873
+ selecting a Type I object rather than a Type II object when there is only
1874
+ one object of each type.
1875
+ The random variate represents the number of Type I objects drawn if we
1876
+ draw a pre-determined `N` objects from a bin one by one.
1877
+
1878
+ %(before_notes)s
1879
+
1880
+ See Also
1881
+ --------
1882
+ nchypergeom_fisher, hypergeom, nhypergeom
1883
+
1884
+ Notes
1885
+ -----
1886
+ Let mathematical symbols :math:`N`, :math:`n`, and :math:`M` correspond
1887
+ with parameters `N`, `n`, and `M` (respectively) as defined above.
1888
+
1889
+ The probability mass function is defined as
1890
+
1891
+ .. math::
1892
+
1893
+ p(x; N, n, M) = \binom{n}{x} \binom{M - n}{N-x}
1894
+ \int_0^1 \left(1-t^{\omega/D}\right)^x\left(1-t^{1/D}\right)^{N-x} dt
1895
+
1896
+ for
1897
+ :math:`x \in [x_l, x_u]`,
1898
+ :math:`M \in {\mathbb N}`,
1899
+ :math:`n \in [0, M]`,
1900
+ :math:`N \in [0, M]`,
1901
+ :math:`\omega > 0`,
1902
+ where
1903
+ :math:`x_l = \max(0, N - (M - n))`,
1904
+ :math:`x_u = \min(N, n)`,
1905
+
1906
+ .. math::
1907
+
1908
+ D = \omega(n - x) + ((M - n)-(N-x)),
1909
+
1910
+ and the binomial coefficients are defined as
1911
+
1912
+ .. math:: \binom{n}{k} \equiv \frac{n!}{k! (n - k)!}.
1913
+
1914
+ `nchypergeom_wallenius` uses the BiasedUrn package by Agner Fog with
1915
+ permission for it to be distributed under SciPy's license.
1916
+
1917
+ The symbols used to denote the shape parameters (`N`, `n`, and `M`) are not
1918
+ universally accepted; they are chosen for consistency with `hypergeom`.
1919
+
1920
+ Note that Wallenius' noncentral hypergeometric distribution is distinct
1921
+ from Fisher's noncentral hypergeometric distribution, which models
1922
+ taking a handful of objects from the bin at once, finding out afterwards
1923
+ that `N` objects were taken.
1924
+ When the odds ratio is unity, however, both distributions reduce to the
1925
+ ordinary hypergeometric distribution.
1926
+
1927
+ %(after_notes)s
1928
+
1929
+ References
1930
+ ----------
1931
+ .. [1] Agner Fog, "Biased Urn Theory".
1932
+ https://cran.r-project.org/web/packages/BiasedUrn/vignettes/UrnTheory.pdf
1933
+
1934
+ .. [2] "Wallenius' noncentral hypergeometric distribution", Wikipedia,
1935
+ https://en.wikipedia.org/wiki/Wallenius'_noncentral_hypergeometric_distribution
1936
+
1937
+ %(example)s
1938
+
1939
+ """
1940
+
1941
+ rvs_name = "rvs_wallenius"
1942
+ dist = _PyWalleniusNCHypergeometric
1943
+
1944
+
1945
+ nchypergeom_wallenius = nchypergeom_wallenius_gen(
1946
+ name='nchypergeom_wallenius',
1947
+ longname="A Wallenius' noncentral hypergeometric")
1948
+
1949
+
1950
+ # Collect names of classes and objects in this module.
1951
+ pairs = list(globals().copy().items())
1952
+ _distn_names, _distn_gen_names = get_distribution_names(pairs, rv_discrete)
1953
+
1954
+ __all__ = _distn_names + _distn_gen_names
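
Both docstrings above note that the noncentral hypergeometric distributions reduce to the ordinary hypergeometric distribution when the odds ratio is one. A minimal numerical check of that claim (editor's sketch, not part of the diff; the parameter values are arbitrary):

import numpy as np
from scipy.stats import hypergeom, nchypergeom_fisher, nchypergeom_wallenius

M, n, N = 20, 7, 12                                   # total objects, Type I objects, number drawn
x = np.arange(max(0, N - (M - n)), min(N, n) + 1)     # support [x_l, x_u]

pmf_hyper = hypergeom.pmf(x, M, n, N)
pmf_fisher = nchypergeom_fisher.pmf(x, M, n, N, 1)    # odds ratio = 1
pmf_wallenius = nchypergeom_wallenius.pmf(x, M, n, N, 1)

print(np.allclose(pmf_fisher, pmf_hyper))             # expected: True
print(np.allclose(pmf_wallenius, pmf_hyper))          # expected: True
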
.venv/Lib/site-packages/scipy/stats/_distn_infrastructure.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_distr_params.py ADDED
@@ -0,0 +1,288 @@
1
+ """
2
+ Sane parameters for stats.distributions.
3
+ """
4
+ import numpy as np
5
+
6
+ distcont = [
7
+ ['alpha', (3.5704770516650459,)],
8
+ ['anglit', ()],
9
+ ['arcsine', ()],
10
+ ['argus', (1.0,)],
11
+ ['beta', (2.3098496451481823, 0.62687954300963677)],
12
+ ['betaprime', (5, 6)],
13
+ ['bradford', (0.29891359763170633,)],
14
+ ['burr', (10.5, 4.3)],
15
+ ['burr12', (10, 4)],
16
+ ['cauchy', ()],
17
+ ['chi', (78,)],
18
+ ['chi2', (55,)],
19
+ ['cosine', ()],
20
+ ['crystalball', (2.0, 3.0)],
21
+ ['dgamma', (1.1023326088288166,)],
22
+ ['dweibull', (2.0685080649914673,)],
23
+ ['erlang', (10,)],
24
+ ['expon', ()],
25
+ ['exponnorm', (1.5,)],
26
+ ['exponpow', (2.697119160358469,)],
27
+ ['exponweib', (2.8923945291034436, 1.9505288745913174)],
28
+ ['f', (29, 18)],
29
+ ['fatiguelife', (29,)], # correction numargs = 1
30
+ ['fisk', (3.0857548622253179,)],
31
+ ['foldcauchy', (4.7164673455831894,)],
32
+ ['foldnorm', (1.9521253373555869,)],
33
+ ['gamma', (1.9932305483800778,)],
34
+ ['gausshyper', (13.763771604130699, 3.1189636648681431,
35
+ 2.5145980350183019, 5.1811649903971615)], # veryslow
36
+ ['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
37
+ ['genextreme', (-0.1,)],
38
+ ['gengamma', (4.4162385429431925, 3.1193091679242761)],
39
+ ['gengamma', (4.4162385429431925, -3.1193091679242761)],
40
+ ['genhalflogistic', (0.77274727809929322,)],
41
+ ['genhyperbolic', (0.5, 1.5, -0.5,)],
42
+ ['geninvgauss', (2.3, 1.5)],
43
+ ['genlogistic', (0.41192440799679475,)],
44
+ ['gennorm', (1.2988442399460265,)],
45
+ ['halfgennorm', (0.6748054997000371,)],
46
+ ['genpareto', (0.1,)], # use case with finite moments
47
+ ['gibrat', ()],
48
+ ['gompertz', (0.94743713075105251,)],
49
+ ['gumbel_l', ()],
50
+ ['gumbel_r', ()],
51
+ ['halfcauchy', ()],
52
+ ['halflogistic', ()],
53
+ ['halfnorm', ()],
54
+ ['hypsecant', ()],
55
+ ['invgamma', (4.0668996136993067,)],
56
+ ['invgauss', (0.14546264555347513,)],
57
+ ['invweibull', (10.58,)],
58
+ ['jf_skew_t', (8, 4)],
59
+ ['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
60
+ ['johnsonsu', (2.554395574161155, 2.2482281679651965)],
61
+ ['kappa4', (0.0, 0.0)],
62
+ ['kappa4', (-0.1, 0.1)],
63
+ ['kappa4', (0.0, 0.1)],
64
+ ['kappa4', (0.1, 0.0)],
65
+ ['kappa3', (1.0,)],
66
+ ['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956
67
+ ['kstwo', (10,)],
68
+ ['kstwobign', ()],
69
+ ['laplace', ()],
70
+ ['laplace_asymmetric', (2,)],
71
+ ['levy', ()],
72
+ ['levy_l', ()],
73
+ ['levy_stable', (1.8, -0.5)],
74
+ ['loggamma', (0.41411931826052117,)],
75
+ ['logistic', ()],
76
+ ['loglaplace', (3.2505926592051435,)],
77
+ ['lognorm', (0.95368226960575331,)],
78
+ ['loguniform', (0.01, 1.25)],
79
+ ['lomax', (1.8771398388773268,)],
80
+ ['maxwell', ()],
81
+ ['mielke', (10.4, 4.6)],
82
+ ['moyal', ()],
83
+ ['nakagami', (4.9673794866666237,)],
84
+ ['ncf', (27, 27, 0.41578441799226107)],
85
+ ['nct', (14, 0.24045031331198066)],
86
+ ['ncx2', (21, 1.0560465975116415)],
87
+ ['norm', ()],
88
+ ['norminvgauss', (1.25, 0.5)],
89
+ ['pareto', (2.621716532144454,)],
90
+ ['pearson3', (0.1,)],
91
+ ['pearson3', (-2,)],
92
+ ['powerlaw', (1.6591133289905851,)],
93
+ ['powerlaw', (0.6591133289905851,)],
94
+ ['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
95
+ ['powernorm', (4.4453652254590779,)],
96
+ ['rayleigh', ()],
97
+ ['rdist', (1.6,)],
98
+ ['recipinvgauss', (0.63004267809369119,)],
99
+ ['reciprocal', (0.01, 1.25)],
100
+ ['rel_breitwigner', (36.545206797050334, )],
101
+ ['rice', (0.7749725210111873,)],
102
+ ['semicircular', ()],
103
+ ['skewcauchy', (0.5,)],
104
+ ['skewnorm', (4.0,)],
105
+ ['studentized_range', (3.0, 10.0)],
106
+ ['t', (2.7433514990818093,)],
107
+ ['trapezoid', (0.2, 0.8)],
108
+ ['triang', (0.15785029824528218,)],
109
+ ['truncexpon', (4.6907725456810478,)],
110
+ ['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
111
+ ['truncnorm', (0.1, 2.)],
112
+ ['truncpareto', (1.8, 5.3)],
113
+ ['truncpareto', (2, 5)],
114
+ ['truncweibull_min', (2.5, 0.25, 1.75)],
115
+ ['tukeylambda', (3.1321477856738267,)],
116
+ ['uniform', ()],
117
+ ['vonmises', (3.9939042581071398,)],
118
+ ['vonmises_line', (3.9939042581071398,)],
119
+ ['wald', ()],
120
+ ['weibull_max', (2.8687961709100187,)],
121
+ ['weibull_min', (1.7866166930421596,)],
122
+ ['wrapcauchy', (0.031071279018614728,)]]
123
+
124
+
125
+ distdiscrete = [
126
+ ['bernoulli',(0.3,)],
127
+ ['betabinom', (5, 2.3, 0.63)],
128
+ ['betanbinom', (5, 9.3, 1)],
129
+ ['binom', (5, 0.4)],
130
+ ['boltzmann',(1.4, 19)],
131
+ ['dlaplace', (0.8,)], # 0.5
132
+ ['geom', (0.5,)],
133
+ ['hypergeom',(30, 12, 6)],
134
+ ['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921
135
+ ['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921
136
+ ['nchypergeom_fisher', (140, 80, 60, 0.5)],
137
+ ['nchypergeom_wallenius', (140, 80, 60, 0.5)],
138
+ ['logser', (0.6,)], # re-enabled, numpy ticket:921
139
+ ['nbinom', (0.4, 0.4)], # from tickets: 583
140
+ ['nbinom', (5, 0.5)],
141
+ ['planck', (0.51,)], # 4.1
142
+ ['poisson', (0.6,)],
143
+ ['randint', (7, 31)],
144
+ ['skellam', (15, 8)],
145
+ ['zipf', (6.6,)],
146
+ ['zipfian', (0.75, 15)],
147
+ ['zipfian', (1.25, 10)],
148
+ ['yulesimon', (11.0,)],
149
+ ['nhypergeom', (20, 7, 1)]
150
+ ]
151
+
152
+
153
+ invdistdiscrete = [
154
+ # In each of the following, at least one shape parameter is invalid
155
+ ['hypergeom', (3, 3, 4)],
156
+ ['nhypergeom', (5, 2, 8)],
157
+ ['nchypergeom_fisher', (3, 3, 4, 1)],
158
+ ['nchypergeom_wallenius', (3, 3, 4, 1)],
159
+ ['bernoulli', (1.5, )],
160
+ ['binom', (10, 1.5)],
161
+ ['betabinom', (10, -0.4, -0.5)],
162
+ ['betanbinom', (10, -0.4, -0.5)],
163
+ ['boltzmann', (-1, 4)],
164
+ ['dlaplace', (-0.5, )],
165
+ ['geom', (1.5, )],
166
+ ['logser', (1.5, )],
167
+ ['nbinom', (10, 1.5)],
168
+ ['planck', (-0.5, )],
169
+ ['poisson', (-0.5, )],
170
+ ['randint', (5, 2)],
171
+ ['skellam', (-5, -2)],
172
+ ['zipf', (-2, )],
173
+ ['yulesimon', (-2, )],
174
+ ['zipfian', (-0.75, 15)]
175
+ ]
176
+
177
+
178
+ invdistcont = [
179
+ # In each of the following, at least one shape parameter is invalid
180
+ ['alpha', (-1, )],
181
+ ['anglit', ()],
182
+ ['arcsine', ()],
183
+ ['argus', (-1, )],
184
+ ['beta', (-2, 2)],
185
+ ['betaprime', (-2, 2)],
186
+ ['bradford', (-1, )],
187
+ ['burr', (-1, 1)],
188
+ ['burr12', (-1, 1)],
189
+ ['cauchy', ()],
190
+ ['chi', (-1, )],
191
+ ['chi2', (-1, )],
192
+ ['cosine', ()],
193
+ ['crystalball', (-1, 2)],
194
+ ['dgamma', (-1, )],
195
+ ['dweibull', (-1, )],
196
+ ['erlang', (-1, )],
197
+ ['expon', ()],
198
+ ['exponnorm', (-1, )],
199
+ ['exponweib', (1, -1)],
200
+ ['exponpow', (-1, )],
201
+ ['f', (10, -10)],
202
+ ['fatiguelife', (-1, )],
203
+ ['fisk', (-1, )],
204
+ ['foldcauchy', (-1, )],
205
+ ['foldnorm', (-1, )],
206
+ ['genlogistic', (-1, )],
207
+ ['gennorm', (-1, )],
208
+ ['genpareto', (np.inf, )],
209
+ ['genexpon', (1, 2, -3)],
210
+ ['genextreme', (np.inf, )],
211
+ ['genhyperbolic', (0.5, -0.5, -1.5,)],
212
+ ['gausshyper', (1, 2, 3, -4)],
213
+ ['gamma', (-1, )],
214
+ ['gengamma', (-1, 0)],
215
+ ['genhalflogistic', (-1, )],
216
+ ['geninvgauss', (1, 0)],
217
+ ['gibrat', ()],
218
+ ['gompertz', (-1, )],
219
+ ['gumbel_r', ()],
220
+ ['gumbel_l', ()],
221
+ ['halfcauchy', ()],
222
+ ['halflogistic', ()],
223
+ ['halfnorm', ()],
224
+ ['halfgennorm', (-1, )],
225
+ ['hypsecant', ()],
226
+ ['invgamma', (-1, )],
227
+ ['invgauss', (-1, )],
228
+ ['invweibull', (-1, )],
229
+ ['jf_skew_t', (-1, 0)],
230
+ ['johnsonsb', (1, -2)],
231
+ ['johnsonsu', (1, -2)],
232
+ ['kappa4', (np.nan, 0)],
233
+ ['kappa3', (-1, )],
234
+ ['ksone', (-1, )],
235
+ ['kstwo', (-1, )],
236
+ ['kstwobign', ()],
237
+ ['laplace', ()],
238
+ ['laplace_asymmetric', (-1, )],
239
+ ['levy', ()],
240
+ ['levy_l', ()],
241
+ ['levy_stable', (-1, 1)],
242
+ ['logistic', ()],
243
+ ['loggamma', (-1, )],
244
+ ['loglaplace', (-1, )],
245
+ ['lognorm', (-1, )],
246
+ ['loguniform', (10, 5)],
247
+ ['lomax', (-1, )],
248
+ ['maxwell', ()],
249
+ ['mielke', (1, -2)],
250
+ ['moyal', ()],
251
+ ['nakagami', (-1, )],
252
+ ['ncx2', (-1, 2)],
253
+ ['ncf', (10, 20, -1)],
254
+ ['nct', (-1, 2)],
255
+ ['norm', ()],
256
+ ['norminvgauss', (5, -10)],
257
+ ['pareto', (-1, )],
258
+ ['pearson3', (np.nan, )],
259
+ ['powerlaw', (-1, )],
260
+ ['powerlognorm', (1, -2)],
261
+ ['powernorm', (-1, )],
262
+ ['rdist', (-1, )],
263
+ ['rayleigh', ()],
264
+ ['rice', (-1, )],
265
+ ['recipinvgauss', (-1, )],
266
+ ['semicircular', ()],
267
+ ['skewnorm', (np.inf, )],
268
+ ['studentized_range', (-1, 1)],
269
+ ['rel_breitwigner', (-2, )],
270
+ ['t', (-1, )],
271
+ ['trapezoid', (0, 2)],
272
+ ['triang', (2, )],
273
+ ['truncexpon', (-1, )],
274
+ ['truncnorm', (10, 5)],
275
+ ['truncpareto', (-1, 5)],
276
+ ['truncpareto', (1.8, .5)],
277
+ ['truncweibull_min', (-2.5, 0.25, 1.75)],
278
+ ['tukeylambda', (np.nan, )],
279
+ ['uniform', ()],
280
+ ['vonmises', (-1, )],
281
+ ['vonmises_line', (-1, )],
282
+ ['wald', ()],
283
+ ['weibull_min', (-1, )],
284
+ ['weibull_max', (-1, )],
285
+ ['wrapcauchy', (2, )],
286
+ ['reciprocal', (15, 10)],
287
+ ['skewcauchy', (2, )]
288
+ ]
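
The tables in `_distr_params.py` pair each distribution name with shape parameters that are known to be "sane". A small sketch (editor's illustration, not part of the diff) of how such a table can be consumed generically; note that `_distr_params` is a private module, so the import path is an implementation detail and may change:

from scipy import stats
from scipy.stats._distr_params import distcont   # private module; path may change

for name, shapes in distcont[:5]:                 # first few entries only
    frozen = getattr(stats, name)(*shapes)        # freeze with the listed shapes
    print(f"{name:10s} median = {frozen.ppf(0.5):.4f}")
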
.venv/Lib/site-packages/scipy/stats/_entropy.py ADDED
@@ -0,0 +1,423 @@
1
+ """
2
+ Created on Fri Apr 2 09:06:05 2021
3
+
4
+ @author: matth
5
+ """
6
+
7
+ from __future__ import annotations
8
+ import math
9
+ import numpy as np
10
+ from scipy import special
11
+ from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
12
+
13
+ __all__ = ['entropy', 'differential_entropy']
14
+
15
+
16
+ @_axis_nan_policy_factory(
17
+ lambda x: x,
18
+ n_samples=lambda kwgs: (
19
+ 2 if ("qk" in kwgs and kwgs["qk"] is not None)
20
+ else 1
21
+ ),
22
+ n_outputs=1, result_to_tuple=lambda x: (x,), paired=True,
23
+ too_small=-1 # entropy doesn't have too small inputs
24
+ )
25
+ def entropy(pk: np.typing.ArrayLike,
26
+ qk: np.typing.ArrayLike | None = None,
27
+ base: float | None = None,
28
+ axis: int = 0
29
+ ) -> np.number | np.ndarray:
30
+ """
31
+ Calculate the Shannon entropy/relative entropy of given distribution(s).
32
+
33
+ If only probabilities `pk` are given, the Shannon entropy is calculated as
34
+ ``H = -sum(pk * log(pk))``.
35
+
36
+ If `qk` is not None, then compute the relative entropy
37
+ ``D = sum(pk * log(pk / qk))``. This quantity is also known
38
+ as the Kullback-Leibler divergence.
39
+
40
+ This routine will normalize `pk` and `qk` if they don't sum to 1.
41
+
42
+ Parameters
43
+ ----------
44
+ pk : array_like
45
+ Defines the (discrete) distribution. Along each axis-slice of ``pk``,
46
+ element ``i`` is the (possibly unnormalized) probability of event
47
+ ``i``.
48
+ qk : array_like, optional
49
+ Sequence against which the relative entropy is computed. Should be in
50
+ the same format as `pk`.
51
+ base : float, optional
52
+ The logarithmic base to use, defaults to ``e`` (natural logarithm).
53
+ axis : int, optional
54
+ The axis along which the entropy is calculated. Default is 0.
55
+
56
+ Returns
57
+ -------
58
+ S : {float, array_like}
59
+ The calculated entropy.
60
+
61
+ Notes
62
+ -----
63
+ Informally, the Shannon entropy quantifies the expected uncertainty
64
+ inherent in the possible outcomes of a discrete random variable.
65
+ For example,
66
+ if messages consisting of sequences of symbols from a set are to be
67
+ encoded and transmitted over a noiseless channel, then the Shannon entropy
68
+ ``H(pk)`` gives a tight lower bound for the average number of units of
69
+ information needed per symbol if the symbols occur with frequencies
70
+ governed by the discrete distribution `pk` [1]_. The choice of base
71
+ determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
72
+
73
+ The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
74
+ number of units of information needed per symbol if the encoding is
75
+ optimized for the probability distribution `qk` instead of the true
76
+ distribution `pk`. Informally, the relative entropy quantifies the expected
77
+ excess in surprise experienced if one believes the true distribution is
78
+ `qk` when it is actually `pk`.
79
+
80
+ A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
81
+ equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
82
+ the formula ``CE = -sum(pk * log(qk))``. It gives the average
83
+ number of units of information needed per symbol if an encoding is
84
+ optimized for the probability distribution `qk` when the true distribution
85
+ is `pk`. It is not computed directly by `entropy`, but it can be computed
86
+ using two calls to the function (see Examples).
87
+
88
+ See [2]_ for more information.
89
+
90
+ References
91
+ ----------
92
+ .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
93
+ Bell System Technical Journal, 27: 379-423.
94
+ https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
95
+ .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
96
+ Theory (Wiley Series in Telecommunications and Signal Processing).
97
+ Wiley-Interscience, USA.
98
+
99
+
100
+ Examples
101
+ --------
102
+ The outcome of a fair coin is the most uncertain:
103
+
104
+ >>> import numpy as np
105
+ >>> from scipy.stats import entropy
106
+ >>> base = 2 # work in units of bits
107
+ >>> pk = np.array([1/2, 1/2]) # fair coin
108
+ >>> H = entropy(pk, base=base)
109
+ >>> H
110
+ 1.0
111
+ >>> H == -np.sum(pk * np.log(pk)) / np.log(base)
112
+ True
113
+
114
+ The outcome of a biased coin is less uncertain:
115
+
116
+ >>> qk = np.array([9/10, 1/10]) # biased coin
117
+ >>> entropy(qk, base=base)
118
+ 0.46899559358928117
119
+
120
+ The relative entropy between the fair coin and biased coin is calculated
121
+ as:
122
+
123
+ >>> D = entropy(pk, qk, base=base)
124
+ >>> D
125
+ 0.7369655941662062
126
+ >>> D == np.sum(pk * np.log(pk/qk)) / np.log(base)
127
+ True
128
+
129
+ The cross entropy can be calculated as the sum of the entropy and
130
+ relative entropy:
131
+
132
+ >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
133
+ >>> CE
134
+ 1.736965594166206
135
+ >>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
136
+ True
137
+
138
+ """
139
+ if base is not None and base <= 0:
140
+ raise ValueError("`base` must be a positive number or `None`.")
141
+
142
+ pk = np.asarray(pk)
143
+ with np.errstate(invalid='ignore'):
144
+ pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
145
+ if qk is None:
146
+ vec = special.entr(pk)
147
+ else:
148
+ qk = np.asarray(qk)
149
+ pk, qk = _broadcast_arrays((pk, qk), axis=None) # don't ignore any axes
150
+ sum_kwargs = dict(axis=axis, keepdims=True)
151
+ qk = 1.0*qk / np.sum(qk, **sum_kwargs) # type: ignore[operator, call-overload]
152
+ vec = special.rel_entr(pk, qk)
153
+ S = np.sum(vec, axis=axis)
154
+ if base is not None:
155
+ S /= np.log(base)
156
+ return S
157
+
158
+
159
+ def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
160
+ values = samples[0]
161
+ n = values.shape[axis]
162
+ window_length = kwargs.get("window_length",
163
+ math.floor(math.sqrt(n) + 0.5))
164
+ if not 2 <= 2 * window_length < n:
165
+ return True
166
+ return False
167
+
168
+
169
+ @_axis_nan_policy_factory(
170
+ lambda x: x, n_outputs=1, result_to_tuple=lambda x: (x,),
171
+ too_small=_differential_entropy_is_too_small
172
+ )
173
+ def differential_entropy(
174
+ values: np.typing.ArrayLike,
175
+ *,
176
+ window_length: int | None = None,
177
+ base: float | None = None,
178
+ axis: int = 0,
179
+ method: str = "auto",
180
+ ) -> np.number | np.ndarray:
181
+ r"""Given a sample of a distribution, estimate the differential entropy.
182
+
183
+ Several estimation methods are available using the `method` parameter. By
184
+ default, a method is selected based on the size of the sample.
185
+
186
+ Parameters
187
+ ----------
188
+ values : sequence
189
+ Sample from a continuous distribution.
190
+ window_length : int, optional
191
+ Window length for computing Vasicek estimate. Must be an integer
192
+ between 1 and half of the sample size. If ``None`` (the default), it
193
+ uses the heuristic value
194
+
195
+ .. math::
196
+ \left \lfloor \sqrt{n} + 0.5 \right \rfloor
197
+
198
+ where :math:`n` is the sample size. This heuristic was originally
199
+ proposed in [2]_ and has become common in the literature.
200
+ base : float, optional
201
+ The logarithmic base to use, defaults to ``e`` (natural logarithm).
202
+ axis : int, optional
203
+ The axis along which the differential entropy is calculated.
204
+ Default is 0.
205
+ method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
206
+ The method used to estimate the differential entropy from the sample.
207
+ Default is ``'auto'``. See Notes for more information.
208
+
209
+ Returns
210
+ -------
211
+ entropy : float
212
+ The calculated differential entropy.
213
+
214
+ Notes
215
+ -----
216
+ This function will converge to the true differential entropy in the limit
217
+
218
+ .. math::
219
+ n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
220
+
221
+ The optimal choice of ``window_length`` for a given sample size depends on
222
+ the (unknown) distribution. Typically, the smoother the density of the
223
+ distribution, the larger the optimal value of ``window_length`` [1]_.
224
+
225
+ The following options are available for the `method` parameter.
226
+
227
+ * ``'vasicek'`` uses the estimator presented in [1]_. This is
228
+ one of the first and most influential estimators of differential entropy.
229
+ * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
230
+ is not only consistent but, under some conditions, asymptotically normal.
231
+ * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
232
+ in simulation to have smaller bias and mean squared error than
233
+ the Vasicek estimator.
234
+ * ``'correa'`` uses the estimator presented in [5]_ based on local linear
235
+ regression. In a simulation study, it had consistently smaller mean
236
+ square error than the Vasiceck estimator, but it is more expensive to
237
+ compute.
238
+ * ``'auto'`` selects the method automatically (default). Currently,
239
+ this selects ``'van es'`` for very small samples (<10), ``'ebrahimi'``
240
+ for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
241
+ samples, but this behavior is subject to change in future versions.
242
+
243
+ All estimators are implemented as described in [6]_.
244
+
245
+ References
246
+ ----------
247
+ .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
248
+ Journal of the Royal Statistical Society:
249
+ Series B (Methodological), 38(1), 54-59.
250
+ .. [2] Crzcgorzewski, P., & Wirczorkowski, R. (1999). Entropy-based
251
+ goodness-of-fit test for exponentiality. Communications in
252
+ Statistics-Theory and Methods, 28(5), 1183-1202.
253
+ .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
254
+ class of statistics based on spacings. Scandinavian Journal of
255
+ Statistics, 61-72.
256
+ .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
257
+ of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
258
+ .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
259
+ in Statistics-Theory and Methods, 24(10), 2439-2449.
260
+ .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
261
+ Annals of Data Science, 2(2), 231-241.
262
+ https://link.springer.com/article/10.1007/s40745-015-0045-9
263
+
264
+ Examples
265
+ --------
266
+ >>> import numpy as np
267
+ >>> from scipy.stats import differential_entropy, norm
268
+
269
+ Entropy of a standard normal distribution:
270
+
271
+ >>> rng = np.random.default_rng()
272
+ >>> values = rng.standard_normal(100)
273
+ >>> differential_entropy(values)
274
+ 1.3407817436640392
275
+
276
+ Compare with the true entropy:
277
+
278
+ >>> float(norm.entropy())
279
+ 1.4189385332046727
280
+
281
+ For several sample sizes between 5 and 1000, compare the accuracy of
282
+ the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
283
+ compare the root mean squared error (over 1000 trials) between the estimate
284
+ and the true differential entropy of the distribution.
285
+
286
+ >>> from scipy import stats
287
+ >>> import matplotlib.pyplot as plt
288
+ >>>
289
+ >>>
290
+ >>> def rmse(res, expected):
291
+ ... '''Root mean squared error'''
292
+ ... return np.sqrt(np.mean((res - expected)**2))
293
+ >>>
294
+ >>>
295
+ >>> a, b = np.log10(5), np.log10(1000)
296
+ >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
297
+ >>> reps = 1000 # number of repetitions for each sample size
298
+ >>> expected = stats.expon.entropy()
299
+ >>>
300
+ >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
301
+ >>> for method in method_errors:
302
+ ... for n in ns:
303
+ ... rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
304
+ ... res = stats.differential_entropy(rvs, method=method, axis=-1)
305
+ ... error = rmse(res, expected)
306
+ ... method_errors[method].append(error)
307
+ >>>
308
+ >>> for method, errors in method_errors.items():
309
+ ... plt.loglog(ns, errors, label=method)
310
+ >>>
311
+ >>> plt.legend()
312
+ >>> plt.xlabel('sample size')
313
+ >>> plt.ylabel('RMSE (1000 trials)')
314
+ >>> plt.title('Entropy Estimator Error (Exponential Distribution)')
315
+
316
+ """
317
+ values = np.asarray(values)
318
+ values = np.moveaxis(values, axis, -1)
319
+ n = values.shape[-1] # number of observations
320
+
321
+ if window_length is None:
322
+ window_length = math.floor(math.sqrt(n) + 0.5)
323
+
324
+ if not 2 <= 2 * window_length < n:
325
+ raise ValueError(
326
+ f"Window length ({window_length}) must be positive and less "
327
+ f"than half the sample size ({n}).",
328
+ )
329
+
330
+ if base is not None and base <= 0:
331
+ raise ValueError("`base` must be a positive number or `None`.")
332
+
333
+ sorted_data = np.sort(values, axis=-1)
334
+
335
+ methods = {"vasicek": _vasicek_entropy,
336
+ "van es": _van_es_entropy,
337
+ "correa": _correa_entropy,
338
+ "ebrahimi": _ebrahimi_entropy,
339
+ "auto": _vasicek_entropy}
340
+ method = method.lower()
341
+ if method not in methods:
342
+ message = f"`method` must be one of {set(methods)}"
343
+ raise ValueError(message)
344
+
345
+ if method == "auto":
346
+ if n <= 10:
347
+ method = 'van es'
348
+ elif n <= 1000:
349
+ method = 'ebrahimi'
350
+ else:
351
+ method = 'vasicek'
352
+
353
+ res = methods[method](sorted_data, window_length)
354
+
355
+ if base is not None:
356
+ res /= np.log(base)
357
+
358
+ return res
359
+
360
+
361
+ def _pad_along_last_axis(X, m):
362
+ """Pad the data for computing the rolling window difference."""
363
+ # scales a bit better than method in _vasicek_like_entropy
364
+ shape = np.array(X.shape)
365
+ shape[-1] = m
366
+ Xl = np.broadcast_to(X[..., [0]], shape) # [0] vs 0 to maintain shape
367
+ Xr = np.broadcast_to(X[..., [-1]], shape)
368
+ return np.concatenate((Xl, X, Xr), axis=-1)
369
+
370
+
371
+ def _vasicek_entropy(X, m):
372
+ """Compute the Vasicek estimator as described in [6] Eq. 1.3."""
373
+ n = X.shape[-1]
374
+ X = _pad_along_last_axis(X, m)
375
+ differences = X[..., 2 * m:] - X[..., : -2 * m:]
376
+ logs = np.log(n/(2*m) * differences)
377
+ return np.mean(logs, axis=-1)
378
+
379
+
380
+ def _van_es_entropy(X, m):
381
+ """Compute the van Es estimator as described in [6]."""
382
+ # No equation number, but referred to as HVE_mn.
383
+ # Typo: there should be a log within the summation.
384
+ n = X.shape[-1]
385
+ difference = X[..., m:] - X[..., :-m]
386
+ term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
387
+ k = np.arange(m, n+1)
388
+ return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)
389
+
390
+
391
+ def _ebrahimi_entropy(X, m):
392
+ """Compute the Ebrahimi estimator as described in [6]."""
393
+ # No equation number, but referred to as HE_mn
394
+ n = X.shape[-1]
395
+ X = _pad_along_last_axis(X, m)
396
+
397
+ differences = X[..., 2 * m:] - X[..., : -2 * m:]
398
+
399
+ i = np.arange(1, n+1).astype(float)
400
+ ci = np.ones_like(i)*2
401
+ ci[i <= m] = 1 + (i[i <= m] - 1)/m
402
+ ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m
403
+
404
+ logs = np.log(n * differences / (ci * m))
405
+ return np.mean(logs, axis=-1)
406
+
407
+
408
+ def _correa_entropy(X, m):
409
+ """Compute the Correa estimator as described in [6]."""
410
+ # No equation number, but referred to as HC_mn
411
+ n = X.shape[-1]
412
+ X = _pad_along_last_axis(X, m)
413
+
414
+ i = np.arange(1, n+1)
415
+ dj = np.arange(-m, m+1)[:, None]
416
+ j = i + dj
417
+ j0 = j + m - 1 # 0-indexed version of j
418
+
419
+ Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
420
+ difference = X[..., j0] - Xibar
421
+ num = np.sum(difference*dj, axis=-2) # dj is d-i
422
+ den = n*np.sum(difference**2, axis=-2)
423
+ return -np.mean(np.log(num/den), axis=-1)
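
The Vasicek estimator coded above is simple enough to reproduce directly from its definition (edge-padded spacings of the order statistics). A minimal cross-check (editor's sketch, not part of the diff) against `differential_entropy` with `method='vasicek'`:

import math
import numpy as np
from scipy.stats import differential_entropy

rng = np.random.default_rng(12345)
x = rng.standard_normal(1000)
n = x.size
m = math.floor(math.sqrt(n) + 0.5)            # default window-length heuristic

xs = np.sort(x)
padded = np.concatenate((np.repeat(xs[0], m), xs, np.repeat(xs[-1], m)))
spacings = padded[2 * m:] - padded[:-2 * m]   # X_(i+m) - X_(i-m), edge-padded
manual = np.mean(np.log(n / (2 * m) * spacings))

print(manual)
print(differential_entropy(x, method='vasicek'))   # should agree to rounding error
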
.venv/Lib/site-packages/scipy/stats/_fit.py ADDED
@@ -0,0 +1,1351 @@
1
+ import warnings
2
+ from collections import namedtuple
3
+ import numpy as np
4
+ from scipy import optimize, stats
5
+ from scipy._lib._util import check_random_state
6
+
7
+
8
+ def _combine_bounds(name, user_bounds, shape_domain, integral):
9
+ """Intersection of user-defined bounds and distribution PDF/PMF domain"""
10
+
11
+ user_bounds = np.atleast_1d(user_bounds)
12
+
13
+ if user_bounds[0] > user_bounds[1]:
14
+ message = (f"There are no values for `{name}` on the interval "
15
+ f"{list(user_bounds)}.")
16
+ raise ValueError(message)
17
+
18
+ bounds = (max(user_bounds[0], shape_domain[0]),
19
+ min(user_bounds[1], shape_domain[1]))
20
+
21
+ if integral and (np.ceil(bounds[0]) > np.floor(bounds[1])):
22
+ message = (f"There are no integer values for `{name}` on the interval "
23
+ f"defined by the user-provided bounds and the domain "
24
+ "of the distribution.")
25
+ raise ValueError(message)
26
+ elif not integral and (bounds[0] > bounds[1]):
27
+ message = (f"There are no values for `{name}` on the interval "
28
+ f"defined by the user-provided bounds and the domain "
29
+ "of the distribution.")
30
+ raise ValueError(message)
31
+
32
+ if not np.all(np.isfinite(bounds)):
33
+ message = (f"The intersection of user-provided bounds for `{name}` "
34
+ f"and the domain of the distribution is not finite. Please "
35
+ f"provide finite bounds for shape `{name}` in `bounds`.")
36
+ raise ValueError(message)
37
+
38
+ return bounds
39
+
40
+
41
+ class FitResult:
42
+ r"""Result of fitting a discrete or continuous distribution to data
43
+
44
+ Attributes
45
+ ----------
46
+ params : namedtuple
47
+ A namedtuple containing the maximum likelihood estimates of the
48
+ shape parameters, location, and (if applicable) scale of the
49
+ distribution.
50
+ success : bool or None
51
+ Whether the optimizer considered the optimization to terminate
52
+ successfully or not.
53
+ message : str or None
54
+ Any status message provided by the optimizer.
55
+
56
+ """
57
+
58
+ def __init__(self, dist, data, discrete, res):
59
+ self._dist = dist
60
+ self._data = data
61
+ self.discrete = discrete
62
+ self.pxf = getattr(dist, "pmf", None) or getattr(dist, "pdf", None)
63
+
64
+ shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
65
+ if not discrete:
66
+ FitParams = namedtuple('FitParams', shape_names + ['loc', 'scale'])
67
+ else:
68
+ FitParams = namedtuple('FitParams', shape_names + ['loc'])
69
+
70
+ self.params = FitParams(*res.x)
71
+
72
+ # Optimizer can report success even when nllf is infinite
73
+ if res.success and not np.isfinite(self.nllf()):
74
+ res.success = False
75
+ res.message = ("Optimization converged to parameter values that "
76
+ "are inconsistent with the data.")
77
+ self.success = getattr(res, "success", None)
78
+ self.message = getattr(res, "message", None)
79
+
80
+ def __repr__(self):
81
+ keys = ["params", "success", "message"]
82
+ m = max(map(len, keys)) + 1
83
+ return '\n'.join([key.rjust(m) + ': ' + repr(getattr(self, key))
84
+ for key in keys if getattr(self, key) is not None])
85
+
86
+ def nllf(self, params=None, data=None):
87
+ """Negative log-likelihood function
88
+
89
+ Evaluates the negative of the log-likelihood function of the provided
90
+ data at the provided parameters.
91
+
92
+ Parameters
93
+ ----------
94
+ params : tuple, optional
95
+ The shape parameters, location, and (if applicable) scale of the
96
+ distribution as a single tuple. Default is the maximum likelihood
97
+ estimates (``self.params``).
98
+ data : array_like, optional
99
+ The data for which the log-likelihood function is to be evaluated.
100
+ Default is the data to which the distribution was fit.
101
+
102
+ Returns
103
+ -------
104
+ nllf : float
105
+ The negative of the log-likelihood function.
106
+
107
+ """
108
+ params = params if params is not None else self.params
109
+ data = data if data is not None else self._data
110
+ return self._dist.nnlf(theta=params, x=data)
111
+
112
+ def plot(self, ax=None, *, plot_type="hist"):
113
+ """Visually compare the data against the fitted distribution.
114
+
115
+ Available only if `matplotlib` is installed.
116
+
117
+ Parameters
118
+ ----------
119
+ ax : `matplotlib.axes.Axes`
120
+ Axes object to draw the plot onto, otherwise uses the current Axes.
121
+ plot_type : {"hist", "qq", "pp", "cdf"}
122
+ Type of plot to draw. Options include:
123
+
124
+ - "hist": Superposes the PDF/PMF of the fitted distribution
125
+ over a normalized histogram of the data.
126
+ - "qq": Scatter plot of theoretical quantiles against the
127
+ empirical quantiles. Specifically, the x-coordinates are the
128
+ values of the fitted distribution PPF evaluated at the
129
+ percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is the
130
+ number of data points, and the y-coordinates are the sorted
131
+ data points.
132
+ - "pp": Scatter plot of theoretical percentiles against the
133
+ observed percentiles. Specifically, the x-coordinates are the
134
+ percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is
135
+ the number of data points, and the y-coordinates are the values
136
+ of the fitted distribution CDF evaluated at the sorted
137
+ data points.
138
+ - "cdf": Superposes the CDF of the fitted distribution over the
139
+ empirical CDF. Specifically, the x-coordinates of the empirical
140
+ CDF are the sorted data points, and the y-coordinates are the
141
+ percentiles ``(np.arange(1, n) - 0.5)/n``, where ``n`` is
142
+ the number of data points.
143
+
144
+ Returns
145
+ -------
146
+ ax : `matplotlib.axes.Axes`
147
+ The matplotlib Axes object on which the plot was drawn.
148
+
149
+ Examples
150
+ --------
151
+ >>> import numpy as np
152
+ >>> from scipy import stats
153
+ >>> import matplotlib.pyplot as plt # matplotlib must be installed
154
+ >>> rng = np.random.default_rng()
155
+ >>> data = stats.nbinom(5, 0.5).rvs(size=1000, random_state=rng)
156
+ >>> bounds = [(0, 30), (0, 1)]
157
+ >>> res = stats.fit(stats.nbinom, data, bounds)
158
+ >>> ax = res.plot() # save matplotlib Axes object
159
+
160
+ The `matplotlib.axes.Axes` object can be used to customize the plot.
161
+ See `matplotlib.axes.Axes` documentation for details.
162
+
163
+ >>> ax.set_xlabel('number of trials') # customize axis label
164
+ >>> ax.get_children()[0].set_linewidth(5) # customize line widths
165
+ >>> ax.legend()
166
+ >>> plt.show()
167
+ """
168
+ try:
169
+ import matplotlib # noqa: F401
170
+ except ModuleNotFoundError as exc:
171
+ message = "matplotlib must be installed to use method `plot`."
172
+ raise ModuleNotFoundError(message) from exc
173
+
174
+ plots = {'histogram': self._hist_plot, 'qq': self._qq_plot,
175
+ 'pp': self._pp_plot, 'cdf': self._cdf_plot,
176
+ 'hist': self._hist_plot}
177
+ if plot_type.lower() not in plots:
178
+ message = f"`plot_type` must be one of {set(plots.keys())}"
179
+ raise ValueError(message)
180
+ plot = plots[plot_type.lower()]
181
+
182
+ if ax is None:
183
+ import matplotlib.pyplot as plt
184
+ ax = plt.gca()
185
+
186
+ fit_params = np.atleast_1d(self.params)
187
+
188
+ return plot(ax=ax, fit_params=fit_params)
189
+
190
+ def _hist_plot(self, ax, fit_params):
191
+ from matplotlib.ticker import MaxNLocator
192
+
193
+ support = self._dist.support(*fit_params)
194
+ lb = support[0] if np.isfinite(support[0]) else min(self._data)
195
+ ub = support[1] if np.isfinite(support[1]) else max(self._data)
196
+ pxf = "PMF" if self.discrete else "PDF"
197
+
198
+ if self.discrete:
199
+ x = np.arange(lb, ub + 2)
200
+ y = self.pxf(x, *fit_params)
201
+ ax.vlines(x[:-1], 0, y[:-1], label='Fitted Distribution PMF',
202
+ color='C0')
203
+ options = dict(density=True, bins=x, align='left', color='C1')
204
+ ax.xaxis.set_major_locator(MaxNLocator(integer=True))
205
+ ax.set_xlabel('k')
206
+ ax.set_ylabel('PMF')
207
+ else:
208
+ x = np.linspace(lb, ub, 200)
209
+ y = self.pxf(x, *fit_params)
210
+ ax.plot(x, y, '--', label='Fitted Distribution PDF', color='C0')
211
+ options = dict(density=True, bins=50, align='mid', color='C1')
212
+ ax.set_xlabel('x')
213
+ ax.set_ylabel('PDF')
214
+
215
+ if len(self._data) > 50 or self.discrete:
216
+ ax.hist(self._data, label="Histogram of Data", **options)
217
+ else:
218
+ ax.plot(self._data, np.zeros_like(self._data), "*",
219
+ label='Data', color='C1')
220
+
221
+ ax.set_title(rf"Fitted $\tt {self._dist.name}$ {pxf} and Histogram")
222
+ ax.legend(*ax.get_legend_handles_labels())
223
+ return ax
224
+
225
+ def _qp_plot(self, ax, fit_params, qq):
226
+ data = np.sort(self._data)
227
+ ps = self._plotting_positions(len(self._data))
228
+
229
+ if qq:
230
+ qp = "Quantiles"
231
+ plot_type = 'Q-Q'
232
+ x = self._dist.ppf(ps, *fit_params)
233
+ y = data
234
+ else:
235
+ qp = "Percentiles"
236
+ plot_type = 'P-P'
237
+ x = ps
238
+ y = self._dist.cdf(data, *fit_params)
239
+
240
+ ax.plot(x, y, '.', label=f'Fitted Distribution {plot_type}',
241
+ color='C0', zorder=1)
242
+ xlim = ax.get_xlim()
243
+ ylim = ax.get_ylim()
244
+ lim = [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])]
245
+ if not qq:
246
+ lim = max(lim[0], 0), min(lim[1], 1)
247
+
248
+ if self.discrete and qq:
249
+ q_min, q_max = int(lim[0]), int(lim[1]+1)
250
+ q_ideal = np.arange(q_min, q_max)
251
+ # q_ideal = np.unique(self._dist.ppf(ps, *fit_params))
252
+ ax.plot(q_ideal, q_ideal, 'o', label='Reference', color='k',
253
+ alpha=0.25, markerfacecolor='none', clip_on=True)
254
+ elif self.discrete and not qq:
255
+ # The intent of this is to match the plot that would be produced
256
+ # if x were continuous on [0, 1] and y were cdf(ppf(x)).
257
+ # It can be approximated by letting x = np.linspace(0, 1, 1000),
258
+ # but this might not look great when zooming in. The vertical
259
+ # portions are included to indicate where the transition occurs
260
+ # where the data completely obscures the horizontal portions.
261
+ p_min, p_max = lim
262
+ a, b = self._dist.support(*fit_params)
263
+ p_min = max(p_min, 0 if np.isfinite(a) else 1e-3)
264
+ p_max = min(p_max, 1 if np.isfinite(b) else 1-1e-3)
265
+ q_min, q_max = self._dist.ppf([p_min, p_max], *fit_params)
266
+ qs = np.arange(q_min-1, q_max+1)
267
+ ps = self._dist.cdf(qs, *fit_params)
268
+ ax.step(ps, ps, '-', label='Reference', color='k', alpha=0.25,
269
+ clip_on=True)
270
+ else:
271
+ ax.plot(lim, lim, '-', label='Reference', color='k', alpha=0.25,
272
+ clip_on=True)
273
+
274
+ ax.set_xlim(lim)
275
+ ax.set_ylim(lim)
276
+ ax.set_xlabel(rf"Fitted $\tt {self._dist.name}$ Theoretical {qp}")
277
+ ax.set_ylabel(f"Data {qp}")
278
+ ax.set_title(rf"Fitted $\tt {self._dist.name}$ {plot_type} Plot")
279
+ ax.legend(*ax.get_legend_handles_labels())
280
+ ax.set_aspect('equal')
281
+ return ax
282
+
283
+ def _qq_plot(self, **kwargs):
284
+ return self._qp_plot(qq=True, **kwargs)
285
+
286
+ def _pp_plot(self, **kwargs):
287
+ return self._qp_plot(qq=False, **kwargs)
288
+
289
+ def _plotting_positions(self, n, a=.5):
290
+ # See https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Plotting_positions
291
+ k = np.arange(1, n+1)
292
+ return (k-a) / (n + 1 - 2*a)
293
+
294
+ def _cdf_plot(self, ax, fit_params):
295
+ data = np.sort(self._data)
296
+ ecdf = self._plotting_positions(len(self._data))
297
+ ls = '--' if len(np.unique(data)) < 30 else '.'
298
+ xlabel = 'k' if self.discrete else 'x'
299
+ ax.step(data, ecdf, ls, label='Empirical CDF', color='C1', zorder=0)
300
+
301
+ xlim = ax.get_xlim()
302
+ q = np.linspace(*xlim, 300)
303
+ tcdf = self._dist.cdf(q, *fit_params)
304
+
305
+ ax.plot(q, tcdf, label='Fitted Distribution CDF', color='C0', zorder=1)
306
+ ax.set_xlim(xlim)
307
+ ax.set_ylim(0, 1)
308
+ ax.set_xlabel(xlabel)
309
+ ax.set_ylabel("CDF")
310
+ ax.set_title(rf"Fitted $\tt {self._dist.name}$ and Empirical CDF")
311
+ handles, labels = ax.get_legend_handles_labels()
312
+ ax.legend(handles[::-1], labels[::-1])
313
+ return ax
314
+
315
+
316
+ def fit(dist, data, bounds=None, *, guess=None, method='mle',
317
+ optimizer=optimize.differential_evolution):
318
+ r"""Fit a discrete or continuous distribution to data
319
+
320
+ Given a distribution, data, and bounds on the parameters of the
321
+ distribution, return maximum likelihood estimates of the parameters.
322
+
323
+ Parameters
324
+ ----------
325
+ dist : `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`
326
+ The object representing the distribution to be fit to the data.
327
+ data : 1D array_like
328
+ The data to which the distribution is to be fit. If the data contain
329
+ any of ``np.nan``, ``np.inf``, or -``np.inf``, the fit method will
330
+ raise a ``ValueError``.
331
+ bounds : dict or sequence of tuples, optional
332
+ If a dictionary, each key is the name of a parameter of the
333
+ distribution, and the corresponding value is a tuple containing the
334
+ lower and upper bound on that parameter. If the distribution is
335
+ defined only for a finite range of values of that parameter, no entry
336
+ for that parameter is required; e.g., some distributions have
337
+ parameters which must be on the interval [0, 1]. Bounds for parameters
338
+ location (``loc``) and scale (``scale``) are optional; by default,
339
+ they are fixed to 0 and 1, respectively.
340
+
341
+ If a sequence, element *i* is a tuple containing the lower and upper
342
+ bound on the *i*\ th parameter of the distribution. In this case,
343
+ bounds for *all* distribution shape parameters must be provided.
344
+ Optionally, bounds for location and scale may follow the
345
+ distribution shape parameters.
346
+
347
+ If a shape is to be held fixed (e.g. if it is known), the
348
+ lower and upper bounds may be equal. If a user-provided lower or upper
349
+ bound is beyond a bound of the domain for which the distribution is
350
+ defined, the bound of the distribution's domain will replace the
351
+ user-provided value. Similarly, parameters which must be integral
352
+ will be constrained to integral values within the user-provided bounds.
353
+ guess : dict or array_like, optional
354
+ If a dictionary, each key is the name of a parameter of the
355
+ distribution, and the corresponding value is a guess for the value
356
+ of the parameter.
357
+
358
+ If a sequence, element *i* is a guess for the *i*\ th parameter of the
359
+ distribution. In this case, guesses for *all* distribution shape
360
+ parameters must be provided.
361
+
362
+ If `guess` is not provided, guesses for the decision variables will
363
+ not be passed to the optimizer. If `guess` is provided, guesses for
364
+ any missing parameters will be set at the mean of the lower and
365
+ upper bounds. Guesses for parameters which must be integral will be
366
+ rounded to integral values, and guesses that lie outside the
367
+ intersection of the user-provided bounds and the domain of the
368
+ distribution will be clipped.
369
+ method : {'mle', 'mse'}
370
+ With ``method="mle"`` (default), the fit is computed by minimizing
371
+ the negative log-likelihood function. A large, finite penalty
372
+ (rather than infinite negative log-likelihood) is applied for
373
+ observations beyond the support of the distribution.
374
+ With ``method="mse"``, the fit is computed by minimizing
375
+ the negative log-product spacing function. The same penalty is applied
376
+ for observations beyond the support. We follow the approach of [1]_,
377
+ which is generalized for samples with repeated observations.
378
+ optimizer : callable, optional
379
+ `optimizer` is a callable that accepts the following positional
380
+ argument.
381
+
382
+ fun : callable
383
+ The objective function to be optimized. `fun` accepts one argument
384
+ ``x``, candidate shape parameters of the distribution, and returns
385
+ the objective function value given ``x``, `dist`, and the provided
386
+ `data`.
387
+ The job of `optimizer` is to find values of the decision variables
388
+ that minimize `fun`.
389
+
390
+ `optimizer` must also accept the following keyword argument.
391
+
392
+ bounds : sequence of tuples
393
+ The bounds on values of the decision variables; each element will
394
+ be a tuple containing the lower and upper bound on a decision
395
+ variable.
396
+
397
+ If `guess` is provided, `optimizer` must also accept the following
398
+ keyword argument.
399
+
400
+ x0 : array_like
401
+ The guesses for each decision variable.
402
+
403
+ If the distribution has any shape parameters that must be integral or
404
+ if the distribution is discrete and the location parameter is not
405
+ fixed, `optimizer` must also accept the following keyword argument.
406
+
407
+ integrality : array_like of bools
408
+ For each decision variable, True if the decision variable
409
+ must be constrained to integer values and False if the decision
410
+ variable is continuous.
411
+
412
+ `optimizer` must return an object, such as an instance of
413
+ `scipy.optimize.OptimizeResult`, which holds the optimal values of
414
+ the decision variables in an attribute ``x``. If attributes
415
+ ``fun``, ``status``, or ``message`` are provided, they will be
416
+ included in the result object returned by `fit`.
417
+
418
+ Returns
419
+ -------
420
+ result : `~scipy.stats._result_classes.FitResult`
421
+ An object with the following fields.
422
+
423
+ params : namedtuple
424
+ A namedtuple containing the maximum likelihood estimates of the
425
+ shape parameters, location, and (if applicable) scale of the
426
+ distribution.
427
+ success : bool or None
428
+ Whether the optimizer considered the optimization to terminate
429
+ successfully or not.
430
+ message : str or None
431
+ Any status message provided by the optimizer.
432
+
433
+ The object has the following method:
434
+
435
+ nllf(params=None, data=None)
436
+ By default, the negative log-likelihood function at the fitted
437
+ `params` for the given `data`. Accepts a tuple containing
438
+ alternative shapes, location, and scale of the distribution and
439
+ an array of alternative data.
440
+
441
+ plot(ax=None)
442
+ Superposes the PDF/PMF of the fitted distribution over a normalized
443
+ histogram of the data.
444
+
445
+ See Also
446
+ --------
447
+ rv_continuous, rv_discrete
448
+
449
+ Notes
450
+ -----
451
+ Optimization is more likely to converge to the maximum likelihood estimate
452
+ when the user provides tight bounds containing the maximum likelihood
453
+ estimate. For example, when fitting a binomial distribution to data, the
454
+ number of experiments underlying each sample may be known, in which case
455
+ the corresponding shape parameter ``n`` can be fixed.
456
+
457
+ References
458
+ ----------
459
+ .. [1] Shao, Yongzhao, and Marjorie G. Hahn. "Maximum product of spacings
460
+ method: a unified formulation with illustration of strong
461
+ consistency." Illinois Journal of Mathematics 43.3 (1999): 489-499.
462
+
463
+ Examples
464
+ --------
465
+ Suppose we wish to fit a distribution to the following data.
466
+
467
+ >>> import numpy as np
468
+ >>> from scipy import stats
469
+ >>> rng = np.random.default_rng()
470
+ >>> dist = stats.nbinom
471
+ >>> shapes = (5, 0.5)
472
+ >>> data = dist.rvs(*shapes, size=1000, random_state=rng)
473
+
474
+ Suppose we do not know how the data were generated, but we suspect that
475
+ they follow a negative binomial distribution with parameters *n* and *p*\.
476
+ (See `scipy.stats.nbinom`.) We believe that the parameter *n* was less
477
+ than 30, and we know that the parameter *p* must lie on the interval
478
+ [0, 1]. We record this information in a variable `bounds` and pass
479
+ this information to `fit`.
480
+
481
+ >>> bounds = [(0, 30), (0, 1)]
482
+ >>> res = stats.fit(dist, data, bounds)
483
+
484
+ `fit` searches within the user-specified `bounds` for the
485
+ values that best match the data (in the sense of maximum likelihood
486
+ estimation). In this case, it found shape values similar to those
487
+ from which the data were actually generated.
488
+
489
+ >>> res.params
490
+ FitParams(n=5.0, p=0.5028157644634368, loc=0.0) # may vary
491
+
492
+ We can visualize the results by superposing the probability mass function
493
+ of the distribution (with the shapes fit to the data) over a normalized
494
+ histogram of the data.
495
+
496
+ >>> import matplotlib.pyplot as plt # matplotlib must be installed to plot
497
+ >>> res.plot()
498
+ >>> plt.show()
499
+
500
+ Note that the estimate for *n* was exactly integral; this is because
501
+ the domain of the `nbinom` PMF includes only integral *n*, and the `nbinom`
502
+ object "knows" that. `nbinom` also knows that the shape *p* must be a
503
+ value between 0 and 1. In such a case - when the domain of the distribution
504
+ with respect to a parameter is finite - we are not required to specify
505
+ bounds for the parameter.
506
+
507
+ >>> bounds = {'n': (0, 30)} # omit parameter p using a `dict`
508
+ >>> res2 = stats.fit(dist, data, bounds)
509
+ >>> res2.params
510
+ FitParams(n=5.0, p=0.5016492009232932, loc=0.0) # may vary
511
+
512
+ If we wish to force the distribution to be fit with *n* fixed at 6, we can
513
+ set both the lower and upper bounds on *n* to 6. Note, however, that the
514
+ value of the objective function being optimized is typically worse (higher)
515
+ in this case.
516
+
517
+ >>> bounds = {'n': (6, 6)} # fix parameter `n`
518
+ >>> res3 = stats.fit(dist, data, bounds)
519
+ >>> res3.params
520
+ FitParams(n=6.0, p=0.5486556076755706, loc=0.0) # may vary
521
+ >>> res3.nllf() > res.nllf()
522
+ True # may vary
523
+
524
+ Note that the numerical results of the previous examples are typical, but
525
+ they may vary because the default optimizer used by `fit`,
526
+ `scipy.optimize.differential_evolution`, is stochastic. However, we can
527
+ customize the settings used by the optimizer to ensure reproducibility -
528
+ or even use a different optimizer entirely - using the `optimizer`
529
+ parameter.
530
+
531
+ >>> from scipy.optimize import differential_evolution
532
+ >>> rng = np.random.default_rng(767585560716548)
533
+ >>> def optimizer(fun, bounds, *, integrality):
534
+ ... return differential_evolution(fun, bounds, strategy='best2bin',
535
+ ... seed=rng, integrality=integrality)
536
+ >>> bounds = [(0, 30), (0, 1)]
537
+ >>> res4 = stats.fit(dist, data, bounds, optimizer=optimizer)
538
+ >>> res4.params
539
+ FitParams(n=5.0, p=0.5015183149259951, loc=0.0)
540
+
541
+ """
542
+ # --- Input Validation / Standardization --- #
543
+ user_bounds = bounds
544
+ user_guess = guess
545
+
546
+ # distribution input validation and information collection
547
+ if hasattr(dist, "pdf"): # can't use isinstance for types
548
+ default_bounds = {'loc': (0, 0), 'scale': (1, 1)}
549
+ discrete = False
550
+ elif hasattr(dist, "pmf"):
551
+ default_bounds = {'loc': (0, 0)}
552
+ discrete = True
553
+ else:
554
+ message = ("`dist` must be an instance of `rv_continuous` "
555
+ "or `rv_discrete.`")
556
+ raise ValueError(message)
557
+
558
+ try:
559
+ param_info = dist._param_info()
560
+ except AttributeError as e:
561
+ message = (f"Distribution `{dist.name}` is not yet supported by "
562
+ "`scipy.stats.fit` because shape information has "
563
+ "not been defined.")
564
+ raise ValueError(message) from e
565
+
566
+ # data input validation
567
+ data = np.asarray(data)
568
+ if data.ndim != 1:
569
+ message = "`data` must be exactly one-dimensional."
570
+ raise ValueError(message)
571
+ if not (np.issubdtype(data.dtype, np.number)
572
+ and np.all(np.isfinite(data))):
573
+ message = "All elements of `data` must be finite numbers."
574
+ raise ValueError(message)
575
+
576
+ # bounds input validation and information collection
577
+ n_params = len(param_info)
578
+ n_shapes = n_params - (1 if discrete else 2)
579
+ param_list = [param.name for param in param_info]
580
+ param_names = ", ".join(param_list)
581
+ shape_names = ", ".join(param_list[:n_shapes])
582
+
583
+ if user_bounds is None:
584
+ user_bounds = {}
585
+
586
+ if isinstance(user_bounds, dict):
587
+ default_bounds.update(user_bounds)
588
+ user_bounds = default_bounds
589
+ user_bounds_array = np.empty((n_params, 2))
590
+ for i in range(n_params):
591
+ param_name = param_info[i].name
592
+ user_bound = user_bounds.pop(param_name, None)
593
+ if user_bound is None:
594
+ user_bound = param_info[i].domain
595
+ user_bounds_array[i] = user_bound
596
+ if user_bounds:
597
+ message = ("Bounds provided for the following unrecognized "
598
+ f"parameters will be ignored: {set(user_bounds)}")
599
+ warnings.warn(message, RuntimeWarning, stacklevel=2)
600
+
601
+ else:
602
+ try:
603
+ user_bounds = np.asarray(user_bounds, dtype=float)
604
+ if user_bounds.size == 0:
605
+ user_bounds = np.empty((0, 2))
606
+ except ValueError as e:
607
+ message = ("Each element of a `bounds` sequence must be a tuple "
608
+ "containing two elements: the lower and upper bound of "
609
+ "a distribution parameter.")
610
+ raise ValueError(message) from e
611
+ if (user_bounds.ndim != 2 or user_bounds.shape[1] != 2):
612
+ message = ("Each element of `bounds` must be a tuple specifying "
613
+ "the lower and upper bounds of a shape parameter")
614
+ raise ValueError(message)
615
+ if user_bounds.shape[0] < n_shapes:
616
+ message = (f"A `bounds` sequence must contain at least {n_shapes} "
617
+ "elements: tuples specifying the lower and upper "
618
+ f"bounds of all shape parameters {shape_names}.")
619
+ raise ValueError(message)
620
+ if user_bounds.shape[0] > n_params:
621
+ message = ("A `bounds` sequence may not contain more than "
622
+ f"{n_params} elements: tuples specifying the lower and "
623
+ "upper bounds of distribution parameters "
624
+ f"{param_names}.")
625
+ raise ValueError(message)
626
+
627
+ user_bounds_array = np.empty((n_params, 2))
628
+ user_bounds_array[n_shapes:] = list(default_bounds.values())
629
+ user_bounds_array[:len(user_bounds)] = user_bounds
630
+
631
+ user_bounds = user_bounds_array
632
+ validated_bounds = []
633
+ for i in range(n_params):
634
+ name = param_info[i].name
635
+ user_bound = user_bounds_array[i]
636
+ param_domain = param_info[i].domain
637
+ integral = param_info[i].integrality
638
+ combined = _combine_bounds(name, user_bound, param_domain, integral)
639
+ validated_bounds.append(combined)
640
+
641
+ bounds = np.asarray(validated_bounds)
642
+ integrality = [param.integrality for param in param_info]
643
+
644
+ # guess input validation
645
+
646
+ if user_guess is None:
647
+ guess_array = None
648
+ elif isinstance(user_guess, dict):
649
+ default_guess = {param.name: np.mean(bound)
650
+ for param, bound in zip(param_info, bounds)}
651
+ unrecognized = set(user_guess) - set(default_guess)
652
+ if unrecognized:
653
+ message = ("Guesses provided for the following unrecognized "
654
+ f"parameters will be ignored: {unrecognized}")
655
+ warnings.warn(message, RuntimeWarning, stacklevel=2)
656
+ default_guess.update(user_guess)
657
+
658
+ message = ("Each element of `guess` must be a scalar "
659
+ "guess for a distribution parameter.")
660
+ try:
661
+ guess_array = np.asarray([default_guess[param.name]
662
+ for param in param_info], dtype=float)
663
+ except ValueError as e:
664
+ raise ValueError(message) from e
665
+
666
+ else:
667
+ message = ("Each element of `guess` must be a scalar "
668
+ "guess for a distribution parameter.")
669
+ try:
670
+ user_guess = np.asarray(user_guess, dtype=float)
671
+ except ValueError as e:
672
+ raise ValueError(message) from e
673
+ if user_guess.ndim != 1:
674
+ raise ValueError(message)
675
+ if user_guess.shape[0] < n_shapes:
676
+ message = (f"A `guess` sequence must contain at least {n_shapes} "
677
+ "elements: scalar guesses for the distribution shape "
678
+ f"parameters {shape_names}.")
679
+ raise ValueError(message)
680
+ if user_guess.shape[0] > n_params:
681
+ message = ("A `guess` sequence may not contain more than "
682
+ f"{n_params} elements: scalar guesses for the "
683
+ f"distribution parameters {param_names}.")
684
+ raise ValueError(message)
685
+
686
+ guess_array = np.mean(bounds, axis=1)
687
+ guess_array[:len(user_guess)] = user_guess
688
+
689
+ if guess_array is not None:
690
+ guess_rounded = guess_array.copy()
691
+
692
+ guess_rounded[integrality] = np.round(guess_rounded[integrality])
693
+ rounded = np.where(guess_rounded != guess_array)[0]
694
+ for i in rounded:
695
+ message = (f"Guess for parameter `{param_info[i].name}` "
696
+ f"rounded from {guess_array[i]} to {guess_rounded[i]}.")
697
+ warnings.warn(message, RuntimeWarning, stacklevel=2)
698
+
699
+ guess_clipped = np.clip(guess_rounded, bounds[:, 0], bounds[:, 1])
700
+ clipped = np.where(guess_clipped != guess_rounded)[0]
701
+ for i in clipped:
702
+ message = (f"Guess for parameter `{param_info[i].name}` "
703
+ f"clipped from {guess_rounded[i]} to "
704
+ f"{guess_clipped[i]}.")
705
+ warnings.warn(message, RuntimeWarning, stacklevel=2)
706
+
707
+ guess = guess_clipped
708
+ else:
709
+ guess = None
710
+
711
+ # --- Fitting --- #
712
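+ # The objective functions below are assumed (from their names) to be the
+ # penalized negative log-likelihood (method='mle') and the penalized
+ # negative log product of spacings (method='mse'); the penalties keep the
+ # objectives finite where a raw log-probability would be invalid.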
+ def nllf(free_params, data=data): # bind data NOW
713
+ with np.errstate(invalid='ignore', divide='ignore'):
714
+ return dist._penalized_nnlf(free_params, data)
715
+
716
+ def nlpsf(free_params, data=data): # bind data NOW
717
+ with np.errstate(invalid='ignore', divide='ignore'):
718
+ return dist._penalized_nlpsf(free_params, data)
719
+
720
+ methods = {'mle': nllf, 'mse': nlpsf}
721
+ objective = methods[method.lower()]
722
+
723
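+ # Note: with the default optimizer, `scipy.optimize.differential_evolution`,
+ # the keyword arguments assembled below correspond to its `bounds`,
+ # `integrality`, and `x0` parameters; a user-supplied `optimizer` is
+ # expected to accept whichever of these keywords are passed.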
+ with np.errstate(invalid='ignore', divide='ignore'):
724
+ kwds = {}
725
+ if bounds is not None:
726
+ kwds['bounds'] = bounds
727
+ if np.any(integrality):
728
+ kwds['integrality'] = integrality
729
+ if guess is not None:
730
+ kwds['x0'] = guess
731
+ res = optimizer(objective, **kwds)
732
+
733
+ return FitResult(dist, data, discrete, res)
734
+
735
+
736
+ GoodnessOfFitResult = namedtuple('GoodnessOfFitResult',
737
+ ('fit_result', 'statistic', 'pvalue',
738
+ 'null_distribution'))
739
+
740
+
741
+ def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
742
+ guessed_params=None, statistic='ad', n_mc_samples=9999,
743
+ random_state=None):
744
+ r"""
745
+ Perform a goodness of fit test comparing data to a distribution family.
746
+
747
+ Given a distribution family and data, perform a test of the null hypothesis
748
+ that the data were drawn from a distribution in that family. Any known
749
+ parameters of the distribution may be specified. Remaining parameters of
750
+ the distribution will be fit to the data, and the p-value of the test
751
+ is computed accordingly. Several statistics for comparing the distribution
752
+ to data are available.
753
+
754
+ Parameters
755
+ ----------
756
+ dist : `scipy.stats.rv_continuous`
757
+ The object representing the distribution family under the null
758
+ hypothesis.
759
+ data : 1D array_like
760
+ Finite, uncensored data to be tested.
761
+ known_params : dict, optional
762
+ A dictionary containing name-value pairs of known distribution
763
+ parameters. Monte Carlo samples are randomly drawn from the
764
+ null-hypothesized distribution with these values of the parameters.
765
+ Before the statistic is evaluated for each Monte Carlo sample, only
766
+ remaining unknown parameters of the null-hypothesized distribution
767
+ family are fit to the samples; the known parameters are held fixed.
768
+ If all parameters of the distribution family are known, then the step
769
+ of fitting the distribution family to each sample is omitted.
770
+ fit_params : dict, optional
771
+ A dictionary containing name-value pairs of distribution parameters
772
+ that have already been fit to the data, e.g. using `scipy.stats.fit`
773
+ or the ``fit`` method of `dist`. Monte Carlo samples are drawn from the
774
+ null-hypothesized distribution with these specified values of the
775
+ parameter. On those Monte Carlo samples, however, these and all other
776
+ unknown parameters of the null-hypothesized distribution family are
777
+ fit before the statistic is evaluated.
778
+ guessed_params : dict, optional
779
+ A dictionary containing name-value pairs of distribution parameters
780
+ which have been guessed. These parameters are always considered as
781
+ free parameters and are fit both to the provided `data` as well as
782
+ to the Monte Carlo samples drawn from the null-hypothesized
783
+ distribution. The purpose of these `guessed_params` is to be used as
784
+ initial values for the numerical fitting procedure.
785
+ statistic : {"ad", "ks", "cvm", "filliben"} or callable, optional
786
+ The statistic used to compare data to a distribution after fitting
787
+ unknown parameters of the distribution family to the data. The
788
+ Anderson-Darling ("ad") [1]_, Kolmogorov-Smirnov ("ks") [1]_,
789
+ Cramer-von Mises ("cvm") [1]_, and Filliben ("filliben") [7]_
790
+ statistics are available. Alternatively, a callable with signature
791
+ ``(dist, data, axis)`` may be supplied to compute the statistic. Here
792
+ ``dist`` is a frozen distribution object (potentially with array
793
+ parameters), ``data`` is an array of Monte Carlo samples (of
794
+ compatible shape), and ``axis`` is the axis of ``data`` along which
795
+ the statistic must be computed.
796
+ n_mc_samples : int, default: 9999
797
+ The number of Monte Carlo samples drawn from the null hypothesized
798
+ distribution to form the null distribution of the statistic. The
799
+ sample size of each is the same as the given `data`.
800
+ random_state : {None, int, `numpy.random.Generator`,
801
+ `numpy.random.RandomState`}, optional
802
+
803
+ Pseudorandom number generator state used to generate the Monte Carlo
804
+ samples.
805
+
806
+ If `random_state` is ``None`` (default), the
807
+ `numpy.random.RandomState` singleton is used.
808
+ If `random_state` is an int, a new ``RandomState`` instance is used,
809
+ seeded with `random_state`.
810
+ If `random_state` is already a ``Generator`` or ``RandomState``
811
+ instance, then the provided instance is used.
812
+
813
+ Returns
814
+ -------
815
+ res : GoodnessOfFitResult
816
+ An object with the following attributes.
817
+
818
+ fit_result : `~scipy.stats._result_classes.FitResult`
819
+ An object representing the fit of the provided `dist` to `data`.
820
+ This object includes the values of distribution family parameters
821
+ that fully define the null-hypothesized distribution, that is,
822
+ the distribution from which Monte Carlo samples are drawn.
823
+ statistic : float
824
+ The value of the statistic comparing provided `data` to the
825
+ null-hypothesized distribution.
826
+ pvalue : float
827
+ The proportion of elements in the null distribution with
828
+ statistic values at least as extreme as the statistic value of the
829
+ provided `data`.
830
+ null_distribution : ndarray
831
+ The value of the statistic for each Monte Carlo sample
832
+ drawn from the null-hypothesized distribution.
833
+
834
+ Notes
835
+ -----
836
+ This is a generalized Monte Carlo goodness-of-fit procedure, special cases
837
+ of which correspond with various Anderson-Darling tests, Lilliefors' test,
838
+ etc. The test is described in [2]_, [3]_, and [4]_ as a parametric
839
+ bootstrap test. This is a Monte Carlo test in which parameters that
840
+ specify the distribution from which samples are drawn have been estimated
841
+ from the data. We describe the test using "Monte Carlo" rather than
842
+ "parametric bootstrap" throughout to avoid confusion with the more familiar
843
+ nonparametric bootstrap, and describe how the test is performed below.
844
+
845
+ *Traditional goodness of fit tests*
846
+
847
+ Traditionally, critical values corresponding with a fixed set of
848
+ significance levels are pre-calculated using Monte Carlo methods. Users
849
+ perform the test by calculating the value of the test statistic only for
850
+ their observed `data` and comparing this value to tabulated critical
851
+ values. This practice is not very flexible, as tables are not available for
852
+ all distributions and combinations of known and unknown parameter values.
853
+ Also, results can be inaccurate when critical values are interpolated from
854
+ limited tabulated data to correspond with the user's sample size and
855
+ fitted parameter values. To overcome these shortcomings, this function
856
+ allows the user to perform the Monte Carlo trials adapted to their
857
+ particular data.
858
+
859
+ *Algorithmic overview*
860
+
861
+ In brief, this routine executes the following steps:
862
+
863
+ 1. Fit unknown parameters to the given `data`, thereby forming the
864
+ "null-hypothesized" distribution, and compute the statistic of
865
+ this pair of data and distribution.
866
+ 2. Draw random samples from this null-hypothesized distribution.
867
+ 3. Fit the unknown parameters to each random sample.
868
+ 4. Calculate the statistic between each sample and the distribution that
869
+ has been fit to the sample.
870
+ 5. Compare the value of the statistic corresponding with `data` from (1)
871
+ against the values of the statistic corresponding with the random
872
+ samples from (4). The p-value is the proportion of samples with a
873
+ statistic value greater than or equal to the statistic of the observed
874
+ data.
875
+
876
+ In more detail, the steps are as follows.
877
+
878
+ First, any unknown parameters of the distribution family specified by
879
+ `dist` are fit to the provided `data` using maximum likelihood estimation.
880
+ (One exception is the normal distribution with unknown location and scale:
881
+ we use the bias-corrected standard deviation ``np.std(data, ddof=1)`` for
882
+ the scale as recommended in [1]_.)
883
+ These values of the parameters specify a particular member of the
884
+ distribution family referred to as the "null-hypothesized distribution",
885
+ that is, the distribution from which the data were sampled under the null
886
+ hypothesis. The `statistic`, which compares data to a distribution, is
887
+ computed between `data` and the null-hypothesized distribution.
888
+
889
+ Next, many (specifically `n_mc_samples`) new samples, each containing the
890
+ same number of observations as `data`, are drawn from the
891
+ null-hypothesized distribution. All unknown parameters of the distribution
892
+ family `dist` are fit to *each resample*, and the `statistic` is computed
893
+ between each sample and its corresponding fitted distribution. These
894
+ values of the statistic form the Monte Carlo null distribution (not to be
895
+ confused with the "null-hypothesized distribution" above).
896
+
897
+ The p-value of the test is the proportion of statistic values in the Monte
898
+ Carlo null distribution that are at least as extreme as the statistic value
899
+ of the provided `data`. More precisely, the p-value is given by
900
+
901
+ .. math::
902
+
903
+ p = \frac{b + 1}
904
+ {m + 1}
905
+
906
+ where :math:`b` is the number of statistic values in the Monte Carlo null
907
+ distribution that are greater than or equal to the statistic value
908
+ calculated for `data`, and :math:`m` is the number of elements in the
909
+ Monte Carlo null distribution (`n_mc_samples`). The addition of :math:`1`
910
+ to the numerator and denominator can be thought of as including the
911
+ value of the statistic corresponding with `data` in the null distribution,
912
+ but a more formal explanation is given in [5]_.
913
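+
+ For illustration only (the numbers below are hypothetical), the formula
+ can be evaluated directly from an observed statistic value and a Monte
+ Carlo null distribution:
+
+ >>> import numpy as np
+ >>> null_distribution = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
+ >>> observed_statistic = 0.6
+ >>> b = np.sum(null_distribution >= observed_statistic)
+ >>> print((b + 1) / (len(null_distribution) + 1))
+ 0.5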
+
914
+ *Limitations*
915
+
916
+ The test can be very slow for some distribution families because unknown
917
+ parameters of the distribution family must be fit to each of the Monte
918
+ Carlo samples, and for most distributions in SciPy, distribution fitting
919
+ is performed via numerical optimization.
920
+
921
+ *Anti-Pattern*
922
+
923
+ For this reason, it may be tempting
924
+ to treat parameters of the distribution pre-fit to `data` (by the user)
925
+ as though they were `known_params`, as specification of all parameters of
926
+ the distribution precludes the need to fit the distribution to each Monte
927
+ Carlo sample. (This is essentially how the original Kolmogorov-Smirnov
928
+ test is performed.) Although such a test can provide evidence against the
929
+ null hypothesis, the test is conservative in the sense that small p-values
930
+ will tend to (greatly) *overestimate* the probability of making a type I
931
+ error (that is, rejecting the null hypothesis although it is true), and the
932
+ power of the test is low (that is, it is less likely to reject the null
933
+ hypothesis even when the null hypothesis is false).
934
+ This is because the Monte Carlo samples are less likely to agree with the
935
+ null-hypothesized distribution as well as `data`. This tends to increase
936
+ the values of the statistic recorded in the null distribution, so that a
937
+ larger number of them exceed the value of statistic for `data`, thereby
938
+ inflating the p-value.
939
+
940
+ References
941
+ ----------
942
+ .. [1] M. A. Stephens (1974). "EDF Statistics for Goodness of Fit and
943
+ Some Comparisons." Journal of the American Statistical Association,
944
+ Vol. 69, pp. 730-737.
945
+ .. [2] W. Stute, W. G. Manteiga, and M. P. Quindimil (1993).
946
+ "Bootstrap based goodness-of-fit-tests." Metrika 40.1: 243-256.
947
+ .. [3] C. Genest, & B Rémillard. (2008). "Validity of the parametric
948
+ bootstrap for goodness-of-fit testing in semiparametric models."
949
+ Annales de l'IHP Probabilités et statistiques. Vol. 44. No. 6.
950
+ .. [4] I. Kojadinovic and J. Yan (2012). "Goodness-of-fit testing based on
951
+ a weighted bootstrap: A fast large-sample alternative to the
952
+ parametric bootstrap." Canadian Journal of Statistics 40.3: 480-500.
953
+ .. [5] B. Phipson and G. K. Smyth (2010). "Permutation P-values Should
954
+ Never Be Zero: Calculating Exact P-values When Permutations Are
955
+ Randomly Drawn." Statistical Applications in Genetics and Molecular
956
+ Biology 9.1.
957
+ .. [6] H. W. Lilliefors (1967). "On the Kolmogorov-Smirnov test for
958
+ normality with mean and variance unknown." Journal of the American
959
+ statistical Association 62.318: 399-402.
960
+ .. [7] Filliben, James J. "The probability plot correlation coefficient
961
+ test for normality." Technometrics 17.1 (1975): 111-117.
962
+
963
+ Examples
964
+ --------
965
+ A well-known test of the null hypothesis that data were drawn from a
966
+ given distribution is the Kolmogorov-Smirnov (KS) test, available in SciPy
967
+ as `scipy.stats.ks_1samp`. Suppose we wish to test whether the following
968
+ data:
969
+
970
+ >>> import numpy as np
971
+ >>> from scipy import stats
972
+ >>> rng = np.random.default_rng()
973
+ >>> x = stats.uniform.rvs(size=75, random_state=rng)
974
+
975
+ were sampled from a normal distribution. To perform a KS test, the
976
+ empirical distribution function of the observed data will be compared
977
+ against the (theoretical) cumulative distribution function of a normal
978
+ distribution. Of course, to do this, the normal distribution under the null
979
+ hypothesis must be fully specified. This is commonly done by first fitting
980
+ the ``loc`` and ``scale`` parameters of the distribution to the observed
981
+ data, then performing the test.
982
+
983
+ >>> loc, scale = np.mean(x), np.std(x, ddof=1)
984
+ >>> cdf = stats.norm(loc, scale).cdf
985
+ >>> stats.ks_1samp(x, cdf)
986
+ KstestResult(statistic=0.1119257570456813, pvalue=0.2827756409939257)
987
+
988
+ An advantage of the KS-test is that the p-value - the probability of
989
+ obtaining a value of the test statistic under the null hypothesis as
990
+ extreme as the value obtained from the observed data - can be calculated
991
+ exactly and efficiently. `goodness_of_fit` can only approximate these
992
+ results.
993
+
994
+ >>> known_params = {'loc': loc, 'scale': scale}
995
+ >>> res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
996
+ ... statistic='ks', random_state=rng)
997
+ >>> res.statistic, res.pvalue
998
+ (0.1119257570456813, 0.2788)
999
+
1000
+ The statistic matches exactly, but the p-value is estimated by forming
1001
+ a "Monte Carlo null distribution", that is, by explicitly drawing random
1002
+ samples from `scipy.stats.norm` with the provided parameters and
1003
+ calculating the statistic for each. The fraction of these statistic values
1004
+ at least as extreme as ``res.statistic`` approximates the exact p-value
1005
+ calculated by `scipy.stats.ks_1samp`.
1006
+
1007
+ However, in many cases, we would prefer to test only that the data were
1008
+ sampled from one of *any* member of the normal distribution family, not
1009
+ specifically from the normal distribution with the location and scale
1010
+ fitted to the observed sample. In this case, Lilliefors [6]_ argued that
1011
+ the KS test is far too conservative (that is, the p-value overstates
1012
+ the actual probability of rejecting a true null hypothesis) and thus lacks
1013
+ power - the ability to reject the null hypothesis when the null hypothesis
1014
+ is actually false.
1015
+ Indeed, our p-value above is approximately 0.28, which is far too large
1016
+ to reject the null hypothesis at any common significance level.
1017
+
1018
+ Consider why this might be. Note that in the KS test above, the statistic
1019
+ always compares data against the CDF of a normal distribution fitted to the
1020
+ *observed data*. This tends to reduce the value of the statistic for the
1021
+ observed data, but it is "unfair" when computing the statistic for other
1022
+ samples, such as those we randomly draw to form the Monte Carlo null
1023
+ distribution. It is easy to correct for this: whenever we compute the KS
1024
+ statistic of a sample, we use the CDF of a normal distribution fitted
1025
+ to *that sample*. The null distribution in this case has not been
1026
+ calculated exactly and is typically approximated using Monte Carlo methods
1027
+ as described above. This is where `goodness_of_fit` excels.
1028
+
1029
+ >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ks',
1030
+ ... random_state=rng)
1031
+ >>> res.statistic, res.pvalue
1032
+ (0.1119257570456813, 0.0196)
1033
+
1034
+ Indeed, this p-value is much smaller, and small enough to (correctly)
1035
+ reject the null hypothesis at common significance levels, including 5% and
1036
+ 2.5%.
1037
+
1038
+ However, the KS statistic is not very sensitive to all deviations from
1039
+ normality. The original advantage of the KS statistic was the ability
1040
+ to compute the null distribution theoretically, but a more sensitive
1041
+ statistic - resulting in a higher test power - can be used now that we can
1042
+ approximate the null distribution
1043
+ computationally. The Anderson-Darling statistic [1]_ tends to be more
1044
+ sensitive, and critical values of this statistic have been tabulated
1045
+ for various significance levels and sample sizes using Monte Carlo methods.
1046
+
1047
+ >>> res = stats.anderson(x, 'norm')
1048
+ >>> print(res.statistic)
1049
+ 1.2139573337497467
1050
+ >>> print(res.critical_values)
1051
+ [0.549 0.625 0.75 0.875 1.041]
1052
+ >>> print(res.significance_level)
1053
+ [15. 10. 5. 2.5 1. ]
1054
+
1055
+ Here, the observed value of the statistic exceeds the critical value
1056
+ corresponding with a 1% significance level. This tells us that the p-value
1057
+ of the observed data is less than 1%, but what is it? We could interpolate
1058
+ from these (already-interpolated) values, but `goodness_of_fit` can
1059
+ estimate it directly.
1060
+
1061
+ >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ad',
1062
+ ... random_state=rng)
1063
+ >>> res.statistic, res.pvalue
1064
+ (1.2139573337497467, 0.0034)
1065
+
1066
+ A further advantage is that use of `goodness_of_fit` is not limited to
1067
+ a particular set of distributions or conditions on which parameters
1068
+ are known versus which must be estimated from data. Instead,
1069
+ `goodness_of_fit` can estimate p-values relatively quickly for any
1070
+ distribution with a sufficiently fast and reliable ``fit`` method. For
1071
+ instance, here we perform a goodness of fit test using the Cramer-von Mises
1072
+ statistic against the Rayleigh distribution with known location and unknown
1073
+ scale.
1074
+
1075
+ >>> rng = np.random.default_rng()
1076
+ >>> x = stats.chi(df=2.2, loc=0, scale=2).rvs(size=1000, random_state=rng)
1077
+ >>> res = stats.goodness_of_fit(stats.rayleigh, x, statistic='cvm',
1078
+ ... known_params={'loc': 0}, random_state=rng)
1079
+
1080
+ This executes fairly quickly, but to check the reliability of the ``fit``
1081
+ method, we should inspect the fit result.
1082
+
1083
+ >>> res.fit_result # location is as specified, and scale is reasonable
1084
+ params: FitParams(loc=0.0, scale=2.1026719844231243)
1085
+ success: True
1086
+ message: 'The fit was performed successfully.'
1087
+ >>> import matplotlib.pyplot as plt # matplotlib must be installed to plot
1088
+ >>> res.fit_result.plot()
1089
+ >>> plt.show()
1090
+
1091
+ If the distribution is not fit to the observed data as well as possible,
1092
+ the test may not control the type I error rate, that is, the chance of
1093
+ rejecting the null hypothesis even when it is true.
1094
+
1095
+ We should also look for extreme outliers in the null distribution that
1096
+ may be caused by unreliable fitting. These do not necessarily invalidate
1097
+ the result, but they tend to reduce the test's power.
1098
+
1099
+ >>> _, ax = plt.subplots()
1100
+ >>> ax.hist(np.log10(res.null_distribution))
1101
+ >>> ax.set_xlabel("log10 of CVM statistic under the null hypothesis")
1102
+ >>> ax.set_ylabel("Frequency")
1103
+ >>> ax.set_title("Histogram of the Monte Carlo null distribution")
1104
+ >>> plt.show()
1105
+
1106
+ This plot seems reassuring.
1107
+
1108
+ If the ``fit`` method is working reliably, and if the distribution of the test
1109
+ statistic is not particularly sensitive to the values of the fitted
1110
+ parameters, then the p-value provided by `goodness_of_fit` is expected to
1111
+ be a good approximation.
1112
+
1113
+ >>> res.statistic, res.pvalue
1114
+ (0.2231991510248692, 0.0525)
1115
+
1116
+ """
1117
+ args = _gof_iv(dist, data, known_params, fit_params, guessed_params,
1118
+ statistic, n_mc_samples, random_state)
1119
+ (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
1120
+ guessed_rfd_params, statistic, n_mc_samples_int, random_state) = args
1121
+
1122
+ # Fit null hypothesis distribution to data
1123
+ nhd_fit_fun = _get_fit_fun(dist, data, guessed_nhd_params,
1124
+ fixed_nhd_params)
1125
+ nhd_vals = nhd_fit_fun(data)
1126
+ nhd_dist = dist(*nhd_vals)
1127
+
1128
+ def rvs(size):
1129
+ return nhd_dist.rvs(size=size, random_state=random_state)
1130
+
1131
+ # Define statistic
1132
+ fit_fun = _get_fit_fun(dist, data, guessed_rfd_params, fixed_rfd_params)
1133
+ if callable(statistic):
1134
+ compare_fun = statistic
1135
+ else:
1136
+ compare_fun = _compare_dict[statistic]
1137
+ alternative = getattr(compare_fun, 'alternative', 'greater')
1138
+
1139
+ def statistic_fun(data, axis):
1140
+ # Make things simple by always working along the last axis.
1141
+ data = np.moveaxis(data, axis, -1)
1142
+ rfd_vals = fit_fun(data)
1143
+ rfd_dist = dist(*rfd_vals)
1144
+ return compare_fun(rfd_dist, data, axis=-1)
1145
+
1146
+ res = stats.monte_carlo_test(data, rvs, statistic_fun, vectorized=True,
1147
+ n_resamples=n_mc_samples, axis=-1,
1148
+ alternative=alternative)
1149
+ opt_res = optimize.OptimizeResult()
1150
+ opt_res.success = True
1151
+ opt_res.message = "The fit was performed successfully."
1152
+ opt_res.x = nhd_vals
1153
+ # Only continuous distributions for now, hence discrete=False
1154
+ # There's no fundamental limitation; it's just that we're not using
1155
+ # stats.fit, discrete distributions don't have a `fit` method, and
1156
+ # we haven't written any vectorized fit functions for a discrete
1157
+ # distribution yet.
1158
+ return GoodnessOfFitResult(FitResult(dist, data, False, opt_res),
1159
+ res.statistic, res.pvalue,
1160
+ res.null_distribution)
1161
+
1162
+
1163
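+ # Returns a function that fits `dist` to a sample (vectorized along the
+ # sample's last axis), holding `fixed_params` fixed; `guessed_params`, when
+ # present, seed `dist.fit` in the general (non-vectorized) branch.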
+ def _get_fit_fun(dist, data, guessed_params, fixed_params):
1164
+
1165
+ shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
1166
+ param_names = shape_names + ['loc', 'scale']
1167
+ fparam_names = ['f'+name for name in param_names]
1168
+ all_fixed = not set(fparam_names).difference(fixed_params)
1169
+ guessed_shapes = [guessed_params.pop(x, None)
1170
+ for x in shape_names if x in guessed_params]
1171
+
1172
+ if all_fixed:
1173
+ def fit_fun(data):
1174
+ return [fixed_params[name] for name in fparam_names]
1175
+ # Define statistic, including fitting distribution to data
1176
+ elif dist in _fit_funs:
1177
+ def fit_fun(data):
1178
+ params = _fit_funs[dist](data, **fixed_params)
1179
+ params = np.asarray(np.broadcast_arrays(*params))
1180
+ if params.ndim > 1:
1181
+ params = params[..., np.newaxis]
1182
+ return params
1183
+ else:
1184
+ def fit_fun_1d(data):
1185
+ return dist.fit(data, *guessed_shapes, **guessed_params,
1186
+ **fixed_params)
1187
+
1188
+ def fit_fun(data):
1189
+ params = np.apply_along_axis(fit_fun_1d, axis=-1, arr=data)
1190
+ if params.ndim > 1:
1191
+ params = params.T[..., np.newaxis]
1192
+ return params
1193
+
1194
+ return fit_fun
1195
+
1196
+
1197
+ # Vectorized fitting functions. These are to accept ND `data` in which each
1198
+ # row (slice along last axis) is a sample to fit and scalar fixed parameters.
1199
+ # They return a tuple of shape parameter arrays, each of shape data.shape[:-1].
1200
+ def _fit_norm(data, floc=None, fscale=None):
1201
+ loc = floc
1202
+ scale = fscale
1203
+ if loc is None and scale is None:
1204
+ loc = np.mean(data, axis=-1)
1205
+ scale = np.std(data, ddof=1, axis=-1)
1206
+ elif loc is None:
1207
+ loc = np.mean(data, axis=-1)
1208
+ elif scale is None:
1209
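+ # with `loc` fixed, use the maximum likelihood estimate of the scale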
+ scale = np.sqrt(((data - loc)**2).mean(axis=-1))
1210
+ return loc, scale
1211
+
1212
+
1213
+ _fit_funs = {stats.norm: _fit_norm} # type: ignore[attr-defined]
1214
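+
+ # Illustrative sketch of the contract above (names are hypothetical):
+ #     rng = np.random.default_rng()
+ #     samples = rng.standard_normal((3, 50))  # three samples of size 50
+ #     loc, scale = _fit_norm(samples)         # loc.shape == scale.shape == (3,)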
+
1215
+
1216
+ # Vectorized goodness of fit statistic functions. These accept a frozen
1217
+ # distribution object and `data` in which each row (slice along last axis) is
1218
+ # a sample.
1219
+
1220
+
1221
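+ # Computes the Anderson-Darling statistic, vectorized along the last axis:
+ #     A^2 = -n - (1/n) * sum_{i=1..n} (2i - 1) * [ln F(x_(i)) + ln(1 - F(x_(n+1-i)))]
+ # where x_(1) <= ... <= x_(n) are the sorted data and F is the CDF of `dist`.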
+ def _anderson_darling(dist, data, axis):
1222
+ x = np.sort(data, axis=-1)
1223
+ n = data.shape[-1]
1224
+ i = np.arange(1, n+1)
1225
+ Si = (2*i - 1)/n * (dist.logcdf(x) + dist.logsf(x[..., ::-1]))
1226
+ S = np.sum(Si, axis=-1)
1227
+ return -n - S
1228
+
1229
+
1230
+ def _compute_dplus(cdfvals): # adapted from _stats_py before gh-17062
1231
+ n = cdfvals.shape[-1]
1232
+ return (np.arange(1.0, n + 1) / n - cdfvals).max(axis=-1)
1233
+
1234
+
1235
+ def _compute_dminus(cdfvals):
1236
+ n = cdfvals.shape[-1]
1237
+ return (cdfvals - np.arange(0.0, n)/n).max(axis=-1)
1238
+
1239
+
1240
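+ # Computes the Kolmogorov-Smirnov statistic D = max(D+, D-), where
+ #     D+ = max_i (i/n - F(x_(i)))  and  D- = max_i (F(x_(i)) - (i-1)/n),
+ # with sorted data x_(1) <= ... <= x_(n), vectorized along the last axis.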
+ def _kolmogorov_smirnov(dist, data, axis):
1241
+ x = np.sort(data, axis=-1)
1242
+ cdfvals = dist.cdf(x)
1243
+ Dplus = _compute_dplus(cdfvals) # always works along last axis
1244
+ Dminus = _compute_dminus(cdfvals)
1245
+ return np.maximum(Dplus, Dminus)
1246
+
1247
+
1248
+ def _corr(X, M):
1249
+ # Correlation coefficient r, simplified and vectorized as we need it.
1250
+ # See [7] Equation (2). Lemma 1/2 are only for distributions symmetric
1251
+ # about 0.
1252
+ Xm = X.mean(axis=-1, keepdims=True)
1253
+ Mm = M.mean(axis=-1, keepdims=True)
1254
+ num = np.sum((X - Xm) * (M - Mm), axis=-1)
1255
+ den = np.sqrt(np.sum((X - Xm)**2, axis=-1) * np.sum((M - Mm)**2, axis=-1))
1256
+ return num/den
1257
+
1258
+
1259
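+ # Computes the Filliben probability plot correlation coefficient: the
+ # correlation between the sorted data and the order-statistic medians of
+ # `dist`. Poor fits give *smaller* values, hence `alternative = 'less'` below.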
+ def _filliben(dist, data, axis):
1260
+ # [7] Section 8 # 1
1261
+ X = np.sort(data, axis=-1)
1262
+
1263
+ # [7] Section 8 # 2
1264
+ n = data.shape[-1]
1265
+ k = np.arange(1, n+1)
1266
+ # Filliben used an approximation for the uniform distribution order
1267
+ # statistic medians.
1268
+ # m = (k - .3175)/(n + 0.365)
1269
+ # m[-1] = 0.5**(1/n)
1270
+ # m[0] = 1 - m[-1]
1271
+ # We can just as easily use the (theoretically) exact values. See e.g.
1272
+ # https://en.wikipedia.org/wiki/Order_statistic
1273
+ # "Order statistics sampled from a uniform distribution"
1274
+ m = stats.beta(k, n + 1 - k).median()
1275
+
1276
+ # [7] Section 8 # 3
1277
+ M = dist.ppf(m)
1278
+
1279
+ # [7] Section 8 # 4
1280
+ return _corr(X, M)
1281
+ _filliben.alternative = 'less' # type: ignore[attr-defined]
1282
+
1283
+
1284
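+ # Computes the Cramér-von Mises statistic, vectorized along the last axis:
+ #     W = 1/(12 n) + sum_{i=1..n} ((2i - 1)/(2 n) - F(x_(i)))^2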
+ def _cramer_von_mises(dist, data, axis):
1285
+ x = np.sort(data, axis=-1)
1286
+ n = data.shape[-1]
1287
+ cdfvals = dist.cdf(x)
1288
+ u = (2*np.arange(1, n+1) - 1)/(2*n)
1289
+ w = 1 / (12*n) + np.sum((u - cdfvals)**2, axis=-1)
1290
+ return w
1291
+
1292
+
1293
+ _compare_dict = {"ad": _anderson_darling, "ks": _kolmogorov_smirnov,
1294
+ "cvm": _cramer_von_mises, "filliben": _filliben}
1295
+
1296
+
1297
+ def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic,
1298
+ n_mc_samples, random_state):
1299
+
1300
+ if not isinstance(dist, stats.rv_continuous):
1301
+ message = ("`dist` must be a (non-frozen) instance of "
1302
+ "`stats.rv_continuous`.")
1303
+ raise TypeError(message)
1304
+
1305
+ data = np.asarray(data, dtype=float)
1306
+ if not data.ndim == 1:
1307
+ message = "`data` must be a one-dimensional array of numbers."
1308
+ raise ValueError(message)
1309
+
1310
+ # Leave validation of these key/value pairs to the `fit` method,
1311
+ # but collect these into dictionaries that will be used
1312
+ known_params = known_params or dict()
1313
+ fit_params = fit_params or dict()
1314
+ guessed_params = guessed_params or dict()
1315
+
1316
+ known_params_f = {("f"+key): val for key, val in known_params.items()}
1317
+ fit_params_f = {("f"+key): val for key, val in fit_params.items()}
1318
+
1319
+ # These are the values of parameters of the null distribution family
1320
+ # with which resamples are drawn
1321
+ fixed_nhd_params = known_params_f.copy()
1322
+ fixed_nhd_params.update(fit_params_f)
1323
+
1324
+ # These are fixed when fitting the distribution family to resamples
1325
+ fixed_rfd_params = known_params_f.copy()
1326
+
1327
+ # These are used as guesses when fitting the distribution family to
1328
+ # the original data
1329
+ guessed_nhd_params = guessed_params.copy()
1330
+
1331
+ # These are used as guesses when fitting the distribution family to
1332
+ # resamples
1333
+ guessed_rfd_params = fit_params.copy()
1334
+ guessed_rfd_params.update(guessed_params)
1335
+
1336
+ if not callable(statistic):
1337
+ statistic = statistic.lower()
1338
+ statistics = {'ad', 'ks', 'cvm', 'filliben'}
1339
+ if statistic not in statistics:
1340
+ message = f"`statistic` must be one of {statistics}."
1341
+ raise ValueError(message)
1342
+
1343
+ n_mc_samples_int = int(n_mc_samples)
1344
+ if n_mc_samples_int != n_mc_samples:
1345
+ message = "`n_mc_samples` must be an integer."
1346
+ raise TypeError(message)
1347
+
1348
+ random_state = check_random_state(random_state)
1349
+
1350
+ return (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
1351
+ guessed_rfd_params, statistic, n_mc_samples_int, random_state)
.venv/Lib/site-packages/scipy/stats/_generate_pyx.py ADDED
@@ -0,0 +1,27 @@
1
+ import pathlib
2
+ import subprocess
3
+ import sys
4
+ import os
5
+ import argparse
6
+
7
+
8
+ def make_boost(outdir):
9
+ # Call code generator inside _boost directory
10
+ code_gen = pathlib.Path(__file__).parent / '_boost/include/code_gen.py'
11
+ subprocess.run([sys.executable, str(code_gen), '-o', outdir],
12
+ check=True)
13
+
14
+
15
+ if __name__ == '__main__':
16
+ parser = argparse.ArgumentParser()
17
+ parser.add_argument("-o", "--outdir", type=str,
18
+ help="Path to the output directory")
19
+ args = parser.parse_args()
20
+
21
+ if not args.outdir:
22
+ raise ValueError("A path to the output directory is required")
23
+ else:
24
+ # Meson build
25
+ srcdir_abs = pathlib.Path(os.path.abspath(os.path.dirname(__file__)))
26
+ outdir_abs = pathlib.Path(os.getcwd()) / args.outdir
27
+ make_boost(outdir_abs)
.venv/Lib/site-packages/scipy/stats/_hypotests.py ADDED
@@ -0,0 +1,2021 @@
1
+ from collections import namedtuple
2
+ from dataclasses import dataclass
3
+ from math import comb
4
+ import numpy as np
5
+ import warnings
6
+ from itertools import combinations
7
+ import scipy.stats
8
+ from scipy.optimize import shgo
9
+ from . import distributions
10
+ from ._common import ConfidenceInterval
11
+ from ._continuous_distns import chi2, norm
12
+ from scipy.special import gamma, kv, gammaln
13
+ from scipy.fft import ifft
14
+ from ._stats_pythran import _a_ij_Aij_Dij2
15
+ from ._stats_pythran import (
16
+ _concordant_pairs as _P, _discordant_pairs as _Q
17
+ )
18
+ from ._axis_nan_policy import _axis_nan_policy_factory
19
+ from scipy.stats import _stats_py
20
+
21
+ __all__ = ['epps_singleton_2samp', 'cramervonmises', 'somersd',
22
+ 'barnard_exact', 'boschloo_exact', 'cramervonmises_2samp',
23
+ 'tukey_hsd', 'poisson_means_test']
24
+
25
+ Epps_Singleton_2sampResult = namedtuple('Epps_Singleton_2sampResult',
26
+ ('statistic', 'pvalue'))
27
+
28
+
29
+ @_axis_nan_policy_factory(Epps_Singleton_2sampResult, n_samples=2, too_small=4)
30
+ def epps_singleton_2samp(x, y, t=(0.4, 0.8)):
31
+ """Compute the Epps-Singleton (ES) test statistic.
32
+
33
+ Test the null hypothesis that two samples have the same underlying
34
+ probability distribution.
35
+
36
+ Parameters
37
+ ----------
38
+ x, y : array-like
39
+ The two samples of observations to be tested. Input must not have more
40
+ than one dimension. Samples can have different lengths.
41
+ t : array-like, optional
42
+ The points (t1, ..., tn) where the empirical characteristic function is
43
+ to be evaluated. These should be positive, distinct numbers. The default
44
+ value (0.4, 0.8) is proposed in [1]_. Input must not have more than
45
+ one dimension.
46
+
47
+ Returns
48
+ -------
49
+ statistic : float
50
+ The test statistic.
51
+ pvalue : float
52
+ The associated p-value based on the asymptotic chi2-distribution.
53
+
54
+ See Also
55
+ --------
56
+ ks_2samp, anderson_ksamp
57
+
58
+ Notes
59
+ -----
60
+ Testing whether two samples are generated by the same underlying
61
+ distribution is a classical question in statistics. A widely used test is
62
+ the Kolmogorov-Smirnov (KS) test which relies on the empirical
63
+ distribution function. Epps and Singleton introduce a test based on the
64
+ empirical characteristic function in [1]_.
65
+
66
+ One advantage of the ES test compared to the KS test is that it does
67
+ not assume a continuous distribution. In [1]_, the authors conclude
68
+ that the test also has a higher power than the KS test in many
69
+ examples. They recommend the use of the ES test for discrete samples as
70
+ well as continuous samples with at least 25 observations each, whereas
71
+ `anderson_ksamp` is recommended for smaller sample sizes in the
72
+ continuous case.
73
+
74
+ The p-value is computed from the asymptotic distribution of the test
75
+ statistic which follows a `chi2` distribution. If the sample size of both
76
+ `x` and `y` is below 25, the small sample correction proposed in [1]_ is
77
+ applied to the test statistic.
78
+
79
+ The default values of `t` are determined in [1]_ by considering
80
+ various distributions and finding good values that lead to a high power
81
+ of the test in general. Table III in [1]_ gives the optimal values for
82
+ the distributions tested in that study. The values of `t` are scaled by
83
+ the semi-interquartile range in the implementation, see [1]_.
84
+
85
+ References
86
+ ----------
87
+ .. [1] T. W. Epps and K. J. Singleton, "An omnibus test for the two-sample
88
+ problem using the empirical characteristic function", Journal of
89
+ Statistical Computation and Simulation 26, p. 177--203, 1986.
90
+
91
+ .. [2] S. J. Goerg and J. Kaiser, "Nonparametric testing of distributions
92
+ - the Epps-Singleton two-sample test using the empirical characteristic
93
+ function", The Stata Journal 9(3), p. 454--465, 2009.
94
+
95
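+ Examples
+ --------
+ A minimal usage sketch (not taken from the references): draw two random
+ samples and test whether they share an underlying distribution. Because
+ the data are randomly generated, the statistic and p-value vary between
+ runs.
+
+ >>> import numpy as np
+ >>> from scipy import stats
+ >>> rng = np.random.default_rng()
+ >>> x = rng.normal(size=30)
+ >>> y = rng.normal(loc=0.5, size=30)
+ >>> res = stats.epps_singleton_2samp(x, y)
+ >>> res.statistic, res.pvalue  # doctest: +SKIP
+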
+ """
96
+ # x and y are converted to arrays by the decorator
97
+ t = np.asarray(t)
98
+ # check if x and y are valid inputs
99
+ nx, ny = len(x), len(y)
100
+ if (nx < 5) or (ny < 5):
101
+ raise ValueError('x and y should have at least 5 elements, but len(x) '
102
+ f'= {nx} and len(y) = {ny}.')
103
+ if not np.isfinite(x).all():
104
+ raise ValueError('x must not contain nonfinite values.')
105
+ if not np.isfinite(y).all():
106
+ raise ValueError('y must not contain nonfinite values.')
107
+ n = nx + ny
108
+
109
+ # check if t is valid
110
+ if t.ndim > 1:
111
+ raise ValueError(f't must be 1d, but t.ndim equals {t.ndim}.')
112
+ if np.less_equal(t, 0).any():
113
+ raise ValueError('t must contain positive elements only.')
114
+
115
+ # rescale t with semi-iqr as proposed in [1]; import iqr here to avoid
116
+ # circular import
117
+ from scipy.stats import iqr
118
+ sigma = iqr(np.hstack((x, y))) / 2
119
+ ts = np.reshape(t, (-1, 1)) / sigma
120
+
121
+ # covariance estimation of ES test
122
+ gx = np.vstack((np.cos(ts*x), np.sin(ts*x))).T # shape = (nx, 2*len(t))
123
+ gy = np.vstack((np.cos(ts*y), np.sin(ts*y))).T
124
+ cov_x = np.cov(gx.T, bias=True) # the test uses biased cov-estimate
125
+ cov_y = np.cov(gy.T, bias=True)
126
+ est_cov = (n/nx)*cov_x + (n/ny)*cov_y
127
+ est_cov_inv = np.linalg.pinv(est_cov)
128
+ r = np.linalg.matrix_rank(est_cov_inv)
129
+ if r < 2*len(t):
130
+ warnings.warn('Estimated covariance matrix does not have full rank. '
131
+ 'This indicates a bad choice of the input t and the '
132
+ 'test might not be consistent.', # see p. 183 in [1]_
133
+ stacklevel=2)
134
+
135
+ # compute test statistic w distributed asympt. as chisquare with df=r
136
+ g_diff = np.mean(gx, axis=0) - np.mean(gy, axis=0)
137
+ w = n*np.dot(g_diff.T, np.dot(est_cov_inv, g_diff))
138
+
139
+ # apply small-sample correction
140
+ if (max(nx, ny) < 25):
141
+ corr = 1.0/(1.0 + n**(-0.45) + 10.1*(nx**(-1.7) + ny**(-1.7)))
142
+ w = corr * w
143
+
144
+ p = chi2.sf(w, r)
145
+
146
+ return Epps_Singleton_2sampResult(w, p)
147
+
148
+
149
+ def poisson_means_test(k1, n1, k2, n2, *, diff=0, alternative='two-sided'):
150
+ r"""
151
+ Performs the Poisson means test, also known as the "E-test".
152
+
153
+ This is a test of the null hypothesis that the difference between means of
154
+ two Poisson distributions is `diff`. The samples are provided as the
155
+ number of events `k1` and `k2` observed within measurement intervals
156
+ (e.g. of time, space, number of observations) of sizes `n1` and `n2`.
157
+
158
+ Parameters
159
+ ----------
160
+ k1 : int
161
+ Number of events observed from distribution 1.
162
+ n1 : float
163
+ Size of sample from distribution 1.
164
+ k2 : int
165
+ Number of events observed from distribution 2.
166
+ n2 : float
167
+ Size of sample from distribution 2.
168
+ diff : float, default=0
169
+ The hypothesized difference in means between the distributions
170
+ underlying the samples.
171
+ alternative : {'two-sided', 'less', 'greater'}, optional
172
+ Defines the alternative hypothesis.
173
+ The following options are available (default is 'two-sided'):
174
+
175
+ * 'two-sided': the difference between distribution means is not
176
+ equal to `diff`
177
+ * 'less': the difference between distribution means is less than
178
+ `diff`
179
+ * 'greater': the difference between distribution means is greater
180
+ than `diff`
181
+
182
+ Returns
183
+ -------
184
+ statistic : float
185
+ The test statistic (see [1]_ equation 3.3).
186
+ pvalue : float
187
+ The probability of achieving such an extreme value of the test
188
+ statistic under the null hypothesis.
189
+
190
+ Notes
191
+ -----
192
+
193
+ Let:
194
+
195
+ .. math:: X_1 \sim \mbox{Poisson}(\mathtt{n1}\lambda_1)
196
+
197
+ be a random variable independent of
198
+
199
+ .. math:: X_2 \sim \mbox{Poisson}(\mathtt{n2}\lambda_2)
200
+
201
+ and let ``k1`` and ``k2`` be the observed values of :math:`X_1`
202
+ and :math:`X_2`, respectively. Then `poisson_means_test` uses the number
203
+ of observed events ``k1`` and ``k2`` from samples of size ``n1`` and
204
+ ``n2``, respectively, to test the null hypothesis that
205
+
206
+ .. math::
207
+ H_0: \lambda_1 - \lambda_2 = \mathtt{diff}
208
+
209
+ A benefit of the E-test is that it has good power for small sample sizes,
210
+ which can reduce sampling costs [1]_. It has been evaluated and determined
211
+ to be more powerful than the comparable C-test, sometimes referred to as
212
+ the Poisson exact test.
213
+
214
+ References
215
+ ----------
216
+ .. [1] Krishnamoorthy, K., & Thomson, J. (2004). A more powerful test for
217
+ comparing two Poisson means. Journal of Statistical Planning and
218
+ Inference, 119(1), 23-35.
219
+
220
+ .. [2] Przyborowski, J., & Wilenski, H. (1940). Homogeneity of results in
221
+ testing samples from Poisson series: With an application to testing
222
+ clover seed for dodder. Biometrika, 31(3/4), 313-323.
223
+
224
+ Examples
225
+ --------
226
+
227
+ Suppose that a gardener wishes to test the number of dodder (weed) seeds
228
+ in a sack of clover seeds that they buy from a seed company. It has
229
+ previously been established that the number of dodder seeds in clover
230
+ follows the Poisson distribution.
231
+
232
+ A 100 gram sample is drawn from the sack before being shipped to the
233
+ gardener. The sample is analyzed, and it is found to contain no dodder
234
+ seeds; that is, `k1` is 0. However, upon arrival, the gardener draws
235
+ another 100 gram sample from the sack. This time, three dodder seeds are
236
+ found in the sample; that is, `k2` is 3. The gardener would like to
237
+ know if the difference is significant and not due to chance. The
238
+ null hypothesis is that the difference between the two samples is merely
239
+ due to chance, or that :math:`\lambda_1 - \lambda_2 = \mathtt{diff}`
240
+ where :math:`\mathtt{diff} = 0`. The alternative hypothesis is that the
241
+ difference is not due to chance, or :math:`\lambda_1 - \lambda_2 \ne 0`.
242
+ The gardener selects a significance level of 5% to reject the null
243
+ hypothesis in favor of the alternative [2]_.
244
+
245
+ >>> import scipy.stats as stats
246
+ >>> res = stats.poisson_means_test(0, 100, 3, 100)
247
+ >>> res.statistic, res.pvalue
248
+ (-1.7320508075688772, 0.08837900929018157)
249
+
250
+ The p-value is .088, indicating a near 9% chance of observing a value of
251
+ the test statistic under the null hypothesis. This exceeds 5%, so the
252
+ gardener does not reject the null hypothesis as the difference cannot be
253
+ regarded as significant at this level.
254
+ """
255
+
256
+ _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative)
257
+
258
+ # "for a given k_1 and k_2, an estimate of \lambda_2 is given by" [1] (3.4)
259
+ lmbd_hat2 = ((k1 + k2) / (n1 + n2) - diff * n1 / (n1 + n2))
260
+
261
+ # "\hat{\lambda_{2k}} may be less than or equal to zero ... and in this
262
+ # case the null hypothesis cannot be rejected ... [and] it is not necessary
263
+ # to compute the p-value". [1] page 26 below eq. (3.6).
264
+ if lmbd_hat2 <= 0:
265
+ return _stats_py.SignificanceResult(0, 1)
266
+
267
+ # The unbiased variance estimate [1] (3.2)
268
+ var = k1 / (n1 ** 2) + k2 / (n2 ** 2)
269
+
270
+ # The _observed_ pivot statistic from the input. It follows the
271
+ # unnumbered equation following equation (3.3). This is used later in
272
+ # comparison with the computed pivot statistics in an indicator function.
273
+ t_k1k2 = (k1 / n1 - k2 / n2 - diff) / np.sqrt(var)
274
+
275
+ # Equation (3.5) of [1] is lengthy, so it is broken into several parts,
276
+ # beginning here. Note that the probability mass function of poisson is
277
+ # exp^(-\mu)*\mu^k/k!, so and this is called with shape \mu, here noted
278
+ # here as nlmbd_hat*. The strategy for evaluating the double summation in
279
+ # (3.5) is to create two arrays of the values of the two products inside
280
+ # the summation and then broadcast them together into a matrix, and then
281
+ # sum across the entire matrix.
282
+
283
+ # Compute constants (as seen in the first and second separated products in
284
+ # (3.5)). (This is the shape (\mu) parameter of the poisson distribution.)
285
+ nlmbd_hat1 = n1 * (lmbd_hat2 + diff)
286
+ nlmbd_hat2 = n2 * lmbd_hat2
287
+
288
+ # Determine summation bounds for tail ends of distribution rather than
289
+ # summing to infinity. `x1*` is for the outer sum and `x2*` is the inner
290
+ # sum.
291
+ x1_lb, x1_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat1)
292
+ x2_lb, x2_ub = distributions.poisson.ppf([1e-10, 1 - 1e-16], nlmbd_hat2)
293
+
294
+ # Construct arrays to function as the x_1 and x_2 counters on the summation
295
+ # in (3.5). `x1` is in columns and `x2` is in rows to allow for
296
+ # broadcasting.
297
+ x1 = np.arange(x1_lb, x1_ub + 1)
298
+ x2 = np.arange(x2_lb, x2_ub + 1)[:, None]
299
+
300
+ # These are the two products in equation (3.5) with `prob_x1` being the
301
+ # first (left side) and `prob_x2` being the second (right side). (To
302
+ # make as clear as possible: the 1st contains a "+ d" term, the 2nd does
303
+ # not.)
304
+ prob_x1 = distributions.poisson.pmf(x1, nlmbd_hat1)
305
+ prob_x2 = distributions.poisson.pmf(x2, nlmbd_hat2)
306
+
307
+ # compute constants for use in the "pivot statistic" per the
308
+ # unnumbered equation following (3.3).
309
+ lmbd_x1 = x1 / n1
310
+ lmbd_x2 = x2 / n2
311
+ lmbds_diff = lmbd_x1 - lmbd_x2 - diff
312
+ var_x1x2 = lmbd_x1 / n1 + lmbd_x2 / n2
313
+
314
+ # This is the 'pivot statistic' for use in the indicator of the summation
315
+ # (left side of "I[.]").
316
+ with np.errstate(invalid='ignore', divide='ignore'):
317
+ t_x1x2 = lmbds_diff / np.sqrt(var_x1x2)
318
+
319
+ # `[indicator]` implements the "I[.] ... the indicator function" per
320
+ # the paragraph following equation (3.5).
321
+ if alternative == 'two-sided':
322
+ indicator = np.abs(t_x1x2) >= np.abs(t_k1k2)
323
+ elif alternative == 'less':
324
+ indicator = t_x1x2 <= t_k1k2
325
+ else:
326
+ indicator = t_x1x2 >= t_k1k2
327
+
328
+ # Multiply all combinations of the products together, exclude terms
329
+ # based on the `indicator` and then sum. (3.5)
330
+ pvalue = np.sum((prob_x1 * prob_x2)[indicator])
331
+ return _stats_py.SignificanceResult(t_k1k2, pvalue)
332
+
333
+
334
+ def _poisson_means_test_iv(k1, n1, k2, n2, diff, alternative):
335
+ # """check for valid types and values of input to `poisson_mean_test`."""
336
+ if k1 != int(k1) or k2 != int(k2):
337
+ raise TypeError('`k1` and `k2` must be integers.')
338
+
339
+ count_err = '`k1` and `k2` must be greater than or equal to 0.'
340
+ if k1 < 0 or k2 < 0:
341
+ raise ValueError(count_err)
342
+
343
+ if n1 <= 0 or n2 <= 0:
344
+ raise ValueError('`n1` and `n2` must be greater than 0.')
345
+
346
+ if diff < 0:
347
+ raise ValueError('diff must be greater than or equal to 0.')
348
+
349
+ alternatives = {'two-sided', 'less', 'greater'}
350
+ if alternative.lower() not in alternatives:
351
+ raise ValueError(f"Alternative must be one of '{alternatives}'.")
352
+
353
+
354
+ class CramerVonMisesResult:
355
+ def __init__(self, statistic, pvalue):
356
+ self.statistic = statistic
357
+ self.pvalue = pvalue
358
+
359
+ def __repr__(self):
360
+ return (f"{self.__class__.__name__}(statistic={self.statistic}, "
361
+ f"pvalue={self.pvalue})")
362
+
363
+
364
+ def _psi1_mod(x):
365
+ """
366
+ psi1 is defined in equation 1.10 in Csörgő, S. and Faraway, J. (1996).
367
+ This implements a modified version by excluding the term V(x) / 12
368
+ (here: _cdf_cvm_inf(x) / 12) to avoid evaluating _cdf_cvm_inf(x)
369
+ twice in _cdf_cvm.
370
+
371
+ Implementation based on MAPLE code of Julian Faraway and R code of the
372
+ function pCvM in the package goftest (v1.1.1), permission granted
373
+ by Adrian Baddeley. Main difference in the implementation: the code
374
+ here keeps adding terms of the series until the terms are small enough.
375
+ """
376
+
377
+ def _ed2(y):
378
+ z = y**2 / 4
379
+ b = kv(1/4, z) + kv(3/4, z)
380
+ return np.exp(-z) * (y/2)**(3/2) * b / np.sqrt(np.pi)
381
+
382
+ def _ed3(y):
383
+ z = y**2 / 4
384
+ c = np.exp(-z) / np.sqrt(np.pi)
385
+ return c * (y/2)**(5/2) * (2*kv(1/4, z) + 3*kv(3/4, z) - kv(5/4, z))
386
+
387
+ def _Ak(k, x):
388
+ m = 2*k + 1
389
+ sx = 2 * np.sqrt(x)
390
+ y1 = x**(3/4)
391
+ y2 = x**(5/4)
392
+
393
+ e1 = m * gamma(k + 1/2) * _ed2((4 * k + 3)/sx) / (9 * y1)
394
+ e2 = gamma(k + 1/2) * _ed3((4 * k + 1) / sx) / (72 * y2)
395
+ e3 = 2 * (m + 2) * gamma(k + 3/2) * _ed3((4 * k + 5) / sx) / (12 * y2)
396
+ e4 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 1) / sx) / (144 * y1)
397
+ e5 = 7 * m * gamma(k + 1/2) * _ed2((4 * k + 5) / sx) / (144 * y1)
398
+
399
+ return e1 + e2 + e3 + e4 + e5
400
+
401
+ x = np.asarray(x)
402
+ tot = np.zeros_like(x, dtype='float')
403
+ cond = np.ones_like(x, dtype='bool')
404
+ k = 0
405
+ while np.any(cond):
406
+ z = -_Ak(k, x[cond]) / (np.pi * gamma(k + 1))
407
+ tot[cond] = tot[cond] + z
408
+ cond[cond] = np.abs(z) >= 1e-7
409
+ k += 1
410
+
411
+ return tot
412
+
413
+
414
+ def _cdf_cvm_inf(x):
415
+ """
416
+ Calculate the cdf of the Cramér-von Mises statistic (infinite sample size).
417
+
418
+ See equation 1.2 in Csörgő, S. and Faraway, J. (1996).
419
+
420
+ Implementation based on MAPLE code of Julian Faraway and R code of the
421
+ function pCvM in the package goftest (v1.1.1), permission granted
422
+ by Adrian Baddeley. Main difference in the implementation: the code
423
+ here keeps adding terms of the series until the terms are small enough.
424
+
425
+ The function is not expected to be accurate for large values of x, say
426
+ x > 4, when the cdf is very close to 1.
427
+ """
428
+ x = np.asarray(x)
429
+
430
+ def term(x, k):
431
+ # this expression can be found in [2], second line of (1.3)
432
+ u = np.exp(gammaln(k + 0.5) - gammaln(k+1)) / (np.pi**1.5 * np.sqrt(x))
433
+ y = 4*k + 1
434
+ q = y**2 / (16*x)
435
+ b = kv(0.25, q)
436
+ return u * np.sqrt(y) * np.exp(-q) * b
437
+
438
+ tot = np.zeros_like(x, dtype='float')
439
+ cond = np.ones_like(x, dtype='bool')
440
+ k = 0
441
+ while np.any(cond):
442
+ z = term(x[cond], k)
443
+ tot[cond] = tot[cond] + z
444
+ cond[cond] = np.abs(z) >= 1e-7
445
+ k += 1
446
+
447
+ return tot
448
+
449
+
450
+ def _cdf_cvm(x, n=None):
451
+ """
452
+ Calculate the cdf of the Cramér-von Mises statistic for a finite sample
453
+ size n. If N is None, use the asymptotic cdf (n=inf).
454
+
455
+ See equation 1.8 in Csörgő, S. and Faraway, J. (1996) for finite samples,
456
+ 1.2 for the asymptotic cdf.
457
+
458
+ The function is not expected to be accurate for large values of x, say
459
+ x > 2, when the cdf is very close to 1 and it might return values > 1
460
+ in that case, e.g. _cdf_cvm(2.0, 12) = 1.0000027556716846. Moreover, it
461
+ is not accurate for small values of n, especially close to the bounds of
462
+ the distribution's domain, [1/(12*n), n/3], where the value jumps to 0
463
+ and 1, respectively. These are limitations of the approximation by Csörgő
464
+ and Faraway (1996) implemented in this function.
465
+ """
466
+ x = np.asarray(x)
467
+ if n is None:
468
+ y = _cdf_cvm_inf(x)
469
+ else:
470
+ # support of the test statistic is [1/(12*n), n/3], see 1.1 in [2]
471
+ y = np.zeros_like(x, dtype='float')
472
+ sup = (1./(12*n) < x) & (x < n/3.)
473
+ # note: _psi1_mod does not include the term _cdf_cvm_inf(x) / 12
474
+ # therefore, we need to add it here
475
+ y[sup] = _cdf_cvm_inf(x[sup]) * (1 + 1./(12*n)) + _psi1_mod(x[sup]) / n
476
+ y[x >= n/3] = 1
477
+
478
+ if y.ndim == 0:
479
+ return y[()]
480
+ return y
481
+
482
+
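Reading the finite-sample branch above together with the `_psi1_mod` docstring, the approximation being evaluated can be written as follows (a reading aid, not part of the module):

```latex
V_n(x) \;\approx\; V(x)\Bigl(1 + \tfrac{1}{12n}\Bigr) + \frac{\psi_1(x) - V(x)/12}{n}
       \;=\; V(x) + \frac{\psi_1(x)}{n}
```

Here `V` is the asymptotic cdf computed by `_cdf_cvm_inf`, and the second summand of the middle expression is exactly `_psi1_mod(x) / n`, so the `V(x) / 12` piece omitted from `_psi1_mod` is folded into the `(1 + 1/(12*n))` factor.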
483
+ def _cvm_result_to_tuple(res):
484
+ return res.statistic, res.pvalue
485
+
486
+
487
+ @_axis_nan_policy_factory(CramerVonMisesResult, n_samples=1, too_small=1,
488
+ result_to_tuple=_cvm_result_to_tuple)
489
+ def cramervonmises(rvs, cdf, args=()):
490
+ """Perform the one-sample Cramér-von Mises test for goodness of fit.
491
+
492
+ This performs a test of the goodness of fit of a cumulative distribution
493
+ function (cdf) :math:`F` compared to the empirical distribution function
494
+ :math:`F_n` of observed random variates :math:`X_1, ..., X_n` that are
495
+ assumed to be independent and identically distributed ([1]_).
496
+ The null hypothesis is that the :math:`X_i` have cumulative distribution
497
+ :math:`F`.
498
+
499
+ Parameters
500
+ ----------
501
+ rvs : array_like
502
+ A 1-D array of observed values of the random variables :math:`X_i`.
503
+ cdf : str or callable
504
+ The cumulative distribution function :math:`F` to test the
505
+ observations against. If a string, it should be the name of a
506
+ distribution in `scipy.stats`. If a callable, that callable is used
507
+ to calculate the cdf: ``cdf(x, *args) -> float``.
508
+ args : tuple, optional
509
+ Distribution parameters. These are assumed to be known; see Notes.
510
+
511
+ Returns
512
+ -------
513
+ res : object with attributes
514
+ statistic : float
515
+ Cramér-von Mises statistic.
516
+ pvalue : float
517
+ The p-value.
518
+
519
+ See Also
520
+ --------
521
+ kstest, cramervonmises_2samp
522
+
523
+ Notes
524
+ -----
525
+ .. versionadded:: 1.6.0
526
+
527
+ The p-value relies on the approximation given by equation 1.8 in [2]_.
528
+ It is important to keep in mind that the p-value is only accurate if
529
+ one tests a simple hypothesis, i.e. the parameters of the reference
530
+ distribution are known. If the parameters are estimated from the data
531
+ (composite hypothesis), the computed p-value is not reliable.
532
+
533
+ References
534
+ ----------
535
+ .. [1] Cramér-von Mises criterion, Wikipedia,
536
+ https://en.wikipedia.org/wiki/Cram%C3%A9r%E2%80%93von_Mises_criterion
537
+ .. [2] Csörgő, S. and Faraway, J. (1996). The Exact and Asymptotic
538
+ Distribution of Cramér-von Mises Statistics. Journal of the
539
+ Royal Statistical Society, pp. 221-234.
540
+
541
+ Examples
542
+ --------
543
+
544
+ Suppose we wish to test whether data generated by ``scipy.stats.norm.rvs``
545
+ were, in fact, drawn from the standard normal distribution. We choose a
546
+ significance level of ``alpha=0.05``.
547
+
548
+ >>> import numpy as np
549
+ >>> from scipy import stats
550
+ >>> rng = np.random.default_rng(165417232101553420507139617764912913465)
551
+ >>> x = stats.norm.rvs(size=500, random_state=rng)
552
+ >>> res = stats.cramervonmises(x, 'norm')
553
+ >>> res.statistic, res.pvalue
554
+ (0.1072085112565724, 0.5508482238203407)
555
+
556
+ The p-value exceeds our chosen significance level, so we do not
557
+ reject the null hypothesis that the observed sample is drawn from the
558
+ standard normal distribution.
559
+
560
+ Now suppose we wish to check whether the same sample, shifted by 2.1, is
561
+ consistent with being drawn from a normal distribution with a mean of 2.
562
+
563
+ >>> y = x + 2.1
564
+ >>> res = stats.cramervonmises(y, 'norm', args=(2,))
565
+ >>> res.statistic, res.pvalue
566
+ (0.8364446265294695, 0.00596286797008283)
567
+
568
+ Here we have used the `args` keyword to specify the mean (``loc``)
569
+ of the normal distribution to test the data against. This is equivalent
570
+ to the following, in which we create a frozen normal distribution with
571
+ mean 2, then pass its ``cdf`` method as an argument.
572
+
573
+ >>> frozen_dist = stats.norm(loc=2)
574
+ >>> res = stats.cramervonmises(y, frozen_dist.cdf)
575
+ >>> res.statistic, res.pvalue
576
+ (0.8364446265294695, 0.00596286797008283)
577
+
578
+ In either case, we would reject the null hypothesis that the observed
579
+ sample is drawn from a normal distribution with a mean of 2 (and default
580
+ variance of 1) because the p-value is less than our chosen
581
+ significance level.
582
+
583
+ """
584
+ if isinstance(cdf, str):
585
+ cdf = getattr(distributions, cdf).cdf
586
+
587
+ vals = np.sort(np.asarray(rvs))
588
+
589
+ if vals.size <= 1:
590
+ raise ValueError('The sample must contain at least two observations.')
591
+
592
+ n = len(vals)
593
+ cdfvals = cdf(vals, *args)
594
+
595
+ u = (2*np.arange(1, n+1) - 1)/(2*n)
596
+ w = 1/(12*n) + np.sum((u - cdfvals)**2)
597
+
598
+ # avoid small negative values that can occur due to the approximation
599
+ p = max(0, 1. - _cdf_cvm(w, n))
600
+
601
+ return CramerVonMisesResult(statistic=w, pvalue=p)
602
+
603
+
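A minimal sketch, using only public `scipy.stats` APIs, that recomputes the statistic from the formula in the function body above and checks it against `cramervonmises`:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(12345)
x = stats.norm.rvs(size=50, random_state=rng)

# statistic as computed above: w = 1/(12n) + sum(((2i-1)/(2n) - F(x_(i)))**2)
vals = np.sort(x)
n = len(vals)
cdfvals = stats.norm.cdf(vals)
u = (2 * np.arange(1, n + 1) - 1) / (2 * n)
w = 1 / (12 * n) + np.sum((u - cdfvals) ** 2)

res = stats.cramervonmises(x, 'norm')
assert np.isclose(res.statistic, w)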
604
+ def _get_wilcoxon_distr(n):
605
+ """
606
+ Probability distribution of the Wilcoxon signed-rank statistic r_plus (sum
607
+ of ranks of positive differences).
608
+ Returns an array with the probabilities of all the possible ranks
609
+ r = 0, ..., n*(n+1)/2
610
+ """
611
+ c = np.ones(1, dtype=np.float64)
612
+ for k in range(1, n + 1):
613
+ prev_c = c
614
+ c = np.zeros(k * (k + 1) // 2 + 1, dtype=np.float64)
615
+ m = len(prev_c)
616
+ c[:m] = prev_c * 0.5
617
+ c[-m:] += prev_c * 0.5
618
+ return c
619
+
620
+
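A minimal sketch of how the returned array could be turned into an exact one-sided p-value, assuming this private helper is importable as `scipy.stats._hypotests._get_wilcoxon_distr`; the observed value `r_obs` is hypothetical:

```python
import numpy as np
from scipy.stats._hypotests import _get_wilcoxon_distr

n = 8                                   # number of paired differences
pmf = _get_wilcoxon_distr(n)            # P(r_plus = r) for r = 0, ..., n*(n+1)/2
assert np.isclose(pmf.sum(), 1.0)

r_obs = 30                              # hypothetical observed value of r_plus
p_greater = pmf[r_obs:].sum()           # exact P(r_plus >= r_obs) under H0
```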
621
+ def _get_wilcoxon_distr2(n):
622
+ """
623
+ Probability distribution of the Wilcoxon signed-rank statistic r_plus (sum
624
+ of ranks of positive differences).
625
+ Returns an array with the probabilities of all the possible ranks
626
+ r = 0, ..., n*(n+1)/2
627
+ This is a slower reference implementation.
628
+ References
629
+ ----------
630
+ .. [1] Harris T, Hardin JW. Exact Wilcoxon Signed-Rank and Wilcoxon
631
+ Mann-Whitney Ranksum Tests. The Stata Journal. 2013;13(2):337-343.
632
+ """
633
+ ai = np.arange(1, n+1)[:, None]
634
+ t = n*(n+1)/2
635
+ q = 2*t
636
+ j = np.arange(q)
637
+ theta = 2*np.pi/q*j
638
+ phi_sp = np.prod(np.cos(theta*ai), axis=0)
639
+ phi_s = np.exp(1j*theta*t) * phi_sp
640
+ p = np.real(ifft(phi_s))
641
+ res = np.zeros(int(t)+1)
642
+ res[:-1:] = p[::2]
643
+ res[0] /= 2
644
+ res[-1] = res[0]
645
+ return res
646
+
647
+
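Since the two implementations compute the same distribution by different routes (convolution vs. characteristic function and inverse FFT), one would expect them to agree to floating-point tolerance; a quick consistency check, assuming both private helpers are importable from `scipy.stats._hypotests`:

```python
import numpy as np
from scipy.stats._hypotests import _get_wilcoxon_distr, _get_wilcoxon_distr2

for n in (3, 5, 10):
    assert np.allclose(_get_wilcoxon_distr(n), _get_wilcoxon_distr2(n))
```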
648
+ def _tau_b(A):
649
+ """Calculate Kendall's tau-b and p-value from contingency table."""
650
+ # See [2] 2.2 and 4.2
651
+
652
+ # contingency table must be truly 2D
653
+ if A.shape[0] == 1 or A.shape[1] == 1:
654
+ return np.nan, np.nan
655
+
656
+ NA = A.sum()
657
+ PA = _P(A)
658
+ QA = _Q(A)
659
+ Sri2 = (A.sum(axis=1)**2).sum()
660
+ Scj2 = (A.sum(axis=0)**2).sum()
661
+ denominator = (NA**2 - Sri2)*(NA**2 - Scj2)
662
+
663
+ tau = (PA-QA)/(denominator)**0.5
664
+
665
+ numerator = 4*(_a_ij_Aij_Dij2(A) - (PA - QA)**2 / NA)
666
+ s02_tau_b = numerator/denominator
667
+ if s02_tau_b == 0: # Avoid divide by zero
668
+ return tau, 0
669
+ Z = tau/s02_tau_b**0.5
670
+ p = 2*norm.sf(abs(Z)) # 2-sided p-value
671
+
672
+ return tau, p
673
+
674
+
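As a sanity check on `_tau_b` (which relies on `_P`, `_Q`, and `_a_ij_Aij_Dij2` defined earlier in this module), the same tau-b value should be obtainable by expanding the table into raw ordinal pairs and calling the public `kendalltau`, whose default variant is tau-b; the table below is made up for illustration:

```python
import numpy as np
from scipy import stats
from scipy.stats._hypotests import _tau_b   # assumes this module's import path

table = np.array([[10, 5], [4, 12]])

# expand the contingency table into raw ordinal pairs
x, y = [], []
for i, j in np.ndindex(table.shape):
    x += [i] * table[i, j]
    y += [j] * table[i, j]
res = stats.kendalltau(x, y)                # variant='b' is the default

tau, p = _tau_b(table.astype(float))
# tau is expected to match res.statistic; the p-values can differ because the
# two functions use different variance estimates under their null hypotheses
```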
675
+ def _somers_d(A, alternative='two-sided'):
676
+ """Calculate Somers' D and p-value from contingency table."""
677
+ # See [3] page 1740
678
+
679
+ # contingency table must be truly 2D
680
+ if A.shape[0] <= 1 or A.shape[1] <= 1:
681
+ return np.nan, np.nan
682
+
683
+ NA = A.sum()
684
+ NA2 = NA**2
685
+ PA = _P(A)
686
+ QA = _Q(A)
687
+ Sri2 = (A.sum(axis=1)**2).sum()
688
+
689
+ d = (PA - QA)/(NA2 - Sri2)
690
+
691
+ S = _a_ij_Aij_Dij2(A) - (PA-QA)**2/NA
692
+
693
+ with np.errstate(divide='ignore'):
694
+ Z = (PA - QA)/(4*(S))**0.5
695
+
696
+ p = scipy.stats._stats_py._get_pvalue(Z, distributions.norm, alternative)
697
+
698
+ return d, p
699
+
700
+
701
+ @dataclass
702
+ class SomersDResult:
703
+ statistic: float
704
+ pvalue: float
705
+ table: np.ndarray
706
+
707
+
708
+ def somersd(x, y=None, alternative='two-sided'):
709
+ r"""Calculates Somers' D, an asymmetric measure of ordinal association.
710
+
711
+ Like Kendall's :math:`\tau`, Somers' :math:`D` is a measure of the
712
+ correspondence between two rankings. Both statistics consider the
713
+ difference between the number of concordant and discordant pairs in two
714
+ rankings :math:`X` and :math:`Y`, and both are normalized such that values
715
+ close to 1 indicate strong agreement and values close to -1 indicate
716
+ strong disagreement. They differ in how they are normalized. To show the
717
+ relationship, Somers' :math:`D` can be defined in terms of Kendall's
718
+ :math:`\tau_a`:
719
+
720
+ .. math::
721
+ D(Y|X) = \frac{\tau_a(X, Y)}{\tau_a(X, X)}
722
+
723
+ Suppose the first ranking :math:`X` has :math:`r` distinct ranks and the
724
+ second ranking :math:`Y` has :math:`s` distinct ranks. These two lists of
725
+ :math:`n` rankings can also be viewed as an :math:`r \times s` contingency
726
+ table in which element :math:`i, j` is the number of rank pairs with rank
727
+ :math:`i` in ranking :math:`X` and rank :math:`j` in ranking :math:`Y`.
728
+ Accordingly, `somersd` also allows the input data to be supplied as a
729
+ single, 2D contingency table instead of as two separate, 1D rankings.
730
+
731
+ Note that the definition of Somers' :math:`D` is asymmetric: in general,
732
+ :math:`D(Y|X) \neq D(X|Y)`. ``somersd(x, y)`` calculates Somers'
733
+ :math:`D(Y|X)`: the "row" variable :math:`X` is treated as an independent
734
+ variable, and the "column" variable :math:`Y` is dependent. For Somers'
735
+ :math:`D(X|Y)`, swap the input lists or transpose the input table.
736
+
737
+ Parameters
738
+ ----------
739
+ x : array_like
740
+ 1D array of rankings, treated as the (row) independent variable.
741
+ Alternatively, a 2D contingency table.
742
+ y : array_like, optional
743
+ If `x` is a 1D array of rankings, `y` is a 1D array of rankings of the
744
+ same length, treated as the (column) dependent variable.
745
+ If `x` is 2D, `y` is ignored.
746
+ alternative : {'two-sided', 'less', 'greater'}, optional
747
+ Defines the alternative hypothesis. Default is 'two-sided'.
748
+ The following options are available:
749
+ * 'two-sided': the rank correlation is nonzero
750
+ * 'less': the rank correlation is negative (less than zero)
751
+ * 'greater': the rank correlation is positive (greater than zero)
752
+
753
+ Returns
754
+ -------
755
+ res : SomersDResult
756
+ A `SomersDResult` object with the following fields:
757
+
758
+ statistic : float
759
+ The Somers' :math:`D` statistic.
760
+ pvalue : float
761
+ The p-value for a hypothesis test whose null
762
+ hypothesis is an absence of association, :math:`D=0`.
763
+ See notes for more information.
764
+ table : 2D array
765
+ The contingency table formed from rankings `x` and `y` (or the
766
+ provided contingency table, if `x` is a 2D array)
767
+
768
+ See Also
769
+ --------
770
+ kendalltau : Calculates Kendall's tau, another correlation measure.
771
+ weightedtau : Computes a weighted version of Kendall's tau.
772
+ spearmanr : Calculates a Spearman rank-order correlation coefficient.
773
+ pearsonr : Calculates a Pearson correlation coefficient.
774
+
775
+ Notes
776
+ -----
777
+ This function follows the contingency table approach of [2]_ and
778
+ [3]_. *p*-values are computed based on an asymptotic approximation of
779
+ the test statistic distribution under the null hypothesis :math:`D=0`.
780
+
781
+ Theoretically, hypothesis tests based on Kendall's :math:`\tau` and Somers'
782
+ :math:`D` should be identical.
783
+ However, the *p*-values returned by `kendalltau` are based
784
+ on the null hypothesis of *independence* between :math:`X` and :math:`Y`
785
+ (i.e. the population from which pairs in :math:`X` and :math:`Y` are
786
+ sampled contains equal numbers of all possible pairs), which is more
787
+ specific than the null hypothesis :math:`D=0` used here. If the null
788
+ hypothesis of independence is desired, it is acceptable to use the
789
+ *p*-value returned by `kendalltau` with the statistic returned by
790
+ `somersd` and vice versa. For more information, see [2]_.
791
+
792
+ Contingency tables are formatted according to the convention used by
793
+ SAS and R: the first ranking supplied (``x``) is the "row" variable, and
794
+ the second ranking supplied (``y``) is the "column" variable. This is
795
+ opposite the convention of Somers' original paper [1]_.
796
+
797
+ References
798
+ ----------
799
+ .. [1] Robert H. Somers, "A New Asymmetric Measure of Association for
800
+ Ordinal Variables", *American Sociological Review*, Vol. 27, No. 6,
801
+ pp. 799--811, 1962.
802
+
803
+ .. [2] Morton B. Brown and Jacqueline K. Benedetti, "Sampling Behavior of
804
+ Tests for Correlation in Two-Way Contingency Tables", *Journal of
805
+ the American Statistical Association* Vol. 72, No. 358, pp.
806
+ 309--315, 1977.
807
+
808
+ .. [3] SAS Institute, Inc., "The FREQ Procedure (Book Excerpt)",
809
+ *SAS/STAT 9.2 User's Guide, Second Edition*, SAS Publishing, 2009.
810
+
811
+ .. [4] Laerd Statistics, "Somers' d using SPSS Statistics", *SPSS
812
+ Statistics Tutorials and Statistical Guides*,
813
+ https://statistics.laerd.com/spss-tutorials/somers-d-using-spss-statistics.php,
814
+ Accessed July 31, 2020.
815
+
816
+ Examples
817
+ --------
818
+ We calculate Somers' D for the example given in [4]_, in which a hotel
819
+ chain owner seeks to determine the association between hotel room
820
+ cleanliness and customer satisfaction. The independent variable, hotel
821
+ room cleanliness, is ranked on an ordinal scale: "below average (1)",
822
+ "average (2)", or "above average (3)". The dependent variable, customer
823
+ satisfaction, is ranked on a second scale: "very dissatisfied (1)",
824
+ "moderately dissatisfied (2)", "neither dissatisfied nor satisfied (3)",
825
+ "moderately satisfied (4)", or "very satisfied (5)". 189 customers
826
+ respond to the survey, and the results are cast into a contingency table
827
+ with the hotel room cleanliness as the "row" variable and customer
828
+ satisfaction as the "column" variable.
829
+
830
+ +-----+-----+-----+-----+-----+-----+
831
+ | | (1) | (2) | (3) | (4) | (5) |
832
+ +=====+=====+=====+=====+=====+=====+
833
+ | (1) | 27 | 25 | 14 | 7 | 0 |
834
+ +-----+-----+-----+-----+-----+-----+
835
+ | (2) | 7 | 14 | 18 | 35 | 12 |
836
+ +-----+-----+-----+-----+-----+-----+
837
+ | (3) | 1 | 3 | 2 | 7 | 17 |
838
+ +-----+-----+-----+-----+-----+-----+
839
+
840
+ For example, 27 customers assigned their room a cleanliness ranking of
841
+ "below average (1)" and a corresponding satisfaction of "very
842
+ dissatisfied (1)". We perform the analysis as follows.
843
+
844
+ >>> from scipy.stats import somersd
845
+ >>> table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]]
846
+ >>> res = somersd(table)
847
+ >>> res.statistic
848
+ 0.6032766111513396
849
+ >>> res.pvalue
850
+ 1.0007091191074533e-27
851
+
852
+ The value of the Somers' D statistic is approximately 0.6, indicating
853
+ a positive correlation between room cleanliness and customer satisfaction
854
+ in the sample.
855
+ The *p*-value is very small, indicating a very small probability of
856
+ observing such an extreme value of the statistic under the null
857
+ hypothesis that the statistic of the entire population (from which
858
+ our sample of 189 customers is drawn) is zero. This supports the
859
+ alternative hypothesis that the true value of Somers' D for the population
860
+ is nonzero.
861
+
862
+ """
863
+ x, y = np.array(x), np.array(y)
864
+ if x.ndim == 1:
865
+ if x.size != y.size:
866
+ raise ValueError("Rankings must be of equal length.")
867
+ table = scipy.stats.contingency.crosstab(x, y)[1]
868
+ elif x.ndim == 2:
869
+ if np.any(x < 0):
870
+ raise ValueError("All elements of the contingency table must be "
871
+ "non-negative.")
872
+ if np.any(x != x.astype(int)):
873
+ raise ValueError("All elements of the contingency table must be "
874
+ "integer.")
875
+ if x.nonzero()[0].size < 2:
876
+ raise ValueError("At least two elements of the contingency table "
877
+ "must be nonzero.")
878
+ table = x
879
+ else:
880
+ raise ValueError("x must be either a 1D or 2D array")
881
+ # The table type is converted to a float to avoid an integer overflow
882
+ d, p = _somers_d(table.astype(float), alternative)
883
+
884
+ # add alias for consistency with other correlation functions
885
+ res = SomersDResult(d, p, table)
886
+ res.correlation = d
887
+ return res
888
+
889
+
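A short usage note on the asymmetry discussed in the docstring: transposing the contingency table switches the roles of the row and column variables, i.e. from :math:`D(Y|X)` to :math:`D(X|Y)`.

```python
import numpy as np
from scipy import stats

table = [[27, 25, 14, 7, 0], [7, 14, 18, 35, 12], [1, 3, 2, 7, 17]]
d_yx = stats.somersd(table).statistic                 # D(Y|X), as documented
d_xy = stats.somersd(np.transpose(table)).statistic   # D(X|Y)
# the two values generally differ, reflecting the asymmetry of Somers' D
```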
890
+ # This could be combined with `_all_partitions` in `_resampling.py`
891
+ def _all_partitions(nx, ny):
892
+ """
893
+ Partition a set of indices into two fixed-length sets in all possible ways
894
+
895
+ Partition a set of indices 0 ... nx + ny - 1 into two sets of length nx and
896
+ ny in all possible ways (ignoring order of elements).
897
+ """
898
+ z = np.arange(nx+ny)
899
+ for c in combinations(z, nx):
900
+ x = np.array(c)
901
+ mask = np.ones(nx+ny, bool)
902
+ mask[x] = False
903
+ y = z[mask]
904
+ yield x, y
905
+
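A tiny illustration of the generator (a private helper, shown only to make the enumeration concrete; assumes it is importable from `scipy.stats._hypotests`):

```python
from scipy.stats._hypotests import _all_partitions

# split indices {0, 1, 2} into a set of size 2 and a set of size 1
for x, y in _all_partitions(2, 1):
    print(x, y)
# [0 1] [2]
# [0 2] [1]
# [1 2] [0]
```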
906
+
907
+ def _compute_log_combinations(n):
908
+ """Compute all log combination of C(n, k)."""
909
+ gammaln_arr = gammaln(np.arange(n + 1) + 1)
910
+ return gammaln(n + 1) - gammaln_arr - gammaln_arr[::-1]
911
+
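A quick check that the returned values are the logs of the binomial coefficients, assuming the helper is importable from `scipy.stats._hypotests`:

```python
import numpy as np
from scipy.special import comb
from scipy.stats._hypotests import _compute_log_combinations

n = 10
log_c = _compute_log_combinations(n)           # log C(n, k) for k = 0, ..., n
assert np.allclose(np.exp(log_c), comb(n, np.arange(n + 1)))
```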
912
+
913
+ @dataclass
914
+ class BarnardExactResult:
915
+ statistic: float
916
+ pvalue: float
917
+
918
+
919
+ def barnard_exact(table, alternative="two-sided", pooled=True, n=32):
920
+ r"""Perform a Barnard exact test on a 2x2 contingency table.
921
+
922
+ Parameters
923
+ ----------
924
+ table : array_like of ints
925
+ A 2x2 contingency table. Elements should be non-negative integers.
926
+
927
+ alternative : {'two-sided', 'less', 'greater'}, optional
928
+ Defines the null and alternative hypotheses. Default is 'two-sided'.
929
+ Please see explanations in the Notes section below.
930
+
931
+ pooled : bool, optional
932
+ Whether to compute score statistic with pooled variance (as in
933
+ Student's t-test, for example) or unpooled variance (as in Welch's
934
+ t-test). Default is ``True``.
935
+
936
+ n : int, optional
937
+ Number of sampling points used in the construction of the sampling
938
+ method. Note that this argument will automatically be converted to
939
+ the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
940
+ select sample points. Default is 32. Must be positive. In most cases,
941
+ 32 points is enough to reach good precision. More points comes at
942
+ performance cost.
943
+
944
+ Returns
945
+ -------
946
+ ber : BarnardExactResult
947
+ A result object with the following attributes.
948
+
949
+ statistic : float
950
+ The Wald statistic with pooled or unpooled variance, depending
951
+ on the user choice of `pooled`.
952
+
953
+ pvalue : float
954
+ P-value, the probability of obtaining a distribution at least as
955
+ extreme as the one that was actually observed, assuming that the
956
+ null hypothesis is true.
957
+
958
+ See Also
959
+ --------
960
+ chi2_contingency : Chi-square test of independence of variables in a
961
+ contingency table.
962
+ fisher_exact : Fisher exact test on a 2x2 contingency table.
963
+ boschloo_exact : Boschloo's exact test on a 2x2 contingency table,
964
+ which is a uniformly more powerful alternative to Fisher's exact test.
965
+
966
+ Notes
967
+ -----
968
+ Barnard's test is an exact test used in the analysis of contingency
969
+ tables. It examines the association of two categorical variables, and
970
+ is a more powerful alternative than Fisher's exact test
971
+ for 2x2 contingency tables.
972
+
973
+ Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
974
+ where each column stores the binomial experiment, as in the example
975
+ below. Let's also define :math:`p_1, p_2` the theoretical binomial
976
+ probabilities for :math:`x_{11}` and :math:`x_{12}`. When using
977
+ Barnard exact test, we can assert three different null hypotheses :
978
+
979
+ - :math:`H_0 : p_1 \geq p_2` versus :math:`H_1 : p_1 < p_2`,
980
+ with `alternative` = "less"
981
+
982
+ - :math:`H_0 : p_1 \leq p_2` versus :math:`H_1 : p_1 > p_2`,
983
+ with `alternative` = "greater"
984
+
985
+ - :math:`H_0 : p_1 = p_2` versus :math:`H_1 : p_1 \neq p_2`,
986
+ with `alternative` = "two-sided" (default one)
987
+
988
+ In order to compute Barnard's exact test, we are using the Wald
989
+ statistic [3]_ with pooled or unpooled variance.
990
+ Under the default assumption that both variances are equal
991
+ (``pooled = True``), the statistic is computed as:
992
+
993
+ .. math::
994
+
995
+ T(X) = \frac{
996
+ \hat{p}_1 - \hat{p}_2
997
+ }{
998
+ \sqrt{
999
+ \hat{p}(1 - \hat{p})
1000
+ (\frac{1}{c_1} +
1001
+ \frac{1}{c_2})
1002
+ }
1003
+ }
1004
+
1005
+ with :math:`\hat{p}_1, \hat{p}_2` and :math:`\hat{p}` the estimator of
1006
+ :math:`p_1, p_2` and :math:`p`, the latter being the combined probability,
1007
+ given the assumption that :math:`p_1 = p_2`.
1008
+
1009
+ If this assumption is invalid (``pooled = False``), the statistic is:
1010
+
1011
+ .. math::
1012
+
1013
+ T(X) = \frac{
1014
+ \hat{p}_1 - \hat{p}_2
1015
+ }{
1016
+ \sqrt{
1017
+ \frac{\hat{p}_1 (1 - \hat{p}_1)}{c_1} +
1018
+ \frac{\hat{p}_2 (1 - \hat{p}_2)}{c_2}
1019
+ }
1020
+ }
1021
+
1022
+ The p-value is then computed as:
1023
+
1024
+ .. math::
1025
+
1026
+ \sum
1027
+ \binom{c_1}{x_{11}}
1028
+ \binom{c_2}{x_{12}}
1029
+ \pi^{x_{11} + x_{12}}
1030
+ (1 - \pi)^{t - x_{11} - x_{12}}
1031
+
1032
+ where the sum is over all 2x2 contingency tables :math:`X` such that:
1033
+ * :math:`T(X) \leq T(X_0)` when `alternative` = "less",
1034
+ * :math:`T(X) \geq T(X_0)` when `alternative` = "greater", or
1035
+ * :math:`T(X) \geq |T(X_0)|` when `alternative` = "two-sided".
1036
+ Above, :math:`c_1, c_2` are the sums of columns 1 and 2,
1037
+ and :math:`t` is the total (the sum of all four elements of the table).
1038
+
1039
+ The returned p-value is the maximum p-value taken over the nuisance
1040
+ parameter :math:`\pi`, where :math:`0 \leq \pi \leq 1`.
1041
+
1042
+ This function's complexity is :math:`O(n c_1 c_2)`, where `n` is the
1043
+ number of sample points.
1044
+
1045
+ References
1046
+ ----------
1047
+ .. [1] Barnard, G. A. "Significance Tests for 2x2 Tables". *Biometrika*.
1048
+ 34.1/2 (1947): 123-138. :doi:`dpgkg3`
1049
+
1050
+ .. [2] Mehta, Cyrus R., and Pralay Senchaudhuri. "Conditional versus
1051
+ unconditional exact tests for comparing two binomials."
1052
+ *Cytel Software Corporation* 675 (2003): 1-5.
1053
+
1054
+ .. [3] "Wald Test". *Wikipedia*. https://en.wikipedia.org/wiki/Wald_test
1055
+
1056
+ Examples
1057
+ --------
1058
+ An example use of Barnard's test is presented in [2]_.
1059
+
1060
+ Consider the following example of a vaccine efficacy study
1061
+ (Chan, 1998). In a randomized clinical trial of 30 subjects, 15 were
1062
+ inoculated with a recombinant DNA influenza vaccine and the other 15 were
1063
+ inoculated with a placebo. Twelve of the 15 subjects in the placebo
1064
+ group (80%) eventually became infected with influenza whereas for the
1065
+ vaccine group, only 7 of the 15 subjects (47%) became infected. The
1066
+ data are tabulated as a 2 x 2 table::
1067
+
1068
+ Vaccine Placebo
1069
+ Yes 7 12
1070
+ No 8 3
1071
+
1072
+ When working with statistical hypothesis testing, we usually use a
1073
+ threshold probability or significance level upon which we decide
1074
+ to reject the null hypothesis :math:`H_0`. Suppose we choose the common
1075
+ significance level of 5%.
1076
+
1077
+ Our alternative hypothesis is that the vaccine will lower the chance of
1078
+ becoming infected with the virus; that is, the probability :math:`p_1` of
1079
+ catching the virus with the vaccine will be *less than* the probability
1080
+ :math:`p_2` of catching the virus without the vaccine. Therefore, we call
1081
+ `barnard_exact` with the ``alternative="less"`` option:
1082
+
1083
+ >>> import scipy.stats as stats
1084
+ >>> res = stats.barnard_exact([[7, 12], [8, 3]], alternative="less")
1085
+ >>> res.statistic
1086
+ -1.894...
1087
+ >>> res.pvalue
1088
+ 0.03407...
1089
+
1090
+ Under the null hypothesis that the vaccine will not lower the chance of
1091
+ becoming infected, the probability of obtaining test results at least as
1092
+ extreme as the observed data is approximately 3.4%. Since this p-value is
1093
+ less than our chosen significance level, we have evidence to reject
1094
+ :math:`H_0` in favor of the alternative.
1095
+
1096
+ Suppose we had used Fisher's exact test instead:
1097
+
1098
+ >>> _, pvalue = stats.fisher_exact([[7, 12], [8, 3]], alternative="less")
1099
+ >>> pvalue
1100
+ 0.0640...
1101
+
1102
+ With the same threshold significance of 5%, we would not have been able
1103
+ to reject the null hypothesis in favor of the alternative. As stated in
1104
+ [2]_, Barnard's test is uniformly more powerful than Fisher's exact test
1105
+ because Barnard's test does not condition on any margin. Fisher's test
1106
+ should only be used when both sets of marginals are fixed.
1107
+
1108
+ """
1109
+ if n <= 0:
1110
+ raise ValueError(
1111
+ "Number of points `n` must be strictly positive, "
1112
+ f"found {n!r}"
1113
+ )
1114
+
1115
+ table = np.asarray(table, dtype=np.int64)
1116
+
1117
+ if not table.shape == (2, 2):
1118
+ raise ValueError("The input `table` must be of shape (2, 2).")
1119
+
1120
+ if np.any(table < 0):
1121
+ raise ValueError("All values in `table` must be nonnegative.")
1122
+
1123
+ if 0 in table.sum(axis=0):
1124
+ # If both values in column are zero, the p-value is 1 and
1125
+ # the score's statistic is NaN.
1126
+ return BarnardExactResult(np.nan, 1.0)
1127
+
1128
+ total_col_1, total_col_2 = table.sum(axis=0)
1129
+
1130
+ x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(-1, 1)
1131
+ x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(1, -1)
1132
+
1133
+ # We need to calculate the wald statistics for each combination of x1 and
1134
+ # x2.
1135
+ p1, p2 = x1 / total_col_1, x2 / total_col_2
1136
+
1137
+ if pooled:
1138
+ p = (x1 + x2) / (total_col_1 + total_col_2)
1139
+ variances = p * (1 - p) * (1 / total_col_1 + 1 / total_col_2)
1140
+ else:
1141
+ variances = p1 * (1 - p1) / total_col_1 + p2 * (1 - p2) / total_col_2
1142
+
1143
+ # To avoid warning when dividing by 0
1144
+ with np.errstate(divide="ignore", invalid="ignore"):
1145
+ wald_statistic = np.divide((p1 - p2), np.sqrt(variances))
1146
+
1147
+ wald_statistic[p1 == p2] = 0 # Removing NaN values
1148
+
1149
+ wald_stat_obs = wald_statistic[table[0, 0], table[0, 1]]
1150
+
1151
+ if alternative == "two-sided":
1152
+ index_arr = np.abs(wald_statistic) >= abs(wald_stat_obs)
1153
+ elif alternative == "less":
1154
+ index_arr = wald_statistic <= wald_stat_obs
1155
+ elif alternative == "greater":
1156
+ index_arr = wald_statistic >= wald_stat_obs
1157
+ else:
1158
+ msg = (
1159
+ "`alternative` should be one of {'two-sided', 'less', 'greater'},"
1160
+ f" found {alternative!r}"
1161
+ )
1162
+ raise ValueError(msg)
1163
+
1164
+ x1_sum_x2 = x1 + x2
1165
+
1166
+ x1_log_comb = _compute_log_combinations(total_col_1)
1167
+ x2_log_comb = _compute_log_combinations(total_col_2)
1168
+ x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]
1169
+
1170
+ result = shgo(
1171
+ _get_binomial_log_p_value_with_nuisance_param,
1172
+ args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
1173
+ bounds=((0, 1),),
1174
+ n=n,
1175
+ sampling_method="sobol",
1176
+ )
1177
+
1178
+ # result.fun is the negative log pvalue and therefore needs to be
1179
+ # changed before return
1180
+ p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
1181
+ return BarnardExactResult(wald_stat_obs, p_value)
1182
+
1183
+
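A hand-check of the pooled Wald statistic from the Notes above, using the vaccine table from the Examples; the point is only to make the formula concrete, and the expected value is the one already reported in the docstring:

```python
import numpy as np

x11, x12 = 7, 12            # infected counts in the vaccine / placebo columns
c1, c2 = 15, 15             # column totals
p1, p2 = x11 / c1, x12 / c2
p = (x11 + x12) / (c1 + c2)                               # pooled estimate
t_obs = (p1 - p2) / np.sqrt(p * (1 - p) * (1 / c1 + 1 / c2))
# t_obs is approximately -1.894, the statistic reported by barnard_exact above
```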
1184
+ @dataclass
1185
+ class BoschlooExactResult:
1186
+ statistic: float
1187
+ pvalue: float
1188
+
1189
+
1190
+ def boschloo_exact(table, alternative="two-sided", n=32):
1191
+ r"""Perform Boschloo's exact test on a 2x2 contingency table.
1192
+
1193
+ Parameters
1194
+ ----------
1195
+ table : array_like of ints
1196
+ A 2x2 contingency table. Elements should be non-negative integers.
1197
+
1198
+ alternative : {'two-sided', 'less', 'greater'}, optional
1199
+ Defines the null and alternative hypotheses. Default is 'two-sided'.
1200
+ Please see explanations in the Notes section below.
1201
+
1202
+ n : int, optional
1203
+ Number of sampling points used in the construction of the sampling
1204
+ method. Note that this argument will automatically be converted to
1205
+ the next higher power of 2 since `scipy.stats.qmc.Sobol` is used to
1206
+ select sample points. Default is 32. Must be positive. In most cases,
1207
+ 32 points is enough to reach good precision. More points come at a
1208
+ performance cost.
1209
+
1210
+ Returns
1211
+ -------
1212
+ ber : BoschlooExactResult
1213
+ A result object with the following attributes.
1214
+
1215
+ statistic : float
1216
+ The statistic used in Boschloo's test; that is, the p-value
1217
+ from Fisher's exact test.
1218
+
1219
+ pvalue : float
1220
+ P-value, the probability of obtaining a distribution at least as
1221
+ extreme as the one that was actually observed, assuming that the
1222
+ null hypothesis is true.
1223
+
1224
+ See Also
1225
+ --------
1226
+ chi2_contingency : Chi-square test of independence of variables in a
1227
+ contingency table.
1228
+ fisher_exact : Fisher exact test on a 2x2 contingency table.
1229
+ barnard_exact : Barnard's exact test, which is a more powerful alternative
1230
+ than Fisher's exact test for 2x2 contingency tables.
1231
+
1232
+ Notes
1233
+ -----
1234
+ Boschloo's test is an exact test used in the analysis of contingency
1235
+ tables. It examines the association of two categorical variables, and
1236
+ is a uniformly more powerful alternative to Fisher's exact test
1237
+ for 2x2 contingency tables.
1238
+
1239
+ Boschloo's exact test uses the p-value of Fisher's exact test as a
1240
+ statistic, and Boschloo's p-value is the probability under the null
1241
+ hypothesis of observing such an extreme value of this statistic.
1242
+
1243
+ Let's define :math:`X_0` a 2x2 matrix representing the observed sample,
1244
+ where each column stores the binomial experiment, as in the example
1245
+ below. Let's also define :math:`p_1, p_2` the theoretical binomial
1246
+ probabilities for :math:`x_{11}` and :math:`x_{12}`. When using
1247
+ Boschloo exact test, we can assert three different alternative hypotheses:
1248
+
1249
+ - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 < p_2`,
1250
+ with `alternative` = "less"
1251
+
1252
+ - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 > p_2`,
1253
+ with `alternative` = "greater"
1254
+
1255
+ - :math:`H_0 : p_1=p_2` versus :math:`H_1 : p_1 \neq p_2`,
1256
+ with `alternative` = "two-sided" (default)
1257
+
1258
+ There are multiple conventions for computing a two-sided p-value when the
1259
+ null distribution is asymmetric. Here, we apply the convention that the
1260
+ p-value of a two-sided test is twice the minimum of the p-values of the
1261
+ one-sided tests (clipped to 1.0). Note that `fisher_exact` follows a
1262
+ different convention, so for a given `table`, the statistic reported by
1263
+ `boschloo_exact` may differ from the p-value reported by `fisher_exact`
1264
+ when ``alternative='two-sided'``.
1265
+
1266
+ .. versionadded:: 1.7.0
1267
+
1268
+ References
1269
+ ----------
1270
+ .. [1] R.D. Boschloo. "Raised conditional level of significance for the
1271
+ 2 x 2-table when testing the equality of two probabilities",
1272
+ Statistica Neerlandica, 24(1), 1970
1273
+
1274
+ .. [2] "Boschloo's test", Wikipedia,
1275
+ https://en.wikipedia.org/wiki/Boschloo%27s_test
1276
+
1277
+ .. [3] Lise M. Saari et al. "Employee attitudes and job satisfaction",
1278
+ Human Resource Management, 43(4), 395-407, 2004,
1279
+ :doi:`10.1002/hrm.20032`.
1280
+
1281
+ Examples
1282
+ --------
1283
+ In the following example, we consider the article "Employee
1284
+ attitudes and job satisfaction" [3]_
1285
+ which reports the results of a survey from 63 scientists and 117 college
1286
+ professors. Of the 63 scientists, 31 said they were very satisfied with
1287
+ their jobs, whereas 74 of the college professors were very satisfied
1288
+ with their work. Is this significant evidence that college
1289
+ professors are happier with their work than scientists?
1290
+ The following table summarizes the data mentioned above::
1291
+
1292
+ college professors scientists
1293
+ Very Satisfied 74 31
1294
+ Dissatisfied 43 32
1295
+
1296
+ When working with statistical hypothesis testing, we usually use a
1297
+ threshold probability or significance level upon which we decide
1298
+ to reject the null hypothesis :math:`H_0`. Suppose we choose the common
1299
+ significance level of 5%.
1300
+
1301
+ Our alternative hypothesis is that college professors are truly more
1302
+ satisfied with their work than scientists. Therefore, we expect
1303
+ :math:`p_1` the proportion of very satisfied college professors to be
1304
+ greater than :math:`p_2`, the proportion of very satisfied scientists.
1305
+ We thus call `boschloo_exact` with the ``alternative="greater"`` option:
1306
+
1307
+ >>> import scipy.stats as stats
1308
+ >>> res = stats.boschloo_exact([[74, 31], [43, 32]], alternative="greater")
1309
+ >>> res.statistic
1310
+ 0.0483...
1311
+ >>> res.pvalue
1312
+ 0.0355...
1313
+
1314
+ Under the null hypothesis that scientists are happier in their work than
1315
+ college professors, the probability of obtaining test
1316
+ results at least as extreme as the observed data is approximately 3.55%.
1317
+ Since this p-value is less than our chosen significance level, we have
1318
+ evidence to reject :math:`H_0` in favor of the alternative hypothesis.
1319
+
1320
+ """
1321
+ hypergeom = distributions.hypergeom
1322
+
1323
+ if n <= 0:
1324
+ raise ValueError(
1325
+ "Number of points `n` must be strictly positive,"
1326
+ f" found {n!r}"
1327
+ )
1328
+
1329
+ table = np.asarray(table, dtype=np.int64)
1330
+
1331
+ if not table.shape == (2, 2):
1332
+ raise ValueError("The input `table` must be of shape (2, 2).")
1333
+
1334
+ if np.any(table < 0):
1335
+ raise ValueError("All values in `table` must be nonnegative.")
1336
+
1337
+ if 0 in table.sum(axis=0):
1338
+ # If both values in column are zero, the p-value is 1 and
1339
+ # the score's statistic is NaN.
1340
+ return BoschlooExactResult(np.nan, np.nan)
1341
+
1342
+ total_col_1, total_col_2 = table.sum(axis=0)
1343
+ total = total_col_1 + total_col_2
1344
+ x1 = np.arange(total_col_1 + 1, dtype=np.int64).reshape(1, -1)
1345
+ x2 = np.arange(total_col_2 + 1, dtype=np.int64).reshape(-1, 1)
1346
+ x1_sum_x2 = x1 + x2
1347
+
1348
+ if alternative == 'less':
1349
+ pvalues = hypergeom.cdf(x1, total, x1_sum_x2, total_col_1).T
1350
+ elif alternative == 'greater':
1351
+ # Same formula as the 'less' case, but with the second column.
1352
+ pvalues = hypergeom.cdf(x2, total, x1_sum_x2, total_col_2).T
1353
+ elif alternative == 'two-sided':
1354
+ boschloo_less = boschloo_exact(table, alternative="less", n=n)
1355
+ boschloo_greater = boschloo_exact(table, alternative="greater", n=n)
1356
+
1357
+ res = (
1358
+ boschloo_less if boschloo_less.pvalue < boschloo_greater.pvalue
1359
+ else boschloo_greater
1360
+ )
1361
+
1362
+ # Two-sided p-value is defined as twice the minimum of the one-sided
1363
+ # p-values
1364
+ pvalue = np.clip(2 * res.pvalue, a_min=0, a_max=1)
1365
+ return BoschlooExactResult(res.statistic, pvalue)
1366
+ else:
1367
+ msg = (
1368
+ f"`alternative` should be one of {'two-sided', 'less', 'greater'},"
1369
+ f" found {alternative!r}"
1370
+ )
1371
+ raise ValueError(msg)
1372
+
1373
+ fisher_stat = pvalues[table[0, 0], table[0, 1]]
1374
+
1375
+ # fisher_stat * (1+1e-13) guards us from small numerical error. It is
1376
+ # equivalent to np.isclose with relative tol of 1e-13 and absolute tol of 0
1377
+ # For more thorough explanations, see gh-14178
1378
+ index_arr = pvalues <= fisher_stat * (1+1e-13)
1379
+
1380
+ x1, x2, x1_sum_x2 = x1.T, x2.T, x1_sum_x2.T
1381
+ x1_log_comb = _compute_log_combinations(total_col_1)
1382
+ x2_log_comb = _compute_log_combinations(total_col_2)
1383
+ x1_sum_x2_log_comb = x1_log_comb[x1] + x2_log_comb[x2]
1384
+
1385
+ result = shgo(
1386
+ _get_binomial_log_p_value_with_nuisance_param,
1387
+ args=(x1_sum_x2, x1_sum_x2_log_comb, index_arr),
1388
+ bounds=((0, 1),),
1389
+ n=n,
1390
+ sampling_method="sobol",
1391
+ )
1392
+
1393
+ # result.fun is the negative log pvalue and therefore needs to be
1394
+ # changed before return
1395
+ p_value = np.clip(np.exp(-result.fun), a_min=0, a_max=1)
1396
+ return BoschlooExactResult(fisher_stat, p_value)
1397
+
1398
+
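Because the docstring defines the statistic as the p-value of Fisher's exact test for the observed table, the two public functions should agree on that quantity (up to floating-point rounding); a small sketch using the survey table from the Examples:

```python
from scipy import stats

table = [[74, 31], [43, 32]]
res = stats.boschloo_exact(table, alternative="greater")
_, fisher_p = stats.fisher_exact(table, alternative="greater")
# res.statistic and fisher_p are expected to agree: Boschloo's statistic is
# the one-sided Fisher exact p-value of the observed table
```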
1399
+ def _get_binomial_log_p_value_with_nuisance_param(
1400
+ nuisance_param, x1_sum_x2, x1_sum_x2_log_comb, index_arr
1401
+ ):
1402
+ r"""
1403
+ Compute the log p-value with respect to a nuisance parameter considering
1404
+ a 2x2 sample space.
1405
+
1406
+ Parameters
1407
+ ----------
1408
+ nuisance_param : float
1409
+ nuisance parameter used in the computation of the maximisation of
1410
+ the p-value. Must be between 0 and 1
1411
+
1412
+ x1_sum_x2 : ndarray
1413
+ Sum of x1 and x2 inside barnard_exact
1414
+
1415
+ x1_sum_x2_log_comb : ndarray
1416
+ sum of the log combination of x1 and x2
1417
+
1418
+ index_arr : ndarray of boolean
1419
+
1420
+ Returns
1421
+ -------
1422
+ p_value : float
1423
+ The negative log p-value evaluated at the given nuisance parameter;
1424
+ minimized over [0, 1] by `shgo` to obtain the maximum p-value.
1425
+
1426
+ Notes
1427
+ -----
1428
+
1429
+ Both Barnard's test and Boschloo's test iterate over a nuisance parameter
1430
+ :math:`\pi \in [0, 1]` to find the maximum p-value. To locate this
1431
+ maximum, this function returns the negative log p-value with respect to the
1432
+ nuisance parameter passed as `nuisance_param`. This negative log p-value is
1433
+ then used in `shgo` to find the minimum negative log p-value, which is our
1434
+ maximum p-value.
1435
+
1436
+ Also, to compute the different combination used in the
1437
+ p-values' computation formula, this function uses `gammaln` which is
1438
+ more tolerant of large values than `scipy.special.comb`. `gammaln` gives
1439
+ the log of the combination. For a small loss of precision, performance is
1440
+ improved considerably.
1441
+ """
1442
+ t1, t2 = x1_sum_x2.shape
1443
+ n = t1 + t2 - 2
1444
+ with np.errstate(divide="ignore", invalid="ignore"):
1445
+ log_nuisance = np.log(
1446
+ nuisance_param,
1447
+ out=np.zeros_like(nuisance_param),
1448
+ where=nuisance_param >= 0,
1449
+ )
1450
+ log_1_minus_nuisance = np.log(
1451
+ 1 - nuisance_param,
1452
+ out=np.zeros_like(nuisance_param),
1453
+ where=1 - nuisance_param >= 0,
1454
+ )
1455
+
1456
+ nuisance_power_x1_x2 = log_nuisance * x1_sum_x2
1457
+ nuisance_power_x1_x2[(x1_sum_x2 == 0)[:, :]] = 0
1458
+
1459
+ nuisance_power_n_minus_x1_x2 = log_1_minus_nuisance * (n - x1_sum_x2)
1460
+ nuisance_power_n_minus_x1_x2[(x1_sum_x2 == n)[:, :]] = 0
1461
+
1462
+ tmp_log_values_arr = (
1463
+ x1_sum_x2_log_comb
1464
+ + nuisance_power_x1_x2
1465
+ + nuisance_power_n_minus_x1_x2
1466
+ )
1467
+
1468
+ tmp_values_from_index = tmp_log_values_arr[index_arr]
1469
+
1470
+ # To avoid dividing by zero in log function and getting inf value,
1471
+ # values are centered according to the max
1472
+ max_value = tmp_values_from_index.max()
1473
+
1474
+ # To have better result's precision, the log pvalue is taken here.
1475
+ # Indeed, pvalue is included inside [0, 1] interval. Passing the
1476
+ # pvalue to log makes the interval a lot bigger ([-inf, 0]), and thus
1477
+ # help us to achieve better precision
1478
+ with np.errstate(divide="ignore", invalid="ignore"):
1479
+ log_probs = np.exp(tmp_values_from_index - max_value).sum()
1480
+ log_pvalue = max_value + np.log(
1481
+ log_probs,
1482
+ out=np.full_like(log_probs, -np.inf),
1483
+ where=log_probs > 0,
1484
+ )
1485
+
1486
+ # Since shgo find the minima, minus log pvalue is returned
1487
+ return -log_pvalue
1488
+
1489
+
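To make the "maximize the p-value over the nuisance parameter by minimizing its negative log" idea concrete, here is an illustrative sketch with a smooth stand-in profile instead of the real tables (the real callers pass the precomputed arrays above through `args`):

```python
import numpy as np
from scipy.optimize import shgo

def neg_log_profile(v):
    # stand-in for the p-value profile over the nuisance parameter pi in [0, 1];
    # not the real Barnard/Boschloo objective, just the same optimization shape
    pi = v[0]
    return -np.log(pi * (1 - pi) + 1e-300)

res = shgo(neg_log_profile, bounds=((0, 1),), n=32, sampling_method="sobol")
p_max = np.exp(-res.fun)   # maximum of the stand-in profile (0.25 at pi = 0.5)
```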
1490
+ def _pval_cvm_2samp_exact(s, m, n):
1491
+ """
1492
+ Compute the exact p-value of the Cramer-von Mises two-sample test
1493
+ for a given value s of the test statistic.
1494
+ m and n are the sizes of the samples.
1495
+
1496
+ [1] Y. Xiao, A. Gordon, and A. Yakovlev, "A C++ Program for
1497
+ the Cramér-Von Mises Two-Sample Test", J. Stat. Soft.,
1498
+ vol. 17, no. 8, pp. 1-15, Dec. 2006.
1499
+ [2] T. W. Anderson "On the Distribution of the Two-Sample Cramer-von Mises
1500
+ Criterion," The Annals of Mathematical Statistics, Ann. Math. Statist.
1501
+ 33(3), 1148-1159, (September, 1962)
1502
+ """
1503
+
1504
+ # [1, p. 3]
1505
+ lcm = np.lcm(m, n)
1506
+ # [1, p. 4], below eq. 3
1507
+ a = lcm // m
1508
+ b = lcm // n
1509
+ # Combine Eq. 9 in [2] with Eq. 2 in [1] and solve for $\zeta$
1510
+ # Hint: `s` is $U$ in [2], and $T_2$ in [1] is $T$ in [2]
1511
+ mn = m * n
1512
+ zeta = lcm ** 2 * (m + n) * (6 * s - mn * (4 * mn - 1)) // (6 * mn ** 2)
1513
+
1514
+ # bound maximum value that may appear in `gs` (remember both rows!)
1515
+ zeta_bound = lcm**2 * (m + n) # bound elements in row 1
1516
+ combinations = comb(m + n, m) # sum of row 2
1517
+ max_gs = max(zeta_bound, combinations)
1518
+ dtype = np.min_scalar_type(max_gs)
1519
+
1520
+ # the frequency table of $g_{u, v}^+$ defined in [1, p. 6]
1521
+ gs = ([np.array([[0], [1]], dtype=dtype)]
1522
+ + [np.empty((2, 0), dtype=dtype) for _ in range(m)])
1523
+ for u in range(n + 1):
1524
+ next_gs = []
1525
+ tmp = np.empty((2, 0), dtype=dtype)
1526
+ for v, g in enumerate(gs):
1527
+ # Calculate g recursively with eq. 11 in [1]. Even though it
1528
+ # doesn't look like it, this also does 12/13 (all of Algorithm 1).
1529
+ vi, i0, i1 = np.intersect1d(tmp[0], g[0], return_indices=True)
1530
+ tmp = np.concatenate([
1531
+ np.stack([vi, tmp[1, i0] + g[1, i1]]),
1532
+ np.delete(tmp, i0, 1),
1533
+ np.delete(g, i1, 1)
1534
+ ], 1)
1535
+ res = (a * v - b * u) ** 2
1536
+ tmp[0] += res.astype(dtype)
1537
+ next_gs.append(tmp)
1538
+ gs = next_gs
1539
+ value, freq = gs[m]
1540
+ return np.float64(np.sum(freq[value >= zeta]) / combinations)
1541
+
1542
+
1543
+ @_axis_nan_policy_factory(CramerVonMisesResult, n_samples=2, too_small=1,
1544
+ result_to_tuple=_cvm_result_to_tuple)
1545
+ def cramervonmises_2samp(x, y, method='auto'):
1546
+ """Perform the two-sample Cramér-von Mises test for goodness of fit.
1547
+
1548
+ This is the two-sample version of the Cramér-von Mises test ([1]_):
1549
+ for two independent samples :math:`X_1, ..., X_n` and
1550
+ :math:`Y_1, ..., Y_m`, the null hypothesis is that the samples
1551
+ come from the same (unspecified) continuous distribution.
1552
+
1553
+ Parameters
1554
+ ----------
1555
+ x : array_like
1556
+ A 1-D array of observed values of the random variables :math:`X_i`.
1557
+ y : array_like
1558
+ A 1-D array of observed values of the random variables :math:`Y_i`.
1559
+ method : {'auto', 'asymptotic', 'exact'}, optional
1560
+ The method used to compute the p-value, see Notes for details.
1561
+ The default is 'auto'.
1562
+
1563
+ Returns
1564
+ -------
1565
+ res : object with attributes
1566
+ statistic : float
1567
+ Cramér-von Mises statistic.
1568
+ pvalue : float
1569
+ The p-value.
1570
+
1571
+ See Also
1572
+ --------
1573
+ cramervonmises, anderson_ksamp, epps_singleton_2samp, ks_2samp
1574
+
1575
+ Notes
1576
+ -----
1577
+ .. versionadded:: 1.7.0
1578
+
1579
+ The statistic is computed according to equation 9 in [2]_. The
1580
+ calculation of the p-value depends on the keyword `method`:
1581
+
1582
+ - ``asymptotic``: The p-value is approximated by using the limiting
1583
+ distribution of the test statistic.
1584
+ - ``exact``: The exact p-value is computed by enumerating all
1585
+ possible combinations of the test statistic, see [2]_.
1586
+
1587
+ If ``method='auto'``, the exact approach is used
1588
+ if both samples contain no more than 20 observations;
1589
+ otherwise the asymptotic distribution is used.
1590
+
1591
+ If the underlying distribution is not continuous, the p-value is likely to
1592
+ be conservative (Section 6.2 in [3]_). When ranking the data to compute
1593
+ the test statistic, midranks are used if there are ties.
1594
+
1595
+ References
1596
+ ----------
1597
+ .. [1] https://en.wikipedia.org/wiki/Cramer-von_Mises_criterion
1598
+ .. [2] Anderson, T.W. (1962). On the distribution of the two-sample
1599
+ Cramer-von-Mises criterion. The Annals of Mathematical
1600
+ Statistics, pp. 1148-1159.
1601
+ .. [3] Conover, W.J., Practical Nonparametric Statistics, 1971.
1602
+
1603
+ Examples
1604
+ --------
1605
+
1606
+ Suppose we wish to test whether two samples generated by
1607
+ ``scipy.stats.norm.rvs`` have the same distribution. We choose a
1608
+ significance level of alpha=0.05.
1609
+
1610
+ >>> import numpy as np
1611
+ >>> from scipy import stats
1612
+ >>> rng = np.random.default_rng()
1613
+ >>> x = stats.norm.rvs(size=100, random_state=rng)
1614
+ >>> y = stats.norm.rvs(size=70, random_state=rng)
1615
+ >>> res = stats.cramervonmises_2samp(x, y)
1616
+ >>> res.statistic, res.pvalue
1617
+ (0.29376470588235293, 0.1412873014573014)
1618
+
1619
+ The p-value exceeds our chosen significance level, so we do not
1620
+ reject the null hypothesis that the observed samples are drawn from the
1621
+ same distribution.
1622
+
1623
+ For small sample sizes, one can compute the exact p-values:
1624
+
1625
+ >>> x = stats.norm.rvs(size=7, random_state=rng)
1626
+ >>> y = stats.t.rvs(df=2, size=6, random_state=rng)
1627
+ >>> res = stats.cramervonmises_2samp(x, y, method='exact')
1628
+ >>> res.statistic, res.pvalue
1629
+ (0.197802197802198, 0.31643356643356646)
1630
+
1631
+ The p-value based on the asymptotic distribution is a good approximation
1632
+ even though the sample size is small.
1633
+
1634
+ >>> res = stats.cramervonmises_2samp(x, y, method='asymptotic')
1635
+ >>> res.statistic, res.pvalue
1636
+ (0.197802197802198, 0.2966041181527128)
1637
+
1638
+ Independent of the method, one would not reject the null hypothesis at the
1639
+ chosen significance level in this example.
1640
+
1641
+ """
1642
+ xa = np.sort(np.asarray(x))
1643
+ ya = np.sort(np.asarray(y))
1644
+
1645
+ if xa.size <= 1 or ya.size <= 1:
1646
+ raise ValueError('x and y must contain at least two observations.')
1647
+ if method not in ['auto', 'exact', 'asymptotic']:
1648
+ raise ValueError('method must be either auto, exact or asymptotic.')
1649
+
1650
+ nx = len(xa)
1651
+ ny = len(ya)
1652
+
1653
+ if method == 'auto':
1654
+ if max(nx, ny) > 20:
1655
+ method = 'asymptotic'
1656
+ else:
1657
+ method = 'exact'
1658
+
1659
+ # get ranks of x and y in the pooled sample
1660
+ z = np.concatenate([xa, ya])
1661
+ # in case of ties, use midrank (see [1])
1662
+ r = scipy.stats.rankdata(z, method='average')
1663
+ rx = r[:nx]
1664
+ ry = r[nx:]
1665
+
1666
+ # compute U (eq. 10 in [2])
1667
+ u = nx * np.sum((rx - np.arange(1, nx+1))**2)
1668
+ u += ny * np.sum((ry - np.arange(1, ny+1))**2)
1669
+
1670
+ # compute T (eq. 9 in [2])
1671
+ k, N = nx*ny, nx + ny
1672
+ t = u / (k*N) - (4*k - 1)/(6*N)
1673
+
1674
+ if method == 'exact':
1675
+ p = _pval_cvm_2samp_exact(u, nx, ny)
1676
+ else:
1677
+ # compute expected value and variance of T (eq. 11 and 14 in [2])
1678
+ et = (1 + 1/N)/6
1679
+ vt = (N+1) * (4*k*N - 3*(nx**2 + ny**2) - 2*k)
1680
+ vt = vt / (45 * N**2 * 4 * k)
1681
+
1682
+ # compute the normalized statistic (eq. 15 in [2])
1683
+ tn = 1/6 + (t - et) / np.sqrt(45 * vt)
1684
+
1685
+ # approximate distribution of tn with limiting distribution
1686
+ # of the one-sample test statistic
1687
+ # if tn < 0.003, the _cdf_cvm_inf(tn) < 1.28*1e-18, return 1.0 directly
1688
+ if tn < 0.003:
1689
+ p = 1.0
1690
+ else:
1691
+ p = max(0, 1. - _cdf_cvm_inf(tn))
1692
+
1693
+ return CramerVonMisesResult(statistic=t, pvalue=p)
1694
+
1695
+
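Because the statistic is computed from (mid)ranks of the pooled sample, it is invariant under strictly increasing transformations of the data; a small sketch that checks this with public APIs:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.normal(size=15)
y = rng.normal(size=12)

res1 = stats.cramervonmises_2samp(x, y)
res2 = stats.cramervonmises_2samp(np.exp(x), np.exp(y))   # monotone transform
assert np.isclose(res1.statistic, res2.statistic)          # ranks are unchanged
```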
1696
+ class TukeyHSDResult:
1697
+ """Result of `scipy.stats.tukey_hsd`.
1698
+
1699
+ Attributes
1700
+ ----------
1701
+ statistic : float ndarray
1702
+ The computed statistic of the test for each comparison. The element
1703
+ at index ``(i, j)`` is the statistic for the comparison between groups
1704
+ ``i`` and ``j``.
1705
+ pvalue : float ndarray
1706
+ The associated p-value from the studentized range distribution. The
1707
+ element at index ``(i, j)`` is the p-value for the comparison
1708
+ between groups ``i`` and ``j``.
1709
+
1710
+ Notes
1711
+ -----
1712
+ The string representation of this object displays the most recently
1713
+ calculated confidence interval, and if none have been previously
1714
+ calculated, it will evaluate ``confidence_interval()``.
1715
+
1716
+ References
1717
+ ----------
1718
+ .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
1719
+ Method."
1720
+ https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1721
+ 28 November 2020.
1722
+ """
1723
+
1724
+ def __init__(self, statistic, pvalue, _nobs, _ntreatments, _stand_err):
1725
+ self.statistic = statistic
1726
+ self.pvalue = pvalue
1727
+ self._ntreatments = _ntreatments
1728
+ self._nobs = _nobs
1729
+ self._stand_err = _stand_err
1730
+ self._ci = None
1731
+ self._ci_cl = None
1732
+
1733
+ def __str__(self):
1734
+ # Note: `__str__` prints the confidence intervals from the most
1735
+ # recent call to `confidence_interval`. If it has not been called,
1736
+ # it will be called with the default CL of .95.
1737
+ if self._ci is None:
1738
+ self.confidence_interval(confidence_level=.95)
1739
+ s = ("Tukey's HSD Pairwise Group Comparisons"
1740
+ f" ({self._ci_cl*100:.1f}% Confidence Interval)\n")
1741
+ s += "Comparison Statistic p-value Lower CI Upper CI\n"
1742
+ for i in range(self.pvalue.shape[0]):
1743
+ for j in range(self.pvalue.shape[0]):
1744
+ if i != j:
1745
+ s += (f" ({i} - {j}) {self.statistic[i, j]:>10.3f}"
1746
+ f"{self.pvalue[i, j]:>10.3f}"
1747
+ f"{self._ci.low[i, j]:>10.3f}"
1748
+ f"{self._ci.high[i, j]:>10.3f}\n")
1749
+ return s
1750
+
1751
+ def confidence_interval(self, confidence_level=.95):
1752
+ """Compute the confidence interval for the specified confidence level.
1753
+
1754
+ Parameters
1755
+ ----------
1756
+ confidence_level : float, optional
1757
+ Confidence level for the computed confidence interval
1758
+ of the estimated proportion. Default is .95.
1759
+
1760
+ Returns
1761
+ -------
1762
+ ci : ``ConfidenceInterval`` object
1763
+ The object has attributes ``low`` and ``high`` that hold the
1764
+ lower and upper bounds of the confidence intervals for each
1765
+ comparison. The high and low values are accessible for each
1766
+ comparison at index ``(i, j)`` between groups ``i`` and ``j``.
1767
+
1768
+ References
1769
+ ----------
1770
+ .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1.
1771
+ Tukey's Method."
1772
+ https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1773
+ 28 November 2020.
1774
+
1775
+ Examples
1776
+ --------
1777
+ >>> from scipy.stats import tukey_hsd
1778
+ >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1779
+ >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1780
+ >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1781
+ >>> result = tukey_hsd(group0, group1, group2)
1782
+ >>> ci = result.confidence_interval()
1783
+ >>> ci.low
1784
+ array([[-3.649159, -8.249159, -3.909159],
1785
+ [ 0.950841, -3.649159, 0.690841],
1786
+ [-3.389159, -7.989159, -3.649159]])
1787
+ >>> ci.high
1788
+ array([[ 3.649159, -0.950841, 3.389159],
1789
+ [ 8.249159, 3.649159, 7.989159],
1790
+ [ 3.909159, -0.690841, 3.649159]])
1791
+ """
1792
+ # check to see if the supplied confidence level matches that of the
1793
+ # previously computed CI.
1794
+ if (self._ci is not None and self._ci_cl is not None and
1795
+ confidence_level == self._ci_cl):
1796
+ return self._ci
1797
+
1798
+ if not 0 < confidence_level < 1:
1799
+ raise ValueError("Confidence level must be between 0 and 1.")
1800
+ # determine the critical value of the studentized range using the
1801
+ # appropriate confidence level, number of treatments, and degrees
1802
+ # of freedom as determined by the number of data less the number of
1803
+ # treatments. ("Confidence limits for Tukey's method")[1]. Note that
1804
+ # in the cases of unequal sample sizes there will be a criterion for
1805
+ # each group comparison.
1806
+ params = (confidence_level, self._nobs, self._ntreatments - self._nobs)
1807
+ srd = distributions.studentized_range.ppf(*params)
1808
+ # also called maximum critical value, the Tukey criterion is the
1809
+ # studentized range critical value * the square root of mean square
1810
+ # error over the sample size.
1811
+ tukey_criterion = srd * self._stand_err
1812
+ # the confidence levels are determined by the
1813
+ # `mean_differences` +- `tukey_criterion`
1814
+ upper_conf = self.statistic + tukey_criterion
1815
+ lower_conf = self.statistic - tukey_criterion
1816
+ self._ci = ConfidenceInterval(low=lower_conf, high=upper_conf)
1817
+ self._ci_cl = confidence_level
1818
+ return self._ci
1819
+
1820
+
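A small usage sketch (data reused from the docstring example) showing that, since the bounds are the statistic plus or minus the Tukey criterion, the returned interval is symmetric about the pairwise differences in sample means:

```python
import numpy as np
from scipy.stats import tukey_hsd

group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
res = tukey_hsd(group0, group1, group2)
ci = res.confidence_interval(confidence_level=0.99)
# midpoint of each interval equals the corresponding pairwise mean difference
assert np.allclose((ci.low + ci.high) / 2, res.statistic)
```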
1821
+ def _tukey_hsd_iv(args):
1822
+ if (len(args)) < 2:
1823
+ raise ValueError("There must be more than 1 treatment.")
1824
+ args = [np.asarray(arg) for arg in args]
1825
+ for arg in args:
1826
+ if arg.ndim != 1:
1827
+ raise ValueError("Input samples must be one-dimensional.")
1828
+ if arg.size <= 1:
1829
+ raise ValueError("Input sample size must be greater than one.")
1830
+ if np.isinf(arg).any():
1831
+ raise ValueError("Input samples must be finite.")
1832
+ return args
1833
+
1834
+
1835
+ def tukey_hsd(*args):
1836
+ """Perform Tukey's HSD test for equality of means over multiple treatments.
1837
+
1838
+ Tukey's honestly significant difference (HSD) test performs pairwise
1839
+ comparison of means for a set of samples. Whereas ANOVA (e.g. `f_oneway`)
1840
+ assesses whether the true means underlying each sample are identical,
1841
+ Tukey's HSD is a post hoc test used to compare the mean of each sample
1842
+ to the mean of each other sample.
1843
+
1844
+ The null hypothesis is that the distributions underlying the samples all
1845
+ have the same mean. The test statistic, which is computed for every
1846
+ possible pairing of samples, is simply the difference between the sample
1847
+ means. For each pair, the p-value is the probability under the null
1848
+ hypothesis (and other assumptions; see notes) of observing such an extreme
1849
+ value of the statistic, considering that many pairwise comparisons are
1850
+ being performed. Confidence intervals for the difference between each pair
1851
+ of means are also available.
1852
+
1853
+ Parameters
1854
+ ----------
1855
+ sample1, sample2, ... : array_like
1856
+ The sample measurements for each group. There must be at least
1857
+ two arguments.
1858
+
1859
+ Returns
1860
+ -------
1861
+ result : `~scipy.stats._result_classes.TukeyHSDResult` instance
1862
+ The return value is an object with the following attributes:
1863
+
1864
+ statistic : float ndarray
1865
+ The computed statistic of the test for each comparison. The element
1866
+ at index ``(i, j)`` is the statistic for the comparison between
1867
+ groups ``i`` and ``j``.
1868
+ pvalue : float ndarray
1869
+ The computed p-value of the test for each comparison. The element
1870
+ at index ``(i, j)`` is the p-value for the comparison between
1871
+ groups ``i`` and ``j``.
1872
+
1873
+ The object has the following methods:
1874
+
1875
+ confidence_interval(confidence_level=0.95):
1876
+ Compute the confidence interval for the specified confidence level.
1877
+
1878
+ See Also
1879
+ --------
1880
+ dunnett : performs comparison of means against a control group.
1881
+
1882
+ Notes
1883
+ -----
1884
+ The use of this test relies on several assumptions.
1885
+
1886
+ 1. The observations are independent within and among groups.
1887
+ 2. The observations within each group are normally distributed.
1888
+ 3. The distributions from which the samples are drawn have the same finite
1889
+ variance.
1890
+
1891
+ The original formulation of the test was for samples of equal size [6]_.
1892
+ In case of unequal sample sizes, the test uses the Tukey-Kramer method
1893
+ [4]_.
1894
+
1895
+ References
1896
+ ----------
1897
+ .. [1] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.7.1. Tukey's
1898
+ Method."
1899
+ https://www.itl.nist.gov/div898/handbook/prc/section4/prc471.htm,
1900
+ 28 November 2020.
1901
+ .. [2] Abdi, Herve & Williams, Lynne. (2021). "Tukey's Honestly Significant
1902
+ Difference (HSD) Test."
1903
+ https://personal.utdallas.edu/~herve/abdi-HSD2010-pretty.pdf
1904
+ .. [3] "One-Way ANOVA Using SAS PROC ANOVA & PROC GLM." SAS
1905
+ Tutorials, 2007, www.stattutorials.com/SAS/TUTORIAL-PROC-GLM.htm.
1906
+ .. [4] Kramer, Clyde Young. "Extension of Multiple Range Tests to Group
1907
+ Means with Unequal Numbers of Replications." Biometrics, vol. 12,
1908
+ no. 3, 1956, pp. 307-310. JSTOR, www.jstor.org/stable/3001469.
1909
+ Accessed 25 May 2021.
1910
+ .. [5] NIST/SEMATECH e-Handbook of Statistical Methods, "7.4.3.3.
1911
+ The ANOVA table and tests of hypotheses about means"
1912
+ https://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm,
1913
+ 2 June 2021.
1914
+ .. [6] Tukey, John W. "Comparing Individual Means in the Analysis of
1915
+ Variance." Biometrics, vol. 5, no. 2, 1949, pp. 99-114. JSTOR,
1916
+ www.jstor.org/stable/3001913. Accessed 14 June 2021.
1917
+
1918
+
1919
+ Examples
1920
+ --------
1921
+ Here are some data comparing the time to relief of three brands of
1922
+ headache medicine, reported in minutes. Data adapted from [3]_.
1923
+
1924
+ >>> import numpy as np
1925
+ >>> from scipy.stats import tukey_hsd
1926
+ >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1927
+ >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1928
+ >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1929
+
1930
+ We would like to see if the means between any of the groups are
1931
+ significantly different. First, visually examine a box and whisker plot.
1932
+
1933
+ >>> import matplotlib.pyplot as plt
1934
+ >>> fig, ax = plt.subplots(1, 1)
1935
+ >>> ax.boxplot([group0, group1, group2])
1936
+ >>> ax.set_xticklabels(["group0", "group1", "group2"]) # doctest: +SKIP
1937
+ >>> ax.set_ylabel("mean") # doctest: +SKIP
1938
+ >>> plt.show()
1939
+
1940
+ From the box and whisker plot, we can see some overlap between the
1941
+ distributions of the groups, but we can apply the ``tukey_hsd`` test to
1942
+ determine whether the differences between the means are significant. We
1943
+ set a significance level of .05 for rejecting the null hypothesis.
1944
+
1945
+ >>> res = tukey_hsd(group0, group1, group2)
1946
+ >>> print(res)
1947
+ Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
1948
+ Comparison Statistic p-value Lower CI Upper CI
1949
+ (0 - 1) -4.600 0.014 -8.249 -0.951
1950
+ (0 - 2) -0.260 0.980 -3.909 3.389
1951
+ (1 - 0) 4.600 0.014 0.951 8.249
1952
+ (1 - 2) 4.340 0.020 0.691 7.989
1953
+ (2 - 0) 0.260 0.980 -3.389 3.909
1954
+ (2 - 1) -4.340 0.020 -7.989 -0.691
1955
+
1956
+ The null hypothesis is that each group has the same mean. The p-values for
1957
+ the comparisons between ``group0`` and ``group1`` as well as ``group1`` and
1958
+ ``group2`` do not exceed .05, so we reject the null hypothesis that they
1959
+ have the same means. The p-value of the comparison between ``group0``
1960
+ and ``group2`` exceeds .05, so we do not reject the null hypothesis; there
1961
+ is no significant difference between their means at this level.
1962
+
1963
+ We can also compute the confidence interval associated with our chosen
1964
+ confidence level.
1965
+
1966
+ >>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
1967
+ >>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
1968
+ >>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
1969
+ >>> result = tukey_hsd(group0, group1, group2)
1970
+ >>> conf = result.confidence_interval(confidence_level=.99)
1971
+ >>> for ((i, j), l) in np.ndenumerate(conf.low):
1972
+ ... # filter out self comparisons
1973
+ ... if i != j:
1974
+ ... h = conf.high[i,j]
1975
+ ... print(f"({i} - {j}) {l:>6.3f} {h:>6.3f}")
1976
+ (0 - 1) -9.480 0.280
1977
+ (0 - 2) -5.140 4.620
1978
+ (1 - 0) -0.280 9.480
1979
+ (1 - 2) -0.540 9.220
1980
+ (2 - 0) -4.620 5.140
1981
+ (2 - 1) -9.220 0.540
1982
+ """
1983
+ args = _tukey_hsd_iv(args)
1984
+ ntreatments = len(args)
1985
+ means = np.asarray([np.mean(arg) for arg in args])
1986
+ nsamples_treatments = np.asarray([a.size for a in args])
1987
+ nobs = np.sum(nsamples_treatments)
1988
+
1989
+ # determine mean square error [5]. Note that this is sometimes called
1990
+ # mean square error within.
1991
+ mse = (np.sum([np.var(arg, ddof=1) for arg in args] *
1992
+ (nsamples_treatments - 1)) / (nobs - ntreatments))
1993
+
1994
+ # The calculation of the standard error differs when treatments differ in
1995
+ # size. See ("Unequal sample sizes")[1].
1996
+ if np.unique(nsamples_treatments).size == 1:
1997
+ # all input groups are the same length, so only one value needs to be
1998
+ # calculated [1].
1999
+ normalize = 2 / nsamples_treatments[0]
2000
+ else:
2001
+ # to compare groups of differing sizes, we must compute a variance
2002
+ # value for each individual comparison. Use broadcasting to get the
2003
+ # resulting matrix. [3], verified against [4] (page 308).
2004
+ normalize = 1 / nsamples_treatments + 1 / nsamples_treatments[None].T
2005
+
2006
+ # the standard error is used in the computation of the tukey criterion and
2007
+ # finding the p-values.
2008
+ stand_err = np.sqrt(normalize * mse / 2)
2009
+
2010
+ # the mean difference is the test statistic.
2011
+ mean_differences = means[None].T - means
2012
+
2013
+ # Calculate the t-statistic to use within the survival function of the
2014
+ # studentized range to get the p-value.
2015
+ t_stat = np.abs(mean_differences) / stand_err
2016
+
2017
+ params = t_stat, ntreatments, nobs - ntreatments
2018
+ pvalues = distributions.studentized_range.sf(*params)
2019
+
2020
+ return TukeyHSDResult(mean_differences, pvalues, ntreatments,
2021
+ nobs, stand_err)
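
The p-value calculation above can be cross-checked against the public `scipy.stats.studentized_range` distribution. The following minimal, illustrative sketch uses the group data from the docstring example; with equal group sizes the standard error reduces to ``sqrt(mse / n)``, so ``pvalue = studentized_range.sf(|mean_i - mean_j| / stand_err, k, nobs - k)``:

>>> import numpy as np
>>> from scipy.stats import tukey_hsd, studentized_range
>>> group0 = [24.5, 23.5, 26.4, 27.1, 29.9]
>>> group1 = [28.4, 34.2, 29.5, 32.2, 30.1]
>>> group2 = [26.1, 28.3, 24.3, 26.2, 27.8]
>>> res = tukey_hsd(group0, group1, group2)
>>> k, nobs = 3, 15                       # number of treatments, total observations
>>> mse = np.mean([np.var(g, ddof=1) for g in (group0, group1, group2)])
>>> se = np.sqrt(mse / 5)                 # equal group sizes of 5
>>> t = np.abs(res.statistic) / se        # studentized mean differences
>>> bool(np.allclose(studentized_range.sf(t, k, nobs - k), res.pvalue))
True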
.venv/Lib/site-packages/scipy/stats/_kde.py ADDED
@@ -0,0 +1,728 @@
1
+ #-------------------------------------------------------------------------------
2
+ #
3
+ # Define classes for (uni/multi)-variate kernel density estimation.
4
+ #
5
+ # Currently, only Gaussian kernels are implemented.
6
+ #
7
+ # Written by: Robert Kern
8
+ #
9
+ # Date: 2004-08-09
10
+ #
11
+ # Modified: 2005-02-10 by Robert Kern.
12
+ # Contributed to SciPy
13
+ # 2005-10-07 by Robert Kern.
14
+ # Some fixes to match the new scipy_core
15
+ #
16
+ # Copyright 2004-2005 by Enthought, Inc.
17
+ #
18
+ #-------------------------------------------------------------------------------
19
+
20
+ # Standard library imports.
21
+ import warnings
22
+
23
+ # SciPy imports.
24
+ from scipy import linalg, special
25
+ from scipy._lib._util import check_random_state
26
+
27
+ from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
28
+ sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
29
+ ones, cov)
30
+ import numpy as np
31
+
32
+ # Local imports.
33
+ from . import _mvn
34
+ from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
35
+
36
+ # deprecated import to be removed in SciPy 1.13.0
37
+ from scipy.special import logsumexp # noqa: F401
38
+
39
+
40
+ __all__ = ['gaussian_kde']
41
+
42
+
43
+ class gaussian_kde:
44
+ """Representation of a kernel-density estimate using Gaussian kernels.
45
+
46
+ Kernel density estimation is a way to estimate the probability density
47
+ function (PDF) of a random variable in a non-parametric way.
48
+ `gaussian_kde` works for both uni-variate and multi-variate data. It
49
+ includes automatic bandwidth determination. The estimation works best for
50
+ a unimodal distribution; bimodal or multi-modal distributions tend to be
51
+ oversmoothed.
52
+
53
+ Parameters
54
+ ----------
55
+ dataset : array_like
56
+ Datapoints to estimate from. In case of univariate data this is a 1-D
57
+ array, otherwise a 2-D array with shape (# of dims, # of data).
58
+ bw_method : str, scalar or callable, optional
59
+ The method used to calculate the estimator bandwidth. This can be
60
+ 'scott', 'silverman', a scalar constant or a callable. If a scalar,
61
+ this will be used directly as `kde.factor`. If a callable, it should
62
+ take a `gaussian_kde` instance as only parameter and return a scalar.
63
+ If None (default), 'scott' is used. See Notes for more details.
64
+ weights : array_like, optional
65
+ weights of datapoints. This must be the same shape as dataset.
66
+ If None (default), the samples are assumed to be equally weighted
67
+
68
+ Attributes
69
+ ----------
70
+ dataset : ndarray
71
+ The dataset with which `gaussian_kde` was initialized.
72
+ d : int
73
+ Number of dimensions.
74
+ n : int
75
+ Number of datapoints.
76
+ neff : int
77
+ Effective number of datapoints.
78
+
79
+ .. versionadded:: 1.2.0
80
+ factor : float
81
+ The bandwidth factor, obtained from `kde.covariance_factor`. The square
82
+ of `kde.factor` multiplies the covariance matrix of the data in the kde
83
+ estimation.
84
+ covariance : ndarray
85
+ The covariance matrix of `dataset`, scaled by the calculated bandwidth
86
+ (`kde.factor`).
87
+ inv_cov : ndarray
88
+ The inverse of `covariance`.
89
+
90
+ Methods
91
+ -------
92
+ evaluate
93
+ __call__
94
+ integrate_gaussian
95
+ integrate_box_1d
96
+ integrate_box
97
+ integrate_kde
98
+ pdf
99
+ logpdf
100
+ resample
101
+ set_bandwidth
102
+ covariance_factor
103
+
104
+ Notes
105
+ -----
106
+ Bandwidth selection strongly influences the estimate obtained from the KDE
107
+ (much more so than the actual shape of the kernel). Bandwidth selection
108
+ can be done by a "rule of thumb", by cross-validation, by "plug-in
109
+ methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
110
+ uses a rule of thumb, the default is Scott's Rule.
111
+
112
+ Scott's Rule [1]_, implemented as `scotts_factor`, is::
113
+
114
+ n**(-1./(d+4)),
115
+
116
+ with ``n`` the number of data points and ``d`` the number of dimensions.
117
+ In the case of unequally weighted points, `scotts_factor` becomes::
118
+
119
+ neff**(-1./(d+4)),
120
+
121
+ with ``neff`` the effective number of datapoints.
122
+ Silverman's Rule [2]_, implemented as `silverman_factor`, is::
123
+
124
+ (n * (d + 2) / 4.)**(-1. / (d + 4)).
125
+
126
+ or in the case of unequally weighted points::
127
+
128
+ (neff * (d + 2) / 4.)**(-1. / (d + 4)).
129
+
130
+ Good general descriptions of kernel density estimation can be found in [1]_
131
+ and [2]_, the mathematics for this multi-dimensional implementation can be
132
+ found in [1]_.
133
+
134
+ With a set of weighted samples, the effective number of datapoints ``neff``
135
+ is defined by::
136
+
137
+ neff = sum(weights)^2 / sum(weights^2)
138
+
139
+ as detailed in [5]_.
140
+
141
+ `gaussian_kde` does not currently support data that lies in a
142
+ lower-dimensional subspace of the space in which it is expressed. For such
143
+ data, consider performing principle component analysis / dimensionality
144
+ reduction and using `gaussian_kde` with the transformed data.
145
+
146
+ References
147
+ ----------
148
+ .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
149
+ Visualization", John Wiley & Sons, New York, Chicester, 1992.
150
+ .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
151
+ Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
152
+ Chapman and Hall, London, 1986.
153
+ .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
154
+ Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
155
+ .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
156
+ conditional density estimation", Computational Statistics & Data
157
+ Analysis, Vol. 36, pp. 279-298, 2001.
158
+ .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
159
+ Series A (General), 132, 272
160
+
161
+ Examples
162
+ --------
163
+ Generate some random two-dimensional data:
164
+
165
+ >>> import numpy as np
166
+ >>> from scipy import stats
167
+ >>> def measure(n):
168
+ ... "Measurement model, return two coupled measurements."
169
+ ... m1 = np.random.normal(size=n)
170
+ ... m2 = np.random.normal(scale=0.5, size=n)
171
+ ... return m1+m2, m1-m2
172
+
173
+ >>> m1, m2 = measure(2000)
174
+ >>> xmin = m1.min()
175
+ >>> xmax = m1.max()
176
+ >>> ymin = m2.min()
177
+ >>> ymax = m2.max()
178
+
179
+ Perform a kernel density estimate on the data:
180
+
181
+ >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
182
+ >>> positions = np.vstack([X.ravel(), Y.ravel()])
183
+ >>> values = np.vstack([m1, m2])
184
+ >>> kernel = stats.gaussian_kde(values)
185
+ >>> Z = np.reshape(kernel(positions).T, X.shape)
186
+
187
+ Plot the results:
188
+
189
+ >>> import matplotlib.pyplot as plt
190
+ >>> fig, ax = plt.subplots()
191
+ >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
192
+ ... extent=[xmin, xmax, ymin, ymax])
193
+ >>> ax.plot(m1, m2, 'k.', markersize=2)
194
+ >>> ax.set_xlim([xmin, xmax])
195
+ >>> ax.set_ylim([ymin, ymax])
196
+ >>> plt.show()
197
+
198
+ """
199
+ def __init__(self, dataset, bw_method=None, weights=None):
200
+ self.dataset = atleast_2d(asarray(dataset))
201
+ if not self.dataset.size > 1:
202
+ raise ValueError("`dataset` input should have multiple elements.")
203
+
204
+ self.d, self.n = self.dataset.shape
205
+
206
+ if weights is not None:
207
+ self._weights = atleast_1d(weights).astype(float)
208
+ self._weights /= sum(self._weights)
209
+ if self.weights.ndim != 1:
210
+ raise ValueError("`weights` input should be one-dimensional.")
211
+ if len(self._weights) != self.n:
212
+ raise ValueError("`weights` input should be of length n")
213
+ self._neff = 1/sum(self._weights**2)
214
+
215
+ # This can be converted to a warning once gh-10205 is resolved
216
+ if self.d > self.n:
217
+ msg = ("Number of dimensions is greater than number of samples. "
218
+ "This results in a singular data covariance matrix, which "
219
+ "cannot be treated using the algorithms implemented in "
220
+ "`gaussian_kde`. Note that `gaussian_kde` interprets each "
221
+ "*column* of `dataset` to be a point; consider transposing "
222
+ "the input to `dataset`.")
223
+ raise ValueError(msg)
224
+
225
+ try:
226
+ self.set_bandwidth(bw_method=bw_method)
227
+ except linalg.LinAlgError as e:
228
+ msg = ("The data appears to lie in a lower-dimensional subspace "
229
+ "of the space in which it is expressed. This has resulted "
230
+ "in a singular data covariance matrix, which cannot be "
231
+ "treated using the algorithms implemented in "
232
+ "`gaussian_kde`. Consider performing principle component "
233
+ "analysis / dimensionality reduction and using "
234
+ "`gaussian_kde` with the transformed data.")
235
+ raise linalg.LinAlgError(msg) from e
236
+
237
+ def evaluate(self, points):
238
+ """Evaluate the estimated pdf on a set of points.
239
+
240
+ Parameters
241
+ ----------
242
+ points : (# of dimensions, # of points)-array
243
+ Alternatively, a (# of dimensions,) vector can be passed in and
244
+ treated as a single point.
245
+
246
+ Returns
247
+ -------
248
+ values : (# of points,)-array
249
+ The values at each point.
250
+
251
+ Raises
252
+ ------
253
+ ValueError : if the dimensionality of the input points is different than
254
+ the dimensionality of the KDE.
255
+
256
+ """
257
+ points = atleast_2d(asarray(points))
258
+
259
+ d, m = points.shape
260
+ if d != self.d:
261
+ if d == 1 and m == self.d:
262
+ # points was passed in as a row vector
263
+ points = reshape(points, (self.d, 1))
264
+ m = 1
265
+ else:
266
+ msg = (f"points have dimension {d}, "
267
+ f"dataset has dimension {self.d}")
268
+ raise ValueError(msg)
269
+
270
+ output_dtype, spec = _get_output_dtype(self.covariance, points)
271
+ result = gaussian_kernel_estimate[spec](
272
+ self.dataset.T, self.weights[:, None],
273
+ points.T, self.cho_cov, output_dtype)
274
+
275
+ return result[:, 0]
276
+
277
+ __call__ = evaluate
278
+
279
+ def integrate_gaussian(self, mean, cov):
280
+ """
281
+ Multiply estimated density by a multivariate Gaussian and integrate
282
+ over the whole space.
283
+
284
+ Parameters
285
+ ----------
286
+ mean : array_like
287
+ A 1-D array, specifying the mean of the Gaussian.
288
+ cov : array_like
289
+ A 2-D array, specifying the covariance matrix of the Gaussian.
290
+
291
+ Returns
292
+ -------
293
+ result : scalar
294
+ The value of the integral.
295
+
296
+ Raises
297
+ ------
298
+ ValueError
299
+ If the mean or covariance of the input Gaussian differs from
300
+ the KDE's dimensionality.
301
+
302
+ """
303
+ mean = atleast_1d(squeeze(mean))
304
+ cov = atleast_2d(cov)
305
+
306
+ if mean.shape != (self.d,):
307
+ raise ValueError("mean does not have dimension %s" % self.d)
308
+ if cov.shape != (self.d, self.d):
309
+ raise ValueError("covariance does not have dimension %s" % self.d)
310
+
311
+ # make mean a column vector
312
+ mean = mean[:, newaxis]
313
+
314
+ sum_cov = self.covariance + cov
315
+
316
+ # This will raise LinAlgError if the new cov matrix is not s.p.d
317
+ # cho_factor returns (ndarray, bool) where bool is a flag for whether
318
+ # or not ndarray is upper or lower triangular
319
+ sum_cov_chol = linalg.cho_factor(sum_cov)
320
+
321
+ diff = self.dataset - mean
322
+ tdiff = linalg.cho_solve(sum_cov_chol, diff)
323
+
324
+ sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
325
+ norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
326
+
327
+ energies = sum(diff * tdiff, axis=0) / 2.0
328
+ result = sum(exp(-energies)*self.weights, axis=0) / norm_const
329
+
330
+ return result
331
+
332
+ def integrate_box_1d(self, low, high):
333
+ """
334
+ Computes the integral of a 1D pdf between two bounds.
335
+
336
+ Parameters
337
+ ----------
338
+ low : scalar
339
+ Lower bound of integration.
340
+ high : scalar
341
+ Upper bound of integration.
342
+
343
+ Returns
344
+ -------
345
+ value : scalar
346
+ The result of the integral.
347
+
348
+ Raises
349
+ ------
350
+ ValueError
351
+ If the KDE is over more than one dimension.
352
+
353
+ """
354
+ if self.d != 1:
355
+ raise ValueError("integrate_box_1d() only handles 1D pdfs")
356
+
357
+ stdev = ravel(sqrt(self.covariance))[0]
358
+
359
+ normalized_low = ravel((low - self.dataset) / stdev)
360
+ normalized_high = ravel((high - self.dataset) / stdev)
361
+
362
+ value = np.sum(self.weights*(
363
+ special.ndtr(normalized_high) -
364
+ special.ndtr(normalized_low)))
365
+ return value
366
+
367
+ def integrate_box(self, low_bounds, high_bounds, maxpts=None):
368
+ """Computes the integral of a pdf over a rectangular interval.
369
+
370
+ Parameters
371
+ ----------
372
+ low_bounds : array_like
373
+ A 1-D array containing the lower bounds of integration.
374
+ high_bounds : array_like
375
+ A 1-D array containing the upper bounds of integration.
376
+ maxpts : int, optional
377
+ The maximum number of points to use for integration.
378
+
379
+ Returns
380
+ -------
381
+ value : scalar
382
+ The result of the integral.
383
+
384
+ """
385
+ if maxpts is not None:
386
+ extra_kwds = {'maxpts': maxpts}
387
+ else:
388
+ extra_kwds = {}
389
+
390
+ value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
391
+ self.dataset, self.weights,
392
+ self.covariance, **extra_kwds)
393
+ if inform:
394
+ msg = ('An integral in _mvn.mvnun requires more points than %s' %
395
+ (self.d * 1000))
396
+ warnings.warn(msg, stacklevel=2)
397
+
398
+ return value
399
+
400
+ def integrate_kde(self, other):
401
+ """
402
+ Computes the integral of the product of this kernel density estimate
403
+ with another.
404
+
405
+ Parameters
406
+ ----------
407
+ other : gaussian_kde instance
408
+ The other kde.
409
+
410
+ Returns
411
+ -------
412
+ value : scalar
413
+ The result of the integral.
414
+
415
+ Raises
416
+ ------
417
+ ValueError
418
+ If the KDEs have different dimensionality.
419
+
420
+ """
421
+ if other.d != self.d:
422
+ raise ValueError("KDEs are not the same dimensionality")
423
+
424
+ # we want to iterate over the smallest number of points
425
+ if other.n < self.n:
426
+ small = other
427
+ large = self
428
+ else:
429
+ small = self
430
+ large = other
431
+
432
+ sum_cov = small.covariance + large.covariance
433
+ sum_cov_chol = linalg.cho_factor(sum_cov)
434
+ result = 0.0
435
+ for i in range(small.n):
436
+ mean = small.dataset[:, i, newaxis]
437
+ diff = large.dataset - mean
438
+ tdiff = linalg.cho_solve(sum_cov_chol, diff)
439
+
440
+ energies = sum(diff * tdiff, axis=0) / 2.0
441
+ result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
442
+
443
+ sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
444
+ norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
445
+
446
+ result /= norm_const
447
+
448
+ return result
449
+
450
+ def resample(self, size=None, seed=None):
451
+ """Randomly sample a dataset from the estimated pdf.
452
+
453
+ Parameters
454
+ ----------
455
+ size : int, optional
456
+ The number of samples to draw. If not provided, then the size is
457
+ the same as the effective number of samples in the underlying
458
+ dataset.
459
+ seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
460
+ If `seed` is None (or `np.random`), the `numpy.random.RandomState`
461
+ singleton is used.
462
+ If `seed` is an int, a new ``RandomState`` instance is used,
463
+ seeded with `seed`.
464
+ If `seed` is already a ``Generator`` or ``RandomState`` instance then
465
+ that instance is used.
466
+
467
+ Returns
468
+ -------
469
+ resample : (self.d, `size`) ndarray
470
+ The sampled dataset.
471
+
472
+ """ # numpy/numpydoc#87 # noqa: E501
473
+ if size is None:
474
+ size = int(self.neff)
475
+
476
+ random_state = check_random_state(seed)
477
+ norm = transpose(random_state.multivariate_normal(
478
+ zeros((self.d,), float), self.covariance, size=size
479
+ ))
480
+ indices = random_state.choice(self.n, size=size, p=self.weights)
481
+ means = self.dataset[:, indices]
482
+
483
+ return means + norm
484
+
485
+ def scotts_factor(self):
486
+ """Compute Scott's factor.
487
+
488
+ Returns
489
+ -------
490
+ s : float
491
+ Scott's factor.
492
+ """
493
+ return power(self.neff, -1./(self.d+4))
494
+
495
+ def silverman_factor(self):
496
+ """Compute the Silverman factor.
497
+
498
+ Returns
499
+ -------
500
+ s : float
501
+ The silverman factor.
502
+ """
503
+ return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
504
+
505
+ # Default method to calculate bandwidth, can be overwritten by subclass
506
+ covariance_factor = scotts_factor
507
+ covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
508
+ multiplies the data covariance matrix to obtain the kernel covariance
509
+ matrix. The default is `scotts_factor`. A subclass can overwrite this
510
+ method to provide a different method, or set it through a call to
511
+ `kde.set_bandwidth`."""
512
+
513
+ def set_bandwidth(self, bw_method=None):
514
+ """Compute the estimator bandwidth with given method.
515
+
516
+ The new bandwidth calculated after a call to `set_bandwidth` is used
517
+ for subsequent evaluations of the estimated density.
518
+
519
+ Parameters
520
+ ----------
521
+ bw_method : str, scalar or callable, optional
522
+ The method used to calculate the estimator bandwidth. This can be
523
+ 'scott', 'silverman', a scalar constant or a callable. If a
524
+ scalar, this will be used directly as `kde.factor`. If a callable,
525
+ it should take a `gaussian_kde` instance as only parameter and
526
+ return a scalar. If None (default), nothing happens; the current
527
+ `kde.covariance_factor` method is kept.
528
+
529
+ Notes
530
+ -----
531
+ .. versionadded:: 0.11
532
+
533
+ Examples
534
+ --------
535
+ >>> import numpy as np
536
+ >>> import scipy.stats as stats
537
+ >>> x1 = np.array([-7, -5, 1, 4, 5.])
538
+ >>> kde = stats.gaussian_kde(x1)
539
+ >>> xs = np.linspace(-10, 10, num=50)
540
+ >>> y1 = kde(xs)
541
+ >>> kde.set_bandwidth(bw_method='silverman')
542
+ >>> y2 = kde(xs)
543
+ >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
544
+ >>> y3 = kde(xs)
545
+
546
+ >>> import matplotlib.pyplot as plt
547
+ >>> fig, ax = plt.subplots()
548
+ >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
549
+ ... label='Data points (rescaled)')
550
+ >>> ax.plot(xs, y1, label='Scott (default)')
551
+ >>> ax.plot(xs, y2, label='Silverman')
552
+ >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
553
+ >>> ax.legend()
554
+ >>> plt.show()
555
+
556
+ """
557
+ if bw_method is None:
558
+ pass
559
+ elif bw_method == 'scott':
560
+ self.covariance_factor = self.scotts_factor
561
+ elif bw_method == 'silverman':
562
+ self.covariance_factor = self.silverman_factor
563
+ elif np.isscalar(bw_method) and not isinstance(bw_method, str):
564
+ self._bw_method = 'use constant'
565
+ self.covariance_factor = lambda: bw_method
566
+ elif callable(bw_method):
567
+ self._bw_method = bw_method
568
+ self.covariance_factor = lambda: self._bw_method(self)
569
+ else:
570
+ msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
571
+ "or a callable."
572
+ raise ValueError(msg)
573
+
574
+ self._compute_covariance()
575
+
576
+ def _compute_covariance(self):
577
+ """Computes the covariance matrix for each Gaussian kernel using
578
+ covariance_factor().
579
+ """
580
+ self.factor = self.covariance_factor()
581
+ # Cache covariance and Cholesky decomp of covariance
582
+ if not hasattr(self, '_data_cho_cov'):
583
+ self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
584
+ bias=False,
585
+ aweights=self.weights))
586
+ self._data_cho_cov = linalg.cholesky(self._data_covariance,
587
+ lower=True)
588
+
589
+ self.covariance = self._data_covariance * self.factor**2
590
+ self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
591
+ self.log_det = 2*np.log(np.diag(self.cho_cov
592
+ * np.sqrt(2*pi))).sum()
593
+
594
+ @property
595
+ def inv_cov(self):
596
+ # Re-compute from scratch each time because I'm not sure how this is
597
+ # used in the wild. (Perhaps users change the `dataset`, since it's
598
+ # not a private attribute?) `_compute_covariance` used to recalculate
599
+ # all these, so we'll recalculate everything now that this is a
600
+ # a property.
601
+ self.factor = self.covariance_factor()
602
+ self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
603
+ bias=False, aweights=self.weights))
604
+ return linalg.inv(self._data_covariance) / self.factor**2
605
+
606
+ def pdf(self, x):
607
+ """
608
+ Evaluate the estimated pdf on a provided set of points.
609
+
610
+ Notes
611
+ -----
612
+ This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
613
+ docstring for more details.
614
+
615
+ """
616
+ return self.evaluate(x)
617
+
618
+ def logpdf(self, x):
619
+ """
620
+ Evaluate the log of the estimated pdf on a provided set of points.
621
+ """
622
+ points = atleast_2d(x)
623
+
624
+ d, m = points.shape
625
+ if d != self.d:
626
+ if d == 1 and m == self.d:
627
+ # points was passed in as a row vector
628
+ points = reshape(points, (self.d, 1))
629
+ m = 1
630
+ else:
631
+ msg = (f"points have dimension {d}, "
632
+ f"dataset has dimension {self.d}")
633
+ raise ValueError(msg)
634
+
635
+ output_dtype, spec = _get_output_dtype(self.covariance, points)
636
+ result = gaussian_kernel_estimate_log[spec](
637
+ self.dataset.T, self.weights[:, None],
638
+ points.T, self.cho_cov, output_dtype)
639
+
640
+ return result[:, 0]
641
+
642
+ def marginal(self, dimensions):
643
+ """Return a marginal KDE distribution
644
+
645
+ Parameters
646
+ ----------
647
+ dimensions : int or 1-d array_like
648
+ The dimensions of the multivariate distribution corresponding
649
+ with the marginal variables, that is, the indices of the dimensions
650
+ that are being retained. The other dimensions are marginalized out.
651
+
652
+ Returns
653
+ -------
654
+ marginal_kde : gaussian_kde
655
+ An object representing the marginal distribution.
656
+
657
+ Notes
658
+ -----
659
+ .. versionadded:: 1.10.0
660
+
661
+ """
662
+
663
+ dims = np.atleast_1d(dimensions)
664
+
665
+ if not np.issubdtype(dims.dtype, np.integer):
666
+ msg = ("Elements of `dimensions` must be integers - the indices "
667
+ "of the marginal variables being retained.")
668
+ raise ValueError(msg)
669
+
670
+ n = len(self.dataset) # number of dimensions
671
+ original_dims = dims.copy()
672
+
673
+ dims[dims < 0] = n + dims[dims < 0]
674
+
675
+ if len(np.unique(dims)) != len(dims):
676
+ msg = ("All elements of `dimensions` must be unique.")
677
+ raise ValueError(msg)
678
+
679
+ i_invalid = (dims < 0) | (dims >= n)
680
+ if np.any(i_invalid):
681
+ msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
682
+ f"for a distribution in {n} dimensions.")
683
+ raise ValueError(msg)
684
+
685
+ dataset = self.dataset[dims]
686
+ weights = self.weights
687
+
688
+ return gaussian_kde(dataset, bw_method=self.covariance_factor(),
689
+ weights=weights)
690
+
691
+ @property
692
+ def weights(self):
693
+ try:
694
+ return self._weights
695
+ except AttributeError:
696
+ self._weights = ones(self.n)/self.n
697
+ return self._weights
698
+
699
+ @property
700
+ def neff(self):
701
+ try:
702
+ return self._neff
703
+ except AttributeError:
704
+ self._neff = 1/sum(self.weights**2)
705
+ return self._neff
706
+
707
+
708
+ def _get_output_dtype(covariance, points):
709
+ """
710
+ Calculates the output dtype and the "spec" (=C type name).
711
+
712
+ This was necessary in order to deal with the fused types in the Cython
713
+ routine `gaussian_kernel_estimate`. See gh-10824 for details.
714
+ """
715
+ output_dtype = np.common_type(covariance, points)
716
+ itemsize = np.dtype(output_dtype).itemsize
717
+ if itemsize == 4:
718
+ spec = 'float'
719
+ elif itemsize == 8:
720
+ spec = 'double'
721
+ elif itemsize in (12, 16):
722
+ spec = 'long double'
723
+ else:
724
+ raise ValueError(
725
+ f"{output_dtype} has unexpected item size: {itemsize}"
726
+ )
727
+
728
+ return output_dtype, spec
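
The bandwidth rules described in the Notes are straightforward to verify on unweighted data, where ``neff`` equals ``n``. A minimal sketch for a 1-D sample (``d = 1``, ``n = 100``), checking Scott's rule ``n**(-1/(d+4))`` and Silverman's rule ``(n*(d+2)/4)**(-1/(d+4))``:

>>> import numpy as np
>>> from scipy.stats import gaussian_kde
>>> rng = np.random.default_rng(1234)
>>> x = rng.normal(size=100)              # d = 1, n = 100, unweighted
>>> kde = gaussian_kde(x)                 # default bandwidth: Scott's rule
>>> bool(np.isclose(kde.factor, 100 ** (-1. / 5)))
True
>>> kde.set_bandwidth('silverman')
>>> bool(np.isclose(kde.factor, (100 * 3 / 4.) ** (-1. / 5)))
True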
.venv/Lib/site-packages/scipy/stats/_ksstats.py ADDED
@@ -0,0 +1,600 @@
1
+ # Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
2
+ # D_n = sup_x{|F_n(x) - F(x)|},
3
+ # F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
4
+ # F(x) is the CDF of a probability distribution.
5
+ #
6
+ # Exact methods:
7
+ # Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
8
+ # or a recursion algorithm due to Pomeranz[2].
9
+ # Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
10
+ # the Durbin algorithm.
11
+ # D_n >= d <==> D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
12
+ # Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
13
+ # For d > 0.5, the latter intersection probability is 0.
14
+ #
15
+ # Approximate methods:
16
+ # For d close to 0.5, ignoring that intersection term may still give a
17
+ # reasonable approximation.
18
+ # Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
19
+ # Kolmogorov's initial asymptotic, suitable for large d. (See
20
+ # scipy.special.kolmogorov for that asymptotic)
21
+ # Pelz-Good[6] used the functional equation for Jacobi theta functions to
22
+ # transform the Li-Chien/Korolyuk formula to produce a computational formula
23
+ # suitable for small d.
24
+ #
25
+ # Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
26
+ # the above approaches; that scheme is the one used here.
27
+ #
28
+ # Other approaches:
29
+ # Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
30
+ # Moscovich and Nadler[9] use FFTs to compute the convolutions.
31
+
32
+ # References:
33
+ # [1] Durbin J (1968).
34
+ # "The Probability that the Sample Distribution Function Lies Between Two
35
+ # Parallel Straight Lines."
36
+ # Annals of Mathematical Statistics, 39, 398-411.
37
+ # [2] Pomeranz J (1974).
38
+ # "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
39
+ # Small Samples (Algorithm 487)."
40
+ # Communications of the ACM, 17(12), 703-704.
41
+ # [3] Marsaglia G, Tsang WW, Wang J (2003).
42
+ # "Evaluating Kolmogorov's Distribution."
43
+ # Journal of Statistical Software, 8(18), 1-4.
44
+ # [4] LI-CHIEN, C. (1956).
45
+ # "On the exact distribution of the statistics of A. N. Kolmogorov and
46
+ # their asymptotic expansion."
47
+ # Acta Matematica Sinica, 6, 55-81.
48
+ # [5] KOROLYUK, V. S. (1960).
49
+ # "Asymptotic analysis of the distribution of the maximum deviation in
50
+ # the Bernoulli scheme."
51
+ # Theor. Probability Appl., 4, 339-366.
52
+ # [6] Pelz W, Good IJ (1976).
53
+ # "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
54
+ # Statistic."
55
+ # Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
56
+ # [7] Simard, R., L'Ecuyer, P. (2011)
57
+ # "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
58
+ # Journal of Statistical Software, Vol 39, 11, 1-18.
59
+ # [8] Carvalho, Luis (2015)
60
+ # "An Improved Evaluation of Kolmogorov's Distribution"
61
+ # Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
62
+ # [9] Amit Moscovich, Boaz Nadler (2017)
63
+ # "Fast calculation of boundary crossing probabilities for Poisson
64
+ # processes",
65
+ # Statistics & Probability Letters, Vol 123, 177-182.
66
+
67
+
68
+ import numpy as np
69
+ import scipy.special
70
+ import scipy.special._ufuncs as scu
71
+ from scipy._lib._finite_differences import _derivative
72
+
73
+ _E128 = 128
74
+ _EP128 = np.ldexp(np.longdouble(1), _E128)
75
+ _EM128 = np.ldexp(np.longdouble(1), -_E128)
76
+
77
+ _SQRT2PI = np.sqrt(2 * np.pi)
78
+ _LOG_2PI = np.log(2 * np.pi)
79
+ _MIN_LOG = -708
80
+ _SQRT3 = np.sqrt(3)
81
+ _PI_SQUARED = np.pi ** 2
82
+ _PI_FOUR = np.pi ** 4
83
+ _PI_SIX = np.pi ** 6
84
+
85
+ # [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
86
+ # then Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...1.
87
+ _STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
88
+ -1.9175269175269175269e-3, 8.4175084175084175084e-4,
89
+ -5.952380952380952381e-4, 7.9365079365079365079e-4,
90
+ -2.7777777777777777778e-3, 8.3333333333333333333e-2]
91
+
92
+
93
+ def _log_nfactorial_div_n_pow_n(n):
94
+ # Computes n! / n**n
95
+ # = (n-1)! / n**(n-1)
96
+ # Uses Stirling's approximation, but removes n*log(n) up-front to
97
+ # avoid subtractive cancellation.
98
+ # = log(n)/2 - n + log(sqrt(2pi)) + sum B_{2j}/(2j)/(2j-1)/n**(2j-1)
99
+ rn = 1.0/n
100
+ return np.log(n)/2 - n + _LOG_2PI/2 + rn * np.polyval(_STIRLING_COEFFS, rn/n)
101
+
102
+
103
+ def _clip_prob(p):
104
+ """clips a probability to range 0<=p<=1."""
105
+ return np.clip(p, 0.0, 1.0)
106
+
107
+
108
+ def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
109
+ """Selects either the CDF or SF, and then clips to range 0<=p<=1."""
110
+ p = np.where(cdf, cdfprob, sfprob)
111
+ return _clip_prob(p)
112
+
113
+
114
+ def _kolmogn_DMTW(n, d, cdf=True):
115
+ r"""Computes the Kolmogorov CDF: Pr(D_n <= d) using the MTW approach to
116
+ the Durbin matrix algorithm.
117
+
118
+ Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].
119
+ """
120
+ # Write d = (k-h)/n, where k is positive integer and 0 <= h < 1
121
+ # Generate initial matrix H of size m*m where m=(2k-1)
122
+ # Compute k-th row of (n!/n^n) * H^n, scaling intermediate results.
123
+ # Requires memory O(m^2) and computation O(m^2 log(n)).
124
+ # Most suitable for small m.
125
+
126
+ if d >= 1.0:
127
+ return _select_and_clip_prob(1.0, 0.0, cdf)
128
+ nd = n * d
129
+ if nd <= 0.5:
130
+ return _select_and_clip_prob(0.0, 1.0, cdf)
131
+ k = int(np.ceil(nd))
132
+ h = k - nd
133
+ m = 2 * k - 1
134
+
135
+ H = np.zeros([m, m])
136
+
137
+ # Initialize: v is first column (and last row) of H
138
+ # v[j] = (1-h^(j+1))/(j+1)! (except for v[-1])
139
+ # w[j] = 1/(j)!
140
+ # q = k-th row of H (actually i!/n^i*H^i)
141
+ intm = np.arange(1, m + 1)
142
+ v = 1.0 - h ** intm
143
+ w = np.empty(m)
144
+ fac = 1.0
145
+ for j in intm:
146
+ w[j - 1] = fac
147
+ fac /= j # This might underflow. Isn't a problem.
148
+ v[j - 1] *= fac
149
+ tt = max(2 * h - 1.0, 0)**m - 2*h**m
150
+ v[-1] = (1.0 + tt) * fac
151
+
152
+ for i in range(1, m):
153
+ H[i - 1:, i] = w[:m - i + 1]
154
+ H[:, 0] = v
155
+ H[-1, :] = np.flip(v, axis=0)
156
+
157
+ Hpwr = np.eye(np.shape(H)[0]) # Holds intermediate powers of H
158
+ nn = n
159
+ expnt = 0 # Scaling of Hpwr
160
+ Hexpnt = 0 # Scaling of H
161
+ while nn > 0:
162
+ if nn % 2:
163
+ Hpwr = np.matmul(Hpwr, H)
164
+ expnt += Hexpnt
165
+ H = np.matmul(H, H)
166
+ Hexpnt *= 2
167
+ # Scale as needed.
168
+ if np.abs(H[k - 1, k - 1]) > _EP128:
169
+ H /= _EP128
170
+ Hexpnt += _E128
171
+ nn = nn // 2
172
+
173
+ p = Hpwr[k - 1, k - 1]
174
+
175
+ # Multiply by n!/n^n
176
+ for i in range(1, n + 1):
177
+ p = i * p / n
178
+ if np.abs(p) < _EM128:
179
+ p *= _EP128
180
+ expnt -= _E128
181
+
182
+ # unscale
183
+ if expnt != 0:
184
+ p = np.ldexp(p, expnt)
185
+
186
+ return _select_and_clip_prob(p, 1.0-p, cdf)
187
+
188
+
189
+ def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
190
+ """Compute the endpoints of the interval for row i."""
191
+ if i == 0:
192
+ j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
193
+ else:
194
+ # i + 1 = 2*ip1div2 + ip1mod2
195
+ ip1div2, ip1mod2 = divmod(i + 1, 2)
196
+ if ip1mod2 == 0: # i is odd
197
+ if ip1div2 == n + 1:
198
+ j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
199
+ else:
200
+ j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
201
+ else:
202
+ j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1
203
+
204
+ return max(j1 + 2, 0), min(j2, n)
205
+
206
+
207
+ def _kolmogn_Pomeranz(n, x, cdf=True):
208
+ r"""Computes Pr(D_n <= d) using the Pomeranz recursion algorithm.
209
+
210
+ Pomeranz (1974) [2]
211
+ """
212
+
213
+ # V is n*(2n+2) matrix.
214
+ # Each row is convolution of the previous row and probabilities from a
215
+ # Poisson distribution.
216
+ # Desired CDF probability is n! V[n-1, 2n+1] (final entry in final row).
217
+ # Only two rows are needed at any given stage:
218
+ # - Call them V0 and V1.
219
+ # - Swap each iteration
220
+ # Only a few (contiguous) entries in each row can be non-zero.
221
+ # - Keep track of start and end (j1 and j2 below)
222
+ # - V0s and V1s track the start in the two rows
223
+ # Scale intermediate results as needed.
224
+ # Only a few different Poisson distributions can occur
225
+ t = n * x
226
+ ll = int(np.floor(t))
227
+ f = 1.0 * (t - ll) # fractional part of t
228
+ g = min(f, 1.0 - f)
229
+ ceilf = (1 if f > 0 else 0)
230
+ roundf = (1 if f > 0.5 else 0)
231
+ npwrs = 2 * (ll + 1) # Maximum number of powers needed in convolutions
232
+ gpower = np.empty(npwrs) # gpower = (g/n)^m/m!
233
+ twogpower = np.empty(npwrs) # twogpower = (2g/n)^m/m!
234
+ onem2gpower = np.empty(npwrs) # onem2gpower = ((1-2g)/n)^m/m!
235
+ # gpower etc are *almost* Poisson probs, just missing normalizing factor.
236
+
237
+ gpower[0] = 1.0
238
+ twogpower[0] = 1.0
239
+ onem2gpower[0] = 1.0
240
+ expnt = 0
241
+ g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
242
+ for m in range(1, npwrs):
243
+ gpower[m] = gpower[m - 1] * g_over_n / m
244
+ twogpower[m] = twogpower[m - 1] * two_g_over_n / m
245
+ onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m
246
+
247
+ V0 = np.zeros([npwrs])
248
+ V1 = np.zeros([npwrs])
249
+ V1[0] = 1 # first row
250
+ V0s, V1s = 0, 0 # start indices of the two rows
251
+
252
+ j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
253
+ for i in range(1, 2 * n + 2):
254
+ # Preserve j1, V1, V1s, V0s from last iteration
255
+ k1 = j1
256
+ V0, V1 = V1, V0
257
+ V0s, V1s = V1s, V0s
258
+ V1.fill(0.0)
259
+ j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
260
+ if i == 1 or i == 2 * n + 1:
261
+ pwrs = gpower
262
+ else:
263
+ pwrs = (twogpower if i % 2 else onem2gpower)
264
+ ln2 = j2 - k1 + 1
265
+ if ln2 > 0:
266
+ conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
267
+ conv_start = j1 - k1 # First index to use from conv
268
+ conv_len = j2 - j1 + 1 # Number of entries to use from conv
269
+ V1[:conv_len] = conv[conv_start:conv_start + conv_len]
270
+ # Scale to avoid underflow.
271
+ if 0 < np.max(V1) < _EM128:
272
+ V1 *= _EP128
273
+ expnt -= _E128
274
+ V1s = V0s + j1 - k1
275
+
276
+ # multiply by n!
277
+ ans = V1[n - V1s]
278
+ for m in range(1, n + 1):
279
+ if np.abs(ans) > _EP128:
280
+ ans *= _EM128
281
+ expnt += _E128
282
+ ans *= m
283
+
284
+ # Undo any intermediate scaling
285
+ if expnt != 0:
286
+ ans = np.ldexp(ans, expnt)
287
+ ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
288
+ return ans
289
+
290
+
291
+ def _kolmogn_PelzGood(n, x, cdf=True):
292
+ """Computes the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.
293
+
294
+ Start with Li-Chien, Korolyuk approximation:
295
+ Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
296
+ where z = x*sqrt(n).
297
+ Transform each K_(z) using Jacobi theta functions into a form suitable
298
+ for small z.
299
+ Pelz-Good (1976). [6]
300
+ """
301
+ if x <= 0.0:
302
+ return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
303
+ if x >= 1.0:
304
+ return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
305
+
306
+ z = np.sqrt(n) * x
307
+ zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6
308
+
309
+ qlog = -_PI_SQUARED / 8 / zsquared
310
+ if qlog < _MIN_LOG: # z ~ 0.041743441416853426
311
+ return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
312
+
313
+ q = np.exp(qlog)
314
+
315
+ # Coefficients of terms in the sums for K1, K2 and K3
316
+ k1a = -zsquared
317
+ k1b = _PI_SQUARED / 4
318
+
319
+ k2a = 6 * zsix + 2 * zfour
320
+ k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
321
+ k2c = _PI_FOUR * (1 - 2 * zsquared) / 16
322
+
323
+ k3d = _PI_SIX * (5 - 30 * zsquared) / 64
324
+ k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
325
+ k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
326
+ k3a = -30 * zsix - 90 * z**8
327
+
328
+ K0to3 = np.zeros(4)
329
+ # Use a Horner scheme to evaluate sum c_i q^(i^2)
330
+ # Reduces to a sum over odd integers.
331
+ maxk = int(np.ceil(16 * z / np.pi))
332
+ for k in range(maxk, 0, -1):
333
+ m = 2 * k - 1
334
+ msquared, mfour, msix = m**2, m**4, m**6
335
+ qpower = np.power(q, 8 * k)
336
+ coeffs = np.array([1.0,
337
+ k1a + k1b*msquared,
338
+ k2a + k2b*msquared + k2c*mfour,
339
+ k3a + k3b*msquared + k3c*mfour + k3d*msix])
340
+ K0to3 *= qpower
341
+ K0to3 += coeffs
342
+ K0to3 *= q
343
+ K0to3 *= _SQRT2PI
344
+ # z**10 > 0 as z > 0.04
345
+ K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])
346
+
347
+ # Now do the other sum over the other terms, all integers k
348
+ # K_2: (pi^2 k^2) q^(k^2),
349
+ # K_3: (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
350
+ # Don't expect much subtractive cancellation so use direct calculation
351
+ q = np.exp(-_PI_SQUARED / 2 / zsquared)
352
+ ks = np.arange(maxk, 0, -1)
353
+ ksquared = ks ** 2
354
+ sqrt3z = _SQRT3 * z
355
+ kspi = np.pi * ks
356
+ qpwers = q ** ksquared
357
+ k2extra = np.sum(ksquared * qpwers)
358
+ k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
359
+ K0to3[2] += k2extra
360
+ k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
361
+ k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
362
+ K0to3[3] += k3extra
363
+ powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)
364
+ K0to3 /= powers_of_n
365
+
366
+ if not cdf:
367
+ K0to3 *= -1
368
+ K0to3[0] += 1
369
+
370
+ Ksum = sum(K0to3)
371
+ return Ksum
372
+
373
+
374
+ def _kolmogn(n, x, cdf=True):
375
+ """Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.
376
+
377
+ x must be of type float, n of type integer.
378
+
379
+ Simard & L'Ecuyer (2011) [7].
380
+ """
381
+ if np.isnan(n):
382
+ return n # Keep the same type of nan
383
+ if int(n) != n or n <= 0:
384
+ return np.nan
385
+ if x >= 1.0:
386
+ return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
387
+ if x <= 0.0:
388
+ return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
389
+ t = n * x
390
+ if t <= 1.0: # Ruben-Gambino: 1/2n <= x <= 1/n
391
+ if t <= 0.5:
392
+ return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
393
+ if n <= 140:
394
+ prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
395
+ else:
396
+ prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
397
+ return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
398
+ if t >= n - 1: # Ruben-Gambino
399
+ prob = 2 * (1.0 - x)**n
400
+ return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
401
+ if x >= 0.5: # Exact: 2 * smirnov
402
+ prob = 2 * scipy.special.smirnov(n, x)
403
+ return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
404
+
405
+ nxsquared = t * x
406
+ if n <= 140:
407
+ if nxsquared <= 0.754693:
408
+ prob = _kolmogn_DMTW(n, x, cdf=True)
409
+ return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
410
+ if nxsquared <= 4:
411
+ prob = _kolmogn_Pomeranz(n, x, cdf=True)
412
+ return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
413
+ # Now use Miller approximation of 2*smirnov
414
+ prob = 2 * scipy.special.smirnov(n, x)
415
+ return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
416
+
417
+ # Split CDF and SF as they have different cutoffs on nxsquared.
418
+ if not cdf:
419
+ if nxsquared >= 370.0:
420
+ return 0.0
421
+ if nxsquared >= 2.2:
422
+ prob = 2 * scipy.special.smirnov(n, x)
423
+ return _clip_prob(prob)
424
+ # Fall through and compute the SF as 1.0-CDF
425
+ if nxsquared >= 18.0:
426
+ cdfprob = 1.0
427
+ elif n <= 100000 and n * x**1.5 <= 1.4:
428
+ cdfprob = _kolmogn_DMTW(n, x, cdf=True)
429
+ else:
430
+ cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
431
+ return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
432
+
433
+
434
+ def _kolmogn_p(n, x):
435
+ """Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.
436
+
437
+ x must be of type float, n of type integer.
438
+ """
439
+ if np.isnan(n):
440
+ return n # Keep the same type of nan
441
+ if int(n) != n or n <= 0:
442
+ return np.nan
443
+ if x >= 1.0 or x <= 0:
444
+ return 0
445
+ t = n * x
446
+ if t <= 1.0:
447
+ # Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
448
+ if t <= 0.5:
449
+ return 0.0
450
+ if n <= 140:
451
+ prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
452
+ else:
453
+ prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
454
+ return prd * 2 * n**2
455
+ if t >= n - 1:
456
+ # Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
457
+ return 2 * (1.0 - x) ** (n-1) * n
458
+ if x >= 0.5:
459
+ return 2 * scipy.stats.ksone.pdf(x, n)
460
+
461
+ # Just take a small delta.
462
+ # Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer i,
463
+ # as the CDF is a piecewise degree n polynomial.
464
+ # It has knots at 1/n, 2/n, ... (n-1)/n
465
+ # and is not a C-infinity function at the knots
466
+ delta = x / 2.0**16
467
+ delta = min(delta, x - 1.0/n)
468
+ delta = min(delta, 0.5 - x)
469
+
470
+ def _kk(_x):
471
+ return kolmogn(n, _x)
472
+
473
+ return _derivative(_kk, x, dx=delta, order=5)
474
+
475
+
476
+ def _kolmogni(n, p, q):
477
+ """Computes the PPF/ISF of kolmogn.
478
+
479
+ n of type integer, n>= 1
480
+ p is the CDF, q the SF, p+q=1
481
+ """
482
+ if np.isnan(n):
483
+ return n # Keep the same type of nan
484
+ if int(n) != n or n <= 0:
485
+ return np.nan
486
+ if p <= 0:
487
+ return 1.0/n
488
+ if q <= 0:
489
+ return 1.0
490
+ delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
491
+ if delta <= 1.0/n:
492
+ return (delta + 1.0 / n) / 2
493
+ x = -np.expm1(np.log(q/2.0)/n)
494
+ if x >= 1 - 1.0/n:
495
+ return x
496
+ x1 = scu._kolmogci(p)/np.sqrt(n)
497
+ x1 = min(x1, 1.0 - 1.0/n)
498
+
499
+ def _f(x):
500
+ return _kolmogn(n, x) - p
501
+
502
+ return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
503
+
504
+
505
+ def kolmogn(n, x, cdf=True):
506
+ """Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.
507
+
508
+ The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
509
+ for a sample of size n drawn from a distribution with CDF F(t), where
510
+ :math:`D_n &= sup_t |F_n(t) - F(t)|`, and
511
+ :math:`F_n(t)` is the Empirical Cumulative Distribution Function of the sample.
512
+
513
+ Parameters
514
+ ----------
515
+ n : integer, array_like
516
+ the number of samples
517
+ x : float, array_like
518
+ The K-S statistic, float between 0 and 1
519
+ cdf : bool, optional
520
+ Whether to compute the CDF (default=True) or the SF.
521
+
522
+ Returns
523
+ -------
524
+ cdf : ndarray
525
+ CDF (or SF if cdf is False) at the specified locations.
526
+
527
+ The return value has shape the result of numpy broadcasting n and x.
528
+ """
529
+ it = np.nditer([n, x, cdf, None],
530
+ op_dtypes=[None, np.float64, np.bool_, np.float64])
531
+ for _n, _x, _cdf, z in it:
532
+ if np.isnan(_n):
533
+ z[...] = _n
534
+ continue
535
+ if int(_n) != _n:
536
+ raise ValueError(f'n is not integral: {_n}')
537
+ z[...] = _kolmogn(int(_n), _x, cdf=_cdf)
538
+ result = it.operands[-1]
539
+ return result
540
+
541
+
542
+ def kolmognp(n, x):
543
+ """Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.
544
+
545
+ Parameters
546
+ ----------
547
+ n : integer, array_like
548
+ the number of samples
549
+ x : float, array_like
550
+ The K-S statistic, float between 0 and 1
551
+
552
+ Returns
553
+ -------
554
+ pdf : ndarray
555
+ The PDF at the specified locations
556
+
557
+ The return value has the shape produced by numpy broadcasting of n and x.
558
+ """
559
+ it = np.nditer([n, x, None])
560
+ for _n, _x, z in it:
561
+ if np.isnan(_n):
562
+ z[...] = _n
563
+ continue
564
+ if int(_n) != _n:
565
+ raise ValueError(f'n is not integral: {_n}')
566
+ z[...] = _kolmogn_p(int(_n), _x)
567
+ result = it.operands[-1]
568
+ return result
569
+
570
+
571
+ def kolmogni(n, q, cdf=True):
572
+ """Computes the PPF(or ISF) for the two-sided Kolmogorov-Smirnov distribution.
573
+
574
+ Parameters
575
+ ----------
576
+ n : integer, array_like
577
+ the number of samples
578
+ q : float, array_like
579
+ Probabilities, float between 0 and 1
580
+ cdf : bool, optional
581
+ Whether to compute the PPF (default=True) or the ISF.
582
+
583
+ Returns
584
+ -------
585
+ ppf : ndarray
586
+ PPF (or ISF if cdf is False) at the specified locations
587
+
588
+ The return value has the shape produced by numpy broadcasting of n and q.
589
+ """
590
+ it = np.nditer([n, q, cdf, None])
591
+ for _n, _q, _cdf, z in it:
592
+ if np.isnan(_n):
593
+ z[...] = _n
594
+ continue
595
+ if int(_n) != _n:
596
+ raise ValueError(f'n is not integral: {_n}')
597
+ _pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
598
+ z[...] = _kolmogni(int(_n), _pcdf, _psf)
599
+ result = it.operands[-1]
600
+ return result
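
The identity noted in the header comments (for d > 0.5 the intersection term vanishes, so the two-sided survival function is exactly twice the one-sided Smirnov survival function) can be spot-checked through the public `scipy.stats.kstwo` distribution, which is assumed here to be backed by `kolmogn`; a minimal sketch:

>>> import numpy as np
>>> from scipy.stats import kstwo
>>> from scipy.special import smirnov
>>> n, x = 10, 0.6                        # x >= 0.5, so the 2*smirnov branch applies
>>> bool(np.isclose(kstwo.sf(x, n), 2 * smirnov(n, x)))
True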
.venv/Lib/site-packages/scipy/stats/_mannwhitneyu.py ADDED
@@ -0,0 +1,519 @@
1
+ import numpy as np
2
+ from collections import namedtuple
3
+ from scipy import special
4
+ from scipy import stats
5
+ from scipy.stats._stats_py import _rankdata
6
+ from ._axis_nan_policy import _axis_nan_policy_factory
7
+
8
+
9
+ def _broadcast_concatenate(x, y, axis):
10
+ '''Broadcast then concatenate arrays, leaving concatenation axis last'''
11
+ x = np.moveaxis(x, axis, -1)
12
+ y = np.moveaxis(y, axis, -1)
13
+ z = np.broadcast(x[..., 0], y[..., 0])
14
+ x = np.broadcast_to(x, z.shape + (x.shape[-1],))
15
+ y = np.broadcast_to(y, z.shape + (y.shape[-1],))
16
+ z = np.concatenate((x, y), axis=-1)
17
+ return x, y, z
18
+
19
+
20
+ class _MWU:
21
+ '''Distribution of MWU statistic under the null hypothesis'''
22
+ # Possible improvement: if m and n are small enough, use integer arithmetic
23
+
24
+ def __init__(self):
25
+ '''Minimal initializer'''
26
+ self._fmnks = -np.ones((1, 1, 1))
27
+ self._recursive = None
28
+
29
+ def pmf(self, k, m, n):
30
+
31
+ # In practice, `pmf` is never called with k > m*n/2.
32
+ # If it were, we'd exploit symmetry here:
33
+ # k = np.array(k, copy=True)
34
+ # k2 = m*n - k
35
+ # i = k2 < k
36
+ # k[i] = k2[i]
37
+
38
+ if (self._recursive is None and m <= 500 and n <= 500
39
+ or self._recursive):
40
+ return self.pmf_recursive(k, m, n)
41
+ else:
42
+ return self.pmf_iterative(k, m, n)
43
+
44
+ def pmf_recursive(self, k, m, n):
45
+ '''Probability mass function, recursive version'''
46
+ self._resize_fmnks(m, n, np.max(k))
47
+ # could loop over just the unique elements, but probably not worth
48
+ # the time to find them
49
+ for i in np.ravel(k):
50
+ self._f(m, n, i)
51
+ return self._fmnks[m, n, k] / special.binom(m + n, m)
52
+
53
+ def pmf_iterative(self, k, m, n):
54
+ '''Probability mass function, iterative version'''
55
+ fmnks = {}
56
+ for i in np.ravel(k):
57
+ fmnks = _mwu_f_iterative(m, n, i, fmnks)
58
+ return (np.array([fmnks[(m, n, ki)] for ki in k])
59
+ / special.binom(m + n, m))
60
+
61
+ def cdf(self, k, m, n):
62
+ '''Cumulative distribution function'''
63
+
64
+ # In practice, `cdf` is never called with k > m*n/2.
65
+ # If it were, we'd exploit symmetry here rather than in `sf`
66
+ pmfs = self.pmf(np.arange(0, np.max(k) + 1), m, n)
67
+ cdfs = np.cumsum(pmfs)
68
+ return cdfs[k]
69
+
70
+ def sf(self, k, m, n):
71
+ '''Survival function'''
72
+ # Note that both CDF and SF include the PMF at k. The p-value is
73
+ # calculated from the SF and should include the mass at k, so this
74
+ # is desirable
75
+
76
+ # Use the fact that the distribution is symmetric; i.e.
77
+ # _f(m, n, m*n-k) = _f(m, n, k), and sum from the left
78
+ kc = np.asarray(m*n - k) # complement of k
79
+ i = k < kc
80
+ if np.any(i):
81
+ kc[i] = k[i]
82
+ cdfs = np.asarray(self.cdf(kc, m, n))
83
+ cdfs[i] = 1. - cdfs[i] + self.pmf(kc[i], m, n)
84
+ else:
85
+ cdfs = np.asarray(self.cdf(kc, m, n))
86
+ return cdfs[()]
87
+
88
+ def _resize_fmnks(self, m, n, k):
89
+ '''If necessary, expand the array that remembers PMF values'''
90
+ # could probably use `np.pad` but I'm not sure it would save code
91
+ shape_old = np.array(self._fmnks.shape)
92
+ shape_new = np.array((m+1, n+1, k+1))
93
+ if np.any(shape_new > shape_old):
94
+ shape = np.maximum(shape_old, shape_new)
95
+ fmnks = -np.ones(shape) # create the new array
96
+ m0, n0, k0 = shape_old
97
+ fmnks[:m0, :n0, :k0] = self._fmnks # copy remembered values
98
+ self._fmnks = fmnks
99
+
100
+ def _f(self, m, n, k):
101
+ '''Recursive implementation of function of [3] Theorem 2.5'''
102
+
103
+ # [3] Theorem 2.5 Line 1
104
+ if k < 0 or m < 0 or n < 0 or k > m*n:
105
+ return 0
106
+
107
+ # if already calculated, return the value
108
+ if self._fmnks[m, n, k] >= 0:
109
+ return self._fmnks[m, n, k]
110
+
111
+ if k == 0 and m >= 0 and n >= 0: # [3] Theorem 2.5 Line 2
112
+ fmnk = 1
113
+ else: # [3] Theorem 2.5 Line 3 / Equation 3
114
+ fmnk = self._f(m-1, n, k-n) + self._f(m, n-1, k)
115
+
116
+ self._fmnks[m, n, k] = fmnk # remember result
117
+
118
+ return fmnk
119
+
120
+
121
+ # Maintain state for faster repeat calls to mannwhitneyu w/ method='exact'
122
+ _mwu_state = _MWU()
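As a hedged sanity check (not scipy code), the Theorem 2.5 recurrence that `_MWU._f` implements can be reproduced directly and compared with the total number of rank arrangements, C(m+n, m):

    from math import comb
    from functools import lru_cache

    @lru_cache(maxsize=None)
    def f(m, n, k):
        # number of arrangements of m + n ranks whose U statistic equals k
        if k < 0 or m < 0 or n < 0 or k > m * n:
            return 0
        if k == 0:
            return 1
        return f(m - 1, n, k - n) + f(m, n - 1, k)

    m, n = 3, 4
    counts = [f(m, n, k) for k in range(m * n + 1)]
    print(counts)                         # symmetric about m*n/2, as expected
    print(sum(counts) == comb(m + n, m))  # True, so counts/comb(m+n, m) is a PMF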
123
+
124
+
125
+ def _mwu_f_iterative(m, n, k, fmnks):
126
+ '''Iterative implementation of function of [3] Theorem 2.5'''
127
+
128
+ def _base_case(m, n, k):
129
+ '''Base cases from recursive version'''
130
+
131
+ # if already calculated, return the value
132
+ if fmnks.get((m, n, k), -1) >= 0:
133
+ return fmnks[(m, n, k)]
134
+
135
+ # [3] Theorem 2.5 Line 1
136
+ elif k < 0 or m < 0 or n < 0 or k > m*n:
137
+ return 0
138
+
139
+ # [3] Theorem 2.5 Line 2
140
+ elif k == 0 and m >= 0 and n >= 0:
141
+ return 1
142
+
143
+ return None
144
+
145
+ stack = [(m, n, k)]
146
+ fmnk = None
147
+
148
+ while stack:
149
+ # Popping only if necessary would save a tiny bit of time, but it's not worth it.
150
+ m, n, k = stack.pop()
151
+
152
+ # If we're at a base case, continue (stack unwinds)
153
+ fmnk = _base_case(m, n, k)
154
+ if fmnk is not None:
155
+ fmnks[(m, n, k)] = fmnk
156
+ continue
157
+
158
+ # If both terms are base cases, continue (stack unwinds)
159
+ f1 = _base_case(m-1, n, k-n)
160
+ f2 = _base_case(m, n-1, k)
161
+ if f1 is not None and f2 is not None:
162
+ # [3] Theorem 2.5 Line 3 / Equation 3
163
+ fmnk = f1 + f2
164
+ fmnks[(m, n, k)] = fmnk
165
+ continue
166
+
167
+ # recurse deeper
168
+ stack.append((m, n, k))
169
+ if f1 is None:
170
+ stack.append((m-1, n, k-n))
171
+ if f2 is None:
172
+ stack.append((m, n-1, k))
173
+
174
+ return fmnks
175
+
176
+
177
+ def _get_mwu_z(U, n1, n2, t, axis=0, continuity=True):
178
+ '''Standardized MWU statistic'''
179
+ # Follows mannwhitneyu [2]
180
+ mu = n1 * n2 / 2
181
+ n = n1 + n2
182
+
183
+ # Tie correction according to [2], "Normal approximation and tie correction"
184
+ # "A more computationally-efficient form..."
185
+ tie_term = (t**3 - t).sum(axis=-1)
186
+ s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))
187
+
188
+ numerator = U - mu
189
+
190
+ # Continuity correction.
191
+ # Because SF is always used to calculate the p-value, we can always
192
+ # _subtract_ 0.5 for the continuity correction. This always increases the
193
+ # p-value to account for the rest of the probability mass _at_ q = U.
194
+ if continuity:
195
+ numerator -= 0.5
196
+
197
+ # no problem evaluating the norm SF at an infinity
198
+ with np.errstate(divide='ignore', invalid='ignore'):
199
+ z = numerator / s
200
+ return z
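A small hedged numerical illustration (the tie counts are made up, not taken from the diff) of how the tie term above shrinks the null standard deviation used for the z statistic:

    import numpy as np

    n1, n2 = 5, 4
    n = n1 + n2
    t = np.array([2, 3])              # one tie of size 2, one of size 3
    tie_term = (t**3 - t).sum()
    s_no_ties = np.sqrt(n1*n2/12 * (n + 1))
    s_ties = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n - 1))))
    print(s_no_ties, s_ties)          # ties reduce the standard deviation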
201
+
202
+
203
+ def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
204
+ ''' Input validation and standardization for mannwhitneyu '''
205
+ # Would use np.asarray_chkfinite, but infs are OK
206
+ x, y = np.atleast_1d(x), np.atleast_1d(y)
207
+ if np.isnan(x).any() or np.isnan(y).any():
208
+ raise ValueError('`x` and `y` must not contain NaNs.')
209
+ if np.size(x) == 0 or np.size(y) == 0:
210
+ raise ValueError('`x` and `y` must be of nonzero size.')
211
+
212
+ bools = {True, False}
213
+ if use_continuity not in bools:
214
+ raise ValueError(f'`use_continuity` must be one of {bools}.')
215
+
216
+ alternatives = {"two-sided", "less", "greater"}
217
+ alternative = alternative.lower()
218
+ if alternative not in alternatives:
219
+ raise ValueError(f'`alternative` must be one of {alternatives}.')
220
+
221
+ axis_int = int(axis)
222
+ if axis != axis_int:
223
+ raise ValueError('`axis` must be an integer.')
224
+
225
+ if not isinstance(method, stats.PermutationMethod):
226
+ methods = {"asymptotic", "exact", "auto"}
227
+ method = method.lower()
228
+ if method not in methods:
229
+ raise ValueError(f'`method` must be one of {methods}.')
230
+
231
+ return x, y, use_continuity, alternative, axis_int, method
232
+
233
+
234
+ def _mwu_choose_method(n1, n2, ties):
235
+ """Choose method 'asymptotic' or 'exact' depending on input size, ties"""
236
+
237
+ # if both inputs are large, asymptotic is OK
238
+ if n1 > 8 and n2 > 8:
239
+ return "asymptotic"
240
+
241
+ # if there are any ties, asymptotic is preferred
242
+ if ties:
243
+ return "asymptotic"
244
+
245
+ return "exact"
246
+
247
+
248
+ MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))
249
+
250
+
251
+ @_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
252
+ def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
253
+ axis=0, method="auto"):
254
+ r'''Perform the Mann-Whitney U rank test on two independent samples.
255
+
256
+ The Mann-Whitney U test is a nonparametric test of the null hypothesis
257
+ that the distribution underlying sample `x` is the same as the
258
+ distribution underlying sample `y`. It is often used as a test of
259
+ difference in location between distributions.
260
+
261
+ Parameters
262
+ ----------
263
+ x, y : array-like
264
+ N-d arrays of samples. The arrays must be broadcastable except along
265
+ the dimension given by `axis`.
266
+ use_continuity : bool, optional
267
+ Whether a continuity correction (1/2) should be applied.
268
+ Default is True when `method` is ``'asymptotic'``; has no effect
269
+ otherwise.
270
+ alternative : {'two-sided', 'less', 'greater'}, optional
271
+ Defines the alternative hypothesis. Default is 'two-sided'.
272
+ Let *F(u)* and *G(u)* be the cumulative distribution functions of the
273
+ distributions underlying `x` and `y`, respectively. Then the following
274
+ alternative hypotheses are available:
275
+
276
+ * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
277
+ at least one *u*.
278
+ * 'less': the distribution underlying `x` is stochastically less
279
+ than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
280
+ * 'greater': the distribution underlying `x` is stochastically greater
281
+ than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.
282
+
283
+ Note that the mathematical expressions in the alternative hypotheses
284
+ above describe the CDFs of the underlying distributions. The directions
285
+ of the inequalities appear inconsistent with the natural language
286
+ description at first glance, but they are not. For example, suppose
287
+ *X* and *Y* are random variables that follow distributions with CDFs
288
+ *F* and *G*, respectively. If *F(u) > G(u)* for all *u*, samples drawn
289
+ from *X* tend to be less than those drawn from *Y*.
290
+
291
+ Under a more restrictive set of assumptions, the alternative hypotheses
292
+ can be expressed in terms of the locations of the distributions;
293
+ see [5] section 5.1.
294
+ axis : int, optional
295
+ Axis along which to perform the test. Default is 0.
296
+ method : {'auto', 'asymptotic', 'exact'} or `PermutationMethod` instance, optional
297
+ Selects the method used to calculate the *p*-value.
298
+ Default is 'auto'. The following options are available.
299
+
300
+ * ``'asymptotic'``: compares the standardized test statistic
301
+ against the normal distribution, correcting for ties.
302
+ * ``'exact'``: computes the exact *p*-value by comparing the observed
303
+ :math:`U` statistic against the exact distribution of the :math:`U`
304
+ statistic under the null hypothesis. No correction is made for ties.
305
+ * ``'auto'``: chooses ``'exact'`` when the size of one of the samples
306
+ is less than or equal to 8 and there are no ties;
307
+ chooses ``'asymptotic'`` otherwise.
308
+ * `PermutationMethod` instance. In this case, the p-value
309
+ is computed using `permutation_test` with the provided
310
+ configuration options and other appropriate settings.
311
+
312
+ Returns
313
+ -------
314
+ res : MannwhitneyuResult
315
+ An object containing attributes:
316
+
317
+ statistic : float
318
+ The Mann-Whitney U statistic corresponding with sample `x`. See
319
+ Notes for the test statistic corresponding with sample `y`.
320
+ pvalue : float
321
+ The associated *p*-value for the chosen `alternative`.
322
+
323
+ Notes
324
+ -----
325
+ If ``U1`` is the statistic corresponding with sample `x`, then the
326
+ statistic corresponding with sample `y` is
327
+ ``U2 = x.shape[axis] * y.shape[axis] - U1``.
328
+
329
+ `mannwhitneyu` is for independent samples. For related / paired samples,
330
+ consider `scipy.stats.wilcoxon`.
331
+
332
+ `method` ``'exact'`` is recommended when there are no ties and when either
333
+ sample size is less than 8 [1]_. The implementation follows the recurrence
334
+ relation originally proposed in [1]_ as it is described in [3]_.
335
+ Note that the exact method is *not* corrected for ties, but
336
+ `mannwhitneyu` will not raise errors or warnings if there are ties in the
337
+ data. If there are ties and either sample is small (fewer than ~10
338
+ observations), consider passing an instance of `PermutationMethod`
339
+ as the `method` to perform a permutation test.
340
+
341
+ The Mann-Whitney U test is a non-parametric version of the t-test for
342
+ independent samples. When the means of samples from the populations
343
+ are normally distributed, consider `scipy.stats.ttest_ind`.
344
+
345
+ See Also
346
+ --------
347
+ scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind
348
+
349
+ References
350
+ ----------
351
+ .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
352
+ variables is stochastically larger than the other", The Annals of
353
+ Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
354
+ .. [2] Mann-Whitney U Test, Wikipedia,
355
+ http://en.wikipedia.org/wiki/Mann-Whitney_U_test
356
+ .. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the
357
+ Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and
358
+ Inference, Vol. 79, pp. 349-364, 1999.
359
+ .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
360
+ Learning Support Centre, 2004.
361
+ .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
362
+ or t-test? On assumptions for hypothesis tests and multiple
363
+ interpretations of decision rules." Statistics surveys, Vol. 4, pp.
364
+ 1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/
365
+
366
+ Examples
367
+ --------
368
+ We follow the example from [4]_: nine randomly sampled young adults were
369
+ diagnosed with type II diabetes at the ages below.
370
+
371
+ >>> males = [19, 22, 16, 29, 24]
372
+ >>> females = [20, 11, 17, 12]
373
+
374
+ We use the Mann-Whitney U test to assess whether there is a statistically
375
+ significant difference in the diagnosis age of males and females.
376
+ The null hypothesis is that the distribution of male diagnosis ages is
377
+ the same as the distribution of female diagnosis ages. We decide
378
+ that a confidence level of 95% is required to reject the null hypothesis
379
+ in favor of the alternative that the distributions are different.
380
+ Since the number of samples is very small and there are no ties in the
381
+ data, we can compare the observed test statistic against the *exact*
382
+ distribution of the test statistic under the null hypothesis.
383
+
384
+ >>> from scipy.stats import mannwhitneyu
385
+ >>> U1, p = mannwhitneyu(males, females, method="exact")
386
+ >>> print(U1)
387
+ 17.0
388
+
389
+ `mannwhitneyu` always reports the statistic associated with the first
390
+ sample, which, in this case, is males. This agrees with :math:`U_M = 17`
391
+ reported in [4]_. The statistic associated with the second sample
392
+ can be calculated:
393
+
394
+ >>> nx, ny = len(males), len(females)
395
+ >>> U2 = nx*ny - U1
396
+ >>> print(U2)
397
+ 3.0
398
+
399
+ This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
400
+ *p*-value can be calculated from either statistic, and the value produced
401
+ by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.
402
+
403
+ >>> print(p)
404
+ 0.1111111111111111
405
+
406
+ The exact distribution of the test statistic is asymptotically normal, so
407
+ the example continues by comparing the exact *p*-value against the
408
+ *p*-value produced using the normal approximation.
409
+
410
+ >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
411
+ >>> print(pnorm)
412
+ 0.11134688653314041
413
+
414
+ Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
415
+ value :math:`p = 0.09` given in [4]_. The reason is that [4]_
416
+ does not apply the continuity correction performed by `mannwhitneyu`;
417
+ `mannwhitneyu` reduces the distance between the test statistic and the
418
+ mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
419
+ discrete statistic is being compared against a continuous distribution.
420
+ Here, the :math:`U` statistic used is less than the mean, so we reduce
421
+ the distance by adding 0.5 in the numerator.
422
+
423
+ >>> import numpy as np
424
+ >>> from scipy.stats import norm
425
+ >>> U = min(U1, U2)
426
+ >>> N = nx + ny
427
+ >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
428
+ >>> p = 2 * norm.cdf(z) # use CDF to get p-value from smaller statistic
429
+ >>> print(p)
430
+ 0.11134688653314041
431
+
432
+ If desired, we can disable the continuity correction to get a result
433
+ that agrees with that reported in [4]_.
434
+
435
+ >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
436
+ ... method="asymptotic")
437
+ >>> print(pnorm)
438
+ 0.0864107329737
439
+
440
+ Regardless of whether we perform an exact or asymptotic test, the
441
+ probability of the test statistic being as extreme or more extreme by
442
+ chance exceeds 5%, so we do not consider the results statistically
443
+ significant.
444
+
445
+ Suppose that, before seeing the data, we had hypothesized that females
446
+ would tend to be diagnosed at a younger age than males.
447
+ In that case, it would be natural to provide the female ages as the
448
+ first input, and we would have performed a one-sided test using
449
+ ``alternative = 'less'``: females are diagnosed at an age that is
450
+ stochastically less than that of males.
451
+
452
+ >>> res = mannwhitneyu(females, males, alternative="less", method="exact")
453
+ >>> print(res)
454
+ MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)
455
+
456
+ Again, the probability of getting a sufficiently low value of the
457
+ test statistic by chance under the null hypothesis is greater than 5%,
458
+ so we do not reject the null hypothesis in favor of our alternative.
459
+
460
+ If it is reasonable to assume that the means of samples from the
461
+ populations are normally distributed, we could have used a t-test to
462
+ perform the analysis.
463
+
464
+ >>> from scipy.stats import ttest_ind
465
+ >>> res = ttest_ind(females, males, alternative="less")
466
+ >>> print(res)
467
+ Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924)
468
+
469
+ Under this assumption, the *p*-value would be low enough to reject the
470
+ null hypothesis in favor of the alternative.
471
+
472
+ '''
473
+
474
+ x, y, use_continuity, alternative, axis_int, method = (
475
+ _mwu_input_validation(x, y, use_continuity, alternative, axis, method))
476
+
477
+ x, y, xy = _broadcast_concatenate(x, y, axis)
478
+
479
+ n1, n2 = x.shape[-1], y.shape[-1]
480
+
481
+ # Follows [2]
482
+ ranks, t = _rankdata(xy, 'average', return_ties=True) # method 2, step 1
483
+ R1 = ranks[..., :n1].sum(axis=-1) # method 2, step 2
484
+ U1 = R1 - n1*(n1+1)/2 # method 2, step 3
485
+ U2 = n1 * n2 - U1 # as U1 + U2 = n1 * n2
486
+
487
+ if alternative == "greater":
488
+ U, f = U1, 1 # U is the statistic to use for p-value, f is a factor
489
+ elif alternative == "less":
490
+ U, f = U2, 1 # Due to symmetry, use SF of U2 rather than CDF of U1
491
+ else:
492
+ U, f = np.maximum(U1, U2), 2 # multiply SF by two for two-sided test
493
+
494
+ if method == "auto":
495
+ method = _mwu_choose_method(n1, n2, np.any(t > 1))
496
+
497
+ if method == "exact":
498
+ p = _mwu_state.sf(U.astype(int), min(n1, n2), max(n1, n2))
499
+ elif method == "asymptotic":
500
+ z = _get_mwu_z(U, n1, n2, t, continuity=use_continuity)
501
+ p = stats.norm.sf(z)
502
+ else: # `PermutationMethod` instance (already validated)
503
+ def statistic(x, y, axis):
504
+ return mannwhitneyu(x, y, use_continuity=use_continuity,
505
+ alternative=alternative, axis=axis,
506
+ method="asymptotic").statistic
507
+
508
+ res = stats.permutation_test((x, y), statistic, axis=axis,
509
+ **method._asdict(), alternative=alternative)
510
+ p = res.pvalue
511
+ f = 1
512
+
513
+ p *= f
514
+
515
+ # Ensure that the p-value is not greater than 1
516
+ # This could happen for exact test when U = m*n/2
517
+ p = np.clip(p, 0, 1)
518
+
519
+ return MannwhitneyuResult(U1, p)
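A hedged usage sketch (not part of the diff) of the vectorisation that the axis handling above enables, running several independent tests in one call:

    import numpy as np
    from scipy.stats import mannwhitneyu

    rng = np.random.default_rng(12345)
    x = rng.normal(size=(3, 20))             # three samples of 20 observations
    y = rng.normal(loc=0.5, size=(3, 25))    # three samples of 25 observations
    res = mannwhitneyu(x, y, axis=-1)
    print(res.statistic.shape, res.pvalue.shape)   # (3,) (3,)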
.venv/Lib/site-packages/scipy/stats/_morestats.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_mstats_basic.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_mstats_extras.py ADDED
@@ -0,0 +1,521 @@
1
+ """
2
+ Additional statistics functions with support for masked arrays.
3
+
4
+ """
5
+
6
+ # Original author (2007): Pierre GF Gerard-Marchant
7
+
8
+
9
+ __all__ = ['compare_medians_ms',
10
+ 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
11
+ 'idealfourths',
12
+ 'median_cihs','mjci','mquantiles_cimj',
13
+ 'rsh',
14
+ 'trimmed_mean_ci',]
15
+
16
+
17
+ import numpy as np
18
+ from numpy import float64, ndarray
19
+
20
+ import numpy.ma as ma
21
+ from numpy.ma import MaskedArray
22
+
23
+ from . import _mstats_basic as mstats
24
+
25
+ from scipy.stats.distributions import norm, beta, t, binom
26
+
27
+
28
+ def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
29
+ """
30
+ Computes quantile estimates with the Harrell-Davis method.
31
+
32
+ The quantile estimates are calculated as a weighted linear combination
33
+ of order statistics.
34
+
35
+ Parameters
36
+ ----------
37
+ data : array_like
38
+ Data array.
39
+ prob : sequence, optional
40
+ Sequence of probabilities at which to compute the quantiles.
41
+ axis : int or None, optional
42
+ Axis along which to compute the quantiles. If None, use a flattened
43
+ array.
44
+ var : bool, optional
45
+ Whether to return the variance of the estimate.
46
+
47
+ Returns
48
+ -------
49
+ hdquantiles : MaskedArray
50
+ A (p,) array of quantiles (if `var` is False), or a (2,p) array of
51
+ quantiles and variances (if `var` is True), where ``p`` is the
52
+ number of quantiles.
53
+
54
+ See Also
55
+ --------
56
+ hdquantiles_sd
57
+
58
+ Examples
59
+ --------
60
+ >>> import numpy as np
61
+ >>> from scipy.stats.mstats import hdquantiles
62
+ >>>
63
+ >>> # Sample data
64
+ >>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
65
+ >>>
66
+ >>> # Probabilities at which to compute quantiles
67
+ >>> probabilities = [0.25, 0.5, 0.75]
68
+ >>>
69
+ >>> # Compute Harrell-Davis quantile estimates
70
+ >>> quantile_estimates = hdquantiles(data, prob=probabilities)
71
+ >>>
72
+ >>> # Display the quantile estimates
73
+ >>> for i, quantile in enumerate(probabilities):
74
+ ... print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}")
75
+ 25th percentile: 3.1505820231763066 # may vary
76
+ 50th percentile: 5.194344084883956
77
+ 75th percentile: 7.430626414674935
78
+
79
+ """
80
+ def _hd_1D(data,prob,var):
81
+ "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
82
+ xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
83
+ # Don't use length here, in case we have a numpy scalar
84
+ n = xsorted.size
85
+
86
+ hd = np.empty((2,len(prob)), float64)
87
+ if n < 2:
88
+ hd.flat = np.nan
89
+ if var:
90
+ return hd
91
+ return hd[0]
92
+
93
+ v = np.arange(n+1) / float(n)
94
+ betacdf = beta.cdf
95
+ for (i,p) in enumerate(prob):
96
+ _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
97
+ w = _w[1:] - _w[:-1]
98
+ hd_mean = np.dot(w, xsorted)
99
+ hd[0,i] = hd_mean
100
+ #
101
+ hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
102
+ #
103
+ hd[0, prob == 0] = xsorted[0]
104
+ hd[0, prob == 1] = xsorted[-1]
105
+ if var:
106
+ hd[1, prob == 0] = hd[1, prob == 1] = np.nan
107
+ return hd
108
+ return hd[0]
109
+ # Initialization & checks
110
+ data = ma.array(data, copy=False, dtype=float64)
111
+ p = np.atleast_1d(np.asarray(prob))
112
+ # Computes quantiles along axis (or globally)
113
+ if (axis is None) or (data.ndim == 1):
114
+ result = _hd_1D(data, p, var)
115
+ else:
116
+ if data.ndim > 2:
117
+ raise ValueError("Array 'data' must be at most two dimensional, "
118
+ "but got data.ndim = %d" % data.ndim)
119
+ result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
120
+
121
+ return ma.fix_invalid(result, copy=False)
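A minimal sketch (not part of the diff) of the Harrell-Davis weights used inside `_hd_1D`: for a probability p they are increments of a Beta((n+1)p, (n+1)(1-p)) CDF over the grid i/n, and the estimate is their dot product with the sorted data:

    import numpy as np
    from scipy.stats import beta

    x = np.sort([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
    n, p = x.size, 0.5
    v = np.arange(n + 1) / n
    w = np.diff(beta.cdf(v, (n + 1) * p, (n + 1) * (1 - p)))
    print(w.sum())       # 1.0 -- a convex combination of order statistics
    print(np.dot(w, x))  # agrees with hdquantiles(x, prob=[0.5])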
122
+
123
+
124
+ def hdmedian(data, axis=-1, var=False):
125
+ """
126
+ Returns the Harrell-Davis estimate of the median along the given axis.
127
+
128
+ Parameters
129
+ ----------
130
+ data : ndarray
131
+ Data array.
132
+ axis : int, optional
133
+ Axis along which to compute the quantiles. If None, use a flattened
134
+ array.
135
+ var : bool, optional
136
+ Whether to return the variance of the estimate.
137
+
138
+ Returns
139
+ -------
140
+ hdmedian : MaskedArray
141
+ The median values. If ``var=True``, the variance is returned inside
142
+ the masked array. E.g. for a 1-D array the shape change from (1,) to
143
+ (2,).
144
+
145
+ """
146
+ result = hdquantiles(data,[0.5], axis=axis, var=var)
147
+ return result.squeeze()
148
+
149
+
150
+ def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
151
+ """
152
+ The standard error of the Harrell-Davis quantile estimates by jackknife.
153
+
154
+ Parameters
155
+ ----------
156
+ data : array_like
157
+ Data array.
158
+ prob : sequence, optional
159
+ Sequence of quantiles to compute.
160
+ axis : int, optional
161
+ Axis along which to compute the quantiles. If None, use a flattened
162
+ array.
163
+
164
+ Returns
165
+ -------
166
+ hdquantiles_sd : MaskedArray
167
+ Standard error of the Harrell-Davis quantile estimates.
168
+
169
+ See Also
170
+ --------
171
+ hdquantiles
172
+
173
+ """
174
+ def _hdsd_1D(data, prob):
175
+ "Computes the std error for 1D arrays."
176
+ xsorted = np.sort(data.compressed())
177
+ n = len(xsorted)
178
+
179
+ hdsd = np.empty(len(prob), float64)
180
+ if n < 2:
181
+ hdsd.flat = np.nan
182
+
183
+ vv = np.arange(n) / float(n-1)
184
+ betacdf = beta.cdf
185
+
186
+ for (i,p) in enumerate(prob):
187
+ _w = betacdf(vv, n*p, n*(1-p))
188
+ w = _w[1:] - _w[:-1]
189
+ # cumulative sum of weights and data points if
190
+ # ith point is left out for jackknife
191
+ mx_ = np.zeros_like(xsorted)
192
+ mx_[1:] = np.cumsum(w * xsorted[:-1])
193
+ # similar but from the right
194
+ mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
195
+ hdsd[i] = np.sqrt(mx_.var() * (n - 1))
196
+ return hdsd
197
+
198
+ # Initialization & checks
199
+ data = ma.array(data, copy=False, dtype=float64)
200
+ p = np.atleast_1d(np.asarray(prob))
201
+ # Computes quantiles along axis (or globally)
202
+ if (axis is None):
203
+ result = _hdsd_1D(data, p)
204
+ else:
205
+ if data.ndim > 2:
206
+ raise ValueError("Array 'data' must be at most two dimensional, "
207
+ "but got data.ndim = %d" % data.ndim)
208
+ result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
209
+
210
+ return ma.fix_invalid(result, copy=False).ravel()
211
+
212
+
213
+ def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
214
+ alpha=0.05, axis=None):
215
+ """
216
+ Selected confidence interval of the trimmed mean along the given axis.
217
+
218
+ Parameters
219
+ ----------
220
+ data : array_like
221
+ Input data.
222
+ limits : {None, tuple}, optional
223
+ None or a two item tuple.
224
+ Tuple of the percentages to cut on each side of the array, with respect
225
+ to the number of unmasked data, as floats between 0. and 1. If ``n``
226
+ is the number of unmasked data before trimming, then
227
+ (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
228
+ largest data are masked. The total number of unmasked data after
229
+ trimming is ``n * (1. - sum(limits))``.
230
+ The value of one limit can be set to None to indicate an open interval.
231
+
232
+ Defaults to (0.2, 0.2).
233
+ inclusive : (2,) tuple of boolean, optional
234
+ If relative==False, tuple indicating whether values exactly equal to
235
+ the absolute limits are allowed.
236
+ If relative==True, tuple indicating whether the number of data being
237
+ masked on each side should be rounded (True) or truncated (False).
238
+
239
+ Defaults to (True, True).
240
+ alpha : float, optional
241
+ Confidence level of the intervals.
242
+
243
+ Defaults to 0.05.
244
+ axis : int, optional
245
+ Axis along which to cut. If None, uses a flattened version of `data`.
246
+
247
+ Defaults to None.
248
+
249
+ Returns
250
+ -------
251
+ trimmed_mean_ci : (2,) ndarray
252
+ The lower and upper confidence intervals of the trimmed data.
253
+
254
+ """
255
+ data = ma.array(data, copy=False)
256
+ trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
257
+ tmean = trimmed.mean(axis)
258
+ tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
259
+ df = trimmed.count(axis) - 1
260
+ tppf = t.ppf(1-alpha/2.,df)
261
+ return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
262
+
263
+
264
+ def mjci(data, prob=[0.25,0.5,0.75], axis=None):
265
+ """
266
+ Returns the Maritz-Jarrett estimators of the standard error of selected
267
+ experimental quantiles of the data.
268
+
269
+ Parameters
270
+ ----------
271
+ data : ndarray
272
+ Data array.
273
+ prob : sequence, optional
274
+ Sequence of quantiles to compute.
275
+ axis : int or None, optional
276
+ Axis along which to compute the quantiles. If None, use a flattened
277
+ array.
278
+
279
+ """
280
+ def _mjci_1D(data, p):
281
+ data = np.sort(data.compressed())
282
+ n = data.size
283
+ prob = (np.array(p) * n + 0.5).astype(int)
284
+ betacdf = beta.cdf
285
+
286
+ mj = np.empty(len(prob), float64)
287
+ x = np.arange(1,n+1, dtype=float64) / n
288
+ y = x - 1./n
289
+ for (i,m) in enumerate(prob):
290
+ W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
291
+ C1 = np.dot(W,data)
292
+ C2 = np.dot(W,data**2)
293
+ mj[i] = np.sqrt(C2 - C1**2)
294
+ return mj
295
+
296
+ data = ma.array(data, copy=False)
297
+ if data.ndim > 2:
298
+ raise ValueError("Array 'data' must be at most two dimensional, "
299
+ "but got data.ndim = %d" % data.ndim)
300
+
301
+ p = np.atleast_1d(np.asarray(prob))
302
+ # Computes quantiles along axis (or globally)
303
+ if (axis is None):
304
+ return _mjci_1D(data, p)
305
+ else:
306
+ return ma.apply_along_axis(_mjci_1D, axis, data, p)
307
+
308
+
309
+ def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
310
+ """
311
+ Computes the alpha confidence interval for the selected quantiles of the
312
+ data, with Maritz-Jarrett estimators.
313
+
314
+ Parameters
315
+ ----------
316
+ data : ndarray
317
+ Data array.
318
+ prob : sequence, optional
319
+ Sequence of quantiles to compute.
320
+ alpha : float, optional
321
+ Confidence level of the intervals.
322
+ axis : int or None, optional
323
+ Axis along which to compute the quantiles.
324
+ If None, use a flattened array.
325
+
326
+ Returns
327
+ -------
328
+ ci_lower : ndarray
329
+ The lower boundaries of the confidence interval. Of the same length as
330
+ `prob`.
331
+ ci_upper : ndarray
332
+ The upper boundaries of the confidence interval. Of the same length as
333
+ `prob`.
334
+
335
+ """
336
+ alpha = min(alpha, 1 - alpha)
337
+ z = norm.ppf(1 - alpha/2.)
338
+ xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
339
+ smj = mjci(data, prob, axis=axis)
340
+ return (xq - z * smj, xq + z * smj)
341
+
342
+
343
+ def median_cihs(data, alpha=0.05, axis=None):
344
+ """
345
+ Computes the alpha-level confidence interval for the median of the data.
346
+
347
+ Uses the Hettmansperger-Sheather method.
348
+
349
+ Parameters
350
+ ----------
351
+ data : array_like
352
+ Input data. Masked values are discarded. The input should be 1D only,
353
+ or `axis` should be set to None.
354
+ alpha : float, optional
355
+ Confidence level of the intervals.
356
+ axis : int or None, optional
357
+ Axis along which to compute the quantiles. If None, use a flattened
358
+ array.
359
+
360
+ Returns
361
+ -------
362
+ median_cihs
363
+ Alpha level confidence interval.
364
+
365
+ """
366
+ def _cihs_1D(data, alpha):
367
+ data = np.sort(data.compressed())
368
+ n = len(data)
369
+ alpha = min(alpha, 1-alpha)
370
+ k = int(binom._ppf(alpha/2., n, 0.5))
371
+ gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
372
+ if gk < 1-alpha:
373
+ k -= 1
374
+ gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
375
+ gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
376
+ I = (gk - 1 + alpha)/(gk - gkk)
377
+ lambd = (n-k) * I / float(k + (n-2*k)*I)
378
+ lims = (lambd*data[k] + (1-lambd)*data[k-1],
379
+ lambd*data[n-k-1] + (1-lambd)*data[n-k])
380
+ return lims
381
+ data = ma.array(data, copy=False)
382
+ # Computes quantiles along axis (or globally)
383
+ if (axis is None):
384
+ result = _cihs_1D(data, alpha)
385
+ else:
386
+ if data.ndim > 2:
387
+ raise ValueError("Array 'data' must be at most two dimensional, "
388
+ "but got data.ndim = %d" % data.ndim)
389
+ result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
390
+
391
+ return result
392
+
393
+
394
+ def compare_medians_ms(group_1, group_2, axis=None):
395
+ """
396
+ Compares the medians from two independent groups along the given axis.
397
+
398
+ The comparison is performed using the McKean-Schrader estimate of the
399
+ standard error of the medians.
400
+
401
+ Parameters
402
+ ----------
403
+ group_1 : array_like
404
+ First dataset. Has to be of size >=7.
405
+ group_2 : array_like
406
+ Second dataset. Has to be of size >=7.
407
+ axis : int, optional
408
+ Axis along which the medians are estimated. If None, the arrays are
409
+ flattened. If `axis` is not None, then `group_1` and `group_2`
410
+ should have the same shape.
411
+
412
+ Returns
413
+ -------
414
+ compare_medians_ms : {float, ndarray}
415
+ If `axis` is None, then returns a float, otherwise returns a 1-D
416
+ ndarray of floats with a length equal to the length of `group_1`
417
+ along `axis`.
418
+
419
+ Examples
420
+ --------
421
+
422
+ >>> from scipy import stats
423
+ >>> a = [1, 2, 3, 4, 5, 6, 7]
424
+ >>> b = [8, 9, 10, 11, 12, 13, 14]
425
+ >>> stats.mstats.compare_medians_ms(a, b, axis=None)
426
+ 1.0693225866553746e-05
427
+
428
+ The function is vectorized to compute along a given axis.
429
+
430
+ >>> import numpy as np
431
+ >>> rng = np.random.default_rng()
432
+ >>> x = rng.random(size=(3, 7))
433
+ >>> y = rng.random(size=(3, 8))
434
+ >>> stats.mstats.compare_medians_ms(x, y, axis=1)
435
+ array([0.36908985, 0.36092538, 0.2765313 ])
436
+
437
+ References
438
+ ----------
439
+ .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
440
+ for studentizing the sample median." Communications in
441
+ Statistics-Simulation and Computation 13.6 (1984): 751-773.
442
+
443
+ """
444
+ (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
445
+ (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
446
+ mstats.stde_median(group_2, axis=axis))
447
+ W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
448
+ return 1 - norm.cdf(W)
449
+
450
+
451
+ def idealfourths(data, axis=None):
452
+ """
453
+ Returns an estimate of the lower and upper quartiles.
454
+
455
+ Uses the ideal fourths algorithm.
456
+
457
+ Parameters
458
+ ----------
459
+ data : array_like
460
+ Input array.
461
+ axis : int, optional
462
+ Axis along which the quartiles are estimated. If None, the arrays are
463
+ flattened.
464
+
465
+ Returns
466
+ -------
467
+ idealfourths : {list of floats, masked array}
468
+ Returns the two internal values that divide `data` into four parts
469
+ using the ideal fourths algorithm either along the flattened array
470
+ (if `axis` is None) or along `axis` of `data`.
471
+
472
+ """
473
+ def _idf(data):
474
+ x = data.compressed()
475
+ n = len(x)
476
+ if n < 3:
477
+ return [np.nan,np.nan]
478
+ (j,h) = divmod(n/4. + 5/12.,1)
479
+ j = int(j)
480
+ qlo = (1-h)*x[j-1] + h*x[j]
481
+ k = n - j
482
+ qup = (1-h)*x[k] + h*x[k-1]
483
+ return [qlo, qup]
484
+ data = ma.sort(data, axis=axis).view(MaskedArray)
485
+ if (axis is None):
486
+ return _idf(data)
487
+ else:
488
+ return ma.apply_along_axis(_idf, axis, data)
489
+
490
+
491
+ def rsh(data, points=None):
492
+ """
493
+ Evaluates Rosenblatt's shifted histogram estimators for each data point.
494
+
495
+ Rosenblatt's estimator is a centered finite-difference approximation to the
496
+ derivative of the empirical cumulative distribution function.
497
+
498
+ Parameters
499
+ ----------
500
+ data : sequence
501
+ Input data, should be 1-D. Masked values are ignored.
502
+ points : sequence or None, optional
503
+ Sequence of points where to evaluate Rosenblatt shifted histogram.
504
+ If None, use the data.
505
+
506
+ """
507
+ data = ma.array(data, copy=False)
508
+ if points is None:
509
+ points = data
510
+ else:
511
+ points = np.atleast_1d(np.asarray(points))
512
+
513
+ if data.ndim != 1:
514
+ raise AttributeError("The input array should be 1D only !")
515
+
516
+ n = data.count()
517
+ r = idealfourths(data, axis=None)
518
+ h = 1.2 * (r[-1]-r[0]) / n**(1./5)
519
+ nhi = (data[:,None] <= points[None,:] + h).sum(0)
520
+ nlo = (data[:,None] < points[None,:] - h).sum(0)
521
+ return (nhi-nlo) / (2.*n*h)
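A hedged toy example (fixed bandwidth chosen for illustration only) of the centered-difference density estimate that `rsh` computes:

    import numpy as np

    rng = np.random.default_rng(0)
    data = np.sort(rng.normal(size=200))
    h = 0.4                                  # illustrative bandwidth
    points = np.linspace(-3, 3, 7)
    nhi = (data[:, None] <= points[None, :] + h).sum(0)
    nlo = (data[:, None] < points[None, :] - h).sum(0)
    print((nhi - nlo) / (2.0 * data.size * h))   # roughly the N(0, 1) density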
.venv/Lib/site-packages/scipy/stats/_multicomp.py ADDED
@@ -0,0 +1,459 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from dataclasses import dataclass, field
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+
9
+ from scipy import stats
10
+ from scipy.optimize import minimize_scalar
11
+ from scipy.stats._common import ConfidenceInterval
12
+ from scipy.stats._qmc import check_random_state
13
+ from scipy.stats._stats_py import _var
14
+
15
+ if TYPE_CHECKING:
16
+ import numpy.typing as npt
17
+ from scipy._lib._util import DecimalNumber, SeedType
18
+ from typing import Literal, Sequence # noqa: UP035
19
+
20
+
21
+ __all__ = [
22
+ 'dunnett'
23
+ ]
24
+
25
+
26
+ @dataclass
27
+ class DunnettResult:
28
+ """Result object returned by `scipy.stats.dunnett`.
29
+
30
+ Attributes
31
+ ----------
32
+ statistic : float ndarray
33
+ The computed statistic of the test for each comparison. The element
34
+ at index ``i`` is the statistic for the comparison between
35
+ groups ``i`` and the control.
36
+ pvalue : float ndarray
37
+ The computed p-value of the test for each comparison. The element
38
+ at index ``i`` is the p-value for the comparison between
39
+ group ``i`` and the control.
40
+ """
41
+ statistic: np.ndarray
42
+ pvalue: np.ndarray
43
+ _alternative: Literal['two-sided', 'less', 'greater'] = field(repr=False)
44
+ _rho: np.ndarray = field(repr=False)
45
+ _df: int = field(repr=False)
46
+ _std: float = field(repr=False)
47
+ _mean_samples: np.ndarray = field(repr=False)
48
+ _mean_control: np.ndarray = field(repr=False)
49
+ _n_samples: np.ndarray = field(repr=False)
50
+ _n_control: int = field(repr=False)
51
+ _rng: SeedType = field(repr=False)
52
+ _ci: ConfidenceInterval | None = field(default=None, repr=False)
53
+ _ci_cl: DecimalNumber | None = field(default=None, repr=False)
54
+
55
+ def __str__(self):
56
+ # Note: `__str__` prints the confidence intervals from the most
57
+ # recent call to `confidence_interval`. If it has not been called,
58
+ # it will be called with the default CL of .95.
59
+ if self._ci is None:
60
+ self.confidence_interval(confidence_level=.95)
61
+ s = (
62
+ "Dunnett's test"
63
+ f" ({self._ci_cl*100:.1f}% Confidence Interval)\n"
64
+ "Comparison Statistic p-value Lower CI Upper CI\n"
65
+ )
66
+ for i in range(self.pvalue.size):
67
+ s += (f" (Sample {i} - Control) {self.statistic[i]:>10.3f}"
68
+ f"{self.pvalue[i]:>10.3f}"
69
+ f"{self._ci.low[i]:>10.3f}"
70
+ f"{self._ci.high[i]:>10.3f}\n")
71
+
72
+ return s
73
+
74
+ def _allowance(
75
+ self, confidence_level: DecimalNumber = 0.95, tol: DecimalNumber = 1e-3
76
+ ) -> float:
77
+ """Allowance.
78
+
79
+ It is the quantity to add/subtract from the observed difference
80
+ between the means of observed groups and the mean of the control
81
+ group. The result gives confidence limits.
82
+
83
+ Parameters
84
+ ----------
85
+ confidence_level : float, optional
86
+ Confidence level for the computed confidence interval.
87
+ Default is .95.
88
+ tol : float, optional
89
+ A tolerance for numerical optimization: the allowance will produce
90
+ a confidence within ``10*tol*(1 - confidence_level)`` of the
91
+ specified level, or a warning will be emitted. Tight tolerances
92
+ may be impractical due to noisy evaluation of the objective.
93
+ Default is 1e-3.
94
+
95
+ Returns
96
+ -------
97
+ allowance : float
98
+ Allowance around the mean.
99
+ """
100
+ alpha = 1 - confidence_level
101
+
102
+ def pvalue_from_stat(statistic):
103
+ statistic = np.array(statistic)
104
+ sf = _pvalue_dunnett(
105
+ rho=self._rho, df=self._df,
106
+ statistic=statistic, alternative=self._alternative,
107
+ rng=self._rng
108
+ )
109
+ return abs(sf - alpha)/alpha
110
+
111
+ # Evaluation of `pvalue_from_stat` is noisy due to the use of RQMC to
112
+ # evaluate `multivariate_t.cdf`. `minimize_scalar` is not designed
113
+ # to tolerate a noisy objective function and may fail to find the
114
+ # minimum accurately. We mitigate this possibility with the validation
115
+ # step below, but implementation of a noise-tolerant root finder or
116
+ # minimizer would be a welcome enhancement. See gh-18150.
117
+ res = minimize_scalar(pvalue_from_stat, method='brent', tol=tol)
118
+ critical_value = res.x
119
+
120
+ # validation
121
+ # tol*10 because tol=1e-3 means we tolerate a 1% change at most
122
+ if res.success is False or res.fun >= tol*10:
123
+ warnings.warn(
124
+ "Computation of the confidence interval did not converge to "
125
+ "the desired level. The confidence level corresponding with "
126
+ f"the returned interval is approximately {alpha*(1+res.fun)}.",
127
+ stacklevel=3
128
+ )
129
+
130
+ # From [1] p. 1101 between (1) and (3)
131
+ allowance = critical_value*self._std*np.sqrt(
132
+ 1/self._n_samples + 1/self._n_control
133
+ )
134
+ return abs(allowance)
135
+
136
+ def confidence_interval(
137
+ self, confidence_level: DecimalNumber = 0.95
138
+ ) -> ConfidenceInterval:
139
+ """Compute the confidence interval for the specified confidence level.
140
+
141
+ Parameters
142
+ ----------
143
+ confidence_level : float, optional
144
+ Confidence level for the computed confidence interval.
145
+ Default is .95.
146
+
147
+ Returns
148
+ -------
149
+ ci : ``ConfidenceInterval`` object
150
+ The object has attributes ``low`` and ``high`` that hold the
151
+ lower and upper bounds of the confidence intervals for each
152
+ comparison. The high and low values are accessible for each
153
+ comparison at index ``i`` for each group ``i``.
154
+
155
+ """
156
+ # check to see if the supplied confidence level matches that of the
157
+ # previously computed CI.
158
+ if (self._ci is not None) and (confidence_level == self._ci_cl):
159
+ return self._ci
160
+
161
+ if not (0 < confidence_level < 1):
162
+ raise ValueError("Confidence level must be between 0 and 1.")
163
+
164
+ allowance = self._allowance(confidence_level=confidence_level)
165
+ diff_means = self._mean_samples - self._mean_control
166
+
167
+ low = diff_means-allowance
168
+ high = diff_means+allowance
169
+
170
+ if self._alternative == 'greater':
171
+ high = [np.inf] * len(diff_means)
172
+ elif self._alternative == 'less':
173
+ low = [-np.inf] * len(diff_means)
174
+
175
+ self._ci_cl = confidence_level
176
+ self._ci = ConfidenceInterval(
177
+ low=low,
178
+ high=high
179
+ )
180
+ return self._ci
181
+
182
+
183
+ def dunnett(
184
+ *samples: npt.ArrayLike, # noqa: D417
185
+ control: npt.ArrayLike,
186
+ alternative: Literal['two-sided', 'less', 'greater'] = "two-sided",
187
+ random_state: SeedType = None
188
+ ) -> DunnettResult:
189
+ """Dunnett's test: multiple comparisons of means against a control group.
190
+
191
+ This is an implementation of Dunnett's original, single-step test as
192
+ described in [1]_.
193
+
194
+ Parameters
195
+ ----------
196
+ sample1, sample2, ... : 1D array_like
197
+ The sample measurements for each experimental group.
198
+ control : 1D array_like
199
+ The sample measurements for the control group.
200
+ alternative : {'two-sided', 'less', 'greater'}, optional
201
+ Defines the alternative hypothesis.
202
+
203
+ The null hypothesis is that the means of the distributions underlying
204
+ the samples and control are equal. The following alternative
205
+ hypotheses are available (default is 'two-sided'):
206
+
207
+ * 'two-sided': the means of the distributions underlying the samples
208
+ and control are unequal.
209
+ * 'less': the means of the distributions underlying the samples
210
+ are less than the mean of the distribution underlying the control.
211
+ * 'greater': the means of the distributions underlying the
212
+ samples are greater than the mean of the distribution underlying
213
+ the control.
214
+ random_state : {None, int, `numpy.random.Generator`}, optional
215
+ If `random_state` is an int or None, a new `numpy.random.Generator` is
216
+ created using ``np.random.default_rng(random_state)``.
217
+ If `random_state` is already a ``Generator`` instance, then the
218
+ provided instance is used.
219
+
220
+ The random number generator is used to control the randomized
221
+ Quasi-Monte Carlo integration of the multivariate-t distribution.
222
+
223
+ Returns
224
+ -------
225
+ res : `~scipy.stats._result_classes.DunnettResult`
226
+ An object containing attributes:
227
+
228
+ statistic : float ndarray
229
+ The computed statistic of the test for each comparison. The element
230
+ at index ``i`` is the statistic for the comparison between
231
+ groups ``i`` and the control.
232
+ pvalue : float ndarray
233
+ The computed p-value of the test for each comparison. The element
234
+ at index ``i`` is the p-value for the comparison between
235
+ group ``i`` and the control.
236
+
237
+ And the following method:
238
+
239
+ confidence_interval(confidence_level=0.95) :
240
+ Compute the difference in means of the groups
241
+ with the control +- the allowance.
242
+
243
+ See Also
244
+ --------
245
+ tukey_hsd : performs pairwise comparison of means.
246
+
247
+ Notes
248
+ -----
249
+ Like the independent-sample t-test, Dunnett's test [1]_ is used to make
250
+ inferences about the means of distributions from which samples were drawn.
251
+ However, when multiple t-tests are performed at a fixed significance level,
252
+ the "family-wise error rate" - the probability of incorrectly rejecting the
253
+ null hypothesis in at least one test - will exceed the significance level.
254
+ Dunnett's test is designed to perform multiple comparisons while
255
+ controlling the family-wise error rate.
256
+
257
+ Dunnett's test compares the means of multiple experimental groups
258
+ against a single control group. Tukey's Honestly Significant Difference Test
259
+ is another multiple-comparison test that controls the family-wise error
260
+ rate, but `tukey_hsd` performs *all* pairwise comparisons between groups.
261
+ When pairwise comparisons between experimental groups are not needed,
262
+ Dunnett's test is preferable due to its higher power.
263
+
264
+
265
+ The use of this test relies on several assumptions.
266
+
267
+ 1. The observations are independent within and among groups.
268
+ 2. The observations within each group are normally distributed.
269
+ 3. The distributions from which the samples are drawn have the same finite
270
+ variance.
271
+
272
+ References
273
+ ----------
274
+ .. [1] Charles W. Dunnett. "A Multiple Comparison Procedure for Comparing
275
+ Several Treatments with a Control."
276
+ Journal of the American Statistical Association, 50:272, 1096-1121,
277
+ :doi:`10.1080/01621459.1955.10501294`, 1955.
278
+
279
+ Examples
280
+ --------
281
+ In [1]_, the influence of drugs on blood count measurements on three groups
282
+ of animals is investigated.
283
+
284
+ The following table summarizes the results of the experiment in which
285
+ two groups received different drugs, and one group acted as a control.
286
+ Blood counts (in millions of cells per cubic millimeter) were recorded::
287
+
288
+ >>> import numpy as np
289
+ >>> control = np.array([7.40, 8.50, 7.20, 8.24, 9.84, 8.32])
290
+ >>> drug_a = np.array([9.76, 8.80, 7.68, 9.36])
291
+ >>> drug_b = np.array([12.80, 9.68, 12.16, 9.20, 10.55])
292
+
293
+ We would like to see if the means between any of the groups are
294
+ significantly different. First, visually examine a box and whisker plot.
295
+
296
+ >>> import matplotlib.pyplot as plt
297
+ >>> fig, ax = plt.subplots(1, 1)
298
+ >>> ax.boxplot([control, drug_a, drug_b])
299
+ >>> ax.set_xticklabels(["Control", "Drug A", "Drug B"]) # doctest: +SKIP
300
+ >>> ax.set_ylabel("mean") # doctest: +SKIP
301
+ >>> plt.show()
302
+
303
+ Note the overlapping interquartile ranges of the drug A group and control
304
+ group and the apparent separation between the drug B group and control
305
+ group.
306
+
307
+ Next, we will use Dunnett's test to assess whether the difference
308
+ between group means is significant while controlling the family-wise error
309
+ rate: the probability of making any false discoveries.
310
+ Let the null hypothesis be that the experimental groups have the same
311
+ mean as the control and the alternative be that an experimental group does
312
+ not have the same mean as the control. We will consider a 5% family-wise
313
+ error rate to be acceptable, and therefore we choose 0.05 as the threshold
314
+ for significance.
315
+
316
+ >>> from scipy.stats import dunnett
317
+ >>> res = dunnett(drug_a, drug_b, control=control)
318
+ >>> res.pvalue
319
+ array([0.62004941, 0.0059035 ]) # may vary
320
+
321
+ The p-value corresponding with the comparison between group A and control
322
+ exceeds 0.05, so we do not reject the null hypothesis for that comparison.
323
+ However, the p-value corresponding with the comparison between group B
324
+ and control is less than 0.05, so we consider the experimental results
325
+ to be evidence against the null hypothesis in favor of the alternative:
326
+ group B has a different mean than the control group.
327
+
328
+ """
329
+ samples_, control_, rng = _iv_dunnett(
330
+ samples=samples, control=control,
331
+ alternative=alternative, random_state=random_state
332
+ )
333
+
334
+ rho, df, n_group, n_samples, n_control = _params_dunnett(
335
+ samples=samples_, control=control_
336
+ )
337
+
338
+ statistic, std, mean_control, mean_samples = _statistic_dunnett(
339
+ samples_, control_, df, n_samples, n_control
340
+ )
341
+
342
+ pvalue = _pvalue_dunnett(
343
+ rho=rho, df=df, statistic=statistic, alternative=alternative, rng=rng
344
+ )
345
+
346
+ return DunnettResult(
347
+ statistic=statistic, pvalue=pvalue,
348
+ _alternative=alternative,
349
+ _rho=rho, _df=df, _std=std,
350
+ _mean_samples=mean_samples,
351
+ _mean_control=mean_control,
352
+ _n_samples=n_samples,
353
+ _n_control=n_control,
354
+ _rng=rng
355
+ )
356
+
357
+
358
+ def _iv_dunnett(
359
+ samples: Sequence[npt.ArrayLike],
360
+ control: npt.ArrayLike,
361
+ alternative: Literal['two-sided', 'less', 'greater'],
362
+ random_state: SeedType
363
+ ) -> tuple[list[np.ndarray], np.ndarray, SeedType]:
364
+ """Input validation for Dunnett's test."""
365
+ rng = check_random_state(random_state)
366
+
367
+ if alternative not in {'two-sided', 'less', 'greater'}:
368
+ raise ValueError(
369
+ "alternative must be 'less', 'greater' or 'two-sided'"
370
+ )
371
+
372
+ ndim_msg = "Control and samples groups must be 1D arrays"
373
+ n_obs_msg = "Control and samples groups must have at least 1 observation"
374
+
375
+ control = np.asarray(control)
376
+ samples_ = [np.asarray(sample) for sample in samples]
377
+
378
+ # samples checks
379
+ samples_control: list[np.ndarray] = samples_ + [control]
380
+ for sample in samples_control:
381
+ if sample.ndim > 1:
382
+ raise ValueError(ndim_msg)
383
+
384
+ if sample.size < 1:
385
+ raise ValueError(n_obs_msg)
386
+
387
+ return samples_, control, rng
388
+
389
+
390
+ def _params_dunnett(
391
+ samples: list[np.ndarray], control: np.ndarray
392
+ ) -> tuple[np.ndarray, int, int, np.ndarray, int]:
393
+ """Specific parameters for Dunnett's test.
394
+
395
+ Degree of freedom is the number of observations minus the number of groups
396
+ including the control.
397
+ """
398
+ n_samples = np.array([sample.size for sample in samples])
399
+
400
+ # From [1] p. 1100 d.f. = (sum N)-(p+1)
401
+ n_sample = n_samples.sum()
402
+ n_control = control.size
403
+ n = n_sample + n_control
404
+ n_groups = len(samples)
405
+ df = n - n_groups - 1
406
+
407
+ # From [1] p. 1103 rho_ij = 1/sqrt((N0/Ni+1)(N0/Nj+1))
408
+ rho = n_control/n_samples + 1
409
+ rho = 1/np.sqrt(rho[:, None] * rho[None, :])
410
+ np.fill_diagonal(rho, 1)
411
+
412
+ return rho, df, n_groups, n_samples, n_control
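A hedged numerical sketch (group sizes borrowed from the docstring example) of the correlation matrix rho_ij = 1/sqrt((N0/Ni + 1)(N0/Nj + 1)) constructed above:

    import numpy as np

    n_control = 6
    n_samples = np.array([4, 5])
    r = n_control / n_samples + 1
    rho = 1 / np.sqrt(r[:, None] * r[None, :])
    np.fill_diagonal(rho, 1)
    print(rho)   # off-diagonal entries ~0.426, ones on the diagonal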
413
+
414
+
415
+ def _statistic_dunnett(
416
+ samples: list[np.ndarray], control: np.ndarray, df: int,
417
+ n_samples: np.ndarray, n_control: int
418
+ ) -> tuple[np.ndarray, float, np.ndarray, np.ndarray]:
419
+ """Statistic of Dunnett's test.
420
+
421
+ Computation based on the original single-step test from [1].
422
+ """
423
+ mean_control = np.mean(control)
424
+ mean_samples = np.array([np.mean(sample) for sample in samples])
425
+ all_samples = [control] + samples
426
+ all_means = np.concatenate([[mean_control], mean_samples])
427
+
428
+ # Variance estimate s^2 from [1] Eq. 1
429
+ s2 = np.sum([_var(sample, mean=mean)*sample.size
430
+ for sample, mean in zip(all_samples, all_means)]) / df
431
+ std = np.sqrt(s2)
432
+
433
+ # z score inferred from [1] unlabeled equation after Eq. 1
434
+ z = (mean_samples - mean_control) / np.sqrt(1/n_samples + 1/n_control)
435
+
436
+ return z / std, std, mean_control, mean_samples
437
+
438
+
439
+ def _pvalue_dunnett(
440
+ rho: np.ndarray, df: int, statistic: np.ndarray,
441
+ alternative: Literal['two-sided', 'less', 'greater'],
442
+ rng: SeedType = None
443
+ ) -> np.ndarray:
444
+ """pvalue from the multivariate t-distribution.
445
+
446
+ Critical values come from the multivariate student-t distribution.
447
+ """
448
+ statistic = statistic.reshape(-1, 1)
449
+
450
+ mvt = stats.multivariate_t(shape=rho, df=df, seed=rng)
451
+ if alternative == "two-sided":
452
+ statistic = abs(statistic)
453
+ pvalue = 1 - mvt.cdf(statistic, lower_limit=-statistic)
454
+ elif alternative == "greater":
455
+ pvalue = 1 - mvt.cdf(statistic, lower_limit=-np.inf)
456
+ else:
457
+ pvalue = 1 - mvt.cdf(np.inf, lower_limit=statistic)
458
+
459
+ return np.atleast_1d(pvalue)
.venv/Lib/site-packages/scipy/stats/_multivariate.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_mvn.cp39-win_amd64.dll.a ADDED
Binary file (1.5 kB). View file
 
.venv/Lib/site-packages/scipy/stats/_mvn.cp39-win_amd64.pyd ADDED
Binary file (106 kB). View file
 
.venv/Lib/site-packages/scipy/stats/_odds_ratio.py ADDED
@@ -0,0 +1,482 @@
1
+ import numpy as np
2
+
3
+ from scipy.special import ndtri
4
+ from scipy.optimize import brentq
5
+ from ._discrete_distns import nchypergeom_fisher
6
+ from ._common import ConfidenceInterval
7
+
8
+
9
+ def _sample_odds_ratio(table):
10
+ """
11
+ Given a table [[a, b], [c, d]], compute a*d/(b*c).
12
+
13
+ Return nan if the numerator and denominator are 0.
14
+ Return inf if just the denominator is 0.
15
+ """
16
+ # table must be a 2x2 numpy array.
17
+ if table[1, 0] > 0 and table[0, 1] > 0:
18
+ oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])
19
+ elif table[0, 0] == 0 or table[1, 1] == 0:
20
+ oddsratio = np.nan
21
+ else:
22
+ oddsratio = np.inf
23
+ return oddsratio
24
+
25
+
26
+ def _solve(func):
27
+ """
28
+ Solve func(nc) = 0. func must be an increasing function.
29
+ """
30
+ # We could just as well call the variable `x` instead of `nc`, but we
31
+ # always call this function with functions for which nc (the noncentrality
32
+ # parameter) is the variable for which we are solving.
33
+ nc = 1.0
34
+ value = func(nc)
35
+ if value == 0:
36
+ return nc
37
+
38
+ # Multiplicative factor by which to increase or decrease nc when
39
+ # searching for a bracketing interval.
40
+ factor = 2.0
41
+ # Find a bracketing interval.
42
+ if value > 0:
43
+ nc /= factor
44
+ while func(nc) > 0:
45
+ nc /= factor
46
+ lo = nc
47
+ hi = factor*nc
48
+ else:
49
+ nc *= factor
50
+ while func(nc) < 0:
51
+ nc *= factor
52
+ lo = nc/factor
53
+ hi = nc
54
+
55
+ # lo and hi bracket the solution for nc.
56
+ nc = brentq(func, lo, hi, xtol=1e-13)
57
+ return nc
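The search above walks the initial guess up or down by a constant factor until the sign of `func` flips, then hands the resulting bracket to `brentq`. Here is a toy sketch of the same pattern on an increasing function whose root is known; the function and starting point are illustrative only.

# Illustrative bracket-then-brentq search, mirroring _solve's strategy.
import numpy as np
from scipy.optimize import brentq

def f(x):
    # toy increasing function with its root at x = 3.5
    return np.log(x) - np.log(3.5)

x, factor = 1.0, 2.0
while f(x) < 0:          # grow x until the sign flips
    x *= factor
lo, hi = x / factor, x   # [lo, hi] now brackets the root
print(brentq(f, lo, hi, xtol=1e-13))  # ~3.5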
58
+
59
+
60
+ def _nc_hypergeom_mean_inverse(x, M, n, N):
61
+ """
62
+ For the given noncentral hypergeometric parameters x, M, n, and N
63
+ (table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2
64
+ contingency table), find the noncentrality parameter of Fisher's
65
+ noncentral hypergeometric distribution whose mean is x.
66
+ """
67
+ nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
68
+ return nc
69
+
70
+
71
+ def _hypergeom_params_from_table(table):
72
+ # The notation M, n and N is consistent with stats.hypergeom and
73
+ # stats.nchypergeom_fisher.
74
+ x = table[0, 0]
75
+ M = table.sum()
76
+ n = table[0].sum()
77
+ N = table[:, 0].sum()
78
+ return x, M, n, N
79
+
80
+
81
+ def _ci_upper(table, alpha):
82
+ """
83
+ Compute the upper end of the confidence interval.
84
+ """
85
+ if _sample_odds_ratio(table) == np.inf:
86
+ return np.inf
87
+
88
+ x, M, n, N = _hypergeom_params_from_table(table)
89
+
90
+ # nchypergeom_fisher.cdf is a decreasing function of nc, so we negate
91
+ # it in the lambda expression.
92
+ nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha)
93
+ return nc
94
+
95
+
96
+ def _ci_lower(table, alpha):
97
+ """
98
+ Compute the lower end of the confidence interval.
99
+ """
100
+ if _sample_odds_ratio(table) == 0:
101
+ return 0
102
+
103
+ x, M, n, N = _hypergeom_params_from_table(table)
104
+
105
+ nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
106
+ return nc
107
+
108
+
109
+ def _conditional_oddsratio(table):
110
+ """
111
+ Conditional MLE of the odds ratio for the 2x2 contingency table.
112
+ """
113
+ x, M, n, N = _hypergeom_params_from_table(table)
114
+ # Get the bounds of the support. The support of the noncentral
115
+ # hypergeometric distribution with parameters M, n, and N is the same
116
+ # for all values of the noncentrality parameter, so we can use 1 here.
117
+ lo, hi = nchypergeom_fisher.support(M, n, N, 1)
118
+
119
+ # Check if x is at one of the extremes of the support. If so, we know
120
+ # the odds ratio is either 0 or inf.
121
+ if x == lo:
122
+ # x is at the low end of the support.
123
+ return 0
124
+ if x == hi:
125
+ # x is at the high end of the support.
126
+ return np.inf
127
+
128
+ nc = _nc_hypergeom_mean_inverse(x, M, n, N)
129
+ return nc
130
+
131
+
132
+ def _conditional_oddsratio_ci(table, confidence_level=0.95,
133
+ alternative='two-sided'):
134
+ """
135
+ Conditional exact confidence interval for the odds ratio.
136
+ """
137
+ if alternative == 'two-sided':
138
+ alpha = 0.5*(1 - confidence_level)
139
+ lower = _ci_lower(table, alpha)
140
+ upper = _ci_upper(table, alpha)
141
+ elif alternative == 'less':
142
+ lower = 0.0
143
+ upper = _ci_upper(table, 1 - confidence_level)
144
+ else:
145
+ # alternative == 'greater'
146
+ lower = _ci_lower(table, 1 - confidence_level)
147
+ upper = np.inf
148
+
149
+ return lower, upper
150
+
151
+
152
+ def _sample_odds_ratio_ci(table, confidence_level=0.95,
153
+ alternative='two-sided'):
154
+ oddsratio = _sample_odds_ratio(table)
155
+ log_or = np.log(oddsratio)
156
+ se = np.sqrt((1/table).sum())
157
+ if alternative == 'less':
158
+ z = ndtri(confidence_level)
159
+ loglow = -np.inf
160
+ loghigh = log_or + z*se
161
+ elif alternative == 'greater':
162
+ z = ndtri(confidence_level)
163
+ loglow = log_or - z*se
164
+ loghigh = np.inf
165
+ else:
166
+ # alternative is 'two-sided'
167
+ z = ndtri(0.5*confidence_level + 0.5)
168
+ loglow = log_or - z*se
169
+ loghigh = log_or + z*se
170
+
171
+ return np.exp(loglow), np.exp(loghigh)
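The interval above exponentiates log(OR) plus or minus z*se, with se = sqrt(1/a + 1/b + 1/c + 1/d). A quick check of that arithmetic on the aspirin table used later in the `odds_ratio` docstring; the printed bounds are approximate and apply to the sample (not conditional) odds ratio.

# Normal-approximation 95% CI for the sample odds ratio of the aspirin table.
import numpy as np
from scipy.special import ndtri

table = np.array([[176, 230], [21035, 21018]])
log_or = np.log(table[0, 0] * table[1, 1] / (table[0, 1] * table[1, 0]))
se = np.sqrt((1 / table).sum())        # sqrt(1/a + 1/b + 1/c + 1/d)
z = ndtri(0.5 * 0.95 + 0.5)            # two-sided 95% quantile, about 1.96
low, high = np.exp(log_or - z * se), np.exp(log_or + z * se)
print(low, high)                       # roughly (0.63, 0.93)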
172
+
173
+
174
+ class OddsRatioResult:
175
+ """
176
+ Result of `scipy.stats.contingency.odds_ratio`. See the
177
+ docstring for `odds_ratio` for more details.
178
+
179
+ Attributes
180
+ ----------
181
+ statistic : float
182
+ The computed odds ratio.
183
+
184
+ * If `kind` is ``'sample'``, this is the sample (or unconditional)
185
+ estimate, given by
186
+ ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
187
+ * If `kind` is ``'conditional'``, this is the conditional
188
+ maximum likelihood estimate for the odds ratio. It is
189
+ the noncentrality parameter of Fisher's noncentral
190
+ hypergeometric distribution with the same hypergeometric
191
+ parameters as `table` and whose mean is ``table[0, 0]``.
192
+
193
+ Methods
194
+ -------
195
+ confidence_interval :
196
+ Confidence interval for the odds ratio.
197
+ """
198
+
199
+ def __init__(self, _table, _kind, statistic):
200
+ # for now, no need to make _table and _kind public, since this sort of
201
+ # information is returned in very few `scipy.stats` results
202
+ self._table = _table
203
+ self._kind = _kind
204
+ self.statistic = statistic
205
+
206
+ def __repr__(self):
207
+ return f"OddsRatioResult(statistic={self.statistic})"
208
+
209
+ def confidence_interval(self, confidence_level=0.95,
210
+ alternative='two-sided'):
211
+ """
212
+ Confidence interval for the odds ratio.
213
+
214
+ Parameters
215
+ ----------
216
+ confidence_level: float
217
+ Desired confidence level for the confidence interval.
218
+ The value must be given as a fraction between 0 and 1.
219
+ Default is 0.95 (meaning 95%).
220
+
221
+ alternative : {'two-sided', 'less', 'greater'}, optional
222
+ The alternative hypothesis of the hypothesis test to which the
223
+ confidence interval corresponds. That is, suppose the null
224
+ hypothesis is that the true odds ratio equals ``OR`` and the
225
+ confidence interval is ``(low, high)``. Then the following options
226
+ for `alternative` are available (default is 'two-sided'):
227
+
228
+ * 'two-sided': the true odds ratio is not equal to ``OR``. There
229
+ is evidence against the null hypothesis at the chosen
230
+ `confidence_level` if ``high < OR`` or ``low > OR``.
231
+ * 'less': the true odds ratio is less than ``OR``. The ``low`` end
232
+ of the confidence interval is 0, and there is evidence against
233
+ the null hypothesis at the chosen `confidence_level` if
234
+ ``high < OR``.
235
+ * 'greater': the true odds ratio is greater than ``OR``. The
236
+ ``high`` end of the confidence interval is ``np.inf``, and there
237
+ is evidence against the null hypothesis at the chosen
238
+ `confidence_level` if ``low > OR``.
239
+
240
+ Returns
241
+ -------
242
+ ci : ``ConfidenceInterval`` instance
243
+ The confidence interval, represented as an object with
244
+ attributes ``low`` and ``high``.
245
+
246
+ Notes
247
+ -----
248
+ When `kind` is ``'conditional'``, the limits of the confidence
249
+ interval are the conditional "exact confidence limits" as described
250
+ by Fisher [1]_. The conditional odds ratio and confidence interval are
251
+ also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.
252
+
253
+ When `kind` is ``'sample'``, the confidence interval is computed
254
+ under the assumption that the logarithm of the odds ratio is normally
255
+ distributed with standard error given by::
256
+
257
+ se = sqrt(1/a + 1/b + 1/c + 1/d)
258
+
259
+ where ``a``, ``b``, ``c`` and ``d`` are the elements of the
260
+ contingency table. (See, for example, [2]_, section 3.1.3.2,
261
+ or [3]_, section 2.3.3).
262
+
263
+ References
264
+ ----------
265
+ .. [1] R. A. Fisher (1935), The logic of inductive inference,
266
+ Journal of the Royal Statistical Society, Vol. 98, No. 1,
267
+ pp. 39-82.
268
+ .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
269
+ Methods, Techniques, and Applications, CRC Press LLC, Boca
270
+ Raton, Florida.
271
+ .. [3] Alan Agresti, An Introduction to Categorical Data Analysis
272
+ (second edition), Wiley, Hoboken, NJ, USA (2007).
273
+ """
274
+ if alternative not in ['two-sided', 'less', 'greater']:
275
+ raise ValueError("`alternative` must be 'two-sided', 'less' or "
276
+ "'greater'.")
277
+
278
+ if confidence_level < 0 or confidence_level > 1:
279
+ raise ValueError('confidence_level must be between 0 and 1')
280
+
281
+ if self._kind == 'conditional':
282
+ ci = self._conditional_odds_ratio_ci(confidence_level, alternative)
283
+ else:
284
+ ci = self._sample_odds_ratio_ci(confidence_level, alternative)
285
+ return ci
286
+
287
+ def _conditional_odds_ratio_ci(self, confidence_level=0.95,
288
+ alternative='two-sided'):
289
+ """
290
+ Confidence interval for the conditional odds ratio.
291
+ """
292
+
293
+ table = self._table
294
+ if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
295
+ # If both values in a row or column are zero, the p-value is 1,
296
+ # the odds ratio is NaN and the confidence interval is (0, inf).
297
+ ci = (0, np.inf)
298
+ else:
299
+ ci = _conditional_oddsratio_ci(table,
300
+ confidence_level=confidence_level,
301
+ alternative=alternative)
302
+ return ConfidenceInterval(low=ci[0], high=ci[1])
303
+
304
+ def _sample_odds_ratio_ci(self, confidence_level=0.95,
305
+ alternative='two-sided'):
306
+ """
307
+ Confidence interval for the sample odds ratio.
308
+ """
309
+ if confidence_level < 0 or confidence_level > 1:
310
+ raise ValueError('confidence_level must be between 0 and 1')
311
+
312
+ table = self._table
313
+ if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
314
+ # If both values in a row or column are zero, the p-value is 1,
315
+ # the odds ratio is NaN and the confidence interval is (0, inf).
316
+ ci = (0, np.inf)
317
+ else:
318
+ ci = _sample_odds_ratio_ci(table,
319
+ confidence_level=confidence_level,
320
+ alternative=alternative)
321
+ return ConfidenceInterval(low=ci[0], high=ci[1])
322
+
323
+
324
+ def odds_ratio(table, *, kind='conditional'):
325
+ r"""
326
+ Compute the odds ratio for a 2x2 contingency table.
327
+
328
+ Parameters
329
+ ----------
330
+ table : array_like of ints
331
+ A 2x2 contingency table. Elements must be non-negative integers.
332
+ kind : str, optional
333
+ Which kind of odds ratio to compute, either the sample
334
+ odds ratio (``kind='sample'``) or the conditional odds ratio
335
+ (``kind='conditional'``). Default is ``'conditional'``.
336
+
337
+ Returns
338
+ -------
339
+ result : `~scipy.stats._result_classes.OddsRatioResult` instance
340
+ The returned object has two computed attributes:
341
+
342
+ statistic : float
343
+ * If `kind` is ``'sample'``, this is the sample (or unconditional)
344
+ estimate, given by
345
+ ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
346
+ * If `kind` is ``'conditional'``, this is the conditional
347
+ maximum likelihood estimate for the odds ratio. It is
348
+ the noncentrality parameter of Fisher's noncentral
349
+ hypergeometric distribution with the same hypergeometric
350
+ parameters as `table` and whose mean is ``table[0, 0]``.
351
+
352
+ The object has the method `confidence_interval` that computes
353
+ the confidence interval of the odds ratio.
354
+
355
+ See Also
356
+ --------
357
+ scipy.stats.fisher_exact
358
+ relative_risk
359
+
360
+ Notes
361
+ -----
362
+ The conditional odds ratio was discussed by Fisher (see "Example 1"
363
+ of [1]_). Texts that cover the odds ratio include [2]_ and [3]_.
364
+
365
+ .. versionadded:: 1.10.0
366
+
367
+ References
368
+ ----------
369
+ .. [1] R. A. Fisher (1935), The logic of inductive inference,
370
+ Journal of the Royal Statistical Society, Vol. 98, No. 1,
371
+ pp. 39-82.
372
+ .. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.
373
+ Volume I - The analysis of case-control studies. IARC Sci Publ.
374
+ (32):5-338. PMID: 7216345. (See section 4.2.)
375
+ .. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
376
+ Methods, Techniques, and Applications, CRC Press LLC, Boca
377
+ Raton, Florida.
378
+ .. [4] Berger, Jeffrey S. et al. "Aspirin for the Primary Prevention of
379
+ Cardiovascular Events in Women and Men: A Sex-Specific
380
+ Meta-analysis of Randomized Controlled Trials."
381
+ JAMA, 295(3):306-313, :doi:`10.1001/jama.295.3.306`, 2006.
382
+
383
+ Examples
384
+ --------
385
+ In epidemiology, individuals are classified as "exposed" or
386
+ "unexposed" to some factor or treatment. If the occurrence of some
387
+ illness is under study, those who have the illness are often
388
+ classified as "cases", and those without it are "noncases". The
389
+ counts of the occurrences of these classes gives a contingency
390
+ table::
391
+
392
+ exposed unexposed
393
+ cases a b
394
+ noncases c d
395
+
396
+ The sample odds ratio may be written ``(a/c) / (b/d)``. ``a/c`` can
397
+ be interpreted as the odds of a case occurring in the exposed group,
398
+ and ``b/d`` as the odds of a case occurring in the unexposed group.
399
+ The sample odds ratio is the ratio of these odds. If the odds ratio
400
+ is greater than 1, it suggests that there is a positive association
401
+ between being exposed and being a case.
402
+
403
+ Interchanging the rows or columns of the contingency table inverts
404
+ the odds ratio, so it is important to understand the meaning of labels
405
+ given to the rows and columns of the table when interpreting the
406
+ odds ratio.
407
+
408
+ In [4]_, the use of aspirin to prevent cardiovascular events in women
409
+ and men was investigated. The study notably concluded:
410
+
411
+ ...aspirin therapy reduced the risk of a composite of
412
+ cardiovascular events due to its effect on reducing the risk of
413
+ ischemic stroke in women [...]
414
+
415
+ The article lists studies of various cardiovascular events. Let's
416
+ focus on the ischemic stroke in women.
417
+
418
+ The following table summarizes the results of the experiment in which
419
+ participants took aspirin or a placebo on a regular basis for several
420
+ years. Cases of ischemic stroke were recorded::
421
+
422
+ Aspirin Control/Placebo
423
+ Ischemic stroke 176 230
424
+ No stroke 21035 21018
425
+
426
+ The question we ask is "Is there evidence that the aspirin reduces the
427
+ risk of ischemic stroke?"
428
+
429
+ Compute the odds ratio:
430
+
431
+ >>> from scipy.stats.contingency import odds_ratio
432
+ >>> res = odds_ratio([[176, 230], [21035, 21018]])
433
+ >>> res.statistic
434
+ 0.7646037659999126
435
+
436
+ For this sample, the odds of getting an ischemic stroke for those who have
437
+ been taking aspirin are 0.76 times that of those
438
+ who have received the placebo.
439
+
440
+ To make statistical inferences about the population under study,
441
+ we can compute the 95% confidence interval for the odds ratio:
442
+
443
+ >>> res.confidence_interval(confidence_level=0.95)
444
+ ConfidenceInterval(low=0.6241234078749812, high=0.9354102892100372)
445
+
446
+ The 95% confidence interval for the conditional odds ratio is
447
+ approximately (0.62, 0.94).
448
+
449
+ The fact that the entire 95% confidence interval falls below 1 supports
450
+ the authors' conclusion that the aspirin was associated with a
451
+ statistically significant reduction in ischemic stroke.
452
+ """
453
+ if kind not in ['conditional', 'sample']:
454
+ raise ValueError("`kind` must be 'conditional' or 'sample'.")
455
+
456
+ c = np.asarray(table)
457
+
458
+ if c.shape != (2, 2):
459
+ raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "
460
+ "of shape (2, 2).")
461
+
462
+ if not np.issubdtype(c.dtype, np.integer):
463
+ raise ValueError("`table` must be an array of integers, but got "
464
+ f"type {c.dtype}")
465
+ c = c.astype(np.int64)
466
+
467
+ if np.any(c < 0):
468
+ raise ValueError("All values in `table` must be nonnegative.")
469
+
470
+ if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
471
+ # If both values in a row or column are zero, the p-value is NaN and
472
+ # the odds ratio is NaN.
473
+ result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)
474
+ return result
475
+
476
+ if kind == 'sample':
477
+ oddsratio = _sample_odds_ratio(c)
478
+ else: # kind is 'conditional'
479
+ oddsratio = _conditional_oddsratio(c)
480
+
481
+ result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio)
482
+ return result
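The docstring example above uses the default conditional estimate; the short usage note below contrasts it with ``kind='sample'`` on the same table. For a table this large the two estimates are expected to be close, so the printed values are indicative only.

# Contrast the conditional MLE with the sample estimate a*d/(b*c).
from scipy.stats.contingency import odds_ratio

table = [[176, 230], [21035, 21018]]
res_cond = odds_ratio(table)                  # conditional MLE (default)
res_samp = odds_ratio(table, kind='sample')   # sample odds ratio
print(res_cond.statistic, res_samp.statistic)
print(res_samp.confidence_interval(confidence_level=0.95))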
.venv/Lib/site-packages/scipy/stats/_page_trend_test.py ADDED
@@ -0,0 +1,479 @@
1
+ from itertools import permutations
2
+ import numpy as np
3
+ import math
4
+ from ._continuous_distns import norm
5
+ import scipy.stats
6
+ from dataclasses import dataclass
7
+
8
+
9
+ @dataclass
10
+ class PageTrendTestResult:
11
+ statistic: float
12
+ pvalue: float
13
+ method: str
14
+
15
+
16
+ def page_trend_test(data, ranked=False, predicted_ranks=None, method='auto'):
17
+ r"""
18
+ Perform Page's Test, a measure of trend in observations between treatments.
19
+
20
+ Page's Test (also known as Page's :math:`L` test) is useful when:
21
+
22
+ * there are :math:`n \geq 3` treatments,
23
+ * :math:`m \geq 2` subjects are observed for each treatment, and
24
+ * the observations are hypothesized to have a particular order.
25
+
26
+ Specifically, the test considers the null hypothesis that
27
+
28
+ .. math::
29
+
30
+ m_1 = m_2 = m_3 \cdots = m_n,
31
+
32
+ where :math:`m_j` is the mean of the observed quantity under treatment
33
+ :math:`j`, against the alternative hypothesis that
34
+
35
+ .. math::
36
+
37
+ m_1 \leq m_2 \leq m_3 \leq \cdots \leq m_n,
38
+
39
+ where at least one inequality is strict.
40
+
41
+ As noted by [4]_, Page's :math:`L` test has greater statistical power than
42
+ the Friedman test against the alternative that there is a difference in
43
+ trend, as Friedman's test only considers a difference in the means of the
44
+ observations without considering their order. Whereas Spearman :math:`\rho`
45
+ considers the correlation between the ranked observations of two variables
46
+ (e.g. the airspeed velocity of a swallow vs. the weight of the coconut it
47
+ carries), Page's :math:`L` is concerned with a trend in an observation
48
+ (e.g. the airspeed velocity of a swallow) across several distinct
49
+ treatments (e.g. carrying each of five coconuts of different weight) even
50
+ as the observation is repeated with multiple subjects (e.g. one European
51
+ swallow and one African swallow).
52
+
53
+ Parameters
54
+ ----------
55
+ data : array-like
56
+ A :math:`m \times n` array; the element in row :math:`i` and
57
+ column :math:`j` is the observation corresponding with subject
58
+ :math:`i` and treatment :math:`j`. By default, the columns are
59
+ assumed to be arranged in order of increasing predicted mean.
60
+
61
+ ranked : boolean, optional
62
+ By default, `data` is assumed to be observations rather than ranks;
63
+ it will be ranked with `scipy.stats.rankdata` along ``axis=1``. If
64
+ `data` is provided in the form of ranks, pass argument ``True``.
65
+
66
+ predicted_ranks : array-like, optional
67
+ The predicted ranks of the column means. If not specified,
68
+ the columns are assumed to be arranged in order of increasing
69
+ predicted mean, so the default `predicted_ranks` are
70
+ :math:`[1, 2, \dots, n-1, n]`.
71
+
72
+ method : {'auto', 'asymptotic', 'exact'}, optional
73
+ Selects the method used to calculate the *p*-value. The following
74
+ options are available.
75
+
76
+ * 'auto': selects between 'exact' and 'asymptotic' to
77
+ achieve reasonably accurate results in reasonable time (default)
78
+ * 'asymptotic': compares the standardized test statistic against
79
+ the normal distribution
80
+ * 'exact': computes the exact *p*-value by comparing the observed
81
+ :math:`L` statistic against those realized by all possible
82
+ permutations of ranks (under the null hypothesis that each
83
+ permutation is equally likely)
84
+
85
+ Returns
86
+ -------
87
+ res : PageTrendTestResult
88
+ An object containing attributes:
89
+
90
+ statistic : float
91
+ Page's :math:`L` test statistic.
92
+ pvalue : float
93
+ The associated *p*-value
94
+ method : {'asymptotic', 'exact'}
95
+ The method used to compute the *p*-value
96
+
97
+ See Also
98
+ --------
99
+ rankdata, friedmanchisquare, spearmanr
100
+
101
+ Notes
102
+ -----
103
+ As noted in [1]_, "the :math:`n` 'treatments' could just as well represent
104
+ :math:`n` objects or events or performances or persons or trials ranked."
105
+ Similarly, the :math:`m` 'subjects' could equally stand for :math:`m`
106
+ "groupings by ability or some other control variable, or judges doing
107
+ the ranking, or random replications of some other sort."
108
+
109
+ The procedure for calculating the :math:`L` statistic, adapted from
110
+ [1]_, is:
111
+
112
+ 1. "Predetermine with careful logic the appropriate hypotheses
113
+ concerning the predicted ordering of the experimental results.
114
+ If no reasonable basis for ordering any treatments is known, the
115
+ :math:`L` test is not appropriate."
116
+ 2. "As in other experiments, determine at what level of confidence
117
+ you will reject the null hypothesis that there is no agreement of
118
+ experimental results with the monotonic hypothesis."
119
+ 3. "Cast the experimental material into a two-way table of :math:`n`
120
+ columns (treatments, objects ranked, conditions) and :math:`m`
121
+ rows (subjects, replication groups, levels of control variables)."
122
+ 4. "When experimental observations are recorded, rank them across each
123
+ row", e.g. ``ranks = scipy.stats.rankdata(data, axis=1)``.
124
+ 5. "Add the ranks in each column", e.g.
125
+ ``colsums = np.sum(ranks, axis=0)``.
126
+ 6. "Multiply each sum of ranks by the predicted rank for that same
127
+ column", e.g. ``products = predicted_ranks * colsums``.
128
+ 7. "Sum all such products", e.g. ``L = products.sum()``.
129
+
130
+ [1]_ continues by suggesting use of the standardized statistic
131
+
132
+ .. math::
133
+
134
+ \chi_L^2 = \frac{\left[12L-3mn(n+1)^2\right]^2}{mn^2(n^2-1)(n+1)}
135
+
136
+ "which is distributed approximately as chi-square with 1 degree of
137
+ freedom. The ordinary use of :math:`\chi^2` tables would be
138
+ equivalent to a two-sided test of agreement. If a one-sided test
139
+ is desired, *as will almost always be the case*, the probability
140
+ discovered in the chi-square table should be *halved*."
141
+
142
+ However, this standardized statistic does not distinguish between the
143
+ observed values being well correlated with the predicted ranks and being
144
+ _anti_-correlated with the predicted ranks. Instead, we follow [2]_
145
+ and calculate the standardized statistic
146
+
147
+ .. math::
148
+
149
+ \Lambda = \frac{L - E_0}{\sqrt{V_0}},
150
+
151
+ where :math:`E_0 = \frac{1}{4} mn(n+1)^2` and
152
+ :math:`V_0 = \frac{1}{144} mn^2(n+1)(n^2-1)`, "which is asymptotically
153
+ normal under the null hypothesis".
154
+
155
+ The *p*-value for ``method='exact'`` is generated by comparing the observed
156
+ value of :math:`L` against the :math:`L` values generated for all
157
+ :math:`(n!)^m` possible permutations of ranks. The calculation is performed
158
+ using the recursive method of [5]_.
159
+
160
+ The *p*-values are not adjusted for the possibility of ties. When
161
+ ties are present, the reported ``'exact'`` *p*-values may be somewhat
162
+ larger (i.e. more conservative) than the true *p*-value [2]_. The
163
+ ``'asymptotic'`` *p*-values, however, tend to be smaller (i.e. less
164
+ conservative) than the ``'exact'`` *p*-values.
165
+
166
+ References
167
+ ----------
168
+ .. [1] Ellis Batten Page, "Ordered hypotheses for multiple treatments:
169
+ a significant test for linear ranks", *Journal of the American
170
+ Statistical Association* 58(301), p. 216--230, 1963.
171
+
172
+ .. [2] Markus Neuhauser, *Nonparametric Statistical Test: A computational
173
+ approach*, CRC Press, p. 150--152, 2012.
174
+
175
+ .. [3] Statext LLC, "Page's L Trend Test - Easy Statistics", *Statext -
176
+ Statistics Study*, https://www.statext.com/practice/PageTrendTest03.php,
177
+ Accessed July 12, 2020.
178
+
179
+ .. [4] "Page's Trend Test", *Wikipedia*, Wikimedia Foundation,
180
+ https://en.wikipedia.org/wiki/Page%27s_trend_test,
181
+ Accessed July 12, 2020.
182
+
183
+ .. [5] Robert E. Odeh, "The exact distribution of Page's L-statistic in
184
+ the two-way layout", *Communications in Statistics - Simulation and
185
+ Computation*, 6(1), p. 49--61, 1977.
186
+
187
+ Examples
188
+ --------
189
+ We use the example from [3]_: 10 students are asked to rate three
190
+ teaching methods - tutorial, lecture, and seminar - on a scale of 1-5,
191
+ with 1 being the lowest and 5 being the highest. We have decided that
192
+ a confidence level of 99% is required to reject the null hypothesis in
193
+ favor of our alternative: that the seminar will have the highest ratings
194
+ and the tutorial will have the lowest. Initially, the data have been
195
+ tabulated with each row representing an individual student's ratings of
196
+ the three methods in the following order: tutorial, lecture, seminar.
197
+
198
+ >>> table = [[3, 4, 3],
199
+ ... [2, 2, 4],
200
+ ... [3, 3, 5],
201
+ ... [1, 3, 2],
202
+ ... [2, 3, 2],
203
+ ... [2, 4, 5],
204
+ ... [1, 2, 4],
205
+ ... [3, 4, 4],
206
+ ... [2, 4, 5],
207
+ ... [1, 3, 4]]
208
+
209
+ Because the tutorial is hypothesized to have the lowest ratings, the
210
+ column corresponding with tutorial rankings should be first; the seminar
211
+ is hypothesized to have the highest ratings, so its column should be last.
212
+ Since the columns are already arranged in this order of increasing
213
+ predicted mean, we can pass the table directly into `page_trend_test`.
214
+
215
+ >>> from scipy.stats import page_trend_test
216
+ >>> res = page_trend_test(table)
217
+ >>> res
218
+ PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
219
+ method='exact')
220
+
221
+ This *p*-value indicates that there is a 0.1819% chance that
222
+ the :math:`L` statistic would reach such an extreme value under the null
223
+ hypothesis. Because 0.1819% is less than 1%, we have evidence to reject
224
+ the null hypothesis in favor of our alternative at a 99% confidence level.
225
+
226
+ The value of the :math:`L` statistic is 133.5. To check this manually,
227
+ we rank the data such that high scores correspond with high ranks, settling
228
+ ties with an average rank:
229
+
230
+ >>> from scipy.stats import rankdata
231
+ >>> ranks = rankdata(table, axis=1)
232
+ >>> ranks
233
+ array([[1.5, 3. , 1.5],
234
+ [1.5, 1.5, 3. ],
235
+ [1.5, 1.5, 3. ],
236
+ [1. , 3. , 2. ],
237
+ [1.5, 3. , 1.5],
238
+ [1. , 2. , 3. ],
239
+ [1. , 2. , 3. ],
240
+ [1. , 2.5, 2.5],
241
+ [1. , 2. , 3. ],
242
+ [1. , 2. , 3. ]])
243
+
244
+ We add the ranks within each column, multiply the sums by the
245
+ predicted ranks, and sum the products.
246
+
247
+ >>> import numpy as np
248
+ >>> m, n = ranks.shape
249
+ >>> predicted_ranks = np.arange(1, n+1)
250
+ >>> L = (predicted_ranks * np.sum(ranks, axis=0)).sum()
251
+ >>> res.statistic == L
252
+ True
253
+
254
+ As presented in [3]_, the asymptotic approximation of the *p*-value is the
255
+ survival function of the normal distribution evaluated at the standardized
256
+ test statistic:
257
+
258
+ >>> from scipy.stats import norm
259
+ >>> E0 = (m*n*(n+1)**2)/4
260
+ >>> V0 = (m*n**2*(n+1)*(n**2-1))/144
261
+ >>> Lambda = (L-E0)/np.sqrt(V0)
262
+ >>> p = norm.sf(Lambda)
263
+ >>> p
264
+ 0.0012693433690751756
265
+
266
+ This does not precisely match the *p*-value reported by `page_trend_test`
267
+ above. The asymptotic distribution is not very accurate, nor conservative,
268
+ for :math:`m \leq 12` and :math:`n \leq 8`, so `page_trend_test` chose to
269
+ use ``method='exact'`` based on the dimensions of the table and the
270
+ recommendations in Page's original paper [1]_. To override
271
+ `page_trend_test`'s choice, provide the `method` argument.
272
+
273
+ >>> res = page_trend_test(table, method="asymptotic")
274
+ >>> res
275
+ PageTrendTestResult(statistic=133.5, pvalue=0.0012693433690751756,
276
+ method='asymptotic')
277
+
278
+ If the data are already ranked, we can pass in the ``ranks`` instead of
279
+ the ``table`` to save computation time.
280
+
281
+ >>> res = page_trend_test(ranks, # ranks of data
282
+ ... ranked=True, # data is already ranked
283
+ ... )
284
+ >>> res
285
+ PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
286
+ method='exact')
287
+
288
+ Suppose the raw data had been tabulated in an order different from the
289
+ order of predicted means, say lecture, seminar, tutorial.
290
+
291
+ >>> table = np.asarray(table)[:, [1, 2, 0]]
292
+
293
+ Since the arrangement of this table is not consistent with the assumed
294
+ ordering, we can either rearrange the table or provide the
295
+ `predicted_ranks`. Remembering that the lecture is predicted
296
+ to have the middle rank, the seminar the highest, and tutorial the lowest,
297
+ we pass:
298
+
299
+ >>> res = page_trend_test(table, # data as originally tabulated
300
+ ... predicted_ranks=[2, 3, 1], # our predicted order
301
+ ... )
302
+ >>> res
303
+ PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
304
+ method='exact')
305
+
306
+ """
307
+
308
+ # Possible values of the method parameter and the corresponding function
309
+ # used to evaluate the p value
310
+ methods = {"asymptotic": _l_p_asymptotic,
311
+ "exact": _l_p_exact,
312
+ "auto": None}
313
+ if method not in methods:
314
+ raise ValueError(f"`method` must be in {set(methods)}")
315
+
316
+ ranks = np.asarray(data)
317
+ if ranks.ndim != 2: # TODO: relax this to accept 3d arrays?
318
+ raise ValueError("`data` must be a 2d array.")
319
+
320
+ m, n = ranks.shape
321
+ if m < 2 or n < 3:
322
+ raise ValueError("Page's L is only appropriate for data with two "
323
+ "or more rows and three or more columns.")
324
+
325
+ if np.any(np.isnan(data)):
326
+ raise ValueError("`data` contains NaNs, which cannot be ranked "
327
+ "meaningfully")
328
+
329
+ # ensure NumPy array and rank the data if it's not already ranked
330
+ if ranked:
331
+ # Only a basic check on whether data is ranked. Checking that the data
332
+ # is properly ranked could take as much time as ranking it.
333
+ if not (ranks.min() >= 1 and ranks.max() <= ranks.shape[1]):
334
+ raise ValueError("`data` is not properly ranked. Rank the data or "
335
+ "pass `ranked=False`.")
336
+ else:
337
+ ranks = scipy.stats.rankdata(data, axis=-1)
338
+
339
+ # generate predicted ranks if not provided, ensure valid NumPy array
340
+ if predicted_ranks is None:
341
+ predicted_ranks = np.arange(1, n+1)
342
+ else:
343
+ predicted_ranks = np.asarray(predicted_ranks)
344
+ if (predicted_ranks.ndim < 1 or
345
+ (set(predicted_ranks) != set(range(1, n+1)) or
346
+ len(predicted_ranks) != n)):
347
+ raise ValueError(f"`predicted_ranks` must include each integer "
348
+ f"from 1 to {n} (the number of columns in "
349
+ f"`data`) exactly once.")
350
+
351
+ if not isinstance(ranked, bool):
352
+ raise TypeError("`ranked` must be boolean.")
353
+
354
+ # Calculate the L statistic
355
+ L = _l_vectorized(ranks, predicted_ranks)
356
+
357
+ # Calculate the p-value
358
+ if method == "auto":
359
+ method = _choose_method(ranks)
360
+ p_fun = methods[method] # get the function corresponding with the method
361
+ p = p_fun(L, m, n)
362
+
363
+ page_result = PageTrendTestResult(statistic=L, pvalue=p, method=method)
364
+ return page_result
365
+
366
+
367
+ def _choose_method(ranks):
368
+ '''Choose method for computing p-value automatically'''
369
+ m, n = ranks.shape
370
+ if n > 8 or (m > 12 and n > 3) or m > 20: # as in [1], [4]
371
+ method = "asymptotic"
372
+ else:
373
+ method = "exact"
374
+ return method
375
+
376
+
377
+ def _l_vectorized(ranks, predicted_ranks):
378
+ '''Calculate Page's L statistic for each page of a 3d array'''
379
+ colsums = ranks.sum(axis=-2, keepdims=True)
380
+ products = predicted_ranks * colsums
381
+ Ls = products.sum(axis=-1)
382
+ Ls = Ls[0] if Ls.size == 1 else Ls.ravel()
383
+ return Ls
384
+
385
+
386
+ def _l_p_asymptotic(L, m, n):
387
+ '''Calculate the p-value of Page's L from the asymptotic distribution'''
388
+ # Using [1] as a reference, the asymptotic p-value would be calculated as:
389
+ # chi_L = (12*L - 3*m*n*(n+1)**2)**2/(m*n**2*(n**2-1)*(n+1))
390
+ # p = chi2.sf(chi_L, df=1, loc=0, scale=1)/2
391
+ # but this is insensitive to the direction of the hypothesized ranking
392
+
393
+ # See [2] page 151
394
+ E0 = (m*n*(n+1)**2)/4
395
+ V0 = (m*n**2*(n+1)*(n**2-1))/144
396
+ Lambda = (L-E0)/np.sqrt(V0)
397
+ # This is a one-sided "greater" test - calculate the probability that the
398
+ # L statistic under H0 would be greater than the observed L statistic
399
+ p = norm.sf(Lambda)
400
+ return p
401
+
402
+
403
+ def _l_p_exact(L, m, n):
404
+ '''Calculate the p-value of Page's L exactly'''
405
+ # [1] uses m, n; [5] uses n, k.
406
+ # Switch convention here because exact calculation code references [5].
407
+ L, n, k = int(L), int(m), int(n)
408
+ _pagel_state.set_k(k)
409
+ return _pagel_state.sf(L, n)
410
+
411
+
412
+ class _PageL:
413
+ '''Maintains state between `page_trend_test` executions'''
414
+
415
+ def __init__(self):
416
+ '''Lightweight initialization'''
417
+ self.all_pmfs = {}
418
+
419
+ def set_k(self, k):
420
+ '''Calculate lower and upper limits of L for single row'''
421
+ self.k = k
422
+ # See [5] top of page 52
423
+ self.a, self.b = (k*(k+1)*(k+2))//6, (k*(k+1)*(2*k+1))//6
424
+
425
+ def sf(self, l, n):
426
+ '''Survival function of Page's L statistic'''
427
+ ps = [self.pmf(l, n) for l in range(l, n*self.b + 1)]
428
+ return np.sum(ps)
429
+
430
+ def p_l_k_1(self):
431
+ '''Relative frequency of each L value over all possible single rows'''
432
+
433
+ # See [5] Equation (6)
434
+ ranks = range(1, self.k+1)
435
+ # generate all possible rows of length k
436
+ rank_perms = np.array(list(permutations(ranks)))
437
+ # compute Page's L for all possible rows
438
+ Ls = (ranks*rank_perms).sum(axis=1)
439
+ # count occurrences of each L value
440
+ counts = np.histogram(Ls, np.arange(self.a-0.5, self.b+1.5))[0]
441
+ # factorial(k) is number of possible permutations
442
+ return counts/math.factorial(self.k)
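`p_l_k_1` enumerates all k! orderings of a single row to obtain the exact single-row distribution of L. The same enumeration for k = 3 is written out directly below so the histogram trick is easier to follow; this is an illustration, not a replacement for the class method.

# Exact distribution of Page's L for one row of k = 3 treatments.
import math
from itertools import permutations
import numpy as np

k = 3
ranks = range(1, k + 1)
Ls = [sum(r * p for r, p in zip(ranks, perm)) for perm in permutations(ranks)]
a, b = k*(k+1)*(k+2)//6, k*(k+1)*(2*k+1)//6   # support of L is [a, b] = [10, 14]
counts = np.histogram(Ls, np.arange(a - 0.5, b + 1.5))[0]
print(dict(zip(range(a, b + 1), counts / math.factorial(k))))
# {10: 1/6, 11: 1/3, 12: 0, 13: 1/3, 14: 1/6}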
443
+
444
+ def pmf(self, l, n):
445
+ '''Recursive function to evaluate p(l, k, n); see [5] Equation 1'''
446
+
447
+ if n not in self.all_pmfs:
448
+ self.all_pmfs[n] = {}
449
+ if self.k not in self.all_pmfs[n]:
450
+ self.all_pmfs[n][self.k] = {}
451
+
452
+ # Cache results to avoid repeating calculation. Initially this was
453
+ # written with lru_cache, but this seems faster? Also, we could add
454
+ # an option to save this for future lookup.
455
+ if l in self.all_pmfs[n][self.k]:
456
+ return self.all_pmfs[n][self.k][l]
457
+
458
+ if n == 1:
459
+ ps = self.p_l_k_1() # [5] Equation 6
460
+ ls = range(self.a, self.b+1)
461
+ # not fast, but we'll only be here once
462
+ self.all_pmfs[n][self.k] = {l: p for l, p in zip(ls, ps)}
463
+ return self.all_pmfs[n][self.k][l]
464
+
465
+ p = 0
466
+ low = max(l-(n-1)*self.b, self.a) # [5] Equation 2
467
+ high = min(l-(n-1)*self.a, self.b)
468
+
469
+ # [5] Equation 1
470
+ for t in range(low, high+1):
471
+ p1 = self.pmf(l-t, n-1)
472
+ p2 = self.pmf(t, 1)
473
+ p += p1*p2
474
+ self.all_pmfs[n][self.k][l] = p
475
+ return p
476
+
477
+
478
+ # Maintain state for faster repeat calls to page_trend_test w/ method='exact'
479
+ _pagel_state = _PageL()
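Because `_pagel_state` is module-level, repeated exact-mode calls reuse the cached pmf values. The hedged sketch below queries the private helper directly to reproduce the exact p-value from the docstring example (L = 133.5 truncates to 133, with 10 rows and 3 treatments); the public entry point remains `scipy.stats.page_trend_test`.

# Illustration only: use the private exact-distribution helper directly.
from scipy.stats._page_trend_test import _PageL

state = _PageL()
state.set_k(3)            # three treatments per row
print(state.sf(133, 10))  # P(L >= 133) for 10 rows, about 0.0018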
.venv/Lib/site-packages/scipy/stats/_qmc.py ADDED
The diff for this file is too large to render. See raw diff
 
.venv/Lib/site-packages/scipy/stats/_qmc_cy.cp39-win_amd64.dll.a ADDED
Binary file (1.54 kB). View file
 
.venv/Lib/site-packages/scipy/stats/_qmc_cy.cp39-win_amd64.pyd ADDED
Binary file (409 kB). View file
 
.venv/Lib/site-packages/scipy/stats/_qmc_cy.pyi ADDED
@@ -0,0 +1,54 @@
1
+ import numpy as np
2
+ from scipy._lib._util import DecimalNumber, IntNumber
3
+
4
+
5
+ def _cy_wrapper_centered_discrepancy(
6
+ sample: np.ndarray,
7
+ iterative: bool,
8
+ workers: IntNumber,
9
+ ) -> float: ...
10
+
11
+
12
+ def _cy_wrapper_wrap_around_discrepancy(
13
+ sample: np.ndarray,
14
+ iterative: bool,
15
+ workers: IntNumber,
16
+ ) -> float: ...
17
+
18
+
19
+ def _cy_wrapper_mixture_discrepancy(
20
+ sample: np.ndarray,
21
+ iterative: bool,
22
+ workers: IntNumber,
23
+ ) -> float: ...
24
+
25
+
26
+ def _cy_wrapper_l2_star_discrepancy(
27
+ sample: np.ndarray,
28
+ iterative: bool,
29
+ workers: IntNumber,
30
+ ) -> float: ...
31
+
32
+
33
+ def _cy_wrapper_update_discrepancy(
34
+ x_new_view: np.ndarray,
35
+ sample_view: np.ndarray,
36
+ initial_disc: DecimalNumber,
37
+ ) -> float: ...
38
+
39
+
40
+ def _cy_van_der_corput(
41
+ n: IntNumber,
42
+ base: IntNumber,
43
+ start_index: IntNumber,
44
+ workers: IntNumber,
45
+ ) -> np.ndarray: ...
46
+
47
+
48
+ def _cy_van_der_corput_scrambled(
49
+ n: IntNumber,
50
+ base: IntNumber,
51
+ start_index: IntNumber,
52
+ permutations: np.ndarray,
53
+ workers: IntNumber,
54
+ ) -> np.ndarray: ...
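These stubs annotate the Cython routines behind the public QMC discrepancy helpers. A brief usage sketch through the public `scipy.stats.qmc.discrepancy` front end follows, where the `method` argument selects among the wrapped discrepancy measures.

# Public-API usage; `method` picks the underlying discrepancy routine.
from scipy.stats import qmc

sample = qmc.Sobol(d=2, seed=1234).random(64)   # 64 points in the unit square
for method in ("CD", "WD", "MD", "L2-star"):
    print(method, qmc.discrepancy(sample, method=method))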
.venv/Lib/site-packages/scipy/stats/_qmvnt.py ADDED
@@ -0,0 +1,533 @@
1
+ # Integration of multivariate normal and t distributions.
2
+
3
+ # Adapted from the MATLAB original implementations by Dr. Alan Genz.
4
+
5
+ # http://www.math.wsu.edu/faculty/genz/software/software.html
6
+
7
+ # Copyright (C) 2013, Alan Genz, All rights reserved.
8
+ # Python implementation is copyright (C) 2022, Robert Kern, All rights
9
+ # reserved.
10
+
11
+ # Redistribution and use in source and binary forms, with or without
12
+ # modification, are permitted provided the following conditions are met:
13
+ # 1. Redistributions of source code must retain the above copyright
14
+ # notice, this list of conditions and the following disclaimer.
15
+ # 2. Redistributions in binary form must reproduce the above copyright
16
+ # notice, this list of conditions and the following disclaimer in
17
+ # the documentation and/or other materials provided with the
18
+ # distribution.
19
+ # 3. The contributor name(s) may not be used to endorse or promote
20
+ # products derived from this software without specific prior
21
+ # written permission.
22
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25
+ # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26
+ # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27
+ # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28
+ # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
29
+ # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
30
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
31
+ # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE
32
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
+
34
+
35
+ import numpy as np
36
+
37
+ from scipy.fft import fft, ifft
38
+ from scipy.special import gammaincinv, ndtr, ndtri
39
+ from scipy.stats._qmc import primes_from_2_to
40
+
41
+
42
+ phi = ndtr
43
+ phinv = ndtri
44
+
45
+
46
+ def _factorize_int(n):
47
+ """Return a sorted list of the unique prime factors of a positive integer.
48
+ """
49
+ # NOTE: There are lots of faster ways to do this, but this isn't terrible.
50
+ factors = set()
51
+ for p in primes_from_2_to(int(np.sqrt(n)) + 1):
52
+ while not (n % p):
53
+ factors.add(p)
54
+ n //= p
55
+ if n == 1:
56
+ break
57
+ if n != 1:
58
+ factors.add(n)
59
+ return sorted(factors)
60
+
61
+
62
+ def _primitive_root(p):
63
+ """Compute a primitive root of the prime number `p`.
64
+
65
+ Used in the CBC lattice construction.
66
+
67
+ References
68
+ ----------
69
+ .. [1] https://en.wikipedia.org/wiki/Primitive_root_modulo_n
70
+ """
71
+ # p is prime
72
+ pm = p - 1
73
+ factors = _factorize_int(pm)
74
+ n = len(factors)
75
+ r = 2
76
+ k = 0
77
+ while k < n:
78
+ d = pm // factors[k]
79
+ # pow() doesn't like numpy scalar types.
80
+ rd = pow(int(r), int(d), int(p))
81
+ if rd == 1:
82
+ r += 1
83
+ k = 0
84
+ else:
85
+ k += 1
86
+ return r
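A primitive root r of a prime p generates every nonzero residue mod p, which is exactly the property the loop above verifies through the prime factors of p - 1. Below is a small check for p = 13 using the private helpers (illustration only).

# Sanity check of the primitive-root search for a small prime.
from scipy.stats._qmvnt import _factorize_int, _primitive_root

p = 13
r = _primitive_root(p)
print(_factorize_int(p - 1))   # prime factors of 12: [2, 3]
print(r, {pow(int(r), e, p) for e in range(1, p)} == set(range(1, p)))  # True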
87
+
88
+
89
+ def _cbc_lattice(n_dim, n_qmc_samples):
90
+ """Compute a QMC lattice generator using a Fast CBC construction.
91
+
92
+ Parameters
93
+ ----------
94
+ n_dim : int > 0
95
+ The number of dimensions for the lattice.
96
+ n_qmc_samples : int > 0
97
+ The desired number of QMC samples. This will be rounded down to the
98
+ nearest prime to enable the CBC construction.
99
+
100
+ Returns
101
+ -------
102
+ q : float array : shape=(n_dim,)
103
+ The lattice generator vector. All values are in the open interval
104
+ `(0, 1)`.
105
+ actual_n_qmc_samples : int
106
+ The prime number of QMC samples that must be used with this lattice,
107
+ no more, no less.
108
+
109
+ References
110
+ ----------
111
+ .. [1] Nuyens, D. and Cools, R. "Fast Component-by-Component Construction,
112
+ a Reprise for Different Kernels", In H. Niederreiter and D. Talay,
113
+ editors, Monte-Carlo and Quasi-Monte Carlo Methods 2004,
114
+ Springer-Verlag, 2006, 371-385.
115
+ """
116
+ # Round down to the nearest prime number.
117
+ primes = primes_from_2_to(n_qmc_samples + 1)
118
+ n_qmc_samples = primes[-1]
119
+
120
+ bt = np.ones(n_dim)
121
+ gm = np.hstack([1.0, 0.8 ** np.arange(n_dim - 1)])
122
+ q = 1
123
+ w = 0
124
+ z = np.arange(1, n_dim + 1)
125
+ m = (n_qmc_samples - 1) // 2
126
+ g = _primitive_root(n_qmc_samples)
127
+ # Slightly faster way to compute perm[j] = pow(g, j, n_qmc_samples)
128
+ # Shame that we don't have modulo pow() implemented as a ufunc.
129
+ perm = np.ones(m, dtype=int)
130
+ for j in range(m - 1):
131
+ perm[j + 1] = (g * perm[j]) % n_qmc_samples
132
+ perm = np.minimum(n_qmc_samples - perm, perm)
133
+ pn = perm / n_qmc_samples
134
+ c = pn * pn - pn + 1.0 / 6
135
+ fc = fft(c)
136
+ for s in range(1, n_dim):
137
+ reordered = np.hstack([
138
+ c[:w+1][::-1],
139
+ c[w+1:m][::-1],
140
+ ])
141
+ q = q * (bt[s-1] + gm[s-1] * reordered)
142
+ w = ifft(fc * fft(q)).real.argmin()
143
+ z[s] = perm[w]
144
+ q = z / n_qmc_samples
145
+ return q, n_qmc_samples
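The generator vector `q` defines a rank-1 lattice whose points are the fractional parts of i*q; downstream, `_qmvn` and `_qmvt` add a random shift per batch before use. The hedged sketch below materializes such a shifted point set from the private helper; the dimension and sample count are arbitrary.

# Build a randomly shifted rank-1 lattice from the CBC generator vector.
import numpy as np
from scipy.stats._qmvnt import _cbc_lattice

rng = np.random.default_rng(1234)
q, n = _cbc_lattice(3, 101)               # n is rounded down to a prime (101)
i = np.arange(1, n + 1)[:, np.newaxis]
points = (i * q + rng.random(3)) % 1.0    # one random shift per dimension
print(points.shape)                       # (101, 3), all entries in [0, 1)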
146
+
147
+
148
+ # Note: this function is not currently used or tested by any SciPy code. It is
149
+ # included in this file to facilitate the development of a parameter for users
150
+ # to set the desired CDF accuracy, but must be reviewed and tested before use.
151
+ def _qauto(func, covar, low, high, rng, error=1e-3, limit=10_000, **kwds):
152
+ """Automatically rerun the integration to get the required error bound.
153
+
154
+ Parameters
155
+ ----------
156
+ func : callable
157
+ Either :func:`_qmvn` or :func:`_qmvt`.
158
+ covar, low, high : array
159
+ As specified in :func:`_qmvn` and :func:`_qmvt`.
160
+ rng : Generator, optional
161
+ A NumPy random ``Generator`` passed through to `func` for its randomized QMC batches.
162
+ error : float > 0
163
+ The desired error bound.
164
+ limit : int > 0
165
+ The rough limit of the number of integration points to consider. The
166
+ integration will stop looping once this limit has been *exceeded*.
167
+ **kwds :
168
+ Other keyword arguments to pass to `func`. When using :func:`_qmvt`, be
169
+ sure to include ``nu=`` as one of these.
170
+
171
+ Returns
172
+ -------
173
+ prob : float
174
+ The estimated probability mass within the bounds.
175
+ est_error : float
176
+ 3 times the standard error of the batch estimates.
177
+ n_samples : int
178
+ The number of integration points actually used.
179
+ """
180
+ n = len(covar)
181
+ n_samples = 0
182
+ if n == 1:
183
+ prob = phi(high) - phi(low)
184
+ # More or less
185
+ est_error = 1e-15
186
+ else:
187
+ mi = min(limit, n * 1000)
188
+ prob = 0.0
189
+ est_error = 1.0
190
+ ei = 0.0
191
+ while est_error > error and n_samples < limit:
192
+ mi = round(np.sqrt(2) * mi)
193
+ pi, ei, ni = func(mi, covar, low, high, rng=rng, **kwds)
194
+ n_samples += ni
195
+ wt = 1.0 / (1 + (ei / est_error)**2)
196
+ prob += wt * (pi - prob)
197
+ est_error = np.sqrt(wt) * ei
198
+ return prob, est_error, n_samples
199
+
200
+
201
+ # Note: this function is not currently used or tested by any SciPy code. It is
202
+ # included in this file to facilitate the resolution of gh-8367, gh-16142, and
203
+ # possibly gh-14286, but must be reviewed and tested before use.
204
+ def _qmvn(m, covar, low, high, rng, lattice='cbc', n_batches=10):
205
+ """Multivariate normal integration over box bounds.
206
+
207
+ Parameters
208
+ ----------
209
+ m : int > n_batches
210
+ The number of points to sample. This number will be divided into
211
+ `n_batches` batches that apply random offsets of the sampling lattice
212
+ for each batch in order to estimate the error.
213
+ covar : (n, n) float array
214
+ Possibly singular, positive semidefinite symmetric covariance matrix.
215
+ low, high : (n,) float array
216
+ The low and high integration bounds.
217
+ rng : Generator, optional
218
+ A NumPy random ``Generator`` used to draw the random shifts applied to the QMC lattice.
219
+ lattice : 'cbc' or callable
220
+ The type of lattice rule to use to construct the integration points.
221
+ n_batches : int > 0, optional
222
+ The number of QMC batches to apply.
223
+
224
+ Returns
225
+ -------
226
+ prob : float
227
+ The estimated probability mass within the bounds.
228
+ est_error : float
229
+ 3 times the standard error of the batch estimates.
230
+ """
231
+ cho, lo, hi = _permuted_cholesky(covar, low, high)
232
+ n = cho.shape[0]
233
+ ct = cho[0, 0]
234
+ c = phi(lo[0] / ct)
235
+ d = phi(hi[0] / ct)
236
+ ci = c
237
+ dci = d - ci
238
+ prob = 0.0
239
+ error_var = 0.0
240
+ q, n_qmc_samples = _cbc_lattice(n - 1, max(m // n_batches, 1))
241
+ y = np.zeros((n - 1, n_qmc_samples))
242
+ i_samples = np.arange(n_qmc_samples) + 1
243
+ for j in range(n_batches):
244
+ c = np.full(n_qmc_samples, ci)
245
+ dc = np.full(n_qmc_samples, dci)
246
+ pv = dc.copy()
247
+ for i in range(1, n):
248
+ # Pseudorandomly-shifted lattice coordinate.
249
+ z = q[i - 1] * i_samples + rng.random()
250
+ # Fast remainder(z, 1.0)
251
+ z -= z.astype(int)
252
+ # Tent periodization transform.
253
+ x = abs(2 * z - 1)
254
+ y[i - 1, :] = phinv(c + x * dc)
255
+ s = cho[i, :i] @ y[:i, :]
256
+ ct = cho[i, i]
257
+ c = phi((lo[i] - s) / ct)
258
+ d = phi((hi[i] - s) / ct)
259
+ dc = d - c
260
+ pv = pv * dc
261
+ # Accumulate the mean and error variances with online formulations.
262
+ d = (pv.mean() - prob) / (j + 1)
263
+ prob += d
264
+ error_var = (j - 1) * error_var / (j + 1) + d * d
265
+ # Error bounds are 3 times the standard error of the estimates.
266
+ est_error = 3 * np.sqrt(error_var)
267
+ n_samples = n_qmc_samples * n_batches
268
+ return prob, est_error, n_samples
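`_qmvn` is marked as not yet used or tested by SciPy, but its estimate can still be sanity-checked against the public `multivariate_normal.cdf` on a small semi-infinite box; both numbers are Monte Carlo estimates, so agreement is only expected to within the reported error.

# Hedged comparison of the private _qmvn estimate with the public CDF.
import numpy as np
from scipy.stats import multivariate_normal
from scipy.stats._qmvnt import _qmvn

cov = np.array([[1.0, 0.6], [0.6, 1.0]])
low = np.array([-np.inf, -np.inf])
high = np.array([0.5, 1.0])
rng = np.random.default_rng(1234)

prob, err, n_used = _qmvn(4000, cov, low, high, rng=rng)
ref = multivariate_normal(cov=cov).cdf(high)
print(prob, ref, err)   # the two probabilities should agree to roughly err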
269
+
270
+
271
+ # Note: this function is not currently used or tested by any SciPy code. It is
272
+ # included in this file to facilitate the resolution of gh-8367, gh-16142, and
273
+ # possibly gh-14286, but must be reviewed and tested before use.
274
+ def _mvn_qmc_integrand(covar, low, high, use_tent=False):
275
+ """Transform the multivariate normal integration into a QMC integrand over
276
+ a unit hypercube.
277
+
278
+ The dimensionality of the resulting hypercube integration domain is one
279
+ less than the dimensionality of the original integrand. Note that this
280
+ transformation subsumes the integration bounds in order to account for
281
+ infinite bounds. The QMC integration one does with the returned integrand
282
+ should be on the unit hypercube.
283
+
284
+ Parameters
285
+ ----------
286
+ covar : (n, n) float array
287
+ Possibly singular, positive semidefinite symmetric covariance matrix.
288
+ low, high : (n,) float array
289
+ The low and high integration bounds.
290
+ use_tent : bool, optional
291
+ If True, then use tent periodization. Only helpful for lattice rules.
292
+
293
+ Returns
294
+ -------
295
+ integrand : Callable[[NDArray], NDArray]
296
+ The QMC-integrable integrand. It takes an
297
+ ``(n_qmc_samples, ndim_integrand)`` array of QMC samples in the unit
298
+ hypercube and returns the ``(n_qmc_samples,)`` evaluations of the integrand at these
299
+ QMC points.
300
+ ndim_integrand : int
301
+ The dimensionality of the integrand. Equal to ``n-1``.
302
+ """
303
+ cho, lo, hi = _permuted_cholesky(covar, low, high)
304
+ n = cho.shape[0]
305
+ ndim_integrand = n - 1
306
+ ct = cho[0, 0]
307
+ c = phi(lo[0] / ct)
308
+ d = phi(hi[0] / ct)
309
+ ci = c
310
+ dci = d - ci
311
+
312
+ def integrand(*zs):
313
+ ndim_qmc = len(zs)
314
+ n_qmc_samples = len(np.atleast_1d(zs[0]))
315
+ assert ndim_qmc == ndim_integrand
316
+ y = np.zeros((ndim_qmc, n_qmc_samples))
317
+ c = np.full(n_qmc_samples, ci)
318
+ dc = np.full(n_qmc_samples, dci)
319
+ pv = dc.copy()
320
+ for i in range(1, n):
321
+ if use_tent:
322
+ # Tent periodization transform.
323
+ x = abs(2 * zs[i-1] - 1)
324
+ else:
325
+ x = zs[i-1]
326
+ y[i - 1, :] = phinv(c + x * dc)
327
+ s = cho[i, :i] @ y[:i, :]
328
+ ct = cho[i, i]
329
+ c = phi((lo[i] - s) / ct)
330
+ d = phi((hi[i] - s) / ct)
331
+ dc = d - c
332
+ pv = pv * dc
333
+ return pv
334
+
335
+ return integrand, ndim_integrand
336
+
337
+
338
+ def _qmvt(m, nu, covar, low, high, rng, lattice='cbc', n_batches=10):
339
+ """Multivariate t integration over box bounds.
340
+
341
+ Parameters
342
+ ----------
343
+ m : int > n_batches
344
+ The number of points to sample. This number will be divided into
345
+ `n_batches` batches that apply random offsets of the sampling lattice
346
+ for each batch in order to estimate the error.
347
+ nu : float >= 0
348
+ The shape parameter of the multivariate t distribution.
349
+ covar : (n, n) float array
350
+ Possibly singular, positive semidefinite symmetric covariance matrix.
351
+ low, high : (n,) float array
352
+ The low and high integration bounds.
353
+ rng : Generator, optional
354
+ A NumPy random ``Generator`` used to draw the random shifts applied to the QMC lattice.
355
+ lattice : 'cbc' or callable
356
+ The type of lattice rule to use to construct the integration points.
357
+ n_batches : int > 0, optional
358
+ The number of QMC batches to apply.
359
+
360
+ Returns
361
+ -------
362
+ prob : float
363
+ The estimated probability mass within the bounds.
364
+ est_error : float
365
+ 3 times the standard error of the batch estimates.
366
+ n_samples : int
367
+ The number of samples actually used.
368
+ """
369
+ sn = max(1.0, np.sqrt(nu))
370
+ low = np.asarray(low, dtype=np.float64)
371
+ high = np.asarray(high, dtype=np.float64)
372
+ cho, lo, hi = _permuted_cholesky(covar, low / sn, high / sn)
373
+ n = cho.shape[0]
374
+ prob = 0.0
375
+ error_var = 0.0
376
+ q, n_qmc_samples = _cbc_lattice(n, max(m // n_batches, 1))
377
+ i_samples = np.arange(n_qmc_samples) + 1
378
+ for j in range(n_batches):
379
+ pv = np.ones(n_qmc_samples)
380
+ s = np.zeros((n, n_qmc_samples))
381
+ for i in range(n):
382
+ # Pseudorandomly-shifted lattice coordinate.
383
+ z = q[i] * i_samples + rng.random()
384
+ # Fast remainder(z, 1.0)
385
+ z -= z.astype(int)
386
+ # Tent periodization transform.
387
+ x = abs(2 * z - 1)
388
+ # FIXME: Lift the i==0 case out of the loop to make the logic
389
+ # easier to follow.
390
+ if i == 0:
391
+ # We'll use one of the QR variates to pull out the
392
+ # t-distribution scaling.
393
+ if nu > 0:
394
+ r = np.sqrt(2 * gammaincinv(nu / 2, x))
395
+ else:
396
+ r = np.ones_like(x)
397
+ else:
398
+ y = phinv(c + x * dc) # noqa: F821
399
+ with np.errstate(invalid='ignore'):
400
+ s[i:, :] += cho[i:, i - 1][:, np.newaxis] * y
401
+ si = s[i, :]
402
+
403
+ c = np.ones(n_qmc_samples)
404
+ d = np.ones(n_qmc_samples)
405
+ with np.errstate(invalid='ignore'):
406
+ lois = lo[i] * r - si
407
+ hiis = hi[i] * r - si
408
+ c[lois < -9] = 0.0
409
+ d[hiis < -9] = 0.0
410
+ lo_mask = abs(lois) < 9
411
+ hi_mask = abs(hiis) < 9
412
+ c[lo_mask] = phi(lois[lo_mask])
413
+ d[hi_mask] = phi(hiis[hi_mask])
414
+
415
+ dc = d - c
416
+ pv *= dc
417
+
418
+ # Accumulate the mean and error variances with online formulations.
419
+ d = (pv.mean() - prob) / (j + 1)
420
+ prob += d
421
+ error_var = (j - 1) * error_var / (j + 1) + d * d
422
+ # Error bounds are 3 times the standard error of the estimates.
423
+ est_error = 3 * np.sqrt(error_var)
424
+ n_samples = n_qmc_samples * n_batches
425
+ return prob, est_error, n_samples
426
+
427
+
+ def _permuted_cholesky(covar, low, high, tol=1e-10):
+     """Compute a scaled, permuted Cholesky factor, with integration bounds.
+
+     The scaling and permuting of the dimensions accomplishes part of the
+     transformation of the original integration problem into a more numerically
+     tractable form. The lower-triangular Cholesky factor will then be used in
+     the subsequent integration. The integration bounds will be scaled and
+     permuted as well.
+
+     Parameters
+     ----------
+     covar : (n, n) float array
+         Possibly singular, positive semidefinite symmetric covariance matrix.
+     low, high : (n,) float array
+         The low and high integration bounds.
+     tol : float, optional
+         The singularity tolerance.
+
+     Returns
+     -------
+     cho : (n, n) float array
+         Lower Cholesky factor, scaled and permuted.
+     new_low, new_high : (n,) float array
+         The scaled and permuted low and high integration bounds.
+     """
+     # Make copies for outputting.
+     cho = np.array(covar, dtype=np.float64)
+     new_lo = np.array(low, dtype=np.float64)
+     new_hi = np.array(high, dtype=np.float64)
+     n = cho.shape[0]
+     if cho.shape != (n, n):
+         raise ValueError("expected a square symmetric array")
+     if new_lo.shape != (n,) or new_hi.shape != (n,):
+         raise ValueError(
+             "expected integration boundaries the same dimensions "
+             "as the covariance matrix"
+         )
+     # Scale by the sqrt of the diagonal.
+     dc = np.sqrt(np.maximum(np.diag(cho), 0.0))
+     # But don't divide by 0.
+     dc[dc == 0.0] = 1.0
+     new_lo /= dc
+     new_hi /= dc
+     cho /= dc
+     cho /= dc[:, np.newaxis]
+
+     y = np.zeros(n)
+     sqtp = np.sqrt(2 * np.pi)
+     for k in range(n):
+         epk = (k + 1) * tol
+         im = k
+         ck = 0.0
+         dem = 1.0
+         s = 0.0
+         lo_m = 0.0
+         hi_m = 0.0
+         for i in range(k, n):
+             if cho[i, i] > tol:
+                 ci = np.sqrt(cho[i, i])
+                 if i > 0:
+                     s = cho[i, :k] @ y[:k]
+                 lo_i = (new_lo[i] - s) / ci
+                 hi_i = (new_hi[i] - s) / ci
+                 de = phi(hi_i) - phi(lo_i)
+                 if de <= dem:
+                     ck = ci
+                     dem = de
+                     lo_m = lo_i
+                     hi_m = hi_i
+                     im = i
+         if im > k:
+             # Swap im and k
+             cho[im, im] = cho[k, k]
+             _swap_slices(cho, np.s_[im, :k], np.s_[k, :k])
+             _swap_slices(cho, np.s_[im + 1:, im], np.s_[im + 1:, k])
+             _swap_slices(cho, np.s_[k + 1:im, k], np.s_[im, k + 1:im])
+             _swap_slices(new_lo, k, im)
+             _swap_slices(new_hi, k, im)
+         if ck > epk:
+             cho[k, k] = ck
+             cho[k, k + 1:] = 0.0
+             for i in range(k + 1, n):
+                 cho[i, k] /= ck
+                 cho[i, k + 1:i + 1] -= cho[i, k] * cho[k + 1:i + 1, k]
+             if abs(dem) > tol:
+                 y[k] = ((np.exp(-lo_m * lo_m / 2) - np.exp(-hi_m * hi_m / 2)) /
+                         (sqtp * dem))
+             else:
+                 y[k] = (lo_m + hi_m) / 2
+                 if lo_m < -10:
+                     y[k] = hi_m
+                 elif hi_m > 10:
+                     y[k] = lo_m
+             cho[k, :k + 1] /= ck
+             new_lo[k] /= ck
+             new_hi[k] /= ck
+         else:
+             cho[k:, k] = 0.0
+             y[k] = (new_lo[k] + new_hi[k]) / 2
+     return cho, new_lo, new_hi
+
+
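The scaling step at the top of ``_permuted_cholesky`` (divide the rows, columns, and bounds by the square roots of the diagonal) turns the covariance into a correlation matrix before factoring. A minimal NumPy sketch of just that step, on an arbitrary well-conditioned matrix; the bound-driven pivoting and the handling of singular diagonals above are exactly what a plain ``np.linalg.cholesky`` call does not provide::

    import numpy as np

    # Arbitrary symmetric positive definite covariance and bounds.
    covar = np.array([[4.0, 1.2, 0.6],
                      [1.2, 1.0, 0.3],
                      [0.6, 0.3, 2.25]])
    low = np.array([-1.0, -2.0, -0.5])
    high = np.array([2.0, 1.0, 3.0])

    # Scale by the sqrt of the diagonal, as in the routine above.
    d = np.sqrt(np.diag(covar))
    corr = covar / d / d[:, np.newaxis]
    scaled_low, scaled_high = low / d, high / d

    # With no singular diagonal and no reordering, the factor built above
    # plays the role of the ordinary lower Cholesky factor of ``corr``.
    L = np.linalg.cholesky(corr)
    assert np.allclose(L @ L.T, corr)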
+ def _swap_slices(x, slc1, slc2):
+     t = x[slc1].copy()
+     x[slc1] = x[slc2].copy()
+     x[slc2] = t
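``np.s_`` simply builds slice objects, so the helper above can swap arbitrary sub-blocks of an array in place, which is how the pivoting loop exchanges rows of the partially built factor. A standalone copy of the helper for illustration, with arbitrary example values::

    import numpy as np

    def _swap_slices(x, slc1, slc2):
        t = x[slc1].copy()
        x[slc1] = x[slc2].copy()
        x[slc2] = t

    a = np.arange(16.0).reshape(4, 4)
    # Exchange the first two entries of row 3 with those of row 0, in place.
    _swap_slices(a, np.s_[3, :2], np.s_[0, :2])
    # a[0, :2] is now [12., 13.] and a[3, :2] is [0., 1.].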
.venv/Lib/site-packages/scipy/stats/_relative_risk.py ADDED
@@ -0,0 +1,263 @@
+ import operator
+ from dataclasses import dataclass
+ import numpy as np
+ from scipy.special import ndtri
+ from ._common import ConfidenceInterval
+
+
+ def _validate_int(n, bound, name):
+     msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
+     try:
+         n = operator.index(n)
+     except TypeError:
+         raise TypeError(msg) from None
+     if n < bound:
+         raise ValueError(msg)
+     return n
+
+
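``operator.index`` accepts only objects that are losslessly integers (including NumPy integer scalars), so the validator rejects floats with a ``TypeError`` and below-bound values with a ``ValueError``. A sketch of the expected behavior, read off from the code above and assuming ``_validate_int`` is in scope::

    >>> _validate_int(5, 1, "exposed_total")
    5
    >>> _validate_int(5.0, 1, "exposed_total")
    Traceback (most recent call last):
        ...
    TypeError: exposed_total must be an integer not less than 1, but got 5.0
    >>> _validate_int(0, 1, "exposed_total")
    Traceback (most recent call last):
        ...
    ValueError: exposed_total must be an integer not less than 1, but got 0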
+ @dataclass
+ class RelativeRiskResult:
+     """
+     Result of `scipy.stats.contingency.relative_risk`.
+
+     Attributes
+     ----------
+     relative_risk : float
+         This is::
+
+             (exposed_cases/exposed_total) / (control_cases/control_total)
+
+     exposed_cases : int
+         The number of "cases" (i.e. occurrence of disease or other event
+         of interest) among the sample of "exposed" individuals.
+     exposed_total : int
+         The total number of "exposed" individuals in the sample.
+     control_cases : int
+         The number of "cases" among the sample of "control" or non-exposed
+         individuals.
+     control_total : int
+         The total number of "control" individuals in the sample.
+
+     Methods
+     -------
+     confidence_interval :
+         Compute the confidence interval for the relative risk estimate.
+     """
+
+     relative_risk: float
+     exposed_cases: int
+     exposed_total: int
+     control_cases: int
+     control_total: int
+
+     def confidence_interval(self, confidence_level=0.95):
+         """
+         Compute the confidence interval for the relative risk.
+
+         The confidence interval is computed using the Katz method
+         (i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).
+
+         Parameters
+         ----------
+         confidence_level : float, optional
+             The confidence level to use for the confidence interval.
+             Default is 0.95.
+
+         Returns
+         -------
+         ci : ConfidenceInterval instance
+             The return value is an object with attributes ``low`` and
+             ``high`` that hold the confidence interval.
+
+         References
+         ----------
+         .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
+                confidence intervals for the risk ratio in cohort studies",
+                Biometrics, 34, 469-474 (1978).
+         .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
+                CRC Press LLC, Boca Raton, FL, USA (1996).
+
+         Examples
+         --------
+         >>> from scipy.stats.contingency import relative_risk
+         >>> result = relative_risk(exposed_cases=10, exposed_total=75,
+         ...                        control_cases=12, control_total=225)
+         >>> result.relative_risk
+         2.5
+         >>> result.confidence_interval()
+         ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
+         """
+         if not 0 <= confidence_level <= 1:
+             raise ValueError('confidence_level must be in the interval '
+                              '[0, 1].')
+
+         # Handle edge cases where either exposed_cases or control_cases
+         # is zero. We follow the convention of the R function riskratio
+         # from the epitools library.
+         if self.exposed_cases == 0 and self.control_cases == 0:
+             # relative risk is nan.
+             return ConfidenceInterval(low=np.nan, high=np.nan)
+         elif self.exposed_cases == 0:
+             # relative risk is 0.
+             return ConfidenceInterval(low=0.0, high=np.nan)
+         elif self.control_cases == 0:
+             # relative risk is inf
+             return ConfidenceInterval(low=np.nan, high=np.inf)
+
+         alpha = 1 - confidence_level
+         z = ndtri(1 - alpha/2)
+         rr = self.relative_risk
+
+         # Estimate of the variance of log(rr) is
+         #     var(log(rr)) = 1/exposed_cases - 1/exposed_total +
+         #                    1/control_cases - 1/control_total
+         # and the standard error is the square root of that.
+         se = np.sqrt(1/self.exposed_cases - 1/self.exposed_total +
+                      1/self.control_cases - 1/self.control_total)
+         delta = z*se
+         katz_lo = rr*np.exp(-delta)
+         katz_hi = rr*np.exp(delta)
+         return ConfidenceInterval(low=katz_lo, high=katz_hi)
+
+
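The Katz interval computed above is just ``exp(log(rr) ± z*se)`` with the standard error given in the comment. A short check by hand, using the counts from the docstring example (10 of 75 exposed, 12 of 225 controls), reproduces the interval reported there, roughly (1.126, 5.550)::

    import numpy as np
    from scipy.special import ndtri

    exposed_cases, exposed_total = 10, 75
    control_cases, control_total = 12, 225

    rr = (exposed_cases / exposed_total) / (control_cases / control_total)
    se = np.sqrt(1/exposed_cases - 1/exposed_total
                 + 1/control_cases - 1/control_total)
    z = ndtri(1 - 0.05/2)              # about 1.96 for a 95% interval
    low, high = rr * np.exp(-z * se), rr * np.exp(z * se)
    # rr == 2.5; (low, high) is approximately (1.126, 5.550).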
+ def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
+     """
+     Compute the relative risk (also known as the risk ratio).
+
+     This function computes the relative risk associated with a 2x2
+     contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Instead
+     of accepting a table as an argument, the individual numbers that are
+     used to compute the relative risk are given as separate parameters.
+     This is to avoid the ambiguity of which row or column of the contingency
+     table corresponds to the "exposed" cases and which corresponds to the
+     "control" cases. Unlike, say, the odds ratio, the relative risk is not
+     invariant under an interchange of the rows or columns.
+
+     Parameters
+     ----------
+     exposed_cases : nonnegative int
+         The number of "cases" (i.e. occurrence of disease or other event
+         of interest) among the sample of "exposed" individuals.
+     exposed_total : positive int
+         The total number of "exposed" individuals in the sample.
+     control_cases : nonnegative int
+         The number of "cases" among the sample of "control" or non-exposed
+         individuals.
+     control_total : positive int
+         The total number of "control" individuals in the sample.
+
+     Returns
+     -------
+     result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
+         The object has the float attribute ``relative_risk``, which is::
+
+             rr = (exposed_cases/exposed_total) / (control_cases/control_total)
+
+         The object also has the method ``confidence_interval`` to compute
+         the confidence interval of the relative risk for a given confidence
+         level.
+
+     See Also
+     --------
+     odds_ratio
+
+     Notes
+     -----
+     The R package epitools has the function `riskratio`, which accepts
+     a table with the following layout::
+
+                         disease=0   disease=1
+         exposed=0 (ref)    n00         n01
+         exposed=1          n10         n11
+
+     With a 2x2 table in the above format, the estimate of the CI is
+     computed by `riskratio` when the argument method="wald" is given,
+     or with the function `riskratio.wald`.
+
+     For example, in a test of the incidence of lung cancer among a
+     sample of smokers and nonsmokers, the "exposed" category would
+     correspond to "is a smoker" and the "disease" category would
+     correspond to "has or had lung cancer".
+
+     To pass the same data to ``relative_risk``, use::
+
+         relative_risk(n11, n10 + n11, n01, n00 + n01)
+
+     .. versionadded:: 1.7.0
+
+     References
+     ----------
+     .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
+            (second edition), Wiley, Hoboken, NJ, USA (2007).
+     .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
+            CRC Press LLC, Boca Raton, FL, USA (1996).
+
+     Examples
+     --------
+     >>> from scipy.stats.contingency import relative_risk
+
+     This example is from Example 3.1 of [2]_. The results of a heart
+     disease study are summarized in the following table::
+
+                  High CAT  Low CAT  Total
+                  --------  -------  -----
+         CHD         27       44       71
+         No CHD      95      443      538
+
+         Total      122      487      609
+
+     CHD is coronary heart disease, and CAT refers to the level of
+     circulating catecholamine. CAT is the "exposure" variable, and
+     high CAT is the "exposed" category. So the data from the table
+     to be passed to ``relative_risk`` is::
+
+         exposed_cases = 27
+         exposed_total = 122
+         control_cases = 44
+         control_total = 487
+
+     >>> result = relative_risk(27, 122, 44, 487)
+     >>> result.relative_risk
+     2.4495156482861398
+
+     Find the confidence interval for the relative risk.
+
+     >>> result.confidence_interval(confidence_level=0.95)
+     ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)
+
+     The interval does not contain 1, so the data supports the statement
+     that high CAT is associated with greater risk of CHD.
+     """
+     # Relative risk is a trivial calculation. The nontrivial part is in the
+     # `confidence_interval` method of the RelativeRiskResult class.
+
+     exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
+     exposed_total = _validate_int(exposed_total, 1, "exposed_total")
+     control_cases = _validate_int(control_cases, 0, "control_cases")
+     control_total = _validate_int(control_total, 1, "control_total")
+
+     if exposed_cases > exposed_total:
+         raise ValueError('exposed_cases must not exceed exposed_total.')
+     if control_cases > control_total:
+         raise ValueError('control_cases must not exceed control_total.')
+
+     if exposed_cases == 0 and control_cases == 0:
+         # relative risk is 0/0.
+         rr = np.nan
+     elif exposed_cases == 0:
+         # relative risk is 0/nonzero
+         rr = 0.0
+     elif control_cases == 0:
+         # relative risk is nonzero/0.
+         rr = np.inf
+     else:
+         p1 = exposed_cases / exposed_total
+         p2 = control_cases / control_total
+         rr = p1 / p2
+     return RelativeRiskResult(relative_risk=rr,
+                               exposed_cases=exposed_cases,
+                               exposed_total=exposed_total,
+                               control_cases=control_cases,
+                               control_total=control_total)
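To make the mapping in the Notes section concrete, the sketch below arranges the heart-disease counts from the Examples section in the epitools layout (rows ``exposed=0`` then ``exposed=1``, columns ``disease=0`` then ``disease=1``) and applies the ``relative_risk(n11, n10 + n11, n01, n00 + n01)`` recipe; the counts are taken from the table above and the call reproduces the 2.4495... estimate shown there::

    import numpy as np
    from scipy.stats.contingency import relative_risk

    # epitools layout:       disease=0  disease=1
    table = np.array([[443, 44],    # exposed=0 (ref): low CAT
                      [ 95, 27]])   # exposed=1:       high CAT
    (n00, n01), (n10, n11) = table

    result = relative_risk(n11, n10 + n11, n01, n00 + n01)
    # Equivalent to relative_risk(27, 122, 44, 487);
    # result.relative_risk is approximately 2.4495.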