'''
Created on 13.06.2017
@author: Samuel
'''
# rpy2 is optional; only anderson_darling_test_discrete needs it
try:
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
except ImportError:
    robjects = importr = None
import warnings
from itertools import repeat
from collections import defaultdict
import numpy as np
import scipy as sc
from scipy.special import binom
from scipy.stats import nbinom, poisson
from scipy.special import lambertw
from statsmodels.distributions.empirical_distribution import ECDF
np.random.seed()
#import line_profiler
from vemomoto_core.concurrent.nicepar import Counter
from vemomoto_core.concurrent.concurrent_futures_ext import ProcessPoolExecutor
"""
# R package names
packnames = ('dgof',)
utils = importr('utils')
utils.chooseCRANmirror(ind=1)
# R vector of strings
from rpy2.robjects.vectors import StrVector
import rpy2.robjects.packages as rpackages
# Selectively install what needs to be installed.
# We are fancy, just because we can.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))
#"""
def anderson_darling_test_discrete(data, modelX, modelY,
                                   simulate_p_value=False):
    """
    Anderson-Darling-type test for discrete data via the ``cvm.test``
    function of R's ``dgof`` package. ``modelX`` and ``modelY`` define the
    model CDF as a step function; returns the p-value.
    """
    if robjects is None:
        raise NotImplementedError("This functionality requires rpy2 and a "
                                  "proper R setup with the 'dgof' package.")
    # import R's "dgof" package
    dgof = importr('dgof')
    rData = robjects.IntVector(data)
    # truncate the model CDF after the first point where it reaches 1
    first1 = np.nonzero(modelY==1)[0]
    if len(first1):
        first1 = first1[0] + 1
    else:
        first1 = len(modelX)
    modelX = modelX[:first1]
    modelY = modelY[:first1]
    rModelX = robjects.IntVector(modelX)
    rModelY = robjects.FloatVector([0] + list(modelY))
    rStepFun = robjects.r['stepfun']
    rCVMTest = robjects.r['cvm.test']
    rCMF = rStepFun(rModelX, rModelY)
    testResult = rCVMTest(rData, rCMF, "A2", simulate_p_value)
    return testResult.rx("p.value")[0][0]
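
def _example_anderson_darling_test_discrete():
    # Hypothetical usage sketch (not part of the original module): shows how
    # the discrete test might be called, assuming rpy2 and the R package
    # 'dgof' are installed. The uniform model CDF mirrors the example in the
    # __main__ block at the bottom of this file.
    x = np.arange(1, 10+1)
    y = np.cumsum(np.ones(x.size) / x.size)  # uniform model CDF on 1..10
    d = np.random.choice(x, 25, replace=True)
    return anderson_darling_test_discrete(d, x, y)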
#@profile
def zero_truncated_NB(size, n, p, poissonLimit=False, quantile=0.999,
                      MHSteps=100):
    """
    Returns a sample of size ``size`` from the negative binomial distribution
    with parameters ``n`` and ``p``, conditioned on at least one element of
    the sample being nonzero.
    ``MHSteps`` denotes the number of Metropolis-Hastings iterations.
    """
    if p == 1:
        poissonLimit = True
    # if obtaining a random sample with total count 0 is sufficiently
    # unlikely, sample until a suitable sample is found
    if poissonLimit:
        zeroP = np.exp(-size*n)
    else:
        zeroP = p**(size*n)
    if zeroP < 0.7:
        while not poissonLimit:
            result = np.random.negative_binomial(n, p, size)
            if result.any():
                return result
        while poissonLimit:
            result = np.random.poisson(n, size)
            if result.any():
                return result
    # pmf of the truncated negative binomial for the total count
    q = min(quantile*(1-zeroP)+zeroP, 0.999999)
    if poissonLimit:
        maxbin = poisson.ppf(q, size*n)
    else:
        dist = nbinom(n, p)
        maxbin = nbinom.ppf(q, size*n, p)
    maxbin = max(maxbin, 5)
    x = np.arange(1, maxbin+1)
    if poissonLimit:
        trunc_pmf = poisson.pmf(x, size*n)
    else:
        trunc_pmf = nbinom.pmf(x, size*n, p)
    trunc_pmf /= np.sum(trunc_pmf)
    # sample the total count value
    totalCount = np.random.choice(x, p=trunc_pmf)
    if poissonLimit:
        return np.random.multinomial(totalCount, np.full(size, 1/size))
    elif totalCount == 1:
        # if only one observation has been made, its position does not matter
        result = np.zeros(size)
        result[0] = 1
        return result
    elif totalCount == 2:
        # if two observations have been made, we have to decide whether they
        # occurred in the same sample or in distinct samples; when computing
        # the joint probabilities of the possible events, factors that appear
        # in all probabilities are neglected
        # p11 = (size choose 2) * pmf(1)**2
        p11 = (size-1)/2 * dist.pmf(1)**2
        # p20 = size * pmf(2) * pmf(0)
        p20 = dist.pmf(2) * dist.pmf(0)
        norm = p11 + p20
        p11 /= norm
        p20 /= norm
        result = np.zeros(size)
        if np.random.choice([True, False], p=[p11, p20]):
            result[:2] = 1
        else:
            result[0] = 2
        return result
    elif totalCount == 3:
        # analogous case distinction for three observations
        # p111 = (size choose 3) * pmf(1)**3
        p111 = (size-1)*(size-2)/6 * dist.pmf(1)**3
        p210 = (size-1) * dist.pmf(2)*dist.pmf(1)*dist.pmf(0)
        p300 = dist.pmf(3)*dist.pmf(0)**2
        ps = np.array([p111, p210, p300])
        ps /= np.sum(ps)
        result = np.zeros(size)
        choice = np.random.choice(np.arange(3), p=ps)
        if choice == 0:
            result[:3] = 1
        elif choice == 1:
            result[0] = 2
            result[1] = 1
        elif choice == 2:
            result[0] = 3
        return result
    else:
        return _dist_bins_MH(size, totalCount, dist, MHSteps)
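
def _example_zero_truncated_NB():
    # Hypothetical usage sketch (not part of the original module): draws a
    # zero-truncated sample and checks the defining condition that at least
    # one entry is nonzero. The parameter values are arbitrary.
    sample = zero_truncated_NB(size=30, n=0.5, p=0.8)
    assert sample.any()
    return sample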
def _dist_bins_MH(size, totalCount, dist, steps=100):
    # initialize
    if dist.mean() / dist.var() > 0.5:
        # if overdispersion is low, generate the initial sample under the
        # independence assumption
        counts = np.random.multinomial(totalCount, np.full(size, 1/size))
        nonzero = list(np.nonzero(counts)[0])
    else:
        # if overdispersion is high, generate the initial sample with one
        # big heap
        counts = np.zeros(size)
        counts[0] = totalCount
        nonzero = [0]
    for i in range(steps):
        # pick a pair of indices between which a count value is swapped
        swapFrom = np.random.choice(nonzero)
        swapTo = np.random.randint(0, size)
        if swapFrom == swapTo:
            continue
        swappedCounts = counts.copy()
        swappedCounts[swapFrom] -= 1
        swappedCounts[swapTo] += 1
        # Metropolis-Hastings acceptance step in log space
        changeP = dist.logpmf(swappedCounts).sum() - dist.logpmf(counts).sum()
        if changeP >= 0 or changeP >= np.log(np.random.rand()):
            if counts[swapTo] == 0:
                nonzero.append(swapTo)
            if counts[swapFrom] == 1:
                nonzero.remove(swapFrom)
            counts = swappedCounts
    return counts
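
def _example_dist_bins_MH():
    # Hypothetical sketch (not part of the original module) illustrating the
    # invariant of the Metropolis-Hastings sampler: each accepted step moves
    # one count between bins, so the total count is preserved while the mass
    # is redistributed. All parameter values are arbitrary.
    dist = nbinom(0.5, 0.8)
    counts = _dist_bins_MH(size=20, totalCount=12, dist=dist, steps=200)
    assert counts.sum() == 12
    return counts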
def __anderson_darling_statistic_P(data, fitData):
    # Anderson-Darling statistic comparing the empirical CDF of ``data``
    # with the empirical distribution of ``fitData``
    cmfData = ECDF(data)
    x, pmf = np.unique(fitData, return_counts=True)
    pmf = (pmf / pmf.sum())[:-1]
    cmf = pmf.cumsum()
    return data.size * np.sum(np.square(cmfData(x)[:-1]-cmf) * pmf
                              / (cmf * (1-cmf)))
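
def _example_anderson_darling_statistic_P():
    # Hypothetical sketch (not part of the original module): the statistic
    # compares observed p-values to the empirical distribution of
    # bootstrapped p-values. Drawing both uniformly mimics the null
    # hypothesis; the sample sizes are arbitrary.
    observed = np.random.rand(50)
    bootstrapped = np.random.rand(5000)
    return __anderson_darling_statistic_P(observed, bootstrapped)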
# test statistic for the Anderson-Darling test
#@profile
def __anderson_darling_statistic_NB(data, parameters=None,
                                    getParamEstimates=False,
                                    poissonLimit=False):
    isPDFData = hasattr(data[0], "__iter__")
    if isPDFData:
        x, counts = data
        size = np.sum(counts)
        maxx = x[-1]
    else:
        maxx = np.max(data)
        size = data.size
    if parameters is None:
        # observed mean and variance and other base measures
        if isPDFData:
            mean = np.sum(x*counts)/size
            var = np.sum(np.square(x-mean)*counts)/(size-1)
        else:
            mean = np.mean(data)
            var = np.var(data, ddof=1)
        mean = max(mean, 1e-10)
        var = max(var, 1e-10)
        # method of moments parameter estimates
        if not poissonLimit:
            # p = mean/var does not work here because zeros are neglected;
            # hence, solve the zero-truncated moment equation numerically
            optf = lambda pp: (mean/pp - var - mean**2
                               * pp**(size*(pp*(mean**2+var)-mean)
                                      / ((1-pp)*mean)))
            left = mean/(var+mean**2)
            if left >= 1:
                poissonLimit = True
            else:
                # bisection on the interval (left, 1)
                right = 1
                err = 1e-5
                while right-left > err:
                    p = (right+left)/2
                    if optf(p) > 0:
                        left = p
                    else:
                        right = p
                p = (right+left)/2
                poissonLimit = right == 1
        if not poissonLimit:
            n = (p*(mean**2+var)-mean)/((1-p)*mean)
        else:
            if mean*size <= 1:
                if getParamEstimates:
                    return 0, 0, 1
                else:
                    return 0
            p = 1
            n = lambertw(-np.exp(-mean*size)*mean*size).real/size + mean
    else:
        n, p = parameters
    if not isPDFData:
        x = np.arange(maxx+1)
        # empirical cmf
        ecmf = ECDF(data)(x)
    else:
        ecmf = np.cumsum(counts)/size
    # pmf and cmf of the zero-truncated model distribution
    if not poissonLimit:
        pmf = binom(x+(n-1), (n-1))*p**n*np.power(1-p, x)  # nbinom.pmf(x, n, p)
        pmf[0] -= p**(size*n)
        pmf /= (1-p**(size*n))
    else:
        pmf = poisson.pmf(x, n)
        pmf[0] -= np.exp(-size*n)
        pmf /= (1-np.exp(-size*n))
    cmf = pmf.cumsum()
    if cmf[-1] > 1:
        # in case of significant numerical errors that lead to a
        # probability > 1: normalize
        pmf /= cmf[-1]
        cmf /= cmf[-1]
    # Anderson-Darling statistic
    T = size * np.sum(np.square(ecmf-cmf) * pmf / (cmf * (1-cmf)))
    if getParamEstimates:
        return (T, n, p)
    else:
        return T
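
def _example_anderson_darling_statistic_NB():
    # Hypothetical sketch (not part of the original module): computes the
    # Anderson-Darling statistic together with the method-of-moments
    # estimates of n and p for a synthetic zero-truncated NB sample. The
    # parameter values are arbitrary.
    data = zero_truncated_NB(50, 0.5, 0.8)
    T, n, p = __anderson_darling_statistic_NB(data, parameters=None,
                                              getParamEstimates=True)
    return T, n, p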
# module-level caches memoizing AD statistics, p-values, and bootstrapped
# p-value samples across calls; cleared once they exceed MAXSIZE entries
__ADRESULTS = {}
__PRESULTS = {}
__PSAMPLERESULTS = {}
MAXSIZE = 500000
RESULTSAVER = [__ADRESULTS, __PRESULTS, __PSAMPLERESULTS, MAXSIZE]
#@profile
def anderson_darling_NB(data, parameters=None, poissonLimit=False,
                        pSampleSize=0, bootstrapN=400, bootstrapN_P=0,
                        MHSteps=100, usePreviousPVals=True,
                        usePreviousPSamples=False,
                        resultSaver=None):
    # disabled alternative: work on the module-level caches directly
    """
    global __PRESULTS
    global __ADRESULTS
    global __PSAMPLERESULTS
    global MAXSIZE
    """
    if resultSaver is None:
        resultSaver = RESULTSAVER  # {}, {}, {}, 50000
    __ADRESULTS, __PRESULTS, __PSAMPLERESULTS, MAXSIZE = resultSaver
    #"""
    pSampleSize2 = max(pSampleSize, bootstrapN_P)
    # degenerate samples with at most one observation fit any model
    if parameters is None and np.sum(data) <= 1:
        if pSampleSize2:
            if bootstrapN_P:
                return (1, np.ones(pSampleSize2),
                        np.ones((bootstrapN_P, pSampleSize)))
            else:
                return 1, np.ones(pSampleSize2)
        else:
            return 1
    if parameters is not None:
        parameters = tuple(parameters)
    obs, counts = np.unique(data, return_counts=True)
    dataHash = hash((tuple(obs), tuple(counts), poissonLimit, parameters))
    if not bootstrapN_P:
        if usePreviousPVals:
            result = __PRESULTS.get(dataHash, None)
            if result is not None:
                if not pSampleSize:
                    return result
        # disabled alternative cache lookups
        """
        elif usePreviousPSamples:
            result2 = __PSAMPLERESULTS.get(dataHash, None)
            if result2 is not None and len(result2) >= pSampleSize:
                pass
        if usePreviousPVals and (dataHash in __PRESULTS):
            if not pSampleSize:
                return __PRESULTS[dataHash]
        elif usePreviousPSamples:
            result = __PSAMPLERESULTS.get(dataHash, None)
            if result is not None and len(result) >= pSampleSize:
                print(len(__PRESULTS), len(__PSAMPLERESULTS))
                return __PRESULTS[dataHash], result[:pSampleSize]
        """
    else:
        np.random.seed()
    # base statistic and parameter estimates
    T, n, p = __anderson_darling_statistic_NB((obs, counts), parameters,
                                              True, poissonLimit)
    parameters2 = (n, p)
    poissonLimit2 = poissonLimit or p == 1
    # bootstrapping
    pVal = 0
    if pSampleSize2:
        if bootstrapN_P:
            pSamples = np.zeros((bootstrapN_P, pSampleSize))
        pValues = np.zeros(pSampleSize2)
        bootstrapN = max(bootstrapN, pSampleSize, bootstrapN_P)
    for i in range(bootstrapN):
        row = zero_truncated_NB(data.size, n, p, poissonLimit2,
                                MHSteps=MHSteps)
        robs, rcounts = np.unique(row, return_counts=True)
        rowHash = hash((tuple(robs), tuple(rcounts), poissonLimit2,
                        parameters))
        Tb = __ADRESULTS.get(rowHash, None)
        if Tb is None:
            Tb = __anderson_darling_statistic_NB((robs, rcounts), parameters2,
                                                 poissonLimit=poissonLimit2)
            __ADRESULTS[rowHash] = Tb
        if Tb >= T:
            pVal += 1
        if (i < bootstrapN_P) or (i < pSampleSize):
            if usePreviousPVals and (poissonLimit != poissonLimit2):
                rowHash = hash((tuple(robs), tuple(rcounts), poissonLimit,
                                parameters))
            pVal2 = pSample = None
            if i < bootstrapN_P:
                if usePreviousPSamples:
                    pSample = __PSAMPLERESULTS.get(rowHash, None)
                if pSample is None:
                    pVal2, pSample = anderson_darling_NB(
                        row, parameters, poissonLimit,
                        pSampleSize, bootstrapN, 0, MHSteps,
                        usePreviousPVals, usePreviousPSamples,
                        resultSaver=resultSaver)
                pSamples[i] = pSample
            if i < pSampleSize2:
                if pVal2 is None and usePreviousPSamples:
                    pVal2 = __PRESULTS.get(rowHash, None)
                if pVal2 is None:
                    pVal2 = anderson_darling_NB(
                        row, parameters, poissonLimit,
                        0, bootstrapN, 0, MHSteps,
                        usePreviousPVals, usePreviousPSamples,
                        resultSaver=resultSaver)
                pValues[i] = pVal2
    pVal /= bootstrapN
    # keep the caches from growing without bound
    if len(__ADRESULTS) > MAXSIZE:
        __ADRESULTS.clear()
    if len(__PRESULTS) > MAXSIZE:
        __PRESULTS.clear()
    if len(__PSAMPLERESULTS) > MAXSIZE:
        __PSAMPLERESULTS.clear()
    __PRESULTS[dataHash] = pVal
    if pSampleSize:
        __PSAMPLERESULTS[dataHash] = pValues
        if bootstrapN_P:
            return pVal, pValues, pSamples
        return pVal, pValues
    else:
        return pVal
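
def _example_anderson_darling_NB():
    # Hypothetical usage sketch (not part of the original module): bootstrap
    # goodness-of-fit p-value for a synthetic zero-truncated NB sample. With
    # pSampleSize=0 and bootstrapN_P=0 (the defaults), a single p-value is
    # returned. Parameter values and bootstrap size are arbitrary.
    data = zero_truncated_NB(50, 0.5, 0.8)
    return anderson_darling_NB(data, bootstrapN=100)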
def anderson_darling_P(data, poissonLimit, pSampleSize, bootstrapN,
                       bootstrapN_P, counter=None):
    global __ADRESULTS
    global __PRESULTS
    __ADRESULTS.clear()
    __PRESULTS.clear()
    #anderson_darling_NB(data[0], None, poissonLimit, pSampleSize,
    #                    bootstrapN, bootstrapN_P)
    p1 = []
    p2 = []
    p3 = []
    with ProcessPoolExecutor() as pool:
        testRes = pool.map(anderson_darling_NB, data,
                           repeat(None), repeat(poissonLimit),
                           repeat(pSampleSize), repeat(bootstrapN),
                           repeat(bootstrapN_P), repeat(100), repeat(True),
                           repeat(False), chunksize=1)
        for pVal, pValues, pSamples in testRes:
            p1.append(pVal)
            p2.append(pValues)
            p3.append(pSamples)
            if counter is not None:
                counter()
    p1 = np.array(p1)
    p2 = np.array(p2)
    p3 = np.array(p3)
    # combined test: are the p-values of the individual tests distributed
    # as expected under the null hypothesis?
    T = __anderson_darling_statistic_P(p1, p2[:, :pSampleSize].ravel())
    ts = []
    pVal = 0
    for i in range(bootstrapN_P):
        Tb = __anderson_darling_statistic_P(p2[:, i], p3[:, i].ravel())
        if Tb >= T:
            pVal += 1
        ts.append(Tb)
    pVal /= bootstrapN_P
    return pVal
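
def _example_anderson_darling_P():
    # Hypothetical usage sketch (not part of the original module): combined
    # parallel test over several data sets. The progress callback mirrors
    # the one in _test_anderson_darling_NB below; all sizes are arbitrary
    # and kept small.
    datasets = [zero_truncated_NB(30, 0.5, 0.8) for _ in range(10)]
    c = Counter(len(datasets), 0.05)
    def counter():
        perc = c.next()
        if perc:
            print("{}%".format(round(perc*100)))
    return anderson_darling_P(datasets, False, 20, 50, 20, counter)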
def _test_ZTMNB():
    n, p = 8385.273718025031, 0.9999971718211882  # 1, 0.9
    samplesize, testn = 30, 1000
    a = np.array([zero_truncated_NB(samplesize, n, p)
                  for _ in range(testn)])
    """
    resN, resP = [], []
    for r in range(30):
        t, nn, pp = __anderson_darling_statistic_NB(
                        zero_truncated_NB(samplesize, n, p), None, True, False)
        resN.append(nn)
        resP.append(pp)
    print(np.mean(resN), np.mean(resP))
    print(resN)
    print(resP)
    """
    # expected vs. observed moments of the zero-truncated samples
    meanSumExp = samplesize*n*(1-p)/(p*(1-p**(n*samplesize)))
    varSumExp = meanSumExp/p - p**(n*samplesize)*meanSumExp**2
    meanExp = n*(1-p)/(p*(1-p**(n*samplesize)))
    varExp = meanExp/p - p**(n*samplesize)*meanExp**2
    sums = a.sum(1)
    meanSum = np.mean(sums)
    varSum = np.var(sums)
    var = np.mean(np.var(a, 1, ddof=1))
    mean = np.mean(a)
    print(meanSumExp, meanSum)
    print(varSumExp, varSum)
    print(meanExp, mean)
    print(varExp, var)
    m = mean
    s = var
    N = samplesize
    #"""
    #optf = lambda pp: m-s*pp-m**2*pp**(pp/(1-pp)*(m**2+s-m)/m)
    #optf = lambda pp: m-s*pp-m**2*pp**(pp/(1-pp)*(N*(m**2+s)-m)/m)
    optf = lambda pp: m/pp-s-m**2*pp**(N*(pp*(m**2+s)-m)/((1-pp)*m))
    #optfM = lambda nn: nn*(1-pp)/(pp*(1-pp**(nn*samplesize)))-mean
    # bisection for the method-of-moments estimate of p
    left = mean/(var+mean**2)
    right = 1
    err = 1e-7
    while right-left > err:
        new = (right+left)/2
        if optf(new) > 0:
            left = new
        else:
            right = new
    pp = (right+left)/2
    #"""
    #pp = mean/var
    #nn = (pp*(meanSum**2+varSum)-meanSum)/(samplesize*(1-pp)*meanSum)
    nn = (pp*(m**2+s)-m)/((1-pp)*m)
    #nn = ((np.log(mean-var*pp)-2*np.log(mean))/np.log(pp)-1)/samplesize
    pold = mean/var
    nold = mean*pold/(1-pold)
    print("Estimate n", nn, n, nold)
    print("Estimate p", pp, p, pold)
    # exploratory scratch expressions (no effect on the test):
    #m = meanExp
    #s = varExp
    #pp = m/(s+m**2)
    #m*(1-p**(n*N))*p/(N*(1-p))
    #m - p**(n*N+1)*m**2 - s*p
    #(p*(m**2+s)-m)/(N*(1-p)*m)
    #m - s*p - m**2*p**(p/(1-p)*(m**2+s-m)/m)
    #ppp = np.linspace(0, 1, 1000)
    #nnn = np.linspace(0, 3, 1000)
    #y = m - s*ppp - m**2*ppp**(ppp/(1-ppp)*(m**2+s-m)/m)
    #y = nnn*(1-pp)/(pp*(1-pp**(nnn*samplesize))) - mean
    #(pp/(1-pp)*(m**2+s-m)/m)
    print("done")
def _test_anderson_darling_NB():
    testn = 50
    datan = 50
    np.random.seed(2)
    print("Starting tests with testn={} and datan={}".format(testn, datan))
    p = np.full(testn, 0.8)
    n = np.full(testn, 0.5)
    p = np.random.rand(testn)/3 + 0.05  # + 0.3
    n = np.random.exponential(0.5, size=testn)
    n = np.random.negative_binomial(1, 0.2, size=testn) / 20 + 0.01
    p2 = np.random.rand(testn)
    n2 = np.random.poisson(1, size=testn) + 1
    m = lambda x: (np.mean(x), np.var(x), np.mean(x)/np.var(x))
    """
    print("== Tests with Poisson distribution ==")
    pvals = [anderson_darling_NB(zero_truncated_NB(datan, nn, 1, True),
                                 False)
             for nn, pp in zip(n2, p)]
    pvals = np.array(pvals)[~np.isnan(pvals)]
    print("mean:", np.mean(pvals))
    freq, bins = np.histogram(pvals, 20, [0, 1])
    freq = freq/np.sum(freq)
    print(np.round(freq, 2))
    print(np.round(np.cumsum(freq), 2))
    #"""
    from matplotlib import pyplot as plt
    print("== Tests with NB ==")
    """
    for i in range(10, 101, 10):
        a = np.zeros(i)
        a[0] = 1
        print(i, np.mean([anderson_darling_NB(a, False)
                          for _ in range(20)], 0))
    print("Proceed")
    pv, pvc = anderson_darling_NB(np.array([1, 1]+[0]*20), None, True, True)
    print(pv)
    a = np.histogram(pvc, 20, [0, 1])[0]
    print(a/a.sum())
    #"""
    global __PRESULTS
    global __ADRESULTS
    global __PSAMPLERESULTS
    pvals = []
    pvalcandidates = []
    i = 1
    from itertools import chain as iterchain
    samples = []
    for nn, pp in zip(n, p):
        samples.append(zero_truncated_NB(datan, nn, pp, False))
    pvs = []
    for i in range(100):
        c = Counter(testn, 0.05)
        def counter():
            perc = c.next()
            if perc:
                print("{}%".format(round(perc*100)))
        #"""
        samples = []
        for nn, pp in zip(n, p):
            samples.append(zero_truncated_NB(datan, nn, pp, False))
        #"""
        pv = anderson_darling_P(samples, False, 50, 100, 50, counter)
        #pv = anderson_darling_P(samples, False, 100, 100, 100, counter)
        print(i, ":", pv)
        pvs.append(pv)
    print(pvs)
    print(np.histogram(pvs, 20, (0, 1)))
    # the remaining tests are currently disabled by the return below
    return
    print("== Tests with binomial distribution ==")
    pvals = [anderson_darling_NB(np.random.binomial(nn, pp, datan))
             for nn, pp in zip(n2, p)]
    pvals = np.array(pvals)[~np.isnan(pvals)]
    print("mean:", np.mean(pvals))
    freq, bins = np.histogram(pvals, 20, [0, 1])
    freq = freq/np.sum(freq)
    print(np.round(freq, 2))
    print(np.round(np.cumsum(freq), 2))
    print("== Tests with multi density NB ==")
    pvals = [anderson_darling_NB(np.random.binomial(nn, pp, datan)
                                 + np.random.binomial(nn2, pp2, datan))
             for nn, pp, nn2, pp2 in zip(n, p, n2, p2)]
    pvals = np.array(pvals)[~np.isnan(pvals)]
    print("mean:", np.mean(pvals))
    freq, bins = np.histogram(pvals, 20, [0, 1])
    freq = freq/np.sum(freq)
    print(np.round(freq, 2))
    print(np.round(np.cumsum(freq), 2))
    print("== Tests with multi density Poisson ==")
    pvals = [anderson_darling_NB(np.random.poisson(nn, datan)
                                 + np.random.randint(0, 2, datan)
                                 * np.random.poisson(nn2, datan))
             for nn, pp, nn2, pp2 in zip(n, p, n2, p2)]
    pvals = np.array(pvals)[~np.isnan(pvals)]
    print("mean:", np.mean(pvals))
    freq, bins = np.histogram(pvals, 20, [0, 1])
    freq = freq/np.sum(freq)
    print(np.round(freq, 2))
    print(np.round(np.cumsum(freq), 2))
def vonmises_logpdf(x, kappa, loc, scale):
    # fast log-pdf of the von Mises distribution; equivalent to
    # scipy.stats.vonmises.logpdf (see the comparison in __main__ below)
    return (kappa * np.cos((x-loc)/scale) - np.log(2*np.pi)
            - np.log(sc.special.i0e(kappa)) - kappa) - np.log(scale)
def R2(predicted, observed):
    """Coefficient of determination of ``predicted`` w.r.t. ``observed``."""
    mean = np.mean(observed)
    return 1 - np.sum((observed-predicted)**2)/np.sum((observed-mean)**2)
if __name__ == '__main__':
    import sys
    x = np.linspace(0, 10, 10)
    print(R2(x, x+2))
    sys.exit()
    # the code below is currently skipped due to the sys.exit() above
    _test_ZTMNB()
    from simprofile import profile
    #line_profiler.LineProfiler(zero_truncated_NB)
    _test_anderson_darling_NB()
    #profile("_test_anderson_darling_NB()", globals(), locals())
    from scipy.stats import vonmises
    kappa = 300
    scale = 1/(2*np.pi)
    loc = 0.7
    a = np.linspace(0, 1, 11)
    from timeit import timeit
    lp = vonmises.logpdf
    print(timeit("lp(a, kappa, loc, scale)", number=1000,
                 globals={**globals(), **locals()}))
    print(timeit("vonmises_logpdf(a, kappa, loc, scale)", number=1000,
                 globals={**globals(), **locals()}))
    print(timeit("lp(a, kappa, loc, scale)", number=1000,
                 globals={**globals(), **locals()}))
    print(timeit("vonmises_logpdf(a, kappa, loc, scale)", number=1000,
                 globals={**globals(), **locals()}))
    b1 = vonmises.logpdf(a, kappa, loc, scale)
    b2 = vonmises_logpdf(a, kappa, loc, scale)
    print(a)
    print(b1)
    print(b2)
    print(np.abs(b1-b2))
    sys.exit()
    np.random.seed()
    x = np.arange(1, 10+1)
    d = np.random.choice(x, 25, replace=True)
    y = np.cumsum(np.ones(x.size) / x.size)
    x = np.arange(16)
    #x = np.arange(2)
    d = np.zeros(90)
    y = np.array([0.998635494757076, 0.999995651829634, 0.999999983505647,
                  0.999999999932426, 0.999999999999711, 0.999999999999999,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    #y = np.array([0.7, 0.8])
    print(anderson_darling_test_discrete(d, x, y))