Source code for hybrid_vector_model.statsutils

'''
Created on 13.06.2017

@author: Samuel
'''

#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
import warnings
from itertools import repeat
from collections import defaultdict

import numpy as np
import scipy as sc
from scipy.special import binom
from scipy.stats import nbinom, poisson
from scipy.special import lambertw
from statsmodels.distributions.empirical_distribution import ECDF
np.random.seed()

#import line_profiler
from vemomoto_core.concurrent.nicepar import Counter
from vemomoto_core.concurrent.concurrent_futures_ext import ProcessPoolExecutor

"""
# R package names
packnames = ('dgof',)

utils = importr('utils')
utils.chooseCRANmirror(ind=1)

# R vector of strings
from rpy2.robjects.vectors import StrVector
import rpy2.robjects.packages as rpackages

# Selectively install what needs to be installed.
# We are fancy, just because we can.
names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
    utils.install_packages(StrVector(names_to_install))
#"""


def anderson_darling_test_discrete(data, modelX, modelY, simulate_p_value=False):
    """Goodness-of-fit test for discrete data via R's "dgof" package.

    Requires a working rpy2/R setup with the package "dgof" installed
    (see the commented-out setup block above).
    """
    # imported lazily so that the module can be used without an R setup
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr

    # import R's "dgof" package
    dgof = importr('dgof')

    rData = robjects.IntVector(data)

    # truncate the model CDF after the first point where it reaches 1
    first1 = np.nonzero(modelY==1)[0]
    if len(first1):
        first1 = first1[0] + 1
    else:
        first1 = len(modelX)
    modelX = modelX[:first1]
    modelY = modelY[:first1]

    rModelX = robjects.IntVector(modelX)
    rModelY = robjects.FloatVector([0] + list(modelY))

    # build a step function for the model CDF and run the test with the
    # Anderson-Darling statistic ("A2")
    rStepFun = robjects.r['stepfun']
    rCVMTest = robjects.r['cvm.test']
    rCMF = rStepFun(rModelX, rModelY)
    testResult = rCVMTest(rData, rCMF, "A2", simulate_p_value)
    return testResult.rx("p.value")[0][0]
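
# A minimal usage sketch for the test above (not part of the original module;
# it assumes a working rpy2/R installation with "dgof" available): the model
# is a fair six-sided die, and the data are drawn from that same model, so
# large p-values are expected.
def _example_anderson_darling_test_discrete():
    x = np.arange(1, 7)                    # support of the discrete model
    cdf = np.cumsum(np.full(6, 1/6))       # model CDF at the support points
    sample = np.random.choice(x, size=50)  # data drawn from the model
    pValue = anderson_darling_test_discrete(sample, x, cdf)
    print("p-value:", pValue)
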
#@profile
def zero_truncated_NB(size, n, p, poissonLimit=False, quantile=0.999,
                      MHSteps=100):
    """
    Returns a sample of size ``size`` from the negative binomial distribution
    with parameters ``n``, ``p`` under the condition that at least one element
    in the sample is nonzero.

    ``MHSteps`` denotes the number of Metropolis-Hastings iterations used to
    distribute the total count over the sample when rejection sampling is
    inefficient.
    """
    if p == 1:
        poissonLimit = True

    # if obtaining a random sample with total count 0 is sufficiently
    # unlikely, sample until a suitable sample is found.
    if poissonLimit:
        zeroP = np.exp(-size*n)
    else:
        zeroP = p**(size*n)
    if zeroP < 0.7:
        if poissonLimit:
            while True:
                result = np.random.poisson(n, size)
                if result.any():
                    return result
        else:
            while True:
                result = np.random.negative_binomial(n, p, size)
                if result.any():
                    return result

    # pmf of the truncated negative binomial for the total count
    q = min(quantile*(1-zeroP)+zeroP, 0.999999)
    if poissonLimit:
        maxbin = poisson.ppf(q, size*n)
    else:
        dist = nbinom(n, p)
        maxbin = nbinom.ppf(q, size*n, p)
    maxbin = int(max(maxbin, 5))
    x = np.arange(1, maxbin+1)
    if poissonLimit:
        trunc_pmf = poisson.pmf(x, size*n)
    else:
        trunc_pmf = nbinom.pmf(x, size*n, p)
    trunc_pmf /= np.sum(trunc_pmf)

    # sampling the total count value
    totalCount = np.random.choice(x, p=trunc_pmf)

    if poissonLimit:
        # conditional on the total count, Poisson counts are multinomial
        return np.random.multinomial(totalCount, np.full(size, 1/size))
    elif totalCount == 1:
        # if only one observation has been made, it does not matter where
        # it is placed
        result = np.zeros(size)
        result[0] = 1
        return result
    elif totalCount == 2:
        # if two observations have been made, we have to decide whether they
        # occurred in the same sample or in distinct samples.
        # when computing the joint probabilities of the possible events,
        # factors that appear in all probabilities are neglected.
        # p11 = (size choose 2) * pmf(1)**2
        p11 = (size-1)/2 * dist.pmf(1)**2
        # p20 = size * pmf(2) * pmf(0)
        p20 = dist.pmf(2) * dist.pmf(0)
        norm = p11 + p20
        p11 /= norm
        p20 /= norm
        result = np.zeros(size)
        if np.random.choice([True, False], p=[p11, p20]):
            result[:2] = 1
        else:
            result[0] = 2
        return result
    elif totalCount == 3:
        # analogous case distinction for three observations
        # p111 = (size choose 3) * pmf(1, n, p)**3
        p111 = (size-1)*(size-2)/6 * dist.pmf(1)**3
        p210 = (size-1) * dist.pmf(2)*dist.pmf(1)*dist.pmf(0)
        p300 = dist.pmf(3)*dist.pmf(0)**2
        ps = np.array([p111, p210, p300])
        ps /= np.sum(ps)
        result = np.zeros(size)
        choice = np.random.choice(np.arange(3), p=ps)
        if choice == 0:
            result[:3] = 1
        elif choice == 1:
            result[0] = 2
            result[1] = 1
        elif choice == 2:
            result[0] = 3
        return result
    else:
        # for larger total counts, distribute the observations over the
        # sample via Metropolis-Hastings
        return _dist_bins_MH(size, totalCount, dist, MHSteps)
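
# Minimal usage sketch (not part of the original module; the parameter values
# are illustrative): draws 1000 zero-truncated samples and checks that each
# one contains at least one nonzero count, as guaranteed by construction.
def _example_zero_truncated_NB():
    samples = [zero_truncated_NB(30, n=0.5, p=0.8) for _ in range(1000)]
    assert all(s.sum() >= 1 for s in samples)
    print("mean total count:", np.mean([s.sum() for s in samples]))
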
def _dist_bins_MH(size, totalCount, dist, steps=100):
    # initialize
    if dist.mean() / dist.var() > 0.5:
        # if overdispersion is low, generate the initial sample under an
        # independence assumption
        counts = np.random.multinomial(totalCount, np.full(size, 1/size))
        nonzero = list(np.nonzero(counts)[0])
    else:
        # if overdispersion is high, generate the initial sample with one
        # big heap
        counts = np.zeros(size)
        counts[0] = totalCount
        nonzero = [0]

    for i in range(steps):
        # pick a pair of indices between which one count unit is moved
        swapFrom = np.random.choice(nonzero)
        swapTo = np.random.randint(0, size)
        if swapFrom == swapTo:
            continue
        swappedCounts = counts.copy()
        swappedCounts[swapFrom] -= 1
        swappedCounts[swapTo] += 1

        # Metropolis-Hastings acceptance step
        changeP = dist.logpmf(swappedCounts).sum() - dist.logpmf(counts).sum()
        if changeP >= 0 or changeP >= np.log(np.random.rand()):
            if counts[swapTo] == 0:
                nonzero.append(swapTo)
            if counts[swapFrom] == 1:
                nonzero.remove(swapFrom)
            counts = swappedCounts
    return counts


def __anderson_darling_statistic_P(data, fitData):
    # Anderson-Darling statistic of `data` against the empirical
    # distribution of `fitData`
    cmfData = ECDF(data)
    x, pmf = np.unique(fitData, return_counts=True)
    pmf = (pmf / pmf.sum())[:-1]
    cmf = pmf.cumsum()
    return data.size * np.sum(np.square(cmfData(x)[:-1]-cmf)
                              * pmf / (cmf * (1-cmf)))


# test statistic for the Anderson-Darling test
def __anderson_darling_statistic_NB(data, parameters=None,
                                    getParamEstimates=False,
                                    poissonLimit=False):
    isPDFData = hasattr(data[0], "__iter__")
    if isPDFData:
        x, counts = data
        size = np.sum(counts)
        maxx = x[-1]
    else:
        maxx = np.max(data)
        size = data.size

    if parameters is None:
        # observed mean and variance and other base measures
        if isPDFData:
            mean = np.sum(x*counts)/size
            var = np.sum(np.square(x-mean)*counts)/(size-1)
        else:
            mean = np.mean(data)
            var = np.var(data, ddof=1)
        mean = max(mean, 1e-10)
        var = max(var, 1e-10)

        # method of moments parameter estimates
        if not poissonLimit:
            #p = mean / var <- this does not work if we neglect zeros
            optf = lambda pp: (mean/pp - var
                               - mean**2 * pp**(size*(pp*(mean**2+var)-mean)
                                                / ((1-pp)*mean)))
            left = mean/(var+mean**2)
            if left >= 1:
                poissonLimit = True
            else:
                # bisection search for the root of optf
                right = 1
                err = 1e-5
                while right-left > err:
                    p = (right+left)/2
                    if optf(p) > 0:
                        left = p
                    else:
                        right = p
                p = (right+left)/2
                poissonLimit = right == 1
        if not poissonLimit:
            n = (p*(mean**2+var)-mean)/((1-p)*mean)
        else:
            if mean*size <= 1:
                if getParamEstimates:
                    return 0, 0, 1
                else:
                    return 0
            p = 1
            n = lambertw(-np.exp(-mean*size)*mean*size).real/size + mean
    else:
        n, p = parameters

    if not isPDFData:
        x = np.arange(maxx+1)
        # empirical cmf
        ecmf = ECDF(data)(x)
    else:
        ecmf = np.cumsum(counts)/size

    # pmf and cmf of the zero-truncated fitted distribution
    if not poissonLimit:
        pmf = binom(x+(n-1), (n-1))*p**n*np.power(1-p, x)  #nbinom.pmf(x, n, p)
        pmf[0] -= p**(size*n)
        pmf /= (1-p**(size*n))
    else:
        pmf = poisson.pmf(x, n)
        pmf[0] -= np.exp(-size*n)
        pmf /= (1-np.exp(-size*n))
    cmf = pmf.cumsum()

    if cmf[-1] > 1:
        # in case of significant numerical errors that lead to a
        # probability > 1: normalize
        pmf /= cmf[-1]
        cmf /= cmf[-1]

    # Anderson-Darling statistic
    T = size * np.sum(np.square(ecmf-cmf) * pmf / (cmf * (1-cmf)))

    if getParamEstimates:
        return (T, n, p)
    else:
        return T


# caches for test statistics and p-values, shared between calls
__ADRESULTS = {}
__PRESULTS = {}
__PSAMPLERESULTS = {}
MAXSIZE = 500000
RESULTSAVER = [__ADRESULTS, __PRESULTS, __PSAMPLERESULTS, MAXSIZE]
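
# Minimal sketch illustrating the Metropolis-Hastings helper above (not part
# of the original module; the parameter values are illustrative): the total
# count is preserved exactly, only its distribution over the bins changes.
def _example_dist_bins_MH():
    dist = nbinom(0.5, 0.8)
    counts = _dist_bins_MH(size=30, totalCount=12, dist=dist, steps=200)
    assert counts.sum() == 12
    print("nonzero bins:", np.count_nonzero(counts))
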
def anderson_darling_NB(data, parameters=None, poissonLimit=False,
                        pSampleSize=0, bootstrapN=400, bootstrapN_P=0,
                        MHSteps=100, usePreviousPVals=True,
                        usePreviousPSamples=False, resultSaver=None):
    """
    Bootstrapped Anderson-Darling test of ``data`` against a zero-truncated
    negative binomial (or Poisson) distribution.

    If ``parameters`` is ``None``, the distribution parameters are estimated
    from the data. Returns the bootstrapped p-value; if ``pSampleSize`` is
    positive, also the p-values of the bootstrap samples, and if
    ``bootstrapN_P`` is positive, additionally the p-value samples of the
    second bootstrap level. Results are cached in ``resultSaver``, which
    defaults to the module-level ``RESULTSAVER``.
    """
    if resultSaver is None:
        resultSaver = RESULTSAVER
    __ADRESULTS, __PRESULTS, __PSAMPLERESULTS, MAXSIZE = resultSaver

    pSampleSize2 = max(pSampleSize, bootstrapN_P)

    if parameters is None and np.sum(data) <= 1:
        if pSampleSize2:
            if bootstrapN_P:
                return (1, np.ones(pSampleSize2),
                        np.ones((bootstrapN_P, pSampleSize)))
            else:
                return 1, np.ones(pSampleSize2)
        else:
            return 1

    if parameters is not None:
        parameters = tuple(parameters)

    obs, counts = np.unique(data, return_counts=True)
    dataHash = hash((tuple(obs), tuple(counts), poissonLimit, parameters))
    if not bootstrapN_P:
        if usePreviousPVals:
            result = __PRESULTS.get(dataHash, None)
            if result is not None:
                if not pSampleSize:
                    return result
    else:
        # reseed so that parallel worker processes produce distinct samples
        np.random.seed()

    # base statistic
    T, n, p = __anderson_darling_statistic_NB((obs, counts), parameters,
                                              True, poissonLimit)
    parameters2 = (n, p)
    poissonLimit2 = poissonLimit or p == 1

    # bootstrapping
    pVal = 0
    if pSampleSize2:
        if bootstrapN_P:
            pSamples = np.zeros((bootstrapN_P, pSampleSize))
        pValues = np.zeros(pSampleSize2)
        bootstrapN = max(bootstrapN, pSampleSize, bootstrapN_P)
    for i in range(bootstrapN):
        row = zero_truncated_NB(data.size, n, p, poissonLimit2,
                                MHSteps=MHSteps)
        robs, rcounts = np.unique(row, return_counts=True)
        rowHash = hash((tuple(robs), tuple(rcounts), poissonLimit2,
                        parameters))
        Tb = __ADRESULTS.get(rowHash, None)
        if Tb is None:
            Tb = __anderson_darling_statistic_NB((robs, rcounts), parameters2,
                                                 poissonLimit=poissonLimit2)
            __ADRESULTS[rowHash] = Tb
        if Tb >= T:
            pVal += 1
        if (i < bootstrapN_P) or (i < pSampleSize):
            if usePreviousPVals and (poissonLimit != poissonLimit2):
                rowHash = hash((tuple(robs), tuple(rcounts), poissonLimit,
                                parameters))
            pVal2 = pSample = None
            if i < bootstrapN_P:
                if usePreviousPSamples:
                    pSample = __PSAMPLERESULTS.get(rowHash, None)
                if pSample is None:
                    pVal2, pSample = anderson_darling_NB(
                        row, parameters, poissonLimit, pSampleSize,
                        bootstrapN, 0, MHSteps, usePreviousPVals,
                        usePreviousPSamples, resultSaver=resultSaver)
                pSamples[i] = pSample
            if i < pSampleSize2:
                if pVal2 is None and usePreviousPSamples:
                    pVal2 = __PRESULTS.get(rowHash, None)
                if pVal2 is None:
                    pVal2 = anderson_darling_NB(
                        row, parameters, poissonLimit, 0, bootstrapN, 0,
                        MHSteps, usePreviousPVals, usePreviousPSamples,
                        resultSaver=resultSaver)
                pValues[i] = pVal2

    pVal /= bootstrapN

    # prevent the caches from growing indefinitely
    if len(__ADRESULTS) > MAXSIZE:
        __ADRESULTS.clear()
    if len(__PRESULTS) > MAXSIZE:
        __PRESULTS.clear()
    if len(__PSAMPLERESULTS) > MAXSIZE:
        __PSAMPLERESULTS.clear()

    __PRESULTS[dataHash] = pVal
    if pSampleSize:
        __PSAMPLERESULTS[dataHash] = pValues
        if bootstrapN_P:
            return pVal, pValues, pSamples
        return pVal, pValues
    else:
        return pVal
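
# Minimal usage sketch (not part of the original module; the parameter values
# are illustrative): test whether a zero-truncated NB sample is consistent
# with a zero-truncated negative binomial model with estimated parameters.
def _example_anderson_darling_NB():
    sample = zero_truncated_NB(50, n=0.5, p=0.8)
    pVal = anderson_darling_NB(sample, bootstrapN=200)
    print("bootstrapped p-value:", pVal)
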
def anderson_darling_P(data, poissonLimit, pSampleSize, bootstrapN,
                       bootstrapN_P, counter=None):
    """
    Combined Anderson-Darling test over multiple data sets: the p-values of
    the individual tests are themselves tested against the distribution of
    p-values obtained from the bootstrap samples.
    """
    global __ADRESULTS
    global __PRESULTS
    __ADRESULTS.clear()
    __PRESULTS.clear()

    # run the individual tests in parallel
    p1 = []
    p2 = []
    p3 = []
    with ProcessPoolExecutor() as pool:
        testRes = pool.map(anderson_darling_NB, data, repeat(None),
                           repeat(poissonLimit), repeat(pSampleSize),
                           repeat(bootstrapN), repeat(bootstrapN_P),
                           repeat(100), repeat(True), repeat(False),
                           chunksize=1)
        for pVal, pValues, pSamples in testRes:
            p1.append(pVal)
            p2.append(pValues)
            p3.append(pSamples)
            if counter is not None:
                counter()
    p1 = np.array(p1)
    p2 = np.array(p2)
    p3 = np.array(p3)

    # base statistic of the observed p-values against the pooled bootstrap
    # p-values
    T = __anderson_darling_statistic_P(p1, p2[:, :pSampleSize].ravel())

    # second-level bootstrap of the combined statistic
    ts = []
    pVal = 0
    for i in range(bootstrapN_P):
        Tb = __anderson_darling_statistic_P(p2[:, i], p3[:, i].ravel())
        if Tb >= T:
            pVal += 1
        ts.append(Tb)
    pVal /= bootstrapN_P
    return pVal
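
# Minimal usage sketch (not part of the original module; the parameter values
# are illustrative). Note that this spawns worker processes; assuming the
# vemomoto_core pool behaves like concurrent.futures', it should be run from
# a `if __name__ == '__main__'` guard on platforms that spawn processes.
def _example_anderson_darling_P():
    samples = [zero_truncated_NB(30, n=0.5, p=0.8) for _ in range(20)]
    pVal = anderson_darling_P(samples, poissonLimit=False, pSampleSize=50,
                              bootstrapN=100, bootstrapN_P=50)
    print("combined p-value:", pVal)
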
def _test_ZTMNB():
    n, p = 8385.273718025031, 0.9999971718211882  # alternative: 1, 0.9
    samplesize, testn = 30, 1000
    a = np.array([zero_truncated_NB(samplesize, n, p) for _ in range(testn)])

    # expected moments of the zero-truncated distribution
    meanSumExp = samplesize*n*(1-p)/(p*(1-p**(n*samplesize)))
    varSumExp = meanSumExp/p - p**(n*samplesize)*meanSumExp**2
    meanExp = n*(1-p)/(p*(1-p**(n*samplesize)))
    varExp = meanExp/p - p**(n*samplesize)*meanExp**2

    # observed moments
    sums = a.sum(1)
    meanSum = np.mean(sums)
    varSum = np.var(sums)
    var = np.mean(np.var(a, 1, ddof=1))
    mean = np.mean(a)

    print(meanSumExp, meanSum)
    print(varSumExp, varSum)
    print(meanExp, mean)
    print(varExp, var)

    # method of moments estimates via bisection
    # (cf. __anderson_darling_statistic_NB); alternative formulations from
    # earlier scratch work:
    # optf = lambda pp: m-s*pp-m**2*pp**(pp/(1-pp)*(m**2+s-m)/m)
    # optf = lambda pp: m-s*pp-m**2*pp**(pp/(1-pp)*(N*(m**2+s)-m)/m)
    m = mean
    s = var
    N = samplesize
    optf = lambda pp: m/pp - s - m**2*pp**(N*(pp*(m**2+s)-m)/((1-pp)*m))
    left = mean/(var+mean**2)
    right = 1
    err = 1e-7
    while right-left > err:
        new = (right+left)/2
        if optf(new) > 0:
            left = new
        else:
            right = new
    pp = (right+left)/2
    nn = (pp*(m**2+s)-m)/((1-pp)*m)

    # naive estimates ignoring the zero truncation, for comparison
    pold = mean/var
    nold = mean*pold/(1-pold)
    print("Estimate n", nn, n, nold)
    print("Estimate p", pp, p, pold)

    # leftover scratch work exploring the moment equations:
    # m = meanExp; s = varExp
    # m*(1-p**(n*N))*p/(N*(1-p));  m - p**(n*N+1)*m**2 - s*p
    # (p*(m**2+s)-m)/(N*(1-p)*m)
    # m - s*p - m**2*p**(p/(1-p)*(m**2+s-m)/m)
    # ppp = np.linspace(0, 1, 1000); nnn = np.linspace(0, 3, 1000)
    # y = m - s*ppp - m**2*ppp**(ppp/(1-ppp)*(m**2+s-m)/m)
    # y = nnn*(1-pp)/(pp*(1-pp**(nnn*samplesize))) - mean
    print("done")


def _test_anderson_darling_NB():
    testn = 50
    datan = 50
    np.random.seed(2)
    print("Starting tests with testn={} and datan={}".format(testn, datan))

    # random test parameters (the later assignments supersede the earlier
    # ones; the alternatives are kept for experimentation)
    p = np.full(testn, 0.8)
    n = np.full(testn, 0.5)
    p = np.random.rand(testn)/3 + 0.05
    n = np.random.exponential(0.5, size=testn)
    n = np.random.negative_binomial(1, 0.2, size=testn) / 20 + 0.01
    p2 = np.random.rand(testn)
    n2 = np.random.poisson(1, size=testn) + 1

    print("== Tests with NB ==")
    samples = []
    for nn, pp in zip(n, p):
        samples.append(zero_truncated_NB(datan, nn, pp, False))

    pvs = []
    for i in range(100):
        c = Counter(testn, 0.05)
        def counter():
            perc = c.next()
            if perc:
                print("{}%".format(round(perc*100)))
        samples = []
        for nn, pp in zip(n, p):
            samples.append(zero_truncated_NB(datan, nn, pp, False))
        pv = anderson_darling_P(samples, False, 50, 100, 50, counter)
        #pv = anderson_darling_P(samples, False, 100, 100, 100, counter)
        print(i, ":", pv)
        pvs.append(pv)
    print(pvs)
    print(np.histogram(pvs, 20, (0, 1)))
    return

    # the tests below are currently disabled by the early return above
    print("== Tests with binomial distribution ==")
    pvals = [anderson_darling_NB(np.random.binomial(nn, pp, datan))
             for nn, pp in zip(n2, p)]
    pvals = np.array(pvals)[~np.isnan(pvals)]
    print("mean:", np.mean(pvals))
    freq, bins = np.histogram(pvals, 20, [0, 1])
    freq = freq/np.sum(freq)
    print(np.round(freq, 2))
    print(np.round(np.cumsum(freq), 2))

    print("== Tests with multi density NB ==")
    pvals = [anderson_darling_NB(np.random.binomial(nn, pp, datan)
                                 + np.random.binomial(nn2, pp2, datan))
             for nn, pp, nn2, pp2 in zip(n, p, n2, p2)]
    pvals = np.array(pvals)[~np.isnan(pvals)]
    print("mean:", np.mean(pvals))
    freq, bins = np.histogram(pvals, 20, [0, 1])
    freq = freq/np.sum(freq)
    print(np.round(freq, 2))
    print(np.round(np.cumsum(freq), 2))

    print("== Tests with multi density Poisson ==")
    pvals = [anderson_darling_NB(np.random.poisson(nn, datan)
                                 + np.random.randint(0, 2, datan)
                                 * np.random.poisson(nn2, datan))
             for nn, pp, nn2, pp2 in zip(n, p, n2, p2)]
    pvals = np.array(pvals)[~np.isnan(pvals)]
    print("mean:", np.mean(pvals))
    freq, bins = np.histogram(pvals, 20, [0, 1])
    freq = freq/np.sum(freq)
    print(np.round(freq, 2))
    print(np.round(np.cumsum(freq), 2))
def vonmises_logpdf(x, kappa, loc, scale):
    """Numerically stable log-pdf of the von Mises distribution.

    Uses the exponentially scaled Bessel function ``i0e`` to avoid overflow
    for large ``kappa``.
    """
    return (kappa * np.cos((x-loc)/scale) - np.log(2*np.pi)
            - np.log(sc.special.i0e(kappa)) - kappa) - np.log(scale)
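
# Minimal check sketch (not part of the original module; the parameter values
# are illustrative, mirroring the comparison in the __main__ block below):
# the result should agree with scipy's reference implementation.
def _example_vonmises_logpdf():
    from scipy.stats import vonmises
    x = np.linspace(0, 1, 11)
    assert np.allclose(vonmises_logpdf(x, 300, 0.7, 1/(2*np.pi)),
                       vonmises.logpdf(x, 300, 0.7, 1/(2*np.pi)))
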
def R2(predicted, observed):
    """Coefficient of determination of ``predicted`` against ``observed``."""
    mean = np.mean(observed)
    return 1 - np.sum((observed-predicted)**2)/np.sum((observed-mean)**2)
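
# Minimal usage sketch (not part of the original module): R2 is 1 for a
# perfect prediction and decreases with growing prediction error.
def _example_R2():
    observed = np.array([1.0, 2.0, 3.0, 4.0])
    print(R2(observed, observed))        # 1.0
    print(R2(observed + 0.5, observed))  # 0.8
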
if __name__ == '__main__':
    import sys

    x = np.linspace(0, 10, 10)
    print(R2(x, x+2))
    sys.exit()

    _test_ZTMNB()
    _test_anderson_darling_NB()

    # timing comparison of vonmises_logpdf against scipy's implementation
    from scipy.stats import vonmises
    from timeit import timeit
    kappa = 300
    scale = 1/(2*np.pi)
    loc = 0.7
    a = np.linspace(0, 1, 11)
    lp = vonmises.logpdf
    print(timeit("lp(a, kappa, loc, scale)", number=1000,
                 globals={**globals(), **locals()}))
    print(timeit("vonmises_logpdf(a, kappa, loc, scale)", number=1000,
                 globals={**globals(), **locals()}))
    print(timeit("lp(a, kappa, loc, scale)", number=1000,
                 globals={**globals(), **locals()}))
    print(timeit("vonmises_logpdf(a, kappa, loc, scale)", number=1000,
                 globals={**globals(), **locals()}))
    b1 = vonmises.logpdf(a, kappa, loc, scale)
    b2 = vonmises_logpdf(a, kappa, loc, scale)
    print(a)
    print(b1)
    print(b2)
    print(np.abs(b1-b2))
    sys.exit()

    # test of the discrete Anderson-Darling test
    np.random.seed()
    x = np.arange(1, 10+1)
    d = np.random.choice(x, 25, replace=True)
    y = np.cumsum(np.ones(x.size) / x.size)
    x = np.arange(16)
    d = np.zeros(90)
    y = np.array([0.998635494757076, 0.999995651829634, 0.999999983505647,
                  0.999999999932426, 0.999999999999711, 0.999999999999999,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
    print(anderson_darling_test_discrete(d, x, y))