This project discusses nonparametric density estimation under a particular type of sparsity condition: approximate sparsity.
Suppose $X$ is an economic variable of interest and we want to estimate its probability density $f_X$. In particular, $f_X$ has a series representation $$ f_X(x) = \sum_{j=1}^\infty \theta_j\phi_j(x),\quad \theta_j = E[\phi_j(X)] $$ where $\{\phi_j\}$ is an orthonormal basis chosen by the researcher.
Given data $\{X_i\}_{i=1}^N$, we can form the truncated series estimator $$ \hat{f}_J(x) = \sum_{j=1}^J \hat{\theta}_j\phi_j(x),\quad \hat{\theta}_j = \frac{1}{N}\sum_{i=1}^N \phi_j(X_i). $$
The main question is how to choose the series cutoff $J$, which governs the bias-variance tradeoff.
Since the true coefficients $\theta_j = E[\phi_j(X)]$ are expectations, this framework essentially amounts to estimating many expectations at once. The recent high-dimensional econometrics literature provides one method for choosing the optimal cutoff $J$ in a data-driven way, which relies on the LASSO.
In particular, the LASSO has been shown to work well when the coefficients are sparse, and in fact it continues to work when the sparsity holds only approximately. The density $f_X(x) = \sum_{j=1}^\infty\theta_j\phi_j(x)$ above serves as our running example of such an approximately sparse representation.
In my paper, I establish the complexity of the function classes characterized by the approximate sparsity condition, and demonstrate that LASSO as a selection mechanism produces the optimal estimator for densities in these classes. This has implications for both density estimation and nonparametric regression problems.
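Concretely, the selection mechanism implemented in the code below is hard thresholding of the estimated coefficients: a coefficient is kept only when it exceeds a data-driven penalty level, $$ \tilde{\theta}_j = \hat{\theta}_j\,\mathbf{1}\{|\hat{\theta}_j| \ge \lambda\}, \qquad \lambda = \sqrt{\tfrac{\log J}{N}}\;\Phi^{-1}\!\Big(1 - \tfrac{1}{4J\sqrt{\pi \log J}}\Big)\;\max_{j \le J}\hat{\sigma}_j, $$ where $\hat{\sigma}_j^2 = \frac{1}{N}\sum_{i=1}^N(\phi_j(X_i) - \hat{\theta}_j)^2$ and $\Phi^{-1}$ is the standard normal quantile function. The thresholded series is then passed through a correction step (the "p-algorithm" below) that truncates it at zero and adjusts it until it integrates to approximately one, so the final estimate is a proper density.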
Below is the Python code corresponding to the theory in my paper; the comments throughout mark the main steps.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta
from scipy.stats import norm # for the normal quantile function, norm.ppf
from scipy.integrate import quad # for numerical integration
import warnings
import time
from numpy.random import random
from scipy import interpolate
# cos_series function serves two purposes:
# 1. to create the target density
# 2. to generate estimators
def cos_series(x,t,z):
    total = t[0]*1
    for j in range(1,z):
        total = total + t[j]*(2**0.5)*np.cos((np.pi)*j*x)
    return total
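# quick check, which nothing below relies on: the basis used here is
# phi_0(x) = 1, phi_j(x) = sqrt(2)*cos(pi*j*x), which is orthonormal on [0,1],
# so cross terms should integrate to ~0 and squared terms to ~1.
print(quad(lambda x: 2*np.cos(np.pi*1*x)*np.cos(np.pi*2*x), 0, 1)[0]) # ~0
print(quad(lambda x: 2*np.cos(np.pi*3*x)**2, 0, 1)[0]) # ~1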
# target specifies the series coefficients for the target density
# A is the constant that we need to adjust to make sure the density we construct is positive.
A = 2
target = np.asarray([1, A*(2**(-2)), A*(3**(-2)), 0, A*(10**(-2)), 0, A*(7**(-2)), A*(8**(-2)),0,0, A*(4**(-2)), 0, A*(6**(-2)), A*(5**(-2)), A*(9**(-2))])
# create our target density
def f(x):
    return cos_series(x,target,15)
# plot this function, see if it works
x = np.linspace(0,1,10000)
y = f(x)
plt.plot(x,y)
plt.show()
# yes! it's positive and integrates to one, since the coefficient on the constant term is 1.
# part1: get the approximated inverse cdf using interpolation
def sample(g):
    x = np.linspace(0,1,100000, endpoint = True) # change this depending on the domain of your density
    y = g(x) # probability density function, pdf
    cdf_y = np.cumsum(y) # (unnormalized) cumulative distribution function, cdf
    cdf_y = cdf_y/cdf_y.max() # takes care of normalizing the cdf to 1.0
    cdf_y[0] = 0 # tale as old as time... F(0) has to be exactly 0; the cumulative-sum approximation won't guarantee that
    inverse_cdf = interpolate.interp1d(cdf_y,x) # interpolate x as a function of the cdf values, i.e. the approximate inverse cdf
    return inverse_cdf
# part2: create function that generates random sample using inverse transform sampling.
def return_samples(N):
    # let's generate some samples according to the chosen pdf, f(x)
    uniform_samples = random(int(N))
    required_samples = sample(f)(uniform_samples)
    return required_samples
# sanity check: plot to see if this actually works (seems pretty damn good :p)
x = np.linspace(0,1,10000)
y = f(x)
plt.figure()
plt.plot(x,y, color = 'r')
plt.hist(return_samples(100000),bins='auto', density = True, range=(x.min(),x.max()), color = 'g', alpha = 0.5)
plt.title("Design Density (line) and Random Sample (histogram)")
plt.ylabel("density")
plt.savefig('density_sample.pdf')
plt.show()
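# The inverse-transform machinery above can be reused for other densities supported on [0,1].
# A small illustrative sketch (a generic helper added here for illustration, not used by the
# simulation below), assuming the pdf is positive on the interior of [0,1]:
def return_samples_from(g, N):
    # draw N samples from an arbitrary pdf g on [0,1] via inverse transform sampling
    uniform_samples = random(int(N))
    return sample(g)(uniform_samples)
# e.g., draws from the Beta(2,5) density that appears (commented out) further below:
# beta_draws = return_samples_from(lambda v: beta.pdf(v, 2, 5), 1000)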
# modified/optimized p-algorithm: iteratively shift the series estimate and truncate it at zero
# until it integrates to (approximately) one. For the arguments, pass t = t_hard, z = J, c = 1 to start.
def palg_cos(t,z,c, tol = 1e-2, maxiter = 500):
    count = 0
    err = tol+1
    while count < maxiter and err >= tol:
        def f(x,t,z,c,count):
            if count < 1:
                return np.maximum(cos_series(x,t,z),0)
            return np.maximum(f(x,t,z,c, count - 1) + 1 - c,0)
        def g(x):
            return f(x,t,z,c,count)
        c1 = quad(g,0,1)[0] # numerical integration
        err = np.abs(c1-1)
        count = count + 1
        c = c1
    if count == maxiter:
        #print("Failed to converge!")
        return 0
    return lambda x: f(x,t,z,c,count)
### User Guide ###
# t : array of the series coefficients to be specified for the cosine orthonormal basis
# z : how many series terms we are considering
# c : initial value to start with; always set c = 1 to start (this part of the code can be further optimized)
# tol : the acceptable error for the p-alg numerical integration
# maxiter : how many iterations we allow before terminating for non-convergence
##################
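# minimal usage sketch with made-up coefficients (separate from the test run below):
g_test = palg_cos(np.asarray([1.0, 0.3, -0.2]), 3, 1)
if g_test != 0: # palg_cos returns 0 when it fails to converge
    print(quad(g_test, 0, 1)[0]) # should be close to 1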
# this section gives a step-by-step test run of how the estimator is constructed. Can be skipped.
# sample size
Nt = 20000
# draw random sample from beta(a,b) distribution
# data = np.random.beta(a,b,Nt)
# draw random sample from target distribution
data = return_samples(Nt)
# cutoff J: 50, 100, 200, 500
J = 200
# estimates of J series coefficients
t = []
for j in range(1,J):
    t.append(np.mean((2**0.5)*np.cos((np.pi)*j*data)))
t.insert(0,1)
t = np.asarray(t)
# approximate penalization using self-normalized sum
Z = []
for j in range(1,J):
    Z.append(np.mean(((2**0.5)*np.cos((np.pi)*j*data)-t[j])**2))
lbd = (((np.log(J))/Nt)**0.5)*(norm.ppf(1-(1/(J*4*(((np.pi)*(np.log(J)))**0.5)))))*((np.amax(Z))**0.5)
# get hard-thresholding coefficients
t_hard = np.copy(t)
t_hard[np.abs(t) <lbd] = 0
print(t_hard)
# output: only the largest coefficients survive thresholding: entries 0, 1, 2, 10, and 13
# (approx. 1, 0.498, 0.223, 0.133, 0.086); all remaining entries are set to 0.
# how long it takes for p-alg to run
start = time.time()
test = palg_cos(t_hard,J,1,tol = 1e-3,maxiter = 500)
end = time.time()
print(end - start)
# about 0.056 seconds on this run
# a = 2 # beta distribution alpha
# b = 5 # beta distribution beta
x = np.arange(0.0, 1.001, 0.001)
#y = beta.pdf(x,a,b)
y = f(x)
plt.plot(x,y, color='r', label = 'true density')
plt.plot(x, test(x), color='b', label = 'p-alg series')
plt.plot(x, cos_series(x,t,int(np.floor((4*5000)**(1/4)))), color='g', label = 'regular series')
plt.legend()
plt.ylabel('density')
plt.xlabel('value')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Comparing True Density with Estimator, N=20000')
plt.show()
# so it's ok so far. I think the original series can be made even more pathological
# Simulate MISE for our estimator for the first simulation study
# sample size
Nt = 5000
# cutoff J
J1 = 200
# list of regularized series coefficients for different cutoffs
T1 = [[] for _ in range(4)] # for our estimator
T2 = [[] for _ in range(4)] # for the comparison estimator
# number of iterations for the Monte Carlo MISE
B = 1000
# get the list T1 for our estimator and T2 for naive estimator
for k in range(1,5,1):
    for i in range(B):
        data = return_samples(k*5000)
        # empty lists to start
        t1 = []
        t2 = []
        # create T1 list
        for j in range(1,J1):
            t1.append(np.mean((2**0.5)*np.cos((np.pi)*j*data)))
        t1.insert(0,1)
        t1 = np.asarray(t1)
        Z = []
        for j in range(1,J1):
            Z.append(np.mean(((2**0.5)*np.cos((np.pi)*j*data)-t1[j])**2))
        #lbd = ((k*5000)**(-0.5))*(norm.ppf(1-(alpha/(2*J1))))*(np.amax(Z)**0.5)
        lbd = (((np.log(J1))/(k*5000))**0.5)*(norm.ppf(1-(1/(J1*4*(((np.pi)*(np.log(J1)))**0.5)))))*((np.amax(Z))**0.5)
        t_hard = np.copy(t1)
        t_hard[np.abs(t1) < lbd] = 0
        T1[k-1].append(t_hard)
        # create T2 list
        J2 = int(np.floor((k*5000)**(1/4))) # np.floor gives a float object; need int()
        for j in range(1,J2):
            t2.append(np.mean((2**0.5)*np.cos((np.pi)*j*data)))
        t2.insert(0,1)
        t2 = np.asarray(t2)
        T2[k-1].append(t2)
# get the list of ISE
ISE1 = [[] for _ in range(4)] # for our estimator
ISE2 = [[] for _ in range(4)] # for comparison estimator
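# MISE is approximated by averaging the integrated squared error over the B Monte Carlo draws:
#   MISE(f_hat) ~ (1/B) * sum_b integral_0^1 (f_hat_b(x) - f(x))^2 dx,
# where each integral (ISE) below is computed numerically with quad.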
J1= 200
for k in range(1,5,1):
    for t in T1[k-1]:
        f_star = palg_cos(t,J1,1,tol=1e-3,maxiter=500)
        if f_star == 0:
            ise = np.nan
        else:
            def integrand(x):
                return (f_star(x) - f(x))**2
            ise = quad(integrand,0,1)[0]
        ISE1[k-1].append(ise)
    for t in T2[k-1]:
        J2 = int(np.floor((k*5000)**(1/4)))
        f_check = palg_cos(t,J2,1,tol=1e-3,maxiter=500)
        if f_check == 0:
            ise = np.nan
        else:
            def integrand(x):
                return (f_check(x) - f(x))**2
            ise = quad(integrand,0,1)[0]
        ISE2[k-1].append(ise)
# not sure if the warning message is of any concern at all
fig, axs = plt.subplots(2, 2)
axs[0, 0].hist(ISE1[0], bins = 20)
axs[0, 0].set_title("N=5000")
axs[1, 0].hist(ISE1[1], bins = 20)
axs[1, 0].set_title("N=10000")
axs[0, 1].hist(ISE1[2], bins = 20)
axs[0, 1].set_title("N=15000")
axs[1, 1].hist(ISE1[3], bins = 20)
axs[1, 1].set_title("N=20000")
fig.tight_layout()
plt.savefig('sim_ISE.pdf')
plt.show()
MISE1 = np.array([np.nanmean(ISE1[j]) for j in range(4)])
err1_l = MISE1 - np.array([np.nanquantile(ISE1[j], 0.05) for j in range(4)])
err1_u = np.array([np.nanquantile(ISE1[j], 0.95) for j in range(4)]) - MISE1
err1 = np.array([err1_l,err1_u])
MISE2 = np.array([np.nanmean(ISE2[j]) for j in range(4)])
err2_l = MISE2 - np.array([np.nanquantile(ISE2[j], 0.05) for j in range(4)])
err2_u = np.array([np.nanquantile(ISE2[j], 0.95) for j in range(4)]) - MISE2
err2 = np.array([err2_l,err2_u])
x = np.array([1,2,3,4])
MISE1_label = [np.round(x,4) for x in MISE1]
MISE2_label = [np.round(x,4) for x in MISE2]
plt.figure()
plt.scatter(x, MISE1, label = 'f_star', marker = "*", color = "red")
plt.scatter(x, MISE2, label = 'f_check', marker = ".", color = "blue")
plt.xlim([0.5,4.5])
plt.ylim([0,0.03])
for i, txt in enumerate(MISE1_label):
    label = f"({txt})"
    plt.annotate(label, (x[i]-0.3, MISE1[i]-0.002))
for i, txt in enumerate(MISE2_label):
    label = f"({txt})"
    plt.annotate(label, (x[i]-0.3, MISE2[i]-0.002))
plt.xlabel('Sample Size')
plt.title("Simulated MISE")
plt.legend()
plt.xticks([1,2,3,4],['N = 5000', 'N = 10000', 'N = 15000', 'N = 20000'])
plt.savefig('sim_MISE.pdf')
plt.show()