This project discusses nonparametric density estimation under a particular type of sparsity condition: approximate sparsity.
Suppose $X$ is an economic variable of interest and we want to estimate its probability density $f_X$. In particular, $f_X$ has a series representation $$ f_X(x) = \sum_{j=1}^\infty \theta_j\phi_j(x),\quad \theta_j = E[\phi_j(X)] $$ where $\{\phi_j\}$ is an orthonormal basis chosen by the researcher.
Given data $\{X_i\}_{i=1}^N$, we can form the truncated series estimator $$ \hat{f}_J(x) = \sum_{j=1}^J \hat{\theta}_j\phi_j(x),\quad \hat{\theta}_j = \frac{1}{N}\sum_{i=1}^N \phi_j(X_i). $$
The main question is how to choose the series cutoff $J$, which governs the bias-variance tradeoff.
Since the true coefficients $\theta_j = E[\phi_j(X)]$ are expectations, this framework essentially amounts to estimating many expectations at once. The recent high-dimensional econometrics literature provides one method for choosing the optimal cutoff $J$ in a data-driven way, which relies on the LASSO.
In particular, the LASSO has been shown to work well when the coefficients are sparse, and in fact it continues to work when the sparsity holds only approximately. The density $f_X(x) = \sum_{j=1}^\infty\theta_j\phi_j(x)$ above serves as our running example of such an approximately sparse representation.
In my paper, I establish the complexity of the function classes characterized by the approximate sparsity condition, and demonstrate that LASSO as a selection mechanism produces the optimal estimator for densities in these classes. This has implications for both density estimation and nonparametric regression problems.
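Concretely, the selection mechanism implemented in the code below is hard thresholding of the estimated coefficients: a coefficient is kept only when it exceeds a data-driven penalty level, $$ \tilde{\theta}_j = \hat{\theta}_j\,\mathbf{1}\{|\hat{\theta}_j| \ge \lambda\}, \qquad \lambda = \sqrt{\tfrac{\log J}{N}}\;\Phi^{-1}\!\Big(1 - \tfrac{1}{4J\sqrt{\pi \log J}}\Big)\;\max_{j \le J}\hat{\sigma}_j, $$ where $\hat{\sigma}_j^2 = \frac{1}{N}\sum_{i=1}^N(\phi_j(X_i) - \hat{\theta}_j)^2$ and $\Phi^{-1}$ is the standard normal quantile function. The thresholded series is then passed through a correction step (the "p-algorithm" below) that truncates it at zero and adjusts it until it integrates to approximately one, so the final estimate is a proper density.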
Below is the Python code corresponding to the theory in my paper; the comments throughout mark the main steps.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import beta
from scipy.stats import norm # for the normal quantile function, norm.ppf
from scipy.integrate import quad # for numerical integration
import warnings
import time
from numpy.random import random
from scipy import interpolate
# cos_series function serves two purposes:
# 1. to create the target density
# 2. to generate estimators
def cos_series(x,t,z):
    total = t[0]*1
    for j in range(1,z):
        total = total + t[j]*(2**0.5)*np.cos((np.pi)*j*x)
    return total
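# quick check, which nothing below relies on: the basis used here is
# phi_0(x) = 1, phi_j(x) = sqrt(2)*cos(pi*j*x), which is orthonormal on [0,1],
# so cross terms should integrate to ~0 and squared terms to ~1.
print(quad(lambda x: 2*np.cos(np.pi*1*x)*np.cos(np.pi*2*x), 0, 1)[0]) # ~0
print(quad(lambda x: 2*np.cos(np.pi*3*x)**2, 0, 1)[0]) # ~1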
# target specifies the series coefficients for the target density
# A is the constant that we need to adjust to make sure the density we construct is positive.
A = 2
target = np.asarray([1, A*(2**(-2)), A*(3**(-2)), 0, A*(10**(-2)), 0, A*(7**(-2)), A*(8**(-2)),0,0, A*(4**(-2)), 0, A*(6**(-2)), A*(5**(-2)), A*(9**(-2))])
# create our target density
def f(x):
    return cos_series(x,target,15)
# plot this function, see if it works
x = np.linspace(0,1,10000)
y = f(x)
plt.plot(x,y)
plt.show()
# yes! it's positive and integrates to one, since the coefficient on the constant term is 1.
# part1: get the approximated inverse cdf using interpolation
def sample(g):
    x = np.linspace(0,1,100000, endpoint = True) # change this depending on the domain of your density
    y = g(x) # probability density function, pdf
    cdf_y = np.cumsum(y) # (unnormalized) cumulative distribution function, cdf
    cdf_y = cdf_y/cdf_y.max() # takes care of normalizing the cdf to 1.0
    cdf_y[0] = 0 # tale as old as time... F(0) has to be exactly 0; the cumulative-sum approximation won't guarantee that
    inverse_cdf = interpolate.interp1d(cdf_y,x) # interpolate x as a function of the cdf values, i.e. the approximate inverse cdf
    return inverse_cdf
# part2: create function that generates random sample using inverse transform sampling.
def return_samples(N):
    # let's generate some samples according to the chosen pdf, f(x)
    uniform_samples = random(int(N))
    required_samples = sample(f)(uniform_samples)
    return required_samples
# sanity check: plot to see if this actually works (seems pretty damn good :p)
x = np.linspace(0,1,10000)
y = f(x)
plt.figure()
plt.plot(x,y, color = 'r')
plt.hist(return_samples(100000),bins='auto', density = True, range=(x.min(),x.max()), color = 'g', alpha = 0.5)
plt.title("Design Density (line) and Random Sample (histogram)")
plt.ylabel("density")
plt.savefig('density_sample.pdf')
plt.show()
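# The inverse-transform machinery above can be reused for other densities supported on [0,1].
# A small illustrative sketch (a generic helper added here for illustration, not used by the
# simulation below), assuming the pdf is positive on the interior of [0,1]:
def return_samples_from(g, N):
    # draw N samples from an arbitrary pdf g on [0,1] via inverse transform sampling
    uniform_samples = random(int(N))
    return sample(g)(uniform_samples)
# e.g., draws from the Beta(2,5) density that appears (commented out) further below:
# beta_draws = return_samples_from(lambda v: beta.pdf(v, 2, 5), 1000)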
# modified/optimized p-algorithm: iteratively shift the series estimate and truncate it at zero
# until it integrates to (approximately) one. For the arguments, pass t = t_hard, z = J, c = 1 to start.
def palg_cos(t,z,c, tol = 1e-2, maxiter = 500):
    count = 0
    err = tol+1
    while count < maxiter and err >= tol:
        def f(x,t,z,c,count):
            if count < 1:
                return np.maximum(cos_series(x,t,z),0)
            return np.maximum(f(x,t,z,c, count - 1) + 1 - c,0)
        def g(x):
            return f(x,t,z,c,count)
        c1 = quad(g,0,1)[0] # numerical integration
        err = np.abs(c1-1)
        count = count + 1
        c = c1
    if count == maxiter:
        #print("Failed to converge!")
        return 0
    return lambda x: f(x,t,z,c,count)
### User Guide ###
# t : array of the series coefficients to be specified for the cosine orthonormal basis
# z : how many series terms we are considering
# c : initial value to start with; always set c = 1 to start (this part of the code can be further optimized)
# tol : the acceptable error for the p-alg numerical integration
# maxiter : how many iterations we allow before terminating for non-convergence
##################
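# minimal usage sketch with made-up coefficients (separate from the test run below):
g_test = palg_cos(np.asarray([1.0, 0.3, -0.2]), 3, 1)
if g_test != 0: # palg_cos returns 0 when it fails to converge
    print(quad(g_test, 0, 1)[0]) # should be close to 1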
# this section gives a step-by-step test run of how the estimator is constructed. Can be skipped.
# sample size
Nt = 20000
# draw random sample from beta(a,b) distribution
# data = np.random.beta(a,b,Nt)
# draw random sample from target distribution
data = return_samples(Nt)
# cutoff J: 50, 100, 200, 500
J = 200
# estimates of J series coefficients
t = []
for j in range(1,J):
    t.append(np.mean((2**0.5)*np.cos((np.pi)*j*data)))
t.insert(0,1)
t = np.asarray(t)
# approximate penalization using self-normalized sum
Z = []
for j in range(1,J):
    Z.append(np.mean(((2**0.5)*np.cos((np.pi)*j*data)-t[j])**2))
lbd = (((np.log(J))/Nt)**0.5)*(norm.ppf(1-(1/(J*4*(((np.pi)*(np.log(J)))**0.5)))))*((np.amax(Z))**0.5)
# get hard-thresholding coefficients
t_hard = np.copy(t)
t_hard[np.abs(t) <lbd] = 0
print(t_hard)
# output: only the largest coefficients survive thresholding: entries 0, 1, 2, 10, and 13
# (approx. 1, 0.498, 0.223, 0.133, 0.086); all remaining entries are set to 0.
# how long it takes for p-alg to run
start = time.time()
test = palg_cos(t_hard,J,1,tol = 1e-3,maxiter = 500)
end = time.time()
print(end - start)
# about 0.056 seconds on this run
# a = 2 # beta distribution alpha
# b = 5 # beta distribution beta
x = np.arange(0.0, 1.001, 0.001)
#y = beta.pdf(x,a,b)
y = f(x)
plt.plot(x,y, color='r', label = 'true density')
plt.plot(x, test(x), color='b', label = 'p-alg series')
plt.plot(x, cos_series(x,t,int(np.floor((4*5000)**(1/4)))), color='g', label = 'regular series')
plt.legend()
plt.ylabel('density')
plt.xlabel('value')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.title('Comparing True Density with Estimator, N=20000')
plt.show()
# so it's ok so far. I think the original series can be made even more pathological
# Simulate MISE for our estimator for the first simulation study
# sample size
Nt = 5000
# cutoff J
J1 = 200
# list of regularized series coefficients for different cutoffs
T1 = [[] for _ in range(4)] # for our estimator
T2 = [[] for _ in range(4)] # for the comparison estimator
# number of iterations for the Monte Carlo MISE
B = 1000
# get the list T1 for our estimator and T2 for naive estimator
for k in range(1,5,1):
    for i in range(B):
        data = return_samples(k*5000)
        # empty lists to start
        t1 = []
        t2 = []
        # create T1 list
        for j in range(1,J1):
            t1.append(np.mean((2**0.5)*np.cos((np.pi)*j*data)))
        t1.insert(0,1)
        t1 = np.asarray(t1)
        Z = []
        for j in range(1,J1):
            Z.append(np.mean(((2**0.5)*np.cos((np.pi)*j*data)-t1[j])**2))
        #lbd = ((k*5000)**(-0.5))*(norm.ppf(1-(alpha/(2*J1))))*(np.amax(Z)**0.5)
        lbd = (((np.log(J1))/(k*5000))**0.5)*(norm.ppf(1-(1/(J1*4*(((np.pi)*(np.log(J1)))**0.5)))))*((np.amax(Z))**0.5)
        t_hard = np.copy(t1)
        t_hard[np.abs(t1) < lbd] = 0
        T1[k-1].append(t_hard)
        # create T2 list
        J2 = int(np.floor((k*5000)**(1/4))) # np.floor gives a float object; need int()
        for j in range(1,J2):
            t2.append(np.mean((2**0.5)*np.cos((np.pi)*j*data)))
        t2.insert(0,1)
        t2 = np.asarray(t2)
        T2[k-1].append(t2)
# get the list of ISE
ISE1 = [[] for _ in range(4)] # for our estimator
ISE2 = [[] for _ in range(4)] # for comparison estimator
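# MISE is approximated by averaging the integrated squared error over the B Monte Carlo draws:
#   MISE(f_hat) ~ (1/B) * sum_b integral_0^1 (f_hat_b(x) - f(x))^2 dx,
# where each integral (ISE) below is computed numerically with quad.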
J1= 200
for k in range(1,5,1):
    for t in T1[k-1]:
        f_star = palg_cos(t,J1,1,tol=1e-3,maxiter=500)
        if f_star == 0:
            ise = np.nan
        else:
            def integrand(x):
                return (f_star(x) - f(x))**2
            ise = quad(integrand,0,1)[0]
        ISE1[k-1].append(ise)
    for t in T2[k-1]:
        J2 = int(np.floor((k*5000)**(1/4)))
        f_check = palg_cos(t,J2,1,tol=1e-3,maxiter=500)
        if f_check == 0:
            ise = np.nan
        else:
            def integrand(x):
                return (f_check(x) - f(x))**2
            ise = quad(integrand,0,1)[0]
        ISE2[k-1].append(ise)
# not sure if the warning message is of any concern at all
fig, axs = plt.subplots(2, 2)
axs[0, 0].hist(ISE1[0], bins = 20)
axs[0, 0].set_title("N=5000")
axs[1, 0].hist(ISE1[1], bins = 20)
axs[1, 0].set_title("N=10000")
axs[0, 1].hist(ISE1[2], bins = 20)
axs[0, 1].set_title("N=15000")
axs[1, 1].hist(ISE1[3], bins = 20)
axs[1, 1].set_title("N=20000")
fig.tight_layout()
plt.savefig('sim_ISE.pdf')
plt.show()
MISE1 = np.array([np.nanmean(ISE1[j]) for j in range(4)])
err1_l = MISE1 - np.array([np.nanquantile(ISE1[j], 0.05) for j in range(4)])
err1_u = np.array([np.nanquantile(ISE1[j], 0.95) for j in range(4)]) - MISE1
err1 = np.array([err1_l,err1_u])
MISE2 = np.array([np.nanmean(ISE2[j]) for j in range(4)])
err2_l = MISE2 - np.array([np.nanquantile(ISE2[j], 0.05) for j in range(4)])
err2_u = np.array([np.nanquantile(ISE2[j], 0.95) for j in range(4)]) - MISE2
err2 = np.array([err2_l,err2_u])
x = np.array([1,2,3,4])
MISE1_label = [np.round(x,4) for x in MISE1]
MISE2_label = [np.round(x,4) for x in MISE2]
plt.figure()
plt.scatter(x, MISE1, label = 'f_star', marker = "*", color = "red")
plt.scatter(x, MISE2, label = 'f_check', marker = ".", color = "blue")
plt.xlim([0.5,4.5])
plt.ylim([0,0.03])
for i, txt in enumerate(MISE1_label):
    label = f"({txt})"
    plt.annotate(label, (x[i]-0.3, MISE1[i]-0.002))
for i, txt in enumerate(MISE2_label):
    label = f"({txt})"
    plt.annotate(label, (x[i]-0.3, MISE2[i]-0.002))
plt.xlabel('Sample Size')
plt.title("Simulated MISE")
plt.legend()
plt.xticks([1,2,3,4],['N = 5000', 'N = 10000', 'N = 15000', 'N = 20000'])
plt.savefig('sim_MISE.pdf')
plt.show()