This is the coding illustration of the first part of my job market paper.
I propose a conditional density estimator that is built on an orthonormal series representation, so that estimating a conditional density reduces to estimating conditional means with off-the-shelf machine learning methods.
Two questions that often come up: why do we need to estimate a conditional density in the first place, and what is the difference between this estimator and Izbicki and Lee (2017)?
Note: If you want to dive deep into the theory, refer to my job market paper for more details.
Suppose we want to learn the likelihood of some variable $Y$ given covariates $X$, i.e. the conditional density $f_{Y|X}$.
Assuming you have some statistics background, one way of estimating the conditional density relies on the following identity/definition:
$$ f_{Y|X}(y|x) = \frac{f_{Y,X}(y,x)}{f_{X}(x)},$$where $f_{Y,X}$ and $f_X$ denote the joint density of $(Y,X)$ and the marginal density of $X$, respectively. One popular approach uses kernel methods to estimate $f_{Y,X}$ and $f_X$ separately and then forms the ratio. However, for $X$ of even moderate dimension, the curse of dimensionality kicks in quickly.
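For concreteness, here is a minimal sketch of the ratio approach with product Gaussian kernels. This is the baseline being compared against, not the estimator proposed in the paper; the function kernel_cde, the fixed bandwidth h, and the toy data are purely illustrative assumptions.
import numpy as np
# kernel ratio estimator: f_hat(y|x0) = f_hat_{Y,X}(y, x0) / f_hat_X(x0)
def kernel_cde(y_grid, x0, Y, X, h=0.1):
    # Gaussian kernel weight of each X_i around the conditioning point x0
    wx = np.exp(-0.5 * np.sum(((X - x0) / h) ** 2, axis=1))
    # Gaussian kernel in the Y direction, one row per grid point
    ky = np.exp(-0.5 * ((y_grid[:, None] - Y[None, :]) / h) ** 2) / (h * np.sqrt(2 * np.pi))
    # the X-kernel normalizing constants cancel between numerator and denominator
    return (ky * wx[None, :]).mean(axis=1) / wx.mean()
# toy usage: 500 observations, 5 covariates; as the dimension of X grows,
# the weights wx concentrate on very few observations (the curse of dimensionality)
rng = np.random.default_rng(0)
Xtoy = rng.uniform(size=(500, 5))
Ytoy = rng.beta(2, 5, size=500)
dens = kernel_cde(np.linspace(0, 1, 101), Xtoy[0], Ytoy, Xtoy)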
In contrast, my method relies on the following representation
$$ f_{Y|X}(y|x) = \sum_{j=1}^\infty E[\phi_j(Y)|X=x]\,\phi_j(y), $$where $\{\phi_j\}_{j=1}^\infty$ is a set of known functions chosen by the researcher that form an orthonormal basis.
You may wonder: this is a rather daunting expression that looks more complicated than before, so how does it help us?
The key insight is that this expression effectively converts the problem of estimating a conditional density into the problem of estimating many conditional means, each of which can be estimated with any state-of-the-art machine learning estimator.
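To see where the representation comes from (under square-integrability conditions that are spelled out in the paper), note that for each fixed $x$ we can expand $f_{Y|X}(\cdot|x)$ in the orthonormal basis $\{\phi_j\}$, and its Fourier coefficients are exactly conditional means:
$$ \int f_{Y|X}(y|x)\,\phi_j(y)\,dy = E[\phi_j(Y)\mid X=x]. $$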
A short comment on the basis:
The series representation of the conditional density relies on the basis functions $\{\phi_j\}_{j=1}^\infty$. In general, depending on the range of $Y$, there are many different choices of bases to use.
In the example below, we assume $Y$ is bounded and takes values in $[0,1]$. We consider a very simple orthonormal basis, the cosine basis: $$ \phi_1(y) = 1;\quad \phi_j(y) = \sqrt{2}\cos(\pi(j-1)y),\quad j=2,3,\cdots$$
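As a quick standalone check (separate from the estimator code further below), we can code up this basis and verify orthonormality numerically:
import numpy as np
from scipy import integrate
# cosine basis on [0,1]: phi_1(y) = 1, phi_j(y) = sqrt(2)*cos(pi*(j-1)*y) for j >= 2
def phi(j, y):
    return 1.0 if j == 1 else np.sqrt(2) * np.cos(np.pi * (j - 1) * y)
# the integral of phi_j * phi_k over [0,1] should be 1 when j == k and 0 otherwise
for j, k in [(1, 1), (2, 2), (2, 3)]:
    val, _ = integrate.quad(lambda y: phi(j, y) * phi(k, y), 0, 1)
    print(j, k, round(val, 6))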
My estimator takes the following form (we use the hat notation $\hat{\cdot}$ to denote estimators):
$$ \hat{f}_J(y|x) = \sum_{j=1}^{J}\hat{E}[\phi_j(Y)|X=x]\,\phi_j(y), $$and there are two main tasks:
for $j=1,\cdots, J$, we need to estimate the conditional means $E[\phi_j(Y)|X]$ (see the short sketch after these two points);
we also need a data-driven way of choosing the optimal series cutoff $J$.
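A minimal sketch of the first task, using toy data and a random forest chosen purely as an example regressor; any machine learning estimator of a conditional mean could be plugged in, and the full implementation appears further below. The second task, choosing $J$, is handled by the cross-validation described next.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
# toy data: Y takes values in [0,1], a handful of covariates (shapes are illustrative only)
rng = np.random.default_rng(0)
Xtoy = rng.uniform(size=(500, 5))
ytoy = rng.beta(2, 5, size=500)
J = 6
# phi_1 = 1, so E[phi_1(Y)|X] = 1 and needs no model; fit one regressor for each j = 2,...,J
models = [RandomForestRegressor(random_state=0).fit(Xtoy, np.sqrt(2) * np.cos(np.pi * (j - 1) * ytoy))
          for j in range(2, J + 1)]
# plug the fitted conditional means into the truncated series at a point (y0, x0)
x0, y0 = Xtoy[:1], 0.3
fhat = 1.0 + sum(m.predict(x0)[0] * np.sqrt(2) * np.cos(np.pi * (j - 1) * y0)
                 for j, m in zip(range(2, J + 1), models))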
For illustration purposes, let's consider 2-fold CV; the procedure generalizes easily to $K$-fold CV.
step 1: split the data into training and validating samples;
step 2: for a large $p$, use the training sample to train $\hat{f}_J$ for all $J=1,\cdots, p$;
step 3: use the validating sample to compute the empirical risk;
step 4: swap the training and validating samples and repeat steps 2-3;
step 5: find the optimal cutoff $\hat{J}$ that minimizes the averaged empirical risk;
step 6: average the trained $\hat{f}_{\hat{J}}$ across the folds to obtain the final estimator $\bar{f}$.
A key contribution of my paper is showing that this estimator $\bar{f}$ is in fact optimal (asymptotically equivalent to $\hat{f}_{J^*}$ with the best possible cutoff $J^*$)!
Note: See below for a short discussion on the empirical risk we use in the cross-validation.
Moreover, unlike the usual prediction problem, where we can compare predicted values against actual values, here the object we are trying to estimate is the unknown conditional density. So how do we do cross-validation in this setting?
Suppose that we are trying to find $\hat{f}$ that minimizes the integrated squared error:
$$ E_X\left[\int \big(\hat{f}(y,X) - f_{Y|X}(y|X)\big)^2 dy\right]$$which is equivalent to minimizing
$$ R(\hat{f}) := E_X\left[\int \big(\hat{f}(y,X) - f_{Y|X}(y|X)\big)^2 dy\right] - E_X\left[\int f_{Y|X}^2(y|X)\,dy\right]$$This is not something that we can work with directly, since $f_{Y|X}$ is unknown.
In the paper, we show a very convenient fact: $R(\hat{f})$ can actually be written as an expression without the unknown $f_{Y|X}$:
$$ R(\hat{f}) = E\left[\int \hat{f}^2(y,X)\, dy - 2\hat{f}(Y,X)\right] $$(expand the square in $R(\hat{f})$ and apply iterated expectations to the cross term: $E_X[\int \hat{f}(y,X) f_{Y|X}(y|X)\,dy] = E[\hat{f}(Y,X)]$), and we can work with the sample analogue
$$ R_n(\hat{f}) = \frac{1}{n}\sum_{i=1}^n \left[\int \hat{f}^2(y,X_i)\, dy - 2\hat{f}(Y_i,X_i)\right]. $$In our case with $\hat{f}_J$, this expression simplifies even further to
$$ R_n(\hat{f}_J) = \frac{1}{n}\sum_{i=1}^n \left[\sum_{j=1}^J \big(\hat{E}[\phi_j(Y)|X_i]\big)^2 - 2\hat{f}_J(Y_i,X_i)\right], $$which avoids integration altogether (a consequence of the orthonormal basis).
This is the empirical risk that we are going to use in our cross-validation procedure.
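For completeness, here is why the integral disappears: by orthonormality, $\int_0^1 \phi_j(y)\phi_k(y)\,dy = 1\{j=k\}$, so
$$ \int \hat{f}_J^2(y,X_i)\,dy = \int \Big(\sum_{j=1}^J \hat{E}[\phi_j(Y)|X_i]\,\phi_j(y)\Big)^2 dy = \sum_{j=1}^J \big(\hat{E}[\phi_j(Y)|X_i]\big)^2. $$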
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.linear_model import Lasso, LassoCV, LogisticRegression, LogisticRegressionCV
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import integrate
# part 1: define the score function (average empirical risk in cross-validation)
# x is the data (Y in the first column, covariates X in the remaining columns)
# xs is the KFold split index
# J is the series cutoff
# h is a list of lists of the estimated conditional means,
# i.e. h[i] is the list of estimated conditional means associated with the i-th split
# k is the number of folds
# this score function will go inside the estimator

# the naive estimator fJ used inside the loss function:
# fJ(x, y, J, h) computes sum_{j=1}^{J} Ehat[phi_j(Y)|x] * phi_j(y);
# the leading 1 is the phi_1 term, since phi_1 = 1 and E[phi_1(Y)|X] = 1
def fJ(x, y, J, h):
    sum1 = 1
    for j in range(1, J):
        sum1 = sum1 + (h[j].predict(x)) * (2**0.5) * np.cos(np.pi * j * y)
    return sum1

# loss function used to construct the score:
# sum_{j=1}^{J} (Ehat[phi_j(Y)|x])^2 - 2*fJ, i.e. the summand of the empirical risk
def loss(x, y, J, h):
    summ = 0
    for j in range(J):
        summ = summ + (h[j].predict(x))**2
    return summ - 2 * fJ(x, y, J, h)

# the score function: average the empirical risk over the k validation folds
def score(x, xs, J, h, k):
    score_ = []
    dx = x.shape[1]
    for i in range(k):
        # validation-fold outcomes and covariates (Y is the first column)
        y = x.iloc[xs[i][1]].iloc[:, 0]
        X = x.iloc[xs[i][1]].iloc[:, range(1, dx)]
        score_.append(np.mean(loss(X, y, J, h[i])))
    return np.mean(score_)
# part 2: construct our estimator
# (the step comments correspond to the CV algorithm introduced before, generalized to k folds)
# x = data (Y in the first column), p = largest series cutoff considered, k = number of folds
def est(x, p, k):
    # step 1: split the sample for cross-validation
    kf = KFold(n_splits=k, shuffle=True)
    # xs is the KFold split index
    xs = [(train, test) for train, test in kf.split(x)]
    # h will be a list of lists of deepcopied trained models
    # s will be a list of scores / associated average empirical risks
    h = [[] for _ in range(k)]
    s = []
    # total number of columns in the data; Y should always be in the first column
    dx = x.shape[1]
    # step 2: for each training sample, create a list of trained models
    # x.iloc[xs[i][0]] corresponds to the i-th training sample
    # x.iloc[xs[i][1]] corresponds to the i-th validation sample
    for i in range(k):
        # i-th training sample Y values
        y = x.iloc[xs[i][0]].iloc[:, 0]
        # i-th training sample X values
        X = x.iloc[xs[i][0]].iloc[:, range(1, dx)]
        for j in range(p):
            # this is phi_{j+1}(y): equal to 1 when j = 0, and sqrt(2)*cos(pi*j*y) when j >= 1
            pjy = np.cos(np.pi * j * y) if j < 1 else (2**0.5) * np.cos(np.pi * j * y)
            # MLP neural net: this estimates E[phi_{j+1}(Y)|X] using an MLP
            # model = MLPRegressor(random_state=923, max_iter=200).fit(X, pjy)
            # alternatively, consider a random forest
            model = RandomForestRegressor(max_depth=50, random_state=923).fit(X, pjy)
            # deepcopy the trained model; it will be used for cross-validation
            h[i].append(deepcopy(model))
    # steps 3/4: this creates a list of scores / average empirical risks from cross-validation,
    # each associated with a specific series cutoff
    for J in range(1, p + 1):
        s.append(score(x, xs, J, h, k))
    # steps 5/6:
    # find the cutoff associated with the smallest average empirical risk;
    # construct the final estimator by averaging the estimators trained on each training sample
    def amcv0(x, y, s, h, k):
        jhat = np.argmin(s) + 1
        f = 0
        for i in range(k):
            f = f + fJ(x, y, jhat, h[i])
        return f / k
    # the end
    return lambda x, y: amcv0(x, y, s, h, k)
The code is simple and concise, and most importantly, it works! But there are some comments/future improvements that could readily be implemented.
Many tasks in step 2 are done sequentially (for loops), but they could easily run in parallel, which would be a huge improvement in speed; a short sketch follows after these comments.
The machine learners in step 2 should be tuned further. Additional cross-validation can be added for hyper-parameter tuning, which is another reason to parallelize step 2.
To make the code more user-friendly, we could add choices for the basis and for the machine learners. This can be achieved by rewriting the code in an object-oriented manner.
Additional post-processing might be needed to make sure that the estimated conditional density is positive, especially near the boundary. We have a p-algorithm that does this, which actually further reduces the estimation error. See the code in the approximate sparsity project for details.
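A minimal sketch of the parallelization point, assuming joblib is available (the helper fit_one below is hypothetical; it just wraps the fit of a single conditional mean from step 2):
import numpy as np
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestRegressor
# hypothetical helper: fit the conditional mean of phi_{j+1}(Y) on one training fold
def fit_one(X, y, j):
    pjy = np.cos(np.pi * j * y) if j < 1 else (2**0.5) * np.cos(np.pi * j * y)
    return RandomForestRegressor(max_depth=50, random_state=923).fit(X, pjy)
# inside est(), the inner loop of step 2 could then become, for each fold i:
# h[i] = Parallel(n_jobs=-1)(delayed(fit_one)(X, y, j) for j in range(p))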
## can simulate some data to try out the estimator to make sure it runs
# beta(2,5) distribution, 10000 observations, 100 covariates
# Xsim = np.random.beta(2,5, size = (10000,100))
# simulate the coefficients, assuming approximate sparsity
#B0 = []
#for i in range(100):
# B0.append(((i+1)**(-2)))
#B0 = np.asarray(B0)
# generate the outcome variable Y = exp(X'B + u)/(1+ exp(X'B + u)), u is standard normal noise
#sigmoid = lambda x: 1 / (1 + np.exp(-x))
#Ysim = sigmoid(Xsim@B0 + np.random.normal(0,1,10000))
#Ysim = pd.DataFrame(np.reshape(Ysim, (-1,1)))
#Ysim.columns=['y']
#Xsim = pd.DataFrame(Xsim)
#Xsim.columns =["X"+str(i) for i in range(100)]
#XYsim = pd.concat([Ysim, Xsim], axis = 1)
# ftry = est(XYsim,50,5) # the code runs just fine
## A sanity check to make sure that we actually have a density:
# ftry(XYsim.iloc[99,1:].values.reshape(1,-1),0.1)
# t = np.linspace(0,1,10000)
# y = ftry(XYsim.iloc[99,1:].values.reshape(1,-1),t) # for a single observation, we have to use reshape(1,-1) to get the right shape
# plt.plot(t,y)
# plt.show()
# trydens = lambda t: ftry(XYsim.iloc[99,1:].values.reshape(1,-1),t)
# integrate.quad(trydens, 0, 1)