Potential Outcome: $Y_{i,t}(D)$ denotes individual $i$'s potential outcome at time $t$ under treatment $D$
Infeasible Causal Effect: $Y_{i,t}(d) - Y_{i,t}(0)$, which is infeasible because we never observe both potential outcomes for the same individual
Alternatively, we can consider the average treatment effect (ATE): $E[Y_{i,t}(d) - Y_{i,t}(0)]$
Under random assignment, we can estimate the ATE by taking the simple average of the outcome among those with treatment $D=d$ and among those with $D=0$, then subtracting the two.
BUT random assignment is rare in practice; in observational studies, treatment is typically not randomly assigned.
Oftentimes, we care about the treatment effect for those who are treated (the ATT)
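In this notation, the parameter of interest below is the ATT at treatment intensity $d$, that is, the average effect of the treatment on those who actually received intensity $d$:
$$ ATT(d) = E[Y_{i,t}(d) - Y_{i,t}(0)|D=d] $$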
Main idea of diff-in-diff: compare the before-after change in outcomes of the treated group to the before-after change of the control group; under parallel trends, the control group's change identifies the counterfactual change the treated group would have experienced without treatment.
Some important comments on the parallel trends assumption:
The parallel trends assumption is not verifiable, as it concerns a counterfactual state of the world. Nevertheless, some would argue that parallel trends is more plausible after conditioning on controls. Abadie (2005) demonstrates this by assuming the following so-called conditional parallel trends assumption:
$$ E[Y_{i,t}(0) - Y_{i,t-1}(0)|X,D=d] = E[Y_{i,t}(0) - Y_{i,t-1}(0)|X,D=0], $$
where $X$ is a vector of individual-level covariates that do not vary over time.
Under suitable assumptions, my job market paper shows that for a positive treatment intensity $D=d$, the causal effect $ATT(d)$ has the following expression:
$$ ATT(d) = E[Y_t-Y_{t-1}|D=d] - E\left[(Y_t-Y_{t-1})\mathbf{1}\{D=0\}\frac{f_{D|X}(d)}{f_{D}(d)P(D=0|X)}\right]\quad (\star) $$
We can directly use this expression to estimate $ATT(d)$, but the resulting plug-in estimator can suffer from two potential sources of bias:
regularization bias: we need to estimate several high-dimensional objects with machine learning methods, for example $f_{D|X}(d)$ and $P(D=0|X)$, and commonly used regularization procedures can introduce substantial bias into the plug-in estimator
over-fitting bias: this bias arises when the same sample is used both to estimate the high-dimensional objects in the expression and to evaluate the outer expectations
The double/debiased machine learning (DML) framework tackles these biases in two ways:
remove the first-order regularization bias by adding an adjustment term to the previous expression
remove the over-fitting bias by using sample splitting and cross-fitting
We defer the theoretical discussion to the influential paper of CCDDHNR (2018) and to my job market paper.
The code implements an estimator based on ($\star$) but with an additional adjustment term to remove the regularization bias.
The conditional means in the estimator can be estimated using any ML learners from sklearn or other ML libraries, e.g., LASSO, Random Forest, XGBoost, MLP.
The presence of the conditional density poses a significant theoretical challenge for finding the adjustment term. Instead, we rely on a series representation of the conditional density. See my job market paper or the cross-validated conditional density estimation project for a more detailed discussion.
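Concretely, for a treatment density supported on $[0,1]$, the code below (see `fcondJ`) uses a truncated cosine-series representation of the form
$$ f_{D|X}(d|x) \approx 1 + \sum_{j=1}^{J-1} E[\sqrt{2}\cos(\pi j D)|X=x]\,\sqrt{2}\cos(\pi j d), $$
so each series coefficient is a conditional mean that can be estimated with any ML regression; in the implementation the regressions also carry the indicator $\mathbf{1}\{D>0\}$ to account for the mass of untreated units at $D=0$.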
Cross-fitting is implemented with the data randomly shuffled beforehand.
The standard error can be obtained in two ways: the first procedure is based on the asymptotic variance formula, and the second on a multiplier bootstrap procedure; see my job market paper for details.
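Before the full implementation, here is a minimal, schematic sketch of the cross-fitting recipe described above (not the estimator itself); `fit_nuisances` and `moment` are placeholder callables for the ML nuisance-estimation step and the bias-adjusted moment:
import numpy as np
from sklearn.model_selection import KFold
def cross_fit(df, fit_nuisances, moment, v=5, seed=923):
    # v-fold cross-fitting: fit nuisances on the training folds,
    # evaluate the bias-adjusted moment on the held-out fold, then average across folds
    kf = KFold(n_splits=v, shuffle=True, random_state=seed)
    fold_estimates = []
    for train_idx, test_idx in kf.split(df):
        nuis = fit_nuisances(df.iloc[train_idx])                          # ML step on training folds
        fold_estimates.append(np.mean(moment(df.iloc[test_idx], nuis)))   # held-out evaluation
    return np.mean(fold_estimates)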
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LassoCV #cross-validated lasso
from sklearn.linear_model import Lasso
from sklearn.neighbors import KernelDensity
from copy import deepcopy
from scipy.stats import norm
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LinearRegression
import warnings # warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.integrate import quad
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
################################################
# Series representation of conditional density #
################################################
# For a density supported on [0,1], we use the cosine orthonormal basis.
# See the amcv project and my job market paper for details.
# Any ML method can be used; in the code below we use Random Forest.
def fcondJ(x, z):
    h0 = []
    dx = x.shape[1]
    def f0(x, y, J, h0):
        sum1 = 1
        for j in range(1, J):
            sum1 = sum1 + (h0[j].predict(x))*(2**0.5)*np.cos((np.pi)*j*y)
        return sum1
    for j in range(z):
        t = x.iloc[:, 0]
        X = x.iloc[:, range(1, dx)]
        y = np.cos((np.pi)*j*t)*np.where(t > 0, 1, 0) if j < 1 else (2**0.5)*np.cos((np.pi)*j*t)*np.where(t > 0, 1, 0)
        # note that np.where(t>0,1,0) = 1{t>0}
        ## CV LASSO
        # model = Lasso(alpha=0.1).fit(X, y)
        # model = LassoCV(cv=5, random_state=923, n_jobs=5, max_iter=3000).fit(X, y)
        # consider passing alphas=np.linspace(...) to the CV; the automatic grid can give very large alphas
        ## MLP neural net
        # model = MLPRegressor(random_state=923, max_iter=200).fit(X, y)
        ## Random Forest
        model = RandomForestRegressor(random_state=923).fit(X, y)
        # n_jobs=5 would use 5 CPU cores for the fit
        h0.append(deepcopy(model))
    return lambda s, r: f0(s, r, z, h0)
# mJ function
def m0(x, J, d):
    summ = 1
    for j in range(1, J):
        summ = summ + (2**0.5)*np.cos((np.pi)*j*x)*(2**0.5)*np.cos((np.pi)*j*d)
    return summ
def m(J, d):
    return lambda x: m0(x, J, d)*np.where(x > 0, 1, 0)
########################################
# Kernel Density and Kernel Regression #
########################################
# We use the Gaussian kernel by default; other kernels could likely be used as well.
def K(h, x, d):
    return (norm.pdf((x-d)/h))/h
def Ky(h, x, y, d):
    return (norm.pdf((x-d)/h))*y/h
# then the density estimate is np.mean(K(h,data,d)) for whatever data we use; h may need to be tuned.
# kernel regression uses the same kernel.
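# As a quick illustration (assuming `data` is a data frame with treatment column D and,
# hypothetically, an outcome-difference column dy):
# fd_hat = np.mean(K(h, data.D, d))                      # kernel density estimate of f_D(d)
# Ed_hat = np.mean(Ky(h, data.D, data.dy, d)) / fd_hat   # kernel regression estimate of E[dy|D=d]
# These are exactly the quantities fd and Ed computed inside the estimators below.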
###################################################
# Main Code For Repeated Outcomes (Panel) Setting #
###################################################
# 1. data should come in as a pandas data frame
# - the columns of the dataframe should look like the following: [Y1, Y0, D, nD, X]
# - Y1 and Y0 are outcomes of individual post/pre treatment
# - D is the treatment variable, D=0 for the control group, and D=d for treatment
# - nD = 1{D=0}, the indicator for the control
# - X is individual level controls
# 2. `d` specifies a specific treatment intensity for which we want to learn the ATT(d)
# 3. `v` specifies the v-fold cross-fitting, should be an integer, e.g. 5
# 4. `z` is the series cutoff for the series representation of the conditional density,
#    try z = int(np.around(n**(1/4))), where n = len(data.index) is the sample size
# 5. `h` is the undersmoothed kernel bandwidth, try h = n**(-1/4) (do not round it to an integer)
# 6. pos = position of the first var of X, i.e. where X starts in the df
# 7. B is the number of bootstraps for the bootstrap standard error, try B=500 or 1000
def att1(data,d,v,z,h,pos,B):
    ## part 1: prep work
    # set up cross-fitting index
    kf = KFold(n_splits=v, shuffle=True, random_state=923)
    xs = [(train, test) for train, test in kf.split(data)]
    # empty lists for cross-fitting ATT and asymptotic variance
    att = []
    var = []
    # empty list for bootstrap
    mp = []
    attb = [[] for _ in range(v)]
    # lists of Gaussian multipliers for bootstrap standard errors
    for b in range(B):
        np.random.seed(int(923+b))
        mp.append(np.random.normal(1, 1, len(data)))
    # Part 2: Main Code: Cross-Fitting in Two Steps, with Final Step Implementing Multiplier Bootstrap
    for i in range(v):
        train = data.iloc[xs[i][0]]
        test = data.iloc[xs[i][1]]
        trainx = train.iloc[:, pos:]  # all the controls
        traind = pd.concat([train.D, trainx], axis=1)  # the treatment plus controls
        # series representation of conditional density
        jhat = z
        fJ0 = fcondJ(traind, z)
        fJ = lambda x: np.where(fJ0(x,d) > 0, fJ0(x,d), 0)
        mJ = m(jhat, d)
        # step 1: estimation of nuisance parameters with ML using training sample
        yl = train['Y1'] - train['Y0']
        # kernel density at f_D(d)
        fd = np.mean(K(h, train.D, d))
        # kernel regression for E[Y1-Y0|D=d]
        Ed = np.mean(Ky(h, train.D, yl, d))/fd
        # regression E[nD|X], which is basically P(D=0|X)
        # model1 = RandomForestClassifier(max_depth=50, random_state=923).fit(trainx,train.nD)
        ## Can use other ML learners, e.g., logit lasso, MLP
        ## model1 = LogisticRegressionCV(cv=5, penalty='l2', random_state=923, n_jobs=5, max_iter=3000).fit(trainx,train.nD)
        ## model1 = MLPClassifier(random_state=923, max_iter=500).fit(trainx,train.nD)
        # base_mlp = MLPClassifier(random_state=923, max_iter=500)
        # calibrated_mlp = CalibratedClassifierCV(base_mlp, cv=5, n_jobs=5)
        # model1 = calibrated_mlp.fit(trainx,train.nD)
        base_rf = RandomForestClassifier(random_state=923)
        calibrated_rf = CalibratedClassifierCV(base_rf, cv=5, n_jobs=5)
        model1 = calibrated_rf.fit(trainx, train.nD)
        # create a deepcopy for cross-fitting
        g = lambda x: deepcopy(model1).predict_proba(x)[:,1]
        # predict_proba returns a vector of probabilities for each class; use model.classes_ to see the order
        # regression E[(Y1-Y0)*nD|X]
        yD = yl*train.nD
        model2 = RandomForestRegressor(random_state=923).fit(trainx, yD)
        ## Can use other ML learners, e.g., LASSO, MLP
        # model2 = LassoCV(cv=5, random_state=923, n_jobs=5, max_iter=3000).fit(trainx, yD)  # yD := (Y1-Y0)*1{D=0}
        # model2 = MLPRegressor(random_state=923, max_iter=500).fit(trainx, yD)
        # create a deepcopy for cross-fitting
        E = lambda x: deepcopy(model2).predict(x)
        # step 2: estimate the i-th ATT inside the cross-fitting using the testing sample
        # this will be averaged later for cross-fitting
        testx = test.iloc[:, pos:]
        nD = test.nD
        D = test.D
        ylt = test['Y1'] - test['Y0']
        phi = ylt*nD*(fJ(testx)/(fd*g(testx)))
        adj = E(testx)*((mJ(D)*g(testx) - nD*fJ(testx))/(fd*((g(testx))**2)))
        theta = np.mean(phi+adj)
        ## Bonus 1: multiplier bootstrap
        for b in range(B):
            attb[i].append(np.mean((mp[b][xs[i][1]])*(Ed-phi-adj)))
        ATT = Ed - theta
        att.append(ATT)
        ## Bonus 2: asymptotic variance
        sig2 = ((Ky(h,D,ylt,d)-Ed)/fd
                - phi - adj + theta
                + ((theta-Ed)/fd)*(K(h,D,d)-fd))**2
        VAR = np.mean(sig2)
        var.append(VAR)
    # Part 3: Results Containing ATT, Asymptotic Variance, and List of Multiplier-Bootstrapped ATTs
    return np.mean(att), np.mean(var), [np.mean(x) for x in zip(*attb)]
# attb is a list of per-fold lists; [np.mean(x) for x in zip(*attb)] takes the element-wise average,
# i.e., for each bootstrap draw it averages the corresponding entry across the v folds.
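# Example usage (d=0.5 is a hypothetical intensity; pos=4 matches the documented column layout [Y1, Y0, D, nD, X]):
# n = len(data.index)
# att, var, attb = att1(data, d=0.5, v=5, z=int(np.around(n**(1/4))), h=n**(-1/4), pos=4, B=500)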
###########################################
# Code For Bootstrap Confidence Intervals #
###########################################
# suppose we store the results as
# att, var, attb = att1(data,d,v,z,h,pos,B)
# lower bound of 95% bootstrap CI:
# att - np.percentile(np.array(attb) - att, 97.5)
# upper bound of 95% bootstrap CI:
# att - np.percentile(np.array(attb) - att, 2.5)
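# Alternatively, a CI can be based on the asymptotic variance. Assuming the returned `var`
# estimates the variance of the influence function (so that the standard error is sqrt(var/n);
# see my job market paper for the exact formula):
# se = np.sqrt(var/len(data.index))
# ci = (att - 1.96*se, att + 1.96*se)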
##################################################
# Bonus Code For Repeated Cross-Sections Setting #
##################################################
# a very simple extension, but we need to know the treatment status of everyone in the cross-sections
# see paper for the detailed theory
# 1. data should come in as a pandas data frame
# - the columns of the dataframe should look like the following: [Y, T, D, nD, X]
# - Y is individual outcome
# - T is the indicator of which cross-section the observation belongs to; think of this as a "pre-post" variable
# - D is the treatment variable, D=0 for the control group, and D=d for treatment
# - nD = 1{D=0}, the indicator for the control
# - X is individual level controls
# 2. `d` specifies a specific treatment intensity for which we want to learn the ATT(d)
# 3. `v` specifies the v-fold cross-fitting, should be an integer, e.g. 5
# 4. `z` is the series cutoff for the series representation of the conditional density,
#    try z = int(np.around(n**(1/4))), where n = len(data.index) is the sample size
# 5. `h` is the undersmoothed kernel bandwidth, try h = n**(-1/4) (do not round it to an integer)
# 6. pos = position of the first var of X, i.e. where X starts in the df
# 7. B is the number of bootstraps for the bootstrap standard error, try B=500 or 1000
def att2(data,d,v,z,h,pos,B):
    ## part 1: prep work
    # set up cross-fitting index
    kf = KFold(n_splits=v, shuffle=True, random_state=923)
    xs = [(train, test) for train, test in kf.split(data)]
    # empty lists for cross-fitting ATT and asymptotic variance
    att = []
    var = []
    # empty list for bootstrap
    mp = []
    attb = [[] for _ in range(v)]
    # lists of Gaussian multipliers for bootstrap standard errors
    for b in range(B):
        np.random.seed(int(923+b))
        mp.append(np.random.normal(1, 1, len(data)))
    # Part 2: Main Code: Cross-Fitting in Two Steps, with Final Step Implementing Multiplier Bootstrap
    for i in range(v):
        train = data.iloc[xs[i][0]]
        test = data.iloc[xs[i][1]]
        trainx = train.iloc[:, pos:]  # all the controls
        traind = pd.concat([train.D, trainx], axis=1)  # the treatment plus controls
        # series representation of conditional density
        jhat = z
        fJ0 = fcondJ(traind, z)
        fJ = lambda x: np.where(fJ0(x,d) > 0, fJ0(x,d), 0)
        mJ = m(jhat, d)
        # step 1: estimation of nuisance parameters with ML using training sample
        lbd = len(train[train['T']==1]['T'])/len(train['T'])
        yl = (train['Y'])*((train['T'] - lbd)/(lbd*(1-lbd)))
        # kernel density at f_D(d)
        fd = np.mean(K(h, train.D, d))
        # kernel regression for E[Y_t - Y_{t-1}|D=d] via the transformed outcome yl
        Ed = np.mean(Ky(h, train.D, yl, d))/fd
        # regression E[nD|X], which is basically P(D=0|X)
        # model1 = RandomForestClassifier(max_depth=50, random_state=923).fit(trainx,train.nD)
        ## Can use other ML learners, e.g., logit lasso, MLP
        ## model1 = LogisticRegressionCV(cv=5, penalty='l2', random_state=923, n_jobs=5, max_iter=3000).fit(trainx,train.nD)
        ## model1 = MLPClassifier(random_state=923, max_iter=500).fit(trainx,train.nD)
        # base_mlp = MLPClassifier(random_state=923, max_iter=500)
        # calibrated_mlp = CalibratedClassifierCV(base_mlp, cv=5, n_jobs=5)
        # model1 = calibrated_mlp.fit(trainx,train.nD)
        base_rf = RandomForestClassifier(random_state=923)
        calibrated_rf = CalibratedClassifierCV(base_rf, cv=5, n_jobs=5)
        model1 = calibrated_rf.fit(trainx, train.nD)
        # create a deepcopy for cross-fitting
        g = lambda x: deepcopy(model1).predict_proba(x)[:,1]
        # predict_proba returns a vector of probabilities for each class; use model.classes_ to see the order
        # regression E[yl*nD|X]
        yD = yl*train.nD
        model2 = RandomForestRegressor(random_state=923).fit(trainx, yD)
        ## Can use other ML learners, e.g., LASSO, MLP
        # model2 = LassoCV(cv=5, random_state=923, n_jobs=5, max_iter=3000).fit(trainx, yD)  # yD := ((T-lbd)/(lbd*(1-lbd)))*Y*1{D=0}
        # model2 = MLPRegressor(random_state=923, max_iter=500).fit(trainx, yD)
        # create a deepcopy for cross-fitting
        E = lambda x: deepcopy(model2).predict(x)
        # step 2: estimate the i-th ATT inside the cross-fitting using the testing sample
        # this will be averaged later for cross-fitting
        testx = test.iloc[:, pos:]
        nD = test.nD
        D = test.D
        ylt = (test['Y'])*(test['T'] - lbd)/(lbd*(1-lbd))
        phi = ylt*nD*(fJ(testx)/(fd*g(testx)))
        adj = E(testx)*((mJ(D)*g(testx) - nD*fJ(testx))/(fd*((g(testx))**2)))
        theta = np.mean(phi+adj)
        ## Bonus 1: multiplier bootstrap
        for b in range(B):
            attb[i].append(np.mean((mp[b][xs[i][1]])*(Ed-phi-adj)))
        ATT = Ed - theta
        att.append(ATT)
        ## Bonus 2: asymptotic variance
        sig2 = ((Ky(h,D,ylt,d)-Ed)/fd
                - phi - adj + theta
                + ((theta-Ed)/fd)*(K(h,D,d)-fd))**2
        VAR = np.mean(sig2)
        var.append(VAR)
    # Part 3: Results Containing ATT, Asymptotic Variance, and List of Multiplier-Bootstrapped ATTs
    return np.mean(att), np.mean(var), [np.mean(x) for x in zip(*attb)]
# attb is a list of per-fold lists; [np.mean(x) for x in zip(*attb)] takes the element-wise average,
# i.e., for each bootstrap draw it averages the corresponding entry across the v folds.
###########################################
# Code For Bootstrap Confidence Intervals #
###########################################
# suppose we store the results as
# att, var, attb = att2(data,d,v,z,h,pos,B)
# lower bound of 95% bootstrap CI:
# att - np.percentile(np.array(attb) - att, 97.5)
# upper bound of 95% bootstrap CI:
# att - np.percentile(np.array(attb) - att, 2.5)
#################################
### Code Based on Kernel Only ###
#################################
# The code can be significantly simplified by replacing the series representation with a kernel.
# See Lucas Zhang's dissertation for theoretical justifications.
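# Roughly speaking, the kernel-only version replaces the series object fJ with the kernel-smoothed
# conditional density E[K_h(D-d)|X=x], estimated below by regressing the kernel weights K(h, D, d)
# on the controls (see modelh/fh); mh plays the role that mJ played in the series version.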
#################################
### Repeated Outcomes, Kernel ###
#################################
# 1. data should come in as a pandas data frame
# - the columns of the dataframe should look like the following: [Y1, Y0, D, nD, X]
# - Y1 and Y0 are outcomes of individual post/pre treatment
# - D is the treatment variable, D=0 for the control group, and D=d for treatment
# - nD = 1{D=0}, the indicator for the control
# - X is individual level controls
# 2. `d` specifies a specific treatment intensity for which we want to learn the ATT(d)
# 3. `v` specifies the v-fold cross-fitting, should be an integer, e.g. 5
# 4. `h` is the undersmoothed kernel bandwidth, try h = n**(-1/4) (do not round it to an integer),
#    where n = len(data.index) is the sample size
# 5. pos = position of the first var of X, i.e. where X starts in the df
# 6. B is the number of bootstraps for the bootstrap standard error, try B=500 or 1000
def attk(data,d,v,h,pos,B):
    kf = KFold(n_splits=v, shuffle=True, random_state=923)
    xs = [(train, test) for train, test in kf.split(data)]
    att = []
    var = []
    mp = []
    attb = [[] for _ in range(v)]
    # lists of Gaussian multipliers for bootstrap standard errors
    # random seed added to ensure reproducibility
    for b in range(B):
        np.random.seed(int(923+b))
        mp.append(np.random.normal(1, 1, len(data)))
    # Main Code: Cross-Fitting in Two Steps, with Final Step Implementing Multiplier Bootstrap
    for i in range(v):
        train = data.iloc[xs[i][0]]
        test = data.iloc[xs[i][1]]
        trainx = train.iloc[:, pos:]  # all the controls
        # Kernel Representations
        modelh = RandomForestRegressor(random_state=923).fit(trainx, K(h, train.D, d))
        fh = lambda x: deepcopy(modelh).predict(x)
        mh = lambda x: K(h, x, d)
        # step 1: estimation of nuisance parameters with ML using training sample
        yl = train['Y1'] - train['Y0']
        # kernel density at f_D(d)
        fd = np.mean(K(h, train.D, d))
        # regression E[nD|X], which is basically P(D=0|X)
        # model1 = RandomForestClassifier(max_depth=50, random_state=923).fit(trainx,train.nD)
        ## Can use other ML learners, e.g., logit lasso, MLP
        ## model1 = LogisticRegressionCV(cv=5, penalty='l2', random_state=923, n_jobs=5, max_iter=3000).fit(trainx,train.nD)
        ## model1 = MLPClassifier(random_state=923, max_iter=500).fit(trainx,train.nD)
        # base_mlp = MLPClassifier(random_state=923, max_iter=500)
        # calibrated_mlp = CalibratedClassifierCV(base_mlp, cv=5, n_jobs=5)
        # model1 = calibrated_mlp.fit(trainx,train.nD)
        base_rf = RandomForestClassifier(random_state=923)
        calibrated_rf = CalibratedClassifierCV(base_rf, cv=5, n_jobs=5)
        model1 = calibrated_rf.fit(trainx, train.nD)
        # create a deepcopy for cross-fitting
        g = lambda x: deepcopy(model1).predict_proba(x)[:,1]
        # predict_proba returns a vector of probabilities for each class; use model.classes_ to see the order
        # regression E[(Y1-Y0)*nD|X]
        yD = yl*train.nD
        model2 = RandomForestRegressor(random_state=923).fit(trainx, yD)
        ## Can use other ML learners, e.g., LASSO, MLP
        # model2 = LassoCV(cv=5, random_state=923, n_jobs=5, max_iter=3000).fit(trainx, yD)  # yD := (Y1-Y0)*1{D=0}
        # model2 = MLPRegressor(random_state=923, max_iter=500).fit(trainx, yD)
        # create a deepcopy for cross-fitting
        E = lambda x: deepcopy(model2).predict(x)
        # step 2: estimate the i-th ATT inside the cross-fitting using the testing sample
        # this will be averaged later for cross-fitting
        testx = test.iloc[:, pos:]
        nD = test.nD
        D = test.D
        ylt = test['Y1'] - test['Y0']
        phi = ylt*(K(h,D,d)*g(testx) - nD*fh(testx))/(fd*g(testx))
        adj = E(testx)*((mh(D)*g(testx) - nD*fh(testx))/(fd*((g(testx))**2)))
        theta = np.mean(phi+adj)
        ## Bonus 1: multiplier bootstrap
        for b in range(B):
            attb[i].append(np.mean((mp[b][xs[i][1]])*(phi+adj)))
        ATT = theta
        att.append(ATT)
        ## Bonus 2: asymptotic variance
        sig2 = (phi + adj - theta*(K(h,D,d)-fd)/fd)**2
        VAR = np.mean(sig2)
        var.append(VAR)
    # Part 3: Results Containing ATT, Asymptotic Variance, and List of Multiplier-Bootstrapped ATTs
    return np.mean(att), np.mean(var), [np.mean(x) for x in zip(*attb)]
# attb is a list of per-fold lists; [np.mean(x) for x in zip(*attb)] takes the element-wise average,
# i.e., for each bootstrap draw it averages the corresponding entry across the v folds.
###########################################
# Code For Bootstrap Confidence Intervals #
###########################################
# suppose we store the results as
# att, var, attb = attk(data,d,v,h,pos,B)
# lower bound of 95% bootstrap CI:
# att - np.percentile(np.array(attb) - att, 97.5)
# upper bound of 95% bootstrap CI:
# att - np.percentile(np.array(attb) - att, 2.5)
#######################################
### Repeated Cross Sections, Kernel ###
#######################################
# a very simple extension, but we need to know the treatment status of everyone in the cross-sections
# see paper for the detailed theory
# 1. data should come in as a pandas data frame
# - the columns of the dataframe should look like the following: [Y, T, D, nD, X]
# - Y is individual outcome
# - T is the indicator of which cross-section the observation belongs to; think of this as a "pre-post" variable
# - D is the treatment variable, D=0 for the control group, and D=d for treatment
# - nD = 1{D=0}, the indicator for the control
# - X is individual level controls
# 2. `d` specifies a specific treatment intensity for which we want to learn the ATT(d)
# 3. `v` specifies the v-fold cross-fitting, should be an integer, e.g. 5
# 4. `h` is the undersmoothed kernel bandwidth, try h = n**(-1/4) (do not round it to an integer),
#    where n = len(data.index) is the sample size
# 5. pos = position of the first var of X, i.e. where X starts in the df
# 6. B is the number of bootstraps for the bootstrap standard error, try B=500 or 1000
def attk2(data,d,v,h,pos,B):
    kf = KFold(n_splits=v, shuffle=True, random_state=923)
    xs = [(train, test) for train, test in kf.split(data)]
    att = []
    var = []
    mp = []
    attb = [[] for _ in range(v)]
    # lists of Gaussian multipliers for bootstrap standard errors
    # random seed added to ensure reproducibility
    for b in range(B):
        np.random.seed(int(923+b))
        mp.append(np.random.normal(1, 1, len(data)))
    # Main Code: Cross-Fitting in Two Steps, with Final Step Implementing Multiplier Bootstrap
    for i in range(v):
        train = data.iloc[xs[i][0]]
        test = data.iloc[xs[i][1]]
        trainx = train.iloc[:, pos:]  # all the controls
        # Kernel Representations
        modelh = RandomForestRegressor(random_state=923).fit(trainx, K(h, train.D, d))
        fh = lambda x: deepcopy(modelh).predict(x)
        mh = lambda x: K(h, x, d)
        # step 1: estimation of nuisance parameters with ML using training sample
        lbd = len(train[train['T']==1]['T'])/len(train['T'])
        yl = (train['Y'])*((train['T'] - lbd)/(lbd*(1-lbd)))
        # kernel density at f_D(d)
        fd = np.mean(K(h, train.D, d))
        # regression E[nD|X], which is basically P(D=0|X)
        # model1 = RandomForestClassifier(max_depth=50, random_state=923).fit(trainx,train.nD)
        ## Can use other ML learners, e.g., logit lasso, MLP
        ## model1 = LogisticRegressionCV(cv=5, penalty='l2', random_state=923, n_jobs=5, max_iter=3000).fit(trainx,train.nD)
        ## model1 = MLPClassifier(random_state=923, max_iter=500).fit(trainx,train.nD)
        # base_mlp = MLPClassifier(random_state=923, max_iter=500)
        # calibrated_mlp = CalibratedClassifierCV(base_mlp, cv=5, n_jobs=5)
        # model1 = calibrated_mlp.fit(trainx,train.nD)
        base_rf = RandomForestClassifier(random_state=923)
        calibrated_rf = CalibratedClassifierCV(base_rf, cv=5, n_jobs=5)
        model1 = calibrated_rf.fit(trainx, train.nD)
        # create a deepcopy for cross-fitting
        g = lambda x: deepcopy(model1).predict_proba(x)[:,1]
        # predict_proba returns a vector of probabilities for each class; use model.classes_ to see the order
        # regression E[yl*nD|X]
        yD = yl*train.nD
        model2 = RandomForestRegressor(random_state=923).fit(trainx, yD)
        ## Can use other ML learners, e.g., LASSO, MLP
        # model2 = LassoCV(cv=5, random_state=923, n_jobs=5, max_iter=3000).fit(trainx, yD)  # yD := ((T-lbd)/(lbd*(1-lbd)))*Y*1{D=0}
        # model2 = MLPRegressor(random_state=923, max_iter=500).fit(trainx, yD)
        # create a deepcopy for cross-fitting
        E = lambda x: deepcopy(model2).predict(x)
        # step 2: estimate the i-th ATT inside the cross-fitting using the testing sample
        # this will be averaged later for cross-fitting
        testx = test.iloc[:, pos:]
        nD = test.nD
        D = test.D
        ylt = (test['Y'])*(test['T'] - lbd)/(lbd*(1-lbd))
        phi = ylt*(K(h,D,d)*g(testx) - nD*fh(testx))/(fd*g(testx))
        adj = E(testx)*((mh(D)*g(testx) - nD*fh(testx))/(fd*((g(testx))**2)))
        theta = np.mean(phi+adj)
        ## Bonus 1: multiplier bootstrap
        for b in range(B):
            attb[i].append(np.mean((mp[b][xs[i][1]])*(phi+adj)))
        ATT = theta
        att.append(ATT)
        ## Bonus 2: asymptotic variance
        sig2 = (phi + adj - theta*(K(h,D,d)-fd)/fd)**2
        VAR = np.mean(sig2)
        var.append(VAR)
    # Part 3: Results Containing ATT, Asymptotic Variance, and List of Multiplier-Bootstrapped ATTs
    return np.mean(att), np.mean(var), [np.mean(x) for x in zip(*attb)]
# attb is a list of per-fold lists; [np.mean(x) for x in zip(*attb)] takes the element-wise average,
# i.e., for each bootstrap draw it averages the corresponding entry across the v folds.
###########################################
# Code For Bootstrap Confidence Intervals #
###########################################
# suppose we store the results as
# att, var, attb = attk2(data,d,v,h,pos,B)
# lower bound of 95% bootstrap CI:
# att - np.percentile(np.array(attb) - att, 97.5)
# upper bound of 95% bootstrap CI:
# att - np.percentile(np.array(attb) - att, 2.5)