# Launch from terminal with the
# PYTHONHASHSEED=0 jupyter notebook

import os
os.environ["MKL_DYNAMIC"] = "FALSE"
os.environ["MKL_CBWR"] = "COMPATIBLE"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import pandas as pd
import numpy as np
from numpy.random import SeedSequence
import matplotlib.pyplot as plt
from copy import deepcopy
from scipy.stats import norm
from scipy.stats import beta
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
import warnings # warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from joblib import Parallel, delayed
import time


# PLEASE MAKE SURE THE PACKAGE VERSIONS MATCH THOSE IN THE README
import sys
import scipy
import sklearn
import joblib

print("Python:", sys.version)
print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("matplotlib:", plt.matplotlib.__version__)
print("scipy:", scipy.__version__)
print("scikit-learn:", sklearn.__version__)
print("joblib:", joblib.__version__)

Python: 3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:45:13) [Clang 16.0.6 ]
pandas: 2.1.4
numpy: 1.26.2
matplotlib: 3.8.2
scipy: 1.11.4
scikit-learn: 1.3.2
joblib: 1.3.2


# function to generate multivariate normal samples

# NOTE!! 
#    - The numpy default function multivariate_normal is not stable!
#    - The function multivariate_normal needs a matrix factorization of Σ, which is handed off to LAPACK/BLAS, 
#         and that’s where the OS-dependent rounding differences crept in.
#    - For simple univariate or independent draws, there’s no BLAS call — just a transformation 
#         of uniform random numbers from the PCG64 stream.

def sample_equicorr_X(rng, n, p, mean_X=0.2, rho=0.1):
    z0 = rng.normal(size=n)          # common factor per row
    z  = rng.normal(size=(n, p))     # idiosyncratic shocks
    return mean_X + np.sqrt(rho) * z0[:, None] + np.sqrt(1.0 - rho) * z


# Panel DGP

def gen_panel(p, n, seed):    
    # ---- RNG setup: one master seed, independent substreams ----
    ss = np.random.SeedSequence(seed)
    ss_XD, ss_Dprime, ss_e1, ss_e2, ss_e3 = ss.spawn(5)
    rng_XD     = np.random.default_rng(ss_XD)      # for X and D
    rng_Dprime = np.random.default_rng(ss_Dprime)  # for D'
    rng_e1     = np.random.default_rng(ss_e1)
    rng_e2     = np.random.default_rng(ss_e2)
    rng_e3     = np.random.default_rng(ss_e3)

    # ---- Parameters ----
    mean_X = 0.2
    rho = 0.1

    gamma = np.array([0.5 / (j**2) for j in range(1, p + 1)], dtype=float)
    alpha = np.array([0.3 / (j**2) for j in range(1, p + 1)], dtype=float)
    beta  = np.array([0.5 / j if j <= 6 else 0.0 for j in range(1, p + 1)], dtype=float)

    # ---- Draw covariates X ~ N(mean_X, cov_X) ----
    X = sample_equicorr_X(rng_XD, n=n, p=p, mean_X=mean_X, rho=rho).astype(np.float64)
    
    # ---- Treatment indicator D ----
    X_gamma = X @ gamma
    P_D_0 = 1.0 / (1.0 + np.exp(-X_gamma))              # P(D=0|X)
    D0 = rng_XD.binomial(1, P_D_0)                      # D0=1 as control
    D  = 1.0 - D0                                       # D=1 as treated
    
    # ---- Continuous treatment D' for treated only ----
    indices_D_1 = np.where(D == 1.0)[0]                 # treated indices
    mu_vec = X[indices_D_1] @ alpha                      # shape: (n_treated,)
    m_vec  = 0.1 / (1.0 + np.exp(-1.0 * mu_vec))         
    V_vec  = rng_Dprime.beta(2.0, 2.0, size=len(indices_D_1))
    D_prime = D.astype(float)
    D_prime[indices_D_1] = m_vec + V_vec                # support in (m, m+1)


    # ---- Outcomes ----
    e1 = rng_e1.normal(0.0, 1.0, n)
    Y_t_1_0 = e1  # Generating Y_{t-1}(0)
    Y_t_1 = Y_t_1_0 # Generating Y_{t-1}

    tau = 1.0
    e2 = rng_e2.normal(0.0, 1.0, n)
    Y_t_0 = Y_t_1 + X @ beta + tau + e2  # Generating Y_t(0)

    e3 = rng_e3.normal(0.0, 1.0, n)
    Y_t_D = Y_t_0 - 0.5 * (D_prime ** 2) + e3  # Generating Y_t(D)

    Y_t = np.where(D_prime > 0, Y_t_D, Y_t_0) # Final Y_t


    # ---- Assemble DataFrame ----
    data_dict = {
        'Y1': Y_t,
        'Y0': Y_t_1,
        'D': D_prime,
        'nD': D0
    }
    for i in range(p):
        data_dict[f'X_{i+1}'] = X[:, i]

    data = pd.DataFrame(data_dict)

    # Optional: clean up near-binary D/nD if numerical noise ever appears (shouldn’t here)
    threshold = 1e-6
    data['D']  = np.where(np.abs(data['D'])  < threshold, 0.0, data['D'])
    data['nD'] = np.where(np.abs(data['nD']) < threshold, 0.0, data['nD'])
    data['nD'] = np.where(np.abs(data['nD'] - 1.0) < threshold, 1.0, data['nD'])
    
    # === Quantize at the very end (stabilizes RF splits across OS) ===
    cols_to_q = ['Y1', 'Y0', 'D', 'nD'] + [f'X_{i+1}' for i in range(p)]
    # If you prefer to leave outcomes unrounded, drop 'Y0','Y1' from this list
    for c in cols_to_q:
        arr = data[c].to_numpy(dtype=np.float64, copy=False)
        arr = np.round(arr, 4)
        data[c] = arr    
    
    return data


# Repeated Cross-sections DGP

def gen_cross(p, n, seed):
    # ---- RNG setup: one master seed, independent substreams ----
    ss = np.random.SeedSequence(seed)
    ss_T, ss_XD, ss_Dprime, ss_e1, ss_e2, ss_e3 = ss.spawn(6)
    rng_T      = np.random.default_rng(ss_T)      # for T
    rng_XD     = np.random.default_rng(ss_XD)      # for X and D
    rng_Dprime = np.random.default_rng(ss_Dprime)  # for D'
    rng_e1     = np.random.default_rng(ss_e1)
    rng_e2     = np.random.default_rng(ss_e2)
    rng_e3     = np.random.default_rng(ss_e3)

    # ---- Time ---- 
    T = rng_T.binomial(1.0, 0.5, n).astype(float) #T ~ Bernoulli(0.5)

    # ---- Parameters ----
    mean_X = 0.2
    rho = 0.1

    gamma = np.array([0.5 / (j**2) for j in range(1, p + 1)], dtype=float)
    alpha = np.array([0.3 / (j**2) for j in range(1, p + 1)], dtype=float)
    beta  = np.array([0.5 / j if j <= 6 else 0.0 for j in range(1, p + 1)], dtype=float)

    # ---- Draw covariates X ~ N(mean_X, cov_X) ----
    X = sample_equicorr_X(rng_XD, n=n, p=p, mean_X=mean_X, rho=rho).astype(np.float64)
    
    # ---- Treatment indicator D ----
    X_gamma = X @ gamma
    P_D_0 = 1.0 / (1.0 + np.exp(-X_gamma))              # P(D=0|X)
    D0 = rng_XD.binomial(1, P_D_0)                      # D0=1 as control
    D  = 1.0 - D0                                       # D=1 as treated
    
    # ---- Continuous treatment D' for treated only ----
    indices_D_1 = np.where(D == 1.0)[0]                 # treated indices
    mu_vec = X[indices_D_1] @ alpha                      # shape: (n_treated,)
    m_vec  = 0.1 / (1.0 + np.exp(-1.0 * mu_vec))         
    V_vec  = rng_Dprime.beta(2.0, 2.0, size=len(indices_D_1))
    D_prime = D.astype(float)
    D_prime[indices_D_1] = m_vec + V_vec                # support in (m, m+1)
    
    # ---- Outcomes ----
    e1 = rng_e1.normal(0.0, 1.0, n)
    Y_t_1_0 = e1  # Generating Y_{t-1}(0)
    Y_t_1 = Y_t_1_0 # Generating Y_{t-1}

    tau = 1.0
    e2 = rng_e2.normal(0.0, 1.0, n)
    Y_t_0 = Y_t_1 + X @ beta + tau + e2  # Generating Y_t(0)

    e3 = rng_e3.normal(0.0, 1.0, n)
    Y_t_D = Y_t_0 - 0.5 * (D_prime ** 2) + e3  # Generating Y_t(D)

    Y_t = np.where(D_prime > 0, Y_t_D, Y_t_0) # Final Y_t
    
    Y = Y_t_1 + T * (Y_t - Y_t_1) # Generating repeated cross-sectional Y

    # ---- Assemble DataFrame ----
    data_dict = {
        'Y': Y,
        'T': T,
        'D': D_prime,
        'nD': D0
    }
    for i in range(p):
        data_dict[f'X_{i+1}'] = X[:, i]

    data = pd.DataFrame(data_dict)
    
    # Optional: clean up near-binary D/nD/T if numerical noise ever appears (shouldn’t here)
    threshold = 1e-6
    data['D']  = np.where(np.abs(data['D'])  < threshold, 0.0, data['D'])
    data['T'] = np.where(np.abs(data['T']) < threshold, 0.0, data['T'])
    data['T'] = np.where(np.abs(data['T'] - 1.0) < threshold, 1.0, data['T'])
    data['nD'] = np.where(np.abs(data['nD']) < threshold, 0.0, data['nD'])
    data['nD'] = np.where(np.abs(data['nD'] - 1.0) < threshold, 1.0, data['nD'])
    
    # === Quantize at the very end (stabilizes RF splits across OS) ===
    cols_to_q = ['Y', 'T', 'D', 'nD'] + [f'X_{i+1}' for i in range(p)]
    # If you prefer to leave outcomes unrounded, drop 'Y0','Y1' from this list
    for c in cols_to_q:
        arr = data[c].to_numpy(dtype=np.float64, copy=False)
        arr = np.round(arr, 4)
        data[c] = arr  
    
    return data


def att_panel(data, d, v, h, pos, estimators):
    """
    Estimate the Average Treatment Effect on the Treated (ATT) using repeated outcomes methods.
    
    Parameters:
    data (pd.DataFrame): Input data frame with columns [Y1, Y0, D, nD, X]   
     - Y1, Y0: outcomes post/pre treatment
     - D: treatment variable, D=0 for the control group, and D=d for treatment
     - nD = 1{D=0}: the indicator for the control
     - X: individual level controls
    d (float): Specific treatment intensity to learn the ATT for
    v (int): Number of folds for cross-fitting
    h (float): Undersmooth kernel bandwidth, e.g., np.around(n**(-1/4))
    pos (int): Position of the first variable of X in the data frame
    estimators (dict): Dictionary specifying estimators for 'propensity', 'outcome', and 'condens'

    Returns:
    tuple: (Mean ATT, Mean variance)
    """
    
    # Initialize cross-validation
    kf = KFold(n_splits=v, shuffle=True, random_state=923)
    epsilon = 1e-6

    # Define Gaussian kernel function
    #def K(h, x, d):
    #    return norm.pdf((x - d) / h) / h

    # Define kernel function (default to Epanechnikov kernel)
    def K(h, x, d):
        u = (x - d) / h
        return np.where(np.abs(u) <= 1, 0.75 * (1 - u**2) / h, 0)

    # Function to compute ATT and variance for each fold
    def compute_fold(train_index, test_index):
        train = data.iloc[train_index]
        test = data.iloc[test_index]
        trainx = train.iloc[:, pos:]  # all the controls

        # Kernel regression model
        condens_estimator = deepcopy(estimators['condens']).fit(trainx, K(h, train.D, d))
        fh = lambda x: deepcopy(condens_estimator).predict(x)

        # Step 1: Estimation of nuisance parameters with ML using training sample       
        yl = train['Y1'] - train['Y0']

        # Kernel density at f_D(d)
        fd = np.mean(K(h, train.D, d))

        # Propensity score P(D=0|X)
        propensity_estimator = deepcopy(estimators['propensity']).fit(trainx, train.nD)
        g = lambda x: np.clip(deepcopy(propensity_estimator).predict_proba(x)[:, 1], epsilon, 1-epsilon)

        # Regression E[(Y1-Y0)|X,D=0]
        trainx0 = train.iloc[:, pos-1:]  # Include nD variable
        outcome_estimator = deepcopy(estimators['outcome']).fit(trainx0, yl)
        E = lambda x: deepcopy(outcome_estimator).predict(x)

        # Step 2: Estimate the i-th ATT inside the cross-fitting using testing sample
        testx = test.iloc[:, pos:]
        
        # Adding a column of 1's for nD = 1{D=0}
        testx0 = testx.copy()
        testx0.insert(0, 'nD', 1)
        
        nD = test.nD
        D = test.D
        ylt = test['Y1'] - test['Y0']
        
        # Calculate orthogonal score psi
        psi = (ylt - E(testx0)) * (K(h, D, d) * g(testx) - nD * fh(testx)) / (fd * g(testx))
        
        # Calculate ATT for this fold
        theta = np.mean(psi)
        
        # Asymptotic variance calculation
        sig2 = (psi - (theta * K(h, D, d)) / fd) ** 2
        VAR = np.mean(sig2)
        
        return theta, VAR

    # Parallelize the cross-validation loop
    results = Parallel(n_jobs=5, backend="loky", batch_size=1)(delayed(compute_fold)(train_index, test_index) for train_index, test_index in kf.split(data))
    
    att, var = zip(*results)
    
    # Return mean ATT and mean variance across all folds
    return np.mean(att), np.mean(var)


def att_cross(data, d, v, h, pos, estimators):
    """
    Estimate the Average Treatment Effect on the Treated (ATT) using repeated cross-sections methods.
    
    Parameters:
    data (pd.DataFrame): Input data frame with columns [Y, T, D, nD, X]   
     - Y: outcomes
     - T: indicator of pre/post 
     - D: treatment variable, D=0 for the control group, and D=d for treatment
     - nD = 1{D=0}: the indicator for the control
     - X: individual level controls
    d (float): Specific treatment intensity to learn the ATT for
    v (int): Number of folds for cross-fitting
    h (float): Undersmooth kernel bandwidth, e.g., np.around(n**(-1/4))
    pos (int): Position of the first variable of X in the data frame
    estimators (dict): Dictionary specifying estimators for 'propensity', 'outcome', and 'condens'

    Returns:
    tuple: (Mean ATT, Mean variance)
    """
    
    # Initialize cross-validation
    kf = KFold(n_splits=v, shuffle=True, random_state=923)
    epsilon = 1e-6
    
    # Define Epanechnikov kernel function
    def K(h, x, d):
        u = (x - d) / h
        return np.where(np.abs(u) <= 1, 0.75 * (1 - u**2) / h, 0)

    # Function to compute ATT and variance for each fold
    def compute_fold(train_index, test_index):
        train = data.iloc[train_index]
        test = data.iloc[test_index]
        trainx = train.iloc[:, pos:]  # all the controls
        
        # Kernel regression model
        condens_estimator = deepcopy(estimators['condens']).fit(trainx, K(h, train.D, d))
        fh = lambda x: deepcopy(condens_estimator).predict(x)
        
        # Step 1: Estimation of nuisance parameters with ML using training sample       
        lbd = len(train[train['T']==1]['T']) / len(train['T'])
        yl = (train['Y'])*((train['T'] - lbd)/(lbd*(1-lbd)))
        
        # Kernel density at f_D(d)
        fd = np.mean(K(h, train.D, d))
        
        # Propensity score P(D=0|X)
        propensity_estimator = deepcopy(estimators['propensity']).fit(trainx, train.nD)
        g = lambda x: np.clip(deepcopy(propensity_estimator).predict_proba(x)[:, 1], epsilon, 1-epsilon)
        
        # Regression E[(Y1-Y0)|X,D]
        trainx0 = train.iloc[:, pos-1:]  # Include nD variable
        outcome_estimator = deepcopy(estimators['outcome']).fit(trainx0, yl)
        E = lambda x: deepcopy(outcome_estimator).predict(x)

        # Step 2: Estimate the i-th ATT inside the cross-fitting using testing sample
        testx = test.iloc[:, pos:]
        
        # Adding a column of 1's for nD = 1{D=0}
        testx0 = testx.copy()
        testx0.insert(0, 'nD', 1)
        
        nD = test.nD
        D = test.D
        ylt = (test['Y'])*(test['T'] - lbd)/(lbd*(1-lbd))
        
        # Calculate orthogonal score psi
        psi = (ylt - E(testx0)) * (K(h, D, d) * g(testx) - nD * fh(testx)) / (fd * g(testx))
        
        # Calculate ATT for this fold
        theta = np.mean(psi)
        
        # Asymptotic variance calculation
        sig2 = (psi - (theta * K(h, D, d)) / fd) ** 2
        VAR = np.mean(sig2)
        
        return theta, VAR

    # Parallelize the cross-validation loop
    results = Parallel(n_jobs=5, backend="loky", batch_size=1)(delayed(compute_fold)(train_index, test_index) for train_index, test_index in kf.split(data))

    att, var = zip(*results)
    
    # Return mean ATT and mean variance across all folds
    return np.mean(att), np.mean(var)


# Monte Carlo simulation: panel

def panel_sim(num_simulations, p, n, v, pos, d, true_value, estimators):
    results = []
    coverage_count = 0
    
    for i in range(num_simulations):
        # Perform ATT estimation
        data = gen_panel(p, n, 923+i)
        h = 1.06*np.std(data[data.D>0].D)*(n**(-1/4))
        att, var = att_panel(data, d, v, h, pos, estimators)
        results.append((att, var))
        
        # Calculate 95% confidence interval
        conf_interval = (att - 1.96 * np.sqrt(var/n), att + 1.96 * np.sqrt(var/n))
        
        # Check if the true value is within the confidence interval
        if conf_interval[0] <= true_value <= conf_interval[1]:
            coverage_count += 1
    
    # Convert results to DataFrame for easy analysis
    results_df = pd.DataFrame(results, columns=['ATT', 'VAR'])
    
    # Calculate coverage probability
    coverage_probability = coverage_count / num_simulations
    
    return results_df, coverage_probability


# Monte Carlo simulation: cross-sections

def cross_sim(num_simulations, p, n, v, pos, d, true_value, estimators):
    results = []
    coverage_count = 0
    
    for i in range(num_simulations):
        # Perform ATT estimation
        data = gen_cross(p, n, 923+i)
        h = 1.06*np.std(data[data.D>0].D)*(n**(-1/4))
        att, var = att_cross(data, d, v, h, pos, estimators)
        results.append((att, var))
        
        # Calculate 95% confidence interval
        conf_interval = (att - 1.96 * np.sqrt(var/n), att + 1.96 * np.sqrt(var/n))
        
        # Check if the true value is within the confidence interval
        if conf_interval[0] <= true_value <= conf_interval[1]:
            coverage_count += 1
    
    # Convert results to DataFrame for easy analysis
    results_df = pd.DataFrame(results, columns=['ATT', 'VAR'])
    
    # Calculate coverage probability
    coverage_probability = coverage_count / num_simulations
    
    return results_df, coverage_probability


# Estimators used in our case (same as in the empirical application)
estimators = {
    'propensity':  CalibratedClassifierCV(
        estimator=RandomForestClassifier(max_depth=20, n_estimators=200, random_state=923),
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=923),
        n_jobs=5
    ),
    'outcome': RandomForestRegressor(max_depth=20, n_estimators=200, n_jobs = 5, random_state=923),
    'condens': RandomForestRegressor(max_depth=20, n_estimators=200, n_jobs = 5, random_state=923)
}


# Can also consider propensity score without calibration

# estimators_nocal = {
#     'propensity':  RandomForestClassifier(max_depth=20, n_estimators=200, n_jobs = 5, random_state=923),
#     'outcome': RandomForestRegressor(max_depth=20, n_estimators=200, n_jobs = 5, random_state=923),
#     'condens': RandomForestRegressor(max_depth=20, n_estimators=200, n_jobs = 5, random_state=923)
# }


# First check how long does it take for the estimators to run for a single iteration. 
# (Multiplying these timers by 500 should give the replicator a sense of the total running time on their machine)

# panel, n= 2000 

start = time.perf_counter()
results1_timer, cover_prob1_timer = panel_sim(num_simulations=1, p=100, n=2000, v=5, pos=4, d=0.9, true_value=-0.405, estimators = estimators)
end = time.perf_counter()

print(f"Panel, n= 2000, one iteration takes: {end - start:.2f} seconds")

# panel, n= 10000
start = time.perf_counter()
results2_timer, cover_prob2_timer = panel_sim(num_simulations=1, p=100, n=10000, v=5, pos=4, d=0.9, true_value=-0.405, estimators = estimators)
end = time.perf_counter()

print(f"Panel, n= 10000, one iteration takes: {end - start:.2f} seconds")

# cross-sections, n= 2000
start = time.perf_counter()
results3_timer, cover_prob3_timer = cross_sim(num_simulations=1, p=100, n=2000, v=5, pos=4, d=0.9, true_value=-0.405, estimators = estimators)
end = time.perf_counter()

print(f"Cross-sections, n= 2000, one iteration takes: {end - start:.2f} seconds")

# cross-sections, n=10000
start = time.perf_counter()
results4_timer, cover_prob4_timer = cross_sim(num_simulations=1, p=100, n=10000, v=5, pos=4, d=0.9, true_value=-0.405, estimators = estimators)
end = time.perf_counter()

print(f"Cross-sections, n= 10000, one iteration takes: {end - start:.2f} seconds")

Panel, n= 2000, one iteration takes: 21.14 seconds
Panel, n= 10000, one iteration takes: 123.64 seconds
Cross-sections, n= 2000, one iteration takes: 21.49 seconds
Cross-sections, n= 10000, one iteration takes: 127.88 seconds


# Second, we run the first 5 simulations in the panel data with sample size = 2000, 
# repeat that 4 times to make sure the results are identical.

# This serves two purposes:

# 1. Running only the first 5 simulations serves as a sanity check. If the replicator's results 
#    for the first 5 don't match with this, then no need to run the entire simulation. 

# 2. Doing the same first 5 simulations repeatedly 4 times ensures internal consistency.
#    If the results are identical from these four repeated runs, then this means the differences
#    in replication should solely be attributed to differences in package versions & machine's capacity.

results_rep1, cover_prob_rep1 = panel_sim(num_simulations=5, p=100, n=2000, 
                                           v=5, pos=4, d=0.9, true_value=-0.405, estimators = estimators)

results_rep2, cover_prob_rep2 = panel_sim(num_simulations=5, p=100, n=2000, 
                                           v=5, pos=4, d=0.9, true_value=-0.405, estimators = estimators)

results_rep3, cover_prob_rep3 = panel_sim(num_simulations=5, p=100, n=2000, 
                                           v=5, pos=4, d=0.9, true_value=-0.405, estimators = estimators)

results_rep4, cover_prob_rep4 = panel_sim(num_simulations=5, p=100, n=2000, 
                                           v=5, pos=4, d=0.9, true_value=-0.405, estimators = estimators)


# We should expect these results to be IDENTICAL across these four simulations.
# Also compare this to those in the README.

bias = []
std = []
rmse = []
avse = []
cover = []
true_val = -0.5 * 0.9 * 0.9

for i in [i+1 for i in range(4)]:
    df_name = f'results_rep{i}'
    cover_name = f'cover_prob_rep{i}'
    df = globals()[df_name]
    cover_prob = globals()[cover_name]
    bias.append(np.mean(df.ATT)-true_val)
    std.append(np.std(df.ATT))
    rmse.append(np.mean((df.ATT - true_val)**2)**(1/2))
    avse.append(np.mean((df.VAR/2000)**(1/2)))
    cover.append(cover_prob)

first5_rep_results = {
    'Repetition test': ['rep 1', 'rep 2', 'rep 3', 'rep 4'],
    'Bias': bias,
    'Standard Deviation': std,
    'RMSE': rmse,
    'AVSE': avse,
    'Coverage Probability': cover
}

rep_df = pd.DataFrame(first5_rep_results)
rep_df


results_rep1


#results_df1[0:5]


#--------------------------------------------------
#--------- ACTUAL SIMULATION STARTS HERE ----------
#--------------------------------------------------


# Estimators used in our case (same as in the empirical application)
estimators = {
    'propensity':  CalibratedClassifierCV(
        estimator=RandomForestClassifier(max_depth=20, n_estimators=200, random_state=923),
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=923),
        n_jobs=5
    ),
    'outcome': RandomForestRegressor(max_depth=20, n_estimators=200, n_jobs = 5, random_state=923),
    'condens': RandomForestRegressor(max_depth=20, n_estimators=200, n_jobs = 5, random_state=923)
}


# Simulation Results 1: Panel, n = 2000, futher undersmoothing
num_simulations = 500
p = 100  # Dimensionality of covariates
n = 2000  # Number of individuals
v = 5 #cross fitting
pos = 4 # i.e. the position of the first covariates (and the columns after should all be covariates)
d = 0.9 # treatment intensity
true_value = -0.5*0.9*0.9


results_df1, cover_prob1 = panel_sim(num_simulations, p, n, v, pos, d, true_value,estimators)


# Simulation Results 2: Panel, n = 10000, futher undersmoothing
num_simulations = 500
p = 100  # Dimensionality of covariates
n = 10000  # Number of individuals
v = 5 #cross fitting
pos = 4 # i.e. the position of the first covariates (and the columns after should all be covariates)
d = 0.9 # treatment intensity
true_value = -0.5*0.9*0.9


results_df2, cover_prob2 = panel_sim(num_simulations, p, n, v, pos, d, true_value,estimators)

/opt/anaconda3/envs/repenv2/lib/python3.11/site-packages/joblib/externals/loky/process_executor.py:752: UserWarning: A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
  warnings.warn(


# Simulation Results 3: Cross-Sections, n = 2000, futher undersmoothing
num_simulations = 500
p = 100  # Dimensionality of covariates
n = 2000  # Number of individuals
v = 5 #cross fitting
pos = 4 # i.e. the position of the first covariates (and the columns after should all be covariates)
d = 0.9 # treatment intensity
true_value = -0.5*0.9*0.9


results_df3, cover_prob3 = cross_sim(num_simulations, p, n, v, pos, d, true_value, estimators)


# Simulation Results 4: Cross-Sections, n = 10000, futher undersmoothing
num_simulations = 500
p = 100  # Dimensionality of covariates
n = 10000  # Number of individuals
v = 5 #cross fitting
pos = 4 # i.e. the position of the first covariates (and the columns after should all be covariates)
d = 0.9 # treatment intensity
true_value = -0.5*0.9*0.9


results_df4, cover_prob4 = cross_sim(num_simulations, p, n, v, pos, d, true_value, estimators)


bias = []
std = []
rmse = []
avse = []
cover = []
true_val = -0.5 * 0.9 * 0.9

for i in [i+1 for i in range(4)]:
    df_name = f'results_df{i}'
    cover_name = f'cover_prob{i}'
    df = globals()[df_name]
    cover_prob = globals()[cover_name]
    bias.append(np.mean(df.ATT)-true_val)
    std.append(np.std(df.ATT))
    rmse.append(np.mean((df.ATT - true_val)**2)**(1/2))
    if i % 2 == 0:
        avse.append(np.mean((df.VAR/10000)**(1/2)))
    else: 
        avse.append(np.mean((df.VAR/2000)**(1/2)))
    cover.append(cover_prob)


sim_results = {
    'Method and Sample Size': ['panel, n=2000', 'panel, n=10000', 'cross-sections, n = 2000', 'cross-sections, n = 10000'],
    'Bias': bias,
    'Standard Deviation': std,
    'RMSE': rmse,
    'AVSE': avse,
    'Coverage Probability': cover
}


sim_df = pd.DataFrame(sim_results)


sim_df


#sim_df.to_csv("simulation_results.csv", index=False)


fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# True ATT(d) at d = 0.9
true_val = -0.5 * 0.9 * 0.9

# Define common properties
props = dict(bins=25, color='skyblue', edgecolor='black')
line_props = dict(color='red', linestyle='dashed', linewidth=2)
title_props = dict(fontsize=11)
label_props = dict(fontsize=11)

# Determine the common x-axis range
x_min1 = min(results_df1.ATT.min(), results_df2.ATT.min())
x_max1 = max(results_df1.ATT.max(), results_df2.ATT.max())

x_min2 = min(results_df3.ATT.min(), results_df4.ATT.min())
x_max2 = max(results_df3.ATT.max(), results_df4.ATT.max())


# Plot for Panel Setting, n = 2000
axs[0, 0].hist(results_df1.ATT, **props)
axs[0, 0].axvline(true_val, **line_props)
axs[0, 0].set_title('Panel, n = 2000', **title_props)
axs[0, 0].set_xlabel('ATT Estimates', **label_props)
axs[0, 0].set_ylabel('Count', **label_props)
axs[0, 0].set_xlim([x_min1, x_max1])

# Plot for Panel Setting, n = 10000
axs[0, 1].hist(results_df2.ATT, **props)
axs[0, 1].axvline(true_val, **line_props)
axs[0, 1].set_title('Panel, n = 10000', **title_props)
axs[0, 1].set_xlabel('ATT Estimates', **label_props)
axs[0, 1].set_ylabel('Count', **label_props)
axs[0, 1].set_xlim([x_min1, x_max1])

# Plot for Repeated Cross-Sections Setting, n = 2000
axs[1, 0].hist(results_df3.ATT, **props)
axs[1, 0].axvline(true_val, **line_props)
axs[1, 0].set_title('Repeated Cross-Sections, n = 2000', **title_props)
axs[1, 0].set_xlabel('ATT Estimates', **label_props)
axs[1, 0].set_ylabel('Count', **label_props)
axs[1, 0].set_xlim([x_min2, x_max2])

# Plot for Repeated Cross-Sections Setting, n = 10000
axs[1, 1].hist(results_df4.ATT, **props)
axs[1, 1].axvline(true_val, **line_props)
axs[1, 1].set_title('Repeated Cross-Sections, n = 10000', **title_props)
axs[1, 1].set_xlabel('ATT Estimates', **label_props)
axs[1, 1].set_ylabel('Count', **label_props)
axs[1, 1].set_xlim([x_min2, x_max2])

# Add an overall title
#fig.suptitle('Monte Carlo Simulations, Histograms', fontsize=12)

# Adjust layout
plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust the rect to make room for the suptitle
plt.subplots_adjust(hspace=0.3)  # Increase the vertical space between subplots

# Save the figure in high resolution
plt.savefig('simulation_histograms.png', dpi=300)

plt.show()


# OPTIONAL: PLOT KERNEL DENSITY OF THE CONTINUOUS TREATMENT

from sklearn.neighbors import KernelDensity

data_try = gen_panel(100, 10000, 1234)

# Extract positive D values
D_positive = data_try[data_try.D > 0].D.to_numpy()

# # Define a range of bandwidths for cross-validation search
# bandwidths = np.linspace(0.001, 0.5, 40)

# # Set cross-validation with controlled randomness
# cv = KFold(n_splits=5, shuffle=True, random_state=923)

# # Perform GridSearchCV with fixed seed
# grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=cv)
# grid.fit(D_positive[:, None])

# # Retrieve the optimal bandwidth and apply undersmoothing (3/4 factor)
h = np.round(1.06*np.std(data_try[data_try.D>0].D)*len(D_positive)**(-1/4), 4)


# Fit KDE with the selected bandwidth
kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(D_positive[:, None])

# Define evaluation points (dlist)
dlist = np.round(np.linspace(0, 1.5, 100), 2).tolist()
dlist_arr = np.array(dlist)[:, None]  # Convert to 2D array for KDE input

# Compute KDE estimates
log_dens = kde.score_samples(dlist_arr)
density_values = np.exp(log_dens)  # Convert log-density to actual density

# Plot KDE on dlist
fig, ax = plt.subplots(figsize=(6, 4))

ax.plot(dlist, density_values, color='blue', linewidth=1.5, label="KDE Estimate")
ax.set_xlabel(r'Treatment Intensity ($d$)', fontsize=10)
ax.set_ylabel(r'Density', fontsize=10)
ax.set_title(r'Kernel Density Estimate with Undersmoothed Bandwidth', fontsize=10)

# Improved legend
ax.legend(fontsize=10, loc='upper right', frameon=False)

# Remove top and right spines for cleaner look
#sns.despine()

# Show the plot
plt.show()

	Repetition test	Bias	Standard Deviation	RMSE	AVSE	Coverage Probability
0	rep 1	-0.196682	0.225984	0.299587	0.25438	1.0
1	rep 2	-0.196682	0.225984	0.299587	0.25438	1.0
2	rep 3	-0.196682	0.225984	0.299587	0.25438	1.0
3	rep 4	-0.196682	0.225984	0.299587	0.25438	1.0

	ATT	VAR
0	-0.867817	126.585074
1	-0.780443	127.321924
2	-0.424844	165.245573
3	-0.674244	93.205751
4	-0.261061	140.120348

	ATT	VAR
0	-0.867817	126.585074
1	-0.780443	127.321924
2	-0.424844	165.245573
3	-0.674244	93.205751
4	-0.261061	140.120348

	Method and Sample Size	Bias	Standard Deviation	RMSE	AVSE	Coverage Probability
0	panel, n=2000	-0.071962	0.272512	0.281853	0.252393	0.918
1	panel, n=10000	-0.019826	0.126120	0.127669	0.126150	0.930
2	cross-sections, n = 2000	-0.033944	0.574471	0.575473	0.557849	0.944
3	cross-sections, n = 10000	0.028960	0.275407	0.276925	0.270998	0.950

Part 1: Import packages¶

Part 2: Define functions and estimators¶

Part 3: A smoke test before the actual simulation¶

Part 4: Simulations (as shown in paper)¶