import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV, KFold
np.random.seed(923)
n = 6000 # number of simulated units
# --- 1. Simulate mcrsharee1983 with 30% zeros and 70% in (0,1) ---
mcrsharee1983 = np.zeros(n)
treated_mask = np.random.rand(n) > 0.30 # 70% get non-zero share
mcrsharee1983[treated_mask] = np.random.beta(5, 6, size=treated_mask.sum())
# Quadratic treatment effects in the Medicare share: tau_cl is baked into
# deprshare and tau_tech into bintechcount from 1984 onward
tau_cl = 1 - 0.5*(2*mcrsharee1983 - 1)**2
tau_tech = 4 - 0.5*(2*mcrsharee1983 - 1)**2
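# Quick sanity check (an added sketch, not part of the original simulation):
# both quadratic profiles peak at d = 0.5 and are smallest at d = 0 and d = 1.
assert np.isclose(1 - 0.5*(2*0.5 - 1)**2, 1.0)   # tau_cl at its peak, d = 0.5
assert np.isclose(1 - 0.5*(2*0.0 - 1)**2, 0.5)   # tau_cl at the endpoint d = 0
assert np.isclose(4 - 0.5*(2*1.0 - 1)**2, 3.5)   # tau_tech at the endpoint d = 1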
# --- 2. Simulate deprshare1980–1986 with tau baked into 1984–1986 ---
deprshares = {}
for year in range(1980, 1987):
    base = np.random.normal(4 + 0.2 * (year - 1980), 2.5, size=n)
    if year >= 1984:
        base += tau_cl
    deprshares[f'deprshare{year}'] = np.clip(base, 0, 100)
# --- 3. Simulate bintechcount1980–1986 as integer counts with tau baked into 1984–1986 ---
bintechcounts = {}
for year in range(1980, 1987):
    base_mean = 25 + (year - 1980)  # increasing mean over time
    base = np.random.normal(base_mean, 15, size=n)
    if year >= 1984:
        base += tau_tech
    bintechcounts[f'bintechcount{year}'] = np.clip(np.round(base), 0, 100).astype(int)
# --- 4. Simulate continuous + binary covariates ---
bdtot = np.random.normal(200, 200, size=n)
totms = np.random.normal(100, 100, size=n)
private1983 = np.random.binomial(1, 0.6, size=n)
msa1983 = np.random.binomial(1, 0.5, size=n)
# --- 5. Simulate st10–st48 one-hot state dummies ---
st_cols = [f'st{i}' for i in range(10, 49)]
st_matrix = np.zeros((n, len(st_cols)), dtype=int)
chosen_states = np.random.choice(len(st_cols), size=n)
for i, s in enumerate(chosen_states):
    st_matrix[i, s] = 1
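# Sanity check (added): every simulated unit carries exactly one state dummy.
assert (st_matrix.sum(axis=1) == 1).all()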
st_df = pd.DataFrame(st_matrix, columns=st_cols)
# --- 6. Simulate 9 tech dummies ---
tech_cols = ['open_heart_tech1983', 'catheterization_tech1983', 'radioiso_diag_tech1983',
'radio_implants_tech1983', 'radioiso_ther_tech1983', 'xray_ther_tech1983',
'megavolt_tech1983', 'organ_transplant_tech1983', 'ct_tech1983']
tech_df = pd.DataFrame(np.random.binomial(1, 0.3, size=(n, len(tech_cols))), columns=tech_cols)
# --- 7. Assemble full DataFrame ---
df_sim = pd.DataFrame({
'mcrsharee1983': mcrsharee1983,
'bdtot1983': bdtot,
'totms1983': totms,
'private1983': private1983,
'msa1983': msa1983
})
df_sim = pd.concat([
df_sim,
pd.DataFrame(deprshares),
pd.DataFrame(bintechcounts),
st_df,
tech_df
], axis=1)
# --- 8. Match df_main column order exactly ---
df_vars = (
[f'deprshare{i}' for i in range(1980, 1987)] +
[f'bintechcount{i}' for i in range(1980, 1987)] +
['mcrsharee1983', 'bdtot1983', 'totms1983', 'private1983', 'msa1983'] +
st_cols + tech_cols
)
df_sim = df_sim[df_vars]
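# Structural sanity checks (an added sketch): 7 + 7 + 5 + 39 + 9 = 67 columns,
# and roughly 30% of units should have a zero Medicare share by construction.
assert df_sim.shape == (n, 67)
print(f"Share of zeros in mcrsharee1983: {(df_sim.mcrsharee1983 == 0).mean():.3f}")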
# --- Export the data to CSV (written to the working directory, which should be where the Jupyter notebook is located) ---
df_sim.to_csv("ppsreplicationdata_simulated.csv", index=False)
# --- df_sim now matches the column structure of df_main ---
df_sim.head()
| | deprshare1980 | deprshare1981 | deprshare1982 | deprshare1983 | deprshare1984 | deprshare1985 | deprshare1986 | bintechcount1980 | bintechcount1981 | bintechcount1982 | ... | st48 | open_heart_tech1983 | catheterization_tech1983 | radioiso_diag_tech1983 | radio_implants_tech1983 | radioiso_ther_tech1983 | xray_ther_tech1983 | megavolt_tech1983 | organ_transplant_tech1983 | ct_tech1983 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5.151917 | 2.166601 | 3.039583 | 4.519645 | 3.055672 | 5.795765 | 5.165730 | 3 | 33 | 9 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 |
| 1 | 1.459992 | 0.989867 | 0.000000 | 8.648418 | 3.971911 | 10.228455 | 2.795892 | 6 | 0 | 43 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 3.608329 | 4.194742 | 5.235346 | 0.414681 | 4.825809 | 3.256308 | 4.456675 | 5 | 34 | 31 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 2.913305 | 8.310639 | 2.149501 | 3.470897 | 4.004727 | 0.336987 | 8.854215 | 13 | 14 | 41 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 4 | 2.719057 | 0.961900 | 4.080290 | 4.040181 | 3.981682 | 9.035677 | 5.381902 | 0 | 16 | 51 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
5 rows × 67 columns
# --- Optional: plot the kernel density of positive treatment intensity ---
# Set seaborn theme for better aesthetics
sns.set_theme()
sns.set_context("paper")
sns.set_style("ticks", {"xtick.direction": "in", "ytick.direction": "in"})
# Extract positive D values
D_positive = df_sim[df_sim.mcrsharee1983 > 0].mcrsharee1983.to_numpy()
# Define a grid of candidate bandwidths for the cross-validation search
bandwidths = np.linspace(0.001, 0.5, 40)
# Set up cross-validation with controlled randomness
cv = KFold(n_splits=5, shuffle=True, random_state=923)
# Perform GridSearchCV over the bandwidth grid
grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=cv)
grid.fit(D_positive[:, None])
# Retrieve the CV-optimal bandwidth and apply the 3/4 undersmoothing factor
h = np.round(0.75 * grid.best_params_['bandwidth'], 4)
print(f"Selected undersmoothed bandwidth: {h}")
# Fit KDE with the selected bandwidth
kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(D_positive[:, None])
# Define evaluation points (dlist)
dlist = np.round(np.linspace(0.1, 0.8, 71), 2).tolist()
dlist_arr = np.array(dlist)[:, None] # Convert to 2D array for KDE input
# Compute KDE estimates
log_dens = kde.score_samples(dlist_arr)
density_values = np.exp(log_dens) # Convert log-density to actual density
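# Rough check (added): a Riemann sum of the density over the 0.01-spaced grid
# should come out just under 1, since the Beta(5, 6) draws place a small amount
# of mass outside [0.1, 0.8].
grid_step = dlist[1] - dlist[0]
print(f"Approximate probability mass on [0.1, 0.8]: {density_values.sum() * grid_step:.3f}")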
# Plot KDE on dlist
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(dlist, density_values, color='blue', linewidth=1.5, label="KDE Estimate")
ax.set_xlabel(r'Treatment Intensity ($d$)', fontsize=10)
ax.set_ylabel(r'Density', fontsize=10)
ax.set_title(r'Kernel Density Estimate with Undersmoothed Bandwidth', fontsize=10)
# Improved legend
ax.legend(fontsize=10, loc='upper right', frameon=False)
# Remove top and right spines for cleaner look
sns.despine()
# Show the plot
plt.show()
Selected undersmoothed bandwidth: 0.0199