import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV, KFold
np.random.seed(923)
n = 6000 # number of simulated units
# --- 1. Simulate mcrsharee1983 with 30% zeros and 70% in (0,1) ---
mcrsharee1983 = np.zeros(n)
treated_mask = np.random.rand(n) > 0.30 # 70% get non-zero share
mcrsharee1983[treated_mask] = np.random.beta(5, 6, size=treated_mask.sum())
# Quadratic treatment effects in the Medicare share: tau_cl is baked into
# deprshare and tau_tech into bintechcount from 1984 onward
tau_cl = 1 - 0.5*(2*mcrsharee1983 - 1)**2
tau_tech = 4 - 0.5*(2*mcrsharee1983 - 1)**2
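# Quick sanity check (an added sketch, not part of the original simulation):
# both quadratic profiles peak at d = 0.5 and are smallest at d = 0 and d = 1.
assert np.isclose(1 - 0.5*(2*0.5 - 1)**2, 1.0)   # tau_cl at its peak, d = 0.5
assert np.isclose(1 - 0.5*(2*0.0 - 1)**2, 0.5)   # tau_cl at the endpoint d = 0
assert np.isclose(4 - 0.5*(2*1.0 - 1)**2, 3.5)   # tau_tech at the endpoint d = 1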
# --- 2. Simulate deprshare1980–1986 with tau baked into 1984–1986 ---
deprshares = {}
for year in range(1980, 1987):
    base = np.random.normal(4 + 0.2 * (year - 1980), 2.5, size=n)
    if year >= 1984:
        base += tau_cl
    deprshares[f'deprshare{year}'] = np.clip(base, 0, 100)
# --- 3. Simulate bintechcount1980–1986 as integer counts with tau baked into 1984–1986 ---
bintechcounts = {}
for year in range(1980, 1987):
    base_mean = 25 + (year - 1980)  # increasing mean over time
    base = np.random.normal(base_mean, 15, size=n)
    if year >= 1984:
        base += tau_tech
    bintechcounts[f'bintechcount{year}'] = np.clip(np.round(base), 0, 100).astype(int)
# --- 4. Simulate continuous + binary covariates ---
bdtot = np.random.normal(200, 200, size=n)
totms = np.random.normal(100, 100, size=n)
private1983 = np.random.binomial(1, 0.6, size=n)
msa1983 = np.random.binomial(1, 0.5, size=n)
# --- 5. Simulate st10–st48 one-hot state dummies ---
st_cols = [f'st{i}' for i in range(10, 49)]
st_matrix = np.zeros((n, len(st_cols)), dtype=int)
chosen_states = np.random.choice(len(st_cols), size=n)
for i, s in enumerate(chosen_states):
    st_matrix[i, s] = 1
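# Sanity check (added): every simulated unit carries exactly one state dummy.
assert (st_matrix.sum(axis=1) == 1).all()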
st_df = pd.DataFrame(st_matrix, columns=st_cols)
# --- 6. Simulate 9 tech dummies ---
tech_cols = ['open_heart_tech1983', 'catheterization_tech1983', 'radioiso_diag_tech1983',
'radio_implants_tech1983', 'radioiso_ther_tech1983', 'xray_ther_tech1983',
'megavolt_tech1983', 'organ_transplant_tech1983', 'ct_tech1983']
tech_df = pd.DataFrame(np.random.binomial(1, 0.3, size=(n, len(tech_cols))), columns=tech_cols)
# --- 7. Assemble full DataFrame ---
df_sim = pd.DataFrame({
'mcrsharee1983': mcrsharee1983,
'bdtot1983': bdtot,
'totms1983': totms,
'private1983': private1983,
'msa1983': msa1983
})
df_sim = pd.concat([
df_sim,
pd.DataFrame(deprshares),
pd.DataFrame(bintechcounts),
st_df,
tech_df
], axis=1)
# --- 8. Match df_main column order exactly ---
df_vars = (
[f'deprshare{i}' for i in range(1980, 1987)] +
[f'bintechcount{i}' for i in range(1980, 1987)] +
['mcrsharee1983', 'bdtot1983', 'totms1983', 'private1983', 'msa1983'] +
st_cols + tech_cols
)
df_sim = df_sim[df_vars]
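# Structural sanity checks (an added sketch): 7 + 7 + 5 + 39 + 9 = 67 columns,
# and roughly 30% of units should have a zero Medicare share by construction.
assert df_sim.shape == (n, 67)
print(f"Share of zeros in mcrsharee1983: {(df_sim.mcrsharee1983 == 0).mean():.3f}")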
# --- Export the data to CSV (written to the working directory, which should be where the Jupyter notebook is located) ---
df_sim.to_csv("ppsreplicationdata_simulated.csv", index=False)
# --- df_sim now matches the column structure of df_main ---
df_sim.head()
| | deprshare1980 | deprshare1981 | deprshare1982 | deprshare1983 | deprshare1984 | deprshare1985 | deprshare1986 | bintechcount1980 | bintechcount1981 | bintechcount1982 | ... | st48 | open_heart_tech1983 | catheterization_tech1983 | radioiso_diag_tech1983 | radio_implants_tech1983 | radioiso_ther_tech1983 | xray_ther_tech1983 | megavolt_tech1983 | organ_transplant_tech1983 | ct_tech1983 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5.151917 | 2.166601 | 3.039583 | 4.519645 | 3.055672 | 5.795765 | 5.165730 | 3 | 33 | 9 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 |
| 1 | 1.459992 | 0.989867 | 0.000000 | 8.648418 | 3.971911 | 10.228455 | 2.795892 | 6 | 0 | 43 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 3.608329 | 4.194742 | 5.235346 | 0.414681 | 4.825809 | 3.256308 | 4.456675 | 5 | 34 | 31 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 2.913305 | 8.310639 | 2.149501 | 3.470897 | 4.004727 | 0.336987 | 8.854215 | 13 | 14 | 41 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 4 | 2.719057 | 0.961900 | 4.080290 | 4.040181 | 3.981682 | 9.035677 | 5.381902 | 0 | 16 | 51 | ... | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
5 rows × 67 columns
# --- Optional: plot the kernel density of positive treatment intensity ---
# Set seaborn theme for better aesthetics
sns.set_theme()
sns.set_context("paper")
sns.set_style("ticks", {"xtick.direction": "in", "ytick.direction": "in"})
# Extract positive D values
D_positive = df_sim[df_sim.mcrsharee1983 > 0].mcrsharee1983.to_numpy()
# Define a grid of candidate bandwidths for the cross-validation search
bandwidths = np.linspace(0.001, 0.5, 40)
# Set up cross-validation with controlled randomness
cv = KFold(n_splits=5, shuffle=True, random_state=923)
# Perform GridSearchCV over the bandwidth grid
grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=cv)
grid.fit(D_positive[:, None])
# Retrieve the CV-optimal bandwidth and apply the 3/4 undersmoothing factor
h = np.round(0.75 * grid.best_params_['bandwidth'], 4)
print(f"Selected undersmoothed bandwidth: {h}")
# Fit KDE with the selected bandwidth
kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(D_positive[:, None])
# Define evaluation points (dlist)
dlist = np.round(np.linspace(0.1, 0.8, 71), 2).tolist()
dlist_arr = np.array(dlist)[:, None] # Convert to 2D array for KDE input
# Compute KDE estimates
log_dens = kde.score_samples(dlist_arr)
density_values = np.exp(log_dens) # Convert log-density to actual density
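# Rough check (added): a Riemann sum of the density over the 0.01-spaced grid
# should come out just under 1, since the Beta(5, 6) draws place a small amount
# of mass outside [0.1, 0.8].
grid_step = dlist[1] - dlist[0]
print(f"Approximate probability mass on [0.1, 0.8]: {density_values.sum() * grid_step:.3f}")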
# Plot KDE on dlist
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(dlist, density_values, color='blue', linewidth=1.5, label="KDE Estimate")
ax.set_xlabel(r'Treatment Intensity ($d$)', fontsize=10)
ax.set_ylabel(r'Density', fontsize=10)
ax.set_title(r'Kernel Density Estimate with Undersmoothed Bandwidth', fontsize=10)
# Improved legend
ax.legend(fontsize=10, loc='upper right', frameon=False)
# Remove top and right spines for cleaner look
sns.despine()
# Show the plot
plt.show()
Selected undersmoothed bandwidth: 0.0199