Econ 103: Introduction to Econometrics
Author: Lucas Zhang.
Disclaimer: This notebook is intended as a template for students to get started on empirical exercises in the introductory econometrics class. The examples given use data from published research papers. As Python is an open-source, general-purpose programming language, there are many different packages that serve the same purpose, and those contained in this notebook represent just one possibility. All the errors are mine.
Some Q&A's
What is Jupyter Notebook?
Do I need to have prior experience with Python in order to do well in this class?
Absolutely not. We will start fresh and keep the coding strictly to econometric analysis only.
That being said, you are not really learning how to code in Python in the traditional sense. We will simply explore one of the many possibilities of using Python.
If you want to learn proper coding in Python, you should take a dedicated course on that (or take advantage of many popular online courses/programs such as DataCamp).
Is Python a commonly used language? What if I have to switch to a different statistical software such as R?
As an open-source language, Python has a huge number of dedicated packages written by both professionals and amateurs. This is both a blessing and a curse: how do we tell which ones are reliable? Luckily for us, most of the packages we will be using are well maintained and have stable releases, i.e. they have been "peer-reviewed" and are constantly updated to keep up with users' demands.
As you will see, the first thing we do will be to import all the packages we will be using:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
Comments: In the above code cell, we import some of the most popular packages. Note that we didn't simply import them, we also gave them short "nicknames". You will see that these nicknames make the coding considerably simpler.
Comments: Each of the packages serves its own purpose:
For details, search for them on Google (or your search engine of choice); each package has its own dedicated documentation pages.
Without further introduction, you need to load your data into the environment. If you have already uploaded the dataset to Jupyter, you can simply call it out using the following code:
df = pd.read_csv('twin_cleaned.csv')
Comments:
- In the above code, pd is the nickname we gave to the package pandas, and read_csv() is a function included in the package pandas. The code pd.read_csv() simply calls the read_csv() function from pandas.
- Inside the function, read_csv('data.csv') takes the actual data file data.csv that was uploaded to Jupyter beforehand. The single quotation marks in 'data.csv' specify the path/name of the data and make it a "string", which is what the function read_csv() accepts.
- If your data comes in another format, for example the Excel formats "xls" or "xlsx", you should use pd.read_excel('name of your data').
- The code df = assigns the name "df" to the loaded dataset data.csv, and the name df will be used throughout to call this dataset.
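If you just want to confirm that the file loaded and see how big it is, an optional quick check (not required for anything below) is the shape attribute of the dataframe, which reports the number of rows and columns:

# optional check: (number of observations, number of variables) in df
print(df.shape)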
Oftentimes, especially when the datasets are large, you won't know what the data look like in advance. As a good habit, you want to take a look at the dataset once you have loaded it into the system.
df.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 16.0 | 16.0 | 11.935573 | 2.479523 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | 0.259346 | 0.0 | 0.0 |
1 | NaN | 16.0 | 16.0 | 9.208958 | 2.220177 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | -0.259346 | 0.0 | 0.0 |
2 | NaN | 12.0 | 16.0 | 9.283223 | 2.228209 | 43.570145 | 1 | 1 | 12.0 | 1898.35750 | -0.721318 | -6.0 | -4.0 |
3 | 1.0 | 18.0 | 12.0 | 19.096916 | 2.949527 | 43.570145 | 1 | 1 | 16.0 | 1898.35750 | 0.721318 | 6.0 | 4.0 |
4 | NaN | 12.0 | 12.0 | 15.447336 | 2.728481 | 30.983910 | 1 | 0 | 12.0 | 960.00262 | -0.129809 | 0.0 | 0.0 |
Comments:
- df is our dataset.
- .head() following the df calls out the names of the variables and the first five rows (observations) of the dataset.
- In df.head(), if we leave the brackets () empty, it shows the first 5 rows; however, you can specify how many rows to show. For example, df.head(10) shows the first 10 rows.
Most likely, your data will come with many variables you won't use. Here's one way of choosing only the variables that you care about. For example, I want a new dataset with only two variables: education (educ) and hourly wage (hrwage):
df1 = df[['educ','hrwage']]
df1.head()
educ | hrwage | |
---|---|---|
0 | 16.0 | 11.935573 |
1 | 16.0 | 9.208958 |
2 | 12.0 | 9.283223 |
3 | 18.0 | 19.096916 |
4 | 12.0 | 15.447336 |
Comments:
- df is our original dataset.
- ['educ','hrwage'] creates a list of the variable names that we are interested in.
- df[['educ','hrwage']] selects only these two variables, where the outer bracket df[] is needed for sectioning the data, while the inner bracket ['educ','hrwage'] is for the list of names.
- We assign the name df1 to the new dataset with df1 = df[['educ','hrwage']].
We can also select specific rows of data. For example, we want to create a dataset consisting of only female observations:
df_f = df[df['female'] == 1]
df_f.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 16.0 | 16.0 | 11.935573 | 2.479523 | 33.251190 | 1 | 1 | 16.0 | 1105.6416 | 0.259346 | 0.0 | 0.0 |
1 | NaN | 16.0 | 16.0 | 9.208958 | 2.220177 | 33.251190 | 1 | 1 | 16.0 | 1105.6416 | -0.259346 | 0.0 | 0.0 |
2 | NaN | 12.0 | 16.0 | 9.283223 | 2.228209 | 43.570145 | 1 | 1 | 12.0 | 1898.3575 | -0.721318 | -6.0 | -4.0 |
3 | 1.0 | 18.0 | 12.0 | 19.096916 | 2.949527 | 43.570145 | 1 | 1 | 16.0 | 1898.3575 | 0.721318 | 6.0 | 4.0 |
8 | NaN | 15.0 | 13.0 | 8.094971 | 2.091243 | 34.978775 | 1 | 1 | 15.0 | 1223.5146 | -1.523260 | 2.0 | 2.0 |
Comments:
- In df[df['female'] == 1]:
  - df['female'] == 1 conditions the data to be all the observations that have female equal to $1$.
  - By putting df['female'] == 1 inside df[], we create a new dataset that only contains observations with female equal to 1.
- We assign the name df_f to this dataset, which can be called on later.
- Check the female column: all the values equal 1.
We also create a dataset that contains non-female observations only; call it df_nf.
df_nf = df[df['female'] == 0]
df_nf.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | NaN | 12.0 | 12.000000 | 15.447336 | 2.728481 | 30.983910 | 1 | 0 | 12.000000 | 960.00262 | -0.129809 | 0.0 | 0.000000 |
5 | 1.0 | 12.0 | 12.000000 | 17.432159 | 2.858290 | 30.983910 | 1 | 0 | 12.000000 | 960.00262 | 0.129809 | 0.0 | 0.000000 |
6 | NaN | 14.0 | 14.000000 | 19.316584 | 2.887312 | 35.211269 | 1 | 0 | 14.333333 | 1239.83350 | 0.073764 | 0.0 | 0.333333 |
7 | 1.0 | 14.0 | 14.333333 | 16.698618 | 2.813548 | 35.211269 | 1 | 0 | 14.000000 | 1239.83350 | -0.073764 | 0.0 | -0.333333 |
10 | NaN | 14.0 | 15.000000 | 15.681567 | 2.752376 | 29.669403 | 1 | 0 | 14.000000 | 880.27350 | -0.346573 | 1.0 | -1.000000 |
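The same bracket logic extends to more than one condition. As a small sketch (the particular combination below is just for illustration), each condition goes in its own parentheses and the conditions are combined with &:

# illustration: keep only observations that are both female and white
df_wf = df[(df['female'] == 1) & (df['white'] == 1)]
df_wf.head()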
np.mean(df1)
educ 14.030415 hrwage 14.436402 dtype: float64
np.mean(df1['educ'])
14.030415430267063
Comments:
- np.mean() simply calls out the function mean from numpy (np).
- Inside the parentheses, we put whatever we want to take the mean of. If we want the mean of every variable in the dataset df1, simply put df1 in the parentheses: np.mean(df1).
- If we want the mean of a single variable, say educ in the dataset df1, we can do so by writing df1['educ'], and inside the bracket ['educ'] is the name of the variable we want.
- Recall the earlier code df1 = df[['educ','hrwage']], where there are two brackets. Why do we only use one bracket in this case? The main difference is that df1['educ'] gives you a column of numbers, whereas df1[['educ']] gives you a new dataset with one variable and is indexed. Check out this nice example: https://stackoverflow.com/questions/50302180/difference-between-dfx-dfx-dfx-dfx-and-df-x.
Here are some other functions in numpy (np) you can use:
- np.std() to find the standard deviation
- np.median() to find the median
- np.min() to find the minimum
- np.max() to find the maximum
In fact, numpy has many mathematical functions built in, such as sine/cosine, exponential, logarithm, etc. You can find a partial list here: https://numpy.org/doc/stable/reference/routines.math.html. As we mentioned, Python is open-source, so if you want to find code for a specific use, just search for it and the chances are that someone has already written it.
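As a quick sketch of how these are used on our data (any of the functions listed above works the same way):

# standard deviation and median of hourly wage in df1
# note: np.std divides by n by default, so it differs slightly from the sample std in describe()
print(np.std(df1['hrwage']))
print(np.median(df1['hrwage']))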
What if we want them all together in one table? It's simple. One way to do this is to use the method .describe() after the dataset df1. See below:
df1.describe()
educ | hrwage | |
---|---|---|
count | 674.000000 | 674.000000 |
mean | 14.030415 | 14.436402 |
std | 2.078159 | 13.050072 |
min | 8.000000 | 2.059872 |
25% | 12.000000 | 7.426579 |
50% | 14.000000 | 11.443457 |
75% | 16.000000 | 16.570554 |
max | 18.000000 | 133.333330 |
What if we want to find the covariance between two variables, educ and hrwage? Numpy also has a function for that.
np.cov(df1['educ'],df1['hrwage'])
array([[ 4.31874498, 8.32070508], [ 8.32070508, 170.30438488]])
Comments:
- np.cov(x,y) is a function that will return the covariance matrix of two variables $x$ and $y$.
- In the output above, the diagonal entries are the variances: $4.31874498$ is the variance of the variable educ, and $170.30438488$ is the variance of the variable hrwage. The off-diagonal entry $8.32070508$ is the covariance between educ and hrwage.
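Since np.cov() returns the whole matrix, here is a small sketch of pulling out just the covariance term, which sits off the diagonal:

# the covariance between educ and hrwage is the off-diagonal entry of the matrix
cov_matrix = np.cov(df1['educ'], df1['hrwage'])
print(cov_matrix[0, 1])   # approximately 8.32, matching the output above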
In addition to the statistical summarization of the data, sometimes it's useful to have some visualization of the data. We start with a single variable. The first thing we do is to plot the histogram of that variable.
plt.hist(df1['educ'])
plt.title("Histogram of Education Attainment")
plt.xlabel("Years of Education")
plt.ylabel("Count")
plt.show()
Comments:
- plt calls on matplotlib.pyplot. It is the main library that we will be using for plots.
- plt.hist() calls out the histogram function. Inside the parentheses (), the user has to specify which variable they want to plot the histogram for. In this case, we want the education variable educ, so we use df1['educ'] as before.
- plt.title() adds the title to the graph. Use the quotation marks so the function knows you are adding a name, not a variable.
- plt.xlabel() adds a label to the x-axis.
- plt.ylabel() adds a label to the y-axis.
- plt.show() is added at the end so the plot will actually show up.
- Try the same steps for the hourly wage hrwage.
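A minimal sketch of the analogous histogram for hourly wage (the same pattern, just swapping in hrwage and relabeling):

plt.hist(df1['hrwage'])
plt.title("Histogram of Hourly Wage")
plt.xlabel("Hourly Wage")
plt.ylabel("Count")
plt.show()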
To visualize the relationship between two variables, we want to use scatter plots. That is, we will plot one variable against another.
plt.scatter(df1['educ'], df1['hrwage'])
plt.title("Scatter Plot of Hourly Wage and Education")
plt.xlabel("Years of Education")
plt.ylabel("Hourly Wage")
plt.show()
Comments:
- plt.scatter(x,y) calls out the scatter plot of x (horizontal axis variable) and y (vertical axis variable).
- In plt.scatter(df1['educ'], df1['hrwage']):
  - df1['educ'] selects the variable educ from the dataset df1, and we put df1['educ'] in the first position to indicate that we want this variable on the horizontal axis.
  - df1['hrwage'] selects the variable hrwage from the dataset df1, and we put df1['hrwage'] in the second position to indicate that we want this variable on the vertical axis.
- plt.title() adds the title to the graph. Use the quotation marks so the function knows you are adding a name, not a variable.
- plt.xlabel() adds a label to the x-axis.
- plt.ylabel() adds a label to the y-axis.
- plt.show() is added at the end so the plot will actually show up.
plt.scatter(df_f['educ'], df_f['hrwage'], label = "female", color = "red")
plt.scatter(df_nf['educ'], df_nf['hrwage'], label = "non-female", color = "green")
plt.title("Scatter Plot of Hourly Wage and Education")
plt.xlabel("Years of Education")
plt.ylabel("Hourly Wage")
plt.legend()
plt.show()
Comments:
- In the first line, we plot the scatter of females only (df_f), and we overlay it with another scatter plot of non-females only (df_nf).
- Inside plt.scatter():
  - label = "female" will create a label female for this scatter plot.
  - color = "red" will make the dots red, which you can change to many other colors.
- plt.legend() will show the labels for each scatter plot, so we know which plot is which.
In lecture, we have seen how the theory for simple linear regression works. How do we do it in Python?
There are many libraries that deal with regression analysis. We will use statsmodels.formula.api, as it is a well-maintained package and very intuitive to use. Recall that we imported this package as smf.
In this section, we are going to regress hourly wage hrwage on educational level educ:
$$ \text{hrwage} = \beta_0 + \beta_1\text{educ} + u $$
and the code is as follows:
model = smf.ols(formula = 'hrwage ~ educ', data=df)
results = model.fit(cov_type = 'HC1')
print(results.summary())
OLS Regression Results ============================================================================== Dep. Variable: hrwage R-squared: 0.094 Model: OLS Adj. R-squared: 0.093 Method: Least Squares F-statistic: 48.64 Date: Sun, 02 Aug 2020 Prob (F-statistic): 7.36e-12 Time: 13:59:53 Log-Likelihood: -2653.9 No. Observations: 674 AIC: 5312. Df Residuals: 672 BIC: 5321. Df Model: 1 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -12.5953 3.597 -3.501 0.000 -19.646 -5.544 educ 1.9266 0.276 6.974 0.000 1.385 2.468 ============================================================================== Omnibus: 635.040 Durbin-Watson: 1.506 Prob(Omnibus): 0.000 Jarque-Bera (JB): 22980.336 Skew: 4.233 Prob(JB): 0.00 Kurtosis: 30.324 Cond. No. 97.3 ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1)
Comments:
- We call the function ols() from the package smf with the code smf.ols().
- Inside the parentheses of smf.ols(formula = 'hrwage ~ educ', data=df):
  - formula = 'hrwage ~ educ' specifies the regression formula. The variable hrwage is the dependent variable, and educ is the regressor. ~ is used to separate the two sides of the regression. The constant is automatically included. The formula needs to be in quotation marks.
  - data = df tells the formula which dataset we are using.
- We assign the name model to the output of the function smf.ols(formula = 'hrwage ~ educ', data=df). You can name the output however you like, though. The model contains all the information from the regression.
- The model can't be called on by itself. We have to fit the model using the function .fit() to get a summary of the model outputs, such as the standard errors or the fitted values. We assign the name results to the fitted model in results = model.fit(cov_type = 'HC1'):
  - If the parentheses in model.fit() are left blank, the standard errors on the OLS estimates are homoskedastic by default.
  - To have heteroskedasticity-robust standard errors, we need to write cov_type = 'HC1' inside model.fit(). Don't worry about the name HC1; just memorize it as a synonym for "heteroskedasticity consistent" for now.
- Finally, we attach .summary() to results, then we print the results using print(results.summary()). As you can see from the table, it contains all you need to know from the regression.
What information is summarized in the table?
- The coef column contains the estimated regression coefficients. For example, the estimated coefficient on Intercept is $\hat\beta_0$, and the coefficient on educ is $\hat\beta_1$.
- The std err column contains the standard errors of the estimated regression coefficients, namely $SE(\hat\beta_0)$ and $SE(\hat\beta_1)$.
- The z column reports the t-statistic under the two-sided null hypothesis that the true coefficient $\beta_j = 0$, for $j=0,1$. For example, the z for the estimated coefficient $\hat\beta_1$ on educ is calculated by
$$ \frac{\hat\beta_1 - \beta_1}{SE(\hat\beta_1)} = \frac{\hat\beta_1 - 0}{SE(\hat\beta_1)} = \frac{1.9266}{0.276} = 6.974$$
The letter z is used to indicate that here the t-statistic has an approximately standard normal distribution, for which we usually use the letter $Z$, as in $N(0,1)$.
- The P>|z| column reports the p-value for the two-sided hypothesis. Recall that the p-value is defined as the probability, under the null, of drawing another test statistic that is at least as adverse to the null as the computed test-statistic value.
- The columns [0.025 0.975] contain the lower and upper values of the $95\%$ confidence intervals for the estimated coefficients.
Recall our regression output is
\begin{align*} \widehat{\text{hrwage}} &= -12.5953 + 1.9266\times \text{educ}\\ &\quad\quad(3.597)\quad(0.276) \end{align*}
where, in the above expression, the value in parentheses under each coefficient is the standard error of that coefficient; hrwage is the hourly wage and educ is the years of education.
How to interpret the coefficient on educ?
What's the t-statistic on the coefficient on educ for $H_0: \beta_1 = 0$ vs. $H_1:\beta_1\neq 0$?
$$ \hat{t} = \frac{\hat\beta_1 - \beta_1}{SE(\hat\beta_1)} = \frac{\hat\beta_1 - 0}{SE(\hat\beta_1)} = \frac{ 1.9266}{0.276} = 6.974 $$
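As a quick numerical check, here is a small sketch using the fitted results object from above; params and bse are the stored coefficients and their (robust) standard errors:

# recompute the t-statistic on educ from the stored estimates
b1_hat = results.params['educ']    # estimated coefficient, about 1.9266
se_b1 = results.bse['educ']        # robust standard error, about 0.276
print(b1_hat / se_b1)              # about 6.97, matching the z column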
In the previous section, we created a scatter plot of hourly wage against years of education. Now we are going to fit a regression line in the scatter plot. There are many different ways of plotting the regression line. Here we give you two possible ways:
The first way is to use the library seaborn, which we imported under the nickname sns. Check out the trove of beautiful plots the seaborn library can create: https://seaborn.pydata.org/examples/index.html
sns.set()
sns.regplot(x="educ", y="hrwage", ci= None, data=df)
plt.xlabel('years of education')
plt.ylabel('hourly wage')
plt.title('Scatter Plot with Fitted Regression Line ')
plt.show()
Comments
- sns.set() sets the graph style to the ones that come with the library seaborn. It's entirely optional.
- In the code sns.regplot(x="educ", y="hrwage", ci= None, data=df):
  - sns is calling on the library seaborn, and sns is the nickname we gave when we loaded the libraries.
  - sns.regplot() is calling the function regplot() that comes with seaborn. Inside the parentheses of regplot():
    - x="educ" specifies the variable on the x-axis
    - y="hrwage" specifies the variable on the y-axis
    - ci = None indicates that we don't want the confidence bands around the regression line
    - data = df indicates that df is the dataset we are using
The second way is to plot the fitted regression line manually, using the estimated coefficients stored in results.
results.params
Intercept -12.595282 educ 1.926649 dtype: float64
Comments:
- Recall that results is the final object from our regression code. It contains all the information of that regression.
- results has many attributes. For example, you have seen results.summary(), which will give a summary of the regression results. In fact, any attribute of results can be called out individually.
- results.params calls out the coefficients/parameters on the intercept and on the regressor, as you can see from the above output.
b0, b1 = results.params
Comments:
- results.params has two outputs: the intercept and the regression coefficient on educ, aka the slope.
- b0, b1 = results.params assigns, in the order of the elements in results.params, the intercept to the name b0 and the coefficient on educ to the name b1.
- Equivalently, you can write b0 = results.params[0] and b1 = results.params[1]:
  - results.params[0] gives the first value of results.params
  - results.params[1] gives the second value of results.params
plt.plot(df['educ'], df['hrwage'], '.')
plt.plot(df['educ'], b0 + b1*df['educ'],'-')
plt.xlabel('years of education')
plt.ylabel('hourly wage')
plt.title('Scatter Plot with Fitted Regression Line ')
plt.show()
Comments:
- plt.plot() calls the plot() function from the matplotlib library plt.
- In plt.plot(df['educ'], df['hrwage'], '.'):
  - df['educ'] specifies that the x-axis variable is educ from dataset df.
  - df['hrwage'] specifies that the y-axis variable is hrwage from dataset df.
  - '.' specifies the type of the plot, in this case, dots.
- In plt.plot(df['educ'], b0 + b1*df['educ'],'-'):
  - df['educ'] specifies that the x-axis variable is educ from dataset df.
  - b0 + b1*df['educ'] specifies that the y-axis variable is the regression line that we are plotting: b0 is the intercept and b1 is the slope that we specified before.
  - '-' specifies the type of the plot, in this case, a line.
results.resid
0 -6.295527 1 -9.022142 2 -1.241281 3 -2.987481 4 4.922832 ... 669 -8.841126 670 -4.884505 671 -2.734504 672 0.618794 673 -4.324504 Length: 674, dtype: float64
Comments:
- Recall that results is the final object from our regression code. It contains all the information of that regression.
- As we mentioned before, results has many attributes. You have seen results.summary() and results.params, which give the summary of the regression and the regression coefficients respectively.
- Similarly, results also stores all the residuals from the regression. results.resid calls out the residuals, as you can see from the above output.
df['resid'] = results.resid
df.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | resid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 16.0 | 16.0 | 11.935573 | 2.479523 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | 0.259346 | 0.0 | 0.0 | -6.295527 |
1 | NaN | 16.0 | 16.0 | 9.208958 | 2.220177 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | -0.259346 | 0.0 | 0.0 | -9.022142 |
2 | NaN | 12.0 | 16.0 | 9.283223 | 2.228209 | 43.570145 | 1 | 1 | 12.0 | 1898.35750 | -0.721318 | -6.0 | -4.0 | -1.241281 |
3 | 1.0 | 18.0 | 12.0 | 19.096916 | 2.949527 | 43.570145 | 1 | 1 | 16.0 | 1898.35750 | 0.721318 | 6.0 | 4.0 | -2.987481 |
4 | NaN | 12.0 | 12.0 | 15.447336 | 2.728481 | 30.983910 | 1 | 0 | 12.0 | 960.00262 | -0.129809 | 0.0 | 0.0 | 4.922832 |
Comments:
- We just created a new column named resid in our dataset df.
- results.resid gives all the residuals from the regression.
- df['resid'] asks python to add a column named resid to dataset df.
- We assign the values of results.resid to this new column by setting df['resid'] = results.resid.
- Take a look at the head of the dataset df: now resid is the last column in the dataset.
Below is the standard code for creating the scatter plot that we have seen before. Are the residuals homoskedastic or heteroskedastic?
plt.scatter(df['educ'], df['resid'])
plt.xlabel('years of education')
plt.ylabel('regression residuals')
plt.title('Scatter Plot of Regression Residuals Against Years of Education')
plt.show()
Recall our regression model
$$ Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \cdots + \beta_k X_{ki} + u_i $$
As the explicit formulas for the OLS estimators of $\beta_0,\beta_1,\cdots,\beta_k$ are complicated, in practice we rely on statistical software. In this section, we are going to extend our previous code to the multiple regression setting.
We will continue to use our original dataset. Are there any factors other than education that are associated with wage? We start by adding the regressor age to the model.
model2 = smf.ols(formula = "hrwage ~ educ + age", data = df)
results2 = model2.fit(cov_type = 'HC1')
print(results2.summary())
OLS Regression Results ============================================================================== Dep. Variable: hrwage R-squared: 0.165 Model: OLS Adj. R-squared: 0.162 Method: Least Squares F-statistic: 28.31 Date: Sun, 02 Aug 2020 Prob (F-statistic): 1.57e-12 Time: 13:59:53 Log-Likelihood: -2626.6 No. Observations: 674 AIC: 5259. Df Residuals: 671 BIC: 5273. Df Model: 2 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -27.9905 5.497 -5.092 0.000 -38.764 -17.217 educ 2.2003 0.293 7.505 0.000 1.626 2.775 age 0.3035 0.056 5.372 0.000 0.193 0.414 ============================================================================== Omnibus: 629.267 Durbin-Watson: 1.547 Prob(Omnibus): 0.000 Jarque-Bera (JB): 22518.519 Skew: 4.175 Prob(JB): 0.00 Kurtosis: 30.058 Cond. No. 343. ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1)
Comments:
- smf.ols() calls the ols() function from the library statsmodels.formula.api, which we nicknamed smf.
- Inside smf.ols(formula = "hrwage ~ educ + age", data = df):
  - In formula = "hrwage ~ educ + age":
    - the dependent variable is hrwage
    - the regressors are educ and age
    - the two sides of the regression are separated by ~
    - the whole formula goes inside the quotation marks " "
  - data = df specifies the dataset we are using.
The interpretation of the regression table is exactly the same as in the simple regression case. Refer to section 5.1 for details.
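One more thing you can do with the fitted object is compute predicted values. Here is a small sketch using the predict method of the fitted results; the values 16 and 30 below are made up purely for illustration:

# predicted hourly wage for a hypothetical person with 16 years of education, aged 30
new_data = pd.DataFrame({'educ': [16], 'age': [30]})
print(results2.predict(new_data))   # roughly 16.3, given the coefficients in the table above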
The variable female is a binary variable:
- female = 1 if the respondent is female, female = 0 otherwise.
We can include this binary variable in the regression as well:
Bonus Question: How would you interpret the regression coefficient on female?
model3 = smf.ols(formula = "hrwage ~ educ + age + female", data = df)
results3 = model3.fit(cov_type = 'HC1')
print(results3.summary())
OLS Regression Results ============================================================================== Dep. Variable: hrwage R-squared: 0.209 Model: OLS Adj. R-squared: 0.205 Method: Least Squares F-statistic: 21.54 Date: Sun, 02 Aug 2020 Prob (F-statistic): 2.51e-13 Time: 13:59:54 Log-Likelihood: -2608.4 No. Observations: 674 AIC: 5225. Df Residuals: 670 BIC: 5243. Df Model: 3 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -22.4813 4.963 -4.530 0.000 -32.209 -12.754 educ 2.0789 0.276 7.540 0.000 1.538 2.619 age 0.2910 0.054 5.375 0.000 0.185 0.397 female -5.5977 0.976 -5.736 0.000 -7.510 -3.685 ============================================================================== Omnibus: 615.803 Durbin-Watson: 1.557 Prob(Omnibus): 0.000 Jarque-Bera (JB): 21391.259 Skew: 4.041 Prob(JB): 0.00 Kurtosis: 29.389 Cond. No. 354. ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1)
In this section, we are going to take a look at how to transform the regressors in the regression. There are two ways:
- edit the data directly, which you have seen when we added the residuals to the dataset;
- modify the regression formula (won't be discussed in this class).
The first method is foolproof, and we will focus on that. The second method is optional.
(1) Suppose we want to add the natural log of wage to the dataset:
df['logwage'] = np.log(df['hrwage'])
df.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | resid | logwage | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 16.0 | 16.0 | 11.935573 | 2.479523 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | 0.259346 | 0.0 | 0.0 | -6.295527 | 2.479523 |
1 | NaN | 16.0 | 16.0 | 9.208958 | 2.220177 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | -0.259346 | 0.0 | 0.0 | -9.022142 | 2.220177 |
2 | NaN | 12.0 | 16.0 | 9.283223 | 2.228209 | 43.570145 | 1 | 1 | 12.0 | 1898.35750 | -0.721318 | -6.0 | -4.0 | -1.241281 | 2.228209 |
3 | 1.0 | 18.0 | 12.0 | 19.096916 | 2.949527 | 43.570145 | 1 | 1 | 16.0 | 1898.35750 | 0.721318 | 6.0 | 4.0 | -2.987481 | 2.949527 |
4 | NaN | 12.0 | 12.0 | 15.447336 | 2.728481 | 30.983910 | 1 | 0 | 12.0 | 960.00262 | -0.129809 | 0.0 | 0.0 | 4.922832 | 2.737437 |
Comments:
- We just created a new variable logwage in our dataset df.
- In the code:
  - df['logwage'] asks python to add a column named logwage to dataset df.
  - In np.log(df['hrwage']):
    - np.log() is the natural log function from package np. This function can be applied to either a single number or a vector of numbers.
    - df['hrwage'] is the column of hourly wages in the dataset df.
    - np.log(df['hrwage']) will then create a new column of the natural log of hourly wage.
  - We then assign the values to this new column by setting df['logwage'] = np.log(df['hrwage']).
- Take a look at the head of the dataset df: now logwage is the last column in the dataset and consists of the natural log of hourly wage.
(2) Suppose we want to create a new variable, "potential experience", which is defined as
$$ \text{potential experience} = \text{age} - \text{years of education} - 7 $$
Let's create the variable, potential experience, and name it pexp.
df['pexp'] = df['age'] - df['educ'] - 7
df.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | resid | logwage | pexp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 16.0 | 16.0 | 11.935573 | 2.479523 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | 0.259346 | 0.0 | 0.0 | -6.295527 | 2.479523 | 10.251190 |
1 | NaN | 16.0 | 16.0 | 9.208958 | 2.220177 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | -0.259346 | 0.0 | 0.0 | -9.022142 | 2.220177 | 10.251190 |
2 | NaN | 12.0 | 16.0 | 9.283223 | 2.228209 | 43.570145 | 1 | 1 | 12.0 | 1898.35750 | -0.721318 | -6.0 | -4.0 | -1.241281 | 2.228209 | 24.570145 |
3 | 1.0 | 18.0 | 12.0 | 19.096916 | 2.949527 | 43.570145 | 1 | 1 | 16.0 | 1898.35750 | 0.721318 | 6.0 | 4.0 | -2.987481 | 2.949527 | 18.570145 |
4 | NaN | 12.0 | 12.0 | 15.447336 | 2.728481 | 30.983910 | 1 | 0 | 12.0 | 960.00262 | -0.129809 | 0.0 | 0.0 | 4.922832 | 2.737437 | 11.983910 |
Comments:
- df['pexp'] asks python to add a column named pexp to dataset df.
Commonly used arithmetic operations:
- addition: x+y
- subtraction: x-y
- multiplication: x*y, where the single star * is the multiplication sign
- division: x/y
- power: x**y, where the double star ** is the power operation
Let's create another variable, squared potential experience, and let's name it pexp2.
df['pexp2'] = df['pexp']**2
df.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | resid | logwage | pexp | pexp2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 16.0 | 16.0 | 11.935573 | 2.479523 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | 0.259346 | 0.0 | 0.0 | -6.295527 | 2.479523 | 10.251190 | 105.086896 |
1 | NaN | 16.0 | 16.0 | 9.208958 | 2.220177 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | -0.259346 | 0.0 | 0.0 | -9.022142 | 2.220177 | 10.251190 | 105.086896 |
2 | NaN | 12.0 | 16.0 | 9.283223 | 2.228209 | 43.570145 | 1 | 1 | 12.0 | 1898.35750 | -0.721318 | -6.0 | -4.0 | -1.241281 | 2.228209 | 24.570145 | 603.692025 |
3 | 1.0 | 18.0 | 12.0 | 19.096916 | 2.949527 | 43.570145 | 1 | 1 | 16.0 | 1898.35750 | 0.721318 | 6.0 | 4.0 | -2.987481 | 2.949527 | 18.570145 | 344.850285 |
4 | NaN | 12.0 | 12.0 | 15.447336 | 2.728481 | 30.983910 | 1 | 0 | 12.0 | 960.00262 | -0.129809 | 0.0 | 0.0 | 4.922832 | 2.737437 | 11.983910 | 143.614099 |
In labor economics, the most famous equation is the so-called "Mincer equation", which explains the wage as a function of education and experience. It is specified as follows:
$$ \text{log wage} = \beta_0 + \beta_1\text{education} + \beta_2\text{potential experience} + \beta_3(\text{potential experience})^2 + \text{residual} $$
Let's use our transformed variables from the previous section to run the Mincer regression.
mincer = smf.ols(formula = "logwage ~ educ + pexp + pexp2", data = df)
resultsm = mincer.fit(cov_type = 'HC1')
print(resultsm.summary())
OLS Regression Results ============================================================================== Dep. Variable: logwage R-squared: 0.288 Model: OLS Adj. R-squared: 0.284 Method: Least Squares F-statistic: 74.50 Date: Sun, 02 Aug 2020 Prob (F-statistic): 1.36e-41 Time: 13:59:54 Log-Likelihood: -526.84 No. Observations: 674 AIC: 1062. Df Residuals: 670 BIC: 1080. Df Model: 3 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -0.0870 0.181 -0.482 0.630 -0.441 0.267 educ 0.1405 0.011 12.247 0.000 0.118 0.163 pexp 0.0595 0.006 10.251 0.000 0.048 0.071 pexp2 -0.0010 0.000 -7.636 0.000 -0.001 -0.001 ============================================================================== Omnibus: 58.208 Durbin-Watson: 1.515 Prob(Omnibus): 0.000 Jarque-Bera (JB): 126.140 Skew: 0.504 Prob(JB): 4.07e-28 Kurtosis: 4.864 Cond. No. 5.46e+03 ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1) [2] The condition number is large, 5.46e+03. This might indicate that there are strong multicollinearity or other numerical problems.
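Because potential experience enters both linearly and squared, its effect on log wage depends on the level of experience: the marginal effect is $\beta_2 + 2\beta_3\,\text{pexp}$. A small sketch of evaluating this from the stored coefficients, at a made-up value of 10 years:

# marginal effect of potential experience on log wage: b_pexp + 2*b_pexp2*pexp
b = resultsm.params
pexp_value = 10   # hypothetical level of potential experience
print(b['pexp'] + 2 * b['pexp2'] * pexp_value)   # roughly 0.04, i.e. about 4% per extra year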
Bonus Questions:
In lecture, we have seen that we can include interaction terms to allow differences in intercepts and differences in slopes across groups in the regression. How do we do this in practice?
- Create new variables and add them to the dataset. This method always works, and the code is similar to that in section 6.2.
- Alternatively, we can do this directly in the formula in smf.ols. Below we will show you both, and you will see that they are indeed equivalent.
To best illustrate this, we will conduct a case study on gender-wage differential.
Gender-Wage Differentials
We will run the following regression specification:
$$\text{hourly wage} = \beta_0 + \beta_1\text{educ} + \beta_2\text{female} + \beta_3(\text{educ}\times\text{female}) + \text{residual} $$
First, let's start with the direct method by creating the interaction term as a new variable, which we name educ_f.
df['educ_f'] = df['educ']*df['female']
df.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | resid | logwage | pexp | pexp2 | educ_f | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 16.0 | 16.0 | 11.935573 | 2.479523 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | 0.259346 | 0.0 | 0.0 | -6.295527 | 2.479523 | 10.251190 | 105.086896 | 16.0 |
1 | NaN | 16.0 | 16.0 | 9.208958 | 2.220177 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | -0.259346 | 0.0 | 0.0 | -9.022142 | 2.220177 | 10.251190 | 105.086896 | 16.0 |
2 | NaN | 12.0 | 16.0 | 9.283223 | 2.228209 | 43.570145 | 1 | 1 | 12.0 | 1898.35750 | -0.721318 | -6.0 | -4.0 | -1.241281 | 2.228209 | 24.570145 | 603.692025 | 12.0 |
3 | 1.0 | 18.0 | 12.0 | 19.096916 | 2.949527 | 43.570145 | 1 | 1 | 16.0 | 1898.35750 | 0.721318 | 6.0 | 4.0 | -2.987481 | 2.949527 | 18.570145 | 344.850285 | 18.0 |
4 | NaN | 12.0 | 12.0 | 15.447336 | 2.728481 | 30.983910 | 1 | 0 | 12.0 | 960.00262 | -0.129809 | 0.0 | 0.0 | 4.922832 | 2.737437 | 11.983910 | 143.614099 | 0.0 |
model_f1 = smf.ols(formula = "hrwage ~ educ + female + educ_f", data = df)
results_f1 = model_f1.fit(cov_type = 'HC1')
print(results_f1.summary())
OLS Regression Results ============================================================================== Dep. Variable: hrwage R-squared: 0.155 Model: OLS Adj. R-squared: 0.151 Method: Least Squares F-statistic: 29.80 Date: Sun, 02 Aug 2020 Prob (F-statistic): 4.28e-18 Time: 13:59:54 Log-Likelihood: -2630.6 No. Observations: 674 AIC: 5269. Df Residuals: 670 BIC: 5287. Df Model: 3 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -18.6312 7.473 -2.493 0.013 -33.277 -3.985 educ 2.5974 0.570 4.559 0.000 1.481 3.714 female 12.7693 7.906 1.615 0.106 -2.726 28.265 educ_f -1.3279 0.601 -2.210 0.027 -2.506 -0.150 ============================================================================== Omnibus: 601.899 Durbin-Watson: 1.518 Prob(Omnibus): 0.000 Jarque-Bera (JB): 19908.763 Skew: 3.914 Prob(JB): 0.00 Kurtosis: 28.449 Cond. No. 285. ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1)
Comments:
First, we created the interaction term as a new variable, just as before:
- df['educ_f'] asks python to add a column named educ_f to dataset df;
- df['educ']*df['female'] creates a new list of values by multiplying these two columns element by element;
- df['educ_f'] = df['educ']*df['female'] assigns these values to the new column.
Then, we run the regression as usual and include educ_f as a new regressor.
Second, we use the option provided directly by the function smf.ols():
model_f2 = smf.ols(formula = "hrwage ~ educ + female + educ:female", data = df)
results_f2 = model_f2.fit(cov_type = 'HC1')
print(results_f2.summary())
OLS Regression Results ============================================================================== Dep. Variable: hrwage R-squared: 0.155 Model: OLS Adj. R-squared: 0.151 Method: Least Squares F-statistic: 29.80 Date: Sun, 02 Aug 2020 Prob (F-statistic): 4.28e-18 Time: 13:59:54 Log-Likelihood: -2630.6 No. Observations: 674 AIC: 5269. Df Residuals: 670 BIC: 5287. Df Model: 3 Covariance Type: HC1 =============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------- Intercept -18.6312 7.473 -2.493 0.013 -33.277 -3.985 educ 2.5974 0.570 4.559 0.000 1.481 3.714 female 12.7693 7.906 1.615 0.106 -2.726 28.265 educ:female -1.3279 0.601 -2.210 0.027 -2.506 -0.150 ============================================================================== Omnibus: 601.899 Durbin-Watson: 1.518 Prob(Omnibus): 0.000 Jarque-Bera (JB): 19908.763 Skew: 3.914 Prob(JB): 0.00 Kurtosis: 28.449 Cond. No. 285. ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1)
Comments:
- In smf.ols(formula = "hrwage ~ educ + female + educ:female", data = df):
  - the interaction term is added directly as educ:female. The colon symbol : in the formula serves as the multiplication.
- Comparing the regression outputs from these two methods, they are entirely equivalent.
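As a quick sketch, you can also verify the equivalence in code by printing the estimated coefficients from both fits side by side:

# the two sets of estimates are identical; only the name of the interaction term differs
print(results_f1.params)
print(results_f2.params)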
Bonus Question: How would you interpret the regression coefficient on the interaction term?
We can also visualize this in a single plot:
sns.lmplot(x="educ", y="hrwage", hue = "female", ci= None, data=df)
plt.xlabel('years of education')
plt.ylabel('hourly wage')
plt.title('Gender Wage Differential')
plt.show()
In the code sns.lmplot(x="educ", y="hrwage", hue="female", ci= None, data=df):
- sns is calling on the library seaborn, and sns is the nickname we gave when we loaded the libraries.
- sns.lmplot() is calling the function lmplot() that comes with seaborn. Inside the parentheses of lmplot():
  - x="educ" specifies the variable on the x-axis
  - y="hrwage" specifies the variable on the y-axis
  - hue = "female" asks python to create separate plots based on the value of the variable female
  - ci = None indicates that we don't want the confidence bands around the regression line
  - data = df indicates that df is the dataset we are using
Does the graph match the regression results?
Recall our regression model
$$ Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \cdots + \beta_k X_{ki} + u_i $$
and we are interested in testing joint hypotheses, for example:
- $ H_0: \beta_0 = \beta_1 = \beta_2 = \cdots = \beta_k = 0 $ vs. $ H_1:~\text{at least one of the}~\beta_j\neq 0$
- $ H_0: \beta_2 = \beta_3 $ vs. $H_1: \beta_2 \neq\beta_3$
- $ H_0: \beta_1 = 2, \beta_2 = 3, \beta_3 = 5 $ vs. $H_1:~\text{at least one of the constraints in the null is false}$
How do we test hypotheses like these in Python? Luckily, statsmodels has a well-written function for this. We made up an example below to show you how to test such hypotheses.
Suppose we want to run the following specification:
$$ \text{log wage} = \beta_0 + \beta_1 \text{educ} + \beta_2 \text{potential experience} + \beta_3 (\text{potential experience})^2 + \beta_4 \text{female} + \beta_5 \text{white} + \text{residual} $$
and we want to test the hypotheses: $H_0: \beta_1 = 0, \beta_2 = 2, \beta_4 = \beta_5$ vs. $H_1:~\text{at least one of the constraints in the null is false}$.
model_f = smf.ols(formula = "logwage ~ educ + pexp + pexp2 + female + white", data = df)
results_f = model_f.fit(cov_type = 'HC1')
hypotheses = '(educ = 0), (pexp = 2), (female = white)'
ftest = results_f.f_test(hypotheses)
print(ftest)
<F test: F=array([[42246.69900786]]), p=0.0, df_denom=668, df_num=3>
Comments:
- results_f is the name we assign to the final object from our regression code. It contains all the information of that regression. results_f has many attributes. You have seen results.summary(), results.params, and results.resid, which give the summary of the regression, the regression coefficients, and the regression residuals respectively. results_f also has the function f_test, which serves the exact purpose of testing joint hypotheses.
- In hypotheses = '(educ = 0), (pexp = 2), (female = white)', we specify our null hypotheses:
  - (educ = 0) means the coefficient on educ is $0$, i.e. $\beta_1 = 0$
  - (pexp = 2) means the coefficient on pexp is $2$, i.e. $\beta_2 = 2$
  - (female = white) means the coefficient on female and the coefficient on white are the same, i.e. $\beta_4=\beta_5$
- We separate the constraints by commas, put them in quotation marks ' ', and assign the whole string to the name hypotheses.
- In ftest = results_f.f_test(hypotheses):
  - we pass the hypotheses we just created to the function results_f.f_test()
  - and assign the output to the name ftest
- print(ftest) prints the test result, including the value p, which is the p-value.
Suppose we want to do a White test for heteroskedasticity for the following regression:
$$ \text{log wage}_i = \beta_0 + \beta_1 \text{educ}_i + \beta_2 \text{potential experience}_i + u_i $$
Step 1: Run the main regression, and save the squared residuals.
model_w = smf.ols(formula = "logwage ~ educ + pexp", data = df)
results_w = model_w.fit(cov_type = 'HC1')
df['resid2'] = (results_w.resid)**2
df.head()
first | educ | educt | hrwage | lwage | age | white | female | educt_t | age2 | dlwage | deduc | deduct | resid | logwage | pexp | pexp2 | educ_f | resid2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 16.0 | 16.0 | 11.935573 | 2.479523 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | 0.259346 | 0.0 | 0.0 | -6.295527 | 2.479523 | 10.251190 | 105.086896 | 16.0 | 0.013168 |
1 | NaN | 16.0 | 16.0 | 9.208958 | 2.220177 | 33.251190 | 1 | 1 | 16.0 | 1105.64160 | -0.259346 | 0.0 | 0.0 | -9.022142 | 2.220177 | 10.251190 | 105.086896 | 16.0 | 0.139950 |
2 | NaN | 12.0 | 16.0 | 9.283223 | 2.228209 | 43.570145 | 1 | 1 | 12.0 | 1898.35750 | -0.721318 | -6.0 | -4.0 | -1.241281 | 2.228209 | 24.570145 | 603.692025 | 12.0 | 0.005237 |
3 | 1.0 | 18.0 | 12.0 | 19.096916 | 2.949527 | 43.570145 | 1 | 1 | 16.0 | 1898.35750 | 0.721318 | 6.0 | 4.0 | -2.987481 | 2.949527 | 18.570145 | 344.850285 | 18.0 | 0.003508 |
4 | NaN | 12.0 | 12.0 | 15.447336 | 2.728481 | 30.983910 | 1 | 0 | 12.0 | 960.00262 | -0.129809 | 0.0 | 0.0 | 4.922832 | 2.737437 | 11.983910 | 143.614099 | 0.0 | 0.428348 |
Comments:
- We created a new column named resid2 in the dataset df, whose values are the squared residuals from the regression.
- In (results_w.resid)**2:
  - results_w.resid is a column of all the residuals from the regression
  - **2 squares the values in results_w.resid
Step 2: Run the auxiliary regression
$$ u^2_i = \alpha_0 + \alpha_1 \text{educ}_i + \alpha_2 \text{potential experience}_i + \alpha_3\text{educ}^2_i + \alpha_4 \text{potential experience}^2_i + \alpha_5 (\text{potential experience}_i\times \text{educ}_i) + e_i $$
That is, regress the squared residuals on the regressors from the main regression, the squared regressors, and the interactions of the regressors.
We want to do an F-test of the null hypothesis $H_0: \alpha_1 = \alpha_2 = ... = \alpha_5 = 0 $. If the null is rejected, we conclude that heteroskedasticity is present.
df['educ2'] = (df['educ'])**2
df['pexp_educ'] = (df['pexp'])*(df['educ'])
model_aux = smf.ols(formula = "resid2 ~ educ + pexp + educ2 + pexp2 + pexp_educ", data = df)
results_aux = model_aux.fit(cov_type = 'HC1')
hypotheses = '(educ = 0), (pexp = 0), (educ2=0), (pexp2=0), (pexp_educ=0)'
ftest = results_aux.f_test(hypotheses)
print(ftest)
<F test: F=array([[3.41338605]]), p=0.004712653753891873, df_denom=668, df_num=5>
Comments:
- df['educ2'] = (df['educ'])**2 creates a new column of squared educ in the dataset df.
- df['pexp_educ'] = (df['pexp'])*(df['educ']) creates a new column of the interaction of educ and pexp in the dataset df.
- hypotheses = '(educ = 0), (pexp = 0), (educ2=0), (pexp2=0), (pexp_educ=0)' specifies the null hypothesis $H_0: \alpha_1 = \alpha_2 = ... = \alpha_5 = 0 $.
- ftest = results_aux.f_test(hypotheses) conducts the F-test of the hypothesis specified.
- print(ftest) prints the results of the F-test.
From the results of the F-test, we see that the p-value is $p = 0.004$, which is less than $1\%$, so we reject the null at the $1\%$ significance level and conclude that heteroskedasticity is present.
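As an optional aside (not required for this class), statsmodels also ships a built-in White test, het_white, which takes the residuals and the regressor matrix (including the constant) from the main regression; it should point to the same conclusion as the manual steps above. A minimal sketch:

from statsmodels.stats.diagnostic import het_white

# built-in White test using the residuals and design matrix from the main regression
lm_stat, lm_pvalue, f_stat, f_pvalue = het_white(results_w.resid, results_w.model.exog)
print(f_stat, f_pvalue)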
In their famous 1994 minimum wage study, Card and Krueger compared the employment rates in fast food restaurants in New Jersey and Pennsylvania before and after a minimum wage increase in New Jersey (while there was no such change in Pennsylvania). Their results suggest that, contrary to the prediction of the textbook model of minimum wage, the exogenous increase in the minimum wage did not reduce employment. Below, we are going to "replicate" this study using their original dataset, modified for simplicity.
In the did dataset:
- d is a binary variable such that d = 0 before the New Jersey minimum wage increase, and d = 1 after the New Jersey minimum wage increase.
- nj is a binary variable such that nj = 1 if the respondent is in New Jersey, and nj = 0 if in Pennsylvania.
- fte is the "full-time equivalent employment".
- d_nj is the interaction variable d * nj.
We run the baseline regression
$$ \text{fte}_i = \beta_0 + \beta_1 \text{d}_i + \beta_2 \text{nj}_i + \beta_3\text{d_nj}_i + u_i$$
As we showed in lecture, $\beta_3$ is the treatment effect. We can also include control variables in the DiD model. In the data, we have the following binary variables as controls. Note that each fast food restaurant can only belong to one location (centralj, southj, pa1, pa2) and one brand (bk, kfc, roys, wendys).
did = pd.read_csv("did.csv")
did.head()
CO_OWNED | SOUTHJ | CENTRALJ | PA1 | PA2 | DEMP | nj | bk | kfc | roys | wendys | d | d_nj | fte | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 12.00 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 15.00 |
1 | 0 | 0 | 1 | 0 | 0 | 6.50 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 15.00 |
2 | 0 | 0 | 1 | 0 | 0 | -1.00 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 24.00 |
3 | 1 | 0 | 0 | 0 | 0 | 2.25 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 19.25 |
4 | 0 | 0 | 0 | 0 | 0 | 13.00 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 21.50 |
did_model1 = smf.ols(formula = "fte ~ d + nj + d_nj", data = did)
did_results1 = did_model1.fit(cov_type = 'HC1')
print(did_results1.summary())
OLS Regression Results ============================================================================== Dep. Variable: fte R-squared: 0.007 Model: OLS Adj. R-squared: 0.004 Method: Least Squares F-statistic: 1.404 Date: Sun, 02 Aug 2020 Prob (F-statistic): 0.240 Time: 13:59:54 Log-Likelihood: -2904.2 No. Observations: 794 AIC: 5816. Df Residuals: 790 BIC: 5835. Df Model: 3 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 23.3312 1.346 17.337 0.000 20.694 25.969 d -2.1656 1.641 -1.320 0.187 -5.382 1.051 nj -2.8918 1.439 -2.010 0.044 -5.712 -0.072 d_nj 2.7536 1.795 1.534 0.125 -0.765 6.273 ============================================================================== Omnibus: 218.742 Durbin-Watson: 1.842 Prob(Omnibus): 0.000 Jarque-Bera (JB): 804.488 Skew: 1.268 Prob(JB): 2.03e-175 Kurtosis: 7.229 Cond. No. 11.3 ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1)
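Because this baseline specification is fully saturated in the two binary indicators, the coefficient on d_nj can also be recovered directly from the four group means. A small sketch of that check:

# difference-in-differences of the group means; matches the d_nj coefficient (about 2.75)
means = did.groupby(['nj', 'd'])['fte'].mean()
did_estimate = (means.loc[(1, 1)] - means.loc[(1, 0)]) - (means.loc[(0, 1)] - means.loc[(0, 0)])
print(did_estimate)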
did_model2 = smf.ols(formula = "fte ~ d + nj + d_nj + kfc + roys + wendys ", data = did)
did_results2 = did_model2.fit(cov_type = 'HC1')
print(did_results2.summary())
OLS Regression Results ============================================================================== Dep. Variable: fte R-squared: 0.194 Model: OLS Adj. R-squared: 0.188 Method: Least Squares F-statistic: 56.02 Date: Sun, 02 Aug 2020 Prob (F-statistic): 1.18e-57 Time: 14:00:30 Log-Likelihood: -2821.7 No. Observations: 794 AIC: 5657. Df Residuals: 787 BIC: 5690. Df Model: 6 Covariance Type: HC1 ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 25.7309 1.322 19.463 0.000 23.140 28.322 d -2.1946 1.432 -1.532 0.125 -5.002 0.613 nj -2.3205 1.272 -1.825 0.068 -4.813 0.172 d_nj 2.8140 1.578 1.783 0.075 -0.279 5.907 kfc -10.8112 0.642 -16.830 0.000 -12.070 -9.552 roys -2.2518 0.847 -2.659 0.008 -3.912 -0.592 wendys -1.1176 0.973 -1.149 0.251 -3.024 0.789 ============================================================================== Omnibus: 282.192 Durbin-Watson: 1.963 Prob(Omnibus): 0.000 Jarque-Bera (JB): 1652.747 Skew: 1.489 Prob(JB): 0.00 Kurtosis: 9.410 Cond. No. 11.7 ============================================================================== Warnings: [1] Standard Errors are heteroscedasticity robust (HC1)
Bonus Question: Why didn't we include Burger King bk in the regression?