A labor/urban economics project based on the methodology of Bayer, Ferreira, and McMillan (2007), which itself borrows from the well-known BLP family of methods.
This is my attempt to complete the replication project in Python instead of the typically used MATLAB.
A more recent and comprehensive implementation of BLP in Python is now available: check out the PyBLP package.
Consider the following location choice model: household $i$ chooses the neighborhood $j$ that maximizes the indirect utility
$$ V_{ij} = z_i'Bx_j + x_j'A + \xi_j + \epsilon_{ij}, $$
where $z_i$ is a vector of household characteristics (here, standardized log income and a college indicator), $x_j$ is a vector of neighborhood characteristics (school quality and price), $\xi_j$ is unobserved neighborhood quality, and $\epsilon_{ij}$ is an i.i.d. type I extreme value taste shock.
Our goal is to use data on households' choices and on neighborhood characteristics to estimate $A$ and $B$.
One of the difficulties of estimating all of the parameters directly by maximum-likelihood is the presence of $J$ nuisance parameters in $\xi_j$. We consider a two-step procedure described in the BFM paper where $B$ can be estimated in the first step, without having to estimate $A$ or $\xi_j$, and $A$ is estimated in the second step.
The indirect utility can be rewritten as
$$ V_{ij} = \underbrace{z_i'Bx_j}_{\lambda_{ij}} + \underbrace{x_j'A + \xi_j}_{\delta_j} + \epsilon_{ij} $$
We estimate $\lambda_{ij} := z_i'Bx_j$ and the fixed effects $\delta_j = x_j'A + \xi_j$ by maximum likelihood.
Then we form the log likelihood. Define $D_{ij} = 1\{i~\text{chooses}~j\}$ and let $P_{ij} = \frac{\exp(\lambda_{ij}+\delta_j)}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)}$ denote the logit choice probability; then $$LL = \sum_{i=1}^N\sum_{j=1}^JD_{ij}\log P_{ij}$$ but the number of parameters to solve for ($B$ plus the $J$ fixed effects $\delta_j$) can be large, which makes direct maximization computationally difficult.
Instead, we use a contraction mapping, based on the equilibrium condition, to solve for the $\delta_j$'s conditional on a guess of the other parameters:
Let $S_j:=$ share of households choosing $j$ in the data. Then the equilibrium condition suggests $$\frac{1}{N}\sum_{i=1}^N P_{ij} = S_j\quad \forall~j=1,\cdots, J $$
Given a guess of the vector of $\delta$'s, we look for the updated value $\delta_j'$ such that
$$\frac{1}{N}\sum_{i=1}^N \frac{\exp(\lambda_{ij}+\delta_j')}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)} = S_j\quad \forall~j=1,\cdots J$$
Note that we update one $j$ at a time. Multiplying and dividing by $\exp(\delta_j)$ inside the numerator (the usual trick), we have
$$ \frac{1}{N}\sum_{i=1}^N \frac{\exp(\lambda_{ij}+\delta_j' - \delta_j +\delta_j)}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)} = S_j $$
$$\implies \exp(\delta_j'-\delta_j)\frac{1}{N}\sum_{i=1}^N \frac{\exp(\lambda_{ij}+\delta_j)}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)} = S_j $$
$$\implies \delta_j' = \delta_j + \log(S_j) - \log(\frac{1}{N}\sum_{i=1}^N \frac{\exp(\lambda_{ij}+\delta_j)}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)})\quad(*)$$
and $(*)$ is the desired contraction mapping. Starting from an initial guess of the $\delta$'s, we iterate $(*)$ until it reaches a fixed point.
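As a quick sanity check, here is a minimal, self-contained sketch of iterating $(*)$ on synthetic data (the sizes and random draws below are made up purely for illustration): starting from $\delta = 0$, the iterates converge to a point whose implied average choice probabilities reproduce the target shares.
import numpy as np
rng = np.random.RandomState(0)
N_toy, J_toy = 500, 5
lam = rng.normal(size=(N_toy, J_toy))      # stand-in for lambda_ij = z_i'Bx_j
delta_true = rng.normal(size=J_toy)        # stand-in for the true delta_j
def avg_shares(lam, delta):
    # average logit choice probabilities (1/N) * sum_i P_ij
    u = np.exp(lam + delta)                # delta_j broadcasts across households
    return (u / u.sum(axis=1, keepdims=True)).mean(axis=0)
S = avg_shares(lam, delta_true)            # "observed" shares implied by delta_true
delta = np.zeros(J_toy)                    # initial guess
for _ in range(1000):
    delta_new = delta + np.log(S) - np.log(avg_shares(lam, delta))   # the update (*)
    if np.max(np.abs(delta_new - delta)) < 1e-12:
        break
    delta = delta_new
# delta is only pinned down up to an additive constant (there is no outside option),
# so we check the implied shares rather than delta itself
print(np.max(np.abs(avg_shares(lam, delta) - S)))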
So, in the actual estimation, we search over matrices $B$ to maximize the log likelihood; at each candidate $B$ we solve for $\delta$ using the contraction mapping above, and in the end we obtain the estimate $B^*$ that maximizes the likelihood together with the corresponding $\delta^*$.
The code for the first step is provided below:
# only the packages actually used below are imported
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.optimize as opt
from linearmodels.iv import IV2SLS, IVLIML
import time
# import household data # hh_df stands for household_data
###################################################
### change your directory here for replication. ###
###################################################
# hh_df = pd.read_csv('Desktop/Urban_Pset/household_data.csv')
hh_df = pd.read_csv('household_data.csv')
# add additional column of standardized loginc
hh_df['norm_inc'] = (hh_df.loginc - np.mean(hh_df.loginc))/np.std(hh_df.loginc)
N = hh_df['hhid'].count()
z = np.array(hh_df[['norm_inc','college','jchoice']])
# import neighborhood data # nbh_df stands for neighborhood data
###################################################
### change your directory here for replication. ###
###################################################
# nbh_df = pd.read_csv('Desktop/Urban_Pset/neighborhood_data.csv')
nbh_df = pd.read_csv('neighborhood_data.csv')
J = nbh_df['jid'].count()
x = np.array(nbh_df[['schqual','price','sharej','iv']])
# a for loop is too slow here; matrix operations are much faster
# create the N x J matrix of choice probabilities P
def P(B, z, x, d):
    # exp(lambda_ij + delta_j): z_i'Bx_j for the interacted characteristics plus the fixed effect
    Q = np.exp(z[:, 0:2] @ B @ x[:, 0:2].T + d)   # d (length J) broadcasts across rows
    return Q / Q.sum(axis=1, keepdims=True)       # row-normalize to logit probabilities
# the contraction mapping (*): update each delta_j using the observed shares S_j = x[:, 2]
def contraction(B, z, x, d, P):
    d1 = np.zeros(J)
    for j in range(J):
        d1[j] = d[j] + np.log(x[j, 2]) - np.log((1 / N) * np.sum(P[:, j]))
    return d1
# create the matrix of choice dummies D, with D_ij = 1{household i chose neighborhood j}
D = np.zeros(shape=(N, J))
for i in range(N):
    for j in range(J):
        D[i, j] = int(z[i, 2] == j + 1)   # jchoice is coded 1,...,J
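An equivalent, loop-free construction (assuming, as above, that jchoice is coded $1,\dots,J$) gives the same matrix:
# vectorized equivalent of the double loop above
D_alt = (z[:, 2][:, None] == np.arange(1, J + 1)).astype(float)
assert np.array_equal(D_alt, D)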
# log likelihood with the contraction mapping built in: for a given B, iterate (*) until
# the delta's match the observed shares, then evaluate the likelihood at that fixed point
def loglike(B, z, x, d):
    count = 0
    maxiter = 200
    tol = 1e-6
    while count < maxiter:
        Q = P(B, z, x, d)
        d1 = contraction(B, z, x, d, Q)
        err = np.max(np.abs(d1 - d))
        if err >= tol:
            count = count + 1
            d = d1
            continue
        elif err < tol:
            # converged: recompute P at the fixed point d1
            Q = P(B, z, x, d1)
            d_star = d1
            break
    return [np.sum(np.multiply(np.log(Q), D)), d_star]
#initial guess of delta and B
d = np.zeros(J)
B = np.array([[1,1],[1,1]])
start = time.time()
P(B,z,x,d)
end = time.time()
print(end - start)
# takes about 0.04 secs, good.
0.04052305221557617
# check runtime of each different B
start = time.time()
loglike(B,z,x,d)
end = time.time()
print(end - start)
# takes about 2 secs for each B now. much better
2.0972378253936768
def bmin(params):
    i, j, k, l = params
    return -loglike(np.array([[i, j], [k, l]]), z, x, d)[0]
b0 = [1/2,1/2,1/2,1/2]
b0
[0.5, 0.5, 0.5, 0.5]
##############
# WARNING!!!!#
##############
# this may take several minutes: scipy's fmin uses the Nelder-Mead simplex method, which is
# not the same algorithm as MATLAB's fminunc (a quasi-Newton method), so runtimes differ
params = opt.fmin(bmin, b0, ftol = 1e-6, maxiter = 200)
Optimization terminated successfully.
         Current function value: 49576.603724
         Iterations: 128
         Function evaluations: 218
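As an aside, the same search can be run through the newer scipy.optimize.minimize interface. This is only a sketch of an alternative, not what was run above, and the tolerances are illustrative:
# hypothetical alternative: Nelder-Mead via scipy.optimize.minimize
res = opt.minimize(bmin, b0, method='Nelder-Mead',
                   options={'xatol': 1e-6, 'fatol': 1e-6, 'maxiter': 500})
params_alt = res.x   # should be close to the fmin result used below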
Bhat = np.array([params[0:2],params[2:4]])
Bhat
array([[ 1.05151504,  0.18173876],
       [ 0.48293352,  0.09102158]])
dhat = loglike(Bhat,z,x,d)[1]
# add the dhat to the original neighborhood dataframe for IV analysis
nbh_df['dhat'] = dhat
Once we have obtained the $\delta_j$'s, we can estimate the remaining parameters, i.e., $A$: $$ \delta_j = x_j'A + \xi_j $$
Since $x_j$ contains the price $p_j$ of living in location $j$, prices are very likely correlated with the unobservable location characteristics $\xi_j$.
With a valid instrumental variable for prices (provided in the data), we can estimate the parameters $A$ using standard 2SLS or Garen's control function (CF) approach.
# IV to recover A:
mod = IVLIML.from_formula('dhat ~ 1 + schqual + [price ~ iv]', nbh_df)
result = mod.fit()
print(result.summary)
                          IV-LIML Estimation Summary
==============================================================================
Dep. Variable:                   dhat   R-squared:                      0.4139
Estimator:                    IV-LIML   Adj. R-squared:                 0.4080
No. Observations:                 200   F-statistic:                    344.65
Date:                Sun, Mar 03 2019   P-value (F-stat)                0.0000
Time:                        13:16:17   Distribution:                  chi2(2)
Cov. Estimator:                robust

                             Parameter Estimates
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.5720     0.0498     11.497     0.0000      0.4745      0.6695
schqual        1.0084     0.0969     10.406     0.0000      0.8185      1.1983
price         -0.9642     0.0641    -15.050     0.0000     -1.0898     -0.8386
==============================================================================

Endogenous: price
Instruments: iv
Robust Covariance (Heteroskedastic)
Debiased: False
Kappa: 1.000
Ahat = np.array(result.params[['schqual','price']])
Ahat
array([ 1.00841337, -0.96421125])
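Since the model is just identified here (one instrument, iv, for one endogenous regressor, price), LIML and 2SLS coincide numerically; note Kappa: 1.000 in the summary above. A quick cross-check with IV2SLS, which should reproduce the same point estimates:
# cross-check: 2SLS gives identical estimates in the just-identified case
mod_2sls = IV2SLS.from_formula('dhat ~ 1 + schqual + [price ~ iv]', nbh_df)
print(mod_2sls.fit().params)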
Compute estimates of $\xi_j$ for each $j$ and create a scatter plot of $\xi_j$ against the price $p_j$.
# add xi to the dataframe
nbh_df['xihat'] = result.resids
# plot price and unobservables xi
sns.set()
ax = sns.scatterplot(x='xihat', y='price', data=nbh_df)
plt.show()
Now suppose that the coefficient on school quality in $A$ doubles, while the coefficient in $B$ on the interaction of log-income and school quality increases by a factor of $1.5$.
Calculate the new equilibrium prices, normalizing $p_1$ to be the same in the old and the new equilibrium.
Create a scatter plot showing how price relates to school quality in both the new and the old equilibrium.
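The price fixed point used in the code below follows from the same logic as $(*)$: holding $\xi_j$ and school quality fixed, a change in $\delta_j$ can only operate through the price term $A_p\,p_j$, where $A_p$ is the estimated price coefficient in $A$ (the entry A[1] in the code). Substituting $\delta_j' - \delta_j = A_p(p_j' - p_j)$ into $(*)$ gives
$$ p_j' = p_j - \frac{1}{A_p}\left(\log\Big(\frac{1}{N}\sum_{i=1}^N P_{ij}\Big) - \log S_j\right) $$
which is exactly the update implemented in p_contract below.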
# the parameter on the interaction of log-income and school quality is B[0,0] (first row, first column)
B_counter = Bhat * np.array([[1.5,1],[1,1]])
print(B_counter)
A_counter = Ahat*2
print(A_counter)
[[ 1.57727256  0.18173876]
 [ 0.48293352  0.09102158]]
[ 2.01682673 -1.9284225 ]
xihat = np.asarray(result.resids)   # hold the estimated unobservables fixed
p0 = np.zeros(J)                    # initial price vector for the runtime check below
# define a new contraction mapping for the prices.
# we assume the xi_j are held fixed in the counterfactual: unobserved neighborhood
# quality does not adjust to the new equilibrium prices once it is known.
def p_contract(B, z, x, p, A):
    p1 = np.zeros(J)
    d = A[0] * x[:, 0] + A[1] * p + xihat          # delta_j at the candidate prices
    x1 = np.column_stack((x[:, 0], p, x[:, 2]))    # swap the observed prices for the candidates
    Q = P(B, z, x1, d)
    for j in range(J):
        p1[j] = p[j] - (1 / A[1]) * (np.log((1 / N) * np.sum(Q[:, j])) - np.log(x1[j, 2]))
    return p1
# check the runtime
start = time.time()
p_contract(B_counter, z, x, p0, A_counter)
end = time.time()
print(end - start)
0.49749279022216797
# find the new equilibrium prices
count = 0
maxiter = 200
tol = 1e-6
# p = x[:,1]  # using the realized prices as the initial guess
p = np.zeros(J) + 1   # the initial values seem to matter a lot here
while count < maxiter:
    p2 = p_contract(B_counter, z, x, p, A_counter)
    err = np.max(np.abs(p2 - p))
    count = count + 1
    if err >= tol:
        p = p2
        continue
    elif err < tol:
        p_hat = p2
        print(count)
        break
29
# counterfactual p, rescaled so that p_1 matches the observed price of neighborhood 1
p_star = p_hat * (x[0, 1] / p_hat[0])
# add the counterfactual p to the dataframe
nbh_df['counter_p'] = p_star
# plot price against school quality, data vs. counterfactual
current_palette = sns.color_palette()
sns.set()
ax1 = sns.scatterplot(x='schqual', y='price', data=nbh_df, marker='o',
                      color='lightskyblue', label='data')
ax2 = sns.scatterplot(x='schqual', y='counter_p', data=nbh_df, marker='+',
                      color='darkorange', label='counterfactual')
plt.xlim((-2, 2))
plt.ylim((-4, 8))
ax1.legend(loc='upper left')   # ax1 and ax2 refer to the same Axes, so one legend call suffices
plt.xlabel('school quality')
plt.ylabel('price')
plt.show()