A labor/urban economics project based on the methodology of Bayer, Ferreira, and McMillan (2007), which itself borrows from the well-known BLP family of methods.
This is my attempt to complete the replication project in Python instead of the typically used MATLAB.
A more recent and comprehensive implementation of BLP in Python is now available: check out the PyBLP package.
Consider the following location choice model: household $i$ chooses the neighborhood $j$ that maximizes the indirect utility
$$ V_{ij} = z_i'Bx_j + x_j'A + \xi_j + \epsilon_{ij}, $$
where $z_i$ is a vector of household characteristics (here, standardized log income and a college indicator), $x_j$ is a vector of neighborhood characteristics (school quality and price), $\xi_j$ is unobserved neighborhood quality, and $\epsilon_{ij}$ is an i.i.d. type I extreme value taste shock.
Our goal is to use data on households' choices and on neighborhood characteristics to estimate $A$ and $B$.
One of the difficulties of estimating all of the parameters directly by maximum-likelihood is the presence of $J$ nuisance parameters in $\xi_j$. We consider a two-step procedure described in the BFM paper where $B$ can be estimated in the first step, without having to estimate $A$ or $\xi_j$, and $A$ is estimated in the second step.
The indirect utility can be rewritten as
$$ V_{ij} = \underbrace{z_i'Bx_j}_{\lambda_{ij}} + \underbrace{x_j'A + \xi_j}_{\delta_j} + \epsilon_{ij} $$
We estimate $\lambda_{ij} := z_i'Bx_j$ and the fixed effects $\delta_j = x_j'A + \xi_j$ by maximum likelihood.
Then we form the log likelihood. Define $D_{ij} = 1\{i~\text{chooses}~j\}$ and let $P_{ij} = \frac{\exp(\lambda_{ij}+\delta_j)}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)}$ denote the logit choice probability; then $$LL = \sum_{i=1}^N\sum_{j=1}^JD_{ij}\log P_{ij}$$ but the number of parameters to solve for ($B$ plus the $J$ fixed effects $\delta_j$) can be large, which makes direct maximization computationally difficult.
Instead, we use a contraction mapping, based on the equilibrium condition, to solve for the $\delta_j$'s conditional on a guess of the other parameters:
Let $S_j:=$ share of households choosing $j$ in the data. Then the equilibrium condition suggests $$\frac{1}{N}\sum_{i=1}^N P_{ij} = S_j\quad \forall~j=1,\cdots, J $$
Given a guess of the vector of $\delta$'s, we look for the updated value $\delta_j'$ such that
$$\frac{1}{N}\sum_{i=1}^N \frac{\exp(\lambda_{ij}+\delta_j')}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)} = S_j\quad \forall~j=1,\cdots J$$
Note that we update one $j$ at a time. Multiplying and dividing by $\exp(\delta_j)$ inside the numerator (the usual trick), we have
$$ \frac{1}{N}\sum_{i=1}^N \frac{\exp(\lambda_{ij}+\delta_j' - \delta_j +\delta_j)}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)} = S_j $$
$$\implies \exp(\delta_j'-\delta_j)\frac{1}{N}\sum_{i=1}^N \frac{\exp(\lambda_{ij}+\delta_j)}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)} = S_j $$
$$\implies \delta_j' = \delta_j + \log(S_j) - \log(\frac{1}{N}\sum_{i=1}^N \frac{\exp(\lambda_{ij}+\delta_j)}{\sum_{h=1}^J \exp(\lambda_{ih}+\delta_h)})\quad(*)$$
and $(*)$ is the desired contraction mapping. Starting from an initial guess of the $\delta$'s, we iterate $(*)$ until it reaches a fixed point.
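As a quick sanity check, here is a minimal, self-contained sketch of iterating $(*)$ on synthetic data (the sizes and random draws below are made up purely for illustration): starting from $\delta = 0$, the iterates converge to a point whose implied average choice probabilities reproduce the target shares.
import numpy as np
rng = np.random.RandomState(0)
N_toy, J_toy = 500, 5
lam = rng.normal(size=(N_toy, J_toy))      # stand-in for lambda_ij = z_i'Bx_j
delta_true = rng.normal(size=J_toy)        # stand-in for the true delta_j
def avg_shares(lam, delta):
    # average logit choice probabilities (1/N) * sum_i P_ij
    u = np.exp(lam + delta)                # delta_j broadcasts across households
    return (u / u.sum(axis=1, keepdims=True)).mean(axis=0)
S = avg_shares(lam, delta_true)            # "observed" shares implied by delta_true
delta = np.zeros(J_toy)                    # initial guess
for _ in range(1000):
    delta_new = delta + np.log(S) - np.log(avg_shares(lam, delta))   # the update (*)
    if np.max(np.abs(delta_new - delta)) < 1e-12:
        break
    delta = delta_new
# delta is only pinned down up to an additive constant (there is no outside option),
# so we check the implied shares rather than delta itself
print(np.max(np.abs(avg_shares(lam, delta) - S)))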
So, in the actual estimation, we search over matrices $B$ to maximize the log likelihood; at each candidate $B$ we solve for $\delta$ using the contraction mapping above, and in the end we obtain the estimate $B^*$ that maximizes the likelihood together with the corresponding $\delta^*$.
The code for the first step is provided below:
# only the packages actually used below are imported
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.optimize as opt
from linearmodels.iv import IV2SLS, IVLIML
import time
# import household data # hh_df stands for household_data
###################################################
### change your directory here for replication. ###
###################################################
# hh_df = pd.read_csv('Desktop/Urban_Pset/household_data.csv')
hh_df = pd.read_csv('household_data.csv')
# add additional column of standardized loginc
hh_df['norm_inc'] = (hh_df.loginc - np.mean(hh_df.loginc))/np.std(hh_df.loginc)
N = hh_df['hhid'].count()
z = np.array(hh_df[['norm_inc','college','jchoice']])
# import neighborhood data # nbh_df stands for neighborhood data
###################################################
### change your directory here for replication. ###
###################################################
# nbh_df = pd.read_csv('Desktop/Urban_Pset/neighborhood_data.csv')
nbh_df = pd.read_csv('neighborhood_data.csv')
J = nbh_df['jid'].count()
x = np.array(nbh_df[['schqual','price','sharej','iv']])
# a for loop is too slow here; matrix operations are much faster
# create the N x J matrix of choice probabilities P
def P(B, z, x, d):
    # exp(lambda_ij + delta_j): z_i'Bx_j for the interacted characteristics plus the fixed effect
    Q = np.exp(z[:, 0:2] @ B @ x[:, 0:2].T + d)   # d (length J) broadcasts across rows
    return Q / Q.sum(axis=1, keepdims=True)       # row-normalize to logit probabilities
# the contraction mapping (*): update each delta_j using the observed shares S_j = x[:, 2]
def contraction(B, z, x, d, P):
    d1 = np.zeros(J)
    for j in range(J):
        d1[j] = d[j] + np.log(x[j, 2]) - np.log((1 / N) * np.sum(P[:, j]))
    return d1
# create the matrix of choice dummies D, with D_ij = 1{household i chose neighborhood j}
D = np.zeros(shape=(N, J))
for i in range(N):
    for j in range(J):
        D[i, j] = int(z[i, 2] == j + 1)   # jchoice is coded 1,...,J
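An equivalent, loop-free construction (assuming, as above, that jchoice is coded $1,\dots,J$) gives the same matrix:
# vectorized equivalent of the double loop above
D_alt = (z[:, 2][:, None] == np.arange(1, J + 1)).astype(float)
assert np.array_equal(D_alt, D)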
# log likelihood with the contraction mapping built in: for a given B, iterate (*) until
# the delta's match the observed shares, then evaluate the likelihood at that fixed point
def loglike(B, z, x, d):
    count = 0
    maxiter = 200
    tol = 1e-6
    while count < maxiter:
        Q = P(B, z, x, d)
        d1 = contraction(B, z, x, d, Q)
        err = np.max(np.abs(d1 - d))
        if err >= tol:
            count = count + 1
            d = d1
            continue
        elif err < tol:
            # converged: recompute P at the fixed point d1
            Q = P(B, z, x, d1)
            d_star = d1
            break
    return [np.sum(np.multiply(np.log(Q), D)), d_star]
#initial guess of delta and B
d = np.zeros(J)
B = np.array([[1,1],[1,1]])
start = time.time()
P(B,z,x,d)
end = time.time()
print(end - start)
# takes about 0.04 secs, good.
0.04052305221557617
# check runtime of each different B
start = time.time()
loglike(B,z,x,d)
end = time.time()
print(end - start)
# takes about 2 secs for each B now. much better
2.0972378253936768
def bmin(params):
    i, j, k, l = params
    return -loglike(np.array([[i, j], [k, l]]), z, x, d)[0]
b0 = [1/2,1/2,1/2,1/2]
b0
[0.5, 0.5, 0.5, 0.5]
##############
# WARNING!!!!#
##############
# this may take several minutes: scipy's fmin uses the Nelder-Mead simplex method, which is
# not the same algorithm as MATLAB's fminunc (a quasi-Newton method), so runtimes differ
params = opt.fmin(bmin, b0, ftol = 1e-6, maxiter = 200)
Optimization terminated successfully.
         Current function value: 49576.603724
         Iterations: 128
         Function evaluations: 218
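As an aside, the same search can be run through the newer scipy.optimize.minimize interface. This is only a sketch of an alternative, not what was run above, and the tolerances are illustrative:
# hypothetical alternative: Nelder-Mead via scipy.optimize.minimize
res = opt.minimize(bmin, b0, method='Nelder-Mead',
                   options={'xatol': 1e-6, 'fatol': 1e-6, 'maxiter': 500})
params_alt = res.x   # should be close to the fmin result used below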
Bhat = np.array([params[0:2],params[2:4]])
Bhat
array([[ 1.05151504,  0.18173876],
       [ 0.48293352,  0.09102158]])
dhat = loglike(Bhat,z,x,d)[1]
# add the dhat to the original neighborhood dataframe for IV analysis
nbh_df['dhat'] = dhat
Once we have obtained the $\delta_j$'s, we can estimate the remaining parameters, i.e., $A$: $$ \delta_j = x_j'A + \xi_j $$
Since $x_j$ contains the price $p_j$ of living in location $j$, prices are very likely correlated with the unobservable location characteristics $\xi_j$.
With a valid instrumental variable for prices (provided in the data), we can estimate the parameters $A$ using standard 2SLS or Garen's control function (CF) approach.
# IV to recover A:
mod = IVLIML.from_formula('dhat ~ 1 + schqual + [price ~ iv]', nbh_df)
result = mod.fit()
print(result.summary)
                          IV-LIML Estimation Summary
==============================================================================
Dep. Variable:                   dhat   R-squared:                      0.4139
Estimator:                    IV-LIML   Adj. R-squared:                 0.4080
No. Observations:                 200   F-statistic:                    344.65
Date:                Sun, Mar 03 2019   P-value (F-stat)                0.0000
Time:                        13:16:17   Distribution:                  chi2(2)
Cov. Estimator:                robust

                             Parameter Estimates
==============================================================================
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.5720     0.0498     11.497     0.0000      0.4745      0.6695
schqual        1.0084     0.0969     10.406     0.0000      0.8185      1.1983
price         -0.9642     0.0641    -15.050     0.0000     -1.0898     -0.8386
==============================================================================

Endogenous: price
Instruments: iv
Robust Covariance (Heteroskedastic)
Debiased: False
Kappa: 1.000
Ahat = np.array(result.params[['schqual','price']])
Ahat
array([ 1.00841337, -0.96421125])
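Since the model is just identified here (one instrument, iv, for one endogenous regressor, price), LIML and 2SLS coincide numerically; note Kappa: 1.000 in the summary above. A quick cross-check with IV2SLS, which should reproduce the same point estimates:
# cross-check: 2SLS gives identical estimates in the just-identified case
mod_2sls = IV2SLS.from_formula('dhat ~ 1 + schqual + [price ~ iv]', nbh_df)
print(mod_2sls.fit().params)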
Compute estimates of $\xi_j$ for each $j$ and create a scatter plot of $\xi_j$ against the price $p_j$.
# add xi to the dataframe
nbh_df['xihat'] = result.resids
# plot price and unobservables xi
sns.set()
ax = sns.scatterplot(x='xihat', y='price', data=nbh_df)
plt.show()
Now suppose that the coefficient on school quality in $A$ doubles, while the coefficient in $B$ on the interaction of log-income and school quality increases by a factor of $1.5$.
Calculate the new equilibrium prices, normalizing $p_1$ to be the same in the old and the new equilibrium.
Create a scatter plot showing how price relates to school quality in both the new and the old equilibrium.
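The price fixed point used in the code below follows from the same logic as $(*)$: holding $\xi_j$ and school quality fixed, a change in $\delta_j$ can only operate through the price term $A_p\,p_j$, where $A_p$ is the estimated price coefficient in $A$ (the entry A[1] in the code). Substituting $\delta_j' - \delta_j = A_p(p_j' - p_j)$ into $(*)$ gives
$$ p_j' = p_j - \frac{1}{A_p}\left(\log\Big(\frac{1}{N}\sum_{i=1}^N P_{ij}\Big) - \log S_j\right) $$
which is exactly the update implemented in p_contract below.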
# the parameter on the interaction of log-income and school quality is B[0,0] (first row, first column)
B_counter = Bhat * np.array([[1.5,1],[1,1]])
print(B_counter)
A_counter = Ahat*2
print(A_counter)
[[ 1.57727256  0.18173876]
 [ 0.48293352  0.09102158]]
[ 2.01682673 -1.9284225 ]
xihat = np.asarray(result.resids)   # hold the estimated unobservables fixed
p0 = np.zeros(J)                    # initial price vector for the runtime check below
# define a new contraction mapping for the prices.
# we assume the xi_j are held fixed in the counterfactual: unobserved neighborhood
# quality does not adjust to the new equilibrium prices once it is known.
def p_contract(B, z, x, p, A):
    p1 = np.zeros(J)
    d = A[0] * x[:, 0] + A[1] * p + xihat          # delta_j at the candidate prices
    x1 = np.column_stack((x[:, 0], p, x[:, 2]))    # swap the observed prices for the candidates
    Q = P(B, z, x1, d)
    for j in range(J):
        p1[j] = p[j] - (1 / A[1]) * (np.log((1 / N) * np.sum(Q[:, j])) - np.log(x1[j, 2]))
    return p1
# check the runtime
start = time.time()
p_contract(B_counter, z, x, p0, A_counter)
end = time.time()
print(end - start)
0.49749279022216797
# find the new equilibrium prices
count = 0
maxiter = 200
tol = 1e-6
# p = x[:,1]  # using the realized prices as the initial guess
p = np.zeros(J) + 1   # the initial values seem to matter a lot here
while count < maxiter:
    p2 = p_contract(B_counter, z, x, p, A_counter)
    err = np.max(np.abs(p2 - p))
    count = count + 1
    if err >= tol:
        p = p2
        continue
    elif err < tol:
        p_hat = p2
        print(count)
        break
29
# counterfactual p, rescaled so that p_1 matches the observed price of neighborhood 1
p_star = p_hat * (x[0, 1] / p_hat[0])
# add the counterfactual p to the dataframe
nbh_df['counter_p'] = p_star
# plot price against school quality, data vs. counterfactual
current_palette = sns.color_palette()
sns.set()
ax1 = sns.scatterplot(x='schqual', y='price', data=nbh_df, marker='o',
                      color='lightskyblue', label='data')
ax2 = sns.scatterplot(x='schqual', y='counter_p', data=nbh_df, marker='+',
                      color='darkorange', label='counterfactual')
plt.xlim((-2, 2))
plt.ylim((-4, 8))
ax1.legend(loc='upper left')   # ax1 and ax2 refer to the same Axes, so one legend call suffices
plt.xlabel('school quality')
plt.ylabel('price')
plt.show()