Logo

M-Estimators for Robust Linear ModelingΒΆ

Link to Notebook GitHub

In [1]:
from __future__ import print_function
from statsmodels.compat import lmap
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

import statsmodels.api as sm
  • An M-estimator minimizes the function

$$Q(e_i, \rho) = \sum_i~\rho \left (\frac{e_i}{s}\right )$$

where $\rho$ is a symmetric function of the residuals

  • The effect of $\rho$ is to reduce the influence of outliers
  • $s$ is an estimate of scale.
  • The robust estimates $\hat{\beta}$ are computed by the iteratively re-weighted least squares algorithm
  • We have several choices available for the weighting functions to be used
In [2]:
norms = sm.robust.norms
In [3]:
def plot_weights(support, weights_func, xlabels, xticks):
    fig = plt.figure(figsize=(12,8))
    ax = fig.add_subplot(111)
    ax.plot(support, weights_func(support))
    ax.set_xticks(xticks)
    ax.set_xticklabels(xlabels, fontsize=16)
    ax.set_ylim(-.1, 1.1)
    return ax

Andrew's Wave

In [4]:
help(norms.AndrewWave.weights)
Help on method weights in module statsmodels.robust.norms:

weights(self, z) unbound statsmodels.robust.norms.AndrewWave method
    Andrew's wave weighting function for the IRLS algorithm

    The psi function scaled by z

    Parameters
    ----------
    z : array-like
        1d array

    Returns
    -------
    weights : array
        weights(z) = sin(z/a)/(z/a)     for \|z\| <= a*pi

        weights(z) = 0                  for \|z\| > a*pi


In [5]:
a = 1.339
support = np.linspace(-np.pi*a, np.pi*a, 100)
andrew = norms.AndrewWave(a=a)
plot_weights(support, andrew.weights, ['$-\pi*a$', '0', '$\pi*a$'], [-np.pi*a, 0, np.pi*a]);
<matplotlib.figure.Figure at 0x7f00911a6b90>

Hampel's 17A

In [6]:
help(norms.Hampel.weights)
Help on method weights in module statsmodels.robust.norms:

weights(self, z) unbound statsmodels.robust.norms.Hampel method
    Hampel weighting function for the IRLS algorithm

    The psi function scaled by z

    Parameters
    ----------
    z : array-like
        1d array

    Returns
    -------
    weights : array
        weights(z) = 1                            for \|z\| <= a

        weights(z) = a/\|z\|                        for a < \|z\| <= b

        weights(z) = a*(c - \|z\|)/(\|z\|*(c-b))      for b < \|z\| <= c

        weights(z) = 0                            for \|z\| > c


In [7]:
c = 8
support = np.linspace(-3*c, 3*c, 1000)
hampel = norms.Hampel(a=2., b=4., c=c)
plot_weights(support, hampel.weights, ['3*c', '0', '3*c'], [-3*c, 0, 3*c]);
<matplotlib.figure.Figure at 0x7f00923eead0>

Huber's t

In [8]:
help(norms.HuberT.weights)
Help on method weights in module statsmodels.robust.norms:

weights(self, z) unbound statsmodels.robust.norms.HuberT method
    Huber's t weighting function for the IRLS algorithm

    The psi function scaled by z

    Parameters
    ----------
    z : array-like
        1d array

    Returns
    -------
    weights : array
        weights(z) = 1          for \|z\| <= t

        weights(z) = t/\|z\|      for \|z\| > t


In [9]:
t = 1.345
support = np.linspace(-3*t, 3*t, 1000)
huber = norms.HuberT(t=t)
plot_weights(support, huber.weights, ['-3*t', '0', '3*t'], [-3*t, 0, 3*t]);
<matplotlib.figure.Figure at 0x7f00921aba10>

Least Squares

In [10]:
help(norms.LeastSquares.weights)
Help on method weights in module statsmodels.robust.norms:

weights(self, z) unbound statsmodels.robust.norms.LeastSquares method
    The least squares estimator weighting function for the IRLS algorithm.

    The psi function scaled by the input z

    Parameters
    ----------
    z : array-like
        1d array

    Returns
    -------
    weights : array
        weights(z) = np.ones(z.shape)


In [11]:
support = np.linspace(-3, 3, 1000)
lst_sq = norms.LeastSquares()
plot_weights(support, lst_sq.weights, ['-3', '0', '3'], [-3, 0, 3]);
<matplotlib.figure.Figure at 0x7f00911b1090>

Ramsay's Ea

In [12]:
help(norms.RamsayE.weights)
Help on method weights in module statsmodels.robust.norms:

weights(self, z) unbound statsmodels.robust.norms.RamsayE method
    Ramsay's Ea weighting function for the IRLS algorithm

    The psi function scaled by z

    Parameters
    ----------
    z : array-like
        1d array

    Returns
    -------
    weights : array
        weights(z) = exp(-a*\|z\|)


In [13]:
a = .3
support = np.linspace(-3*a, 3*a, 1000)
ramsay = norms.RamsayE(a=a)
plot_weights(support, ramsay.weights, ['-3*a', '0', '3*a'], [-3*a, 0, 3*a]);
<matplotlib.figure.Figure at 0x7f00923e1410>

Trimmed Mean

In [14]:
help(norms.TrimmedMean.weights)
Help on method weights in module statsmodels.robust.norms:

weights(self, z) unbound statsmodels.robust.norms.TrimmedMean method
    Least trimmed mean weighting function for the IRLS algorithm

    The psi function scaled by z

    Parameters
    ----------
    z : array-like
        1d array

    Returns
    -------
    weights : array
        weights(z) = 1             for \|z\| <= c

        weights(z) = 0             for \|z\| > c


In [15]:
c = 2
support = np.linspace(-3*c, 3*c, 1000)
trimmed = norms.TrimmedMean(c=c)
plot_weights(support, trimmed.weights, ['-3*c', '0', '3*c'], [-3*c, 0, 3*c]);
<matplotlib.figure.Figure at 0x7f0091842710>

Tukey's Biweight

In [16]:
help(norms.TukeyBiweight.weights)
Help on method weights in module statsmodels.robust.norms:

weights(self, z) unbound statsmodels.robust.norms.TukeyBiweight method
    Tukey's biweight weighting function for the IRLS algorithm

    The psi function scaled by z

    Parameters
    ----------
    z : array-like
        1d array

    Returns
    -------
    weights : array
        psi(z) = (1 - (z/c)**2)**2          for \|z\| <= R

        psi(z) = 0                          for \|z\| > R


In [17]:
c = 4.685
support = np.linspace(-3*c, 3*c, 1000)
tukey = norms.TukeyBiweight(c=c)
plot_weights(support, tukey.weights, ['-3*c', '0', '3*c'], [-3*c, 0, 3*c]);
<matplotlib.figure.Figure at 0x7f0091499390>

Scale Estimators

  • Robust estimates of the location
In [18]:
x = np.array([1, 2, 3, 4, 500])
  • The mean is not a robust estimator of location
In [19]:
x.mean()
Out[19]:
102.0
  • The median, on the other hand, is a robust estimator with a breakdown point of 50%
In [20]:
np.median(x)
Out[20]:
3.0
  • Analagously for the scale
  • The standard deviation is not robust
In [21]:
x.std()
Out[21]:
199.00251254695254

Median Absolute Deviation

$$ median_i |X_i - median_j(X_j)|) $$

Standardized Median Absolute Deviation is a consistent estimator for $\hat{\sigma}$

$$\hat{\sigma}=K \cdot MAD$$

where $K$ depends on the distribution. For the normal distribution for example,

$$K = \Phi^{-1}(.75)$$

In [22]:
stats.norm.ppf(.75)
Out[22]:
0.67448975019608171
In [23]:
print(x)
[  1   2   3   4 500]

In [24]:
sm.robust.scale.stand_mad(x)
/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/robust/scale.py:49: FutureWarning: stand_mad is deprecated and will be removed in 0.7.0. Use mad instead.
  "instead.", FutureWarning)

Out[24]:
1.482602218505602
In [25]:
np.array([1,2,3,4,5.]).std()
Out[25]:
1.4142135623730951
  • The default for Robust Linear Models is MAD
  • another popular choice is Huber's proposal 2
In [26]:
np.random.seed(12345)
fat_tails = stats.t(6).rvs(40)
In [27]:
kde = sm.nonparametric.KDE(fat_tails)
kde.fit()
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax.plot(kde.support, kde.density);
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-275-605f8dff1422> in <module>()
----> 1 kde = sm.nonparametric.KDE(fat_tails)
      2 kde.fit()
      3 fig = plt.figure(figsize=(12,8))
      4 ax = fig.add_subplot(111)
      5 ax.plot(kde.support, kde.density);

AttributeError: 'module' object has no attribute 'KDE'
In [28]:
print(fat_tails.mean(), fat_tails.std())
0.0688231044811 1.34716332297

In [29]:
print(stats.norm.fit(fat_tails))
(0.068823104481087458, 1.3471633229698652)

In [30]:
print(stats.t.fit(fat_tails, f0=6))
(6, 0.039009187170278126, 1.0564230978488927)

In [31]:
huber = sm.robust.scale.Huber()
loc, scale = huber(fat_tails)
print(loc, scale)
0.0404898433327 1.15571400476

In [32]:
sm.robust.stand_mad(fat_tails)
Out[32]:
1.1153350011654151
In [33]:
sm.robust.stand_mad(fat_tails, c=stats.t(6).ppf(.75))
Out[33]:
1.0483916565928972
In [34]:
sm.robust.scale.mad(fat_tails)
Out[34]:
1.1153350011654151

Duncan's Occupational Prestige data - M-estimation for outliers

In [35]:
from statsmodels.graphics.api import abline_plot
from statsmodels.formula.api import ols, rlm
In [36]:
prestige = sm.datasets.get_rdataset("Duncan", "car", cache=True).data
---------------------------------------------------------------------------
URLError                                  Traceback (most recent call last)
<ipython-input-284-d85a43c9ce16> in <module>()
----> 1 prestige = sm.datasets.get_rdataset("Duncan", "car", cache=True).data

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/datasets/utils.pyc in get_rdataset(dataname, package, cache)
    284                      "master/doc/"+package+"/rst/")
    285     cache = _get_cache(cache)
--> 286     data, from_cache = _get_data(data_base_url, dataname, cache)
    287     data = read_csv(data, index_col=0)
    288     data = _maybe_reset_index(data)

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/datasets/utils.pyc in _get_data(base_url, dataname, cache, extension)
    215     url = base_url + (dataname + ".%s") % extension
    216     try:
--> 217         data, from_cache = _urlopen_cached(url, cache)
    218     except HTTPError as err:
    219         if '404' in str(err):

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/datasets/utils.pyc in _urlopen_cached(url, cache)
    206     # not using the cache or didn't find it in cache
    207     if not from_cache:
--> 208         data = urlopen(url).read()
    209         if cache is not None:  # then put it in the cache
    210             _cache_it(data, cache_path)

/usr/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    152     else:
    153         opener = _opener
--> 154     return opener.open(url, data, timeout)
    155 
    156 def install_opener(opener):

/usr/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
    429             req = meth(req)
    430 
--> 431         response = self._open(req, data)
    432 
    433         # post-process response

/usr/lib/python2.7/urllib2.pyc in _open(self, req, data)
    447         protocol = req.get_type()
    448         result = self._call_chain(self.handle_open, protocol, protocol +
--> 449                                   '_open', req)
    450         if result:
    451             return result

/usr/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
    407             func = getattr(handler, meth_name)
    408 
--> 409             result = func(*args)
    410             if result is not None:
    411                 return result

/usr/lib/python2.7/urllib2.pyc in https_open(self, req)
   1238         def https_open(self, req):
   1239             return self.do_open(httplib.HTTPSConnection, req,
-> 1240                 context=self._context)
   1241 
   1242         https_request = AbstractHTTPHandler.do_request_

/usr/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
   1195         except socket.error, err: # XXX what error?
   1196             h.close()
-> 1197             raise URLError(err)
   1198         else:
   1199             try:

URLError: <urlopen error [Errno -2] Name or service not known>
In [37]:
print(prestige.head(10))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-285-d3ced65c396c> in <module>()
----> 1 print(prestige.head(10))

NameError: name 'prestige' is not defined
In [38]:
fig = plt.figure(figsize=(12,12))
ax1 = fig.add_subplot(211, xlabel='Income', ylabel='Prestige')
ax1.scatter(prestige.income, prestige.prestige)
xy_outlier = prestige.ix['minister'][['income','prestige']]
ax1.annotate('Minister', xy_outlier, xy_outlier+1, fontsize=16)
ax2 = fig.add_subplot(212, xlabel='Education',
                           ylabel='Prestige')
ax2.scatter(prestige.education, prestige.prestige);
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-286-ab347cc850d5> in <module>()
      1 fig = plt.figure(figsize=(12,12))
      2 ax1 = fig.add_subplot(211, xlabel='Income', ylabel='Prestige')
----> 3 ax1.scatter(prestige.income, prestige.prestige)
      4 xy_outlier = prestige.ix['minister'][['income','prestige']]
      5 ax1.annotate('Minister', xy_outlier, xy_outlier+1, fontsize=16)

NameError: name 'prestige' is not defined
<matplotlib.figure.Figure at 0x7f00913a8050>
In [39]:
ols_model = ols('prestige ~ income + education', prestige).fit()
print(ols_model.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-287-94340cd202f8> in <module>()
----> 1 ols_model = ols('prestige ~ income + education', prestige).fit()
      2 print(ols_model.summary())

NameError: name 'prestige' is not defined
In [40]:
infl = ols_model.get_influence()
student = infl.summary_frame()['student_resid']
print(student)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-288-762835c5010b> in <module>()
----> 1 infl = ols_model.get_influence()
      2 student = infl.summary_frame()['student_resid']
      3 print(student)

NameError: name 'ols_model' is not defined
In [41]:
print(student.ix[np.abs(student) > 2])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-289-4de70e5d7415> in <module>()
----> 1 print(student.ix[np.abs(student) > 2])

NameError: name 'student' is not defined
In [42]:
print(infl.summary_frame().ix['minister'])
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-290-1b2a9fb1d022> in <module>()
----> 1 print(infl.summary_frame().ix['minister'])

NameError: name 'infl' is not defined
In [43]:
sidak = ols_model.outlier_test('sidak')
sidak.sort('unadj_p', inplace=True)
print(sidak)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-291-88381d4a1ea2> in <module>()
----> 1 sidak = ols_model.outlier_test('sidak')
      2 sidak.sort('unadj_p', inplace=True)
      3 print(sidak)

NameError: name 'ols_model' is not defined
In [44]:
fdr = ols_model.outlier_test('fdr_bh')
fdr.sort('unadj_p', inplace=True)
print(fdr)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-292-69e9cb113222> in <module>()
----> 1 fdr = ols_model.outlier_test('fdr_bh')
      2 fdr.sort('unadj_p', inplace=True)
      3 print(fdr)

NameError: name 'ols_model' is not defined
In [45]:
rlm_model = rlm('prestige ~ income + education', prestige).fit()
print(rlm_model.summary())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-293-f16d19a78b97> in <module>()
----> 1 rlm_model = rlm('prestige ~ income + education', prestige).fit()
      2 print(rlm_model.summary())

NameError: name 'prestige' is not defined
In [46]:
print(rlm_model.weights)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-294-2a0e93853fea> in <module>()
----> 1 print(rlm_model.weights)

NameError: name 'rlm_model' is not defined

Hertzprung Russell data for Star Cluster CYG 0B1 - Leverage Points

  • Data is on the luminosity and temperature of 47 stars in the direction of Cygnus.
In [47]:
dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data
---------------------------------------------------------------------------
URLError                                  Traceback (most recent call last)
<ipython-input-295-4880a5abdb55> in <module>()
----> 1 dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/datasets/utils.pyc in get_rdataset(dataname, package, cache)
    284                      "master/doc/"+package+"/rst/")
    285     cache = _get_cache(cache)
--> 286     data, from_cache = _get_data(data_base_url, dataname, cache)
    287     data = read_csv(data, index_col=0)
    288     data = _maybe_reset_index(data)

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/datasets/utils.pyc in _get_data(base_url, dataname, cache, extension)
    215     url = base_url + (dataname + ".%s") % extension
    216     try:
--> 217         data, from_cache = _urlopen_cached(url, cache)
    218     except HTTPError as err:
    219         if '404' in str(err):

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/datasets/utils.pyc in _urlopen_cached(url, cache)
    206     # not using the cache or didn't find it in cache
    207     if not from_cache:
--> 208         data = urlopen(url).read()
    209         if cache is not None:  # then put it in the cache
    210             _cache_it(data, cache_path)

/usr/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    152     else:
    153         opener = _opener
--> 154     return opener.open(url, data, timeout)
    155 
    156 def install_opener(opener):

/usr/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
    429             req = meth(req)
    430 
--> 431         response = self._open(req, data)
    432 
    433         # post-process response

/usr/lib/python2.7/urllib2.pyc in _open(self, req, data)
    447         protocol = req.get_type()
    448         result = self._call_chain(self.handle_open, protocol, protocol +
--> 449                                   '_open', req)
    450         if result:
    451             return result

/usr/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
    407             func = getattr(handler, meth_name)
    408 
--> 409             result = func(*args)
    410             if result is not None:
    411                 return result

/usr/lib/python2.7/urllib2.pyc in https_open(self, req)
   1238         def https_open(self, req):
   1239             return self.do_open(httplib.HTTPSConnection, req,
-> 1240                 context=self._context)
   1241 
   1242         https_request = AbstractHTTPHandler.do_request_

/usr/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
   1195         except socket.error, err: # XXX what error?
   1196             h.close()
-> 1197             raise URLError(err)
   1198         else:
   1199             try:

URLError: <urlopen error [Errno -2] Name or service not known>
In [48]:
from matplotlib.patches import Ellipse
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='log(Temp)', ylabel='log(Light)', title='Hertzsprung-Russell Diagram of Star Cluster CYG OB1')
ax.scatter(*dta.values.T)
# highlight outliers
e = Ellipse((3.5, 6), .2, 1, alpha=.25, color='r')
ax.add_patch(e);
ax.annotate('Red giants', xy=(3.6, 6), xytext=(3.8, 6),
            arrowprops=dict(facecolor='black', shrink=0.05, width=2),
            horizontalalignment='left', verticalalignment='bottom',
            clip_on=True, # clip to the axes bounding box
            fontsize=16,
     )
# annotate these with their index
for i,row in dta.ix[dta['log.Te'] < 3.8].iterrows():
    ax.annotate(i, row, row + .01, fontsize=14)
xlim, ylim = ax.get_xlim(), ax.get_ylim()
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-296-d061dc5cc6d1> in <module>()
      2 fig = plt.figure(figsize=(12,8))
      3 ax = fig.add_subplot(111, xlabel='log(Temp)', ylabel='log(Light)', title='Hertzsprung-Russell Diagram of Star Cluster CYG OB1')
----> 4 ax.scatter(*dta.values.T)
      5 # highlight outliers
      6 e = Ellipse((3.5, 6), .2, 1, alpha=.25, color='r')

/usr/lib/python2.7/dist-packages/matplotlib/axes/_axes.pyc in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, **kwargs)
   3630             verts = None
   3631 
-> 3632         marker_obj = mmarkers.MarkerStyle(marker)
   3633         path = marker_obj.get_path().transformed(
   3634             marker_obj.get_transform())

/usr/lib/python2.7/dist-packages/matplotlib/markers.pyc in __init__(self, marker, fillstyle)
    167         """
    168         self._fillstyle = fillstyle
--> 169         self.set_marker(marker)
    170         self.set_fillstyle(fillstyle)
    171 

/usr/lib/python2.7/dist-packages/matplotlib/markers.pyc in set_marker(self, marker)
    248 
    249         self._marker = marker
--> 250         self._recache()
    251 
    252     def get_path(self):

/usr/lib/python2.7/dist-packages/matplotlib/markers.pyc in _recache(self)
    189         self._capstyle = 'butt'
    190         self._filled = True
--> 191         self._marker_function()
    192 
    193     if six.PY3:

/usr/lib/python2.7/dist-packages/matplotlib/markers.pyc in _set_vertices(self)
    280     def _set_vertices(self):
    281         verts = self._marker
--> 282         marker = Path(verts)
    283         self._set_custom_marker(marker)
    284 

/usr/lib/python2.7/dist-packages/matplotlib/path.pyc in __init__(self, vertices, codes, _interpolation_steps, closed, readonly)
    149             codes[-1] = self.CLOSEPOLY
    150 
--> 151         assert vertices.ndim == 2
    152         assert vertices.shape[1] == 2
    153 

AssertionError: 
<matplotlib.figure.Figure at 0x7f00913f15d0>
In [49]:
from IPython.display import Image
Image(filename='star_diagram.png')
Out[49]:
In [50]:
y = dta['log.light']
X = sm.add_constant(dta['log.Te'], prepend=True)
ols_model = sm.OLS(y, X).fit()
abline_plot(model_results=ols_model, ax=ax)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-298-6c7349bbf59f> in <module>()
----> 1 y = dta['log.light']
      2 X = sm.add_constant(dta['log.Te'], prepend=True)
      3 ols_model = sm.OLS(y, X).fit()
      4 abline_plot(model_results=ols_model, ax=ax)

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1778             return self._getitem_multilevel(key)
   1779         else:
-> 1780             return self._getitem_column(key)
   1781 
   1782     def _getitem_column(self, key):

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1785         # get column
   1786         if self.columns.is_unique:
-> 1787             return self._get_item_cache(key)
   1788 
   1789         # duplicate columns & possible reduce dimensionaility

/usr/lib/python2.7/dist-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1062         res = cache.get(item)
   1063         if res is None:
-> 1064             values = self._data.get(item)
   1065             res = self._box_item_values(item, values)
   1066             cache[item] = res

/usr/lib/python2.7/dist-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   2847 
   2848             if not isnull(item):
-> 2849                 loc = self.items.get_loc(item)
   2850             else:
   2851                 indexer = np.arange(len(self.items))[isnull(self.items)]

/usr/lib/python2.7/dist-packages/pandas/core/index.pyc in get_loc(self, key)
   1400         loc : int if unique index, possibly slice or mask if not
   1401         """
-> 1402         return self._engine.get_loc(_values_from_object(key))
   1403 
   1404     def get_value(self, series, key):

/usr/lib/python2.7/dist-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3807)()

/usr/lib/python2.7/dist-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3687)()

/usr/lib/python2.7/dist-packages/pandas/hashtable.so in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12310)()

/usr/lib/python2.7/dist-packages/pandas/hashtable.so in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12261)()

KeyError: 'log.light'
In [51]:
rlm_mod = sm.RLM(y, X, sm.robust.norms.TrimmedMean(.5)).fit()
abline_plot(model_results=rlm_mod, ax=ax, color='red')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-299-9210cc7d7045> in <module>()
----> 1 rlm_mod = sm.RLM(y, X, sm.robust.norms.TrimmedMean(.5)).fit()
      2 abline_plot(model_results=rlm_mod, ax=ax, color='red')

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/robust/robust_linear_model.pyc in __init__(self, endog, exog, M, missing, **kwargs)
    115         self.M = M
    116         super(base.LikelihoodModel, self).__init__(endog, exog,
--> 117                 missing=missing, **kwargs)
    118         self._initialize()
    119         #things to remove_data

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/base/model.pyc in __init__(self, endog, exog, **kwargs)
     58         hasconst = kwargs.pop('hasconst', None)
     59         self.data = self._handle_data(endog, exog, missing, hasconst,
---> 60                                       **kwargs)
     61         self.k_constant = self.data.k_constant
     62         self.exog = self.data.exog

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/base/model.pyc in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
     82 
     83     def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
---> 84         data = handle_data(endog, exog, missing, hasconst, **kwargs)
     85         # kwargs arrays could have changed, easier to just attach here
     86         for key in kwargs:

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/base/data.pyc in handle_data(endog, exog, missing, hasconst, **kwargs)
    564     klass = handle_data_class_factory(endog, exog)
    565     return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
--> 566                  **kwargs)

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/base/data.pyc in __init__(self, endog, exog, missing, hasconst, **kwargs)
     74         # this has side-effects, attaches k_constant and const_idx
     75         self._handle_constant(hasconst)
---> 76         self._check_integrity()
     77         self._cache = resettable_cache()
     78 

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/base/data.pyc in _check_integrity(self)
    451                 not self.orig_endog.index.equals(self.orig_exog.index)):
    452             raise ValueError("The indices for endog and exog are not aligned")
--> 453         super(PandasData, self)._check_integrity()
    454 
    455     def _get_row_labels(self, arr):

/build/buildd/statsmodels-0.6.1/debian/python-statsmodels/usr/lib/python2.7/dist-packages/statsmodels/base/data.pyc in _check_integrity(self)
    363         if self.exog is not None:
    364             if len(self.exog) != len(self.endog):
--> 365                 raise ValueError("endog and exog matrices are different sizes")
    366 
    367     def wrap_output(self, obj, how='columns', names=None):

ValueError: endog and exog matrices are different sizes
  • Why? Because M-estimators are not robust to leverage points.
In [52]:
infl = ols_model.get_influence()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-300-5ff696623192> in <module>()
----> 1 infl = ols_model.get_influence()

NameError: name 'ols_model' is not defined
In [53]:
h_bar = 2*(ols_model.df_model + 1 )/ols_model.nobs
hat_diag = infl.summary_frame()['hat_diag']
hat_diag.ix[hat_diag > h_bar]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-301-0fb883e54663> in <module>()
----> 1 h_bar = 2*(ols_model.df_model + 1 )/ols_model.nobs
      2 hat_diag = infl.summary_frame()['hat_diag']
      3 hat_diag.ix[hat_diag > h_bar]

NameError: name 'ols_model' is not defined
In [54]:
sidak2 = ols_model.outlier_test('sidak')
sidak2.sort('unadj_p', inplace=True)
print(sidak2)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-302-a6c575f5e012> in <module>()
----> 1 sidak2 = ols_model.outlier_test('sidak')
      2 sidak2.sort('unadj_p', inplace=True)
      3 print(sidak2)

NameError: name 'ols_model' is not defined
In [55]:
fdr2 = ols_model.outlier_test('fdr_bh')
fdr2.sort('unadj_p', inplace=True)
print(fdr2)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-303-cf26bbbd14f6> in <module>()
----> 1 fdr2 = ols_model.outlier_test('fdr_bh')
      2 fdr2.sort('unadj_p', inplace=True)
      3 print(fdr2)

NameError: name 'ols_model' is not defined
  • Let's delete that line
In [56]:
del ax.lines[-1]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-304-346dae874d03> in <module>()
----> 1 del ax.lines[-1]

IndexError: list assignment index out of range
In [57]:
weights = np.ones(len(X))
weights[X[X['log.Te'] < 3.8].index.values - 1] = 0
wls_model = sm.WLS(y, X, weights=weights).fit()
abline_plot(model_results=wls_model, ax=ax, color='green')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-305-e3ec53a40864> in <module>()
      1 weights = np.ones(len(X))
----> 2 weights[X[X['log.Te'] < 3.8].index.values - 1] = 0
      3 wls_model = sm.WLS(y, X, weights=weights).fit()
      4 abline_plot(model_results=wls_model, ax=ax, color='green')

ValueError: field named log.Te not found
  • MM estimators are good for this type of problem, unfortunately, we don't yet have these yet.
  • It's being worked on, but it gives a good excuse to look at the R cell magics in the notebook.
In [58]:
yy = y.values[:,None]
xx = X['log.Te'].values[:,None]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-306-a5672cb240e0> in <module>()
      1 yy = y.values[:,None]
----> 2 xx = X['log.Te'].values[:,None]

ValueError: field named log.Te not found
In [59]:
%load_ext rmagic

%R library(robustbase)
%Rpush yy xx
%R mod <- lmrob(yy ~ xx);
%R params <- mod$coefficients;
%Rpull params
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-307-53d6967d0401> in <module>()
----> 1 get_ipython().magic(u'load_ext rmagic')
      2 
      3 get_ipython().magic(u'R library(robustbase)')
      4 get_ipython().magic(u'Rpush yy xx')
      5 get_ipython().magic(u'R mod <- lmrob(yy ~ xx);')

/usr/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2203         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2204         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2205         return self.run_line_magic(magic_name, magic_arg_s)
   2206 
   2207     #-------------------------------------------------------------------------

/usr/lib/python2.7/dist-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2124                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2125             with self.builtin_trap:
-> 2126                 result = fn(*args,**kwargs)
   2127             return result
   2128 

/usr/lib/python2.7/dist-packages/IPython/core/magics/extension.pyc in load_ext(self, module_str)

/usr/lib/python2.7/dist-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/usr/lib/python2.7/dist-packages/IPython/core/magics/extension.pyc in load_ext(self, module_str)
     61         if not module_str:
     62             raise UsageError('Missing module name.')
---> 63         res = self.shell.extension_manager.load_extension(module_str)
     64 
     65         if res == 'already loaded':

/usr/lib/python2.7/dist-packages/IPython/core/extensions.pyc in load_extension(self, module_str)
     96             if module_str not in sys.modules:
     97                 with prepended_to_syspath(self.ipython_extension_dir):
---> 98                     __import__(module_str)
     99             mod = sys.modules[module_str]
    100             if self._call_load_ipython_extension(mod):

/usr/lib/python2.7/dist-packages/IPython/extensions/rmagic.py in <module>()
     54 import numpy as np
     55 
---> 56 import rpy2.rinterface as ri
     57 import rpy2.robjects as ro
     58 try:

ImportError: No module named rpy2.rinterface
In [60]:
%R print(mod)
ERROR: Line magic function `%R` not found.

In [61]:
print(params)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-309-73ac4b936803> in <module>()
----> 1 print(params)

NameError: name 'params' is not defined
In [62]:
abline_plot(intercept=params[0], slope=params[1], ax=ax, color='green')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-310-e1a9f35b3320> in <module>()
----> 1 abline_plot(intercept=params[0], slope=params[1], ax=ax, color='green')

NameError: name 'params' is not defined

Exercise: Breakdown points of M-estimator

In [63]:
np.random.seed(12345)
nobs = 200
beta_true = np.array([3, 1, 2.5, 3, -4])
X = np.random.uniform(-20,20, size=(nobs, len(beta_true)-1))
# stack a constant in front
X = sm.add_constant(X, prepend=True) # np.c_[np.ones(nobs), X]
mc_iter = 500
contaminate = .25 # percentage of response variables to contaminate
In [64]:
all_betas = []
for i in range(mc_iter):
    y = np.dot(X, beta_true) + np.random.normal(size=200)
    random_idx = np.random.randint(0, nobs, size=int(contaminate * nobs))
    y[random_idx] = np.random.uniform(-750, 750)
    beta_hat = sm.RLM(y, X).fit().params
    all_betas.append(beta_hat)
In [65]:
all_betas = np.asarray(all_betas)
se_loss = lambda x : np.linalg.norm(x, ord=2)**2
se_beta = lmap(se_loss, all_betas - beta_true)

Squared error loss

In [66]:
np.array(se_beta).mean()
Out[66]:
0.44502948730686215
In [67]:
all_betas.mean(0)
Out[67]:
array([ 2.9971,  0.999 ,  2.4991,  2.9971, -3.9963])
In [68]:
beta_true
Out[68]:
array([ 3. ,  1. ,  2.5,  3. , -4. ])
In [69]:
se_loss(all_betas.mean(0) - beta_true)
Out[69]:
3.2360913286754188e-05

Previous topic

Robust Linear Models

Next topic

Weight Functions

This Page