Missing Data

All of the models can handle missing data. For performance reasons, the default is not to do any checking for missing data. If, however, you would like missing data to be handled internally, you can do so by using the missing keyword argument. The default, missing='none', does nothing.

In [1]: import statsmodels.api as sm

In [2]: data = sm.datasets.longley.load(as_pandas=False)

In [3]: data.exog = sm.add_constant(data.exog)

# add in some missing data (this assumes numpy has already been imported as np)
In [4]: missing_idx = np.array([False] * len(data.endog))

In [5]: missing_idx[[4, 10, 15]] = True

In [6]: data.endog[missing_idx] = np.nan

In [7]: ols_model = sm.OLS(data.endog, data.exog)

In [8]: ols_fit = ols_model.fit()

In [9]: print(ols_fit.params)
[nan nan nan nan nan nan nan]

This silently fails and all of the model parameters are NaN, which is probably not what you expected. If you are not sure whether or not you have missing data, you can use missing='raise'. This will raise a MissingDataError during model instantiation if missing data is present, so that you know something is wrong in your input data.

In [10]: ols_model = sm.OLS(data.endog, data.exog, missing='raise')
---------------------------------------------------------------------------
MissingDataError                          Traceback (most recent call last)
<ipython-input-10-5debd60362bf> in <module>
----> 1 ols_model = sm.OLS(data.endog, data.exog, missing='raise')

/usr/lib/python3/dist-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
    835     def __init__(self, endog, exog=None, missing='none', hasconst=None,
    836                  **kwargs):
--> 837         super(OLS, self).__init__(endog, exog, missing=missing,
    838                                   hasconst=hasconst, **kwargs)
    839         if "weights" in self._init_keys:

/usr/lib/python3/dist-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
    681         else:
    682             weights = weights.squeeze()
--> 683         super(WLS, self).__init__(endog, exog, missing=missing,
    684                                   weights=weights, hasconst=hasconst, **kwargs)
    685         nobs = self.exog.shape[0]

/usr/lib/python3/dist-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, **kwargs)
    194     """
    195     def __init__(self, endog, exog, **kwargs):
--> 196         super(RegressionModel, self).__init__(endog, exog, **kwargs)
    197         self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
    198 

/usr/lib/python3/dist-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
    214 
    215     def __init__(self, endog, exog=None, **kwargs):
--> 216         super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
    217         self.initialize()
    218 

/usr/lib/python3/dist-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
     65         missing = kwargs.pop('missing', 'none')
     66         hasconst = kwargs.pop('hasconst', None)
---> 67         self.data = self._handle_data(endog, exog, missing, hasconst,
     68                                       **kwargs)
     69         self.k_constant = self.data.k_constant

/usr/lib/python3/dist-packages/statsmodels/base/model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
     89 
     90     def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
---> 91         data = handle_data(endog, exog, missing, hasconst, **kwargs)
     92         # kwargs arrays could have changed, easier to just attach here
     93         for key in kwargs:

/usr/lib/python3/dist-packages/statsmodels/base/data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
    632 
    633     klass = handle_data_class_factory(endog, exog)
--> 634     return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
    635                  **kwargs)

/usr/lib/python3/dist-packages/statsmodels/base/data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
     62             self.formula = kwargs.pop('formula')
     63         if missing != 'none':
---> 64             arrays, nan_idx = self.handle_missing(endog, exog, missing,
     65                                                   **kwargs)
     66             self.missing_row_idx = nan_idx

/usr/lib/python3/dist-packages/statsmodels/base/data.py in handle_missing(cls, endog, exog, missing, **kwargs)
    275 
    276         elif missing == 'raise':
--> 277             raise MissingDataError("NaNs were encountered in the data")
    278 
    279         elif missing == 'drop':

MissingDataError: NaNs were encountered in the data

If you want statsmodels to handle the missing data by dropping the observations, use missing='drop'.

In [11]: ols_model = sm.OLS(data.endog, data.exog, missing='drop')

We are considering adding a configuration framework so that you can set the option with a global setting.