Toy weather data

Here is an example of how to easily manipulate a toy weather dataset using xarray and other recommended Python libraries:

Shared setup:

import xarray as xr
import numpy as np
import pandas as pd
import seaborn as sns # pandas aware plotting library

# Seed NumPy's global RNG so the random noise drawn below is repeatable.
np.random.seed(123)

# Timestamps from 2000-01-01 through 2001-12-31 (pandas' default daily
# frequency); the index is named 'time', matching the Dataset dimension.
times = pd.date_range(start='2000-01-01', end='2001-12-31', name='time')

# Sine wave with a 365.25-day period, shifted by 0.28 of a period.
annual_cycle = np.sin((times.dayofyear.values / 365.25 - 0.28) * (2 * np.pi))

# Column vector (one row per timestamp) so it broadcasts across the three
# location columns of the noise arrays below.
base = 15 * annual_cycle[:, np.newaxis] + 10

# NOTE: tmin noise is drawn before tmax noise; keep this order, since both
# draws consume the same seeded random stream.
tmin_values = base + np.random.randn(len(times), 3) * 3
tmax_values = (base + 10) + np.random.randn(len(times), 3) * 3

# Two (time, location) variables sharing the same coordinates.
ds = xr.Dataset(
    data_vars={
        'tmin': (('time', 'location'), tmin_values),
        'tmax': (('time', 'location'), tmax_values),
    },
    coords={'time': times, 'location': ['IA', 'IN', 'IL']},
)

Examine a dataset with pandas and seaborn

In [1]: ds
Out[1]: 
<xarray.Dataset>
Dimensions:  (x: 2, y: 3)
Coordinates:
  * x        (x) <U1 'a' 'b'
Dimensions without coordinates: y
Data variables:
    foo      (x, y) float64 -1.295 0.4137 0.2767 -0.472 -0.01396 -0.3625
    bar      (x) int64 1 2
    baz      float64 3.142

In [2]: df = ds.to_dataframe()

In [3]: df.head()
Out[3]: 
          foo  bar       baz
x y                         
a 0 -1.294524    1  3.141593
  1  0.413738    1  3.141593
  2  0.276662    1  3.141593
b 0 -0.472035    2  3.141593
  1 -0.013960    2  3.141593

In [4]: df.describe()
Out[4]: 
            foo       bar       baz
count  6.000000  6.000000  6.000000
mean  -0.242110  1.500000  3.141593
std    0.620686  0.547723  0.000000
min   -1.294524  1.000000  3.141593
25%   -0.444662  1.000000  3.141593
50%   -0.188251  1.500000  3.141593
75%    0.204006  2.000000  3.141593
max    0.413738  2.000000  3.141593

In [5]: ds.mean(dim='location').to_dataframe().plot()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-5-5f2e7c6e2f84> in <module>()
----> 1 ds.mean(dim='location').to_dataframe().plot()

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/common.py in wrapped_func(self, dim, keep_attrs, skipna, **kwargs)
     49                 return self.reduce(func, dim, keep_attrs, skipna=skipna,
     50                                    numeric_only=numeric_only, allow_lazy=True,
---> 51                                    **kwargs)
     52         else:
     53             def wrapped_func(self, dim=None, keep_attrs=False, **kwargs):

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in reduce(self, func, dim, keep_attrs, numeric_only, allow_lazy, **kwargs)
   2199         if missing_dimensions:
   2200             raise ValueError('Dataset does not contain the dimensions: %s'
-> 2201                              % missing_dimensions)
   2202 
   2203         variables = OrderedDict()

ValueError: Dataset does not contain the dimensions: ['location']
../_images/examples_tmin_tmax_plot.png
In [6]: sns.pairplot(df.reset_index(), vars=ds.data_vars)
Out[6]: <seaborn.axisgrid.PairGrid at 0x7f8b2e5cf940>
../_images/examples_pairplot.png

Probability of freeze by calendar month

In [7]: freeze = (ds['tmin'] <= 0).groupby('time.month').mean('time')
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _construct_dataarray(self, name)
    661         try:
--> 662             variable = self._variables[name]
    663         except KeyError:

KeyError: 'tmin'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-7-aacd97af80ae> in <module>()
----> 1 freeze = (ds['tmin'] <= 0).groupby('time.month').mean('time')

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in __getitem__(self, key)
    719 
    720         if hashable(key):
--> 721             return self._construct_dataarray(key)
    722         else:
    723             return self._copy_listed(np.asarray(key))

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _construct_dataarray(self, name)
    663         except KeyError:
    664             _, name, variable = _get_virtual_variable(
--> 665                 self._variables, name, self._level_coords, self.dims)
    666 
    667         coords = OrderedDict()

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _get_virtual_variable(variables, key, level_vars, dim_sizes)
     72         ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
     73     else:
---> 74         ref_var = variables[ref_name]
     75 
     76     if var_name is None:

KeyError: 'tmin'

In [8]: freeze
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-bcec125f401e> in <module>()
----> 1 freeze

NameError: name 'freeze' is not defined

In [9]: freeze.to_pandas().plot()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-65f2b612c218> in <module>()
----> 1 freeze.to_pandas().plot()

NameError: name 'freeze' is not defined
../_images/examples_freeze_prob.png

Monthly averaging

In [10]: monthly_avg = ds.resample('1MS', dim='time', how='mean')
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _construct_dataarray(self, name)
    661         try:
--> 662             variable = self._variables[name]
    663         except KeyError:

KeyError: 'time'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-10-fed435c279c5> in <module>()
----> 1 monthly_avg = ds.resample('1MS', dim='time', how='mean')

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/common.py in resample(self, freq, dim, how, skipna, closed, label, base, keep_attrs)
    542         RESAMPLE_DIM = '__resample_dim__'
    543         if isinstance(dim, basestring):
--> 544             dim = self[dim]
    545         group = DataArray(dim, [(RESAMPLE_DIM, dim)], name=RESAMPLE_DIM)
    546         time_grouper = pd.TimeGrouper(freq=freq, how=how, closed=closed,

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in __getitem__(self, key)
    719 
    720         if hashable(key):
--> 721             return self._construct_dataarray(key)
    722         else:
    723             return self._copy_listed(np.asarray(key))

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _construct_dataarray(self, name)
    663         except KeyError:
    664             _, name, variable = _get_virtual_variable(
--> 665                 self._variables, name, self._level_coords, self.dims)
    666 
    667         coords = OrderedDict()

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _get_virtual_variable(variables, key, level_vars, dim_sizes)
     72         ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
     73     else:
---> 74         ref_var = variables[ref_name]
     75 
     76     if var_name is None:

KeyError: 'time'

In [11]: monthly_avg.sel(location='IA').to_dataframe().plot(style='s-')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-1658745544fb> in <module>()
----> 1 monthly_avg.sel(location='IA').to_dataframe().plot(style='s-')

NameError: name 'monthly_avg' is not defined
../_images/examples_tmin_tmax_plot_mean.png

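Note that MS in the frequency string '1MS' stands for Month-Start, so each monthly value is labelled with the first day of its month; M would instead label it with Month-End (the last day of the month).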

Calculate monthly anomalies

In climatology, “anomalies” refer to the difference between observations and typical weather for a particular season. Unlike observations, anomalies should not show any seasonal cycle.

In [12]: climatology = ds.groupby('time.month').mean('time')
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _construct_dataarray(self, name)
    661         try:
--> 662             variable = self._variables[name]
    663         except KeyError:

KeyError: 'time.month'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-12-c15de2da866e> in <module>()
----> 1 climatology = ds.groupby('time.month').mean('time')

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/common.py in groupby(self, group, squeeze)
    394             iterated over in the form of `(unique_value, grouped_array)` pairs.
    395         """
--> 396         return self._groupby_cls(self, group, squeeze=squeeze)
    397 
    398     def groupby_bins(self, group, bins, right=True, labels=None, precision=3,

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/groupby.py in __init__(self, obj, group, squeeze, grouper, bins, cut_kwargs)
    212                 raise TypeError('`group` must be an xarray.DataArray or the '
    213                                 'name of an xarray variable or dimension')
--> 214             group = obj[group]
    215             if group.name not in obj and group.name in obj.dims:
    216                 # DummyGroups should not appear on groupby results

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in __getitem__(self, key)
    719 
    720         if hashable(key):
--> 721             return self._construct_dataarray(key)
    722         else:
    723             return self._copy_listed(np.asarray(key))

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _construct_dataarray(self, name)
    663         except KeyError:
    664             _, name, variable = _get_virtual_variable(
--> 665                 self._variables, name, self._level_coords, self.dims)
    666 
    667         coords = OrderedDict()

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _get_virtual_variable(variables, key, level_vars, dim_sizes)
     72         ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
     73     else:
---> 74         ref_var = variables[ref_name]
     75 
     76     if var_name is None:

KeyError: 'time'

In [13]: anomalies = ds.groupby('time.month') - climatology
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _construct_dataarray(self, name)
    661         try:
--> 662             variable = self._variables[name]
    663         except KeyError:

KeyError: 'time.month'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-13-3814c8e8af73> in <module>()
----> 1 anomalies = ds.groupby('time.month') - climatology

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/common.py in groupby(self, group, squeeze)
    394             iterated over in the form of `(unique_value, grouped_array)` pairs.
    395         """
--> 396         return self._groupby_cls(self, group, squeeze=squeeze)
    397 
    398     def groupby_bins(self, group, bins, right=True, labels=None, precision=3,

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/groupby.py in __init__(self, obj, group, squeeze, grouper, bins, cut_kwargs)
    212                 raise TypeError('`group` must be an xarray.DataArray or the '
    213                                 'name of an xarray variable or dimension')
--> 214             group = obj[group]
    215             if group.name not in obj and group.name in obj.dims:
    216                 # DummyGroups should not appear on groupby results

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in __getitem__(self, key)
    719 
    720         if hashable(key):
--> 721             return self._construct_dataarray(key)
    722         else:
    723             return self._copy_listed(np.asarray(key))

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _construct_dataarray(self, name)
    663         except KeyError:
    664             _, name, variable = _get_virtual_variable(
--> 665                 self._variables, name, self._level_coords, self.dims)
    666 
    667         coords = OrderedDict()

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/dataset.py in _get_virtual_variable(variables, key, level_vars, dim_sizes)
     72         ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
     73     else:
---> 74         ref_var = variables[ref_name]
     75 
     76     if var_name is None:

KeyError: 'time'

In [14]: anomalies.mean('location').to_dataframe()[['tmin', 'tmax']].plot()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-e06f89d6ac61> in <module>()
----> 1 anomalies.mean('location').to_dataframe()[['tmin', 'tmax']].plot()

NameError: name 'anomalies' is not defined
../_images/examples_anomalies_plot.png

Fill missing values with climatology

The fillna() method on grouped objects lets you easily fill missing values by group:

# throw away the first half of every month
In [15]: some_missing = ds.tmin.sel(time=ds['time.day'] > 15).reindex_like(ds)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-15-49c5a11b9808> in <module>()
----> 1 some_missing = ds.tmin.sel(time=ds['time.day'] > 15).reindex_like(ds)

/build/python-xarray-Gnyvsx/python-xarray-0.9.6/xarray/core/common.py in __getattr__(self, name)
    166                     return source[name]
    167         raise AttributeError("%r object has no attribute %r" %
--> 168                              (type(self).__name__, name))
    169 
    170     def __setattr__(self, name, value):

AttributeError: 'Dataset' object has no attribute 'tmin'

In [16]: filled = some_missing.groupby('time.month').fillna(climatology.tmin)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-15f81b03c1db> in <module>()
----> 1 filled = some_missing.groupby('time.month').fillna(climatology.tmin)

NameError: name 'some_missing' is not defined

In [17]: both = xr.Dataset({'some_missing': some_missing, 'filled': filled})
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-17-db0ecb13c81d> in <module>()
----> 1 both = xr.Dataset({'some_missing': some_missing, 'filled': filled})

NameError: name 'some_missing' is not defined

In [18]: both
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-452f74295bc1> in <module>()
----> 1 both

NameError: name 'both' is not defined

In [19]: df = both.sel(time='2000').mean('location').reset_coords(drop=True).to_dataframe()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-046f43b5cd48> in <module>()
----> 1 df = both.sel(time='2000').mean('location').reset_coords(drop=True).to_dataframe()

NameError: name 'both' is not defined

In [20]: df[['filled', 'some_missing']].plot()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-20-7711fc30bd07> in <module>()
----> 1 df[['filled', 'some_missing']].plot()

/usr/lib/python3/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   2051         if isinstance(key, (Series, np.ndarray, Index, list)):
   2052             # either boolean or fancy integer index
-> 2053             return self._getitem_array(key)
   2054         elif isinstance(key, DataFrame):
   2055             return self._getitem_frame(key)

/usr/lib/python3/dist-packages/pandas/core/frame.py in _getitem_array(self, key)
   2095             return self.take(indexer, axis=0, convert=False)
   2096         else:
-> 2097             indexer = self.ix._convert_to_indexer(key, axis=1)
   2098             return self.take(indexer, axis=1, convert=True)
   2099 

/usr/lib/python3/dist-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
   1228                 mask = check == -1
   1229                 if mask.any():
-> 1230                     raise KeyError('%s not in index' % objarr[mask])
   1231 
   1232                 return _values_from_object(indexer)

KeyError: "['filled' 'some_missing'] not in index"
../_images/examples_filled.png