Error analysis of the global pre-processing workflow

Error analysis of the global pre-processing workflow#

Here we reproduce the error analysis shown in Maussion et al. (2019) for the pre-processing part only, and for the glacier directories (versions 1.6 and 1.4). The error analysis of user runs needs separate handling; see the deal_with_errors notebook for more information.

Get the files#

We download the glacier_statistics files from the preprocessed directories folders, at the level 5. That is, we are going to count all errors that happened during the pre-processing chain.

from oggm import utils
import pandas as pd
import seaborn as sns

We will start with a preprocessed directory for OGGM v1.6 (W5E5 with centerlines):

# W5E5 centerlines
# NOTE(review): the previous URL pointed at a not-yet-published 2025.6
# "per_glacier_spinup" directory, so utils.file_downloader returned None
# and pd.read_csv(None) raised ValueError. Use the released 2023.3
# centerlines directory, consistent with the elevation-bands URL below.
url = 'https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L3-L5_files/2023.3/centerlines/W5E5/RGI62/b_080/L5/summary/'
# this can take some time
df = []
for rgi_reg in range(1, 19):
    fpath = utils.file_downloader(url + f'glacier_statistics_{rgi_reg:02d}.csv')
    if fpath is None:
        # file_downloader returns None on failure - fail early with context
        # instead of a confusing pandas ValueError
        raise RuntimeError(f'Could not download glacier statistics for RGI region {rgi_reg:02d}')
    df.append(pd.read_csv(fpath, index_col=0, low_memory=False))
df = pd.concat(df, sort=False).sort_index()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[3], line 5
      3 for rgi_reg in range(1, 19):
      4     fpath = utils.file_downloader(url + f'glacier_statistics_{rgi_reg:02d}.csv')
----> 5     df.append(pd.read_csv(fpath, index_col=0, low_memory=False))
      6 df = pd.concat(df, sort=False).sort_index()

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/pandas/io/parsers/readers.py:873, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, skip_blank_lines, parse_dates, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    861 kwds_defaults = _refine_defaults_read(
    862     dialect,
    863     delimiter,
   (...)    869     dtype_backend=dtype_backend,
    870 )
    871 kwds.update(kwds_defaults)
--> 873 return _read(filepath_or_buffer, kwds)

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/pandas/io/parsers/readers.py:300, in _read(filepath_or_buffer, kwds)
    297 _validate_names(kwds.get("names", None))
    299 # Create the parser.
--> 300 parser = TextFileReader(filepath_or_buffer, **kwds)
    302 if chunksize or iterator:
    303     return parser

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1645, in TextFileReader.__init__(self, f, engine, **kwds)
   1642     self.options["has_index_names"] = kwds["has_index_names"]
   1644 self.handles: IOHandles | None = None
-> 1645 self._engine = self._make_engine(f, self.engine)

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1904, in TextFileReader._make_engine(self, f, engine)
   1902     if "b" not in mode:
   1903         mode += "b"
-> 1904 self.handles = get_handle(
   1905     f,
   1906     mode,
   1907     encoding=self.options.get("encoding", None),
   1908     compression=self.options.get("compression", None),
   1909     memory_map=self.options.get("memory_map", False),
   1910     is_text=is_text,
   1911     errors=self.options.get("encoding_errors", "strict"),
   1912     storage_options=self.options.get("storage_options", None),
   1913 )
   1914 assert self.handles is not None
   1915 f = self.handles.handle

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/pandas/io/common.py:772, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    769     codecs.lookup_error(errors)
    771 # open URLs
--> 772 ioargs = _get_filepath_or_buffer(
    773     path_or_buf,
    774     encoding=encoding,
    775     compression=compression,
    776     mode=mode,
    777     storage_options=storage_options,
    778 )
    780 handle = ioargs.filepath_or_buffer
    781 handles: list[BaseBuffer]

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/pandas/io/common.py:492, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    488 if not (
    489     hasattr(filepath_or_buffer, "read") or hasattr(filepath_or_buffer, "write")
    490 ):
    491     msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
--> 492     raise ValueError(msg)
    494 return IOArgs(
    495     filepath_or_buffer=filepath_or_buffer,
    496     encoding=encoding,
   (...)    499     mode=mode,
    500 )

ValueError: Invalid file path or buffer object type: <class 'NoneType'>

Analyze the errors#

# Horizontal bar chart counting how often each pre-processing task failed
sns.countplot(data=df, y="error_task");
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 sns.countplot(y="error_task", data=df);

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/seaborn/categorical.py:2631, in countplot(data, x, y, hue, order, hue_order, orient, color, palette, saturation, fill, hue_norm, stat, width, dodge, gap, log_scale, native_scale, formatter, legend, ax, **kwargs)
   2628 elif x is not None and y is not None:
   2629     raise TypeError("Cannot pass values for both `x` and `y`.")
-> 2631 p = _CategoricalAggPlotter(
   2632     data=data,
   2633     variables=dict(x=x, y=y, hue=hue),
   2634     order=order,
   2635     orient=orient,
   2636     color=color,
   2637     legend=legend,
   2638 )
   2640 if ax is None:
   2641     ax = plt.gca()

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/seaborn/categorical.py:67, in _CategoricalPlotter.__init__(self, data, variables, order, orient, require_numeric, color, legend)
     56 def __init__(
     57     self,
     58     data=None,
   (...)     64     legend="auto",
     65 ):
---> 67     super().__init__(data=data, variables=variables)
     69     # This method takes care of some bookkeeping that is necessary because the
     70     # original categorical plots (prior to the 2021 refactor) had some rules that
     71     # don't fit exactly into VectorPlotter logic. It may be wise to have a second
   (...)     76     # default VectorPlotter rules. If we do decide to make orient part of the
     77     # _base variable assignment, we'll want to figure out how to express that.
     78     if self.input_format == "wide" and orient in ["h", "y"]:

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/seaborn/_base.py:634, in VectorPlotter.__init__(self, data, variables)
    629 # var_ordered is relevant only for categorical axis variables, and may
    630 # be better handled by an internal axis information object that tracks
    631 # such information and is set up by the scale_* methods. The analogous
    632 # information for numeric axes would be information about log scales.
    633 self._var_ordered = {"x": False, "y": False}  # alt., used DefaultDict
--> 634 self.assign_variables(data, variables)
    636 # TODO Lots of tests assume that these are called to initialize the
    637 # mappings to default values on class initialization. I'd prefer to
    638 # move away from that and only have a mapping when explicitly called.
    639 for var in ["hue", "size", "style"]:

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/seaborn/_base.py:679, in VectorPlotter.assign_variables(self, data, variables)
    674 else:
    675     # When dealing with long-form input, use the newer PlotData
    676     # object (internal but introduced for the objects interface)
    677     # to centralize / standardize data consumption logic.
    678     self.input_format = "long"
--> 679     plot_data = PlotData(data, variables)
    680     frame = plot_data.frame
    681     names = plot_data.names

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/seaborn/_core/data.py:57, in PlotData.__init__(self, data, variables)
     51 def __init__(
     52     self,
     53     data: DataSource,
     54     variables: dict[str, VariableSpec],
     55 ):
---> 57     data = handle_data_source(data)
     58     frame, names, ids = self._assign_variables(data, variables)
     60     self.frame = frame

File /usr/local/pyenv/versions/3.13.12/lib/python3.13/site-packages/seaborn/_core/data.py:278, in handle_data_source(data)
    276 elif data is not None and not isinstance(data, Mapping):
    277     err = f"Data source must be a DataFrame or Mapping, not {type(data)!r}."
--> 278     raise TypeError(err)
    280 return data

TypeError: Data source must be a DataFrame or Mapping, not <class 'list'>.
# Glacier area affected by any error, relative to the total RGI area
failed_area = df.loc[df['error_task'].notnull()].rgi_area_km2.sum()
"% area errors all sources: {:.2f}%".format(failed_area / df.rgi_area_km2.sum() * 100)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[5], line 1
----> 1 "% area errors all sources: {:.2f}%".format(df.loc[~df['error_task'].isnull()].rgi_area_km2.sum() / df.rgi_area_km2.sum() * 100)

AttributeError: 'list' object has no attribute 'loc'
# Number of failing glaciers relative to the total glacier count
n_failed = df.loc[df['error_task'].notnull()].rgi_area_km2.count()
"% failing glaciers all sources: {:.2f}%".format(n_failed / df.rgi_area_km2.count() * 100)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[6], line 1
----> 1 "% failing glaciers all sources: {:.2f}%".format(df.loc[~df['error_task'].isnull()].rgi_area_km2.count() / df.rgi_area_km2.count() * 100)

AttributeError: 'list' object has no attribute 'loc'

We now look at the errors that occur already before applying the climate tasks and the historical run (i.e., before mb_calibration_from_scalar_mb and flowline_model_run_historical):

# Keep only glaciers whose error occurred before the climate-related tasks
climate_tasks = ['mb_calibration_from_scalar_mb', 'flowline_model_run_historical']
dfe = df.loc[df['error_task'].notnull()]
dfe = dfe.loc[~dfe['error_task'].isin(climate_tasks)]
"% area errors before climate: {:.2f}%".format(dfe.rgi_area_km2.sum() / df.rgi_area_km2.sum() * 100)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[7], line 1
----> 1 dfe = df.loc[~df['error_task'].isnull()]
      2 dfe = dfe.loc[~dfe['error_task'].isin(['mb_calibration_from_scalar_mb','flowline_model_run_historical'])]
      3 "% area errors before climate: {:.2f}%".format(dfe.rgi_area_km2.sum() / df.rgi_area_km2.sum() * 100)

AttributeError: 'list' object has no attribute 'loc'
# Count of pre-climate failing glaciers relative to all glaciers
n_pre_climate = dfe.loc[dfe['error_task'].notnull()].rgi_area_km2.count()
"% failing glaciers all sources: {:.2f}%".format(n_pre_climate / df.rgi_area_km2.count() * 100)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[8], line 1
----> 1 "% failing glaciers all sources: {:.2f}%".format(dfe.loc[~dfe['error_task'].isnull()].rgi_area_km2.count() / df.rgi_area_km2.count() * 100)

NameError: name 'dfe' is not defined

Although there are already many glaciers failing before the climate tasks, from a relative missing glacier area perspective, much less of the failing glacier area occurs. The reason is that the largest failing glaciers have mostly flowline_model_run_historical errors that only occur in the preprocessed directories level 4 or higher (after the climate tasks):

# 15 largest glaciers
# 15 largest glaciers
cols = ['rgi_area_km2', 'error_task', 'error_msg']
failed = df.loc[df['error_task'].notnull(), cols]
failed.sort_values(by='rgi_area_km2', ascending=False).head(15)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[9], line 2
      1 # 15 largest glaciers
----> 2 df.loc[~df['error_task'].isnull()].sort_values(by='rgi_area_km2',
      3                                                ascending=False)[['rgi_area_km2', 'error_task',
      4                                                                  'error_msg']].iloc[:15]

AttributeError: 'list' object has no attribute 'loc'

Example error comparison#

  • centerlines vs elevation bands:

# W5E5 elevation bands
url = 'https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L3-L5_files/2023.3/elev_bands/W5E5/RGI62/b_080/L5/summary/'
# this can take some time
df_elev = []
# RGI region 19 (Antarctic glaciers) is skipped so the comparison matches
# the other runs in this notebook (the CRU-based v1.4 directories used
# below have no climate data there)
for rgi_reg in range(1, 19):
    fpath = utils.file_downloader(url + f'glacier_statistics_{rgi_reg:02d}.csv')
    df_elev.append(pd.read_csv(fpath, index_col=0, low_memory=False))
df_elev = pd.concat(df_elev, sort=False).sort_index()
# relative failing glacier area (in %) for both flowline types
rel_area_elev = df_elev.loc[~df_elev['error_task'].isnull()].rgi_area_km2.sum() / df_elev.rgi_area_km2.sum() * 100
rel_area_cent = df.loc[~df['error_task'].isnull()].rgi_area_km2.sum() / df.rgi_area_km2.sum() * 100
print(f'% area errors from all sources for elevation band flowlines is {rel_area_elev:.2f}%'+'\n'
      f'compared to {rel_area_cent:.2f}% for centerlines with W5E5')
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[11], line 2
      1 rel_area_elev = df_elev.loc[~df_elev['error_task'].isnull()].rgi_area_km2.sum() / df_elev.rgi_area_km2.sum() * 100
----> 2 rel_area_cent = df.loc[~df['error_task'].isnull()].rgi_area_km2.sum() / df.rgi_area_km2.sum() * 100
      3 print(f'% area errors from all sources for elevation band flowlines is {rel_area_elev:.2f}%'+'\n'
      4       f'compared to {rel_area_cent:.2f}% for centerlines with W5E5')                                                         

AttributeError: 'list' object has no attribute 'loc'

far fewer errors occur when using elevation band flowlines than when using centerlines!

-> Reason: fewer glacier_mask errors!

# you can check out the different error messages with that,
# but we only output the first 20 here
unique_msgs = df_elev['error_msg'].dropna().unique()
unique_msgs[:20]
array(['ValueError: Trapezoid beds need to have origin widths > 0.',
       'ValueError: operands could not be broadcast together with shapes (4,) (5,) ',
       'AssertionError: ',
       'ValueError: operands could not be broadcast together with shapes (3,) (5,) ',
       'RuntimeError: RGI60-01.24246: we tried very hard but we could not find a combination of parameters that works for this ref mb.',
       'RuntimeError: RGI60-01.24248: we tried very hard but we could not find a combination of parameters that works for this ref mb.',
       'RuntimeError: RGI60-01.24600: we tried very hard but we could not find a combination of parameters that works for this ref mb.',
       'RuntimeError: RGI60-01.24602: we tried very hard but we could not find a combination of parameters that works for this ref mb.',
       'RuntimeError: RGI60-01.24603: we tried very hard but we could not find a combination of parameters that works for this ref mb.',
       'RuntimeError: RGI60-01.26517: we tried very hard but we could not find a combination of parameters that works for this ref mb.',
       'RuntimeError: RGI60-01.26971: we tried very hard but we could not find a combination of parameters that works for this ref mb.',
       'RuntimeError: Glacier exceeds domain boundaries, at year: 2007.4166666666667',
       'InvalidDEMError: (RGI60-02.13793) DEM altidude range too small.',
       'RuntimeError: Glacier exceeds domain boundaries, at year: 1983.75',
       'RuntimeError: Glacier exceeds domain boundaries, at year: 1984.0833333333333',
       'RuntimeError: Glacier exceeds domain boundaries, at year: 1983.25',
       'RuntimeError: Glacier exceeds domain boundaries, at year: 1997.8333333333333',
       'RuntimeError: Glacier exceeds domain boundaries, at year: 1985.25',
       'ValueError: operands could not be broadcast together with shapes (2,) (5,) ',
       'RuntimeError: Glacier exceeds domain boundaries, at year: 1973.3333333333333'],
      dtype=object)
  • Compare different climate datasets

    • this works only when using OGGM v1.4 urls (comparing CRU vs ERA5)

    • in OGGM v1.6 (state: March 2023), only GSWP3_W5E5 exists

# attention here OGGM_v1.4 is used and this is just for demonstration purposes how to compare
# different preprocessed directories!
# This is CRU + centerlines. But you can try CRU+elev_bands, or ERA5+elev_bands, etc!

def read_glacier_statistics(base_url, regions=range(1, 19)):
    """Download and concatenate per-region glacier_statistics files.

    Parameters
    ----------
    base_url : str
        URL of a preprocessed-directory ``summary/`` folder.
    regions : iterable of int
        RGI region numbers to fetch. Default is 1-18: RGI19 (Antarctic
        glaciers) is skipped because no CRU climate data is available there.

    Returns
    -------
    pandas.DataFrame
        All regional statistics concatenated and sorted by RGI id.
    """
    dfs = []
    for rgi_reg in regions:
        fpath = utils.file_downloader(base_url + f'glacier_statistics_{rgi_reg:02d}.csv')
        if fpath is None:
            # file_downloader returns None when the download fails
            raise RuntimeError(f'Could not download region {rgi_reg:02d} from {base_url}')
        dfs.append(pd.read_csv(fpath, index_col=0, low_memory=False))
    return pd.concat(dfs, sort=False).sort_index()

# this can take some time
url = 'https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.4/L3-L5_files/CRU/centerlines/qc3/pcp2.5/no_match/RGI62/b_040/L5/summary/'
df_cru_v14 = read_glacier_statistics(url)

# ERA5 uses a different precipitation factor in OGGM_v1.4
url = 'https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.4/L3-L5_files/ERA5/centerlines/qc3/pcp1.6/no_match/RGI62/b_040/L5/summary/'
df_era5_v14 = read_glacier_statistics(url)

# relative failing glacier area (in %) per climate dataset
rel_area_cent_era5_v14 = df_era5_v14.loc[~df_era5_v14['error_task'].isnull()].rgi_area_km2.sum() / df_era5_v14.rgi_area_km2.sum() * 100
rel_area_cent_cru_v14 = df_cru_v14.loc[~df_cru_v14['error_task'].isnull()].rgi_area_km2.sum() / df_cru_v14.rgi_area_km2.sum() * 100
print(f"% area errors all sources for ERA5 is {rel_area_cent_era5_v14:.2f}% compared to {rel_area_cent_cru_v14:.2f}% for CRU")
                                                                        
% area errors all sources for ERA5 is 0.92% compared to 3.24% for CRU

the climate tasks produce errors on roughly a third as much glacier area when using ERA5 as when using CRU!

  • Compare between OGGM versions (and climate datasets)

# Summary of relative failing area across OGGM versions / climate datasets
# (adjacent string literals are concatenated implicitly - no '+' needed)
print('% area errors from all sources for centerlines is: \n'
      f'{rel_area_cent_cru_v14:.2f}% for CRU and OGGM_v14 \n'
      f'{rel_area_cent_era5_v14:.2f}% for ERA5 and OGGM_v14 \n'
      f'{rel_area_cent:.2f}% for W5E5 and OGGM_v16')
                                                                              
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[16], line 4
      1 print('% area errors from all sources for centerlines is: \n'+
      2       f'{rel_area_cent_cru_v14:.2f}% for CRU and OGGM_v14 \n'+
      3       f'{rel_area_cent_era5_v14:.2f}% for ERA5 and OGGM_v14 \n'+
----> 4       f'{rel_area_cent:.2f}% for W5E5 and OGGM_v16')

NameError: name 'rel_area_cent' is not defined

Great — the most recent preprocessed directories produce the smallest amount of failing glacier area.

This is either a result of the different applied climate, the MB calibration, or other changes in OGGM_v16. This could be checked in more detail by looking into which tasks fail less often.

What’s next?#