From 3e47563ee582a42d7e048b0a02e9fd09df7fd629 Mon Sep 17 00:00:00 2001
From: Oliver Lemke
Date: Wed, 20 Nov 2024 12:56:24 +0100
Subject: [PATCH 1/4] Avoid problem in true divide

---
 typhon/geographical.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/typhon/geographical.py b/typhon/geographical.py
index ab523d89..9a74153b 100644
--- a/typhon/geographical.py
+++ b/typhon/geographical.py
@@ -324,7 +324,11 @@ def gridded_mean(lat, lon, data, grid):
     grid_sum, _, _ = np.histogram2d(lat, lon, grid, weights=data)
     grid_number, _, _ = np.histogram2d(lat, lon, grid)
 
-    return grid_sum / grid_number, grid_number
+    grid_number_nozeros = np.where(grid_number == 0, -9999, grid_number)
+    gridded_mean = np.where(
+        grid_number_nozeros == -9999, 0, grid_sum / grid_number_nozeros
+    )
+    return gridded_mean, grid_number
 
 
 def sea_mask(lat, lon, mask):
@@ -387,5 +391,3 @@ def _to_array(item):
     lon_cell = lon / mask_lon_step
 
     return mask[lat_cell.astype(int), lon_cell.astype(int)]
-
-
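For reference, the divide guard introduced in gridded_mean above can be exercised on its own. The sample points and the 10-degree grid below are made up; only the np.histogram2d / np.where pattern mirrors the patch:

    import numpy as np

    # Made-up sample points binned onto a coarse 10-degree grid:
    lat = np.array([10.2, 10.8, 40.1])
    lon = np.array([5.1, 5.9, 120.3])
    data = np.array([1.0, 3.0, 7.0])
    grid = [np.arange(0, 91, 10), np.arange(0, 181, 10)]

    grid_sum, _, _ = np.histogram2d(lat, lon, grid, weights=data)
    grid_number, _, _ = np.histogram2d(lat, lon, grid)

    # Swap zero counts for a dummy denominator so 0/0 is never evaluated,
    # then zero out exactly those empty cells in the result:
    safe_number = np.where(grid_number == 0, -9999, grid_number)
    mean = np.where(safe_number == -9999, 0, grid_sum / safe_number)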
From 649c9cd79de6142dacfeba68073f0e5e79f5b9f1 Mon Sep 17 00:00:00 2001
From: Oliver Lemke
Date: Wed, 20 Nov 2024 13:33:16 +0100
Subject: [PATCH 2/4] Lorena's collocation and spareice updates

These are the improvements and fixes made by Lorena during her Master's
thesis.
---
 typhon/collocations/collocator.py   | 120 +++++++++++++++++-----
 typhon/collocations/common.py       |  11 ++-
 typhon/files/fileset.py             |  42 +++++++---
 typhon/files/handlers/cloudsat.py   |  31 ++++---
 typhon/files/handlers/common.py     |  11 +++
 typhon/files/handlers/tovs.py       |  12 ++-
 typhon/retrieval/common.py          |   1 +
 typhon/retrieval/spareice/common.py | 115 ++++++++++++++++++++------
 8 files changed, 242 insertions(+), 101 deletions(-)

diff --git a/typhon/collocations/collocator.py b/typhon/collocations/collocator.py
index de9c2c2b..ae443340 100644
--- a/typhon/collocations/collocator.py
+++ b/typhon/collocations/collocator.py
@@ -192,11 +192,14 @@ def collocate_filesets(
 
         self._info(f"Collocate from {start} to {end}")
 
-        # Find the files from both filesets which overlap tempoerally.
+        # Find the files from both filesets which overlap temporally.
         matches = list(filesets[0].match(
             filesets[1], start=start, end=end, max_interval=max_interval,
+            skip_file_errors=skip_file_errors
         ))
-
+        if len(matches) == 0:
+            return
+
 
         if processes is None:
             processes = 1
@@ -540,59 +543,73 @@ def _collocate_matches(
             skip_errors=skip_file_errors,
         )
 
+        for iprimary in range(len(matches)):
+            nfound_secondaries=len(matches[iprimary][1])
+            got_matches=[next(loaded_matches) for isecondary in range(nfound_secondaries)]
 
-        for loaded_match in loaded_matches:
-            # The FileInfo objects of the matched files:
-            files = loaded_match[0][0], loaded_match[1][0]
+            # Get FileInfos
+            primary_infos=got_matches[0][0][0]
+            secondaries_infos = [nmatch[1][0] for nmatch in got_matches]
+            files = primary_infos, secondaries_infos
+            # Get datasets
+            primary = got_matches[0][0][1].copy()
+            secondaries = [nmatch[1][1].copy() for nmatch in got_matches]
 
-            # We copy the data from the matches since they might be used for
-            # other matches as well:
-            primary, secondary = \
-                loaded_match[0][1].copy(), loaded_match[1][1].copy()
+            # Combine secondary files
+            secondary = xr.concat(secondaries, dim=secondaries[0].time.dims[0])
 
-            self._debug(f"Collocate {files[0].path}\nwith {files[1].path}")
+            # Update dimension numbering
+            secondary[secondary.time.dims[0]] = np.arange(len(secondary.time))
+            self._debug(f"Collocate {files[0].path}\nwith {[file.path for file in files[1]]}")
+
             collocations = self.collocate(
-                (filesets[0].name, primary),
-                (filesets[1].name, secondary), **kwargs,
-            )
+                    (filesets[0].name, primary),
+                    (filesets[1].name, secondary), **kwargs,
+                )
 
             if collocations is None:
                 self._debug("Found no collocations!")
                 # At least, give the process caller a progress update:
-                yield None, None
-                continue
-
-            # Check whether the collocation data is compatible and was build
-            # correctly
-            check_collocation_data(collocations)
+                yield None, None
 
-            found = [
-                collocations[f"{filesets[0].name}/time"].size,
-                collocations[f"{filesets[1].name}/time"].size
-            ]
+            else:  # after yield the code continues at the next line
 
-            self._debug(
-                f"Found {found[0]} ({filesets[0].name}) and "
-                f"{found[1]} ({filesets[1].name}) collocations"
-            )
+                # Check whether the collocation data is compatible and was build
+                # correctly
+                check_collocation_data(collocations)
 
-            # Add the names of the processed files:
-            for f in range(2):
-                if f"{filesets[f].name}/__file" in collocations.variables:
-                    continue
+                found = [
+                    collocations[f"{filesets[0].name}/time"].size,
+                    collocations[f"{filesets[1].name}/time"].size
+                ]
 
-                collocations[f"{filesets[f].name}/__file"] = files[f].path
+                self._debug(
+                    f"Found {found[0]} ({filesets[0].name}) and "
+                    f"{found[1]} ({filesets[1].name}) collocations"
+                )
 
-            # Collect the attributes of the input files. The attributes get a
-            # prefix, primary or secondary, to allow not-unique names.
-            attributes = {
-                f"primary.{p}" if f == 0 else f"secondary.{p}": v
-                for f, file in enumerate(files)
-                for p, v in file.attr.items()
-            }
+                # Add the names of the processed files:
+                for f in range(2):
+                    if f"{filesets[f].name}/__file" in collocations.variables:
+                        continue
+                    if f == 0:
+                        collocations[f"{filesets[f].name}/__file"] = files[f].path
+                    else:
+                        collocations[f"{filesets[f].name}/__file"] = \
+                            '::'.join([file.path for file in files[1]])
+
+                # Collect the attributes of the input files. The attributes get a
+                # prefix, primary or secondary, to allow not-unique names.
+                # As representative for the combined secondary files, the file attributes
+                # from the first secondary file are taken.
+                attributes = {
+                    f"primary.{p}" if f == 0 else f"secondary.{p}": v
+                    for f, file in enumerate([files[0], files[1][0]])
+                    for p, v in file.attr.items()
+                }
 
-            yield collocations, attributes
+                yield collocations, attributes
 
     def collocate(
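The secondary handling above boils down to stacking several granules along the dimension that carries their time variable and renumbering that dimension. A minimal sketch with toy datasets; the "scnline" dimension name and the granule contents are invented and not typhon's actual data layout:

    import numpy as np
    import xarray as xr

    def toy_granule(start, hours):
        time = (np.datetime64(start, "h") + np.arange(hours)).astype("datetime64[ns]")
        return xr.Dataset({"time": ("scnline", time)},
                          coords={"scnline": np.arange(hours)})

    granules = [toy_granule("2018-01-01T00", 3), toy_granule("2018-01-01T03", 3)]

    # Stack the granules along the dimension of their time variable ...
    secondary = xr.concat(granules, dim=granules[0].time.dims[0])
    # ... and renumber that dimension so the indices stay unique:
    secondary[secondary.time.dims[0]] = np.arange(secondary.time.size)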
@@ -735,6 +752,12 @@ def collocate(
 
         """
 
+        # the datasets must be sorted first by time in case an input file is unsorted
+        if isinstance(primary, tuple):
+            primary = (primary[0], primary[1].sortby("time"))
+        if isinstance(primary, tuple):
+            secondary = (secondary[0], secondary[1].sortby("time"))
+
         if max_distance is None and max_interval is None:
             raise ValueError(
                 "Either max_distance or max_interval must be given!"
@@ -910,7 +933,7 @@ def _prepare_data(self, primary, secondary, max_interval, start, end):
         if not primary_period.size or not secondary_period.size:
             return None, None
 
-        # We need everything sorted by the time, otherwise xarray's stack
+        # We need everything sorted by the time, otherwise xarray's stack (typo?: sel)
         # method makes problems:
         primary_period = primary_period.sortby(primary_period)
         primary_dim = primary_period.dims[0]
@@ -951,15 +974,18 @@ def _get_common_time_period(
         # very annoying bug in time retrieving
         # (https://github.com/pydata/xarray/issues/1240), this is a
         # little bit cumbersome:
+
+        # Approach which can handle NaT values:
+        # added after concatenate was introduced
         common_start = max(
             start,
-            pd.Timestamp(primary.time.values.min().item(0)).tz_localize(None) - max_interval,
-            pd.Timestamp(secondary.time.values.min().item(0)).tz_localize(None) - max_interval
+            pd.to_datetime(primary.time.values).min().tz_localize(None) - max_interval,
+            pd.to_datetime(secondary.time.values).min().tz_localize(None) - max_interval
         )
         common_end = min(
             end,
-            pd.Timestamp(primary.time.values.max().item(0)).tz_localize(None) + max_interval,
-            pd.Timestamp(secondary.time.values.max().item(0)).tz_localize(None) + max_interval
+            pd.to_datetime(primary.time.values).max().tz_localize(None) + max_interval,
+            pd.to_datetime(secondary.time.values).max().tz_localize(None) + max_interval
         )
 
         primary_period = primary.time.where(
@@ -1038,7 +1064,7 @@ def _flat_to_main_coord(data):
         # to use them as indices later.
         for dim in dims:
             if dim not in data.coords:
-                data[dim] = dim, np.arange(data.dims[dim])
+                data[dim] = dim, np.arange(data.sizes[dim])
 
         # We assume that coordinates must be unique! Otherwise, we would have
         # to use this ugly work-around:
@@ -1151,7 +1177,7 @@ def _create_return(
             }
         )
         metadata["interval"] = xr.DataArray(
-            intervals, dims=("collocation", ),
+            intervals.astype('timedelta64[ns]'), dims=("collocation", ),
             attrs={
                 "max_interval": f"Max. interval in secs: {max_interval}",
                 "max_distance": f"Max. distance in kilometers: {max_distance}",
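The switch from pd.Timestamp(...item(0)) to pd.to_datetime(...) matters once the concatenated secondary data can contain NaT values, because a DatetimeIndex skips NaT when taking min/max. A small sketch with invented values:

    import numpy as np
    import pandas as pd

    max_interval = pd.Timedelta("300s")
    times = np.array(["2018-01-01T00:00", "NaT", "2018-01-01T06:00"],
                     dtype="datetime64[ns]")

    # DatetimeIndex.min()/.max() ignore NaT, so the common period stays finite:
    common_start = pd.to_datetime(times).min() - max_interval
    common_end = pd.to_datetime(times).max() + max_interval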
diff --git a/typhon/collocations/common.py b/typhon/collocations/common.py
index cf3156c0..ff88af75 100644
--- a/typhon/collocations/common.py
+++ b/typhon/collocations/common.py
@@ -118,6 +118,8 @@ def read(self, *args, **kwargs):
             return data
         elif self.read_mode == "collapse" or self.read_mode is None:
             # Collapse the data (default)
+            if self.read_mode is None:
+                print('default read_mode is collapse')
             return collapse(data, self.reference, self.collapser)
         elif self.read_mode == "expand":
             # Expand the data
@@ -220,8 +222,8 @@ def search(self, filesets, collocator=None, **kwargs):
 def _rows_for_secondaries(primary):
     """Helper function for collapse"""
 
-    current_row = np.zeros(primary.size, dtype=int)
-    rows = np.zeros(primary.size, dtype=int)
+    current_row = np.zeros(primary.size, dtype=np.int64)
+    rows = np.zeros(primary.size, dtype=np.int64)
     i = 0
     for p in primary:
         rows[i] = current_row[p]
@@ -336,8 +338,9 @@ def collapse(data, reference=None, collapser=None):
     if collapser is None:
         collapser = {}
     collapser = {
-        "mean": lambda m, a: np.nanmean(m, axis=a),
-        "std": lambda m, a: np.nanstd(m, axis=a),
+        "mean": lambda m, a: np.nanmean(m, axis=a),   # if slice consists solely of
+        "std": lambda m, a: np.nanstd(m, axis=a),     # nans, prints RuntimeWarning,
+        # but returns nan as desired
         "number": lambda m, a: np.count_nonzero(~np.isnan(m), axis=a),
         **collapser,
     }
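The note added to the collapser dictionary can be reproduced directly: an all-NaN bin triggers numpy's RuntimeWarning but still collapses to NaN. The toy array below is made up; only the reducer definitions follow the module:

    import warnings
    import numpy as np

    collapser = {
        "mean": lambda m, a: np.nanmean(m, axis=a),
        "std": lambda m, a: np.nanstd(m, axis=a),
        "number": lambda m, a: np.count_nonzero(~np.isnan(m), axis=a),
    }

    binned = np.array([[1.0, 2.0, np.nan],
                       [np.nan, np.nan, np.nan]])   # second bin is all-NaN

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)   # "Mean of empty slice"
        means = collapser["mean"](binned, 1)              # -> [1.5, nan]
    counts = collapser["number"](binned, 1)               # -> [2, 0]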
diff --git a/typhon/files/fileset.py b/typhon/files/fileset.py
index 1edd9489..2013ba3c 100644
--- a/typhon/files/fileset.py
+++ b/typhon/files/fileset.py
@@ -20,10 +20,12 @@ from sys import platform
 import traceback
 import warnings
+import tempfile
 
 import numpy as np
 import pandas as pd
 from fsspec.implementations.local import LocalFileSystem
+from pathlib import Path
 
 import typhon.files
 from typhon.trees import IntervalTree
@@ -1050,8 +1052,8 @@ def find(
     ):
         """ Find all files of this fileset in a given time period.
 
-        The *start* and *end* parameters build a semi-open interval: only the
-        files that are equal or newer than *start* and older than *end* are
+        The *start* and *end* parameters build a closed interval: only the
+        files that are equal or newer than *start* and equal or older than *end* are
         going to be found.
 
         While searching this method checks whether the file lies in the time
@@ -1114,9 +1116,13 @@ def find(
         # The user can give strings instead of datetime objects:
         start = datetime.min if start is None else to_datetime(start)
         end = datetime.max if end is None else to_datetime(end)
-
+        '''
         # We want to have a semi-open interval as explained in the doc string.
+        # Update 09.03.2023: We want a closed interval just like in collocate() -> _get_common_time_period
+        # to be consistent
         end -= timedelta(microseconds=1)
+        '''
+
 
         if end < start:
             raise ValueError(
@@ -1635,8 +1641,11 @@ def get_info(self, file_info, retrieve_via=None):
 
         # Using the handler for getting more information
         if retrieve_via in ("handler", "both"):
-            with typhon.files.decompress(info.path, tmpdir=self.temp_dir) as \
-                    decompressed_path:
+            with typhon.files.decompress(
+                info.path,
+                tmpdir=self.temp_dir,
+                target=Path(info.path).name + next(tempfile._get_candidate_names()),
+            ) as decompressed_path:
                 decompressed_file = info.copy()
                 decompressed_file.path = decompressed_path
                 handler_info = self.handler.get_info(decompressed_file)
@@ -2019,6 +2028,7 @@ def _return(file_info, return_value):
         try:
             # file_info could be a bundle of files
             if isinstance(file_info, FileInfo):
+                print(file_info)
                 file_content = fileset.read(file_info, **read_args)
             else:
                 file_content = \
@@ -2071,7 +2081,7 @@ def _return(file_info, return_value):
 
     def match(
             self, other, start=None, end=None, max_interval=None,
-            filters=None, other_filters=None):
+            filters=None, other_filters=None, skip_file_errors=False):
         """Find matching files between two filesets
 
         Matching files are files that overlap each in their time coverage.
@@ -2097,16 +2107,22 @@ def match(
         Examples:
             TODO: Add example
         """
+
+        # files1 are only searched for within start and end
+        # without considering max_interval
+        start=to_datetime(start)
+        end=to_datetime(end)
         if max_interval is not None:
             max_interval = to_timedelta(max_interval, numbers_as="seconds")
-            start = to_datetime(start) - max_interval
-            end = to_datetime(end) + max_interval
+            start_extended=start-max_interval
+            end_extended=end+max_interval
 
         files1 = list(
-            self.find(start, end, filters=filters)
+            self.find(start, end, filters=filters, no_files_error=not(skip_file_errors))  #skip_file_errors added 22.02.2023
        )
+
         files2 = list(
-            other.find(start, end, filters=other_filters)
+            other.find(start_extended, end_extended, filters=other_filters, no_files_error=not(skip_file_errors))  #skip_file_errors added 22.02.2023
         )
 
         # Convert the times (datetime objects) to seconds (integer)
@@ -2119,6 +2135,8 @@ def match(
             for file in files2
         ]).astype("M8[s]").astype(int)
 
+        if len(times1)==0 or len(times2)==0:
+            return
         if max_interval is not None:
             # Expand the intervals of the secondary fileset to close-in-time
             # intervals.
@@ -2700,8 +2718,8 @@ def read(self, file_info, **read_args):
         read_args = {**self.read_args, **read_args}
 
         if self.decompress:
-            with typhon.files.decompress(file_info.path, tmpdir=self.temp_dir)\
-                    as decompressed_path:
+            with typhon.files.decompress(file_info.path, tmpdir=self.temp_dir, target=Path(file_info.path).name+next(tempfile._get_candidate_names()))\
+                    as decompressed_path:  # modified 14.02.2023
                 decompressed_file = file_info.copy()
                 decompressed_file.path = decompressed_path
                 data = self.handler.read(decompressed_file, **read_args)
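The reworked match() keeps the primary search window at [start, end] and only widens the secondary window by max_interval. A sketch of just that bookkeeping, with invented times:

    from datetime import datetime
    import pandas as pd

    start = datetime(2018, 1, 1)
    end = datetime(2018, 1, 2)
    max_interval = pd.to_timedelta(300, unit="s")

    # Primary files: searched strictly within [start, end].
    # Secondary files: window extended on both sides so matches near the
    # edges of the period are not lost.
    start_extended = start - max_interval
    end_extended = end + max_interval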
diff --git a/typhon/files/handlers/cloudsat.py b/typhon/files/handlers/cloudsat.py
index bd92c29c..e48ca9d6 100644
--- a/typhon/files/handlers/cloudsat.py
+++ b/typhon/files/handlers/cloudsat.py
@@ -1,10 +1,12 @@
-from datetime import datetime
+from datetime import datetime, timedelta
 import warnings
+import re
 
 import numpy as np
 import xarray as xr
 
 from .common import HDF4, expects_file_info
+from pathlib import Path
 
 pyhdf_is_installed = False
 try:
@@ -45,6 +47,7 @@ def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
     @expects_file_info()
+    # new version - works for R04 and R05 data
     def get_info(self, file_info, **kwargs):
         """Return a :class:`FileInfo` object with parameters about the
         file content.
@@ -58,11 +61,18 @@ def get_info(self, file_info, **kwargs):
             A FileInfo object.
         """
 
-        file = SD(file_info.path, SDC.READ)
-        file_info.times[0] = \
-            datetime.strptime(getattr(file, 'start_time'), "%Y%m%d%H%M%S")
-        file_info.times[1] = \
-            datetime.strptime(getattr(file, 'end_time'), "%Y%m%d%H%M%S")
+        fname = Path(file_info.path).name
+        regex = re.compile(r"(?P<year>\d{4})(?P<doy>\d{3})")
+        m = regex.search(fname)
+        year = int(m["year"])
+        doy = int(m["doy"])
+
+        ftimes = super().read(file_info, ["UTC_start", "Profile_time"])
+        utc_start = ftimes["UTC_start"].item(0)
+        duration = ftimes["Profile_time"].item(-1)
+
+        file_info.times[0] = datetime(year, 1, 1) + timedelta(doy - 1, utc_start)
+        file_info.times[1] = file_info.times[0] + timedelta(seconds=duration)
 
         return file_info
 
@@ -71,7 +81,7 @@ def read(self, file_info, **kwargs):
         """Read and parse HDF4 files and load them to a xarray.Dataset
 
         A description about all variables in CloudSat dataset can be found in
-        http://www.cloudsat.cira.colostate.edu/data-products/level-2c/2c-ice?term=53.
+        https://www.cloudsat.cira.colostate.edu/data-products/2c-ice (new link: 16.02.2023)
 
         Args:
             file_info: Path and name of the file as string or FileInfo object.
@@ -97,6 +107,9 @@ def read(self, file_info, **kwargs):
             )
 
         dataset["time"] = self._get_time_field(dataset, file_info)
+        # scnline is only returned as a dimension. Set it to a coordinate (added 16.02.2023)
+        # scnline numbering usually starts with 1
+        dataset=dataset.assign_coords({'scnline':dataset['scnline']+1})
 
         # Remove fields that we do not need any longer (expect the user asked
         # for them explicitly)
@@ -123,10 +136,10 @@ def _get_time_field(self, dataset, file_info):
         profile_times = profile_times.astype("int")
 
         try:
-            date = file_info.times[0].date()
+            date = file_info.times[0].date()  # that should have worked..
         except AttributeError:
             # We have to load the info by ourselves:
-            date = self.get_info(file_info).times[0].date()
+            date = self.get_info(file_info).times[0].date()
 
         # Put all times together so we obtain one full timestamp
         # (date + time) for each data point. We are using the
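The new get_info() derives the granule start from the year/day-of-year encoded in the file name plus the UTC_start offset, and the end from the last Profile_time value. A stand-alone version of that arithmetic; the file name and the two values are made up:

    import re
    from datetime import datetime, timedelta

    fname = "2009071061223_15373_CS_2C-ICE_GRANULE_P1_R05_E02.hdf"   # invented
    m = re.search(r"(?P<year>\d{4})(?P<doy>\d{3})", fname)
    year, doy = int(m["year"]), int(m["doy"])

    utc_start = 22343.5   # seconds since midnight (UTC_start)
    duration = 5932.1     # last Profile_time value in seconds

    start = datetime(year, 1, 1) + timedelta(doy - 1, utc_start)
    end = start + timedelta(seconds=duration)
    # -> start 2009-03-12 06:12:23.5, end roughly 1 h 39 min later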
diff --git a/typhon/files/handlers/common.py b/typhon/files/handlers/common.py
index 7944d049..431ee736 100644
--- a/typhon/files/handlers/common.py
+++ b/typhon/files/handlers/common.py
@@ -673,6 +673,7 @@ def read(self, file_info, fields=None, mapping=None, **kwargs):
         # parameter `group`. To avoid this, we load all groups and their
         # variables by using the netCDF4 directly and load them later into a
         # xarray dataset.
+        # May 2022: xr.open_dataset still not support loading all groups at once
         with netCDF4.Dataset(file_info.path, "r") as root:
             # xarray decode_cf scales, don't do it twice!
@@ -684,6 +685,16 @@ def read(self, file_info, fields=None, mapping=None, **kwargs):
 
         return _xarray_rename_fields(dataset, mapping)
 
+
+    def read_from_list(self, file_info, fields=None, mapping=None, common_dimension=None, **kwargs):  #added
+        if common_dimension is None:
+            raise ValueError("If file_info is given as a list, you have to provide common_dimension as additional parameter.")
+
+        dataset=[]
+        for entry in file_info:
+            dataset.append(self.read(entry, fields, mapping, **kwargs))
+        return xr.concat(dataset,dim=common_dimension)
+
     @staticmethod
     def _get_dimension_name(ds, group, path, dim):
         # If the dimension is defined in the subgroup, use NOT the one of the
diff --git a/typhon/files/handlers/tovs.py b/typhon/files/handlers/tovs.py
index d84bb9de..9dfb07c5 100644
--- a/typhon/files/handlers/tovs.py
+++ b/typhon/files/handlers/tovs.py
@@ -276,7 +276,10 @@ def read(self, file_info, mask_and_scale=True, interpolate_packed_pixels=True,
 
         # All geolocation fields are packed in the AVHRR GAC files:
         if interpolate_packed_pixels:
-            self._interpolate_packed_pixels(dataset, max_nans_interpolation)
+            try:
+                self._interpolate_packed_pixels(dataset, max_nans_interpolation, file_info)
+            except:
+                return None  # try-except added on 24.04.23
             allowed_coords = {'channel', 'calib', 'scnline', 'scnpos'}
         else:
             allowed_coords = {'channel', 'calib', 'scnline', 'scnpos',
@@ -294,7 +297,7 @@ def read(self, file_info, mask_and_scale=True, interpolate_packed_pixels=True,
         return dataset
 
     @staticmethod
-    def _interpolate_packed_pixels(dataset, max_nans_interpolation):
+    def _interpolate_packed_pixels(dataset, max_nans_interpolation, file_info):
         given_pos = np.arange(5, 409, 8)
         new_pos = np.arange(1, 410)
 
@@ -310,8 +313,9 @@ def _interpolate_packed_pixels(dataset, max_nans_interpolation):
 
         if valid_pos.sum() < 52 - max_nans_interpolation:
             raise ValueError(
-                "Too many NaNs in latitude and longitude of this AVHRR file. "
-                "Cannot guarantee a good interpolation!"
+                f"Too many NaNs in latitude and longitude of this AVHRR file. "
+                f"Cannot guarantee a good interpolation! "
+                f"File affected: {file_info}"
             )
 
         # Filter NaNs because CubicSpline cannot handle it:
diff --git a/typhon/retrieval/common.py b/typhon/retrieval/common.py
index 839c02c3..6044cbeb 100644
--- a/typhon/retrieval/common.py
+++ b/typhon/retrieval/common.py
@@ -157,6 +157,7 @@ def _model_to_dict(model):
             attr: copy.deepcopy(getattr(model, attr))
             for attr in model.__dir__()
             if not attr.startswith("__") and attr.endswith("_")
+            if not attr.startswith("_repr_")
         }
     }
 
diff --git a/typhon/retrieval/spareice/common.py b/typhon/retrieval/spareice/common.py
index 64fe58af..40e56431 100644
--- a/typhon/retrieval/spareice/common.py
+++ b/typhon/retrieval/spareice/common.py
@@ -465,7 +465,8 @@ def get_grid_value(grid, lat, lon):
 
         # We do not need the depth of the oceans (this would just
         # confuse the ANN):
-        return_data["elevation"][return_data.elevation < 0] = 0
+
+        return_data.loc[return_data.elevation <0, "elevation"]=0
 
         return return_data
 
@@ -531,15 +532,18 @@ def _retrieve_from_collocations(collocations, _, spareice):
             "description": "True if pixel contains an ice cloud (retrieved"
                            " by SPARE-ICE)."
         }
+        collocations=collocations.to_xarray()
         retrieved["lat"] = collocations["lat"]
         retrieved["lon"] = collocations["lon"]
         retrieved["time"] = collocations["time"]
         retrieved["scnpos"] = collocations["mhs_scnpos"]
-
+        # let index start with 1 (in accordance with scnline)
+        retrieved["index"]= retrieved["index"] + 1
         return retrieved
 
     def retrieve_from_collocations(
-            self, inputs, output, start=None, end=None, processes=None
+            self, inputs, output, start=None, end=None, processes=None,
+            no_files_error=True,
     ):
         """Retrieve SPARE-ICE from collocations between MHS and AVHRR
 
@@ -576,14 +580,15 @@ def retrieve_from_collocations(
         if "elevation" in self.inputs and self.elevation_grid is None:
             raise ValueError("You have to pass a elevation file via init!")
 
-        timer = Timer.start()
+        timer = Timer().start()
 
         if isinstance(inputs, Collocations):
             # Simply apply a map function to all files from these collocations
             inputs.map(
                 SPAREICE._retrieve_from_collocations, kwargs={
                     "spareice": self,
                 }, on_content=True, pass_info=True, start=start, end=end,
-                max_workers=processes, output=output, worker_type="process"
+                max_workers=processes, output=output, worker_type="process",
+                no_files_error=no_files_error  # added 04.05.2023
             )
         elif len(inputs) == 2:
             # Collocate MHS and AVHRR on-the-fly:
@@ -611,7 +616,7 @@ def retrieve_from_collocations(
                 "You need to pass a Collocations object or a list with a MHS "
                 "and AVHRR fileset!"
             )
-        logger.info(f"Took {timer} hours to retrieve SPARE-ICE")
+        logger.info(f"Took {timer} to retrieve SPARE-ICE")
 
     def score(self, data):
         """Calculate the score of SPARE-ICE on testing data
@@ -720,15 +725,13 @@ def report(self, output_dir, experiment, data):
                 subdirectory named `experiment` will be created there. All
                 plots are stored to it.
             experiment: A name for the experiment as a string. Will be included
-                in the title of the plots and used as name for the subdirectory
-                in `output_dir`.
+                in the title of the plots in `output_dir`.
             data: A pandas.DataFrame object with the required input fields.
 
         Returns:
             None
         """
         # Create the output directory:
-        output_dir = join(output_dir, experiment)
         os.makedirs(output_dir, exist_ok=True)
 
         # Run SPARE-ICE!
@@ -750,15 +753,20 @@ def _report_iwp(self, output_dir, experiment, test, retrieved):
             cmap="density", vmin=5,
         )
         scat.cmap.set_under("w")
+
+        # for framing
+        ax.spines['top'].set_visible(True)
+        ax.spines['right'].set_visible(True)
+
         ax.set_xlabel("log10 IWP (2C-ICE) [g/m^2]")
         ax.set_ylabel("log10 IWP (SPARE-ICE) [g/m^2]")
         ax.set_title(experiment)
-        fig.colorbar(scat, label="Number of points")
-        fig.savefig(join(output_dir, "2C-ICE-SPAREICE_heatmap.png"))
+        fig.colorbar(scat, label="\nNumber of points")
+        fig.savefig(join(output_dir, "2C-ICE-SPAREICE_heatmap.pdf"))
 
         self._plot_scatter(
             experiment,
-            join(output_dir, "2C-ICE-SPAREICE_scatter_{area}.png"),
+            join(output_dir, "2C-ICE-SPAREICE_scatter_{area}.pdf"),
             test.iwp, retrieved.iwp, test.sea_mask.values
         )
 
@@ -773,7 +781,7 @@ def _report_iwp(self, output_dir, experiment, test, retrieved):
             - 1
         )
         self._plot_error(
-            experiment, join(output_dir, "2C-ICE-SPAREICE_mfe.png"),
+            experiment, join(output_dir, "2C-ICE-SPAREICE_mfe.pdf"),
             test, fe, test.sea_mask.values,
         )
 
@@ -781,10 +789,10 @@ def _report_iwp(self, output_dir, experiment, test, retrieved):
         # Plot the bias:
         bias = retrieved.iwp.values - test.iwp.values
         self._plot_error(
-            experiment, join(output_dir, "2C-ICE-SPAREICE_bias.png"),
+            experiment, join(output_dir, "2C-ICE-SPAREICE_bias.pdf"),
             test, bias,
             test.sea_mask.values,
-            mfe=False, yrange=[-0.35, 0.45],
+            mfe=False,
         )
 
         # self._plot_weights(
@@ -814,11 +822,16 @@ def _plot_scatter(experiment, file, xdata, ydata, sea_mask):
                 xdata[mask], ydata[mask],
                 s=1, alpha=0.6
             )
+
+            # for framing
+            ax.spines['top'].set_visible(True)
+            ax.spines['right'].set_visible(True)
+
             ax.grid()
             ax.set_xlabel("log10 IWP (2C-ICE) [g/m^2]")
             ax.set_ylabel("log10 IWP (SPARE-ICE) [g/m^2]")
             ax.set_title(f"{experiment} - {area}")
-            fig.savefig(file.format(area=area))
+            fig.savefig(file.format(area=area), bbox_inches='tight')
 
     @staticmethod
     def _plot_error(
@@ -831,10 +844,11 @@ def _plot_error(
 
         if mfe:
             ax.set_ylabel("Median fractional error [%]")
-            ax.set_ylim([0, 200])
             statistic = "median"
         else:
-            ax.set_ylabel("$\Delta$ IWP (SPARE-ICE - 2C-ICE) [log 10 g/m^2]")
+            ax.set_ylabel("$\\Delta$ IWP (SPARE-ICE - 2C-ICE) [log 10 g/m^2]")
+            plt.hlines(0, xrange[0], xrange[1], color='red', linewidth=2)
+            ax.set_xlim(xrange)
             statistic = "mean"
 
         for hemisphere in ["global"]:
@@ -862,10 +876,14 @@ def _plot_error(
         ax.grid()
         ax.legend(fancybox=True)
         ax.set_title(f"Experiment: {experiment}")
+        if mfe:
+            ax.set_ylim(ymin=0)
         if yrange is not None:
             ax.set_ylim(yrange)
-        fig.tight_layout()
-        fig.savefig(file)
+
+        ax.set_xlim(xrange)
+
+        fig.savefig(file, bbox_inches='tight', dpi=120)
 
     def _plot_weights(self, title, file, layer_index=0, vmin=-5, vmax=5):
         import seaborn as sns
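The fe array handed to _plot_error above is a fractional error in percent; its median is what the "Median fractional error" axis shows. The conventional definition in terms of log10 IWP is sketched below with invented values — the exact expression used in the module lies outside this excerpt:

    import numpy as np

    retrieved_log_iwp = np.array([1.1, 2.0, 0.4])   # log10 IWP from SPARE-ICE
    reference_log_iwp = np.array([1.0, 2.2, 0.4])   # log10 IWP from 2C-ICE

    # Symmetric in over- and underestimation:
    fe = 100 * (np.exp(np.abs(np.log(10**retrieved_log_iwp / 10**reference_log_iwp))) - 1)
    mfe = np.median(fe)   # median fractional error in percent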
@@ -889,19 +907,65 @@ def _plot_weights(self, title, file, layer_index=0, vmin=-5, vmax=5):
         f.tight_layout()
         f.savefig(file)
 
-    def _report_ice_cloud(self, output_dir, experiment, test, retrieved):
+    def _report_ice_cloud_old(self, output_dir, experiment, test, retrieved):
         # Confusion matrix:
         fig, ax = plt.subplots(figsize=(12, 10))
         cm = confusion_matrix(test.ice_cloud, retrieved.ice_cloud)
         img = self._plot_matrix(cm, classes=["Yes", "No"], normalize=True)
-        fig.colorbar(img, label="probability")
+        fig.colorbar(img, label="\nprobability")
         ax.set_title("Ice Cloud Classifier - Performance")
         ax.set_ylabel('real ice cloud')
         ax.set_xlabel('predicted ice cloud')
-        fig.tight_layout()
-        fig.savefig(join(output_dir, "ice-cloud-confusion-matrix.png"))
+        fig.savefig(join(output_dir, "ice-cloud-confusion-matrix.jpg"))
+
+        fig, ax = plt.subplots(figsize=(12, 10))
+        ax.barh(
+            np.arange(len(self.ice_cloud.inputs)),
+            self.ice_cloud.estimator.feature_importances_
+        )
+        ax.set_yticks(np.arange(len(self.ice_cloud.inputs)))
+        ax.set_yticklabels(self.ice_cloud.inputs)
+        ax.set_xlabel("Feature Importance")
+        ax.set_ylabel("Feature")
+        ax.set_title("Ice Cloud Classifier - Importance")
+        fig.savefig(join(output_dir, "ice-cloud-feature-importance.jpg"))
+
+
+    def _report_ice_cloud(self, output_dir, experiment, test, retrieved,normalize=True, **kwargs):
+        from seaborn import heatmap
+
+        # ice-cloud confusion matrix
+        cm = confusion_matrix(test.ice_cloud, retrieved.ice_cloud, labels=[1,0])
+
+        if normalize:
+            cmsum=cm.sum(axis=1)[:, np.newaxis]
+            cmsum=np.where(cmsum==0,1,cmsum)  # prevent warning in true_divide
+            cm = cm.astype('float') / cmsum
+
+            fmt_prob=".2f"#"%"
+            kwargs["vmin"]=0
+            kwargs["vmax"]=1
+        else:
+            fmt_prob="d"
+
+        default_kwargs = {
+            "cmap": "Blues",
+            **kwargs }
+
+        fig,ax= plt.subplots(figsize=(12, 10))
+        ax=heatmap(cm, annot=True, cbar=True, cbar_kws={"label":"Probability"},
+                   square=True,fmt=fmt_prob,
+                   **default_kwargs)
+        ax.set_title("Ice Cloud Classifier - Performance")
+        ax.set_ylabel('Real ice cloud')
+        ax.set_xlabel('Predicted ice cloud')
+        ax.xaxis.set_ticklabels(['Yes','No'])
+        ax.yaxis.set_ticklabels(['Yes','No'])
+        fig.savefig(join(output_dir, "ice-cloud-confusion-matrix.pdf"))
 
+        # ice-cloud feature importannce'
         fig, ax = plt.subplots(figsize=(12, 10))
+        print(self.ice_cloud.estimator.feature_importances_)
         ax.barh(
             np.arange(len(self.ice_cloud.inputs)),
             self.ice_cloud.estimator.feature_importances_
@@ -911,7 +975,8 @@ def _report_ice_cloud(self, output_dir, experiment, test, retrieved):
         ax.set_xlabel("Feature Importance")
         ax.set_ylabel("Feature")
         ax.set_title("Ice Cloud Classifier - Importance")
-        fig.savefig(join(output_dir, "ice-cloud-feature-importance.png"))
+        fig.savefig(join(output_dir, "ice-cloud-feature-importance.pdf"))
+
 
     @staticmethod
     def _plot_matrix(

From a56e9ec7c6b90b811b9542b9bf418a2f4cf16b15 Mon Sep 17 00:00:00 2001
From: Oliver Lemke
Date: Wed, 20 Nov 2024 13:33:34 +0100
Subject: [PATCH 3/4] Fix xarray's nanosecond warning

---
 typhon/tests/collocations/test_collocations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/typhon/tests/collocations/test_collocations.py b/typhon/tests/collocations/test_collocations.py
index 151e67c4..425b21ff 100644
--- a/typhon/tests/collocations/test_collocations.py
+++ b/typhon/tests/collocations/test_collocations.py
@@ -87,7 +87,7 @@ def test_collocate_collapse_expand(self):
         collocator = Collocator()
 
         test = xr.Dataset({
-            "time": ("time", np.arange("2000", "2010", dtype="M8[Y]")),
+            "time": ("time", np.arange("2000", "2010", dtype="M8[Y]").astype('datetime64[ns]')),
             "lat": ("time", np.arange(10)),
             "lon": ("time", np.arange(10)),
         })
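The cast above addresses xarray's warning about non-nanosecond datetimes: converting the coarse year-resolution values to datetime64[ns] before building the Dataset keeps the time axis warning-free. Minimal stand-alone reproduction of the same conversion:

    import numpy as np
    import xarray as xr

    years = np.arange("2000", "2010", dtype="M8[Y]").astype("datetime64[ns]")
    ds = xr.Dataset({
        "time": ("time", years),
        "lat": ("time", np.arange(10)),
        "lon": ("time", np.arange(10)),
    })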
From b3fa04ff88fe46d37663366a492b16d5ffc09e43 Mon Sep 17 00:00:00 2001
From: Oliver Lemke
Date: Wed, 20 Nov 2024 13:34:17 +0100
Subject: [PATCH 4/4] Adjust timeframe in test cases

Collocation end time is now inclusive instead of exclusive.
---
 typhon/tests/files/test_fileset.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/typhon/tests/files/test_fileset.py b/typhon/tests/files/test_fileset.py
index c754eb53..d845b2f0 100644
--- a/typhon/tests/files/test_fileset.py
+++ b/typhon/tests/files/test_fileset.py
@@ -292,7 +292,7 @@ def test_tutorial(self, file_system):
         # Should not find anything:
         empty = list(
             filesets["tutorial"].find(
-                "2017-12-31", "2018-01-01", no_files_error=False
+                "2017-12-31", "2017-12-31 23:59", no_files_error=False
             ))
 
         assert not empty
@@ -525,7 +525,7 @@ def test_single(self, file_system):
         # Should not find anything:
         empty = list(
             filesets["single"].find(
-                "2016-12-31", "2018-01-01", no_files_error=False
+                "2016-12-31", "2017-12-31 23:59", no_files_error=False
             ))
 
         assert not empty
@@ -571,14 +571,14 @@ def test_sequence(self, file_system):
         # Should not find anything:
         empty = list(
             filesets["sequence-placeholder"].find(
-                "2016-12-31", "2018-01-01", no_files_error=False
+                "2016-12-31", "2017-12-31 23:59", no_files_error=False
             ))
 
         assert not empty
 
         # Should find two files:
         found_files = list(
             filesets["sequence-placeholder"].find(
-                "2018-01-01", "2018-01-02",
+                "2018-01-01", "2018-01-01 23:59",
             ))
 
         check = [
@@ -597,7 +597,7 @@ def test_sequence(self, file_system):
         # Should find two files and should return them in two bins:
         found_files = list(
             filesets["sequence-placeholder"].find(
-                "2018-01-01", "2018-01-02", bundle="6h",
+                "2018-01-01", "2018-01-01 23:59", bundle="6h",
             ))
 
         check = [
@@ -631,7 +631,7 @@ def test_logs(self, file_system, caplog):
             ))
         assert ("Find files for sequence-placeholder "
                 "between 2018-01-01 00:00:00 and "
-                "2018-01-01 23:59:59") in caplog.text
+                "2018-01-02 00:00:00") in caplog.text
 
         assert f"via file system" in caplog.text
 
@@ -650,14 +650,14 @@ def test_sequence_placeholder(self, file_system):
         # Should not find anything:
         empty = list(
             filesets["sequence-placeholder"].find(
-                "2016-12-31", "2018-01-01", no_files_error=False
+                "2016-12-31", "2017-12-31 23:59", no_files_error=False
             ))
 
         assert not empty
 
         # Should find two files:
         found_files = list(
             filesets["sequence-placeholder"].find(
-                "2018-01-01", "2018-01-02",
+                "2018-01-01", "2018-01-01 23:59",
             ))
 
         check = [
@@ -676,7 +676,7 @@ def test_sequence_placeholder(self, file_system):
         # Should find two files and should return them in two bins:
         found_files = list(
             filesets["sequence-placeholder"].find(
-                "2018-01-01", "2018-01-02", bundle="6h",
+                "2018-01-01", "2018-01-01 23:59", bundle="6h",
             ))
 
         check = [