From 3e47563ee582a42d7e048b0a02e9fd09df7fd629 Mon Sep 17 00:00:00 2001
From: Oliver Lemke
Date: Wed, 20 Nov 2024 12:56:24 +0100
Subject: [PATCH 1/4] Avoid problem in true divide

---
 typhon/geographical.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/typhon/geographical.py b/typhon/geographical.py
index ab523d89..9a74153b 100644
--- a/typhon/geographical.py
+++ b/typhon/geographical.py
@@ -324,7 +324,11 @@ def gridded_mean(lat, lon, data, grid):
     grid_sum, _, _ = np.histogram2d(lat, lon, grid, weights=data)
     grid_number, _, _ = np.histogram2d(lat, lon, grid)
 
-    return grid_sum / grid_number, grid_number
+    grid_number_nozeros = np.where(grid_number == 0, -9999, grid_number)
+    gridded_mean = np.where(
+        grid_number_nozeros == -9999, 0, grid_sum / grid_number_nozeros
+    )
+    return gridded_mean, grid_number
 
 
 def sea_mask(lat, lon, mask):
@@ -387,5 +391,3 @@ def _to_array(item):
     lon_cell = lon / mask_lon_step
 
     return mask[lat_cell.astype(int), lon_cell.astype(int)]
-
-
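For reference, the divide guard introduced in gridded_mean above can be exercised on its own. The sample points and the 10-degree grid below are made up; only the np.histogram2d / np.where pattern mirrors the patch:

    import numpy as np

    # Made-up sample points binned onto a coarse 10-degree grid:
    lat = np.array([10.2, 10.8, 40.1])
    lon = np.array([5.1, 5.9, 120.3])
    data = np.array([1.0, 3.0, 7.0])
    grid = [np.arange(0, 91, 10), np.arange(0, 181, 10)]

    grid_sum, _, _ = np.histogram2d(lat, lon, grid, weights=data)
    grid_number, _, _ = np.histogram2d(lat, lon, grid)

    # Swap zero counts for a dummy denominator so 0/0 is never evaluated,
    # then zero out exactly those empty cells in the result:
    safe_number = np.where(grid_number == 0, -9999, grid_number)
    mean = np.where(safe_number == -9999, 0, grid_sum / safe_number)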
From 649c9cd79de6142dacfeba68073f0e5e79f5b9f1 Mon Sep 17 00:00:00 2001
From: Oliver Lemke
Date: Wed, 20 Nov 2024 13:33:16 +0100
Subject: [PATCH 2/4] Lorena's collocation and spareice updates

These are the improvements and fixes made by Lorena during her Master's
thesis.
---
 typhon/collocations/collocator.py   | 120 +++++++++++++++++-----
 typhon/collocations/common.py       |  11 ++-
 typhon/files/fileset.py             |  42 +++++++---
 typhon/files/handlers/cloudsat.py   |  31 ++++---
 typhon/files/handlers/common.py     |  11 +++
 typhon/files/handlers/tovs.py       |  12 ++-
 typhon/retrieval/common.py          |   1 +
 typhon/retrieval/spareice/common.py | 115 ++++++++++++++++++++------
 8 files changed, 242 insertions(+), 101 deletions(-)

diff --git a/typhon/collocations/collocator.py b/typhon/collocations/collocator.py
index de9c2c2b..ae443340 100644
--- a/typhon/collocations/collocator.py
+++ b/typhon/collocations/collocator.py
@@ -192,11 +192,14 @@ def collocate_filesets(
 
         self._info(f"Collocate from {start} to {end}")
 
-        # Find the files from both filesets which overlap tempoerally.
+        # Find the files from both filesets which overlap temporally.
         matches = list(filesets[0].match(
             filesets[1], start=start, end=end, max_interval=max_interval,
+            skip_file_errors=skip_file_errors
         ))
-
+        if len(matches) == 0:
+            return
+
 
         if processes is None:
             processes = 1
@@ -540,59 +543,73 @@ def _collocate_matches(
             skip_errors=skip_file_errors,
         )
 
+        for iprimary in range(len(matches)):
+            nfound_secondaries=len(matches[iprimary][1])
+            got_matches=[next(loaded_matches) for isecondary in range(nfound_secondaries)]
 
-        for loaded_match in loaded_matches:
-            # The FileInfo objects of the matched files:
-            files = loaded_match[0][0], loaded_match[1][0]
+            # Get FileInfos
+            primary_infos=got_matches[0][0][0]
+            secondaries_infos = [nmatch[1][0] for nmatch in got_matches]
+            files = primary_infos, secondaries_infos
+            # Get datasets
+            primary = got_matches[0][0][1].copy()
+            secondaries = [nmatch[1][1].copy() for nmatch in got_matches]
 
-            # We copy the data from the matches since they might be used for
-            # other matches as well:
-            primary, secondary = \
-                loaded_match[0][1].copy(), loaded_match[1][1].copy()
+            # Combine secondary files
+            secondary = xr.concat(secondaries, dim=secondaries[0].time.dims[0])
 
-            self._debug(f"Collocate {files[0].path}\nwith {files[1].path}")
+            # Update dimension numbering
+            secondary[secondary.time.dims[0]] = np.arange(len(secondary.time))
+            self._debug(f"Collocate {files[0].path}\nwith {[file.path for file in files[1]]}")
+
             collocations = self.collocate(
-                (filesets[0].name, primary),
-                (filesets[1].name, secondary), **kwargs,
-            )
+                    (filesets[0].name, primary),
+                    (filesets[1].name, secondary), **kwargs,
+                )
 
             if collocations is None:
                 self._debug("Found no collocations!")
                 # At least, give the process caller a progress update:
-                yield None, None
-                continue
-
-            # Check whether the collocation data is compatible and was build
-            # correctly
-            check_collocation_data(collocations)
+                yield None, None
 
-            found = [
-                collocations[f"{filesets[0].name}/time"].size,
-                collocations[f"{filesets[1].name}/time"].size
-            ]
+            else:  # after yield the code continues at the next line
 
-            self._debug(
-                f"Found {found[0]} ({filesets[0].name}) and "
-                f"{found[1]} ({filesets[1].name}) collocations"
-            )
+                # Check whether the collocation data is compatible and was build
+                # correctly
+                check_collocation_data(collocations)
 
-            # Add the names of the processed files:
-            for f in range(2):
-                if f"{filesets[f].name}/__file" in collocations.variables:
-                    continue
+                found = [
+                    collocations[f"{filesets[0].name}/time"].size,
+                    collocations[f"{filesets[1].name}/time"].size
+                ]
 
-                collocations[f"{filesets[f].name}/__file"] = files[f].path
+                self._debug(
+                    f"Found {found[0]} ({filesets[0].name}) and "
+                    f"{found[1]} ({filesets[1].name}) collocations"
+                )
 
-            # Collect the attributes of the input files. The attributes get a
-            # prefix, primary or secondary, to allow not-unique names.
-            attributes = {
-                f"primary.{p}" if f == 0 else f"secondary.{p}": v
-                for f, file in enumerate(files)
-                for p, v in file.attr.items()
-            }
+                # Add the names of the processed files:
+                for f in range(2):
+                    if f"{filesets[f].name}/__file" in collocations.variables:
+                        continue
+                    if f == 0:
+                        collocations[f"{filesets[f].name}/__file"] = files[f].path
+                    else:
+                        collocations[f"{filesets[f].name}/__file"] = \
+                            '::'.join([file.path for file in files[1]])
+
+                # Collect the attributes of the input files. The attributes get a
+                # prefix, primary or secondary, to allow not-unique names.
+                # As representative for the combined secondary files, the file attributes
+                # from the first secondary file are taken.
+                attributes = {
+                    f"primary.{p}" if f == 0 else f"secondary.{p}": v
+                    for f, file in enumerate([files[0], files[1][0]])
+                    for p, v in file.attr.items()
+                }
 
-            yield collocations, attributes
+                yield collocations, attributes
 
     def collocate(
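The secondary handling above boils down to stacking several granules along the dimension that carries their time variable and renumbering that dimension. A minimal sketch with toy datasets; the "scnline" dimension name and the granule contents are invented and not typhon's actual data layout:

    import numpy as np
    import xarray as xr

    def toy_granule(start, hours):
        time = (np.datetime64(start, "h") + np.arange(hours)).astype("datetime64[ns]")
        return xr.Dataset({"time": ("scnline", time)},
                          coords={"scnline": np.arange(hours)})

    granules = [toy_granule("2018-01-01T00", 3), toy_granule("2018-01-01T03", 3)]

    # Stack the granules along the dimension of their time variable ...
    secondary = xr.concat(granules, dim=granules[0].time.dims[0])
    # ... and renumber that dimension so the indices stay unique:
    secondary[secondary.time.dims[0]] = np.arange(secondary.time.size)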
@@ -735,6 +752,12 @@ def collocate(
 
         """
 
+        # the datasets must be sorted first by time in case an input file is unsorted
+        if isinstance(primary, tuple):
+            primary = (primary[0], primary[1].sortby("time"))
+        if isinstance(primary, tuple):
+            secondary = (secondary[0], secondary[1].sortby("time"))
+
         if max_distance is None and max_interval is None:
             raise ValueError(
                 "Either max_distance or max_interval must be given!"
@@ -910,7 +933,7 @@ def _prepare_data(self, primary, secondary, max_interval, start, end):
         if not primary_period.size or not secondary_period.size:
             return None, None
 
-        # We need everything sorted by the time, otherwise xarray's stack
+        # We need everything sorted by the time, otherwise xarray's stack (typo?: sel)
         # method makes problems:
         primary_period = primary_period.sortby(primary_period)
         primary_dim = primary_period.dims[0]
@@ -951,15 +974,18 @@ def _get_common_time_period(
         # very annoying bug in time retrieving
         # (https://github.com/pydata/xarray/issues/1240), this is a
         # little bit cumbersome:
+
+        # Approach which can handle NaT values:
+        # added after concatenate was introduced
         common_start = max(
             start,
-            pd.Timestamp(primary.time.values.min().item(0)).tz_localize(None) - max_interval,
-            pd.Timestamp(secondary.time.values.min().item(0)).tz_localize(None) - max_interval
+            pd.to_datetime(primary.time.values).min().tz_localize(None) - max_interval,
+            pd.to_datetime(secondary.time.values).min().tz_localize(None) - max_interval
         )
         common_end = min(
             end,
-            pd.Timestamp(primary.time.values.max().item(0)).tz_localize(None) + max_interval,
-            pd.Timestamp(secondary.time.values.max().item(0)).tz_localize(None) + max_interval
+            pd.to_datetime(primary.time.values).max().tz_localize(None) + max_interval,
+            pd.to_datetime(secondary.time.values).max().tz_localize(None) + max_interval
         )
 
         primary_period = primary.time.where(
@@ -1038,7 +1064,7 @@ def _flat_to_main_coord(data):
         # to use them as indices later.
         for dim in dims:
             if dim not in data.coords:
-                data[dim] = dim, np.arange(data.dims[dim])
+                data[dim] = dim, np.arange(data.sizes[dim])
 
         # We assume that coordinates must be unique! Otherwise, we would have
         # to use this ugly work-around:
@@ -1151,7 +1177,7 @@ def _create_return(
             }
         )
         metadata["interval"] = xr.DataArray(
-            intervals, dims=("collocation", ),
+            intervals.astype('timedelta64[ns]'), dims=("collocation", ),
             attrs={
                 "max_interval": f"Max. interval in secs: {max_interval}",
                 "max_distance": f"Max. distance in kilometers: {max_distance}",
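The switch from pd.Timestamp(...item(0)) to pd.to_datetime(...) matters once the concatenated secondary data can contain NaT values, because a DatetimeIndex skips NaT when taking min/max. A small sketch with invented values:

    import numpy as np
    import pandas as pd

    max_interval = pd.Timedelta("300s")
    times = np.array(["2018-01-01T00:00", "NaT", "2018-01-01T06:00"],
                     dtype="datetime64[ns]")

    # DatetimeIndex.min()/.max() ignore NaT, so the common period stays finite:
    common_start = pd.to_datetime(times).min() - max_interval
    common_end = pd.to_datetime(times).max() + max_interval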
diff --git a/typhon/collocations/common.py b/typhon/collocations/common.py
index cf3156c0..ff88af75 100644
--- a/typhon/collocations/common.py
+++ b/typhon/collocations/common.py
@@ -118,6 +118,8 @@ def read(self, *args, **kwargs):
             return data
         elif self.read_mode == "collapse" or self.read_mode is None:
             # Collapse the data (default)
+            if self.read_mode is None:
+                print('default read_mode is collapse')
             return collapse(data, self.reference, self.collapser)
         elif self.read_mode == "expand":
             # Expand the data
@@ -220,8 +222,8 @@ def search(self, filesets, collocator=None, **kwargs):
 def _rows_for_secondaries(primary):
     """Helper function for collapse"""
 
-    current_row = np.zeros(primary.size, dtype=int)
-    rows = np.zeros(primary.size, dtype=int)
+    current_row = np.zeros(primary.size, dtype=np.int64)
+    rows = np.zeros(primary.size, dtype=np.int64)
     i = 0
     for p in primary:
         rows[i] = current_row[p]
@@ -336,8 +338,9 @@ def collapse(data, reference=None, collapser=None):
     if collapser is None:
         collapser = {}
     collapser = {
-        "mean": lambda m, a: np.nanmean(m, axis=a),
-        "std": lambda m, a: np.nanstd(m, axis=a),
+        "mean": lambda m, a: np.nanmean(m, axis=a),   # if slice consists solely of
+        "std": lambda m, a: np.nanstd(m, axis=a),     # nans, prints RuntimeWarning,
+        # but returns nan as desired
         "number": lambda m, a: np.count_nonzero(~np.isnan(m), axis=a),
         **collapser,
     }
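The note added to the collapser dictionary can be reproduced directly: an all-NaN bin triggers numpy's RuntimeWarning but still collapses to NaN. The toy array below is made up; only the reducer definitions follow the module:

    import warnings
    import numpy as np

    collapser = {
        "mean": lambda m, a: np.nanmean(m, axis=a),
        "std": lambda m, a: np.nanstd(m, axis=a),
        "number": lambda m, a: np.count_nonzero(~np.isnan(m), axis=a),
    }

    binned = np.array([[1.0, 2.0, np.nan],
                       [np.nan, np.nan, np.nan]])   # second bin is all-NaN

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", RuntimeWarning)   # "Mean of empty slice"
        means = collapser["mean"](binned, 1)              # -> [1.5, nan]
    counts = collapser["number"](binned, 1)               # -> [2, 0]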
diff --git a/typhon/files/fileset.py b/typhon/files/fileset.py
index 1edd9489..2013ba3c 100644
--- a/typhon/files/fileset.py
+++ b/typhon/files/fileset.py
@@ -20,10 +20,12 @@ from sys import platform
 import traceback
 import warnings
+import tempfile
 
 import numpy as np
 import pandas as pd
 from fsspec.implementations.local import LocalFileSystem
+from pathlib import Path
 
 import typhon.files
 from typhon.trees import IntervalTree
@@ -1050,8 +1052,8 @@ def find(
     ):
         """ Find all files of this fileset in a given time period.
 
-        The *start* and *end* parameters build a semi-open interval: only the
-        files that are equal or newer than *start* and older than *end* are
+        The *start* and *end* parameters build a closed interval: only the
+        files that are equal or newer than *start* and equal or older than *end* are
         going to be found.
 
         While searching this method checks whether the file lies in the time
@@ -1114,9 +1116,13 @@ def find(
         # The user can give strings instead of datetime objects:
         start = datetime.min if start is None else to_datetime(start)
         end = datetime.max if end is None else to_datetime(end)
-
+        '''
         # We want to have a semi-open interval as explained in the doc string.
+        # Update 09.03.2023: We want a closed interval just like in collocate() -> _get_common_time_period
+        # to be consistent
         end -= timedelta(microseconds=1)
+        '''
+
 
         if end < start:
             raise ValueError(
@@ -1635,8 +1641,11 @@ def get_info(self, file_info, retrieve_via=None):
 
         # Using the handler for getting more information
         if retrieve_via in ("handler", "both"):
-            with typhon.files.decompress(info.path, tmpdir=self.temp_dir) as \
-                    decompressed_path:
+            with typhon.files.decompress(
+                info.path,
+                tmpdir=self.temp_dir,
+                target=Path(info.path).name + next(tempfile._get_candidate_names()),
+            ) as decompressed_path:
                 decompressed_file = info.copy()
                 decompressed_file.path = decompressed_path
                 handler_info = self.handler.get_info(decompressed_file)
@@ -2019,6 +2028,7 @@ def _return(file_info, return_value):
         try:
             # file_info could be a bundle of files
             if isinstance(file_info, FileInfo):
+                print(file_info)
                 file_content = fileset.read(file_info, **read_args)
             else:
                 file_content = \
@@ -2071,7 +2081,7 @@ def _return(file_info, return_value):
 
     def match(
             self, other, start=None, end=None, max_interval=None,
-            filters=None, other_filters=None):
+            filters=None, other_filters=None, skip_file_errors=False):
         """Find matching files between two filesets
 
         Matching files are files that overlap each in their time coverage.
@@ -2097,16 +2107,22 @@ def match(
         Examples:
             TODO: Add example
         """
+
+        # files1 are only searched for within start and end
+        # without considering max_interval
+        start=to_datetime(start)
+        end=to_datetime(end)
         if max_interval is not None:
             max_interval = to_timedelta(max_interval, numbers_as="seconds")
-            start = to_datetime(start) - max_interval
-            end = to_datetime(end) + max_interval
+            start_extended=start-max_interval
+            end_extended=end+max_interval
 
         files1 = list(
-            self.find(start, end, filters=filters)
+            self.find(start, end, filters=filters, no_files_error=not(skip_file_errors))  #skip_file_errors added 22.02.2023
        )
+
         files2 = list(
-            other.find(start, end, filters=other_filters)
+            other.find(start_extended, end_extended, filters=other_filters, no_files_error=not(skip_file_errors))  #skip_file_errors added 22.02.2023
         )
 
         # Convert the times (datetime objects) to seconds (integer)
@@ -2119,6 +2135,8 @@ def match(
             for file in files2
         ]).astype("M8[s]").astype(int)
 
+        if len(times1)==0 or len(times2)==0:
+            return
         if max_interval is not None:
             # Expand the intervals of the secondary fileset to close-in-time
             # intervals.
@@ -2700,8 +2718,8 @@ def read(self, file_info, **read_args):
         read_args = {**self.read_args, **read_args}
 
         if self.decompress:
-            with typhon.files.decompress(file_info.path, tmpdir=self.temp_dir)\
-                    as decompressed_path:
+            with typhon.files.decompress(file_info.path, tmpdir=self.temp_dir, target=Path(file_info.path).name+next(tempfile._get_candidate_names()))\
+                    as decompressed_path:  # modified 14.02.2023
                 decompressed_file = file_info.copy()
                 decompressed_file.path = decompressed_path
                 data = self.handler.read(decompressed_file, **read_args)
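The reworked match() keeps the primary search window at [start, end] and only widens the secondary window by max_interval. A sketch of just that bookkeeping, with invented times:

    from datetime import datetime
    import pandas as pd

    start = datetime(2018, 1, 1)
    end = datetime(2018, 1, 2)
    max_interval = pd.to_timedelta(300, unit="s")

    # Primary files: searched strictly within [start, end].
    # Secondary files: window extended on both sides so matches near the
    # edges of the period are not lost.
    start_extended = start - max_interval
    end_extended = end + max_interval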
diff --git a/typhon/files/handlers/cloudsat.py b/typhon/files/handlers/cloudsat.py
index bd92c29c..e48ca9d6 100644
--- a/typhon/files/handlers/cloudsat.py
+++ b/typhon/files/handlers/cloudsat.py
@@ -1,10 +1,12 @@
-from datetime import datetime
+from datetime import datetime, timedelta
 import warnings
+import re
 
 import numpy as np
 import xarray as xr
 
 from .common import HDF4, expects_file_info
+from pathlib import Path
 
 pyhdf_is_installed = False
 try:
@@ -45,6 +47,7 @@ def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
     @expects_file_info()
+    # new version - works for R04 and R05 data
     def get_info(self, file_info, **kwargs):
         """Return a :class:`FileInfo` object with parameters about the
         file content.
@@ -58,11 +61,18 @@ def get_info(self, file_info, **kwargs):
             A FileInfo object.
         """
 
-        file = SD(file_info.path, SDC.READ)
-        file_info.times[0] = \
-            datetime.strptime(getattr(file, 'start_time'), "%Y%m%d%H%M%S")
-        file_info.times[1] = \
-            datetime.strptime(getattr(file, 'end_time'), "%Y%m%d%H%M%S")
+        fname = Path(file_info.path).name
+        regex = re.compile(r"(?P<year>\d{4})(?P<doy>\d{3})")
+        m = regex.search(fname)
+        year = int(m["year"])
+        doy = int(m["doy"])
+
+        ftimes = super().read(file_info, ["UTC_start", "Profile_time"])
+        utc_start = ftimes["UTC_start"].item(0)
+        duration = ftimes["Profile_time"].item(-1)
+
+        file_info.times[0] = datetime(year, 1, 1) + timedelta(doy - 1, utc_start)
+        file_info.times[1] = file_info.times[0] + timedelta(seconds=duration)
 
         return file_info
 
@@ -71,7 +81,7 @@ def read(self, file_info, **kwargs):
         """Read and parse HDF4 files and load them to a xarray.Dataset
 
         A description about all variables in CloudSat dataset can be found in
-        http://www.cloudsat.cira.colostate.edu/data-products/level-2c/2c-ice?term=53.
+        https://www.cloudsat.cira.colostate.edu/data-products/2c-ice (new link: 16.02.2023)
 
         Args:
             file_info: Path and name of the file as string or FileInfo object.
@@ -97,6 +107,9 @@ def read(self, file_info, **kwargs):
             )
 
         dataset["time"] = self._get_time_field(dataset, file_info)
+        # scnline is only returned as a dimension. Set it to a coordinate (added 16.02.2023)
+        # scnline numbering usually starts with 1
+        dataset=dataset.assign_coords({'scnline':dataset['scnline']+1})
 
         # Remove fields that we do not need any longer (expect the user asked
         # for them explicitly)
@@ -123,10 +136,10 @@ def _get_time_field(self, dataset, file_info):
         profile_times = profile_times.astype("int")
 
         try:
-            date = file_info.times[0].date()
+            date = file_info.times[0].date()  # that should have worked..
         except AttributeError:
             # We have to load the info by ourselves:
-            date = self.get_info(file_info).times[0].date()
+            date = self.get_info(file_info).times[0].date()
 
         # Put all times together so we obtain one full timestamp
         # (date + time) for each data point. We are using the
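The new get_info() derives the granule start from the year/day-of-year encoded in the file name plus the UTC_start offset, and the end from the last Profile_time value. A stand-alone version of that arithmetic; the file name and the two values are made up:

    import re
    from datetime import datetime, timedelta

    fname = "2009071061223_15373_CS_2C-ICE_GRANULE_P1_R05_E02.hdf"   # invented
    m = re.search(r"(?P<year>\d{4})(?P<doy>\d{3})", fname)
    year, doy = int(m["year"]), int(m["doy"])

    utc_start = 22343.5   # seconds since midnight (UTC_start)
    duration = 5932.1     # last Profile_time value in seconds

    start = datetime(year, 1, 1) + timedelta(doy - 1, utc_start)
    end = start + timedelta(seconds=duration)
    # -> start 2009-03-12 06:12:23.5, end roughly 1 h 39 min later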
diff --git a/typhon/files/handlers/common.py b/typhon/files/handlers/common.py
index 7944d049..431ee736 100644
--- a/typhon/files/handlers/common.py
+++ b/typhon/files/handlers/common.py
@@ -673,6 +673,7 @@ def read(self, file_info, fields=None, mapping=None, **kwargs):
         # parameter `group`. To avoid this, we load all groups and their
         # variables by using the netCDF4 directly and load them later into a
         # xarray dataset.
+        # May 2022: xr.open_dataset still not support loading all groups at once
         with netCDF4.Dataset(file_info.path, "r") as root:
             # xarray decode_cf scales, don't do it twice!
@@ -684,6 +685,16 @@ def read(self, file_info, fields=None, mapping=None, **kwargs):
 
         return _xarray_rename_fields(dataset, mapping)
 
+
+    def read_from_list(self, file_info, fields=None, mapping=None, common_dimension=None, **kwargs):  #added
+        if common_dimension is None:
+            raise ValueError("If file_info is given as a list, you have to provide common_dimension as additional parameter.")
+
+        dataset=[]
+        for entry in file_info:
+            dataset.append(self.read(entry, fields, mapping, **kwargs))
+        return xr.concat(dataset,dim=common_dimension)
+
     @staticmethod
     def _get_dimension_name(ds, group, path, dim):
         # If the dimension is defined in the subgroup, use NOT the one of the
diff --git a/typhon/files/handlers/tovs.py b/typhon/files/handlers/tovs.py
index d84bb9de..9dfb07c5 100644
--- a/typhon/files/handlers/tovs.py
+++ b/typhon/files/handlers/tovs.py
@@ -276,7 +276,10 @@ def read(self, file_info, mask_and_scale=True, interpolate_packed_pixels=True,
 
         # All geolocation fields are packed in the AVHRR GAC files:
         if interpolate_packed_pixels:
-            self._interpolate_packed_pixels(dataset, max_nans_interpolation)
+            try:
+                self._interpolate_packed_pixels(dataset, max_nans_interpolation, file_info)
+            except:
+                return None  # try-except added on 24.04.23
             allowed_coords = {'channel', 'calib', 'scnline', 'scnpos'}
         else:
             allowed_coords = {'channel', 'calib', 'scnline', 'scnpos',
@@ -294,7 +297,7 @@ def read(self, file_info, mask_and_scale=True, interpolate_packed_pixels=True,
         return dataset
 
     @staticmethod
-    def _interpolate_packed_pixels(dataset, max_nans_interpolation):
+    def _interpolate_packed_pixels(dataset, max_nans_interpolation, file_info):
         given_pos = np.arange(5, 409, 8)
         new_pos = np.arange(1, 410)
 
@@ -310,8 +313,9 @@ def _interpolate_packed_pixels(dataset, max_nans_interpolation):
 
         if valid_pos.sum() < 52 - max_nans_interpolation:
             raise ValueError(
-                "Too many NaNs in latitude and longitude of this AVHRR file. "
-                "Cannot guarantee a good interpolation!"
+                f"Too many NaNs in latitude and longitude of this AVHRR file. "
+                f"Cannot guarantee a good interpolation! "
+                f"File affected: {file_info}"
             )
 
         # Filter NaNs because CubicSpline cannot handle it:
diff --git a/typhon/retrieval/common.py b/typhon/retrieval/common.py
index 839c02c3..6044cbeb 100644
--- a/typhon/retrieval/common.py
+++ b/typhon/retrieval/common.py
@@ -157,6 +157,7 @@ def _model_to_dict(model):
             attr: copy.deepcopy(getattr(model, attr))
             for attr in model.__dir__()
             if not attr.startswith("__") and attr.endswith("_")
+            if not attr.startswith("_repr_")
         }
     }
 
diff --git a/typhon/retrieval/spareice/common.py b/typhon/retrieval/spareice/common.py
index 64fe58af..40e56431 100644
--- a/typhon/retrieval/spareice/common.py
+++ b/typhon/retrieval/spareice/common.py
@@ -465,7 +465,8 @@ def get_grid_value(grid, lat, lon):
 
         # We do not need the depth of the oceans (this would just
         # confuse the ANN):
-        return_data["elevation"][return_data.elevation < 0] = 0
+
+        return_data.loc[return_data.elevation <0, "elevation"]=0
 
         return return_data
 
@@ -531,15 +532,18 @@ def _retrieve_from_collocations(collocations, _, spareice):
             "description": "True if pixel contains an ice cloud (retrieved"
                            " by SPARE-ICE)."
         }
+        collocations=collocations.to_xarray()
         retrieved["lat"] = collocations["lat"]
         retrieved["lon"] = collocations["lon"]
         retrieved["time"] = collocations["time"]
         retrieved["scnpos"] = collocations["mhs_scnpos"]
-
+        # let index start with 1 (in accordance with scnline)
+        retrieved["index"]= retrieved["index"] + 1
         return retrieved
 
     def retrieve_from_collocations(
-            self, inputs, output, start=None, end=None, processes=None
+            self, inputs, output, start=None, end=None, processes=None,
+            no_files_error=True,
     ):
         """Retrieve SPARE-ICE from collocations between MHS and AVHRR
 
@@ -576,14 +580,15 @@ def retrieve_from_collocations(
         if "elevation" in self.inputs and self.elevation_grid is None:
             raise ValueError("You have to pass a elevation file via init!")
 
-        timer = Timer.start()
+        timer = Timer().start()
 
         if isinstance(inputs, Collocations):
             # Simply apply a map function to all files from these collocations
             inputs.map(
                 SPAREICE._retrieve_from_collocations, kwargs={
                     "spareice": self,
                 }, on_content=True, pass_info=True, start=start, end=end,
-                max_workers=processes, output=output, worker_type="process"
+                max_workers=processes, output=output, worker_type="process",
+                no_files_error=no_files_error  # added 04.05.2023
             )
         elif len(inputs) == 2:
             # Collocate MHS and AVHRR on-the-fly:
@@ -611,7 +616,7 @@ def retrieve_from_collocations(
                 "You need to pass a Collocations object or a list with a MHS "
                 "and AVHRR fileset!"
             )
-        logger.info(f"Took {timer} hours to retrieve SPARE-ICE")
+        logger.info(f"Took {timer} to retrieve SPARE-ICE")
 
     def score(self, data):
         """Calculate the score of SPARE-ICE on testing data
@@ -720,15 +725,13 @@ def report(self, output_dir, experiment, data):
                 subdirectory named `experiment` will be created there. All
                 plots are stored to it.
             experiment: A name for the experiment as a string. Will be included
-                in the title of the plots and used as name for the subdirectory
-                in `output_dir`.
+                in the title of the plots in `output_dir`.
             data: A pandas.DataFrame object with the required input fields.
 
         Returns:
             None
         """
         # Create the output directory:
-        output_dir = join(output_dir, experiment)
         os.makedirs(output_dir, exist_ok=True)
 
         # Run SPARE-ICE!
@@ -750,15 +753,20 @@ def _report_iwp(self, output_dir, experiment, test, retrieved):
             cmap="density", vmin=5,
         )
         scat.cmap.set_under("w")
+
+        # for framing
+        ax.spines['top'].set_visible(True)
+        ax.spines['right'].set_visible(True)
+
         ax.set_xlabel("log10 IWP (2C-ICE) [g/m^2]")
         ax.set_ylabel("log10 IWP (SPARE-ICE) [g/m^2]")
         ax.set_title(experiment)
-        fig.colorbar(scat, label="Number of points")
-        fig.savefig(join(output_dir, "2C-ICE-SPAREICE_heatmap.png"))
+        fig.colorbar(scat, label="\nNumber of points")
+        fig.savefig(join(output_dir, "2C-ICE-SPAREICE_heatmap.pdf"))
 
         self._plot_scatter(
             experiment,
-            join(output_dir, "2C-ICE-SPAREICE_scatter_{area}.png"),
+            join(output_dir, "2C-ICE-SPAREICE_scatter_{area}.pdf"),
             test.iwp, retrieved.iwp, test.sea_mask.values
         )
 
@@ -773,7 +781,7 @@ def _report_iwp(self, output_dir, experiment, test, retrieved):
             - 1
         )
         self._plot_error(
-            experiment, join(output_dir, "2C-ICE-SPAREICE_mfe.png"),
+            experiment, join(output_dir, "2C-ICE-SPAREICE_mfe.pdf"),
             test, fe, test.sea_mask.values,
         )
 
@@ -781,10 +789,10 @@ def _report_iwp(self, output_dir, experiment, test, retrieved):
         # Plot the bias:
         bias = retrieved.iwp.values - test.iwp.values
         self._plot_error(
-            experiment, join(output_dir, "2C-ICE-SPAREICE_bias.png"),
+            experiment, join(output_dir, "2C-ICE-SPAREICE_bias.pdf"),
             test, bias,
             test.sea_mask.values,
-            mfe=False, yrange=[-0.35, 0.45],
+            mfe=False,
         )
 
         # self._plot_weights(
@@ -814,11 +822,16 @@ def _plot_scatter(experiment, file, xdata, ydata, sea_mask):
                 xdata[mask], ydata[mask],
                 s=1, alpha=0.6
             )
+
+            # for framing
+            ax.spines['top'].set_visible(True)
+            ax.spines['right'].set_visible(True)
+
             ax.grid()
             ax.set_xlabel("log10 IWP (2C-ICE) [g/m^2]")
             ax.set_ylabel("log10 IWP (SPARE-ICE) [g/m^2]")
             ax.set_title(f"{experiment} - {area}")
-            fig.savefig(file.format(area=area))
+            fig.savefig(file.format(area=area), bbox_inches='tight')
 
     @staticmethod
     def _plot_error(
@@ -831,10 +844,11 @@ def _plot_error(
 
         if mfe:
             ax.set_ylabel("Median fractional error [%]")
-            ax.set_ylim([0, 200])
             statistic = "median"
         else:
-            ax.set_ylabel("$\Delta$ IWP (SPARE-ICE - 2C-ICE) [log 10 g/m^2]")
+            ax.set_ylabel("$\\Delta$ IWP (SPARE-ICE - 2C-ICE) [log 10 g/m^2]")
+            plt.hlines(0, xrange[0], xrange[1], color='red', linewidth=2)
+            ax.set_xlim(xrange)
             statistic = "mean"
 
         for hemisphere in ["global"]:
@@ -862,10 +876,14 @@ def _plot_error(
         ax.grid()
         ax.legend(fancybox=True)
         ax.set_title(f"Experiment: {experiment}")
+        if mfe:
+            ax.set_ylim(ymin=0)
         if yrange is not None:
             ax.set_ylim(yrange)
-        fig.tight_layout()
-        fig.savefig(file)
+
+        ax.set_xlim(xrange)
+
+        fig.savefig(file, bbox_inches='tight', dpi=120)
 
     def _plot_weights(self, title, file, layer_index=0, vmin=-5, vmax=5):
         import seaborn as sns
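The fe array handed to _plot_error above is a fractional error in percent; its median is what the "Median fractional error" axis shows. The conventional definition in terms of log10 IWP is sketched below with invented values — the exact expression used in the module lies outside this excerpt:

    import numpy as np

    retrieved_log_iwp = np.array([1.1, 2.0, 0.4])   # log10 IWP from SPARE-ICE
    reference_log_iwp = np.array([1.0, 2.2, 0.4])   # log10 IWP from 2C-ICE

    # Symmetric in over- and underestimation:
    fe = 100 * (np.exp(np.abs(np.log(10**retrieved_log_iwp / 10**reference_log_iwp))) - 1)
    mfe = np.median(fe)   # median fractional error in percent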
@@ -889,19 +907,65 @@ def _plot_weights(self, title, file, layer_index=0, vmin=-5, vmax=5):
         f.tight_layout()
         f.savefig(file)
 
-    def _report_ice_cloud(self, output_dir, experiment, test, retrieved):
+    def _report_ice_cloud_old(self, output_dir, experiment, test, retrieved):
         # Confusion matrix:
         fig, ax = plt.subplots(figsize=(12, 10))
         cm = confusion_matrix(test.ice_cloud, retrieved.ice_cloud)
         img = self._plot_matrix(cm, classes=["Yes", "No"], normalize=True)
-        fig.colorbar(img, label="probability")
+        fig.colorbar(img, label="\nprobability")
         ax.set_title("Ice Cloud Classifier - Performance")
         ax.set_ylabel('real ice cloud')
         ax.set_xlabel('predicted ice cloud')
-        fig.tight_layout()
-        fig.savefig(join(output_dir, "ice-cloud-confusion-matrix.png"))
+        fig.savefig(join(output_dir, "ice-cloud-confusion-matrix.jpg"))
+
+        fig, ax = plt.subplots(figsize=(12, 10))
+        ax.barh(
+            np.arange(len(self.ice_cloud.inputs)),
+            self.ice_cloud.estimator.feature_importances_
+        )
+        ax.set_yticks(np.arange(len(self.ice_cloud.inputs)))
+        ax.set_yticklabels(self.ice_cloud.inputs)
+        ax.set_xlabel("Feature Importance")
+        ax.set_ylabel("Feature")
+        ax.set_title("Ice Cloud Classifier - Importance")
+        fig.savefig(join(output_dir, "ice-cloud-feature-importance.jpg"))
+
+
+    def _report_ice_cloud(self, output_dir, experiment, test, retrieved,normalize=True, **kwargs):
+        from seaborn import heatmap
+
+        # ice-cloud confusion matrix
+        cm = confusion_matrix(test.ice_cloud, retrieved.ice_cloud, labels=[1,0])
+
+        if normalize:
+            cmsum=cm.sum(axis=1)[:, np.newaxis]
+            cmsum=np.where(cmsum==0,1,cmsum)  # prevent warning in true_divide
+            cm = cm.astype('float') / cmsum
+
+            fmt_prob=".2f"#"%"
+            kwargs["vmin"]=0
+            kwargs["vmax"]=1
+        else:
+            fmt_prob="d"
+
+        default_kwargs = {
+            "cmap": "Blues",
+            **kwargs }
+
+        fig,ax= plt.subplots(figsize=(12, 10))
+        ax=heatmap(cm, annot=True, cbar=True, cbar_kws={"label":"Probability"},
+                   square=True,fmt=fmt_prob,
+                   **default_kwargs)
+        ax.set_title("Ice Cloud Classifier - Performance")
+        ax.set_ylabel('Real ice cloud')
+        ax.set_xlabel('Predicted ice cloud')
+        ax.xaxis.set_ticklabels(['Yes','No'])
+        ax.yaxis.set_ticklabels(['Yes','No'])
+        fig.savefig(join(output_dir, "ice-cloud-confusion-matrix.pdf"))
 
+        # ice-cloud feature importannce'
         fig, ax = plt.subplots(figsize=(12, 10))
+        print(self.ice_cloud.estimator.feature_importances_)
         ax.barh(
             np.arange(len(self.ice_cloud.inputs)),
             self.ice_cloud.estimator.feature_importances_
@@ -911,7 +975,8 @@ def _report_ice_cloud(self, output_dir, experiment, test, retrieved):
         ax.set_xlabel("Feature Importance")
         ax.set_ylabel("Feature")
         ax.set_title("Ice Cloud Classifier - Importance")
-        fig.savefig(join(output_dir, "ice-cloud-feature-importance.png"))
+        fig.savefig(join(output_dir, "ice-cloud-feature-importance.pdf"))
+
 
     @staticmethod
     def _plot_matrix(

From a56e9ec7c6b90b811b9542b9bf418a2f4cf16b15 Mon Sep 17 00:00:00 2001
From: Oliver Lemke
Date: Wed, 20 Nov 2024 13:33:34 +0100
Subject: [PATCH 3/4] Fix xarray's nanosecond warning

---
 typhon/tests/collocations/test_collocations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/typhon/tests/collocations/test_collocations.py b/typhon/tests/collocations/test_collocations.py
index 151e67c4..425b21ff 100644
--- a/typhon/tests/collocations/test_collocations.py
+++ b/typhon/tests/collocations/test_collocations.py
@@ -87,7 +87,7 @@ def test_collocate_collapse_expand(self):
         collocator = Collocator()
 
         test = xr.Dataset({
-            "time": ("time", np.arange("2000", "2010", dtype="M8[Y]")),
+            "time": ("time", np.arange("2000", "2010", dtype="M8[Y]").astype('datetime64[ns]')),
             "lat": ("time", np.arange(10)),
             "lon": ("time", np.arange(10)),
         })
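The cast above addresses xarray's warning about non-nanosecond datetimes: converting the coarse year-resolution values to datetime64[ns] before building the Dataset keeps the time axis warning-free. Minimal stand-alone reproduction of the same conversion:

    import numpy as np
    import xarray as xr

    years = np.arange("2000", "2010", dtype="M8[Y]").astype("datetime64[ns]")
    ds = xr.Dataset({
        "time": ("time", years),
        "lat": ("time", np.arange(10)),
        "lon": ("time", np.arange(10)),
    })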
From b3fa04ff88fe46d37663366a492b16d5ffc09e43 Mon Sep 17 00:00:00 2001
From: Oliver Lemke
Date: Wed, 20 Nov 2024 13:34:17 +0100
Subject: [PATCH 4/4] Adjust timeframe in test cases

Collocation end time is now inclusive instead of exclusive.
---
 typhon/tests/files/test_fileset.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/typhon/tests/files/test_fileset.py b/typhon/tests/files/test_fileset.py
index c754eb53..d845b2f0 100644
--- a/typhon/tests/files/test_fileset.py
+++ b/typhon/tests/files/test_fileset.py
@@ -292,7 +292,7 @@ def test_tutorial(self, file_system):
         # Should not find anything:
         empty = list(
             filesets["tutorial"].find(
-                "2017-12-31", "2018-01-01", no_files_error=False
+                "2017-12-31", "2017-12-31 23:59", no_files_error=False
             ))
 
         assert not empty
@@ -525,7 +525,7 @@ def test_single(self, file_system):
         # Should not find anything:
         empty = list(
             filesets["single"].find(
-                "2016-12-31", "2018-01-01", no_files_error=False
+                "2016-12-31", "2017-12-31 23:59", no_files_error=False
             ))
 
         assert not empty
@@ -571,14 +571,14 @@ def test_sequence(self, file_system):
         # Should not find anything:
         empty = list(
             filesets["sequence-placeholder"].find(
-                "2016-12-31", "2018-01-01", no_files_error=False
+                "2016-12-31", "2017-12-31 23:59", no_files_error=False
             ))
 
         assert not empty
 
         # Should find two files:
         found_files = list(
             filesets["sequence-placeholder"].find(
-                "2018-01-01", "2018-01-02",
+                "2018-01-01", "2018-01-01 23:59",
             ))
 
         check = [
@@ -597,7 +597,7 @@ def test_sequence(self, file_system):
         # Should find two files and should return them in two bins:
         found_files = list(
             filesets["sequence-placeholder"].find(
-                "2018-01-01", "2018-01-02", bundle="6h",
+                "2018-01-01", "2018-01-01 23:59", bundle="6h",
             ))
 
         check = [
@@ -631,7 +631,7 @@ def test_logs(self, file_system, caplog):
             ))
         assert ("Find files for sequence-placeholder "
                 "between 2018-01-01 00:00:00 and "
-                "2018-01-01 23:59:59") in caplog.text
+                "2018-01-02 00:00:00") in caplog.text
 
         assert f"via file system" in caplog.text
 
@@ -650,14 +650,14 @@ def test_sequence_placeholder(self, file_system):
         # Should not find anything:
         empty = list(
             filesets["sequence-placeholder"].find(
-                "2016-12-31", "2018-01-01", no_files_error=False
+                "2016-12-31", "2017-12-31 23:59", no_files_error=False
             ))
 
         assert not empty
 
         # Should find two files:
         found_files = list(
             filesets["sequence-placeholder"].find(
-                "2018-01-01", "2018-01-02",
+                "2018-01-01", "2018-01-01 23:59",
             ))
 
         check = [
@@ -676,7 +676,7 @@ def test_sequence_placeholder(self, file_system):
         # Should find two files and should return them in two bins:
         found_files = list(
             filesets["sequence-placeholder"].find(
-                "2018-01-01", "2018-01-02", bundle="6h",
+                "2018-01-01", "2018-01-01 23:59", bundle="6h",
             ))
 
         check = [