esm-tools · siligam · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025
diff --git a/doc/index.rst b/doc/index.rst
@@ -35,6 +35,7 @@ Contents
    including_subcommand_plugins
    pycmor_fesom
    timeaveraging_frequencies
+   time_bounds
    accessors
    infer_freq
    cookbook

diff --git a/doc/time_bounds.rst b/doc/time_bounds.rst
@@ -0,0 +1,103 @@
+.. _time_bounds:
+
+Time Bounds
+===========
+
+The time bounds feature in PyMorize provides functionality to handle and generate time bounds for time series data. This is particularly useful for climate and weather data where variables are often associated with time intervals rather than single time points.
+
+Overview
+--------
+
+The ``time_bounds`` function creates time bounds for a given xarray Dataset with a time dimension. It automatically detects the time coordinate, checks for existing bounds, and creates appropriate bounds if they don't exist.
+
+Features
+--------
+
+- Automatic detection of time coordinate in the dataset
+- Support for various time frequencies (daily, monthly, etc.)
+- Handling of both standard and offset time points (e.g., mid-month dates)
+- Preservation of existing time bounds
+- Comprehensive logging of operations
+
+Usage
+-----
+
+Basic usage:
+
+.. code-block:: python
+
+    from pymor.std_lib.time_bounds import time_bounds
+    import xarray as xr
+    import numpy as np
+    import pandas as pd
+
+    # Create a sample dataset with time dimension
+    times = pd.date_range("2000-01-01", periods=12, freq="MS")  # Monthly start
+    data = np.random.rand(12)
+    ds = xr.Dataset({"temperature": (["time"], data)}, coords={"time": times})
+
+    # Apply time bounds
+    result = time_bounds(ds, rule)
+
+Supported Time Frequencies
+-------------------------
+
+The function handles various time frequencies, including:
+
+- Daily ("D")
+- Monthly ("MS" for month start, or custom offsets)
+- Custom frequencies using pandas offset aliases
+
+For example, with offset monthly data (e.g., 15th of each month):
+
+.. code-block:: python
+
+    # Create dataset with offset monthly data (15th of each month)
+    base_dates = pd.date_range("2000-01-15", periods=12, freq="MS")
+    times = base_dates + pd.DateOffset(days=14)
+    data = np.random.rand(12)
+    ds = xr.Dataset({"temperature": (["time"], data)}, coords={"time": times})
+
+    # Apply time bounds
+    result = time_bounds(ds, rule)
+
+Logging
+-------
+
+The function provides detailed logging of its operations, including:
+
+- Dataset validation
+- Time coordinate detection
+- Bounds creation/verification
+- Warnings and errors
+
+Example output:
+
+.. code-block:: text
+
+    [Time bounds] dataset_name
+      → is dataset:   ✅
+      → time label  : time
+      → bounds label: bnds
+      → has time bounds: ❌
+      → time values  : 12 points from 2000-01-01 to 2000-12-01
+      → time step    : 30 days 00:00:00
+      → set time bounds: time_bnds(12, 2)
+      → bounds range   : 2000-01-01 to 2001-01-01
+
+Error Handling
+--------------
+
+The function raises appropriate exceptions for invalid inputs:
+
+- ``ValueError`` if the input is not an xarray Dataset
+- ``ValueError`` if no valid time coordinate is found
+- ``ValueError`` if there are fewer than 2 time points
+
+API Reference
+-------------
+
+.. automodule:: pymor.std_lib.time_bounds
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/src/pymor/std_lib/time_bounds.md b/src/pymor/std_lib/time_bounds.md
@@ -0,0 +1,23 @@
+# time bounds
+
+To set time bounds for a variable, it needs to meet the following conditions:
+
+- time method can be mean or instantanous or climatology. time bounds is a valid option when time methods is either mean or instantanous.
+- in case of instantaneous time method, time bounds are same as the time values. That is delta time is 0 for the bounds for a given time point.
+- in case of mean time method, delta time of the bounds is greater than 0. the time point can be at the start or middle or at the end of the time interval (bounds).
+- setting time bounds before doing time average has the advantage of setting the bounds more accurately. `approx_interval` aids in determining time bounds.
+- setting time bounds after doing time average is possible but the time bounds many not be accurate if time points are set to middle or end.
+
+say time bounds function has the following signature:
+
+```python
+def time_bounds(ds: xr.Dataset, rule: Rule) -> xr.Dataset:
+    pass
+```
+
+`approx_interval` which is read from CMIP Table is accessible from rule like `rule.approx_interval`.
+`approx_interval` is floating number in days.
+
+if approx_interval is same as data frequency, it can either mean that the data is already time averaged or the data had similar frequency as the approx_interval.
+
+Let's say data is at monthly frequency and approx_interval is 30 days. As both data frequency andn approx_interval are the same, if the data points are at start of the month, time bounds can be set as the (this_month_start, next_month_start) If the points are at middle or at end of the month, time bounds are still (this_month_start, next_month_start)
diff --git a/src/pymor/std_lib/time_bounds.py b/src/pymor/std_lib/time_bounds.py
@@ -0,0 +1,186 @@
+import numpy as np
+import xarray as xr
+
+from ..core.logging import logger
+from ..core.rule import Rule
+from .dataset_helpers import get_time_label
+
+
+def time_bounds(ds: xr.Dataset, rule: Rule) -> xr.Dataset:
+    """
+    Sets time bounds for a variable based on time method and approx_interval.
+
+    Time bounds are set according to the following logic:
+    - For instantaneous time method: bounds are same as time values (delta time = 0)
+    - For mean time method: bounds span the averaging interval (delta time > 0)
+    - Uses approx_interval from rule to determine appropriate bounds
+
+    Parameters
+    ----------
+    ds : xr.Dataset
+        The input dataset.
+    rule : Rule
+        Rule object containing time bounds file attribute and approx_interval
+
+    Returns
+    -------
+    xr.Dataset
+        The output dataset with the time bounds information.
+    """
+    # Get dataset name for logging
+    dataset_name = ds.attrs.get("name", "unnamed_dataset")
+
+    # Log header with markdown style
+    logger.info(f"[Time bounds] {dataset_name}")
+    logger.info(f"  → is dataset:   {'✅' if isinstance(ds, xr.Dataset) else '❌'}")
+
+    # Check if input is a dataset
+    if not isinstance(ds, xr.Dataset):
+        logger.error("  ❌ The input is not a dataset.")
+        raise ValueError("The input is not a dataset.")
+
+    # Get time label and check if it exists
+    time_label = get_time_label(ds)
+    if time_label is None:
+        logger.error("  ❌ The dataset does not contain a valid time coordinate.")
+        raise ValueError("The dataset does not contain a valid time coordinate.")
+
+    bounds_dim_label = "bnds"
+    time_bounds_label = f"{time_label}_{bounds_dim_label}"
+
+    # Log time and bounds info
+    logger.info(f"  → time label  : {time_label}")
+    logger.info(f"  → bounds label: {bounds_dim_label}")
+
+    # Get approx_interval from rule (in days)
+    approx_interval = getattr(rule, "approx_interval", None)
+    logger.info(f"  → approx_interval: {approx_interval} days")
+
+    # Get time method from rule or dataset attributes
+    time_method = getattr(rule, "time_method", None) or ds.attrs.get("time_method", "mean")
+    logger.info(f"  → time method : {time_method}")
+
+    # Check if time bounds already exist
+    has_time_bounds = time_bounds_label in ds.variables
+    logger.info(f"  → has time bounds: {'✅' if has_time_bounds else '❌'}")
+
+    if has_time_bounds:
+        logger.info(f"  → using existing bounds: {time_bounds_label}")
+        return ds
+
+    # Validate time method
+    if time_method not in ["mean", "instantaneous", "climatology"]:
+        logger.warning(f"  ⚠️  Unknown time method '{time_method}', defaulting to 'mean'")
+        time_method = "mean"
+
+    # Only create bounds for mean and instantaneous methods
+    if time_method == "climatology":
+        logger.info("  → skipping bounds creation for climatology data")
+        return ds
+
+    # Get time coordinate
+    time_var = ds[time_label]
+    time_values = time_var.values
+
+    # Check if we have enough time points
+    if len(time_values) < 1:
+        error_msg = "Cannot create time bounds: no time points found"
+        logger.error(f"  ❌ {error_msg}")
+        raise ValueError(error_msg)
+
+    # Log time values info
+    logger.info(f"  → time values  : {len(time_values)} points from {time_values[0]} to {time_values[-1]}")
+
+    # Handle instantaneous time method
+    if time_method == "instantaneous":
+        logger.info("  → creating instantaneous bounds (delta time = 0)")
+        # For instantaneous data, bounds are the same as time values
+        bounds_data = np.column_stack([time_values, time_values])
-        # For instantaneous data, bounds are the same as time values
-        bounds_data = np.column_stack([time_values, time_values])
+        # For instantaneous data, bounds are the same as time values
+        #
+        # .. note:: np.column_stack reminder
+        #
+        #    >>> a = np.array([1, 2, 3])
+        #    >>> b = np.array([4, 5, 6])
+        #    >>> np.column_stack((a, b))
+        #    [[1, 4], [2, 5], [6, 3]]
+        #
+        bounds_data = np.column_stack([time_values, time_values])
-        # For instantaneous data, bounds are the same as time values
-        bounds_data = np.column_stack([time_values, time_values])
+        # For instantaneous data, bounds are the same as time values
+        #
+        # .. note:: np.column_stack reminder
+        #
+        #    >>> a = np.array([1, 2, 3])
+        #    >>> b = np.array([4, 5, 6])
+        #    >>> np.column_stack((a, b))
+        #    [[1, 4], [2, 5], [6, 3]]
+        #
+        bounds_data = np.column_stack([time_values, time_values])
+
+    # Handle mean time method
+    else:  # time_method == 'mean'
+        logger.info("  → creating mean bounds (delta time > 0)")
+
+        if len(time_values) < 2:
+            error_msg = "Cannot create mean time bounds: need at least 2 time points"
+            logger.error(f"  ❌ {error_msg}")
+            raise ValueError(error_msg)
+
+        # Calculate data frequency in days
+        time_diff_seconds = np.median(np.diff(time_values.astype("datetime64[s]").astype(float)))
+        data_freq_days = time_diff_seconds / (24 * 3600)  # Convert to days
+        logger.info(f"  → data frequency: {data_freq_days:.2f} days")
+
+        # Determine bounds based on approx_interval and data frequency
+        if approx_interval is not None and abs(data_freq_days - approx_interval) < 0.1:
+            logger.info("  → data frequency matches approx_interval, using interval-based bounds")
+
+            # For monthly data (approx_interval ~30 days)
+            if 28 <= approx_interval <= 32:
+                logger.info("  → detected monthly data, using month-start bounds")
+                bounds_data = _create_monthly_bounds(time_values)
+            else:
+                # For other regular intervals, use consecutive time points
+                logger.info("  → using consecutive time point bounds")
+                time_values_extended = np.append(
+                    time_values,
+                    time_values[-1] + np.timedelta64(int(data_freq_days), "D"),
+                )
+                bounds_data = np.column_stack([time_values_extended[:-1], time_values_extended[1:]])
+        else:
+            # Default behavior: use consecutive time points
+            logger.info("  → using default consecutive time point bounds")
+            time_diff = np.median(np.diff(time_values))
+            time_values_extended = np.append(time_values, time_values[-1] + time_diff)
+            bounds_data = np.column_stack([time_values_extended[:-1], time_values_extended[1:]])
+
+    # Create the bounds DataArray
+    bounds = xr.DataArray(
+        data=bounds_data,
+        dims=(time_label, bounds_dim_label),
+        coords={time_label: time_values, bounds_dim_label: [0, 1]},
+        attrs={
+            "long_name": f"time bounds for {time_label}",
+            "comment": f"Generated by pymorize: {time_method} time bounds",
+        },
+    )
+
+    # Add bounds to dataset as a coordinate variable
+    ds = ds.assign_coords({time_bounds_label: bounds})
+
+    # Add bounds attribute to time variable
+    if "bounds" not in time_var.attrs:
+        ds[time_label].attrs["bounds"] = time_bounds_label
+
+    # Log success message with bounds info
+    logger.info(f"  → set time bounds: {time_bounds_label}{bounds.shape}")
+    logger.info(f"  → bounds range   : {bounds.values[0][0]} to {bounds.values[-1][-1]}")
+    logger.info("-" * 50)
+    return ds
+
+
+def _create_monthly_bounds(time_values):
+    """
+    Create monthly bounds where bounds are (month_start, next_month_start)
+    regardless of where the time points are within the month.
+    """
+    import pandas as pd
+
+    bounds_data = []
+
+    for time_val in time_values:
+        # Convert to pandas timestamp for easier month manipulation
+        ts = pd.Timestamp(time_val)
+
+        # Get start of current month
+        month_start = ts.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
+
+        # Get start of next month
+        if ts.month == 12:
+            next_month_start = month_start.replace(year=ts.year + 1, month=1)
+        else:
+            next_month_start = month_start.replace(month=ts.month + 1)
+
+        bounds_data.append([month_start.to_numpy(), next_month_start.to_numpy()])
+
+    return np.array(bounds_data)