diff --git a/documentation/changelog.rst b/documentation/changelog.rst index 88a333571c..24102ff319 100644 --- a/documentation/changelog.rst +++ b/documentation/changelog.rst @@ -12,7 +12,7 @@ v0.31.0 | February XX, 2026 New features ------------- * Improve CSV upload validation by inferring the intended base resolution even when data contains valid gaps, instead of requiring perfectly regular timestamps [see `PR #1918 `_] -* New forecasting API endpoints `[POST] /sensors/(id)/forecasts/trigger `_ and `[GET] /sensors/(id)/forecasts/(uuid) `_ to forecast sensor data [see `PR #1813 `_ and `PR #1823 `_] +* New forecasting API endpoints, and all timing parameters in the forecasting CLI got sensible defaults for ease of use `[POST] /sensors/(id)/forecasts/trigger `_ and `[GET] /sensors/(id)/forecasts/(uuid) `_ to forecast sensor data [see `PR #1813 `_, `PR #1823 `_ and `PR #1917 `_] * Support setting a resolution when triggering a schedule via the API or CLI [see `PR #1857 `_] * Support variable peak pricing and changes in commitment baselines [see `PR #1835 `_] * Support storing the aggregate power schedule [see `PR #1736 `_] diff --git a/documentation/cli/change_log.rst b/documentation/cli/change_log.rst index 7a1c1abc89..2818df2317 100644 --- a/documentation/cli/change_log.rst +++ b/documentation/cli/change_log.rst @@ -10,6 +10,7 @@ since v0.31.0 | February XX, 2026 * Fix ``delete-beliefs`` CLI command ignoring the ``--source`` filter during deletion, preventing unintended removal of beliefs from other sources. * Let ``flexmeasures add schedule`` create schedules with only information known prior to some time using the ``prior`` option. * New ``-dry-run`` flag for ``flexmeasures add schedule`` to avoid saving anything (printing out the results instead). +* Streamline option names for ``flexmeasures add forecasts`` to align with API usage (preserving backwards compatibility). * Return validation errors instead of database errors for fields that map to database objects. 
* Mutate job state when running ``flexmeasures jobs run-job ``, including updating metadata and moving between registries * Add ``flexmeasures jobs stats``, which shows queueing statistics to help evaluate the health of the queueing system. diff --git a/documentation/features/forecasting.rst b/documentation/features/forecasting.rst index 140e45309a..aced89f78b 100644 --- a/documentation/features/forecasting.rst +++ b/documentation/features/forecasting.rst @@ -55,12 +55,12 @@ The main CLI parameters that control this process are: - ``to-date``: The global cutoff point. Training and prediction cycles continue until the ``predict-end`` reaches this date. - ``max-forecast-horizon``: The maximum length of a forecast into the future. - ``forecast-frequency``: Determines the number of prediction cycles within the forecast period (e.g. daily, hourly). -- ``start-date``: Define the start of historical data used for training. +- ``train-period``: Define a window of historical data to use for training. Note that: ``forecast-frequency`` together with ``max-forecast-horizon`` determine how the forecasting cycles advance through time. -``start-date`` / ``from-date`` and ``to-date`` allow precise control over the training and prediction windows in each cycle. +``train-period``, ``from-date`` and ``to-date`` allow precise control over the training and prediction windows in each cycle. Forecasting via the API ----------------------- diff --git a/documentation/tut/forecasting_scheduling.rst b/documentation/tut/forecasting_scheduling.rst index 4d6e043ca9..1cc24d4cc6 100644 --- a/documentation/tut/forecasting_scheduling.rst +++ b/documentation/tut/forecasting_scheduling.rst @@ -104,9 +104,8 @@ There are two ways to queue a forecasting job: .. 
code-block:: json { - "start_date": "2025-01-01T00:00:00+00:00", - "start_predict_date": "2025-01-04T00:00:00+00:00", - "end_date": "2025-01-04T04:00:00+00:00" + "start": "2025-01-04T00:00:00+00:00", + "duration": "PT4H" } Example response: diff --git a/flexmeasures/api/common/responses.py b/flexmeasures/api/common/responses.py index aaeb26f7b1..59e34aa895 100644 --- a/flexmeasures/api/common/responses.py +++ b/flexmeasures/api/common/responses.py @@ -260,6 +260,11 @@ def unknown_schedule(message: str) -> ResponseTuple: return dict(result="Rejected", status="UNKNOWN_SCHEDULE", message=message), 400 +@BaseMessage("No known forecast for this time period.") +def unknown_forecast(message: str) -> ResponseTuple: + return dict(result="Rejected", status="UNKNOWN_FORECAST", message=message), 400 + + def fallback_schedule_redirect(message: str, location: str) -> ResponseTuple: return ( dict(result="Rejected", status="UNKNOWN_SCHEDULE", message=message), diff --git a/flexmeasures/api/common/schemas/utils.py b/flexmeasures/api/common/schemas/utils.py index a217b52005..536e062600 100644 --- a/flexmeasures/api/common/schemas/utils.py +++ b/flexmeasures/api/common/schemas/utils.py @@ -4,6 +4,10 @@ from marshmallow import Schema, fields from flexmeasures.utils.doc_utils import rst_to_openapi +from flexmeasures.data.schemas.forecasting.pipeline import ( + ForecastingTriggerSchema, + TrainPredictPipelineConfigSchema, +) from flexmeasures.data.schemas.sensors import ( SensorReferenceSchema, VariableQuantityField, @@ -11,7 +15,7 @@ ) -def make_openapi_compatible(schema_cls: Type[Schema]) -> Type[Schema]: +def make_openapi_compatible(schema_cls: Type[Schema]) -> Type[Schema]: # noqa: C901 """ Create an OpenAPI-compatible version of a Marshmallow schema. 
@@ -27,6 +31,25 @@ def make_openapi_compatible(schema_cls: Type[Schema]) -> Type[Schema]: new_fields = {} for name, field in schema_cls._declared_fields.items(): + if schema_cls in (ForecastingTriggerSchema, TrainPredictPipelineConfigSchema): + if "cli" in field.metadata and field.metadata["cli"].get( + "cli-exclusive", False + ): + continue + if isinstance(field, fields.Nested): + nested_schema_cls = type(field.schema) + if nested_schema_cls is TrainPredictPipelineConfigSchema: + field_copy = fields.Nested( + make_openapi_compatible(nested_schema_cls), + metadata=field.metadata, + data_key=field.data_key, + many=field.many, + required=field.required, + allow_none=field.allow_none, + ) + new_fields[name] = field_copy + continue + # Copy metadata, but sanitize description for OpenAPI metadata = dict(getattr(field, "metadata", {})) if "description" in metadata: diff --git a/flexmeasures/api/v3_0/__init__.py b/flexmeasures/api/v3_0/__init__.py index 3f10035312..c85e777ba4 100644 --- a/flexmeasures/api/v3_0/__init__.py +++ b/flexmeasures/api/v3_0/__init__.py @@ -15,7 +15,10 @@ from marshmallow import Schema from flexmeasures import __version__ as fm_version -from flexmeasures.api.v3_0.sensors import SensorAPI +from flexmeasures.api.v3_0.sensors import ( + SensorAPI, + forecasting_trigger_schema_openAPI, +) from flexmeasures.api.v3_0.accounts import AccountAPI from flexmeasures.api.v3_0.users import UserAPI from flexmeasures.api.v3_0.assets import AssetAPI, AssetTypesAPI @@ -137,6 +140,7 @@ def create_openapi_specs(app: Flask): # Explicitly register OpenAPI-compatible schemas schemas = [ ("FlexContextOpenAPISchema", flex_context_schema_openAPI), + ("forecasting_trigger_schema_openAPI", forecasting_trigger_schema_openAPI), ("UserAPIQuerySchema", UserAPIQuerySchema), ("AssetAPIQuerySchema", AssetAPIQuerySchema), ("AssetSchema", AssetSchema), diff --git a/flexmeasures/api/v3_0/sensors.py b/flexmeasures/api/v3_0/sensors.py index a59c11195a..15f446f4e7 100644 --- 
a/flexmeasures/api/v3_0/sensors.py +++ b/flexmeasures/api/v3_0/sensors.py @@ -22,10 +22,12 @@ from flexmeasures.api.common.responses import ( request_processed, unrecognized_event, + unknown_forecast, unknown_schedule, unprocessable_entity, fallback_schedule_redirect, ) +from flexmeasures.api.common.schemas.utils import make_openapi_compatible from flexmeasures.api.common.utils.validators import ( optional_duration_accepted, ) @@ -71,7 +73,7 @@ from flexmeasures.data.models.forecasting import Forecaster from flexmeasures.data.services.data_sources import get_data_generator from flexmeasures.data.schemas.forecasting.pipeline import ( - ForecasterParametersSchema, + ForecastingTriggerSchema, ) # Instantiate schemes outside of endpoint logic to minimize response time @@ -79,6 +81,19 @@ sensor_schema = SensorSchema() partial_sensor_schema = SensorSchema(partial=True, exclude=["generic_asset_id"]) +# Create ForecasterParametersSchema OpenAPI compatible schema +EXCLUDED_FORECASTING_FIELDS = [ + # todo: hide these in the config schema instead + # "train_period", + # "max_training_period", + "sensor_to_save", +] +forecasting_trigger_schema_openAPI = make_openapi_compatible(ForecastingTriggerSchema)( + # partial=True, + exclude=EXCLUDED_FORECASTING_FIELDS + + ["sensor"], +) + class SensorKwargsSchema(Schema): account = AccountIdField(data_key="account_id", required=False) @@ -1519,7 +1534,10 @@ def get_status(self, id, sensor): @route("//forecasts/trigger", methods=["POST"]) @use_args( - ForecasterParametersSchema(), + ForecastingTriggerSchema( + # partial=True, + exclude=EXCLUDED_FORECASTING_FIELDS, + ), location="combined_sensor_data_description", as_kwargs=True, ) @@ -1533,9 +1551,8 @@ def trigger_forecast(self, id: int, **params): description: | Trigger a forecasting job for a sensor. - This endpoint starts a forecasting job asynchronously and returns a - job UUID. The job will run in the background and generate forecast values - for the specified period. 
+ This endpoint starts a forecasting job asynchronously and returns a job UUID. + The job will run in the background and generate forecasts for the specified period. Once triggered, the job status and results can be retrieved using the ``GET /api/v3_0/sensors//forecasts/`` endpoint. @@ -1554,25 +1571,10 @@ def trigger_forecast(self, id: int, **params): required: true content: application/json: - schema: - type: object - properties: - start_date: - type: string - format: date-time - description: Start date of the historical data used for training. - start_predict_date: - type: string - format: date-time - description: Start date of the forecast period. - end_date: - type: string - format: date-time - description: End date of the forecast period. + schema: forecasting_trigger_schema_openAPI example: - start_date: "2026-01-01T00:00:00+01:00" - start_predict_date: "2026-01-15T00:00:00+01:00" - end_date: "2026-01-17T00:00:00+01:00" + start: "2026-01-15T00:00:00+01:00" + duration: "P2D" responses: 200: description: PROCESSED @@ -1611,9 +1613,6 @@ def trigger_forecast(self, id: int, **params): # Put the sensor to save in the parameters parameters["sensor"] = params["sensor_to_save"].id - # Ensure the forecast is run as a job on a forecasting queue - parameters["as_job"] = True - # Set forecaster model model = parameters.pop("model", "TrainPredictPipeline") @@ -1622,7 +1621,7 @@ def trigger_forecast(self, id: int, **params): forecaster = get_data_generator( source=None, model=model, - config={}, + config=parameters.pop("config", {}), save_config=True, data_generator_type=Forecaster, ) @@ -1631,7 +1630,7 @@ def trigger_forecast(self, id: int, **params): # Queue forecasting job try: - job_id = forecaster.compute(parameters=parameters) + job_id = forecaster.compute(parameters=parameters, as_job=True) except Exception as e: current_app.logger.exception("Forecast job failed to enqueue.") return unprocessable_entity(str(e)) @@ -1731,6 +1730,8 @@ def get_forecast(self, id: int, 
uuid: str, sensor: Sensor, job_id: str): summary: Started forecasting job value: status: "STARTED" + 400: + description: UNKNOWN_FORECAST 401: description: UNAUTHORIZED 403: @@ -1770,7 +1771,9 @@ def get_forecast(self, id: int, uuid: str, sensor: Sensor, job_id: str): return dict(**response), s # Check job status - if not job.is_finished: + if job.is_finished: + message = "A forecasting job has been processed with your job ID" + else: job_status = job.get_status() job_status_name = ( job_status.upper() if isinstance(job_status, str) else job_status.name @@ -1789,8 +1792,8 @@ def get_forecast(self, id: int, uuid: str, sensor: Sensor, job_id: str): data_source = get_data_source_for_job(job, type="forecasting") forecasts = sensor.search_beliefs( - event_starts_after=job.meta.get("start_predict_date"), - event_ends_before=job.meta.get("end_date"), + event_starts_after=job.meta.get("start"), + event_ends_before=job.meta.get("end"), source=data_source, most_recent_beliefs_only=True, use_latest_version_per_event=True, @@ -1800,6 +1803,10 @@ def get_forecast(self, id: int, uuid: str, sensor: Sensor, job_id: str): current_app.logger.exception("Failed to get forecast job status.") return unprocessable_entity(str(e)) + if forecasts.empty: + return unknown_forecast( + f"{message}, but the forecast was not found in the database." 
+ ) start = forecasts["event_start"].min() last_event_start = forecasts["event_start"].max() diff --git a/flexmeasures/api/v3_0/tests/test_forecasting_api.py b/flexmeasures/api/v3_0/tests/test_forecasting_api.py index a621be8290..cc7a9afb92 100644 --- a/flexmeasures/api/v3_0/tests/test_forecasting_api.py +++ b/flexmeasures/api/v3_0/tests/test_forecasting_api.py @@ -2,9 +2,7 @@ import isodate import pytest from flask import url_for -from flexmeasures.data.services.scheduling import ( - get_data_source_for_job, -) + from rq.job import Job from flexmeasures.utils.job_utils import work_on_rq from flexmeasures.api.tests.utils import get_auth_token @@ -35,18 +33,21 @@ def test_trigger_and_fetch_forecasts( # Trigger job payload = { - "start_date": "2025-01-01T00:00:00+00:00", - "start_predict_date": "2025-01-05T00:00:00+00:00", - "end_date": "2025-01-05T02:00:00+00:00", - "max_forecast_horizon": "PT1H", - "retrain_frequency": "PT1H", + "start": "2025-01-05T00:00:00+00:00", + "end": "2025-01-05T02:00:00+00:00", + "max-forecast-horizon": "PT1H", + "forecast-frequency": "PT1H", + "config": { + "train-start": "2025-01-01T00:00:00+00:00", + "retrain-frequency": "PT1H", + }, } trigger_url = url_for("SensorAPI:trigger_forecast", id=sensor_0.id) trigger_res = client.post( trigger_url, json=payload, headers={"Authorization": token} ) - assert trigger_res.status_code == 200 + assert trigger_res.status_code == 200, trigger_res.json trigger_json = trigger_res.get_json() wrap_up_job = app.queues["forecasting"].fetch_job(trigger_json["forecast"]) @@ -77,7 +78,7 @@ def test_trigger_and_fetch_forecasts( payload["sensor"] = sensor_1.id # Run pipeline manually to compute expected forecasts - pipeline = TrainPredictPipeline() + pipeline = TrainPredictPipeline(config=payload.pop("config", {})) pipeline.compute(parameters=payload) # Fetch forecasts for each job @@ -85,7 +86,7 @@ def test_trigger_and_fetch_forecasts( fetch_url = url_for("SensorAPI:get_forecast", id=sensor_0.id, uuid=job_id) 
res = client.get(fetch_url, headers={"Authorization": token}) - assert res.status_code == 200 + assert res.status_code == 200, res.json data = res.get_json() @@ -103,14 +104,11 @@ def test_trigger_and_fetch_forecasts( assert isinstance(api_forecasts, list) assert len(api_forecasts) > 0 - # Identify which data source wrote these beliefs - data_source = get_data_source_for_job(job, type="forecasting") - # Load only the latest belief per event_start forecasts_df = sensor_1.search_beliefs( - event_starts_after=job.meta.get("start_predict_date"), - event_ends_before=job.meta.get("end_date") + sensor_1.event_resolution, - source=data_source, + event_starts_after=job.meta.get("start"), + event_ends_before=job.meta.get("end"), + source_types=["forecaster"], most_recent_beliefs_only=True, use_latest_version_per_event=True, ).reset_index() diff --git a/flexmeasures/cli/data_add.py b/flexmeasures/cli/data_add.py index f1c1e90a0e..7abdd47b9a 100755 --- a/flexmeasures/cli/data_add.py +++ b/flexmeasures/cli/data_add.py @@ -6,6 +6,10 @@ from datetime import datetime, timedelta from typing import Dict, Any +from flexmeasures.data.schemas.forecasting.pipeline import ( + TrainPredictPipelineConfigSchema, + ForecasterParametersSchema, +) import isodate import json import yaml @@ -32,6 +36,7 @@ MsgStyle, DeprecatedOption, DeprecatedOptionsCommand, + add_cli_options_from_schema, ) from flexmeasures.data import db from flexmeasures.data.scripts.data_gen import ( @@ -70,6 +75,7 @@ GenericAssetSchema, GenericAssetTypeSchema, ) +from flexmeasures.data.schemas.utils import snake_to_kebab from flexmeasures.data.schemas.generic_assets import GenericAssetIdField from flexmeasures.data.models.generic_assets import GenericAsset, GenericAssetType from flexmeasures.data.models.audit_log import AssetAuditLog, AuditLog @@ -81,7 +87,7 @@ from flexmeasures.utils import flexmeasures_inflection from flexmeasures.utils.time_utils import server_now, apply_offset_chain from flexmeasures.utils.unit_utils 
import convert_units, ur -from flexmeasures.cli.utils import validate_color_cli, validate_url_cli, split_commas +from flexmeasures.cli.utils import validate_color_cli, validate_url_cli from flexmeasures.data.utils import save_to_db from flexmeasures.data.services.utils import get_asset_or_sensor_ref from flexmeasures.data.models.reporting.profit import ProfitOrLossReporter @@ -992,116 +998,6 @@ def add_holidays( @fm_add_data.command("forecasts") -@click.option( - "--sensor", - required=True, - help="Create forecasts for this sensor. Follow up with the sensor's ID. This argument can be given multiple times.", -) -@click.option( - "--regressors", - "--regressor", - multiple=True, - callback=split_commas, - help="Sensor ID to be treated as a regressor. " - "Use this if both realizations and forecasts recorded on this sensor matter as a regressor. " - "This argument can be given multiple times, but can also be set to a comma-separated list.", -) -@click.option( - "--future-regressors", - "--future-regressor", - multiple=True, - callback=split_commas, - help="Sensor ID to be treated only as a future regressor. " - "Use this if only forecasts recorded on this sensor matter as a regressor. " - "This argument can be given multiple times, but can also be set to a comma-separated list.", -) -@click.option( - "--past-regressors", - "--past-regressor", - multiple=True, - callback=split_commas, - help="Sensor ID to be treated only as a past regressor. " - "Use this if only realizations recorded on this sensor matter as a regressor. " - "This argument can be given multiple times, but can also be set to a comma-separated list.", -) -@click.option( - "--train-start", - "--start-date", - "start_date", - required=False, - help=( - "Timestamp marking when training data begins. " - "Format: YYYY-MM-DDTHH:MM:SS±HH:MM. " - "If not provided, it defaults to a period equal to the training duration " - "ending at --from-date." 
- ), -) -@click.option( - "--to-date", - "--end-date", - "end_date", - required=True, - help="End date for running the pipeline (YYYY-MM-DDTHH:MM:SS+HH:MM).", -) -@click.option( - "--train-period", - required=False, - help="Duration of the initial training period (ISO 8601 duration, e.g. 'P7D', with a minimum of 2 days). " - "Subsequent training periods will grow with each cycle (see --retrain-frequency). " - "If not set, derives a training period from --start-predict-date instead. " - "If that is also not set, defaults to 2 days.", -) -@click.option( - "--retrain-frequency", - "--remodel-frequency", # the term as used in the old forecasting tooling - "--predict-period", # only used during development afaik - required=False, - help="The duration of a cycle of training and predicting, defining how often to retrain the model (ISO 8601 duration, e.g. 'PT24H'). " - "If not set, the model is not retrained.", -) -@click.option( - "--from-date", - "start_predict_date", - default=None, - required=False, - help="Start date for predictions (YYYY-MM-DDTHH:MM:SS+HH:MM). " - "If not set, defaults to now.", -) -@click.option( - "--max-forecast-horizon", - required=False, - help="Maximum forecast horizon (ISO 8601 duration, e.g. 'PT24H'). " - "Defaults to 48 hours.", -) -@click.option( - "--forecast-frequency", - help="Forecast frequency (ISO 8601 duration, e.g. 'PT24H'), i.e. how often to recompute forecasts. " - "Defaults to 1 hour.", -) -@click.option( - "--model-save-dir", - help="Directory to save the trained model.", -) -@click.option( - "--output-path", - help="Directory to save prediction outputs.", -) -@click.option("--probabilistic", is_flag=True, help="Enable probabilistic predictions.") -@click.option( - "--sensor-to-save", - default=None, - help="Sensor ID to save forecasts into a specific sensor. 
By default, forecasts are saved to the target sensor.", -) -@click.option( - "--as-job", - is_flag=True, - help="Whether to queue a forecasting job instead of computing directly. " - "To process the job, run a worker (on any computer, but configured to the same databases) to process the 'forecasting' queue. Defaults to False.", -) -@click.option( - "--max-training-period", - help="Maximum duration of the training period (ISO 8601 duration, e.g. 'P1Y'). Defaults to 1 year.", -) @click.option( "--resolution", help="[DEPRECATED] Resolution of forecast in minutes. If not set, resolution is determined from the sensor to be forecasted", @@ -1110,11 +1006,6 @@ def add_holidays( "--horizon", help="[DEPRECATED] Forecasting horizon in hours. This argument can be given multiple times. Defaults to all possible horizons.", ) -@click.option( - "--ensure-positive", - is_flag=True, - help="Whether to ensure positive forecasts, by clipping out negative values.", -) @click.option( "--config", "config_file", @@ -1156,22 +1047,23 @@ def add_holidays( is_flag=True, help="Add this flag to edit the parameters passed to the Forecaster in your default text editor (e.g. nano).", ) +@add_cli_options_from_schema(ForecasterParametersSchema()) +@add_cli_options_from_schema(TrainPredictPipelineConfigSchema()) @click.option( - "--missing-threshold", - default=1.0, - help=( - "Maximum fraction of missing data allowed before raising an error. " - "Missing data under this threshold will be filled using forward filling or linear interpolation." - ), + "--as-job", + is_flag=True, + help="Whether to queue a forecasting job instead of computing directly. " + "To process the job, run a worker (on any computer, but configured to the same databases) to process the 'forecasting' queue. 
Defaults to False.", ) @with_appcontext -def train_predict_pipeline( +def add_forecast( # noqa: C901 forecaster_class: str, source: DataSource | None = None, config_file: TextIOBase | None = None, parameters_file: TextIOBase | None = None, edit_config: bool = False, edit_parameters: bool = False, + as_job: bool = False, **kwargs, ): """ @@ -1180,11 +1072,11 @@ def train_predict_pipeline( \b Example flexmeasures add forecasts --sensor 2092 --regressors 2093 - --start-date 2025-01-01T00:00:00+01:00 --to-date 2025-10-15T00:00:00+01:00 + --to-date 2025-10-15T00:00:00+01:00 \b Workflow - - Training window: defaults from --start-date until the CLI execution time. + - Training window: defaults to a 30-day period in advance of the CLI execution time. - Prediction window: defaults from CLI execution time until --to-date. - max-forecast-horizon: defaults to the length of the prediction window. - Forecasts are computed immediately; use --as-job to enqueue them. @@ -1218,6 +1110,9 @@ def train_predict_pipeline( if config_file: config = yaml.safe_load(config_file) + for field_name, field in TrainPredictPipelineConfigSchema._declared_fields.items(): + if field_value := kwargs.pop(field_name, None): + config[field.data_key] = field_value if edit_config: config = launch_editor("/tmp/config.yml") @@ -1230,10 +1125,11 @@ def train_predict_pipeline( if edit_parameters: parameters = launch_editor("/tmp/parameters.yml") - # Move remaining kwargs to parameters + # Move remaining kwargs to parameters, converting from snake_case to kebab-case to match schema expectation for k, v in kwargs.items(): - if k not in parameters: - parameters[k] = v + kebab_key = snake_to_kebab(k) + if kebab_key not in parameters: + parameters[kebab_key] = v forecaster = get_data_generator( source=source, @@ -1244,7 +1140,7 @@ def train_predict_pipeline( ) try: - pipeline_returns = forecaster.compute(parameters=parameters) + pipeline_returns = forecaster.compute(as_job=as_job, **parameters) # Empty result if not 
pipeline_returns: diff --git a/flexmeasures/cli/utils.py b/flexmeasures/cli/utils.py index ea0d30bc9e..048224b4e7 100644 --- a/flexmeasures/cli/utils.py +++ b/flexmeasures/cli/utils.py @@ -13,7 +13,9 @@ from tabulate import tabulate import pytz from click_default_group import DefaultGroup +from marshmallow import fields +from flexmeasures.data.schemas.utils import MarshmallowClickMixin from flexmeasures.utils.time_utils import get_most_recent_hour, get_timezone from flexmeasures.utils.validation_utils import validate_color_hex, validate_url from flexmeasures import Sensor @@ -435,3 +437,50 @@ def split_commas(ctx, param, value): for v in value: result.extend(v.split(",")) return list(set([x.strip() for x in result if x.strip()])) + + +def add_cli_options_from_schema(schema): + """Decorator to add CLI options based on a Marshmallow schema's fields.""" + + def decorator(command): + for field_name, field in reversed(schema.fields.items()): + cli = field.metadata.get("cli") + if not cli: + continue + + option_names = cli["option"] + option_aliases = cli.get("aliases", []) + options = [option_names] + option_aliases + + # build help text from field description and example, and optionally extra help provided in the cli metadata + help_text = field.metadata.get("description", "") + + extra_help = cli.get("extra_help") + if extra_help: + help_text += f"\n{extra_help}" + + example = field.metadata.get("example") + if example is not None: + help_text += f"\nExample: {example}" + + kwargs = { + "help": help_text, + "required": field.required, + "default": field.load_default, + } + + if cli.get("is_flag"): + kwargs["is_flag"] = True + + # Transfer the original field type + if isinstance(field, MarshmallowClickMixin): + kwargs["type"] = field + elif isinstance(field, fields.List): + kwargs["multiple"] = True + kwargs["type"] = field.inner + + command = click.option(*options, **kwargs)(command) + + return command + + return decorator diff --git a/flexmeasures/conftest.py 
b/flexmeasures/conftest.py index f210a14e81..d3043286e4 100644 --- a/flexmeasures/conftest.py +++ b/flexmeasures/conftest.py @@ -1,5 +1,8 @@ from __future__ import annotations +import sys +import builtins +import warnings from contextlib import contextmanager import pytest from random import random, seed @@ -1825,3 +1828,58 @@ def add_test_solar_sensor_and_irradiance_with_forecasts( db.session.commit() return sensors + + +@pytest.fixture +def freeze_server_now(): + """ + Monkeypatch `server_now` in all currently loaded FlexMeasures modules that have it. + + Usage: + def test_x(freeze_server_now): + freeze_server_now(pd.Timestamp("2025-01-15T12:23:58+01")) + """ + patched_modules = set() + + def _freeze(value: datetime | pd.Timestamp): + if isinstance(value, pd.Timestamp): + value = value.to_pydatetime() + # Patch currently loaded FlexMeasures modules + for module in list(sys.modules.values()): # copy to avoid RuntimeError + try: + if not isinstance(module, type(sys)): # skip placeholders + continue + name = getattr(module, "__name__", "") + if not name.startswith("flexmeasures"): + continue + if hasattr(module, "server_now"): + setattr(module, "server_now", lambda: value) + patched_modules.add(module.__name__) + except Exception: + # skip modules that cannot be inspected or modified + pass + + # Optionally, warn if new modules are imported later + original_import = builtins.__import__ + + def import_hook(name, *args, **kwargs): + mod = original_import(name, *args, **kwargs) + if hasattr(mod, "server_now") and mod not in patched_modules: + warnings.warn( + f"Module {name} imported after server_now was frozen; patching it now." 
+ ) + try: + setattr(mod, "server_now", lambda: value) + patched_modules.add(name) + except Exception: + pass + return mod + + builtins.__import__ = import_hook + + return value + + yield _freeze + + # cleanup: restore the original import function + builtins.__import__ = builtins.__import__ diff --git a/flexmeasures/data/models/data_sources.py b/flexmeasures/data/models/data_sources.py index 7bba06e467..c50096d1db 100644 --- a/flexmeasures/data/models/data_sources.py +++ b/flexmeasures/data/models/data_sources.py @@ -1,6 +1,7 @@ from __future__ import annotations from copy import deepcopy +import inspect import json from functools import cached_property from typing import TYPE_CHECKING, Any, ClassVar @@ -109,7 +110,9 @@ def _compute(self, **kwargs) -> list[dict[str, Any]]: """ raise NotImplementedError() - def compute(self, parameters: dict | None = None, **kwargs) -> list[dict[str, Any]]: + def compute( + self, parameters: dict | None = None, as_job: bool = False, **kwargs + ) -> list[dict[str, Any]]: """The configuration `parameters` stores dynamic parameters, parameters that, if changed, DO NOT trigger the creation of a new DataSource. Static parameters, such as the topology of an energy system, can go into `config`. @@ -118,6 +121,7 @@ def compute(self, parameters: dict | None = None, **kwargs) -> list[dict[str, An of the method compute when passing the `parameters` as deserialized attributes. :param parameters: Serialized parameters, defaults to None. + :param as_job: If True, runs as a job. :param kwargs: Deserialized parameters (can be used as an alternative to the `parameters` kwarg). 
""" @@ -131,9 +135,14 @@ def compute(self, parameters: dict | None = None, **kwargs) -> list[dict[str, An self._parameters = self._parameters_schema.load(self._parameters) - results = self._compute(**self._parameters) + sig = inspect.signature(inspect.unwrap(self._compute)) + accepts_as_job = "as_job" in sig.parameters + if accepts_as_job: + results = self._compute(**self._parameters, as_job=as_job) + else: + results = self._compute(**self._parameters) - if not self._parameters.get("as_job", False): + if not as_job: results = self._assign_sensors_and_source(results) return results diff --git a/flexmeasures/data/models/forecasting/__init__.py b/flexmeasures/data/models/forecasting/__init__.py index 799ae6c096..5aa7683d5f 100644 --- a/flexmeasures/data/models/forecasting/__init__.py +++ b/flexmeasures/data/models/forecasting/__init__.py @@ -75,18 +75,21 @@ class Forecaster(DataGenerator): _config_schema = ForecasterConfigSchema() - def _compute(self, check_output_resolution=True, **kwargs) -> list[dict[str, Any]]: + def _compute( + self, check_output_resolution=True, as_job: bool = False, **kwargs + ) -> list[dict[str, Any]]: """This method triggers the creation of a new forecast. The same object can generate multiple forecasts with different start, end, resolution and belief_time values. :param check_output_resolution: If True, checks each output for whether the event_resolution matches that of the sensor it is supposed to be recorded on. + :param as_job: If True, runs as a job. 
""" - results = self._compute_forecast(**kwargs) + results = self._compute_forecast(**kwargs, as_job=as_job) - if not kwargs.get("as_job", False): + if not as_job: for result in results: # checking that the event_resolution of the output BeliefDataFrame is equal to the one of the output sensor assert not check_output_resolution or ( @@ -95,10 +98,11 @@ def _compute(self, check_output_resolution=True, **kwargs) -> list[dict[str, Any return results - def _compute_forecast(self, **kwargs) -> list[dict[str, Any]]: + def _compute_forecast(self, as_job: bool = False, **kwargs) -> list[dict[str, Any]]: """Overwrite with the actual computation of your forecast. - :returns list of dictionaries, for example: + :param as_job: If True, runs as a job. + :returns: List of dictionaries, for example: [ { "sensor": 501, @@ -113,28 +117,32 @@ def _clean_parameters(self, parameters: dict) -> dict: These parameters are already contained in the TimedBelief: - - end_date: as the event end - - max_forecast_horizon: as the maximum belief horizon of the beliefs for a given event - - forecast_frequency: as the spacing between unique belief times + - end-date: as the event end + - max-forecast-horizon: as the maximum belief horizon of the beliefs for a given event + - forecast-frequency: as the spacing between unique belief times - probabilistic: as the cumulative_probability of each belief - - sensor_to_save: as the sensor on which the beliefs are recorded + - sensor-to-save: as the sensor on which the beliefs are recorded Other: - - model_save_dir: used internally for the train and predict pipelines to save and load the model - - output_path: for exporting forecasts to file, more of a developer feature - - as_job: only indicates whether the computation was offloaded to a worker + - model-save-dir: used internally for the train and predict pipelines to save and load the model + - output-path: for exporting forecasts to file, more of a developer feature + - as-job: only indicates whether the 
computation was offloaded to a worker """ _parameters = deepcopy(parameters) + # Note: Parameter keys are in kebab-case due to Marshmallow schema data_key settings + # (see ForecasterParametersSchema in flexmeasures/data/schemas/forecasting/pipeline.py) fields_to_remove = [ - "end_date", - "max_forecast_horizon", - "forecast_frequency", + "end-date", + "max-forecast-horizon", + "forecast-frequency", "probabilistic", - "model_save_dir", - "output_path", - "sensor_to_save", - "as_job", + "model-save-dir", + "output-path", + "sensor-to-save", + "as-job", + "m_viewpoints", # Computed internally, still uses snake_case + "sensor", ] for field in fields_to_remove: diff --git a/flexmeasures/data/models/forecasting/custom_models/base_model.py b/flexmeasures/data/models/forecasting/custom_models/base_model.py index 43a6b8008d..cdb91a5c59 100644 --- a/flexmeasures/data/models/forecasting/custom_models/base_model.py +++ b/flexmeasures/data/models/forecasting/custom_models/base_model.py @@ -1,11 +1,9 @@ -import sys import logging from abc import ABC, abstractmethod from darts import TimeSeries from flexmeasures.data.models.forecasting.utils import negative_to_zero -from flexmeasures.data.models.forecasting.exceptions import CustomException class BaseModel(ABC): @@ -81,19 +79,14 @@ def fit( past_covariates: TimeSeries, future_covariates: TimeSeries, ) -> None: - try: - logging.debug("Training base model") - for i in range(self.max_forecast_horizon): - self.models[i].fit( - series=series, - past_covariates=past_covariates, - future_covariates=future_covariates, - ) - logging.debug("Base model trained successfully") - except Exception as e: - raise CustomException( - f"Error training base model: {e}. 
Try decreasing the start-date.", sys + logging.debug("Training base model") + for i in range(self.max_forecast_horizon): + self.models[i].fit( + series=series, + past_covariates=past_covariates, + future_covariates=future_covariates, ) + logging.debug("Base model trained successfully") def predict( self, diff --git a/flexmeasures/data/models/forecasting/exceptions.py b/flexmeasures/data/models/forecasting/exceptions.py index 2cde7f5c73..1d963d79b4 100644 --- a/flexmeasures/data/models/forecasting/exceptions.py +++ b/flexmeasures/data/models/forecasting/exceptions.py @@ -1,32 +1,6 @@ -import sys - - class NotEnoughDataException(Exception): pass class InvalidHorizonException(Exception): pass - - -def error_message_detail(error, error_detail: sys): - """ - This function returns the error message and the error detail - """ - _, _, exc_tb = error_detail.exc_info() - file_name = exc_tb.tb_frame.f_code.co_filename - error_message = "Error occurred in python script name [{0}] line number [{1}] error message [{2}]".format( - file_name, exc_tb.tb_lineno, str(error) - ) - return error_message - - -class CustomException(Exception): - def __init__(self, error_message: str, error_detail: sys): - super().__init__(error_message) - self.error_message = error_message_detail( - error_message, error_detail=error_detail - ) - - def __str__(self): - return self.error_message diff --git a/flexmeasures/data/models/forecasting/pipelines/base.py b/flexmeasures/data/models/forecasting/pipelines/base.py index f1c5dfa8d1..bb84cfd7e5 100644 --- a/flexmeasures/data/models/forecasting/pipelines/base.py +++ b/flexmeasures/data/models/forecasting/pipelines/base.py @@ -1,6 +1,5 @@ from __future__ import annotations -import sys import logging from datetime import datetime from functools import reduce @@ -12,7 +11,7 @@ from flexmeasures.data.models.time_series import Sensor from timely_beliefs import utils as tb_utils -from flexmeasures.data.models.forecasting.exceptions import CustomException +from 
flexmeasures.data.models.forecasting.exceptions import NotEnoughDataException class BasePipeline: @@ -111,87 +110,83 @@ def load_data_all_beliefs(self) -> pd.DataFrame: Returns: - pd.DataFrame: A DataFrame containing all the data from each sensor. """ - try: - logging.debug( - "Loading all data from %s", - { - "Future regressors": [s.id for s in self.future], - "Past regressors": [s.id for s in self.past], - "Target": self.target_sensor.id, - }, - ) + logging.debug( + "Loading all data from %s", + { + "Future regressors": [s.id for s in self.future], + "Past regressors": [s.id for s in self.past], + "Target": self.target_sensor.id, + }, + ) - sensor_dfs = [] - sensor_names = self.future_regressors + self.past_regressors + [self.target] - sensors = self.future + self.past + [self.target_sensor] - for name, sensor in zip(sensor_names, sensors): - logging.debug(f"Loading data for {name} (sensor ID {sensor.id})") + sensor_dfs = [] + sensor_names = self.future_regressors + self.past_regressors + [self.target] + sensors = self.future + self.past + [self.target_sensor] + for name, sensor in zip(sensor_names, sensors): + logging.debug(f"Loading data for {name} (sensor ID {sensor.id})") - sensor_event_ends_before = self.event_ends_before - sensor_event_starts_after = self.event_starts_after + sensor_event_ends_before = self.event_ends_before + sensor_event_starts_after = self.event_starts_after - most_recent_beliefs_only = True - # Extend time range for future regressors - if sensor in self.future: - sensor_event_ends_before = self.event_ends_before + pd.Timedelta( - hours=self.max_forecast_horizon_in_hours - ) + most_recent_beliefs_only = True + # Extend time range for future regressors + if sensor in self.future: + sensor_event_ends_before = self.event_ends_before + pd.Timedelta( + hours=self.max_forecast_horizon_in_hours + ) - most_recent_beliefs_only = False # load all beliefs available to include forecasts available at each timestamp + most_recent_beliefs_only = False 
# load all beliefs available to include forecasts available at each timestamp - df = sensor.search_beliefs( - event_starts_after=sensor_event_starts_after, - event_ends_before=sensor_event_ends_before, - most_recent_beliefs_only=most_recent_beliefs_only, - exclude_source_types=( - ["forecaster"] if name == self.target else [] - ), # we exclude forecasters for target dataframe as to not use forecasts in target. + df = sensor.search_beliefs( + event_starts_after=sensor_event_starts_after, + event_ends_before=sensor_event_ends_before, + most_recent_beliefs_only=most_recent_beliefs_only, + exclude_source_types=( + ["forecaster"] if name == self.target else [] + ), # we exclude forecasters for target dataframe as to not use forecasts in target. + ) + try: + # We resample regressors to the target sensor’s resolution so they align in time. + # This ensures the resulting DataFrame can be used directly for predictions. + df = tb_utils.replace_multi_index_level( + df, + "event_start", + df.event_starts.floor(self.target_sensor.event_resolution), ) - try: - # We resample regressors to the target sensor’s resolution so they align in time. - # This ensures the resulting DataFrame can be used directly for predictions. 
- df = tb_utils.replace_multi_index_level( - df, - "event_start", - df.event_starts.floor(self.target_sensor.event_resolution), - ) - except Exception as e: - logging.warning(f"Error during custom resample for {name}: {e}") + except Exception as e: + logging.warning(f"Error during custom resample for {name}: {e}") - df = df.reset_index() - df_filtered = df[["event_start", "belief_time", "event_value"]].copy() - df_filtered.rename(columns={"event_value": name}, inplace=True) + df = df.reset_index() + df_filtered = df[["event_start", "belief_time", "event_value"]].copy() + df_filtered.rename(columns={"event_value": name}, inplace=True) - sensor_dfs.append(df_filtered) + sensor_dfs.append(df_filtered) - if len(sensor_dfs) == 1: - data_pd = sensor_dfs[0] - else: - # When using future_covariate, the last day in its sensor_df extends beyond - # the target and past regressors by "max_forecast_horizon." - # To ensure we retain these additional future regressor records, - # we use an outer join to merge all sensor_dfs DataFrames on the "event_start" and "belief_time" columns. - - data_pd = reduce( - lambda left, right: pd.merge( - left, right, on=["event_start", "belief_time"], how="outer" - ), - sensor_dfs, - ) - data_pd = data_pd.sort_values( - by=["event_start", "belief_time"] - ).reset_index(drop=True) - data_pd["event_start"] = pd.to_datetime( - data_pd["event_start"], utc=True - ).dt.tz_localize(None) - data_pd["belief_time"] = pd.to_datetime( - data_pd["belief_time"], utc=True - ).dt.tz_localize(None) - - return data_pd + if len(sensor_dfs) == 1: + data_pd = sensor_dfs[0] + else: + # When using future_covariate, the last day in its sensor_df extends beyond + # the target and past regressors by "max_forecast_horizon." + # To ensure we retain these additional future regressor records, + # we use an outer join to merge all sensor_dfs DataFrames on the "event_start" and "belief_time" columns. 
- except Exception as e: - raise CustomException(f"Error loading dataframe with all beliefs: {e}", sys) + data_pd = reduce( + lambda left, right: pd.merge( + left, right, on=["event_start", "belief_time"], how="outer" + ), + sensor_dfs, + ) + data_pd = data_pd.sort_values( + by=["event_start", "belief_time"] + ).reset_index(drop=True) + data_pd["event_start"] = pd.to_datetime( + data_pd["event_start"], utc=True + ).dt.tz_localize(None) + data_pd["belief_time"] = pd.to_datetime( + data_pd["belief_time"], utc=True + ).dt.tz_localize(None) + + return data_pd def split_data_all_beliefs( # noqa: C901 self, df: pd.DataFrame, is_predict_pipeline: bool = False @@ -235,348 +230,288 @@ def split_data_all_beliefs( # noqa: C901 The detailed semantics of how past/future covariates and targets are constructed for each split are documented in `_generate_splits`. """ - try: - logging.debug("Splitting data target and covariates.") + logging.debug("Splitting data target and covariates.") + + def _generate_splits( + X_past_regressors_df: pd.DataFrame | None, + X_future_regressors_df: pd.DataFrame | None, + y: pd.DataFrame, + ): + """ + Generate past covariates, future covariates, and target series + for multiple simulated prediction times ("belief times"). + + For each simulated belief_time: + - Past covariates contain realized regressor data up to `target_end` + (just before the predictions start). + - Future covariates include realized data up to `target_end` + and forecasts extending up to `forecast_end` (`target_end + max_forecast_horizon`). + - Target series (y) contain realized target values up to `target_end` + (the last event_start available for making forecasts). + - belief_time is the timestamp representing "when the forecast + would have been made." It coincides with the belief_time + of `target_end` — i.e., the last belief_time seen. 
+ + This function loops through `n_steps_to_predict` (if this class is used by the predict pipeline), + creating a sliding window of inputs for each prediction step. + + Parameters + ---------- + X_past_regressors_df : pd.DataFrame | None + Past regressors (realized values before belief_time). None if not used. + X_future_regressors_df : pd.DataFrame | None + Future regressors (realized + forecasted values). None if not used. + y : pd.DataFrame + Target values, indexed by event_start and belief_time. + + Returns + ------- + past_covariates_list : list[TimeSeries] | None + future_covariates_list : list[TimeSeries] | None + target_list : list[TimeSeries] + belief_timestamps_list : list[pd.Timestamp] + """ + + target_sensor_resolution = self.target_sensor.event_resolution + + # target_start is the timestamp of the event_start of the first event in realizations + target_start = pd.to_datetime( + self.event_starts_after, utc=True + ).tz_localize(None) + + # target_end is the timestamp of the last event_start of realized data + # belief_time in this module is the belief_time of the last realization to be used for forecasting at each prediction step. + if self.predict_start: + first_target_end = pd.to_datetime( + self.predict_start - self.target_sensor.event_resolution, + utc=True, + ).tz_localize(None) + first_belief_time = pd.to_datetime( + self.predict_start, utc=True + ).tz_localize(None) + else: + first_target_end = pd.to_datetime( + self.event_ends_before - self.target_sensor.event_resolution, + utc=True, + ).tz_localize(None) + first_belief_time = pd.to_datetime( + self.event_ends_before, utc=True + ).tz_localize(None) - def _generate_splits( - X_past_regressors_df: pd.DataFrame | None, - X_future_regressors_df: pd.DataFrame | None, - y: pd.DataFrame, - ): - """ - Generate past covariates, future covariates, and target series - for multiple simulated prediction times ("belief times"). 
- - For each simulated belief_time: - - Past covariates contain realized regressor data up to `target_end` - (just before the predictions start). - - Future covariates include realized data up to `target_end` - and forecasts extending up to `forecast_end` (`target_end + max_forecast_horizon`). - - Target series (y) contain realized target values up to `target_end` - (the last event_start available for making forecasts). - - belief_time is the timestamp representing "when the forecast - would have been made." It coincides with the belief_time - of `target_end` — i.e., the last belief_time seen. - - This function loops through `n_steps_to_predict` (if this class is used by the predict pipeline), - creating a sliding window of inputs for each prediction step. - - Parameters - ---------- - X_past_regressors_df : pd.DataFrame | None - Past regressors (realized values before belief_time). None if not used. - X_future_regressors_df : pd.DataFrame | None - Future regressors (realized + forecasted values). None if not used. - y : pd.DataFrame - Target values, indexed by event_start and belief_time. - - Returns - ------- - past_covariates_list : list[TimeSeries] | None - future_covariates_list : list[TimeSeries] | None - target_list : list[TimeSeries] - belief_timestamps_list : list[pd.Timestamp] - """ - - target_sensor_resolution = self.target_sensor.event_resolution - - # target_start is the timestamp of the event_start of the first event in realizations - target_start = pd.to_datetime( - self.event_starts_after, utc=True + # The forecast window ends at target_end + max_forecast_horizon (+ 1 resolution). 
+ first_forecast_end = ( + first_target_end + + pd.Timedelta(hours=self.max_forecast_horizon_in_hours) + + self.target_sensor.event_resolution + ) + # Ensure the forecast_end is in UTC and has no timezone info + first_forecast_end = pd.to_datetime( + first_forecast_end, utc=True + ).tz_localize(None) + + # Initialize save_belief_time for the first iteration if it's specified + if self.save_belief_time: + first_save_belief_time = pd.to_datetime( + self.save_belief_time, utc=True ).tz_localize(None) - # target_end is the timestamp of the last event_start of realized data - # belief_time in this module is the belief_time of the last realization to be used for forecasting at each prediction step. - if self.predict_start: - first_target_end = pd.to_datetime( - self.predict_start - self.target_sensor.event_resolution, - utc=True, - ).tz_localize(None) - first_belief_time = pd.to_datetime( - self.predict_start, utc=True - ).tz_localize(None) - else: - first_target_end = pd.to_datetime( - self.event_ends_before - self.target_sensor.event_resolution, - utc=True, - ).tz_localize(None) - first_belief_time = pd.to_datetime( - self.event_ends_before, utc=True - ).tz_localize(None) - - # The forecast window ends at target_end + max_forecast_horizon (+ 1 resolution). 
- first_forecast_end = ( - first_target_end - + pd.Timedelta(hours=self.max_forecast_horizon_in_hours) - + self.target_sensor.event_resolution + # Pre-compute per-event_start latest/closest rows + past_latest = None + if X_past_regressors_df is not None: + past_obs = X_past_regressors_df.loc[ + X_past_regressors_df["belief_time"] + > X_past_regressors_df["event_start"] + ].copy() + idx = past_obs.groupby("event_start")["belief_time"].idxmax() + past_latest = ( + past_obs.loc[idx].sort_values("event_start").reset_index(drop=True) + ) + past_keep = [c for c in past_latest.columns if c not in ("belief_time")] + past_latest = past_latest[past_keep] + + future_realized_latest = None + future_all_closest = None + if X_future_regressors_df is not None: + # Realized-only (belief_time > event_start): take closest per event_start + fr = X_future_regressors_df.loc[ + X_future_regressors_df["belief_time"] + > X_future_regressors_df["event_start"] + ].copy() + fr["time_diff"] = (fr["event_start"] - fr["belief_time"]).abs() + idx_fr = fr.groupby("event_start")["time_diff"].idxmin() + fr = ( + fr.loc[idx_fr] + .drop(columns=["time_diff"]) + .sort_values("event_start") + .reset_index(drop=True) ) - # Ensure the forecast_end is in UTC and has no timezone info - first_forecast_end = pd.to_datetime( - first_forecast_end, utc=True - ).tz_localize(None) - # Initialize save_belief_time for the first iteration if it's specified - if self.save_belief_time: - first_save_belief_time = pd.to_datetime( - self.save_belief_time, utc=True - ).tz_localize(None) - - # Pre-compute per-event_start latest/closest rows - past_latest = None - if X_past_regressors_df is not None: - past_obs = X_past_regressors_df.loc[ - X_past_regressors_df["belief_time"] - > X_past_regressors_df["event_start"] - ].copy() - idx = past_obs.groupby("event_start")["belief_time"].idxmax() - past_latest = ( - past_obs.loc[idx] - .sort_values("event_start") - .reset_index(drop=True) - ) - past_keep = [ - c for c in 
past_latest.columns if c not in ("belief_time") - ] - past_latest = past_latest[past_keep] - - future_realized_latest = None - future_all_closest = None - if X_future_regressors_df is not None: - # Realized-only (belief_time > event_start): take closest per event_start - fr = X_future_regressors_df.loc[ - X_future_regressors_df["belief_time"] - > X_future_regressors_df["event_start"] - ].copy() - fr["time_diff"] = (fr["event_start"] - fr["belief_time"]).abs() - idx_fr = fr.groupby("event_start")["time_diff"].idxmin() - fr = ( - fr.loc[idx_fr] - .drop(columns=["time_diff"]) - .sort_values("event_start") - .reset_index(drop=True) - ) + # All beliefs: closest per event_start (used for forecast slice) + fa = X_future_regressors_df.copy() + fa["time_diff"] = (fa["event_start"] - fa["belief_time"]).abs() + idx_fa = fa.groupby("event_start")["time_diff"].idxmin() + fa = ( + fa.loc[idx_fa] + .drop(columns=["time_diff"]) + .sort_values("event_start") + .reset_index(drop=True) + ) - # All beliefs: closest per event_start (used for forecast slice) - fa = X_future_regressors_df.copy() - fa["time_diff"] = (fa["event_start"] - fa["belief_time"]).abs() - idx_fa = fa.groupby("event_start")["time_diff"].idxmin() - fa = ( - fa.loc[idx_fa] - .drop(columns=["time_diff"]) - .sort_values("event_start") - .reset_index(drop=True) - ) + keep = [c for c in fr.columns if c not in ("belief_time")] + future_realized_latest = fr[keep] + future_all_closest = fa[keep] - keep = [c for c in fr.columns if c not in ("belief_time")] - future_realized_latest = fr[keep] - future_all_closest = fa[keep] + y_clean = ( + y.drop(columns=["belief_time"]) + .sort_values("event_start") + .reset_index(drop=True) + ) - y_clean = ( - y.drop(columns=["belief_time"]) - .sort_values("event_start") - .reset_index(drop=True) + # Helper function: fast closed-interval slice by event_start + def _slice_closed( + df_: pd.DataFrame, start_ts: pd.Timestamp, end_ts: pd.Timestamp + ) -> pd.DataFrame: + if df_ is None or 
df_.empty: + return df_.iloc[0:0].copy() if df_ is not None else None + + # Ensure datetime dtype; then work in int64 ns for searchsorted + es = pd.to_datetime(df_["event_start"], errors="coerce") + a = es.view("int64").to_numpy() + + lo = np.searchsorted(a, start_ts.value, side="left") + hi = np.searchsorted(a, end_ts.value, side="right") # inclusive end + + # Slice original rows by positional indices + out = df_.iloc[lo:hi].copy() + # (Optional) keep the coerced datetime back on the slice to avoid re-parsing later + if not out.empty: + out.loc[:, "event_start"] = es.iloc[lo:hi].to_numpy() + return out + + target_list = [] + past_covariates_list = [] + future_covariates_list = [] + belief_timestamps_list = [] + + # Number of prediction iterations: all steps if predict pipeline, else just 1 (training) + end_for_loop = self.n_steps_to_predict if is_predict_pipeline else 1 + + # Loop through each simulated forecast step and increase the belief_time and target_end by 1 target sensor resolution + for index_offset in range(0, end_for_loop, self.forecast_frequency): + + # Move belief_time and target_end forward one resolution per step + delta = pd.Timedelta( + seconds=index_offset * target_sensor_resolution.total_seconds() + ) + belief_time = first_belief_time + delta + + # Update the save belief time for the next forecasting cycle: + # - if no self.save_belief_time date exists, set the current belief_time + save_belief_time = ( + first_save_belief_time + delta + if self.save_belief_time + else belief_time + ) + target_end = first_target_end + delta + forecast_end = first_forecast_end + delta + + # Target split + y_slice_df = _slice_closed(y_clean, target_start, target_end) + y_split = self.detect_and_fill_missing_values( + df=y_slice_df, + sensors=[self.target_sensor], + sensor_names=[self.target], + start=target_start, + end=target_end, ) - # Helper function: fast closed-interval slice by event_start - def _slice_closed( - df_: pd.DataFrame, start_ts: pd.Timestamp, 
end_ts: pd.Timestamp - ) -> pd.DataFrame: - if df_ is None or df_.empty: - return df_.iloc[0:0].copy() if df_ is not None else None - - # Ensure datetime dtype; then work in int64 ns for searchsorted - es = pd.to_datetime(df_["event_start"], errors="coerce") - a = es.view("int64").to_numpy() - - lo = np.searchsorted(a, start_ts.value, side="left") - hi = np.searchsorted(a, end_ts.value, side="right") # inclusive end - - # Slice original rows by positional indices - out = df_.iloc[lo:hi].copy() - # (Optional) keep the coerced datetime back on the slice to avoid re-parsing later - if not out.empty: - out.loc[:, "event_start"] = es.iloc[lo:hi].to_numpy() - return out - - target_list = [] - past_covariates_list = [] - future_covariates_list = [] - belief_timestamps_list = [] - - # Number of prediction iterations: all steps if predict pipeline, else just 1 (training) - end_for_loop = self.n_steps_to_predict if is_predict_pipeline else 1 - - # Loop through each simulated forecast step and increase the belief_time and target_end by 1 target sensor resolution - for index_offset in range(0, end_for_loop, self.forecast_frequency): - - # Move belief_time and target_end forward one resolution per step - delta = pd.Timedelta( - seconds=index_offset * target_sensor_resolution.total_seconds() - ) - belief_time = first_belief_time + delta - - # Update the save belief time for the next forecasting cycle: - # - if no self.save_belief_time date exists, set the current belief_time - save_belief_time = ( - first_save_belief_time + delta - if self.save_belief_time - else belief_time - ) - target_end = first_target_end + delta - forecast_end = first_forecast_end + delta - - # Target split - y_slice_df = _slice_closed(y_clean, target_start, target_end) - y_split = self.detect_and_fill_missing_values( - df=y_slice_df, - sensors=[self.target_sensor], - sensor_names=[self.target], + # Past covariates split + if past_latest is not None: + past_slice = _slice_closed(past_latest, target_start, 
target_end) + past_covariates = self.detect_and_fill_missing_values( + df=past_slice, + sensors=self.past, + sensor_names=self.past_regressors, start=target_start, end=target_end, ) + else: + past_covariates = None + + # Future covariates (realized up to target_end + forecasts up to forecast_end) split + if ( + future_realized_latest is not None + and future_all_closest is not None + ): + realized_slice = _slice_closed( + future_realized_latest, target_start, target_end + ) - # Past covariates split - if past_latest is not None: - past_slice = _slice_closed( - past_latest, target_start, target_end - ) - past_covariates = self.detect_and_fill_missing_values( - df=past_slice, - sensors=self.past, - sensor_names=self.past_regressors, - start=target_start, - end=target_end, - ) - else: - past_covariates = None - - # Future covariates (realized up to target_end + forecasts up to forecast_end) split - if ( - future_realized_latest is not None - and future_all_closest is not None - ): - realized_slice = _slice_closed( - future_realized_latest, target_start, target_end - ) - - # forecasts strictly after target_end up to forecast_end - # and ONLY those *available at the current belief_time* - # (and truly forecasts: belief_time <= event_start) - fc_window = X_future_regressors_df.loc[ - (X_future_regressors_df["event_start"] > target_end) - & (X_future_regressors_df["event_start"] <= forecast_end) - & (X_future_regressors_df["belief_time"] <= belief_time) - & ( - X_future_regressors_df["belief_time"] - <= X_future_regressors_df["event_start"] - ) - ].copy() - - # for each event_start in that window, pick the latest belief before the event - # (closest from below wrt belief_time) - fc_window["time_diff"] = ( - X_future_regressors_df.loc[fc_window.index, "event_start"] - - X_future_regressors_df.loc[fc_window.index, "belief_time"] - ).abs() - idx_fc = fc_window.groupby("event_start")[ - "belief_time" - ].idxmax() - forecast_slice = ( - fc_window.loc[idx_fc] - 
.drop(columns=["time_diff"], errors="ignore") - .sort_values("event_start") - .reset_index(drop=True) - ) - - # keep only value columns (drop meta) - keep_fc = [ - c - for c in forecast_slice.columns - if c not in ("belief_time") - ] - forecast_slice = forecast_slice[keep_fc] - - future_df = ( - pd.concat( - [realized_slice, forecast_slice], ignore_index=True - ) - .drop_duplicates(subset=["event_start"]) - .sort_values("event_start") - .reset_index(drop=True) - ) - - future_covariates = self.detect_and_fill_missing_values( - df=future_df, - sensors=self.future, - sensor_names=self.future_regressors, - start=target_start, - end=forecast_end + self.target_sensor.event_resolution, + # forecasts strictly after target_end up to forecast_end + # and ONLY those *available at the current belief_time* + # (and truly forecasts: belief_time <= event_start) + fc_window = X_future_regressors_df.loc[ + (X_future_regressors_df["event_start"] > target_end) + & (X_future_regressors_df["event_start"] <= forecast_end) + & (X_future_regressors_df["belief_time"] <= belief_time) + & ( + X_future_regressors_df["belief_time"] + <= X_future_regressors_df["event_start"] ) + ].copy() - else: - future_covariates = None - - target_list.append(y_split) - past_covariates_list.append(past_covariates) - future_covariates_list.append(future_covariates) - belief_timestamps_list.append(save_belief_time) - - future_covariates_list = ( - future_covariates_list - if future_covariates_list[0] is not None - else None - ) - past_covariates_list = ( - past_covariates_list - if past_covariates_list[0] is not None - else None - ) + # for each event_start in that window, pick the latest belief before the event + # (closest from below wrt belief_time) + fc_window["time_diff"] = ( + X_future_regressors_df.loc[fc_window.index, "event_start"] + - X_future_regressors_df.loc[fc_window.index, "belief_time"] + ).abs() + idx_fc = fc_window.groupby("event_start")["belief_time"].idxmax() + forecast_slice = ( + 
fc_window.loc[idx_fc] + .drop(columns=["time_diff"], errors="ignore") + .sort_values("event_start") + .reset_index(drop=True) + ) - return ( - past_covariates_list, - future_covariates_list, - target_list, - belief_timestamps_list, - ) + # keep only value columns (drop meta) + keep_fc = [ + c for c in forecast_slice.columns if c not in ("belief_time") + ] + forecast_slice = forecast_slice[keep_fc] - # Autoregressive-only case - if not self.past and not self.future: - logging.info("Using autoregressive forecasting.") + future_df = ( + pd.concat([realized_slice, forecast_slice], ignore_index=True) + .drop_duplicates(subset=["event_start"]) + .sort_values("event_start") + .reset_index(drop=True) + ) - y = df[["event_start", "belief_time", self.target]].copy() + future_covariates = self.detect_and_fill_missing_values( + df=future_df, + sensors=self.future, + sensor_names=self.future_regressors, + start=target_start, + end=forecast_end + self.target_sensor.event_resolution, + ) - _, _, target_list, belief_timestamps_list = _generate_splits( - None, None, y - ) + else: + future_covariates = None - logging.debug("Data split successfully with autoregressive lags.") - return None, None, target_list, belief_timestamps_list + target_list.append(y_split) + past_covariates_list.append(past_covariates) + future_covariates_list.append(future_covariates) + belief_timestamps_list.append(save_belief_time) - # With regressors - X_past_regressors_df = ( - df[["event_start", "belief_time"] + self.past_regressors] - if self.past_regressors + future_covariates_list = ( + future_covariates_list + if future_covariates_list[0] is not None else None ) - X_future_regressors_df = ( - df[["event_start", "belief_time"] + self.future_regressors] - if self.future != [] - else None - ) - y = ( - df[["event_start", "belief_time", self.target]] - .dropna() - .reset_index(drop=True) - .copy() + past_covariates_list = ( + past_covariates_list if past_covariates_list[0] is not None else None ) - ( - 
past_covariates_list, - future_covariates_list, - target_list, - belief_timestamps_list, - ) = _generate_splits(X_past_regressors_df, X_future_regressors_df, y) - return ( past_covariates_list, future_covariates_list, @@ -584,8 +519,48 @@ def _slice_closed( belief_timestamps_list, ) - except Exception as e: - raise CustomException(f"Error splitting data: {e}", sys) + # Autoregressive-only case + if not self.past and not self.future: + logging.info("Using autoregressive forecasting.") + + y = df[["event_start", "belief_time", self.target]].copy() + + _, _, target_list, belief_timestamps_list = _generate_splits(None, None, y) + + logging.debug("Data split successfully with autoregressive lags.") + return None, None, target_list, belief_timestamps_list + + # With regressors + X_past_regressors_df = ( + df[["event_start", "belief_time"] + self.past_regressors] + if self.past_regressors + else None + ) + X_future_regressors_df = ( + df[["event_start", "belief_time"] + self.future_regressors] + if self.future != [] + else None + ) + y = ( + df[["event_start", "belief_time", self.target]] + .dropna() + .reset_index(drop=True) + .copy() + ) + + ( + past_covariates_list, + future_covariates_list, + target_list, + belief_timestamps_list, + ) = _generate_splits(X_past_regressors_df, X_future_regressors_df, y) + + return ( + past_covariates_list, + future_covariates_list, + target_list, + belief_timestamps_list, + ) def detect_and_fill_missing_values( self, @@ -615,7 +590,7 @@ def detect_and_fill_missing_values( - TimeSeries: The time series with missing values filled. Raises: - - ValueError: If the input dataframe is empty. + - NotEnoughDataException: If the input dataframe is empty or otherwise does not have enough data. - logging.warning: If missing values are detected and filled using `pd.DataFrame.interpolate()`. 
""" dfs = [] @@ -629,7 +604,7 @@ def detect_and_fill_missing_values( missing_fraction = n_missing / total if total > 0 else 1.0 if missing_fraction > self.missing_threshold: - raise ValueError( + raise NotEnoughDataException( f"Sensor {sensor_name} has {missing_fraction*100:.1f}% missing values " f"which exceeds the allowed threshold of {self.missing_threshold*100:.1f}%" ) @@ -714,7 +689,7 @@ def detect_and_fill_missing_values( missing_rows_fraction = total_missing / total_expected if missing_rows_fraction > self.missing_threshold: - raise ValueError( + raise NotEnoughDataException( f"Sensor {sensor_name} has {missing_rows_fraction*100:.1f}% missing values " f"which exceeds the allowed threshold of {self.missing_threshold*100:.1f}%" ) diff --git a/flexmeasures/data/models/forecasting/pipelines/predict.py b/flexmeasures/data/models/forecasting/pipelines/predict.py index c8dd67c9f1..78fca20420 100644 --- a/flexmeasures/data/models/forecasting/pipelines/predict.py +++ b/flexmeasures/data/models/forecasting/pipelines/predict.py @@ -2,7 +2,6 @@ import os import pickle -import sys import logging from datetime import datetime @@ -15,7 +14,6 @@ from flexmeasures import Sensor, Source from flexmeasures.data import db from flexmeasures.data.models.forecasting.utils import data_to_bdf -from flexmeasures.data.models.forecasting.exceptions import CustomException from flexmeasures.data.models.forecasting.pipelines.base import BasePipeline from flexmeasures.data.utils import save_to_db @@ -100,16 +98,11 @@ def load_model(self): """ Load the model and its metadata from the model_path. 
""" - try: - logging.debug("Loading model and metadata from %s", self.model_path) - with open(self.model_path, "rb") as file: - model = pickle.load(file) - logging.debug( - "Model and metadata loaded successfully from %s", self.model_path - ) - return model - except Exception as e: - raise CustomException(f"Error loading model and metadata: {e}", sys) from e + logging.debug("Loading model and metadata from %s", self.model_path) + with open(self.model_path, "rb") as file: + model = pickle.load(file) + logging.debug("Model and metadata loaded successfully from %s", self.model_path) + return model def _prepare_df_single_horizon_prediction( self, @@ -123,46 +116,39 @@ def _prepare_df_single_horizon_prediction( Prepare the DataFrame for a single prediction. Make an additional column for quantiles forecast when probabilistic is True """ - try: - logging.debug(f"Preparing DataFrame for viewpoint {viewpoint}.") + logging.debug(f"Preparing DataFrame for viewpoint {viewpoint}.") - if self.probabilistic: - q_kwargs = dict(quantiles=self.quantiles) if self.quantiles else dict() - y_pred_df = y_pred.quantiles_df(**q_kwargs).T - else: - try: - y_pred_df = y_pred.pd_dataframe().T - except AttributeError: - y_pred_df = y_pred.to_dataframe().T + if self.probabilistic: + q_kwargs = dict(quantiles=self.quantiles) if self.quantiles else dict() + y_pred_df = y_pred.quantiles_df(**q_kwargs).T + else: + try: + y_pred_df = y_pred.pd_dataframe().T + except AttributeError: + y_pred_df = y_pred.to_dataframe().T - y_pred_df.columns = [ - f"{h}h" for h in range(1, self.max_forecast_horizon + 1) - ] - y_pred_df.reset_index(inplace=True) - # Insert forecasts event_start timestamps - y_pred_df.insert(0, "event_start", belief_horizon) + y_pred_df.columns = [f"{h}h" for h in range(1, self.max_forecast_horizon + 1)] + y_pred_df.reset_index(inplace=True) + # Insert forecasts event_start timestamps + y_pred_df.insert(0, "event_start", belief_horizon) - # Insert forecasts belief_time timestamps - 
y_pred_df.insert(1, "belief_time", belief_timestamp) + # Insert forecasts belief_time timestamps + y_pred_df.insert(1, "belief_time", belief_timestamp) - # Insert the target sensor name and value at belief time forecasts are made - y_pred_df.insert(2, self.target, value_at_belief_horizon) - if self.quantiles: - y_pred_df.set_index( - ["event_start", "belief_time", self.target, "component"], - inplace=True, - ) - else: - y_pred_df.set_index( - ["event_start", "belief_time", self.target], inplace=True - ) + # Insert the target sensor name and value at belief time forecasts are made + y_pred_df.insert(2, self.target, value_at_belief_horizon) + if self.quantiles: + y_pred_df.set_index( + ["event_start", "belief_time", self.target, "component"], + inplace=True, + ) + else: + y_pred_df.set_index( + ["event_start", "belief_time", self.target], inplace=True + ) - logging.debug(f"DataFrame prepared for viewpoint {viewpoint}.") - return y_pred_df - except Exception as e: - raise CustomException( - f"Error preparing prediction DataFrame: {e}", sys - ) from e + logging.debug(f"DataFrame prepared for viewpoint {viewpoint}.") + return y_pred_df def make_single_fixed_viewpoint_prediction( self, @@ -182,37 +168,31 @@ def make_single_fixed_viewpoint_prediction( - `BasePipeline` class docstring (“Covariate semantics”) - `BasePipeline.split_data_all_beliefs` → `_generate_splits` """ - try: - logging.debug( - f"Predicting for viewpoint {viewpoint}, forecasting up to {self.total_forecast_hours} hours ahead." - ) - # Inputs (y, past_covariates, future_covariates) are pre-sliced for this - # belief time by BasePipeline._generate_splits. See BasePipeline docs and - # CHECK THIS DIAGRAM : https://cloud.seita.nl/index.php/s/FYRgJwE3ER8kTLk aka 20250210_123637.png + logging.debug( + f"Predicting for viewpoint {viewpoint}, forecasting up to {self.total_forecast_hours} hours ahead." 
+ ) + # Inputs (y, past_covariates, future_covariates) are pre-sliced for this + # belief time by BasePipeline._generate_splits. See BasePipeline docs and + # CHECK THIS DIAGRAM : https://cloud.seita.nl/index.php/s/FYRgJwE3ER8kTLk aka 20250210_123637.png - # Get time series of forecasts at a single viewpoint - y_pred = model.predict( - current_y, - past_covariates=past_covariates, - future_covariates=future_covariates, - ) + # Get time series of forecasts at a single viewpoint + y_pred = model.predict( + current_y, + past_covariates=past_covariates, + future_covariates=future_covariates, + ) - belief_horizon = current_y.end_time() - value_at_belief_horizon = current_y.last_value() - y_pred_df = self._prepare_df_single_horizon_prediction( - y_pred=y_pred, - belief_horizon=belief_horizon, - value_at_belief_horizon=value_at_belief_horizon, - viewpoint=viewpoint, - belief_timestamp=belief_timestamp, - ) - logging.debug(f"Prediction for viewpoint {viewpoint} completed.") - return y_pred_df - except Exception as e: - raise CustomException( - f"Error predicting for viewpoint {viewpoint}: {e}", - sys, - ) from e + belief_horizon = current_y.end_time() + value_at_belief_horizon = current_y.last_value() + y_pred_df = self._prepare_df_single_horizon_prediction( + y_pred=y_pred, + belief_horizon=belief_horizon, + value_at_belief_horizon=value_at_belief_horizon, + viewpoint=viewpoint, + belief_timestamp=belief_timestamp, + ) + logging.debug(f"Prediction for viewpoint {viewpoint} completed.") + return y_pred_df def make_multi_fixed_viewpoint_predictions( self, @@ -225,101 +205,89 @@ def make_multi_fixed_viewpoint_predictions( """ Make predictions for multiple fixed viewpoints, for the given model, X, and y. """ - try: + logging.debug( + f"Starting to generate predictions for up to {self.max_forecast_horizon} ({self.readable_resolution}) intervals (i.e. {self.total_forecast_hours} hours)." 
+ ) + + # We make predictions up to the last hour in the predict_period + y_pred_dfs = list() + for v, belief_timestamp in enumerate(belief_timestamps_list): + future_covariates = ( + future_covariates_list[v] if future_covariates_list else None + ) + past_covariates = past_covariates_list[v] if past_covariates_list else None + y = y_list[v] logging.debug( - f"Starting to generate predictions for up to {self.max_forecast_horizon} ({self.readable_resolution}) intervals (i.e. {self.total_forecast_hours} hours)." + f"Making prediction for {belief_timestamp} (viewpoint {v + 1}/{self.number_of_viewpoints})" ) - - # We make predictions up to the last hour in the predict_period - y_pred_dfs = list() - for v, belief_timestamp in enumerate(belief_timestamps_list): - future_covariates = ( - future_covariates_list[v] if future_covariates_list else None - ) - past_covariates = ( - past_covariates_list[v] if past_covariates_list else None - ) - y = y_list[v] - logging.debug( - f"Making prediction for {belief_timestamp} (viewpoint {v + 1}/{self.number_of_viewpoints})" - ) - y_pred_df = self.make_single_fixed_viewpoint_prediction( - model=model, - future_covariates=future_covariates, - past_covariates=past_covariates, - current_y=y, - viewpoint=v + 1, # humanized iterator starting from 1 - belief_timestamp=belief_timestamp, - ) - y_pred_dfs.append(y_pred_df) - df_res = pd.concat(y_pred_dfs) - logging.debug("Finished generating predictions.") - return df_res - except Exception as e: - raise CustomException(f"Error generating predictions: {e}", sys) from e + y_pred_df = self.make_single_fixed_viewpoint_prediction( + model=model, + future_covariates=future_covariates, + past_covariates=past_covariates, + current_y=y, + viewpoint=v + 1, # humanized iterator starting from 1 + belief_timestamp=belief_timestamp, + ) + y_pred_dfs.append(y_pred_df) + df_res = pd.concat(y_pred_dfs) + logging.debug("Finished generating predictions.") + return df_res def save_results_to_CSV(self, df_pred: 
pd.DataFrame): """ Save the predictions to a CSV file. """ - try: - logging.debug("Saving predictions to a CSV file.") - os.makedirs(os.path.dirname(self.output_path), exist_ok=True) - df_pred.to_csv(self.output_path) - logging.debug("Successfully saved predictions to %s", self.output_path) - - except Exception as e: - raise CustomException(f"Error saving predictions: {e}", sys) from e + logging.debug("Saving predictions to a CSV file.") + os.makedirs(os.path.dirname(self.output_path), exist_ok=True) + df_pred.to_csv(self.output_path) + logging.debug("Successfully saved predictions to %s", self.output_path) def run(self, delete_model: bool = False) -> BeliefsDataFrame: """ Execute the prediction pipeline. """ - try: - df = self.load_data_all_beliefs() - ( - past_covariates_list, - future_covariates_list, - y_list, - belief_timestamps_list, - ) = self.split_data_all_beliefs(df, is_predict_pipeline=True) - logging.debug("Done splitting data") + df = self.load_data_all_beliefs() + ( + past_covariates_list, + future_covariates_list, + y_list, + belief_timestamps_list, + ) = self.split_data_all_beliefs(df, is_predict_pipeline=True) + logging.debug("Done splitting data") - model = self.load_model() - logging.debug("Model loaded") - df_pred = self.make_multi_fixed_viewpoint_predictions( - model, - future_covariates_list=future_covariates_list, - past_covariates_list=past_covariates_list, - y_list=y_list, - belief_timestamps_list=belief_timestamps_list, - ) - logging.debug("Predictions ready to be saved") + model = self.load_model() + logging.debug("Model loaded") + df_pred = self.make_multi_fixed_viewpoint_predictions( + model, + future_covariates_list=future_covariates_list, + past_covariates_list=past_covariates_list, + y_list=y_list, + belief_timestamps_list=belief_timestamps_list, + ) + logging.debug("Predictions ready to be saved") - # todo: it looks like data_to_bdf should become a class method - bdf = data_to_bdf( - data=df_pred, - 
horizon=self.max_forecast_horizon, - probabilistic=self.probabilistic, - target_sensor=self.target_sensor, - sensor_to_save=self.sensor_to_save, - data_source=self.data_source, - ) - if self.output_path is not None: - self.save_results_to_CSV(bdf) + # todo: it looks like data_to_bdf should become a class method + bdf = data_to_bdf( + data=df_pred, + horizon=self.max_forecast_horizon, + probabilistic=self.probabilistic, + target_sensor=self.target_sensor, + sensor_to_save=self.sensor_to_save, + data_source=self.data_source, + ) + if self.output_path is not None: + self.save_results_to_CSV(bdf) - save_to_db( - bdf, save_changed_beliefs_only=False - ) # save all beliefs of forecasted values even if they are the same values as the previous beliefs. - db.session.commit() - logging.info( - f"Saved predictions to DB with source: {bdf.sources[0]}, sensor: {self.sensor_to_save}, sensor_id: {self.sensor_to_save.id}." - ) - if delete_model: - os.remove(self.model_path) + save_to_db( + bdf, save_changed_beliefs_only=False + ) # save all beliefs of forecasted values even if they are the same values as the previous beliefs. + db.session.commit() + logging.info( + f"Saved predictions to DB with source: {bdf.sources[0]}, sensor: {self.sensor_to_save}, sensor_id: {self.sensor_to_save.id}." 
+ ) + if delete_model: + os.remove(self.model_path) - logging.info("Prediction pipeline completed successfully.") + logging.info("Prediction pipeline completed successfully.") - return bdf - except Exception as e: - raise CustomException(f"Error running pipeline: {e}", sys) from e + return bdf diff --git a/flexmeasures/data/models/forecasting/pipelines/train.py b/flexmeasures/data/models/forecasting/pipelines/train.py index 2b7fc31751..eacde33137 100644 --- a/flexmeasures/data/models/forecasting/pipelines/train.py +++ b/flexmeasures/data/models/forecasting/pipelines/train.py @@ -2,7 +2,6 @@ import os import pickle -import sys import warnings import logging from datetime import datetime @@ -11,7 +10,6 @@ from flexmeasures import Sensor from flexmeasures.data.models.forecasting.custom_models.lgbm_model import CustomLGBM -from flexmeasures.data.models.forecasting.exceptions import CustomException from flexmeasures.data.models.forecasting.pipelines.base import BasePipeline warnings.filterwarnings("ignore") @@ -78,32 +76,26 @@ def train_model( """ Trains the specified model using the provided training data. """ - try: - logging.debug(f"Training model {model.__class__.__name__}") + logging.debug(f"Training model {model.__class__.__name__}") - model.fit( - series=y_train, - past_covariates=past_covariates, - future_covariates=future_covariates, - ) - logging.debug("Model trained successfully") - return model - except Exception as e: - raise CustomException(f"Error training model: {e}", sys) from e + model.fit( + series=y_train, + past_covariates=past_covariates, + future_covariates=future_covariates, + ) + logging.debug("Model trained successfully") + return model def save_model(self, model, model_name: str): """ Save the trained model to the model_save_path. 
""" - try: - model_save_path = os.path.join(self.model_save_dir, model_name) - # Ensure the directory exists - os.makedirs(self.model_save_dir, exist_ok=True) - with open(model_save_path, "wb") as file: - pickle.dump(model, file) - logging.debug(f"Model and metadata saved successfully to {model_save_path}") - except Exception as e: - raise CustomException(f"Error saving model and metadata: {e}", sys) from e + model_save_path = os.path.join(self.model_save_dir, model_name) + # Ensure the directory exists + os.makedirs(self.model_save_dir, exist_ok=True) + with open(model_save_path, "wb") as file: + pickle.dump(model, file) + logging.debug(f"Model and metadata saved successfully to {model_save_path}") def run(self, counter: int): """ @@ -112,36 +104,32 @@ def run(self, counter: int): This function loads the data, splits it into training and testing sets, trains multiple models on the training set, and saves the trained models. """ - try: - df = self.load_data_all_beliefs() - past_covariates_list, future_covariates_list, y_train_list, _ = ( - self.split_data_all_beliefs(df) - ) - past_covariates = past_covariates_list[0] if past_covariates_list else None - future_covariates = ( - future_covariates_list[0] if future_covariates_list else None - ) - y_train = y_train_list[0] - - models = { - f"sensor_{self.target_sensor.id}-cycle_{counter}-lgbm.pkl": CustomLGBM( - max_forecast_horizon=self.max_forecast_horizon, - probabilistic=self.probabilistic, - auto_regressive=self.auto_regressive, - use_past_covariates=past_covariates_list is not None, - use_future_covariates=future_covariates_list is not None, - ensure_positive=self.ensure_positive, - ) - } + df = self.load_data_all_beliefs() + past_covariates_list, future_covariates_list, y_train_list, _ = ( + self.split_data_all_beliefs(df) + ) + past_covariates = past_covariates_list[0] if past_covariates_list else None + future_covariates = ( + future_covariates_list[0] if future_covariates_list else None + ) + y_train = 
y_train_list[0] - for model_name, model in models.items(): - trained_model = self.train_model( - model=model, - future_covariates=future_covariates, - past_covariates=past_covariates, - y_train=y_train, - ) - self.save_model(trained_model, model_name) + models = { + f"sensor_{self.target_sensor.id}-cycle_{counter}-lgbm.pkl": CustomLGBM( + max_forecast_horizon=self.max_forecast_horizon, + probabilistic=self.probabilistic, + auto_regressive=self.auto_regressive, + use_past_covariates=past_covariates_list is not None, + use_future_covariates=future_covariates_list is not None, + ensure_positive=self.ensure_positive, + ) + } - except Exception as e: - raise CustomException(f"Error running training pipeline: {e}", sys) from e + for model_name, model in models.items(): + trained_model = self.train_model( + model=model, + future_covariates=future_covariates, + past_covariates=past_covariates, + y_train=y_train, + ) + self.save_model(trained_model, model_name) diff --git a/flexmeasures/data/models/forecasting/pipelines/train_predict.py b/flexmeasures/data/models/forecasting/pipelines/train_predict.py index e636419c57..7da3a98ffd 100644 --- a/flexmeasures/data/models/forecasting/pipelines/train_predict.py +++ b/flexmeasures/data/models/forecasting/pipelines/train_predict.py @@ -3,7 +3,6 @@ from typing import Any import os -import sys import time import logging from datetime import datetime, timedelta @@ -13,7 +12,6 @@ from flask import current_app from flexmeasures.data.models.forecasting import Forecaster -from flexmeasures.data.models.forecasting.exceptions import CustomException from flexmeasures.data.models.forecasting.pipelines.predict import PredictPipeline from flexmeasures.data.models.forecasting.pipelines.train import TrainPipeline from flexmeasures.data.schemas.forecasting.pipeline import ( @@ -36,7 +34,7 @@ def __init__( config: dict | None = None, delete_model: bool = False, save_config: bool = True, - save_parameters: bool = True, + save_parameters: bool = 
False, ): super().__init__( config=config, save_config=save_config, save_parameters=save_parameters @@ -72,18 +70,20 @@ def run_cycle( # Train model train_pipeline = TrainPipeline( - future_regressors=self._parameters["future_regressors"], - past_regressors=self._parameters["past_regressors"], - target_sensor=self._parameters["target"], + future_regressors=self._config["future_regressors"], + past_regressors=self._config["past_regressors"], + target_sensor=self._parameters["sensor"], model_save_dir=self._parameters["model_save_dir"], - n_steps_to_predict=self._parameters["train_period_in_hours"] * multiplier, + n_steps_to_predict=(predict_start - train_start) + // timedelta(hours=1) + * multiplier, max_forecast_horizon=self._parameters["max_forecast_horizon"] - // self._parameters["target"].event_resolution, + // self._parameters["sensor"].event_resolution, event_starts_after=train_start, event_ends_before=train_end, probabilistic=self._parameters["probabilistic"], - ensure_positive=self._parameters["ensure_positive"], - missing_threshold=self._parameters.get("missing_threshold"), + ensure_positive=self._config["ensure_positive"], + missing_threshold=self._config.get("missing_threshold"), ) logging.info(f"Training cycle from {train_start} to {train_end} started ...") @@ -95,26 +95,26 @@ def run_cycle( ) # Make predictions predict_pipeline = PredictPipeline( - future_regressors=self._parameters["future_regressors"], - past_regressors=self._parameters["past_regressors"], - target_sensor=self._parameters["target"], + future_regressors=self._config["future_regressors"], + past_regressors=self._config["past_regressors"], + target_sensor=self._parameters["sensor"], model_path=os.path.join( self._parameters["model_save_dir"], - f"sensor_{self._parameters['target'].id}-cycle_{counter}-lgbm.pkl", + f"sensor_{self._parameters['sensor'].id}-cycle_{counter}-lgbm.pkl", ), output_path=( os.path.join( self._parameters["output_path"], - 
f"sensor_{self._parameters['target'].id}-cycle_{counter}.csv", + f"sensor_{self._parameters['sensor'].id}-cycle_{counter}.csv", ) if self._parameters["output_path"] else None ), n_steps_to_predict=self._parameters["predict_period_in_hours"] * multiplier, max_forecast_horizon=self._parameters["max_forecast_horizon"] - // self._parameters["target"].event_resolution, + // self._parameters["sensor"].event_resolution, forecast_frequency=self._parameters["forecast_frequency"] - // self._parameters["target"].event_resolution, + // self._parameters["sensor"].event_resolution, probabilistic=self._parameters["probabilistic"], event_starts_after=train_start, # use beliefs about events before the start of the predict period event_ends_before=predict_end, # ignore any beliefs about events beyond the end of the predict period @@ -123,7 +123,7 @@ def run_cycle( predict_end=predict_end, sensor_to_save=self._parameters["sensor_to_save"], data_source=self.data_source, - missing_threshold=self._parameters.get("missing_threshold"), + missing_threshold=self._config.get("missing_threshold"), ) logging.info( f"Prediction cycle from {predict_start} to {predict_end} started ..." @@ -142,13 +142,53 @@ def run_cycle( f"{p.ordinal(counter)} Train-Predict cycle from {train_start} to {predict_end} completed in {total_runtime:.2f} seconds." ) self.return_values.append( - {"data": forecasts, "sensor": self._parameters["target"]} + {"data": forecasts, "sensor": self._parameters["sensor"]} ) return total_runtime - def _compute_forecast(self, **kwargs) -> list[dict[str, Any]]: + def _compute_forecast(self, as_job: bool = False, **kwargs) -> list[dict[str, Any]]: # Run the train-and-predict pipeline - return self.run(**kwargs) + return self.run(as_job=as_job, **kwargs) + + def _derive_training_period(self) -> tuple[datetime, datetime]: + """Derive the effective training period for model fitting. 
+
+        The training period ends at ``predict_start`` and starts at the
+        most restrictive (latest) of the following:
+
+        - The configured ``train_start`` (if any)
+        - ``predict_start - train_period_in_hours`` (if configured)
+        - ``predict_start - max_training_period`` (always enforced)
+
+        Additionally, the resulting training window is guaranteed to span
+        at least two days.
+
+        :return: A tuple ``(train_start, train_end)`` defining the training window.
+        """
+        train_end = self._parameters["predict_start"]
+
+        configured_start: datetime | None = self._config.get("train_start")
+        period_hours: int | None = self._config.get("train_period_in_hours")
+
+        candidates: list[datetime] = []
+
+        if configured_start is not None:
+            candidates.append(configured_start)
+
+        if period_hours is not None:
+            candidates.append(train_end - timedelta(hours=period_hours))
+
+        # Always enforce maximum training period
+        candidates.append(train_end - self._config["max_training_period"])
+
+        train_start = max(candidates)
+
+        # Enforce minimum training period of 2 days
+        min_training_period = timedelta(days=2)
+        if train_end - train_start < min_training_period:
+            train_start = train_end - min_training_period
+
+        return train_start, train_end
 
     def run(
         self,
@@ -156,131 +196,126 @@ def run(
         queue: str = "forecasting",
         **job_kwargs,
     ):
-        try:
+        logging.info(
+            f"Starting Train-Predict Pipeline to predict for {self._parameters['predict_period_in_hours']} hours."
+ ) + # How much to move forward to the next cycle one prediction period later + cycle_frequency = max( + self._config["retrain_frequency"], + self._parameters["forecast_frequency"], + ) + + predict_start = self._parameters["predict_start"] + predict_end = predict_start + cycle_frequency + + # Determine training window (start, end) + train_start, train_end = self._derive_training_period() + + sensor_resolution = self._parameters["sensor"].event_resolution + multiplier = int( + timedelta(hours=1) / sensor_resolution + ) # multiplier used to adapt n_steps_to_predict to hours from sensor resolution, e.g. 15 min sensor resolution will have 7*24*4 = 168 predictions to predict a week + + # Compute number of training cycles (at least 1) + n_cycles = max( + timedelta(hours=self._parameters["predict_period_in_hours"]) + // max( + self._config["retrain_frequency"], + self._parameters["forecast_frequency"], + ), + 1, + ) + + cumulative_cycles_runtime = 0 # To track the cumulative runtime of TrainPredictPipeline cycles when not running as a job. + cycles_job_params = [] + for counter in range(n_cycles): + predict_end = min(predict_end, self._parameters["end_date"]) + + train_predict_params = { + "train_start": train_start, + "train_end": train_end, + "predict_start": predict_start, + "predict_end": predict_end, + "counter": counter + 1, + "multiplier": multiplier, + } + + if not as_job: + cycle_runtime = self.run_cycle(**train_predict_params) + cumulative_cycles_runtime += cycle_runtime + else: + train_predict_params["target_sensor_id"] = self._parameters["sensor"].id + cycles_job_params.append(train_predict_params) + + train_end += cycle_frequency + predict_start += cycle_frequency + predict_end += cycle_frequency + if not as_job: logging.info( - f"Starting Train-Predict Pipeline to predict for {self._parameters['predict_period_in_hours']} hours." + f"Train-Predict Pipeline completed successfully in {cumulative_cycles_runtime:.2f} seconds." 
) - predict_start = self._parameters["predict_start"] - predict_end = predict_start + timedelta( - hours=self._parameters["predict_period_in_hours"] - ) - train_start = predict_start - timedelta( - hours=self._parameters["train_period_in_hours"] - ) - train_end = predict_start - counter = 0 - - sensor_resolution = self._parameters["target"].event_resolution - multiplier = int( - timedelta(hours=1) / sensor_resolution - ) # multiplier used to adapt n_steps_to_predict to hours from sensor resolution, e.g. 15 min sensor resolution will have 7*24*4 = 168 predicitons to predict a week - - cumulative_cycles_runtime = 0 # To track the cumulative runtime of TrainPredictPipeline cycles when not running as a job. - cycles_job_params = [] - while predict_end <= self._parameters["end_date"]: - counter += 1 - - train_predict_params = { - "train_start": train_start, - "train_end": train_end, - "predict_start": predict_start, - "predict_end": predict_end, - "counter": counter, - "multiplier": multiplier, - } - - if not as_job: - cycle_runtime = self.run_cycle(**train_predict_params) - cumulative_cycles_runtime += cycle_runtime - else: - train_predict_params["target_sensor_id"] = self._parameters[ - "target" - ].id - cycles_job_params.append(train_predict_params) - - # Move forward to the next cycle one prediction period later - cycle_frequency = timedelta( - hours=self._parameters["predict_period_in_hours"] - ) - train_end += cycle_frequency - predict_start += cycle_frequency - predict_end += cycle_frequency - if counter == 0: - logging.info( - f"Train-Predict Pipeline Not Run: start-predict-date + predict-period is {predict_end}, which exceeds end-date {self._parameters['end_date']}. " - f"Try decreasing the predict-period." - ) - elif not as_job: - logging.info( - f"Train-Predict Pipeline completed successfully in {cumulative_cycles_runtime:.2f} seconds." 
- ) + if as_job: + cycle_job_ids = [] - if as_job: - cycle_job_ids = [] - for cycle_params in cycles_job_params: - # job metadata for tracking - job_metadata = { - "data_source_info": {"id": self.data_source.id}, - "start_predict_date": self._parameters["predict_start"], - "end_date": self._parameters["end_date"], - "sensor_id": self._parameters["sensor_to_save"].id, - } - job = Job.create( - self.run_cycle, - # Some cycle job params override job kwargs - kwargs={**job_kwargs, **cycle_params}, - connection=current_app.queues[queue].connection, - ttl=int( - current_app.config.get( - "FLEXMEASURES_JOB_TTL", timedelta(-1) - ).total_seconds() - ), - result_ttl=int( - current_app.config.get( - "FLEXMEASURES_PLANNING_TTL", timedelta(-1) - ).total_seconds() - ), # NB job.cleanup docs says a negative number of seconds means persisting forever - meta=job_metadata, - timeout=60 * 60, # 1 hour - ) - - # Store the job ID for this cycle - cycle_job_ids.append(job.id) - - current_app.queues[queue].enqueue_job(job) - current_app.job_cache.add( - self._parameters["target"].id, - job_id=job.id, - queue=queue, - asset_or_sensor_type="sensor", - ) - - wrap_up_job = Job.create( - self.run_wrap_up, - kwargs={ - "cycle_job_ids": cycle_job_ids - }, # cycles jobs IDs to wait for + # job metadata for tracking + job_metadata = { + "data_source_info": {"id": self.data_source.id}, + "start": self._parameters["predict_start"], + "end": self._parameters["end_date"], + "sensor_id": self._parameters["sensor_to_save"].id, + } + for cycle_params in cycles_job_params: + + job = Job.create( + self.run_cycle, + # Some cycle job params override job kwargs + kwargs={**job_kwargs, **cycle_params}, connection=current_app.queues[queue].connection, - depends_on=cycle_job_ids, # wrap-up job depends on all cycle jobs ttl=int( current_app.config.get( "FLEXMEASURES_JOB_TTL", timedelta(-1) ).total_seconds() ), + result_ttl=int( + current_app.config.get( + "FLEXMEASURES_PLANNING_TTL", timedelta(-1) + 
).total_seconds() + ), # NB job.cleanup docs says a negative number of seconds means persisting forever meta=job_metadata, + timeout=60 * 60, # 1 hour + ) + + # Store the job ID for this cycle + cycle_job_ids.append(job.id) + + current_app.queues[queue].enqueue_job(job) + current_app.job_cache.add( + self._parameters["sensor"].id, + job_id=job.id, + queue=queue, + asset_or_sensor_type="sensor", ) - current_app.queues[queue].enqueue_job(wrap_up_job) - - if len(cycle_job_ids) > 1: - # Return the wrap-up job ID if multiple cycle jobs are queued - return wrap_up_job.id - else: - # Return the single cycle job ID if only one job is queued - return cycle_job_ids[0] - - return self.return_values - except Exception as e: - raise CustomException( - f"Error running Train-Predict Pipeline: {e}", sys - ) from e + + wrap_up_job = Job.create( + self.run_wrap_up, + kwargs={"cycle_job_ids": cycle_job_ids}, # cycles jobs IDs to wait for + connection=current_app.queues[queue].connection, + depends_on=cycle_job_ids, # wrap-up job depends on all cycle jobs + ttl=int( + current_app.config.get( + "FLEXMEASURES_JOB_TTL", timedelta(-1) + ).total_seconds() + ), + meta=job_metadata, + ) + current_app.queues[queue].enqueue_job(wrap_up_job) + + if len(cycle_job_ids) > 1: + # Return the wrap-up job ID if multiple cycle jobs are queued + return wrap_up_job.id + else: + # Return the single cycle job ID if only one job is queued + return cycle_job_ids[0] if len(cycle_job_ids) == 1 else wrap_up_job.id + + return self.return_values diff --git a/flexmeasures/data/schemas/forecasting/pipeline.py b/flexmeasures/data/schemas/forecasting/pipeline.py index 10201cc3af..c60178c933 100644 --- a/flexmeasures/data/schemas/forecasting/pipeline.py +++ b/flexmeasures/data/schemas/forecasting/pipeline.py @@ -4,11 +4,23 @@ import os from datetime import timedelta +from isodate.duration import Duration -from marshmallow import fields, Schema, validates_schema, post_load, ValidationError +from marshmallow import ( + 
fields, + Schema, + validates_schema, + pre_load, + post_load, + ValidationError, +) from flexmeasures.data.schemas import SensorIdField -from flexmeasures.data.schemas.times import AwareDateTimeOrDateField, DurationField +from flexmeasures.data.schemas.times import ( + AwareDateTimeOrDateField, + DurationField, + PlanningDurationField, +) from flexmeasures.data.models.forecasting.utils import floor_to_resolution from flexmeasures.utils.time_utils import server_now @@ -16,204 +28,354 @@ class TrainPredictPipelineConfigSchema(Schema): model = fields.String(load_default="CustomLGBM") - - -class ForecasterParametersSchema(Schema): - - sensor = SensorIdField( - required=True, - metadata={ - "description": "ID of the sensor to forecast.", - "example": 2092, - }, - ) future_regressors = fields.List( SensorIdField(), - required=False, + data_key="future-regressors", + load_default=[], metadata={ - "description": "Sensor IDs to be treated only as future regressors.", + "description": ( + "Sensor IDs to be treated only as future regressors." + " Use this if only forecasts recorded on this sensor matter as a regressor." + ), "example": [2093, 2094], + "cli": { + "option": "--future-regressors", + }, }, ) past_regressors = fields.List( SensorIdField(), - required=False, + data_key="past-regressors", + load_default=[], metadata={ - "description": "Sensor IDs to be treated only as past regressors.", + "description": ( + "Sensor IDs to be treated only as past regressors." + " Use this if only realizations recorded on this sensor matter as a regressor." + ), "example": [2095], + "cli": { + "option": "--past-regressors", + }, }, ) regressors = fields.List( SensorIdField(), - required=False, + data_key="regressors", + load_default=[], metadata={ - "description": "Sensor IDs used as both past and future regressors.", + "description": ( + "Sensor IDs used as both past and future regressors." 
+ " Use this if both realizations and forecasts recorded on this sensor matter as a regressor." + ), "example": [2093, 2094, 2095], + "cli": { + "option": "--regressors", + }, }, ) - model_save_dir = fields.Str( - required=False, - allow_none=True, - load_default="flexmeasures/data/models/forecasting/artifacts/models", + missing_threshold = fields.Float( + data_key="missing-threshold", + load_default=1.0, metadata={ - "description": "Directory to save the trained model.", - "example": "flexmeasures/data/models/forecasting/artifacts/models", + "description": "Maximum fraction of missing data allowed before raising an error. Defaults to 1.0.", + "example": 0.1, + "cli": { + "option": "--missing-threshold", + "extra_help": "Missing data under this threshold will be filled using forward filling or linear interpolation.", + }, }, ) - output_path = fields.Str( - required=False, + ensure_positive = fields.Bool( + data_key="ensure-positive", + load_default=False, allow_none=True, metadata={ - "description": "Directory to save prediction outputs. Defaults to None (no outputs saved).", - "example": "flexmeasures/data/models/forecasting/artifacts/forecasts", + "description": "Whether to clip negative values in forecasts. Defaults to None (disabled).", + "example": True, + "cli": { + "option": "--ensure-positive", + }, }, ) - start_date = AwareDateTimeOrDateField( + train_start = AwareDateTimeOrDateField( + data_key="train-start", required=False, allow_none=True, metadata={ - "description": "Timestamp marking the start of training data. Defaults to train_period before start_predict_date if not set.", + "description": "Timestamp marking the start of training data. 
Defaults to train_period before start if not set.", "example": "2025-01-01T00:00:00+01:00", - }, - ) - end_date = AwareDateTimeOrDateField( - required=True, - inclusive=True, - metadata={ - "description": "End date for running the pipeline.", - "example": "2025-10-15T00:00:00+01:00", + "cli": { + "cli-exclusive": True, + "option": "--train-start", + "aliases": ["--start-date", "--train-start"], + }, }, ) train_period = DurationField( - required=False, + data_key="train-period", + load_default=timedelta(days=30), allow_none=True, metadata={ - "description": "Duration of the initial training period (ISO 8601 format, min 2 days). If not set, derived from start_date and start_predict_date or defaults to P30D (30 days).", + "description": "Duration of the initial training period (ISO 8601 format, min 2 days). If not set, derived from train_start and start, or defaults to P30D (30 days).", "example": "P7D", + "cli": { + "cli-exclusive": True, + "option": "--train-period", + }, }, ) - start_predict_date = AwareDateTimeOrDateField( - required=False, + max_training_period = DurationField( + data_key="max-training-period", + load_default=timedelta(days=365), allow_none=True, metadata={ - "description": "Start date for predictions. Defaults to now, floored to the sensor resolution, so that the first forecast is about the ongoing event.", - "example": "2025-01-08T00:00:00+01:00", + "description": "Maximum duration of the training period. Defaults to 1 year (P1Y).", + "example": "P1Y", + "cli": { + "cli-exclusive": True, + "option": "--max-training-period", + }, }, ) retrain_frequency = DurationField( - required=False, + data_key="retrain-frequency", + load_default=PlanningDurationField.load_default, allow_none=True, metadata={ "description": "Frequency of retraining/prediction cycle (ISO 8601 duration). 
Defaults to prediction window length if not set.", "example": "PT24H", + "cli": { + "cli-exclusive": True, + "option": "--retrain-frequency", + }, }, ) - max_forecast_horizon = DurationField( - required=False, + + @validates_schema + def validate_parameters(self, data: dict, **kwargs): # noqa: C901 + if data["retrain_frequency"] < timedelta(hours=1): + raise ValidationError( + "retrain-frequency must be at least 1 hour", + field_name="retrain_frequency", + ) + + train_period = data.get("train_period") + max_training_period = data.get("max_training_period") + + if train_period is not None and train_period < timedelta(days=2): + raise ValidationError( + "train-period must be at least 2 days (48 hours)", + field_name="train_period", + ) + + if isinstance(max_training_period, Duration): + # DurationField only returns Duration when years/months are present + raise ValidationError( + "max-training-period must be specified using days or smaller units " + "(e.g. P365D, PT48H). Years and months are not supported.", + field_name="max_training_period", + ) + + @post_load + def resolve_config(self, data: dict, **kwargs) -> dict: # noqa: C901 + + future_regressors = data.get("future_regressors", []) + past_regressors = data.get("past_regressors", []) + past_and_future_regressors = data.pop("regressors", []) + + if past_and_future_regressors: + future_regressors = list( + set(future_regressors + past_and_future_regressors) + ) + past_regressors = list(set(past_regressors + past_and_future_regressors)) + + data["future_regressors"] = future_regressors + data["past_regressors"] = past_regressors + + train_period_in_hours = data["train_period"] // timedelta(hours=1) + max_training_period = data["max_training_period"] + if train_period_in_hours > max_training_period // timedelta(hours=1): + train_period_in_hours = max_training_period // timedelta(hours=1) + logging.warning( + f"train-period is greater than max-training-period ({max_training_period}), setting train-period to 
max-training-period", + ) + + data["train_period_in_hours"] = train_period_in_hours + return data + + +class ForecasterParametersSchema(Schema): + """ + NB cli-exclusive fields are not exposed via the API (removed by make_openapi_compatible). + """ + + sensor = SensorIdField( + data_key="sensor", + required=True, + metadata={ + "description": "ID of the sensor to forecast.", + "example": 2092, + "cli": { + "option": "--sensor", + }, + }, + ) + model_save_dir = fields.Str( + data_key="model-save-dir", allow_none=True, - load_default=timedelta(hours=48), + load_default="flexmeasures/data/models/forecasting/artifacts/models", metadata={ - "description": "Maximum forecast horizon. Defaults to 48 hours if not set.", - "example": "PT48H", + "description": "Directory to save the trained model.", + "example": "flexmeasures/data/models/forecasting/artifacts/models", + "cli": { + "cli-exclusive": True, + "option": "--model-save-dir", + }, }, ) - forecast_frequency = DurationField( + output_path = fields.Str( + data_key="output-path", required=False, allow_none=True, - load_default=timedelta(hours=1), metadata={ - "description": "How often to recompute forecasts. Defaults to 1 hour.", - "example": "PT1H", + "description": "Directory to save prediction outputs. Defaults to None (no outputs saved).", + "example": "flexmeasures/data/models/forecasting/artifacts/forecasts", + "cli": { + "cli-exclusive": True, + "option": "--output-path", + }, }, ) - probabilistic = fields.Bool( + duration = PlanningDurationField( + load_default=PlanningDurationField.load_default, + metadata=dict( + description="The duration for which to create the forecast, in ISO 8601 duration format. 
Defaults to the planning horizon.", + example="PT24H", + cli={ + "option": "--duration", + "aliases": ["--predict-period"], + }, + ), + ) + end = AwareDateTimeOrDateField( + data_key="end", required=False, - load_default=False, + allow_none=True, + inclusive=True, metadata={ - "description": "Enable probabilistic predictions if True. Defaults to false.", - "example": False, + "description": "End of the last event forecasted. Use either this field or the duration field.", + "example": "2025-10-15T00:00:00+01:00", + "cli": { + "cli-exclusive": True, + "option": "--end", + "aliases": ["--end-date", "--to-date"], + }, }, ) - sensor_to_save = SensorIdField( + start = AwareDateTimeOrDateField( + data_key="start", required=False, allow_none=True, metadata={ - "description": "Sensor ID where forecasts will be saved; defaults to target sensor.", - "example": 2092, + "description": "Start date for predictions. Defaults to now, floored to the sensor resolution, so that the first forecast is about the ongoing event.", + "example": "2025-01-08T00:00:00+01:00", + "cli": { + "option": "--start", + "aliases": ["--start-predict-date", "--from-date"], + }, }, ) - ensure_positive = fields.Bool( + max_forecast_horizon = DurationField( + data_key="max-forecast-horizon", required=False, allow_none=True, metadata={ - "description": "Whether to clip negative values in forecasts. Defaults to None (disabled).", - "example": True, + "description": "Maximum forecast horizon. Defaults to covering the whole prediction period (which itself defaults to 48 hours).", + "example": "PT48H", + "cli": { + "cli-exclusive": True, + "option": "--max-forecast-horizon", + }, }, ) - missing_threshold = fields.Float( + forecast_frequency = DurationField( + data_key="forecast-frequency", required=False, - load_default=1.0, + allow_none=True, metadata={ - "description": "Maximum fraction of missing data allowed before raising an error. 
Defaults to 1.0.", - "example": 0.1, + "description": "How often to recompute forecasts. This setting can be used to get forecasts from multiple viewpoints, which is especially useful for running simulations. Defaults to the max-forecast-horizon.", + "example": "PT1H", + "cli": { + "option": "--forecast-frequency", + }, }, ) - as_job = fields.Bool( + probabilistic = fields.Bool( + data_key="probabilistic", load_default=False, metadata={ - "description": "If True, compute forecasts asynchronously using RQ jobs. Defaults to False.", - "example": True, + "description": "Enable probabilistic predictions if True. Defaults to false.", + "example": False, + "cli": { + "cli-exclusive": True, + "option": "--probabilistic", + }, }, ) - max_training_period = DurationField( + sensor_to_save = SensorIdField( + data_key="sensor-to-save", required=False, allow_none=True, metadata={ - "description": "Maximum duration of the training period. Defaults to 1 year (P1Y).", - "example": "P1Y", + "description": "Sensor ID where forecasts will be saved; defaults to target sensor.", + "example": 2092, + "cli": { + "option": "--sensor-to-save", + }, }, ) + @pre_load + def sanitize_input(self, data, **kwargs): + + # Check predict period + if len({"start", "end", "duration"} & data.keys()) > 2: + raise ValidationError( + "Provide 'duration' with either 'start' or 'end', but not with both.", + field_name="duration", + ) + + # Drop None values + data = {k: v for k, v in data.items() if v is not None} + + return data + @validates_schema - def validate_parameters(self, data: dict, **kwargs): - start_date = data["start_date"] - end_date = data["end_date"] - predict_start = data.get("start_predict_date", None) - train_period = data.get("train_period") - retrain_frequency = data.get("retrain_frequency") + def validate_parameters(self, data: dict, **kwargs): # noqa: C901 + end_date = data.get("end") + predict_start = data.get("start", None) max_forecast_horizon = data.get("max_forecast_horizon") 
forecast_frequency = data.get("forecast_frequency") sensor = data.get("sensor") - if start_date >= end_date: - raise ValidationError( - "start-date must be before end-date", field_name="start_date" - ) + # todo: consider moving this to the run method in train_predict.py + # if train_start is not None and end is not None and train_start >= end_date: + # raise ValidationError( + # "train_start must be before end", field_name="train-start" + # ) if predict_start: - if predict_start < start_date: - raise ValidationError( - "start-predict-date cannot be before start-date", - field_name="start_predict_date", - ) - if predict_start >= end_date: + # if train_start is not None and predict_start < train_start: + # raise ValidationError( + # "start cannot be before start", + # field_name="start", + # ) + if end_date is not None and predict_start >= end_date: raise ValidationError( - "start-predict-date must be before end-date", - field_name="start_predict_date", + "start must be before end", + field_name="start", ) - if train_period is not None and train_period < timedelta(days=2): - raise ValidationError( - "train-period must be at least 2 days (48 hours)", - field_name="train_period", - ) - - if retrain_frequency is not None and retrain_frequency <= timedelta(0): - raise ValidationError( - "retrain-frequency must be greater than 0", - field_name="retrain_frequency", - ) - if max_forecast_horizon is not None: if max_forecast_horizon % sensor.event_resolution != timedelta(0): raise ValidationError( @@ -226,82 +388,79 @@ def validate_parameters(self, data: dict, **kwargs): f"forecast-frequency must be a multiple of the sensor resolution ({sensor.event_resolution})" ) - @post_load - def resolve_config(self, data: dict, **kwargs) -> dict: # noqa: C901 + @post_load(pass_original=True) + def resolve_config( # noqa: C901 + self, data: dict, original_data: dict | None = None, **kwargs + ) -> dict: + """Resolve timing parameters, using sensible defaults and choices. 
- target_sensor = data["sensor"] + Defaults: + 1. predict-period defaults to minimum of (FM planning horizon and max-forecast-horizon) only if there is a single default viewpoint. + 2. max-forecast-horizon defaults to the predict-period + 3. forecast-frequency defaults to minimum of (FM planning horizon, predict-period, max-forecast-horizon) - future_regressors = data.get("future_regressors", []) - past_regressors = data.get("past_regressors", []) - past_and_future_regressors = data.get("regressors", []) + Choices: + 1. If max-forecast-horizon < predict-period, we raise a ValidationError due to incomplete coverage + 2. retraining-frequency becomes the maximum of (FM planning horizon and forecast-frequency); this is capped by the predict-period. + """ - if past_and_future_regressors: - future_regressors = list( - set(future_regressors + past_and_future_regressors) - ) - past_regressors = list(set(past_regressors + past_and_future_regressors)) + target_sensor = data["sensor"] resolution = target_sensor.event_resolution now = server_now() floored_now = floor_to_resolution(now, resolution) - predict_start = data.get("start_predict_date") or floored_now - save_belief_time = ( - now if data.get("start_predict_date") is None else predict_start - ) + if data.get("start") is None: + if original_data.get("duration") and data.get("end") is not None: + predict_start = data["end"] - data["duration"] + else: + predict_start = floored_now + else: + predict_start = data["start"] + + save_belief_time = now if data.get("start") is None else predict_start - if data.get("start_predict_date") is None and data.get("train_period"): + if data.get("end") is None: + data["end"] = predict_start + data["duration"] - predict_start = data["start_date"] + data["train_period"] - save_belief_time = None + predict_period = ( + data["end"] - predict_start if data.get("end") else data["duration"] + ) + forecast_frequency = data.get("forecast_frequency") - if data.get("train_period") is None and 
data["start_date"] is None: - train_period_in_hours = 30 * 24 # Set default train_period value to 30 days + max_forecast_horizon = data.get("max_forecast_horizon") - elif data.get("train_period") is None and data["start_date"]: - train_period_in_hours = int( - (predict_start - data["start_date"]).total_seconds() / 3600 + # Check for inconsistent parameters explicitly set + if ( + "max-forecast-horizon" in original_data + and "duration" in original_data + and max_forecast_horizon < predict_period + ): + raise ValidationError( + "This combination of parameters will not yield forecasts for the entire prediction window.", + field_name="max_forecast_horizon", ) - else: - train_period_in_hours = data["train_period"] // timedelta(hours=1) - if train_period_in_hours < 48: + if max_forecast_horizon is None: + max_forecast_horizon = predict_period + elif max_forecast_horizon > predict_period: raise ValidationError( - "train-period must be at least 2 days (48 hours)", - field_name="train_period", + "max-forecast-horizon must be less than or equal to predict-period", + field_name="max_forecast_horizon", ) - max_training_period = data.get("max_training_period") or timedelta(days=365) - if train_period_in_hours > max_training_period // timedelta(hours=1): - train_period_in_hours = max_training_period // timedelta(hours=1) - logging.warning( - f"train-period is greater than max-training-period ({max_training_period}), setting train-period to max-training-period", + elif max_forecast_horizon < predict_period and forecast_frequency is None: + # Update the default predict-period if the user explicitly set a smaller max-forecast-horizon, + # unless they also set a forecast-frequency explicitly + predict_period = max_forecast_horizon + + if forecast_frequency is None: + forecast_frequency = min( + max_forecast_horizon, + predict_period, ) - if data.get("retrain_frequency") is None: - retrain_frequency_in_hours = int( - (data["end_date"] - predict_start).total_seconds() / 3600 - ) - 
else: - retrain_frequency_in_hours = data["retrain_frequency"] // timedelta(hours=1) - if retrain_frequency_in_hours < 1: - raise ValidationError("retrain-frequency must be at least 1 hour") - - if data["start_date"] is None: - start_date = predict_start - timedelta(hours=train_period_in_hours) - else: - start_date = data["start_date"] - - max_forecast_horizon = data.get("max_forecast_horizon") - forecast_frequency = data.get("forecast_frequency") - - if max_forecast_horizon is None and forecast_frequency is None: - max_forecast_horizon = timedelta(hours=retrain_frequency_in_hours) - forecast_frequency = timedelta(hours=retrain_frequency_in_hours) - elif max_forecast_horizon is None: - max_forecast_horizon = forecast_frequency - elif forecast_frequency is None: - forecast_frequency = max_forecast_horizon + predict_period_in_hours = int(predict_period.total_seconds() / 3600) if data.get("sensor_to_save") is None: sensor_to_save = target_sensor @@ -317,26 +476,30 @@ def resolve_config(self, data: dict, **kwargs) -> dict: # noqa: C901 # Read default from schema model_save_dir = self.fields["model_save_dir"].load_default - ensure_positive = data.get("ensure_positive") + m_viewpoints = max(predict_period // forecast_frequency, 1) return dict( - future_regressors=future_regressors, - past_regressors=past_regressors, - target=target_sensor, + sensor=target_sensor, model_save_dir=model_save_dir, output_path=output_path, - start_date=start_date, - end_date=data["end_date"], - train_period_in_hours=train_period_in_hours, - max_training_period=max_training_period, + end_date=data["end"], predict_start=predict_start, - predict_period_in_hours=retrain_frequency_in_hours, + predict_period_in_hours=predict_period_in_hours, max_forecast_horizon=max_forecast_horizon, forecast_frequency=forecast_frequency, - probabilistic=data["probabilistic"], + probabilistic=data.get("probabilistic"), sensor_to_save=sensor_to_save, - ensure_positive=ensure_positive, - 
missing_threshold=data.get("missing_threshold"), - as_job=data.get("as_job"), save_belief_time=save_belief_time, + m_viewpoints=m_viewpoints, ) + + +class ForecastingTriggerSchema(ForecasterParametersSchema): + + config = fields.Nested( + TrainPredictPipelineConfigSchema(), + required=False, + metadata={ + "description": "Changing any of these will result in a new data source ID." + }, + ) diff --git a/flexmeasures/data/schemas/tests/test_forecasting.py b/flexmeasures/data/schemas/tests/test_forecasting.py new file mode 100644 index 0000000000..ed14afa2f8 --- /dev/null +++ b/flexmeasures/data/schemas/tests/test_forecasting.py @@ -0,0 +1,609 @@ +import pytest + +from marshmallow import ValidationError +import pandas as pd + +from flexmeasures.data.schemas.forecasting.pipeline import ForecasterParametersSchema +from flexmeasures.data.schemas.utils import kebab_to_snake + + +@pytest.mark.parametrize( + ["timing_input", "expected_timing_output"], + [ + # Case 0: no timing parameters are given + # + # User expects to get forecasts for the default FM planning horizon from a single viewpoint (server now, floored to the hour). 
+ # Specifically, we expect: + # - predict-period = FM planning horizon + # - max-forecast-horizon = FM planning horizon + # - forecast-frequency = FM planning horizon + # - (config) retraining-frequency = FM planning horizon + # - 1 cycle, 1 belief time + # - training-period = 30 days + ( + {}, + { + "predict-start": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + ).floor("1h"), + # default training period 30 days before predict start + # "start-date": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + # ).floor("1h") + # - pd.Timedelta(days=30), + # default prediction period 48 hours after predict start + "end-date": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + ).floor("1h") + + pd.Timedelta(hours=48), + # these are set by the schema defaults + "predict-period-in-hours": 48, + "max-forecast-horizon": pd.Timedelta(days=2), + # "train-period-in-hours": 24 * 30, + # "retrain_frequency": 2 * 24, + # "max-training-period": pd.Timedelta(days=365), + "forecast-frequency": pd.Timedelta(days=2), + # the one belief time corresponds to server now + "save-belief-time": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ), + "m_viewpoints": 1, + }, + ), + # Case 1: predict-period = 12 hours + # + # User expects to get forecasts for the next 12 hours from a single viewpoint. 
+ # Specifically, we expect: + # - max-forecast-horizon = predict-period = 12 hours + # - forecast-frequency = predict-period = 12 hours + # - (config) retraining-frequency = FM planning horizon, but capped by predict-period, so 12 hours + # - 1 cycle, 1 belief time + # - training-period = 30 days + ( + {"duration": "PT12H"}, + { + "predict_start": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + ).floor("1h"), + # "start_date": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + # ).floor("1h") + # - pd.Timedelta(days=30), + # "train_period_in_hours": 24 * 30, + "predict_period_in_hours": 12, + "max_forecast_horizon": pd.Timedelta(hours=12), + "forecast_frequency": pd.Timedelta(hours=12), + "end_date": pd.Timestamp( + "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + ) + + pd.Timedelta(hours=12), + # "retrain_frequency": 2 * 24, + # "max_training_period": pd.Timedelta(days=365), + "save_belief_time": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + ), + "m_viewpoints": 1, + }, + ), + # Case 2: max-forecast-horizon = 12 hours # here we have issue that predict period is defaulted to 48 hours, but max-forecast-horizon is set to 12 hours, which should be less than or equal to predict-period + # + # User expects to get forecasts for the next 12 hours from a single viewpoint (same as case 1). 
+ # Specifically, we expect: + # - predict-period = 12 hours + # - forecast-frequency = max-forecast-horizon = 12 hours + # - retraining-period = FM planning horizon + # - 1 cycle, 1 belief time + # These expectations are encoded in default 1 of ForecasterParametersSchema.resolve_config + ( + {"max-forecast-horizon": "PT12H"}, + { + "predict_start": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + ).floor("1h"), + # "start_date": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + # ).floor("1h") + # - pd.Timedelta(days=30), + "end_date": pd.Timestamp( + "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + ) + + pd.Timedelta(hours=48), + # "train_period_in_hours": 30 * 24, + "predict_period_in_hours": 12, + "max_forecast_horizon": pd.Timedelta(hours=12), + "forecast_frequency": pd.Timedelta(hours=12), + # "retrain_frequency": 2 * 24, + # "max_training_period": pd.Timedelta(days=365), + "save_belief_time": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + ), + "m_viewpoints": 1, + }, + ), + # Case 3: forecast-frequency = 12 hours + # todo: add to description that this should really be used in combination with the predict-start field + # + # User expects to get forecasts for the default FM planning horizon from a new viewpoint every 12 hours. 
+ # Specifically, we expect: + # - predict-period = FM planning horizon + # - max-forecast-horizon = predict-period (actual horizons are 48, 36, 24 and 12) + # - retraining-period = FM planning horizon + # - 1 cycle, 4 belief times + ( + { + "start": "2025-01-15T12:00:00+01:00", + "forecast-frequency": "PT12H", + }, + { + "predict_start": pd.Timestamp( + "2025-01-15T12:00:00.000+01", tz="Europe/Amsterdam" + ), + # "start_date": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + # ).floor("1h") + # - pd.Timedelta(days=30), + # "train_period_in_hours": 30 * 24, + "predict_period_in_hours": 48, + "max_forecast_horizon": pd.Timedelta(hours=48), + "forecast_frequency": pd.Timedelta(hours=12), + "end_date": pd.Timestamp( + "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + ) + + pd.Timedelta(hours=48), + # "max_training_period": pd.Timedelta(days=365), + # "retrain-frequency": 2 * 24, + # this is the first belief time of the four belief times + "save_belief_time": pd.Timestamp( + "2025-01-15T12:00:00.00+01", tz="Europe/Amsterdam" + ), + "m_viewpoints": 4, + }, + ), + # Case 4: (config) retraining-period = 12 hours + # + # User expects to get forecasts for the default FM planning horizon from a new viewpoint every 12 hours (retraining at every viewpoint). 
+ # Specifically, we expect: + # - predict-period = FM planning horizon + # - max-forecast-horizon = predict-period (actual horizons are 48, 36, 24 and 12) + # - forecast-frequency = predict-period (NOT capped by retraining-period, no param changes based on config) + # - 1 cycle, 1 belief time + # ( + # { + # "retrain-frequency": "PT12H", + # "end-date": "2025-01-17T12:00:00+01:00", + # }, + # { + # "predict_start": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + # ).floor("1h"), + # "start_date": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + # ).floor("1h") + # - pd.Timedelta(days=30), + # "train_period_in_hours": 30 * 24, + # "predict_period_in_hours": 48, + # "max_forecast_horizon": pd.Timedelta(hours=48), + # "forecast_frequency": pd.Timedelta(hours=48), + # "end_date": pd.Timestamp( + # "2025-01-17T12:00:00+01", tz="Europe/Amsterdam" + # ), + # "retrain-frequency": 12, + # "max_training_period": pd.Timedelta(days=365), + # "save_belief_time": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam" + # ), + # "m_viewpoints": 1, + # }, + # ), + # Case 5: predict-period = 10 days and max-forecast-horizon = 12 hours + # + # User expects to get a ValidationError for having set parameters that won't give complete coverage of the predict-period. + ( + { + "duration": "P10D", + "max-forecast-horizon": "PT12H", + }, + ValidationError( + { + "max_forecast_horizon": [ + "This combination of parameters will not yield forecasts for the entire prediction window." 
+ ] + } + ), + ), + # Case 6: predict-period = 12 hours and max-forecast-horizon = 10 days + # + # User expects that FM complains: the max-forecast-horizon should be lower than the predict-period + # - forecast-frequency = predict-period + # - retraining-frequency = FM planning horizon + # - 1 cycle, 1 belief time + ( + { + "duration": "PT12H", + "max-forecast-horizon": "P10D", + }, + ValidationError( + { + "max_forecast_horizon": [ + "max-forecast-horizon must be less than or equal to predict-period" + ] + } + ), + ), + # Case 7: end-date = almost 5 days after now + # + # User expects to get forecasts for the next 5 days (from server now floored to 1 hour) with a default 30-day training period + # - predict-period = 5 days + # - forecast-frequency = predict-period + # - retraining-frequency = FM planning horizon + # - 1 cycle, 1 belief time + # - training-period = 30 days + ( + {"end": "2025-01-20T12:00:00+01:00"}, + { + "predict-start": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ).floor("1h"), + # "start-date": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", + # tz="Europe/Amsterdam", + # ).floor("1h") + # - pd.Timedelta( + # days=30 + # ), # default training period 30 days before predict start + "end-date": pd.Timestamp( + "2025-01-20T12:00:00+01", + tz="Europe/Amsterdam", + ), + # "train-period-in-hours": 30 * 24, # from start date to predict start + "predict-period-in-hours": 120, # from predict start to end date + "forecast-frequency": pd.Timedelta( + days=5 + ), # default forecast frequency + "max-forecast-horizon": pd.Timedelta( + days=5 + ), # duration between predict start and end date + # default values + # "retrain_frequency": 2 * 24, + # "max-training-period": pd.Timedelta(days=365), + # server now + "save-belief-time": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ), + "m_viewpoints": 1, + }, + ), + # Case 8: end-date = almost 4.5 days after now, train-start is 26.5 days before now + # 
+ # User expects to get forecasts for the next 4.5 days (from server now floored to 1 hour) with a custom 636-hour training period + # - predict-period = 108 hours + # - forecast-frequency = predict-period + # - retraining-frequency = FM planning horizon + # - 1 cycle, 1 belief time + # - training-period = 636 hours + ( + { + # "train-start": "2024-12-20T00:00:00+01:00", + "end": "2025-01-20T00:00:00+01:00", + }, + { + # "start-date": pd.Timestamp( + # "2024-12-20T00:00:00+01", tz="Europe/Amsterdam" + # ), + "end-date": pd.Timestamp( + "2025-01-20T00:00:00+01", tz="Europe/Amsterdam" + ), + "predict-start": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ).floor("1h"), + "predict-period-in-hours": 108, # hours from predict start to end date + # "train-period-in-hours": 636, # hours between start date and predict start + "max-forecast-horizon": pd.Timedelta(hours=108), + "forecast-frequency": pd.Timedelta(hours=108), + # "retrain_frequency": 2 * 24, + # "max-training-period": pd.Timedelta(days=365), + # server now + "save-belief-time": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ), + "m_viewpoints": 1, + }, + ), + # Case 9: end-date is given with train-period = 3 days + # + # User expects the start date to be computed from the inferred predict-start and train-period. 
+ # Specifically, we expect: + # - predict-start = server now floored to sensor resolution + # - train-period = 3 days (72 hours) + # - predict-period = 5 days (from predict-start to end-date) + # - max-forecast-horizon = predict-period = 5 days + # - forecast-frequency = predict-period = 5 days + # - retrain-frequency = FM planning horizon + # - 1 cycle, 1 belief time + ( + { + "end": "2025-01-20T12:00:00+01:00", + # "train-period": "P3D", + }, + { + "end-date": pd.Timestamp( + "2025-01-20T12:00:00+01", tz="Europe/Amsterdam" + ), + "predict-start": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ).floor("1h"), + # "start-date": pd.Timestamp( + # "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + # ) + # - pd.Timedelta(days=3), + # "train-period-in-hours": 72, # from start date to predict start + "predict-period-in-hours": 120, # from predict start to end date + "max-forecast-horizon": pd.Timedelta( + days=5 + ), # duration between predict start and end date + "forecast-frequency": pd.Timedelta(days=5), + # default values + # "retrain_frequency": 2 * 24, + # "max-training-period": pd.Timedelta(days=365), + # server now + "save-belief-time": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ), + "m_viewpoints": 1, + }, + ), + # Case 10: train-start is given with train-period = 3 days + # + # User expects predict-start to be derived from train-start + train-period. 
+ # Specifically, we expect: + # - predict-start = train-start + 3 days + # - predict-period = FM planning horizon (48 hours) + # - end-date = predict-start + 48 hours + # - max-forecast-horizon = predict-period = 48 hours + # - forecast-frequency = predict-period = 48 hours + # - retrain-frequency = FM planning horizon + # - 1 cycle, 1 belief time + # ( + # { + # # "train-start": "2024-12-25T00:00:00+01:00", + # # "train-period": "P3D", + # }, + # { + # # "train-start": pd.Timestamp( + # # "2024-12-25T00:00:00+01", tz="Europe/Amsterdam" + # # ), + # "predict-start": pd.Timestamp( + # "2024-12-25T00:00:00+01", tz="Europe/Amsterdam" + # ) + # + pd.Timedelta(days=3), + # "end-date": pd.Timestamp( + # "2024-12-28T00:00:00+01", tz="Europe/Amsterdam" + # ) + # + pd.Timedelta(days=2), + # # "train-period-in-hours": 72, + # "max-forecast-horizon": pd.Timedelta( + # days=2 + # ), # duration between predict start and end date + # "forecast-frequency": pd.Timedelta( + # days=2 + # ), # duration between predict start and end date + # # default values + # "predict-period-in-hours": 48, + # # "retrain_frequency": 2 * 24, + # # "max-training-period": pd.Timedelta(days=365), + # # the belief time of the forecasts will be calculated from start and max-forecast-horizon and forecast-frequency + # "save-belief-time": None, + # "m_viewpoints": 1, + # }, + # ), + # Case 11: train-start is given with predict-period duration = 3 days + # + # User expects predict-start to remain based on server now (no train-period given). 
+ # Specifically, we expect: + # - predict-start = server now floored to sensor resolution + # - predict-period = 3 days + # - end-date = predict-start + 3 days + # - train-period derived from train-start to predict-start + # - max-forecast-horizon = predict-period = 3 days + # - forecast-frequency = predict-period = 3 days + # - retrain-frequency = FM planning horizon + # - 1 cycle, 1 belief time + ( + { + # "train-start": "2024-12-25T00:00:00+01:00", + "duration": "P3D", + }, + { + # "start-date": pd.Timestamp( + # "2024-12-25T00:00:00+01", tz="Europe/Amsterdam" + # ), + "predict-start": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ).floor("1h"), + "end-date": pd.Timestamp( + "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + ) + + pd.Timedelta(days=3), + "predict-period-in-hours": 72, + # "train-period-in-hours": 516, # from train-start to predict-start + "max-forecast-horizon": pd.Timedelta( + days=3 + ), # duration between predict-start and end-date + "forecast-frequency": pd.Timedelta(days=3), + # default values + # "retrain_frequency": 2 * 24, + # "max-training-period": pd.Timedelta(days=365), + # server now + "save-belief-time": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ), + "m_viewpoints": 1, + }, + ), + # Case 12: train-start is given with train-period = 20 days and duration = 3 days + # + # User expects both predict-start and end-date to be derived from train-start. 
+ # Specifically, we expect: + # - predict-start = train-start + 20 days + # - predict-period = 3 days + # - end-date = train-start + 23 days + # - max-forecast-horizon = predict-period = 3 days + # - forecast-frequency = predict-period = 3 days + # - retrain-frequency = FM planning horizon + # - 1 cycle, 1 belief time + # ( + # { + # # "train-start": "2024-12-01T00:00:00+01:00", + # # "train-period": "P20D", + # "duration": "P3D", + # }, + # { + # # "start-date": pd.Timestamp( + # # "2024-12-01T00:00:00+01", tz="Europe/Amsterdam" + # # ), + # "predict-start": pd.Timestamp( + # "2024-12-01T00:00:00+01", tz="Europe/Amsterdam" + # ) + # + pd.Timedelta(days=20), + # "end-date": pd.Timestamp( + # "2024-12-01T00:00:00+01", tz="Europe/Amsterdam" + # ) + # + pd.Timedelta(days=23), + # # "train-period-in-hours": 480, + # "predict-period-in-hours": 72, + # # defaults to prediction period (duration) + # "max-forecast-horizon": pd.Timedelta(days=3), + # "forecast-frequency": pd.Timedelta(days=3), + # # default values + # # "retrain_frequency": 2 * 24, + # # "max-training-period": pd.Timedelta(days=365), + # # the belief time of the forecasts will be calculated from start and max-forecast-horizon and forecast-frequency + # "save-belief-time": None, + # }, + # ), + # Case 13: only end is given with retrain-frequency = 3 days + # + # User expects train start and predict start to be derived from end-date and defaults. 
+ # Specifically, we expect: + # - predict-start = end-date - default duration (FM planning horizon) + # - train-period = default 30 days + # - train-start = predict-start - 30 days + # - predict-period = 6 days + # - max-forecast-horizon = predict-period = 6 days + # - forecast-frequency = predict-period = 6 days + # - retrain-frequency = 3 days (explicit) + # - 1 cycle, 1 belief time + # ( + # { + # "end-date": "2025-01-21T12:00:00+01:00", + # "retrain-frequency": "P3D", # only comes into play if forecast-frequency is lower than retrain-frequency, which here it is not + # }, + # { + # "end-date": pd.Timestamp( + # "2025-01-21T12:00:00+01", tz="Europe/Amsterdam" + # ), + # "predict-start": pd.Timestamp( + # "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + # ), + # "start-date": pd.Timestamp( + # "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + # ) + # - pd.Timedelta(days=30), + # "predict-period-in-hours": 144, # from predict start to end date + # "train-period-in-hours": 30 * 24, + # "max-forecast-horizon": pd.Timedelta( + # days=6 + # ), # duration between predict start and end date + # "forecast-frequency": pd.Timedelta(hours=144), + # # default values + # "max-training-period": pd.Timedelta(days=365), + # "retrain-frequency": 3 * 24, + # # server now + # "save-belief-time": pd.Timestamp( + # "2025-01-15T12:23:58.387422+01", + # tz="Europe/Amsterdam", + # ), + # "m_viewpoints": 1, # we expect 1 cycle from the forecast-frequency defaulting to the predict-period + # }, + # ), + # Case 14: forecast-frequency = 5 days, predict-period = 10 days + # + # User expects to get forecasts for 10 days from two unique viewpoints 5 days apart. 
+ # Specifically, we expect: + # - predict-period = 10 days + # - max-forecast-horizon = predict-period (actual horizons are 10 days and 5 days) + # - forecast-frequency = 5 days + # - retrain-frequency = FM planning horizon + # - 2 cycles, 2 belief times + ( + { + "duration": "P10D", + "forecast-frequency": "P5D", + }, + { + "end-date": pd.Timestamp( + "2025-01-25T12:00:00+01", tz="Europe/Amsterdam" + ), + "predict-start": pd.Timestamp( + "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + ), + # "start-date": pd.Timestamp( + # "2025-01-15T12:00:00+01", tz="Europe/Amsterdam" + # ) + # - pd.Timedelta(days=30), + "predict-period-in-hours": 240, # from predict start to end date + # "train-period-in-hours": 30 * 24, + "max-forecast-horizon": pd.Timedelta( + days=10 + ), # duration between predict start and end date + "forecast-frequency": pd.Timedelta(hours=120), + # default values + # "max-training-period": pd.Timedelta(days=365), + # "retrain-frequency": 2 * 24, + # server now + "save-belief-time": pd.Timestamp( + "2025-01-15T12:23:58.387422+01", + tz="Europe/Amsterdam", + ), + "m_viewpoints": 2, # we expect 2 cycles from the retrain frequency and predict period given the end date + }, + ), + ], +) +def test_timing_parameters_of_forecaster_parameters_schema( + setup_dummy_sensors, freeze_server_now, timing_input, expected_timing_output +): + freeze_server_now( + pd.Timestamp("2025-01-15T12:23:58.387422+01", tz="Europe/Amsterdam") + ) + + if isinstance(expected_timing_output, ValidationError): + with pytest.raises(ValidationError) as exc: + ForecasterParametersSchema().load( + { + "sensor": 1, + **timing_input, + } + ) + assert exc.value.messages == expected_timing_output.messages + return + data = ForecasterParametersSchema().load( + { + "sensor": 1, + **timing_input, + } + ) + # breakpoint() + for k, v in expected_timing_output.items(): + # Convert kebab-case key to snake_case to match data dictionary keys returned by schema + snake_key = kebab_to_snake(k) + assert 
data[snake_key] == v, f"{k} did not match expectations." diff --git a/flexmeasures/data/schemas/utils.py b/flexmeasures/data/schemas/utils.py index b8ff581b75..149d891f39 100644 --- a/flexmeasures/data/schemas/utils.py +++ b/flexmeasures/data/schemas/utils.py @@ -83,3 +83,13 @@ def convert_to_quantity(value: str, to_unit: str) -> ur.Quantity: raise FMValidationError( f"Cannot convert value '{value}' to a valid quantity. {e}" ) + + +def snake_to_kebab(key: str) -> str: + """Convert snake_case to kebab-case.""" + return key.replace("_", "-") + + +def kebab_to_snake(key: str) -> str: + """Convert kebab-case to snake_case.""" + return key.replace("-", "_") diff --git a/flexmeasures/data/services/data_sources.py b/flexmeasures/data/services/data_sources.py index bfe0e1d993..c2f260806b 100644 --- a/flexmeasures/data/services/data_sources.py +++ b/flexmeasures/data/services/data_sources.py @@ -4,7 +4,7 @@ from flask import current_app from sqlalchemy import select -from typing import Type +from typing import Type, TypeVar from flexmeasures import User, Source from flexmeasures.data import db @@ -13,6 +13,9 @@ from flask import current_app as app +DG = TypeVar("DG", bound=DataGenerator) + + def get_or_create_source( source: User | str, source_type: str | None = None, @@ -79,8 +82,8 @@ def get_data_generator( model: str, config: dict, save_config: bool, - data_generator_type: Type, -) -> DataGenerator | None: + data_generator_type: Type[DG], +) -> DG | None: dg_type_name = data_generator_type.__name__ if source is None: logging.info( diff --git a/flexmeasures/data/tests/test_train_predict_pipeline.py b/flexmeasures/data/tests/test_forecasting_pipeline.py similarity index 51% rename from flexmeasures/data/tests/test_train_predict_pipeline.py rename to flexmeasures/data/tests/test_forecasting_pipeline.py index 5af5722a47..348eff7f9a 100644 --- a/flexmeasures/data/tests/test_train_predict_pipeline.py +++ b/flexmeasures/data/tests/test_forecasting_pipeline.py @@ -8,129 +8,120 
@@ from marshmallow import ValidationError +from flexmeasures.data.models.forecasting.exceptions import NotEnoughDataException from flexmeasures.data.models.forecasting.pipelines import TrainPredictPipeline -from flexmeasures.data.models.forecasting.exceptions import CustomException from flexmeasures.utils.job_utils import work_on_rq from flexmeasures.data.services.forecasting import handle_forecasting_exception @pytest.mark.parametrize( - ["config", "params", "expected_error"], + ["config", "params", "as_job", "expected_error"], [ ( { # "model": "CustomLGBM", + "train-start": "2025-01-01T00:00+02:00", + "train-period": "P2D", + "retrain-frequency": "P0D", # 0 days is expected to fail }, { "sensor": "solar-sensor", - "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - "output_path": None, - "start_date": "2025-01-01T00:00+02:00", - "end_date": "2025-01-03T00:00+02:00", - "train_period": "P2D", - "sensor_to_save": None, - "start_predict_date": "2025-01-02T00:00+02:00", - "retrain_frequency": "P0D", # 0 days is expected to fail - "max_forecast_horizon": "PT1H", - "forecast_frequency": "PT1H", + "model-save-dir": "flexmeasures/data/models/forecasting/artifacts/models", + "output-path": None, + "end": "2025-01-03T00:00+02:00", + "sensor-to-save": None, + "start": "2025-01-02T00:00+02:00", + "max-forecast-horizon": "PT1H", + "forecast-frequency": "PT1H", "probabilistic": False, }, - (ValidationError, "retrain-frequency must be greater than 0"), + False, + (ValidationError, "retrain-frequency must be at least 1 hour"), ), ( { # "model": "CustomLGBM", + "future-regressors": ["irradiance-sensor"], + "train-start": "2025-01-01T00:00+02:00", }, { "sensor": "solar-sensor", - "future_regressors": ["irradiance-sensor"], - "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - "output_path": None, - "start_date": "2025-01-01T00:00+02:00", - "start_predict_date": "2025-01-08T00:00+02:00", # start_predict_date coincides with end of 
available data in sensor - "end_date": "2025-01-09T00:00+02:00", - "sensor_to_save": None, - "max_forecast_horizon": "PT1H", + "model-save-dir": "flexmeasures/data/models/forecasting/artifacts/models", + "output-path": None, + "start": "2025-01-08T00:00+02:00", # start coincides with end of available data in sensor + "end": "2025-01-09T00:00+02:00", + "sensor-to-save": None, + "max-forecast-horizon": "PT1H", + "forecast-frequency": "PT24H", # 1 cycle and 1 viewpoint "probabilistic": False, - "as_job": True, }, + True, None, ), ( { # "model": "CustomLGBM", + "future-regressors": ["irradiance-sensor"], + # "train-start": "2025-01-01T00:00+02:00", # without a start date, max-training-period takes over + "max-training-period": "P7D", }, { "sensor": "solar-sensor", - "future_regressors": ["irradiance-sensor"], - "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - "output_path": None, - "start_date": "2025-01-01T00:00+02:00", - "start_predict_date": "2025-01-08T00:00+02:00", # start_predict_date coincides with end of available data in sensor - "end_date": "2025-01-09T00:00+02:00", - "sensor_to_save": None, - "max_forecast_horizon": "PT1H", + "model-save-dir": "flexmeasures/data/models/forecasting/artifacts/models", + "output-path": None, + "start": "2025-01-08T00:00+02:00", # start coincides with end of available data in sensor + "end": "2025-01-09T00:00+02:00", + "sensor-to-save": None, + "max-forecast-horizon": "PT1H", + "forecast-frequency": "PT24H", # 1 cycle and 1 viewpoint "probabilistic": False, }, + False, None, ), ( { # "model": "CustomLGBM", + "past-regressors": ["irradiance-sensor"], + "future-regressors": ["irradiance-sensor"], + "train-start": "2025-01-01T00:00+02:00", }, { # Test: duplicate sensor names in past and future regressors "sensor": "solar-sensor", - "past_regressors": ["irradiance-sensor"], - "future_regressors": ["irradiance-sensor"], - "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - 
"output_path": None, - "start_date": "2025-01-01T00:00+02:00", - "start_predict_date": "2025-01-04T00:00+02:00", - "end_date": "2025-01-09T00:00+02:00", - "sensor_to_save": None, - "max_forecast_horizon": "PT1H", + "model-save-dir": "flexmeasures/data/models/forecasting/artifacts/models", + "output-path": None, + "start": "2025-01-08T00:00+02:00", + "end": "2025-01-09T00:00+02:00", + "sensor-to-save": None, + "max-forecast-horizon": "PT1H", + "forecast-frequency": "PT24H", "probabilistic": False, }, + False, None, ), ( { # "model": "CustomLGBM", + "future-regressors": ["irradiance-sensor"], + "retrain-frequency": "P1D", + "train-start": "2025-01-01T00:00+02:00", + "train-period": "P2D", }, { "sensor": "solar-sensor", - "future_regressors": ["irradiance-sensor"], - "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - "output_path": None, - "start_date": "2025-01-01T00:00+02:00", - "end_date": "2025-01-03T00:00+02:00", - "train_period": "P2D", - "sensor_to_save": None, - "start_predict_date": "2025-01-02T00:00+02:00", - "retrain_frequency": "P1D", - "max_forecast_horizon": "PT1H", - "forecast_frequency": "PT1H", + "model-save-dir": "flexmeasures/data/models/forecasting/artifacts/models", + "output-path": None, + "end": "2025-01-03T00:00+02:00", + "sensor-to-save": None, + "start": "2025-01-02T00:00+02:00", + "max-forecast-horizon": "PT1H", + "forecast-frequency": "PT24H", "probabilistic": False, }, + False, None, ), - # ( - # {}, - # { - # "sensor": "solar-sensor", - # "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - # "output_path": None, - # "start_date": "2025-07-01T00:00+02:00", - # "end_date": "2025-07-12T00:00+02:00", - # "sensor_to_save": 1, - # "start_predict_date": "2025-07-11T17:26+02:00", - # "retrain_frequency": "PT24H", - # "max_forecast_horizon": 24, - # "forecast_frequency": 1, - # "probabilistic": False, - # }, - # (ValidationError, "Try increasing the --end-date."), - # ) ], ) def 
test_train_predict_pipeline( # noqa: C901 @@ -138,6 +129,7 @@ def test_train_predict_pipeline( # noqa: C901 setup_fresh_test_forecast_data, config, # config passed to the Forecaster params, # parameters passed to the compute method of the Forecaster + as_job: bool, expected_error: bool | tuple[type[BaseException], str], ): sensor = setup_fresh_test_forecast_data[params["sensor"]] @@ -145,22 +137,22 @@ def test_train_predict_pipeline( # noqa: C901 past_regressors = [ setup_fresh_test_forecast_data[regressor_name] - for regressor_name in params.get("past_regressors", []) + for regressor_name in config.get("past-regressors", []) ] future_regressors = [ setup_fresh_test_forecast_data[regressor_name] - for regressor_name in params.get("future_regressors", []) + for regressor_name in config.get("future-regressors", []) ] regressors = [ setup_fresh_test_forecast_data[regressor_name] for regressor_name in params.get("regressors", []) ] - if params.get("past_regressors"): - params["past_regressors"] = [regressor.id for regressor in past_regressors] + if config.get("past-regressors"): + config["past-regressors"] = [regressor.id for regressor in past_regressors] - if params.get("future_regressors"): - params["future_regressors"] = [regressor.id for regressor in future_regressors] + if config.get("future-regressors"): + config["future-regressors"] = [regressor.id for regressor in future_regressors] if params.get("regressors"): params["regressors"] = [regressor.id for regressor in regressors] @@ -172,38 +164,38 @@ def test_train_predict_pipeline( # noqa: C901 assert expected_error[1] in str(e_info) else: pipeline = TrainPredictPipeline(config=config) - pipeline_returns = pipeline.compute(parameters=params) + pipeline_returns = pipeline.compute(parameters=params, as_job=as_job) # Check pipeline properties for attr in ("model",): if config.get(attr): assert hasattr(pipeline, attr) - if params.get("as_job"): + if as_job: work_on_rq( app.queues["forecasting"], 
exc_handler=handle_forecasting_exception ) forecasts = sensor.search_beliefs(source_types=["forecaster"]) dg_params = pipeline._parameters # parameters stored in the data generator - n_cycles = (dg_params["end_date"] - dg_params["predict_start"]) / ( + m_viewpoints = (dg_params["end_date"] - dg_params["predict_start"]) / ( dg_params["forecast_frequency"] ) # 1 hour of forecasts is saved over 4 15-minute resolution events - n_events_per_horizon = timedelta(hours=1) / dg_params["target"].event_resolution + n_events_per_horizon = timedelta(hours=1) / dg_params["sensor"].event_resolution n_hourly_horizons = dg_params["max_forecast_horizon"] // timedelta(hours=1) assert ( - len(forecasts) == n_cycles * n_hourly_horizons * n_events_per_horizon - ), f"we expect 4 forecasts per horizon for each cycle within the prediction window, and {n_cycles} cycles with each {n_hourly_horizons} hourly horizons" + len(forecasts) == m_viewpoints * n_hourly_horizons * n_events_per_horizon + ), f"we expect 4 forecasts per horizon for each viewpoint within the prediction window, and {m_viewpoints} viewpoints with each {n_hourly_horizons} hourly horizons" assert ( - forecasts.lineage.number_of_belief_times == n_cycles - ), f"we expect 1 belief time per cycle, and {n_cycles} cycles" + forecasts.lineage.number_of_belief_times == m_viewpoints + ), f"we expect {m_viewpoints} viewpoints" source = forecasts.lineage.sources[0] assert "TrainPredictPipeline" in str( source ), "string representation of the Forecaster (DataSource) should mention the used model" - if dg_params["as_job"]: + if as_job: # Fetch returned job job = app.queues["forecasting"].fetch_job(pipeline_returns) @@ -249,53 +241,53 @@ def test_train_predict_pipeline( # noqa: C901 # Check DataGenerator configuration stored under DataSource attributes data_generator_config = source.attributes["data_generator"]["config"] assert data_generator_config["model"] == "CustomLGBM" - - # Check DataGenerator parameters stored under DataSource 
attributes - data_generator_params = source.attributes["data_generator"]["parameters"] assert ( - "missing_threshold" in data_generator_params - ), "data generator parameters should mention missing_threshold" + "missing-threshold" in data_generator_config + ), "data generator config should mention missing_threshold" for regressor in past_regressors: assert ( - regressor.id in data_generator_params["past_regressors"] - ), f"data generator parameters should mention past regressor {regressor.name}" + regressor.id in data_generator_config["past-regressors"] + ), f"data generator config should mention past regressor {regressor.name}" for regressor in future_regressors: assert ( - regressor.id in data_generator_params["future_regressors"] - ), f"data generator parameters should mention future regressor {regressor.name}" + regressor.id in data_generator_config["future-regressors"] + ), f"data generator config should mention future regressor {regressor.name}" for regressor in regressors: assert ( - regressor.id in data_generator_params["past_regressors"] - ), f"data generator parameters should mention regressor {regressor.name} as a past regressor" + regressor.id in data_generator_config["past-regressors"] + ), f"data generator config should mention regressor {regressor.name} as a past regressor" assert ( - regressor.id in data_generator_params["future_regressors"] - ), f"data generator parameters should mention regressor {regressor.name} as a future regressor" + regressor.id in data_generator_config["future-regressors"] + ), f"data generator config should mention regressor {regressor.name} as a future regressor" assert ( - "regressors" not in data_generator_params + "regressors" not in data_generator_config ), "(past and future) regressors should be stored under 'past_regressors' and 'future_regressors' instead" + assert "max-training-period" in data_generator_config + + # Check DataGenerator parameters stored under DataSource attributes is empty + assert "parameters" not 
in source.attributes["data_generator"] -# Test that missing data logging works and raises CustomException when threshold exceeded +# Test that missing data logging works and raises NotEnoughDataException when threshold exceeded @pytest.mark.parametrize( ["config", "params"], [ # Target sensor has missing data ( { # "model": "CustomLGBM", + "missing-threshold": "0.0", + "train-start": "2025-01-01T00:00+02:00", }, { "sensor": "solar-sensor", - "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - "output_path": None, - "start_date": "2025-01-01T00:00+02:00", - "end_date": "2025-01-30T00:00+02:00", - "sensor_to_save": None, - "start_predict_date": "2025-01-25T00:00+02:00", - "retrain_frequency": "P1D", - "max_forecast_horizon": "PT1H", - "forecast_frequency": "PT1H", - "missing_threshold": "0.0", + "model-save-dir": "flexmeasures/data/models/forecasting/artifacts/models", + "output-path": None, + "end": "2025-01-30T00:00+02:00", + "sensor-to-save": None, + "start": "2025-01-25T00:00+02:00", + "max-forecast-horizon": "PT1H", + "forecast-frequency": "PT1H", "probabilistic": False, }, ), @@ -303,20 +295,19 @@ def test_train_predict_pipeline( # noqa: C901 ( { # "model": "CustomLGBM", + "future-regressors": ["irradiance-sensor"], + "missing-threshold": "0.0", + "train-start": "2025-01-01T00:00+02:00", }, { "sensor": "solar-sensor", - "future_regressors": ["irradiance-sensor"], - "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - "output_path": None, - "start_date": "2025-01-01T00:00+02:00", - "end_date": "2025-01-30T00:00+02:00", - "missing_threshold": "0.0", - "sensor_to_save": None, - "start_predict_date": "2025-01-25T00:00+02:00", - "retrain_frequency": "P1D", - "max_forecast_horizon": "PT1H", - "forecast_frequency": "PT1H", + "model-save-dir": "flexmeasures/data/models/forecasting/artifacts/models", + "output-path": None, + "end": "2025-01-30T00:00+02:00", + "sensor-to-save": None, + "start": "2025-01-25T00:00+02:00", + 
"max-forecast-horizon": "PT1H", + "forecast-frequency": "PT1H", "probabilistic": False, }, ), @@ -329,7 +320,7 @@ def test_missing_data_logs_warning( caplog, ): """ - Verify that a CustomException is raised (wrapping a ValueError) + Verify that a NotEnoughDataException is raised (wrapping a ValueError) """ sensor = setup_fresh_test_forecast_data_with_missing_data[params["sensor"]] params["sensor"] = sensor.id @@ -346,44 +337,43 @@ def test_missing_data_logs_warning( setup_fresh_test_forecast_data_with_missing_data[reg] for reg in params.get("regressors", []) ] - params["missing_threshold"] = float(params.get("missing_threshold")) - if params.get("past_regressors"): - params["past_regressors"] = [r.id for r in past_regressors] - if params.get("future_regressors"): - params["future_regressors"] = [r.id for r in future_regressors] + config["missing-threshold"] = float(config.get("missing-threshold")) + if config.get("past-regressors"): + config["past-regressors"] = [r.id for r in past_regressors] + if config.get("future-regressors"): + config["future-regressors"] = [r.id for r in future_regressors] if params.get("regressors"): params["regressors"] = [r.id for r in regressors] - with caplog.at_level(logging.WARNING): - pipeline = TrainPredictPipeline(config=config) - # Expect CustomException when missing data exceeds threshold - with pytest.raises(CustomException) as excinfo: - pipeline.compute(parameters=params) - assert "missing values" in str( - excinfo.value - ), "Expected CustomException for missing data threshold" + pipeline = TrainPredictPipeline(config=config) + # Expect ValueError when missing data exceeds threshold + with pytest.raises(NotEnoughDataException) as excinfo: + pipeline.compute(parameters=params) + assert "missing values" in str( + excinfo.value + ), "Expected NotEnoughDataException for missing data threshold" -# Test that max_training_period caps train_period and logs a warning +# Test that max_training-period caps train-period and logs a warning 
@pytest.mark.parametrize( ["config", "params"], [ ( { # "model": "CustomLGBM", + "retrain-frequency": "P1D", + "train-start": "2025-01-01T00:00+02:00", + "max-training-period": "P10D", # cap at 10 days }, { "sensor": "solar-sensor", - "model_save_dir": "flexmeasures/data/models/forecasting/artifacts/models", - "output_path": None, - "start_date": "2025-01-01T00:00+02:00", - "end_date": "2025-01-30T00:00+02:00", - "max_training_period": "P10D", # cap at 10 days - "sensor_to_save": None, - "start_predict_date": "2025-01-25T00:00+02:00", - "retrain_frequency": "P1D", - "max_forecast_horizon": "PT1H", - "forecast_frequency": "PT1H", + "model-save-dir": "flexmeasures/data/models/forecasting/artifacts/models", + "output-path": None, + "end": "2025-01-30T00:00+02:00", + "sensor-to-save": None, + "start": "2025-01-25T00:00+02:00", + "max-forecast-horizon": "PT1H", + "forecast-frequency": "PT1H", "probabilistic": False, }, ), @@ -396,8 +386,8 @@ def test_train_period_capped_logs_warning( caplog, ): """ - Verify that a warning is logged when train_period exceeds max_training_period, - and that train_period is capped accordingly. + Verify that a warning is logged when train-period exceeds max-training-period, + and that train-period is capped accordingly. 
""" sensor = setup_fresh_test_forecast_data[params["sensor"]] params["sensor"] = sensor.id @@ -411,8 +401,8 @@ def test_train_period_capped_logs_warning( for message in caplog.messages ), "Expected warning about capping train_period" - params_used = pipeline._parameters - assert params_used["missing_threshold"] == 1 - assert params_used["train_period_in_hours"] == timedelta(days=10) / timedelta( + config_used = pipeline._config + assert config_used["missing_threshold"] == 1 + assert config_used["train_period_in_hours"] == timedelta(days=10) / timedelta( hours=1 ), "train_period_in_hours should be capped to max_training_period" diff --git a/flexmeasures/ui/static/openapi-specs.json b/flexmeasures/ui/static/openapi-specs.json index ea91c572f7..d8d1841f2a 100644 --- a/flexmeasures/ui/static/openapi-specs.json +++ b/flexmeasures/ui/static/openapi-specs.json @@ -609,6 +609,9 @@ } } }, + "400": { + "description": "UNKNOWN_FORECAST" + }, "401": { "description": "UNAUTHORIZED" }, @@ -1163,7 +1166,7 @@ "/api/v3_0/sensors/{id}/forecasts/trigger": { "post": { "summary": "Trigger forecasting job for one sensor", - "description": "Trigger a forecasting job for a sensor.\n\nThis endpoint starts a forecasting job asynchronously and returns a\njob UUID. 
The job will run in the background and generate forecast values\nfor the specified period.\n\nOnce triggered, the job status and results can be retrieved using the\n``GET /api/v3_0/sensors//forecasts/`` endpoint.\n", + "description": "Trigger a forecasting job for a sensor.\n\nThis endpoint starts a forecasting job asynchronously and returns a job UUID.\nThe job will run in the background and generate forecasts for the specified period.\n\nOnce triggered, the job status and results can be retrieved using the\n``GET /api/v3_0/sensors//forecasts/`` endpoint.\n", "security": [ { "ApiKeyAuth": [] @@ -1186,29 +1189,11 @@ "content": { "application/json": { "schema": { - "type": "object", - "properties": { - "start_date": { - "type": "string", - "format": "date-time", - "description": "Start date of the historical data used for training." - }, - "start_predict_date": { - "type": "string", - "format": "date-time", - "description": "Start date of the forecast period." - }, - "end_date": { - "type": "string", - "format": "date-time", - "description": "End date of the forecast period." - } - } + "$ref": "#/components/schemas/forecasting_trigger_schema_openAPI" }, "example": { - "start_date": "2026-01-01T00:00:00+01:00", - "start_predict_date": "2026-01-15T00:00:00+01:00", - "end_date": "2026-01-17T00:00:00+01:00" + "start": "2026-01-15T00:00:00+01:00", + "duration": "P2D" } } } @@ -4110,6 +4095,99 @@ }, "additionalProperties": false }, + "TrainPredictPipelineConfigSchemaOpenAPI": { + "type": "object", + "properties": { + "model": { + "type": "string", + "default": "CustomLGBM" + }, + "future-regressors": { + "type": "array", + "default": [], + "description": "Sensor IDs to be treated only as future regressors. 
Use this if only forecasts recorded on this sensor matter as a regressor.", + "example": [ + 2093, + 2094 + ], + "items": { + "type": "integer" + } + }, + "past-regressors": { + "type": "array", + "default": [], + "description": "Sensor IDs to be treated only as past regressors. Use this if only realizations recorded on this sensor matter as a regressor.", + "example": [ + 2095 + ], + "items": { + "type": "integer" + } + }, + "regressors": { + "type": "array", + "default": [], + "description": "Sensor IDs used as both past and future regressors. Use this if both realizations and forecasts recorded on this sensor matter as a regressor.", + "example": [ + 2093, + 2094, + 2095 + ], + "items": { + "type": "integer" + } + }, + "missing-threshold": { + "type": "number", + "default": 1.0, + "description": "Maximum fraction of missing data allowed before raising an error. Defaults to 1.0.", + "example": 0.1 + }, + "ensure-positive": { + "type": [ + "boolean", + "null" + ], + "default": false, + "description": "Whether to clip negative values in forecasts. Defaults to None (disabled).", + "example": true + } + }, + "additionalProperties": false + }, + "forecasting_trigger_schema_openAPI": { + "type": "object", + "properties": { + "duration": { + "type": "string", + "description": "The duration for which to create the forecast, in ISO 8601 duration format. Defaults to the planning horizon.", + "example": "PT24H" + }, + "start": { + "type": [ + "string", + "null" + ], + "format": "date-time", + "description": "Start date for predictions. Defaults to now, floored to the sensor resolution, so that the first forecast is about the ongoing event.", + "example": "2025-01-08T00:00:00+01:00" + }, + "forecast-frequency": { + "type": [ + "string", + "null" + ], + "description": "How often to recompute forecasts. This setting can be used to get forecasts from multiple viewpoints, which is especially useful for running simulations. 
Defaults to the max-forecast-horizon.", + "example": "PT1H" + }, + "config": { + "description": "Changing any of these will result in a new data source ID.", + "$ref": "#/components/schemas/TrainPredictPipelineConfigSchemaOpenAPI" + } + }, + "additionalProperties": false + }, "UserAPIQuerySchema": { "type": "object", "properties": {