From 3dca4fa196edbabd9c9531e87c58c35af6383ed3 Mon Sep 17 00:00:00 2001 From: Hana Chaari Date: Fri, 21 Mar 2025 06:33:38 +0100 Subject: [PATCH 1/2] cherry-pick commit 1bbc7513 ee2f1a0a from fix/prevent-duplicates-after-simplify_index --- flexmeasures/data/queries/utils.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/flexmeasures/data/queries/utils.py b/flexmeasures/data/queries/utils.py index 3044240d11..ca490fb64b 100644 --- a/flexmeasures/data/queries/utils.py +++ b/flexmeasures/data/queries/utils.py @@ -2,7 +2,7 @@ from typing import Type from datetime import datetime, timedelta - +import logging from flask_security import current_user from werkzeug.exceptions import Forbidden import pandas as pd @@ -13,7 +13,8 @@ from sqlalchemy import select, Select from flexmeasures.data.config import db -from flexmeasures.data.models.generic_assets import GenericAsset + +from flexmeasures.data.models.generic_assets import GenericAsset, GenericAssetType from flexmeasures.data.models.data_sources import DataSource from flexmeasures.utils import flexmeasures_inflection from flexmeasures.auth.policy import user_has_admin_access @@ -217,7 +218,10 @@ def get_belief_timing_criteria( def simplify_index( - bdf: tb.BeliefsDataFrame, index_levels_to_columns: list[str] | None = None + bdf: tb.BeliefsDataFrame, + index_levels_to_columns: list[str] | None = None, + keep_duplicate_value: str | None = None, + keep_duplicate_column: GenericAsset | GenericAssetType | None = None, ) -> pd.DataFrame: """Drops indices other than event_start. Optionally, salvage index levels as new columns. 
@@ -241,6 +245,13 @@ def simplify_index( else: raise KeyError(f"Level {col} not found") bdf.index = bdf.index.get_level_values("event_start") + if bdf.index.duplicated().any(): + if keep_duplicate_column is not None and keep_duplicate_column in bdf.columns: + bdf = bdf[bdf[keep_duplicate_column] == keep_duplicate_value] + logging.debug("bdf without duplicates %s \n ", bdf) + else: + raise ValueError("Duplicates found in index after processing.") + return bdf From 1883473340892a08a1e4dd480fdd1b7698b1ccc8 Mon Sep 17 00:00:00 2001 From: Hana Chaari Date: Wed, 14 May 2025 14:33:21 +0100 Subject: [PATCH 2/2] improve debugging: raise info on duplicate indices with possible reasons, to handle duplicates where the simplify_index method is used --- flexmeasures/data/queries/utils.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/flexmeasures/data/queries/utils.py b/flexmeasures/data/queries/utils.py index e2607773f3..18b331bade 100644 --- a/flexmeasures/data/queries/utils.py +++ b/flexmeasures/data/queries/utils.py @@ -217,12 +217,11 @@ def get_belief_timing_criteria( return criteria + def simplify_index( bdf: tb.BeliefsDataFrame, index_levels_to_columns: list[str] | None = None, - keep_duplicate_value: str | None = None, - keep_duplicate_column: GenericAsset | GenericAssetType | None = None, -) -> pd.DataFrame: + ) -> pd.DataFrame: """Drops indices other than event_start. Optionally, salvage index levels as new columns. @@ -233,6 +232,13 @@ def simplify_index( * The index levels are dropped (by overwriting the multi-level index with just the “event_start” index level). Only for the columns named in index_levels_to_columns, the relevant information is kept around. 
""" + if bdf.lineage.number_of_beliefs < len(bdf): + logging.debug("bdf with duplicates due to probabilistic beliefs:\n %s", bdf) + if bdf.lineage.number_of_events < bdf.lineage.number_of_beliefs and bdf.lineage.number_of_sources == 1: + logging.debug("bdf with duplicates due to multiple belief times/horizons per event:\n %s", bdf) + if bdf.lineage.number_of_events < bdf.lineage.number_of_beliefs and bdf.lineage.number_of_sources > 1: + logging.debug("bdf with duplicates maybe due to multiple sources per event:\n %s", bdf) + if index_levels_to_columns is not None: for col in index_levels_to_columns: try: @@ -246,11 +252,8 @@ def simplify_index( raise KeyError(f"Level {col} not found") bdf.index = bdf.index.get_level_values("event_start") if bdf.index.duplicated().any(): - if keep_duplicate_column is not None and keep_duplicate_column in bdf.columns: - bdf = bdf[bdf[keep_duplicate_column] == keep_duplicate_value] - logging.debug("bdf without duplicates %s \n ", bdf) - else: - raise ValueError("Duplicates found in index after processing.") + logging.debug(f"bdf with duplicates: {bdf}") + # raise ValueError("Duplicates found in index after processing.") return bdf