Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
216 changes: 145 additions & 71 deletions tests/ert/unit_tests/analysis/test_misfit_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,34 +143,33 @@ def test_that_correlated_and_independent_observations_are_grouped_separately(
np.testing.assert_equal(obs_errors, obs_error_copy)


@pytest.mark.parametrize("nr_observations", [0, 1, 2])
def test_edge_cases_with_few_observations_return_default_values(nr_observations):
"""Test that edge cases with 0-2 observations return default scaling values.
We do not know why this is the case.
def test_edge_cases_with_few_observations_return_default_values():
"""
Test that edge cases with 2 observations return default scaling values.
We create an example of a response matrix where all rows are perfectly correlated,
which should lead to a single cluster with 1 component and a scaling factor of
sqrt(num_observations / num_components) = sqrt(2).
However, since the number of observations is <= 2, the function should skip the
clustering and PCA and return default values of 1.0 for all scaling factors
"""
nr_realizations = 1000
nr_observations = 2
Y = np.ones((nr_observations, nr_realizations))

rng = np.random.default_rng(1234)
parameters_a = rng.normal(10, 1, nr_realizations)

# create response matrix
for i in range(nr_observations):
Y[i] = (i + 1) * parameters_a

scale_factors, clusters, nr_components = main(Y, Y.mean(axis=1))
# Add observation errors
obs_errors = np.ones(nr_observations) * 0.1

np.testing.assert_equal(
scale_factors,
np.array(nr_observations * [1.0]),
)

np.testing.assert_equal(
clusters,
np.array(nr_observations * [1.0]),
)
scale_factors, *_ = main(Y, obs_errors)

np.testing.assert_equal(
nr_components,
scale_factors,
np.array(nr_observations * [1.0]),
)

Expand Down Expand Up @@ -349,7 +348,7 @@ def test_autoscale_clusters_observations_by_correlation_pattern_ignoring_sign(
(0.55, True), # Above threshold ~0.42: D-E merges first
(0.50, True),
(0.45, True),
(0.40, False), # Below threshold: A-B merges first
(0.40, False), # Below threshold: A-B or B-C merges first
(0.30, False),
],
)
Expand All @@ -360,52 +359,59 @@ def test_that_clustering_prioritizes_global_similarity_over_local_correlation(
This test demonstrates that the current clustering implementation (using
Euclidean distance on correlation rows) can behave unintuitively by prioritizing
pairs with LOWER direct correlation for merging over pairs with HIGHER direct
correlation, if the higher-correlation pair disagrees strongly on other variables.
correlation. This happens when the higher-correlation pair disagrees strongly on
other variables.

"Prioritizing" here means that the hierarchical clustering algorithm considers
the lower-correlation pair to be "closer" (more similar) and thus merges them
earlier in the bottom-up process.
earlier in the bottom-up clustering process.

Scenario:
- Group 1: A, B, C.
A and C are independent.
B is a mix of A and C (correlated ~0.707 with both).
A and B are correlated ~0.7.
However, A and B agree poorly on C (A=0, B=0.7).
The Euclidean distance between A's and B's correlation rows is large
because of this disagreement on C.
B = A + C (correlated ~0.707 with both).
In other words, A and B are correlated ~0.7, the same is true for B and C.
However, the Euclidean distance between A's and B's correlation rows is large
because of the disagreement on C

- Group 2: D, E.
D and E are isolated and correlated with strength rho (varied by parametrization).
They agree perfectly on A, B, C (all zero).
They agree perfectly on A, B, C (zero correlation with all).
The Euclidean distance between D's and E's correlation rows is relatively
small because they have consistent (zero) correlations with everything else.

Each variable's correlation row includes its correlation with all variables,
including itself (1.0 on diagonal) and its pair partner. For D and E:
Each variable's correlation row (the corresponding row in the correlation matrix)
includes its correlation with all variables, including itself (1.0 on diagonal).
For D and E we have:

D's row: [corr(D,A)=0, corr(D,B)=0, corr(D,C)=0, corr(D,D)=1.0, corr(D,E)=rho]
E's row: [corr(E,A)=0, corr(E,B)=0, corr(E,C)=0, corr(E,D)=rho, corr(E,E)=1.0]

For A, B and C we have:
A's row: [corr(A,A)=1.0, corr(A,B)=0.7, corr(A,C)=0, corr(A,D)=0, corr(A,E)=0]
B's row: [corr(B,A)=0.7, corr(B,B)=1.0, corr(B,C)=0.7, corr(B,D)=0, corr(B,E)=0]
C's row: [corr(C,A)=0, corr(C,B)=0.7, corr(C,C)=1.0, corr(C,D)=0, corr(C,E)=0]

Threshold calculation:
Threshold calculations (based on Euclidean distance between correlation rows):
dist(D,E) = sqrt((0-0)^2 + (0-0)^2 + (0-0)^2 + (1-rho)^2 + (rho-1)^2))
dist(A,B) = sqrt((1-0.7)^2 + (0.7-1)^2 + (0-0.7)^2 + (0-0)^2 + (0-0)^2)) = 0.82
= sqrt(2 * (1 - rho)^2)
dist(A,B) = dist(B,C) = sqrt((1-0.7)^2 + (0.7-1)^2 + (0-0.7)^2 + (0-0)^2+(0-0)^2))
= 0.82

Solve dist(D,E) < dist(A,B) for rho:
sqrt(2 * (1 - rho)^2) < 0.82
2 * (1 - rho)^2 < 0.82^2
(1 - rho)^2 < 0.82^2 / 2
1 - rho < sqrt(0.82^2 / 2)
rho > 1 - sqrt(0.82^2 / 2) ≈ 0.42
Hence, when rho > 0.42, D-E merges first; otherwise it does not.
Hence, when rho > 0.42, D-E merges first; otherwise either A-B or B-C merges first
(depending on random variation in the sampling).

This test is parametrized to verify both regimes:
- rho > 0.42: D-E merges before A-B despite corr(A,B) > rho
- rho < 0.42: D-E no longer the closest pair
- rho > 0.42: D-E merges first despite corr(A,B) > rho and corr(B,C) > rho
- rho < 0.42: D-E no longer the closest pair (either A-B or B-C merges first)
"""

rng = np.random.default_rng(42)
N_realizations = 10000

Expand Down Expand Up @@ -433,11 +439,15 @@ def test_that_clustering_prioritizes_global_similarity_over_local_correlation(
# so we assert it is an array to silence static analysis warnings about indexing.
assert isinstance(corr, np.ndarray)
corr_AB = corr[0, 1]
corr_BC = corr[1, 2]
corr_DE = corr[3, 4]

assert np.isclose(corr_AB, 0.707, atol=0.05), (
f"Setup error: A-B corr {corr_AB} != 0.707"
)
assert np.isclose(corr_BC, 0.707, atol=0.05), (
f"Setup error: B-C corr {corr_BC} != 0.707"
)
assert np.isclose(corr_DE, corr_de_target, atol=0.05), (
f"Setup error: D-E corr {corr_DE} != {corr_de_target}"
)
Expand All @@ -450,36 +460,57 @@ def test_that_clustering_prioritizes_global_similarity_over_local_correlation(

is_DE_merged = clusters[3] == clusters[4]
is_AB_merged = clusters[0] == clusters[1]
is_BC_merged = clusters[1] == clusters[2]

failure_msg = (
f"DE_merged={is_DE_merged}, AB_merged={is_AB_merged}. "
f"Correlations: AB={corr_AB:.3f}, DE={corr_DE:.3f}"
f"DE_merged={is_DE_merged}, AB_merged={is_AB_merged}, BC_merged={is_BC_merged}."
f"Correlations: AB={corr_AB:.3f}, BC={corr_BC:.3f}, DE={corr_DE:.3f}"
)
assert is_DE_merged == expect_de_merged, failure_msg
# When D-E merges first, A-B should not (they stay separate)
# When D-E merges first, A-B and B-Cshould not (they stay separate)
if expect_de_merged:
assert not is_AB_merged, failure_msg
assert not is_BC_merged, failure_msg


def test_that_error_scaling_discards_noisy_observations_in_pca():
def test_clustering_and_scaling_realistic_scenario():
"""
Documents current behavior: when scaling responses by observation error
(Y / obs_errors), precise observations dominate and noisy observations
are effectively discarded by PCA truncation.

Scenario: 500 noisy seismic amplitudes + 20 precise well pressures,
where seismic responds to a shallow parameter and pressure responds
to an independent deep parameter.

With error-scaling (divide by obs_errors):
- Precise pressure measurements get amplified
- Noisy seismic measurements get suppressed
- Result: 1 PC needed (pressure dominates, seismic info lost)

With StandardScaler (z-score, unit variance per observation):
- Both observation types are treated equally
- Result: 2 PCs needed (both independent directions preserved)
This is an integration test for two key components of the autoscaler:

1. Determining the number of clusters.
2. Determining the scaling factor for each cluster.

Scenario:
We are given 500 noisy seismic observations and 20 precise well pressure
observations. All the seismic observations are correlated and all the
pressure observations are correlated, but the seismic and the pressure observations
are independent of each other.

Intuitive behavior:
The autoscaler should identify two clusters, one for the seismic observations and
one for the pressure observations, and assign a scaling factor to each cluster based
on the number of observations in that cluster (sqrt(500) for seismic and sqrt(20)
for pressure).

Actual behavior with current implementation:
One cluster with scaling factor sqrt(520).

Explanation of clustering:
The autoscaler uses PCA to determine the number of clusters. When using PCA it is
common to normalize the data before calculating the principal components
(e.g., using StandardScaler to give each observation unit variance). However,
the current implementation scales the responses by the observation errors instead.
This means that the precise pressure observations are amplified and the noisy
seismic observations are suppressed. As a result, the PCA identifies only one
principal component, and hence, everything is grouped into one cluster.

Explanation of scaling factor:
The scaling factor for a cluster is calculated as
sqrt(num_observations_in_cluster / num_components), where num_components is
calculated by doing PCA on the elements of the cluster (stopping when 95% of the
total variance is explained).
"""

rng = np.random.default_rng(42)
n_realizations = 100

Expand All @@ -490,14 +521,28 @@ def test_that_error_scaling_discards_noisy_observations_in_pca():
param_shallow = rng.normal(0, 1, size=(n_realizations, 1))
param_deep = rng.normal(0, 1, size=(n_realizations, 1))

# Initialize data by using broadcasting to create the correlated structures:

# - Seismic responses = param_shallow * sensitivity,
# (all 500 seismic obs are linear combinations of the same parameter to create
# within-group correlation)

# - Pressure responses = param_deep * sensitivity,
# (all 20 pressure obs are linear combinations of a different parameter)

# - Since param_shallow and param_deep are independent, this results in
# the seismic responses being independent from the pressure responses.

# Seismic: sensitive to shallow param
seismic_sensitivity = rng.uniform(0.5, 1.5, size=(1, n_seismic))
seismic_responses = param_shallow @ seismic_sensitivity
# Add noise
seismic_responses += rng.normal(0, 0.1, size=(n_realizations, n_seismic))

# Pressure: sensitive to deep param (independent of seismic)
pressure_sensitivity = rng.uniform(0.5, 1.5, size=(1, n_pressure))
pressure_responses = param_deep @ pressure_sensitivity
# Add noise
pressure_responses += rng.normal(0, 0.1, size=(n_realizations, n_pressure))

responses = np.hstack([seismic_responses, pressure_responses])
Expand All @@ -507,26 +552,55 @@ def test_that_error_scaling_discards_noisy_observations_in_pca():
pressure_errors = np.full(n_pressure, 0.05)
obs_errors = np.hstack([seismic_errors, pressure_errors])

# Method A: Error scaling (current approach)
scaled_by_error = responses / obs_errors
n_components_error_scaling = get_nr_primary_components(
scaled_by_error, threshold=0.95
)
# Run the main function to get clusters and scaling factors
scale_factors, clusters, _ = main(responses.T, obs_errors)

# Method B: Standard scaling (z-score)
scaled_standard = (responses - responses.mean(axis=0)) / responses.std(axis=0)
n_components_standard_scaling = get_nr_primary_components(
scaled_standard, threshold=0.95
)
# Assert that all observations are in the same cluster
assert len(np.unique(clusters)) == 1

# With error scaling, precise observations dominate → only 1 PC needed
assert n_components_error_scaling == 1, (
f"Error scaling should yield 1 PC (pressure dominates), "
f"got {n_components_error_scaling}"
)
# Assert that the scaling factor is sqrt(520) for all observations
# note that this results in the pressure observations receiving a scaling
# factor of sqrt(520) instead of sqrt(20), which means that they are significantly
# deflated compared to what one would expect, and essentially ignored/suppressed by
# the updating algorithm, even though the observations are precise and should be
# influential.
expected_sf = np.sqrt(n_seismic + n_pressure)
assert np.allclose(scale_factors, expected_sf)

# With standard scaling, both independent directions are visible → 2 PCs needed
assert n_components_standard_scaling == 2, (
f"Standard scaling should yield 2 PCs (both groups visible), "
f"got {n_components_standard_scaling}"
)

def test_clustering_and_scaling_edge_case():
"""
This test demonstrates that when observations have irregular errors, the
current clustering approach can lead to unintuitive results where independent
observations are clustered together, and treated as correlated.

Scenario:
Suppose we have 100 independent observations r_1,r_2,...,r_100 with corresponding
observation errors, where r_1 has a small error, and r_2,...,r_100 have large
errors. The error-scaling step amplifies r_1's response and suppresses
r_2,...,r_100, leading to one cluster instead of 100 clusters. As a result, the
history matching update is deflated as if all 100 observations were
perfectly correlated, even though they are all independent.
"""

# Create 100 independent responses
rng = np.random.default_rng(42)
n_observations = 100
n_realizations = 1000
responses = rng.standard_normal((n_observations, n_realizations))

# Create irregular observation errors: one small, the rest large
obs_errors = np.array([0.1] + [10.0] * (n_observations - 1))

# Run clustering algorithm
scale_factors, clusters, _ = main(responses, obs_errors)

# Assert that all observations are clustered together (only 1 cluster)
assert len(np.unique(clusters)) == 1

# Assert deflation rate
# For independent observations, the scaling factor should be 1
# but since all overvations are clustered together and treated as perfectly
# correlated, the scaling factor becomes sqrt(100) = 10
assert len(np.unique(scale_factors)) == 1
assert np.allclose(scale_factors, np.sqrt(100))
Loading