Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
49949c2
WQP TDS duplicate handling
jacob-a-brown May 19, 2025
c8b3dbb
Formatting changes
jacob-a-brown May 19, 2025
15b7c9d
mypy and _clean_records fix
jacob-a-brown May 19, 2025
bfab903
Merge branch 'dev/jab' of https://github.com/DataIntegrationGroup/Dat…
jacob-a-brown May 19, 2025
21dd9ac
log removing duplicate TDS data for WQP
jacob-a-brown May 19, 2025
ee55ee8
Merge branch 'pre-production' into dev/jab
jacob-a-brown May 19, 2025
d2df6f4
Formatting changes
jacob-a-brown May 19, 2025
d972ba8
Merge branch 'dev/jab' of https://github.com/DataIntegrationGroup/Dat…
jacob-a-brown May 19, 2025
572139e
use ActivityIdentifier to identify duplicate records
jacob-a-brown May 20, 2025
54a74b9
bump version to 0.9.6 for duplicate handling fix
jacob-a-brown May 20, 2025
ed5cc41
remove -rx flag to make parsing logs easier
jacob-a-brown May 20, 2025
3efe798
test tests
jacob-a-brown May 20, 2025
503cbde
more test tests
jacob-a-brown May 20, 2025
d28a6f6
Formatting changes
jacob-a-brown May 20, 2025
c82ab35
make output format case insensitive
jacob-a-brown May 20, 2025
74194a5
Merge branch 'dev/jab' of https://github.com/DataIntegrationGroup/Dat…
jacob-a-brown May 20, 2025
9203a5b
Formatting changes
jacob-a-brown May 20, 2025
781f577
test cli and sources in GitHub Actions
jacob-a-brown May 20, 2025
f31a927
Merge branch 'dev/jab' of https://github.com/DataIntegrationGroup/Dat…
jacob-a-brown May 20, 2025
99cf02b
complete all tests even if failure
jacob-a-brown May 20, 2025
93880d9
log date of duplicate records removed
jacob-a-brown May 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,4 @@ jobs:

- name: Test with pytest
run: |
pytest -s -rx tests
pytest -s tests
49 changes: 46 additions & 3 deletions backend/connectors/wqp/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@
DT_MEASURED,
EARLIEST,
LATEST,
TDS,
WATERLEVELS,
USGS_PCODE_30210,
USGS_PCODE_70300,
USGS_PCODE_70301,
USGS_PCODE_70303,
)
from backend.connectors.wqp.transformer import (
WQPSiteTransformer,
Expand Down Expand Up @@ -97,7 +103,7 @@ def get_records(self):
else:
# every record with pCode 30210 (depth in m) has a corresponding
# record with pCode 72019 (depth in ft) but not vice versa
params["pCode"] = "30210"
params["pCode"] = USGS_PCODE_30210

params.update(get_date_range(config))

Expand Down Expand Up @@ -130,7 +136,44 @@ def _extract_source_parameter_results(self, records):
return [ri["ResultMeasureValue"] for ri in records]

def _clean_records(self, records):
    """Drop records with an empty ResultMeasureValue and, for TDS only,
    collapse duplicate records that share an ActivityIdentifier.

    When several USGS pcodes report TDS for the same activity, the record
    kept is chosen by preference order 70300 > 70301 > 70303.

    Raises:
        ValueError: if an activity has multiple TDS records but none of
            the preferred pcodes (70300/70301/70303) is present.
    """
    # Records with a falsy measurement value carry no usable data.
    clean_records = [r for r in records if r["ResultMeasureValue"]]

    # Dedup only applies to TDS, and only when there is more than one
    # surviving record to compare.
    if self.config.parameter != TDS or len(clean_records) <= 1:
        return clean_records

    site_id = clean_records[0]["MonitoringLocationIdentifier"]
    return_records = []
    # Fix: scan the *cleaned* records here — scanning the raw input would
    # reintroduce records whose ResultMeasureValue was empty.
    for activity_identifier in {r["ActivityIdentifier"] for r in clean_records}:
        # All records for this activity, keyed by USGS parameter code.
        ai_records = {
            record["USGSPCode"]: record
            for record in clean_records
            if record["ActivityIdentifier"] == activity_identifier
        }
        if len(ai_records) > 1:
            # Keep the first record found in pcode preference order.
            for pcode in (USGS_PCODE_70300, USGS_PCODE_70301, USGS_PCODE_70303):
                if pcode in ai_records:
                    kept_record = ai_records[pcode]
                    break
            else:
                raise ValueError(
                    f"Multiple TDS records found for {site_id} with ActivityIdentifier {activity_identifier} but no 70300, 70301, or 70303 pcodes found."
                )
            record_date = kept_record["ActivityStartDate"]
            self.log(
                f"Removing duplicates for {site_id} on {record_date} with ActivityIdentifier {activity_identifier}. Keeping record with pcode {pcode}."
            )
        else:
            kept_record = next(iter(ai_records.values()))
        return_records.append(kept_record)
    return return_records

def _extract_source_parameter_units(self, records):
    """Return the reported measurement unit code for each record, in order."""
    return list(map(lambda record: record["ResultMeasure/MeasureUnitCode"], records))
Expand Down Expand Up @@ -160,7 +203,7 @@ def get_records(self, site_record):
}
params.update(get_date_range(self.config))

if config.parameter.lower() != "waterlevels":
if config.parameter.lower() != WATERLEVELS:
params["characteristicName"] = get_analyte_search_param(
config.parameter, WQP_ANALYTE_MAPPING
)
Expand Down
5 changes: 5 additions & 0 deletions backend/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@
SOURCE_PARAMETER_UNITS = "source_parameter_units"
CONVERSION_FACTOR = "conversion_factor"

USGS_PCODE_30210 = "30210"
USGS_PCODE_70300 = "70300"
USGS_PCODE_70301 = "70301"
USGS_PCODE_70303 = "70303"

ANALYTE_OPTIONS = sorted(
[
ARSENIC,
Expand Down
4 changes: 2 additions & 2 deletions frontend/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,11 +187,11 @@ def cli():
)
]

OUTPUT_FORMATS = sorted([value for value in OutputFormat])
OUTPUT_FORMATS = sorted([of for of in OutputFormat])
OUTPUT_FORMAT_OPTIONS = [
click.option(
"--output-format",
type=click.Choice(OUTPUT_FORMATS),
type=click.Choice(OUTPUT_FORMATS, case_sensitive=False),
default="csv",
help=f"Output file format for sites: {OUTPUT_FORMATS}. Default is csv",
)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

setup(
name="nmuwd",
version="0.9.5",
version="0.9.6",
author="Jake Ross",
description="New Mexico Water Data Integration Engine",
long_description=long_description,
Expand Down
140 changes: 74 additions & 66 deletions tests/test_cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,74 +104,82 @@ def _test_weave(
result = self.runner.invoke(weave, arguments, standalone_mode=False)

# Assert
assert result.exit_code == 0

"""
For the config, check that

0. (set output dir to clean up tests results even in event of failure)
1. The parameter is set correctly
2. The agencies are set correctly
3. The output types are set correctly
4. The site limit is set correctly
5. The dry is set correctly
6. The start date is set correctly
7. The end date is set correctly
8. The geographic filter is set correctly
9. The site output type is set correctly
"""
config = result.return_value

# 0
self.output_dir = Path(config.output_path)

# 1
assert getattr(config, "parameter") == parameter

# 2
agency_with_underscore = self.agency.replace("-", "_")
if self.agency_reports_parameter[parameter]:
assert getattr(config, f"use_source_{agency_with_underscore}") is True
else:
assert getattr(config, f"use_source_{agency_with_underscore}") is False

for no_agency in no_agencies:
no_agency_with_underscore = no_agency.replace("--no-", "").replace("-", "_")
assert getattr(config, f"use_source_{no_agency_with_underscore}") is False

# 3
output_types = ["summary", "timeseries_unified", "timeseries_separated"]
for ot in output_types:
if ot == output_type:
assert getattr(config, f"output_{ot}") is True
try:
assert result.exit_code == 0

"""
For the config, check that

0. (set output dir to clean up tests results even in event of failure)
1. The parameter is set correctly
2. The agencies are set correctly
3. The output types are set correctly
4. The site limit is set correctly
5. The dry is set correctly
6. The start date is set correctly
7. The end date is set correctly
8. The geographic filter is set correctly
9. The site output type is set correctly
"""
config = result.return_value

# 0
self.output_dir = Path(config.output_path)

# 1
assert getattr(config, "parameter") == parameter

# 2
agency_with_underscore = self.agency.replace("-", "_")
if self.agency_reports_parameter[parameter]:
assert getattr(config, f"use_source_{agency_with_underscore}") is True
else:
assert getattr(config, f"output_{ot}") is False

# 4
assert getattr(config, "site_limit") == 4

# 5
assert getattr(config, "dry") is True

# 6
assert getattr(config, "start_date") == start_date

# 7
assert getattr(config, "end_date") == end_date

# 8
if geographic_filter_name and geographic_filter_value:
for _geographic_filter_name in ["bbox", "county", "wkt"]:
if _geographic_filter_name == geographic_filter_name:
assert (
getattr(config, _geographic_filter_name)
== geographic_filter_value
)
assert getattr(config, f"use_source_{agency_with_underscore}") is False

for no_agency in no_agencies:
no_agency_with_underscore = no_agency.replace("--no-", "").replace(
"-", "_"
)
assert (
getattr(config, f"use_source_{no_agency_with_underscore}") is False
)

# 3
output_types = ["summary", "timeseries_unified", "timeseries_separated"]
for ot in output_types:
if ot == output_type:
assert getattr(config, f"output_{ot}") is True
else:
assert getattr(config, _geographic_filter_name) == ""

# 9
assert getattr(config, "output_format") == output_format
assert getattr(config, f"output_{ot}") is False

# 4
assert getattr(config, "site_limit") == 4

# 5
assert getattr(config, "dry") is True

# 6
assert getattr(config, "start_date") == start_date

# 7
assert getattr(config, "end_date") == end_date

# 8
if geographic_filter_name and geographic_filter_value:
for _geographic_filter_name in ["bbox", "county", "wkt"]:
if _geographic_filter_name == geographic_filter_name:
assert (
getattr(config, _geographic_filter_name)
== geographic_filter_value
)
else:
assert getattr(config, _geographic_filter_name) == ""

# 9
assert getattr(config, "output_format") == output_format
except Exception as e:
print(result)
assert False

def test_weave_summary(self):
self._test_weave(parameter=WATERLEVELS, output_type="summary")
Expand Down
Loading