From c3f036d60482fec7a6ce5ab0e30d8e93a98403f1 Mon Sep 17 00:00:00 2001
From: Andrew Robbins
Date: Fri, 20 Dec 2024 00:57:15 -0500
Subject: [PATCH 1/7] use direct zarr sink

---
 ALLCools/count_matrix/dataset.py | 155 +++++++++++++++++++------------
 1 file changed, 95 insertions(+), 60 deletions(-)

diff --git a/ALLCools/count_matrix/dataset.py b/ALLCools/count_matrix/dataset.py
index 8106811..6df8a55 100644
--- a/ALLCools/count_matrix/dataset.py
+++ b/ALLCools/count_matrix/dataset.py
@@ -4,13 +4,17 @@
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from functools import lru_cache
 from shutil import rmtree
+import tempfile
+
 import numpy as np
 import pandas as pd
 import pybedtools
 import pysam
 import xarray as xr
+from numcodecs import blosc
 from scipy import stats
+import zarr, zarr.creation, zarr.convenience, zarr.hierarchy, zarr.storage
 
 from ALLCools.utilities import parse_chrom_size, parse_mc_pattern
 
@@ -68,11 +72,9 @@ def summary(self):
     return mc_type_data
 
 
-def _determine_datasets(regions, quantifiers, chrom_size_path, tmp_dir):
+def _determine_datasets(regions, quantifiers, chrom_size_path):
     """Determine datasets for each region."""
-    tmp_dir = pathlib.Path(tmp_dir).absolute()
-    tmp_dir.mkdir(exist_ok=True, parents=True)
-
+    tmpdir = tempfile.mkdtemp()
     chrom_sizes = parse_chrom_size(chrom_size_path)
     datasets = {}
     for pair in regions:
@@ -122,7 +124,7 @@ def _id(i, c=chrom):
         except ValueError:
             raise ValueError(f"Can not understand region specification {region_path}")
 
-        region_path = f"{tmp_dir}/{name}.regions.csv"
+        region_path = f"{tmpdir}/{name}.regions.csv"
         region_bed_df.to_csv(region_path)
 
         datasets[name] = {"regions": region_path, "quant": []}
@@ -152,7 +154,7 @@ def _id(i, c=chrom):
         if quant_type not in ALLOW_QUANT_TYPES:
             raise ValueError(f"QUANT_TYPE need to be in {ALLOW_QUANT_TYPES}, got {quant_type} in {quantifier}.")
         datasets[name]["quant"].append(_Quant(mc_types=mc_types, quant_type=quant_type, kwargs=kwargs))
-    return datasets
+    return datasets, tmpdir
 
 
 def _count_single_region_set(allc_table, region_config, obs_dim, region_dim):
@@ -183,7 +185,7 @@ def _count_single_region_set(allc_table, region_config, obs_dim, region_dim):
         data = xr.DataArray(
             np.array([sample_data]),
             coords=[[sample], region_ids, total_mc_types, ["mc", "cov"]],
-            dims=[obs_dim, region_dim, "mc_type", "count_type"],
+            dims=[obs_dim, region_dim, "mc_type", "count_type"]
         )
         total_data.append(data)
     total_data = xr.Dataset({f"{region_dim}_da": xr.concat(total_data, dim=obs_dim)})
@@ -208,7 +210,7 @@ def _calculate_pv(data, reverse_value, obs_dim, var_dim, cutoff=0.9):
     return pv
 
 
 def _count_single_zarr(
-    allc_table, region_config, obs_dim, region_dim, output_path, obs_dim_dtype, count_dtype="uint32"
+    allc_table, region_config, obs_dim, region_dim, chunk_start, regiongroup, count_dtype="uint32"
 ):
     """Process single region set and its quantifiers."""
     # count all ALLC and mC types that's needed for quantifiers if this region_dim
@@ -216,7 +218,6 @@ def _count_single_zarr(
         allc_table=allc_table, region_config=region_config, obs_dim=obs_dim, region_dim=region_dim
     )
 
-    total_ds = {}
     # deal with count quantifiers
     count_mc_types = []
     for quant in region_config["quant"]:
@@ -227,8 +228,8 @@ def _count_single_zarr(
     count_da = count_ds.sel(mc_type=count_mc_types)[f"{region_dim}_da"]
     max_int = np.iinfo(count_dtype).max
     count_da = xr.where(count_da > max_int, max_int, count_da)
-    total_ds[f"{region_dim}_da"] = count_da.astype(count_dtype)
-
+    regiongroup[f"{region_dim}_da"][
+        chunk_start : chunk_start + allc_table.index.size, :, :, :] = count_da.astype(count_dtype).data
     # deal with hypo-score, hyper-score quantifiers
     for quant in region_config["quant"]:
         if quant.quant_type == "hypo-score":
@@ -240,7 +241,9 @@ def _count_single_zarr(
                     var_dim=region_dim,
                     **quant.kwargs,
                 )
-                total_ds[f"{region_dim}_da_{mc_type}-hypo-score"] = data
+                regiongroup[f"{region_dim}_da_{mc_type}-hypo-score"][
+                    chunk_start : chunk_start + allc_table.index.size, :
+                ] = data.data
         elif quant.quant_type == "hyper-score":
             for mc_type in quant.mc_types:
                 data = _calculate_pv(
@@ -250,11 +253,9 @@ def _count_single_zarr(
                     var_dim=region_dim,
                     **quant.kwargs,
                 )
-                total_ds[f"{region_dim}_da_{mc_type}-hyper-score"] = data
-    total_ds = xr.Dataset(total_ds)
-    total_ds.coords[obs_dim] = total_ds.coords[obs_dim].astype(obs_dim_dtype)
-    total_ds.to_zarr(output_path, mode="w")
-    return output_path
+                regiongroup[f"{region_dim}_da_{mc_type}-hyper-score"][chunk_start : chunk_start + allc_table.index.size, :] = data.data
+
+    return True
 
 
 @doc_params(
@@ -302,7 +303,6 @@ def generate_dataset(
 
     # determine index length and str dtype
     max_length = allc_table.index.map(lambda idx: len(idx)).max()
-    obs_dim_dtype = f"<U{max_length}"

+        if len(count_mc_types) > 0:
+            DA = regiongroup.empty(
+                name=f"{region_dim}_da",
+                shape=(n_sample, region_size, len(count_mc_types), 2),
+                chunks=(chunk_size, region_size, len(count_mc_types), 2),
+                dtype="uint32"
+            )
+            DA.attrs['_ARRAY_DIMENSIONS']=[obs_dim, region_dim, "mc_type", "count_type"]
+            count = regiongroup.array(
+                name="count_type",
+                data=(["mc", "cov"]),
+                dtype="
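
The substance of this patch is the sink change: instead of each worker building an in-memory xarray Dataset that is concatenated and written at the end, generate_dataset pre-allocates one zarr array per region set and each worker assigns its finished chunk into a slice of it. A minimal sketch of that pattern against the zarr-python 2.x API the code uses (store path, group name, and shapes here are toy values, not the ones generate_dataset derives):

    import numpy as np
    import zarr

    store = zarr.storage.DirectoryStore("example.mcds")
    root = zarr.hierarchy.group(store=store, overwrite=True)
    group = root.create_group("chrom100k")

    n_sample, n_region, chunk_size = 8, 100, 4
    da = group.empty(
        name="chrom100k_da",
        shape=(n_sample, n_region, 2, 2),
        chunks=(chunk_size, n_region, 2, 2),
        dtype="uint32",
    )
    # xarray's zarr convention: name the dimensions so the store opens as a Dataset
    da.attrs["_ARRAY_DIMENSIONS"] = ["cell", "chrom100k", "mc_type", "count_type"]

    # each worker writes its rows straight into the store instead of
    # returning a Dataset to be merged and written in one shot
    for chunk_start in range(0, n_sample, chunk_size):
        counts = np.random.randint(0, 50, (chunk_size, n_region, 2, 2), dtype="uint32")
        da[chunk_start : chunk_start + chunk_size] = counts

Chunking the array along the sample axis at the same chunk_size the workers use means concurrent writers never touch the same chunk, which is what makes the direct sink safe without locking.
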
Date: Sat, 21 Dec 2024 00:06:38 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ALLCools/count_matrix/dataset.py      | 74 +++++++++----------
 docs/CONDUCT.md                       | 20 ++---
 docs/CONTRIBUTING.md                  | 14 ++--
 docs/README.md                        | 12 +--
 .../basic/intro_basic_clustering.md   | 12 +--
 docs/allcools/intro.md                | 16 ++--
 docs/allcools/start/analysis_steps.md | 22 +++---
 docs/allcools/start/installation.md   |  4 +-
 8 files changed, 83 insertions(+), 91 deletions(-)

diff --git a/ALLCools/count_matrix/dataset.py b/ALLCools/count_matrix/dataset.py
index 6df8a55..5f89056 100644
--- a/ALLCools/count_matrix/dataset.py
+++ b/ALLCools/count_matrix/dataset.py
@@ -1,20 +1,22 @@
 import pathlib
 import subprocess
+import tempfile
 from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from functools import lru_cache
-from shutil import rmtree
-import tempfile
-
 import numpy as np
 import pandas as pd
 import pybedtools
 import pysam
 import xarray as xr
+import zarr
+import zarr.convenience
+import zarr.creation
+import zarr.hierarchy
+import zarr.storage
 from numcodecs import blosc
 from scipy import stats
-import zarr, zarr.creation, zarr.convenience, zarr.hierarchy, zarr.storage
 
 from ALLCools.utilities import parse_chrom_size, parse_mc_pattern
 
@@ -185,7 +187,7 @@ def _count_single_region_set(allc_table, region_config, obs_dim, region_dim):
         data = xr.DataArray(
             np.array([sample_data]),
             coords=[[sample], region_ids, total_mc_types, ["mc", "cov"]],
-            dims=[obs_dim, region_dim, "mc_type", "count_type"]
+            dims=[obs_dim, region_dim, "mc_type", "count_type"],
         )
         total_data.append(data)
     total_data = xr.Dataset({f"{region_dim}_da": xr.concat(total_data, dim=obs_dim)})
@@ -209,9 +211,7 @@ def _calculate_pv(data, reverse_value, obs_dim, var_dim, cutoff=0.9):
     return pv
 
 
-def _count_single_zarr(
-    allc_table, region_config, obs_dim, region_dim, chunk_start, regiongroup, count_dtype="uint32"
-):
+def _count_single_zarr(allc_table, region_config, obs_dim, region_dim, chunk_start, regiongroup, count_dtype="uint32"):
     """Process single region set and its quantifiers."""
     # count all ALLC and mC types that's needed for quantifiers if this region_dim
     count_ds = _count_single_region_set(
@@ -228,8 +228,9 @@ def _count_single_zarr(
     count_da = count_ds.sel(mc_type=count_mc_types)[f"{region_dim}_da"]
     max_int = np.iinfo(count_dtype).max
     count_da = xr.where(count_da > max_int, max_int, count_da)
-    regiongroup[f"{region_dim}_da"][
-        chunk_start : chunk_start + allc_table.index.size, :, :, :] = count_da.astype(count_dtype).data
+    regiongroup[f"{region_dim}_da"][chunk_start : chunk_start + allc_table.index.size, :, :, :] = count_da.astype(
+        count_dtype
+    ).data
     # deal with hypo-score, hyper-score quantifiers
     for quant in region_config["quant"]:
         if quant.quant_type == "hypo-score":
@@ -253,7 +254,9 @@ def _count_single_zarr(
                     var_dim=region_dim,
                     **quant.kwargs,
                 )
-                regiongroup[f"{region_dim}_da_{mc_type}-hyper-score"][chunk_start : chunk_start + allc_table.index.size, :] = data.data
+                regiongroup[f"{region_dim}_da_{mc_type}-hyper-score"][
+                    chunk_start : chunk_start + allc_table.index.size, :
+                ] = data.data
 
     return True
 
@@ -312,7 +315,7 @@ def generate_dataset(
     # prepare regions and determine quantifiers
     pathlib.Path(output_path).mkdir(exist_ok=True)
     z = zarr.storage.DirectoryStore(path=output_path)
-    root = zarr.hierarchy.group(store = z, overwrite = True)
+    root = zarr.hierarchy.group(store=z, overwrite=True)
     datasets, tmpdir = _determine_datasets(regions, quantifiers, chrom_size_path)
     # copy chrom_size_path to output_path
     subprocess.run(["cp", "-f", chrom_size_path, f"{output_path}/chrom_sizes.txt"], check=True)
@@ -324,12 +327,9 @@ def generate_dataset(
         bed.index.name = region_dim
         region_size = bed.index.size
         dsobs = regiongroup.array(
-            name=obs_dim,
-            data=allc_table.index.values,
-            chunks=(chunk_size),
-            dtype=f"
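
Aside from the import reordering and line wrapping, nothing functional changes here, but the hunks make the write path easier to audit: every slice assignment targets an array that already carries an _ARRAY_DIMENSIONS attribute, the convention xarray uses to map plain zarr arrays onto named dimensions. Assuming a store written like the sketch after patch 1, reading it back needs nothing ALLCools-specific:

    import xarray as xr

    # consolidated=False: metadata is only consolidated at the very end of generate_dataset
    ds = xr.open_zarr("example.mcds/chrom100k", consolidated=False)
    print(ds["chrom100k_da"].dims)  # ('cell', 'chrom100k', 'mc_type', 'count_type')
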
Date: Wed, 8 Jan 2025 14:29:57 -0500
Subject: [PATCH 3/7] add dependency on zarr under version 3

---
 environment.yml | 2 +-
 pyproject.toml  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index a625dd1..0be6010 100644
--- a/environment.yml
+++ b/environment.yml
@@ -30,7 +30,7 @@ dependencies:
   - statsmodels
   - xarray
   - yaml
-  - zarr
+  - zarr < 3
   - pip:
       - papermill
       - imblearn
diff --git a/pyproject.toml b/pyproject.toml
index ee85625..f7e4da1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     'seaborn',
     "xarray",
     "pyyaml",
+    "zarr < 3"
 ]
 
 [project.optional-dependencies]
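
The pin is load-bearing: the dataset code imports the zarr-python 2.x module layout (zarr.hierarchy, zarr.convenience, zarr.storage.DirectoryStore), and that layout was reorganized in zarr-python 3, so the imports fail outright under v3. A defensive check along these lines (hypothetical, not part of the patch; assumes the packaging library is available) makes the failure mode the constraint prevents explicit:

    from packaging.version import Version

    import zarr

    if Version(zarr.__version__) >= Version("3"):
        # zarr.hierarchy and zarr.convenience no longer exist under v3
        raise ImportError(f"ALLCools requires zarr < 3, found {zarr.__version__}")
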
From e9fcd17789923ce6247ce2a0e37227cadd4e2ffc Mon Sep 17 00:00:00 2001
From: Andrew Robbins
Date: Sun, 16 Feb 2025 00:10:55 -0500
Subject: [PATCH 4/7] maybe fix lack of obs_dim

---
 ALLCools/count_matrix/dataset.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ALLCools/count_matrix/dataset.py b/ALLCools/count_matrix/dataset.py
index 5f89056..f920191 100644
--- a/ALLCools/count_matrix/dataset.py
+++ b/ALLCools/count_matrix/dataset.py
@@ -211,7 +211,7 @@ def _calculate_pv(data, reverse_value, obs_dim, var_dim, cutoff=0.9):
     return pv
 
 
-def _count_single_zarr(allc_table, region_config, obs_dim, region_dim, chunk_start, regiongroup, count_dtype="uint32"):
+def _count_single_zarr(allc_table, region_config, obs_dim, obs_dim_dtype, region_dim, chunk_start, regiongroup, count_dtype="uint32"):
     """Process single region set and its quantifiers."""
     # count all ALLC and mC types that's needed for quantifiers if this region_dim
     count_ds = _count_single_region_set(
@@ -257,7 +257,7 @@ def _count_single_zarr(allc_table, region_config, obs_dim, region_dim, chunk_sta
             regiongroup[f"{region_dim}_da_{mc_type}-hyper-score"][
                 chunk_start : chunk_start + allc_table.index.size, :
             ] = data.data
-
+    regiongroup[obs_dim] = count_ds.coords[obs_dim].astype(obs_dim_dtype)
     return True
 
 
@@ -306,6 +306,7 @@ def generate_dataset(
 
     # determine index length and str dtype
     max_length = allc_table.index.map(lambda idx: len(idx)).max()
+    obs_dim_dtype = f"<U{max_length}"

Date: Sun, 16 Feb 2025 13:54:22 +0000
Subject: [PATCH 5/7] fix lack of obs_dim

---
 ALLCools/count_matrix/dataset.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/ALLCools/count_matrix/dataset.py b/ALLCools/count_matrix/dataset.py
index f920191..697df3c 100644
--- a/ALLCools/count_matrix/dataset.py
+++ b/ALLCools/count_matrix/dataset.py
@@ -211,7 +211,9 @@ def _calculate_pv(data, reverse_value, obs_dim, var_dim, cutoff=0.9):
     return pv
 
 
-def _count_single_zarr(allc_table, region_config, obs_dim, obs_dim_dtype, region_dim, chunk_start, regiongroup, count_dtype="uint32"):
+def _count_single_zarr(
+    allc_table, region_config, obs_dim, obs_dim_dtype, region_dim, chunk_start, regiongroup, count_dtype="uint32"
+):
     """Process single region set and its quantifiers."""
     # count all ALLC and mC types that's needed for quantifiers if this region_dim
     count_ds = _count_single_region_set(
@@ -257,7 +259,9 @@ def _count_single_zarr(allc_table, region_config, obs_dim, obs_dim_dtype, region
             regiongroup[f"{region_dim}_da_{mc_type}-hyper-score"][
                 chunk_start : chunk_start + allc_table.index.size, :
             ] = data.data
-    regiongroup[obs_dim] = count_ds.coords[obs_dim].astype(obs_dim_dtype)
+    regiongroup[obs_dim][chunk_start : chunk_start + allc_table.index.size] = (
+        count_ds.coords[obs_dim].astype(obs_dim_dtype).data
+    )
     return True
 
 
@@ -327,10 +331,6 @@ def generate_dataset(
         bed.columns = [f"{region_dim}_chrom", f"{region_dim}_start", f"{region_dim}_end"]
         bed.index.name = region_dim
         region_size = bed.index.size
-        dsobs = regiongroup.array(
-            name=obs_dim, data=allc_table.index.values, chunks=(chunk_size), dtype=f"

Date: Sun, 16 Feb 2025 21:23:27 -0500
Subject: [PATCH 6/7] properly iterate over datasets

---
 ALLCools/count_matrix/dataset.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ALLCools/count_matrix/dataset.py b/ALLCools/count_matrix/dataset.py
index 697df3c..b3c6c9c 100644
--- a/ALLCools/count_matrix/dataset.py
+++ b/ALLCools/count_matrix/dataset.py
@@ -324,8 +324,10 @@ def generate_dataset(
     datasets, tmpdir = _determine_datasets(regions, quantifiers, chrom_size_path)
     # copy chrom_size_path to output_path
     subprocess.run(["cp", "-f", chrom_size_path, f"{output_path}/chrom_sizes.txt"], check=True)
+    rgs = {}
     for region_dim, region_config in datasets.items():
         regiongroup = root.create_group(region_dim)
+        rgs[region_dim] = regiongroup
         # save region coords to the ds
         bed = pd.read_csv(f"{tmpdir}/{region_dim}.regions.csv", index_col=0)
         bed.columns = [f"{region_dim}_chrom", f"{region_dim}_start", f"{region_dim}_end"]
         bed.index.name = region_dim
         region_size = bed.index.size
@@ -397,10 +399,9 @@ def generate_dataset(
                     obs_dim_dtype=obs_dim_dtype,
                     region_dim=region_dim,
                     chunk_start=chunk_start,
-                    regiongroup=regiongroup,
+                    regiongroup=rgs[region_dim],
                 )
                 futures[f] = (region_dim, i)
-
     for f in as_completed(futures):
         region_dim, i = futures[f]
         print(f"Chunk {i} of {region_dim} returned")
@@ -415,5 +416,6 @@ def generate_dataset(
             "ds_sample_dim": {region_dim: obs_dim for region_dim in datasets.keys()},
         },
     )
-    zarr.convenience.consolidate_metadata(z)
+    for region_dim in datasets.keys():
+        zarr.convenience.consolidate_metadata(f"{output_path}/{region_dim}")
     return output_path
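
Consolidating each region group instead of the root store matches how the output is consumed: one region set opened at a time. Under zarr 2.x, consolidate_metadata folds every .zarray/.zattrs file in the group into a single .zmetadata key, so opening costs one read rather than one per array. A sketch of both sides of that contract, with an illustrative path and region name:

    import xarray as xr
    import zarr

    # what the new loop does once per region group at the end of generate_dataset
    zarr.convenience.consolidate_metadata("output.mcds/chrom100k")

    # what a consumer does afterwards
    ds = xr.open_zarr("output.mcds/chrom100k", consolidated=True)
    print(ds)
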
From a983c2cb2a87f39a671e40da75df8ca4ea1f2656 Mon Sep 17 00:00:00 2001
From: Andrew Robbins
Date: Fri, 7 Mar 2025 08:13:50 -0500
Subject: [PATCH 7/7] iterate over ALL counts for every region

---
 ALLCools/count_matrix/dataset.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/ALLCools/count_matrix/dataset.py b/ALLCools/count_matrix/dataset.py
index e79a2d3..6f2f701 100644
--- a/ALLCools/count_matrix/dataset.py
+++ b/ALLCools/count_matrix/dataset.py
@@ -98,7 +98,7 @@ def _determine_datasets(regions, quantifiers, chrom_size_path):
                 "do not have index in its fourth column, adding it automatically. "
                 "If this is not desired, add a fourth column containing UNIQUE IDs to the BED file.",
             )
-            region_bed_df[name] = [f"{name}_{i}" for i in range(region_bed_df.shape[0])]
+            region_bed_df[name] = (f"{name}_{i}" for i in range(region_bed_df.shape[0]))
         # check if name is unique()
         if region_bed_df.iloc[:, 3].duplicated().sum() > 0:
             raise ValueError(f"Region IDs in {region_path} (fourth column) are not unique.")
@@ -219,7 +219,6 @@ def _count_single_zarr(
     count_ds = _count_single_region_set(
         allc_table=allc_table, region_config=region_config, obs_dim=obs_dim, region_dim=region_dim
     )
-
     # deal with count quantifiers
     count_mc_types = []
     for quant in region_config["quant"]:
@@ -326,8 +325,7 @@ def generate_dataset(
     subprocess.run(["cp", "-f", chrom_size_path, f"{output_path}/chrom_sizes.txt"], check=True)
     rgs = {}
     for region_dim, region_config in datasets.items():
-        regiongroup = root.create_group(region_dim)
-        rgs[region_dim] = regiongroup
+        rgs[region_dim] = root.create_group(region_dim)
         # save region coords to the ds
         bed = pd.read_csv(f"{tmpdir}/{region_dim}.regions.csv", index_col=0)
         bed.columns = [f"{region_dim}_chrom", f"{region_dim}_start", f"{region_dim}_end"]
         bed.index.name = region_dim
@@ -341,7 +341,7 @@ def generate_dataset(
             if ds.coords[k].dtype == "O":
                 ds.coords[k] = ds.coords[k].astype(str)
         ds.to_zarr(f"{output_path}/{region_dim}", mode="w", consolidated=False)
-        dsobs = regiongroup.empty(
+        dsobs = rgs[region_dim].empty(
             name=obs_dim, shape=allc_table.index.size, chunks=(chunk_size), dtype=f"<U{max_length}"

         if len(count_mc_types) > 0:
-            DA = regiongroup.empty(
+            DA = rgs[region_dim].empty(
                 name=f"{region_dim}_da",
                 shape=(n_sample, region_size, len(count_mc_types), 2),
                 chunks=(chunk_size, region_size, len(count_mc_types), 2),
                 dtype="uint32",
             )
             DA.attrs["_ARRAY_DIMENSIONS"] = [obs_dim, region_dim, "mc_type", "count_type"]
-            count = regiongroup.array(name="count_type", data=(["mc", "cov"]), dtype="
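
Across patches 4, 5, and 7, handling of the sample-name coordinate settles into one scheme: size a fixed-width unicode dtype from the longest index entry, pre-allocate the coordinate array once per region group, and let each worker fill its slice alongside the counts. A self-contained sketch of that scheme (toy names and sizes, not the real generate_dataset values):

    import numpy as np
    import zarr

    cell_names = [f"cell_{i}" for i in range(10)]
    max_length = max(len(name) for name in cell_names)
    obs_dim_dtype = f"<U{max_length}"  # fixed-width little-endian unicode

    root = zarr.group()  # in-memory store, standing in for the DirectoryStore
    chunk_size = 4
    obs = root.empty(name="cell", shape=len(cell_names), chunks=(chunk_size,), dtype=obs_dim_dtype)
    obs.attrs["_ARRAY_DIMENSIONS"] = ["cell"]

    # the same chunk_start slicing _count_single_zarr uses for the data arrays
    for chunk_start in range(0, len(cell_names), chunk_size):
        chunk = cell_names[chunk_start : chunk_start + chunk_size]
        obs[chunk_start : chunk_start + len(chunk)] = np.array(chunk, dtype=obs_dim_dtype)

    print(obs[:])  # all ten names round-trip as numpy unicode

Sizing the dtype up front is what lets the array be allocated before any worker runs; the trade-off is that the longest name must be known before the first chunk is written, which is why generate_dataset computes max_length from the ALLC table index at the start.
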