From d1893ec8ab9b6aef1f1f73acfeafb630f9121e72 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Mon, 19 Jan 2026 03:27:14 -0600 Subject: [PATCH 01/34] add offline sqlite --- utilix/__init__.py | 1 + utilix/mongo_storage.py | 100 +++- utilix/mongo_to_sqlite.py | 937 ++++++++++++++++++++++++++++++++++++++ utilix/rundb.py | 23 +- utilix/sqlite_backend.py | 496 ++++++++++++++++++++ 5 files changed, 1542 insertions(+), 15 deletions(-) create mode 100644 utilix/mongo_to_sqlite.py create mode 100644 utilix/sqlite_backend.py diff --git a/utilix/__init__.py b/utilix/__init__.py index 8f4587b..ac82e81 100644 --- a/utilix/__init__.py +++ b/utilix/__init__.py @@ -15,3 +15,4 @@ from .shell import Shell from .rundb import DB, xent_collection, xe1t_collection from . import mongo_storage +from . import sqlite_backend diff --git a/utilix/mongo_storage.py b/utilix/mongo_storage.py index 8d0de71..04b24e1 100644 --- a/utilix/mongo_storage.py +++ b/utilix/mongo_storage.py @@ -12,7 +12,7 @@ from utilix.rundb import DB, xent_collection from utilix.utils import to_str_tuple from utilix import uconfig, logger - +from utilix.sqlite_backend import OfflineGridFS, _load_sqlite_config class GridFsBase: """Base class for GridFS operations.""" @@ -305,22 +305,66 @@ def __init__(self, *args, **kwargs): return def initialize(self, store_files_at=None, *args, **kwargs): - super().__init__(*args, **kwargs) - - # We are going to set a place where to store the files. It's - # either specified by the user or we use these defaults: + # parse cache dirs (same as you already do) if store_files_at is None: - store_files_at = ( - "./resource_cache", - "/tmp/straxen_resource_cache", - ) - elif not isinstance(store_files_at, (tuple, str, list)): - raise ValueError(f"{store_files_at} should be tuple of paths!") + store_files_at = ("./resource_cache", "/tmp/straxen_resource_cache") elif isinstance(store_files_at, str): store_files_at = to_str_tuple(store_files_at) + elif isinstance(store_files_at, list): + store_files_at = tuple(store_files_at) + elif not isinstance(store_files_at, (tuple, list)): + raise ValueError(f"{store_files_at} should be tuple/list/str of paths!") self.storage_options = store_files_at + # offline? 
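# Illustrative sketch, not part of the diff: the offline switch that follows is
# driven purely by environment variables read by _load_sqlite_config() (added in
# utilix/sqlite_backend.py further down in this patch).  Both sqlite files must
# exist for sqlite_active() to return True.  The paths and config name below are
# placeholders, and the no-argument construction is assumed from the code above.
import os

os.environ["RUNDB_SQLITE_PATH"] = "/path/to/offline_cache/rundb.sqlite"
os.environ["XEDOCS_SQLITE_PATH"] = "/path/to/offline_cache/xedocs.sqlite"

from utilix.mongo_storage import MongoDownloader

downloader = MongoDownloader()  # with both files present, no MongoClient is created
staged = downloader.download_single("some_stored_config")  # placeholder config name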
+ try: + sqlite_cfg = _load_sqlite_config() + sqlite_active = sqlite_cfg.sqlite_active() + except Exception: + sqlite_cfg = None + sqlite_active = False + + if sqlite_active: + self._offline = OfflineGridFS( + sqlite_path=sqlite_cfg.sqlite_path, + offline_root=sqlite_cfg.offline_root, + cache_dirs=tuple(self.storage_options), + gridfs_db_name="files", + ) + # IMPORTANT: do NOT call super().__init__() + return + + # online fallback + super().__init__(*args, **kwargs) + + # ------------------------- + # OFFLINE-safe overrides + # ------------------------- + + def list_files(self) -> List[str]: + if hasattr(self, "_offline"): + return self._offline.list_files() + return super().list_files() + + def config_exists(self, config: str) -> bool: + if hasattr(self, "_offline"): + return self._offline.latest_by_config_name(config) is not None + return super().config_exists(config) + + def md5_stored(self, abs_path: str) -> bool: + # offline mode doesn't have a DB md5 index for arbitrary paths; just behave conservatively + if hasattr(self, "_offline"): + return False + return super().md5_stored(abs_path) + + def test_find(self) -> None: + if hasattr(self, "_offline"): + # simple sanity: must be able to list at least 1 file + _ = self._offline.list_files() + return + return super().test_find() + def download_single(self, config_name: str, human_readable_file_name=False): """Download the config_name if it exists. @@ -331,6 +375,14 @@ def download_single(self, config_name: str, human_readable_file_name=False): :return: str, the absolute path of the file requested """ + + # Offline path (sqlite-backed GridFS index) + if hasattr(self, "_offline"): + return self._offline.download_single( + config_name, + human_readable_file_name=human_readable_file_name, + ) + if self.config_exists(config_name): # Query by name query = self.get_query_config(config_name) @@ -513,6 +565,23 @@ def initialize( self.storage_options: Tuple[str, ...] = store_files_at + # Offline sqlite backend support (reuse utilix.sqlite_backend.OfflineGridFS) + try: + sqlite_cfg = _load_sqlite_config() + sqlite_active = sqlite_cfg.sqlite_active() + except Exception: + sqlite_cfg = None + sqlite_active = False + + if sqlite_active: + self._offline = OfflineGridFS( + sqlite_path=sqlite_cfg.sqlite_path, + offline_root=sqlite_cfg.offline_root, + cache_dirs=tuple(self.storage_options), + gridfs_db_name="files", + ) + + def download_single( self, config_name: str, @@ -520,6 +589,15 @@ def download_single( human_readable_file_name: bool = False, ) -> str: """Download the config_name if it exists.""" + + # Offline path (sqlite-backed GridFS index) + if hasattr(self, "_offline"): + return self._offline.download_single( + config_name, + human_readable_file_name=human_readable_file_name, + write_to=write_to, + ) + target_file_name = ( config_name if human_readable_file_name else self.db.get_file_md5(config_name) ) diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py new file mode 100644 index 0000000..b0d6fe7 --- /dev/null +++ b/utilix/mongo_to_sqlite.py @@ -0,0 +1,937 @@ +#!/usr/bin/env python3 +""" +Dump selected MongoDB collections + GridFS into local SQLite(s). + +NEW: +- xedocs:* is dumped into a separate SQLite file (xedocs.sqlite) with + one table per xedocs collection and useful indexes. +- everything else stays as before (rundb.sqlite with kv_collections + runs_index + gridfs_files). 
+ +Spec file examples: + xenonnt:runs + files:GRIDFS + xedocs:ALL + corrections:ALL +""" + +import argparse +import json +import logging +import os +import sqlite3 +import sys +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple, Any + +import pymongo +from bson import BSON +from bson.objectid import ObjectId + + +# ------------------------- +# Compression helpers +# ------------------------- + +def _compressor(): + try: + import zstandard as zstd # type: ignore + + cctx = zstd.ZstdCompressor(level=10) + dctx = zstd.ZstdDecompressor() + + def compress(b: bytes) -> bytes: + return cctx.compress(b) + + def decompress(b: bytes) -> bytes: + return dctx.decompress(b) + + return "zstd", compress, decompress + except Exception: + import zlib + + def compress(b: bytes) -> bytes: + return zlib.compress(b, level=6) + + def decompress(b: bytes) -> bytes: + return zlib.decompress(b) + + return "zlib", compress, decompress + + +COMP_ALGO, compress_bytes, _ = _compressor() + + +# ------------------------- +# Spec parsing +# ------------------------- + +@dataclass(frozen=True) +class SpecItem: + db: str + what: str # collection name, "ALL", or "GRIDFS" + + +def parse_spec_lines(lines: Iterable[str]) -> List[SpecItem]: + out: List[SpecItem] = [] + for raw in lines: + s = raw.strip() + if not s or s.startswith("#"): + continue + if ":" not in s: + raise ValueError(f"Bad spec line (expected db:thing): {s}") + db, what = s.split(":", 1) + db, what = db.strip(), what.strip() + if not db or not what: + raise ValueError(f"Bad spec line (empty db/thing): {s}") + out.append(SpecItem(db=db, what=what)) + return out + + +# ------------------------- +# Mongo connection (utilix-friendly) +# ------------------------- + +def get_utilix_mongo_uri(experiment: str) -> str: + """ + Mirrors utilix._collection style: + mongodb://{user}:{password}@{url} + """ + from utilix import uconfig # type: ignore + + if experiment not in ("xent", "xe1t"): + raise ValueError("experiment must be 'xent' or 'xe1t'") + + url = uconfig.get("RunDB", f"{experiment}_url") + user = uconfig.get("RunDB", f"{experiment}_user") + password = uconfig.get("RunDB", f"{experiment}_password") + + force_single_server = uconfig.get("RunDB", "force_single_server", fallback=True) + if force_single_server: + url = url.split(",")[-1] + + return f"mongodb://{user}:{password}@{url}" + + +def get_mongo_client(experiment: str, uri_override: Optional[str] = None) -> pymongo.MongoClient: + uri = uri_override or get_utilix_mongo_uri(experiment) + + kwargs: Dict[str, object] = { + "serverSelectionTimeoutMS": 30_000, + "connectTimeoutMS": 30_000, + "socketTimeoutMS": 60_000, + "retryWrites": False, + "readPreference": "secondaryPreferred", + } + if int(pymongo.__version__.split(".")[0]) >= 4: + kwargs["directConnection"] = True + + return pymongo.MongoClient(uri, **kwargs) + + +# ------------------------- +# SQLite schema (rundb.sqlite) +# ------------------------- + +SCHEMA_SQL_RUNDB = """ +PRAGMA journal_mode = WAL; +PRAGMA synchronous = NORMAL; +PRAGMA temp_store = MEMORY; + +CREATE TABLE IF NOT EXISTS kv_collections ( + db_name TEXT NOT NULL, + coll_name TEXT NOT NULL, + doc_id TEXT NOT NULL, + doc_bson_z BLOB NOT NULL, + PRIMARY KEY (db_name, coll_name, doc_id) +); + +CREATE TABLE IF NOT EXISTS runs_index ( + db_name TEXT NOT NULL, + doc_id TEXT NOT NULL, + number INTEGER, + name TEXT, + start INTEGER, + end INTEGER, + tags_json TEXT, + PRIMARY KEY (db_name, doc_id) +); + +CREATE INDEX IF 
NOT EXISTS idx_runs_number ON runs_index(db_name, number); +CREATE INDEX IF NOT EXISTS idx_runs_name ON runs_index(db_name, name); +CREATE INDEX IF NOT EXISTS idx_runs_start ON runs_index(db_name, start); + +CREATE TABLE IF NOT EXISTS gridfs_files ( + db_name TEXT NOT NULL, + file_id TEXT NOT NULL, + filename TEXT, + config_name TEXT, + length INTEGER, + chunkSize INTEGER, + uploadDate INTEGER, + md5 TEXT, + metadata_json TEXT, + logical_name TEXT, + blob_path TEXT NOT NULL, + PRIMARY KEY (db_name, file_id) +); + +CREATE INDEX IF NOT EXISTS idx_gridfs_filename ON gridfs_files(db_name, filename); +CREATE INDEX IF NOT EXISTS idx_gridfs_configname ON gridfs_files(db_name, config_name); +""" + + +# ------------------------- +# SQLite schema (xedocs.sqlite) +# ------------------------- + + +def _schema_sql_xedocs_table(table: str, extra_label_cols: List[str]) -> str: + """ + Create one table per xedocs collection. + + We keep a stable set of "core" columns (id/version/time/value/full doc), and + *also* create additional TEXT columns for any label fields we discover from + sampling documents in that collection. + + Note: extra label columns are quoted to tolerate odd names. + """ + + def q(name: str) -> str: + return '"' + name.replace('"', '""') + '"' + + # Core columns + cols = [ + f"{q('_id')} TEXT PRIMARY KEY", + f"{q('version')} TEXT", + f"{q('time_ns')} INTEGER", + f"{q('time_left_ns')} INTEGER", + f"{q('time_right_ns')} INTEGER", + f"{q('created_date_ns')} INTEGER", + f"{q('value_num')} REAL", + f"{q('value_json')} TEXT", + ] + + # Discovered label columns (TEXT) + for c in extra_label_cols: + if c in {'_id', 'version', 'time_ns', 'time_left_ns', 'time_right_ns', 'created_date_ns', 'value_num', 'value_json', 'doc_bson_z'}: + continue + cols.append(f"{q(c)} TEXT") + + # Full original BSON (compressed) + cols.append(f"{q('doc_bson_z')} BLOB NOT NULL") + + # Always-create indexes: + # - time sampled lookup: version + time + # - time interval lookup: version + interval + # - common labels (if present) + index_sql = [ + f"CREATE INDEX IF NOT EXISTS {q('idx_' + table + '_version_time')} ON {q(table)}({q('version')}, {q('time_ns')});", + f"CREATE INDEX IF NOT EXISTS {q('idx_' + table + '_version_interval')} ON {q(table)}({q('version')}, {q('time_left_ns')}, {q('time_right_ns')});", + ] + + # Optional label indexes (keep this small to avoid DB bloat) + preferred = [ + 'algorithm', + 'config_name', + 'detector', + 'source', + 'pmt', + 'gain_model', + ] + + present = set(extra_label_cols) + n_extra = 0 + for lab in preferred: + if lab in present: + index_sql.append( + f"CREATE INDEX IF NOT EXISTS {q('idx_' + table + '_version_' + lab)} ON {q(table)}({q('version')}, {q(lab)});" + ) + n_extra += 1 + if n_extra >= 6: + break + + cols_sql = ",\n ".join(cols) + idx_sql = "\n\n".join(index_sql) + + return f""" +CREATE TABLE IF NOT EXISTS {q(table)} ( + {cols_sql} +); + +{idx_sql} +""" + + +# ------------------------- +# Utilities +# ------------------------- + +def ensure_dir(p: Path) -> None: + p.mkdir(parents=True, exist_ok=True) + + +def now_s() -> float: + return time.time() + + +def oid_to_str(x) -> str: + if isinstance(x, ObjectId): + return str(x) + return str(x) + + +def to_unix_seconds(dtobj) -> Optional[int]: + try: + if dtobj is None: + return None + return int(dtobj.timestamp()) + except Exception: + return None + + +def to_utc_ns(dtobj) -> Optional[int]: + try: + if dtobj is None: + return None + # bson datetime is usually naive but UTC + # treat naive as UTC + if getattr(dtobj, "tzinfo", 
None) is None: + import datetime as dt + dtobj = dtobj.replace(tzinfo=dt.timezone.utc) + return int(dtobj.timestamp() * 1_000_000_000) + except Exception: + return None + + +def bson_pack(doc: dict) -> bytes: + return BSON.encode(doc) + + +def pack_and_compress(doc: dict) -> bytes: + return compress_bytes(bson_pack(doc)) + + +def list_collection_names_safe(db: pymongo.database.Database) -> List[str]: + try: + return db.list_collection_names() + except pymongo.errors.OperationFailure as e: + raise RuntimeError( + f"Not authorized to list collections in DB '{db.name}'. " + f"Use explicit spec lines (db:collection) instead of db:ALL. " + f"Mongo error: {e}" + ) from e + + +# ------------------------- +# Dump logic (generic -> rundb.sqlite kv_collections) +# ------------------------- + +def dump_generic_collection( + mongo_db: pymongo.database.Database, + coll_name: str, + sql: sqlite3.Connection, + out_db_name: str, + batch_size: int, + logger: logging.Logger, + query: Optional[dict] = None, + projection: Optional[dict] = None, +) -> int: + query = query or {} + coll = mongo_db[coll_name] + + logger.info(f"[mongo] dumping {mongo_db.name}.{coll_name} -> rundb.sqlite kv_collections") + t0 = now_s() + + cur = coll.find(query, projection=projection, no_cursor_timeout=True, batch_size=batch_size) + n = 0 + buf: List[Tuple[str, str, str, bytes]] = [] + + insert_sql = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) VALUES (?,?,?,?)" + + for doc in cur: + _id = doc.get("_id") + doc_id = oid_to_str(_id) if _id is not None else f"noid:{n}" + blob = pack_and_compress(doc) + buf.append((out_db_name, coll_name, doc_id, blob)) + n += 1 + + if len(buf) >= batch_size: + sql.executemany(insert_sql, buf) + sql.commit() + buf.clear() + + if buf: + sql.executemany(insert_sql, buf) + sql.commit() + + dt = now_s() - t0 + logger.info(f"[mongo] done {mongo_db.name}.{coll_name}: {n} docs in {dt:.1f}s") + return n + + +def dump_xenonnt_runs_index( + mongo_db: pymongo.database.Database, + runs_coll_name: str, + sql: sqlite3.Connection, + out_db_name: str, + batch_size: int, + logger: logging.Logger, + drop_fields: Optional[List[str]] = None, +) -> int: + drop_fields = drop_fields or [] + coll = mongo_db[runs_coll_name] + + logger.info(f"[mongo] dumping runs {mongo_db.name}.{runs_coll_name} with index + compression") + t0 = now_s() + + cur = coll.find({}, no_cursor_timeout=True, batch_size=batch_size) + n = 0 + buf_kv: List[Tuple[str, str, str, bytes]] = [] + buf_idx: List[Tuple[str, str, Optional[int], Optional[str], Optional[int], Optional[int], Optional[str]]] = [] + + ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) VALUES (?,?,?,?)" + ins_idx = """ + INSERT OR REPLACE INTO runs_index(db_name, doc_id, number, name, start, end, tags_json) + VALUES (?,?,?,?,?,?,?) 
+ """ + + for doc in cur: + _id = doc.get("_id") + doc_id = oid_to_str(_id) if _id is not None else f"noid:{n}" + + number = doc.get("number") or doc.get("run_number") or doc.get("runNumber") + try: + number_i = int(number) if number is not None else None + except Exception: + number_i = None + + name = doc.get("name") or doc.get("run_name") or doc.get("runName") + + start = doc.get("start") or doc.get("start_time") or doc.get("startTime") or doc.get("starttime") + end = doc.get("end") or doc.get("end_time") or doc.get("endTime") or doc.get("endtime") + + start_u = to_unix_seconds(start) + end_u = to_unix_seconds(end) + + tags = doc.get("tags") + tags_json = None + try: + if tags is not None: + tags_json = json.dumps(tags, default=str) + except Exception: + tags_json = None + + if drop_fields: + doc = dict(doc) + for k in drop_fields: + doc.pop(k, None) + + blob = pack_and_compress(doc) + + buf_kv.append((out_db_name, runs_coll_name, doc_id, blob)) + buf_idx.append((out_db_name, doc_id, number_i, str(name) if name is not None else None, start_u, end_u, tags_json)) + n += 1 + + if len(buf_kv) >= batch_size: + sql.executemany(ins_kv, buf_kv) + sql.executemany(ins_idx, buf_idx) + sql.commit() + buf_kv.clear() + buf_idx.clear() + + if buf_kv: + sql.executemany(ins_kv, buf_kv) + sql.executemany(ins_idx, buf_idx) + sql.commit() + + dt = now_s() - t0 + logger.info(f"[mongo] done runs {mongo_db.name}.{runs_coll_name}: {n} docs in {dt:.1f}s") + return n + + +def dump_gridfs_db( + mongo_db: pymongo.database.Database, + sql: sqlite3.Connection, + out_root: Path, + logger: logging.Logger, + batch_size: int, + only_configs: Optional[List[str]] = None, +) -> int: + import json as _json + + files_coll = mongo_db["fs.files"] + chunks_coll = mongo_db["fs.chunks"] + + out_dir = out_root / "gridfs" / mongo_db.name / "blobs" + ensure_dir(out_dir) + + query = {} + if only_configs: + query = {"config_name": {"$in": only_configs}} + + logger.info(f"[gridfs] dumping GridFS from DB '{mongo_db.name}' to {out_dir}") + t0 = now_s() + + cursor = files_coll.find(query, no_cursor_timeout=True).sort("uploadDate", 1) + + n = 0 + buf: List[Tuple] = [] + + ins = """ + INSERT OR REPLACE INTO gridfs_files( + db_name, file_id, filename, config_name, length, chunkSize, uploadDate, md5, + metadata_json, logical_name, blob_path + ) + VALUES (?,?,?,?,?,?,?,?,?,?,?) 
+ """ + + for fdoc in cursor: + file_id = fdoc["_id"] + file_id_s = oid_to_str(file_id) + + filename = fdoc.get("filename") + config_name = fdoc.get("config_name") or fdoc.get("name") or fdoc.get("config") + + length = int(fdoc.get("length", 0)) + chunk_size = int(fdoc.get("chunkSize", 255 * 1024)) + upload_u = to_unix_seconds(fdoc.get("uploadDate")) + md5 = fdoc.get("md5") + + meta = fdoc.get("metadata") + metadata_json = _json.dumps(meta, default=str) if meta is not None else None + + logical_name = ( + config_name + or filename + or (meta.get("filename") if isinstance(meta, dict) else None) + or (meta.get("name") if isinstance(meta, dict) else None) + or None + ) + + display = (logical_name or "NO_NAME").replace("/", "_") + blob_name = f"{file_id_s}__{display}" + blob_rel = str(Path("gridfs") / mongo_db.name / "blobs" / blob_name) + blob_abs = out_root / blob_rel + + if not blob_abs.exists() or blob_abs.stat().st_size != length: + tmp_path = blob_abs.with_suffix(blob_abs.suffix + ".tmp") + ensure_dir(tmp_path.parent) + + with tmp_path.open("wb") as out_f: + expected_n = 0 + ch_cur = chunks_coll.find({"files_id": file_id}, no_cursor_timeout=True).sort("n", 1) + wrote = 0 + for ch in ch_cur: + n_chunk = int(ch["n"]) + if n_chunk != expected_n: + raise RuntimeError( + f"[gridfs] Missing chunk for file_id={file_id_s}: " + f"expected n={expected_n}, got n={n_chunk}" + ) + out_f.write(bytes(ch["data"])) + wrote += len(ch["data"]) + expected_n += 1 + + if wrote > length: + out_f.flush() + out_f.seek(length) + out_f.truncate() + + tmp_path.replace(blob_abs) + + buf.append( + ( + mongo_db.name, + file_id_s, + filename, + config_name, + length, + chunk_size, + upload_u, + md5, + metadata_json, + logical_name, + blob_rel, + ) + ) + n += 1 + + if len(buf) >= batch_size: + sql.executemany(ins, buf) + sql.commit() + buf.clear() + + if buf: + sql.executemany(ins, buf) + sql.commit() + + dt = now_s() - t0 + logger.info(f"[gridfs] done '{mongo_db.name}': {n} files in {dt:.1f}s") + return n + + +# ------------------------- +# Dump logic (xedocs -> xedocs.sqlite tables) +# ------------------------- + + +def _xedocs_extract(doc: dict, label_cols: List[str]) -> Dict[str, Any]: + """Extract core xedocs fields + discovered label columns.""" + + out: Dict[str, Any] = {} + + out["_id"] = oid_to_str(doc.get("_id")) + out["version"] = doc.get("version") + + created_date = doc.get("created_date") or doc.get("createdDate") + out["created_date_ns"] = to_utc_ns(created_date) + + # time handling + out["time_ns"] = None + out["time_left_ns"] = None + out["time_right_ns"] = None + + t = doc.get("time") + if t is not None: + if isinstance(t, dict) and ("left" in t or "right" in t): + out["time_left_ns"] = to_utc_ns(t.get("left")) + out["time_right_ns"] = to_utc_ns(t.get("right")) + else: + out["time_ns"] = to_utc_ns(t) + + # value columns + v = doc.get("value", None) + out["value_num"] = None + try: + if isinstance(v, (int, float)) and not isinstance(v, bool): + out["value_num"] = float(v) + except Exception: + pass + + try: + out["value_json"] = json.dumps(v, default=str) + except Exception: + out["value_json"] = None + + # discovered labels (TEXT) + for k in label_cols: + if k in ("_id", "version", "time", "created_date", "createdDate", "value", "comments", "reviews"): + continue + val = doc.get(k, None) + if val is None: + out[k] = None + continue + # Keep labels reasonably queryable: store simple types as strings, + # otherwise JSON-encode. 
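# Illustrative sketch, not part of the diff: the normalisation rule described in
# the comment above, as a standalone stdlib-only function.  The helper name and
# sample values are made up for the example.
import json

def _normalise_label_example(val):
    if isinstance(val, (str, int, float, bool)):
        return val if isinstance(val, str) else str(val)
    try:
        return json.dumps(val, default=str)
    except Exception:
        return str(val)

assert _normalise_label_example("tpc") == "tpc"
assert _normalise_label_example(12) == "12"
assert _normalise_label_example({"pmt": 12}) == '{"pmt": 12}'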
+ if isinstance(val, (str, int, float, bool)): + out[k] = str(val) if not isinstance(val, str) else val + else: + try: + out[k] = json.dumps(val, default=str) + except Exception: + out[k] = str(val) + + out["doc_bson_z"] = pack_and_compress(doc) + return out + + + +def dump_xedocs_collection_to_tables( + mongo_db: pymongo.database.Database, + coll_name: str, + sql_x: sqlite3.Connection, + batch_size: int, + logger: logging.Logger, + sample_n: int = 1000, +) -> int: + """Dump xedocs. into xedocs.sqlite table with auto-discovered label columns.""" + + coll = mongo_db[coll_name] + table = coll_name + + logger.info(f"[mongo] dumping xedocs.{coll_name} -> xedocs.sqlite table '{table}' (auto-discover labels)") + + # --------- + # 1) Discover label columns from a sample of docs + # --------- + skip_keys = { + "_id", + "time", + "value", + "created_date", + "createdDate", + "comments", + "reviews", + } + + label_cols_set = set() + try: + sample_cursor = coll.find({}, no_cursor_timeout=True, batch_size=min(batch_size, 500)).limit(sample_n) + for d in sample_cursor: + for k in d.keys(): + if k in skip_keys: + continue + # We keep 'version' as a core column, but allow it in schema generation + # (it will be ignored if duplicated) + label_cols_set.add(k) + except Exception as e: + logger.warning(f"[mongo] xedocs label discovery failed for {coll_name}: {type(e).__name__}: {e}") + + # Deterministic order + label_cols = sorted(label_cols_set) + + # --------- + # 2) Create table schema (core + discovered labels) + # --------- + sql_x.executescript(_schema_sql_xedocs_table(table, extra_label_cols=label_cols)) + sql_x.commit() + + # --------- + # 3) Dump all docs + # --------- + t0 = now_s() + + # Build INSERT dynamically + # Core columns (must match schema) + core_cols = [ + "_id", + "version", + "time_ns", + "time_left_ns", + "time_right_ns", + "created_date_ns", + "value_num", + "value_json", + ] + + # Only keep label columns that are not core columns and are valid SQL identifiers when quoted + # (we always quote, so any name is okay) + extra_cols = [c for c in label_cols if c not in {"_id", "version", "time_ns", "time_left_ns", "time_right_ns", "created_date_ns", "value_num", "value_json", "doc_bson_z"}] + + all_cols = core_cols + extra_cols + ["doc_bson_z"] + + def q(name: str) -> str: + return '"' + name.replace('"', '""') + '"' + + placeholders = ",".join(["?"] * len(all_cols)) + ins = f"INSERT OR REPLACE INTO {q(table)}({','.join(q(c) for c in all_cols)}) VALUES ({placeholders})" + + cur = coll.find({}, no_cursor_timeout=True, batch_size=batch_size) + + n = 0 + buf: List[Tuple[Any, ...]] = [] + + for doc in cur: + e = _xedocs_extract(doc, label_cols=extra_cols) + row = tuple(e.get(c) for c in all_cols) + buf.append(row) + n += 1 + + if len(buf) >= batch_size: + sql_x.executemany(ins, buf) + sql_x.commit() + buf.clear() + + if buf: + sql_x.executemany(ins, buf) + sql_x.commit() + + dt = now_s() - t0 + logger.info(f"[mongo] done xedocs.{coll_name}: {n} docs in {dt:.1f}s") + return n + + +# ------------------------- +# Main +# ------------------------- + +def setup_logger(verbosity: int) -> logging.Logger: + lvl = logging.INFO if verbosity == 0 else (logging.DEBUG if verbosity >= 1 else logging.INFO) + logger = logging.getLogger("dump_mongo_offline") + logger.setLevel(lvl) + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(lvl) + fmt = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s") + handler.setFormatter(fmt) + logger.handlers.clear() + logger.addHandler(handler) + 
logger.propagate = False + return logger + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--out", required=True, help="Output directory for offline cache") + ap.add_argument("--experiment", default="xent", choices=["xent", "xe1t"], help="utilix experiment") + ap.add_argument("--mongo-uri", default=None, help="Override Mongo URI (otherwise uses utilix uconfig)") + ap.add_argument("--spec", required=True, help="Spec file with lines like 'xenonnt:runs', 'xedocs:ALL', 'files:GRIDFS'") + ap.add_argument("--sqlite-name", default="rundb.sqlite", help="SQLite filename under --out for runs/gridfs/kv") + ap.add_argument("--xedocs-sqlite-name", default="xedocs.sqlite", help="SQLite filename under --out for xedocs tables") + ap.add_argument("--batch-size", type=int, default=2000, help="Batch size for Mongo cursor and SQLite inserts") + ap.add_argument("-v", "--verbose", action="count", default=0, help="Increase logging verbosity (-v/-vv)") + + ap.add_argument( + "--runs-drop-field", + action="append", + default=[], + help="Drop a field from xenonnt:runs docs before storing (repeatable).", + ) + + ap.add_argument("--gridfs-only-filenames", default=None, help="Text file with one filename per line to dump from GridFS") + args = ap.parse_args() + + logger = setup_logger(args.verbose) + + out_root = Path(args.out).resolve() + ensure_dir(out_root) + + spec_path = Path(args.spec).resolve() + spec_items = parse_spec_lines(spec_path.read_text().splitlines()) + + logger.info(f"Connecting to Mongo (experiment={args.experiment}, uri_override={bool(args.mongo_uri)})") + client = get_mongo_client(args.experiment, uri_override=args.mongo_uri) + + # rundb.sqlite + sqlite_path = out_root / args.sqlite_name + logger.info(f"Opening rundb SQLite at {sqlite_path}") + sql = sqlite3.connect(str(sqlite_path)) + sql.executescript(SCHEMA_SQL_RUNDB) + sql.commit() + + # xedocs.sqlite (only opened if needed) + xedocs_sqlite_path = out_root / args.xedocs_sqlite_name + sql_x: Optional[sqlite3.Connection] = None + + gridfs_only = None + if args.gridfs_only_filenames: + gridfs_only = [ln.strip() for ln in Path(args.gridfs_only_filenames).read_text().splitlines() if ln.strip()] + + manifest = { + "format": "offline-mongo-sqlite-v2", + "created_at_unix": int(time.time()), + "compression": COMP_ALGO, + "experiment": args.experiment, + "spec_file": str(spec_path), + "spec": [{"db": x.db, "what": x.what} for x in spec_items], + "sqlite_rundb": str(sqlite_path.name), + "sqlite_xedocs": str(xedocs_sqlite_path.name), + } + (out_root / "manifest.json").write_text(json.dumps(manifest, indent=2)) + logger.info(f"Wrote manifest.json (compression={COMP_ALGO})") + + def _get_sql_x() -> sqlite3.Connection: + nonlocal sql_x + if sql_x is None: + logger.info(f"Opening xedocs SQLite at {xedocs_sqlite_path}") + sql_x = sqlite3.connect(str(xedocs_sqlite_path)) + # some pragmas for speed + sql_x.execute("PRAGMA journal_mode = WAL;") + sql_x.execute("PRAGMA synchronous = NORMAL;") + sql_x.execute("PRAGMA temp_store = MEMORY;") + sql_x.commit() + return sql_x + + for item in spec_items: + dbname = item.db + what = item.what + mongo_db = client[dbname] + + if what.upper() == "GRIDFS": + dump_gridfs_db( + mongo_db=mongo_db, + sql=sql, + out_root=out_root, + logger=logger, + batch_size=max(200, args.batch_size // 5), + only_configs=gridfs_only, + ) + continue + + if what.upper() == "ALL": + names = list_collection_names_safe(mongo_db) + logger.info(f"[mongo] {dbname}:ALL expanded to {len(names)} collections") + + for cname in 
names: + if cname in ("fs.files", "fs.chunks"): + logger.info(f"[mongo] skipping {dbname}.{cname} (use {dbname}:GRIDFS instead)") + continue + + if dbname == "xedocs": + dump_xedocs_collection_to_tables( + mongo_db=mongo_db, + coll_name=cname, + sql_x=_get_sql_x(), + batch_size=args.batch_size, + logger=logger, + sample_n=1000, + ) + else: + dump_generic_collection( + mongo_db=mongo_db, + coll_name=cname, + sql=sql, + out_db_name=dbname, + batch_size=args.batch_size, + logger=logger, + ) + continue + + # Single collection + cname = what + + if dbname == "xedocs": + dump_xedocs_collection_to_tables( + mongo_db=mongo_db, + coll_name=cname, + sql_x=_get_sql_x(), + batch_size=args.batch_size, + logger=logger, + sample_n=1000, + ) + continue + + if dbname == "xenonnt" and cname == "runs": + dump_xenonnt_runs_index( + mongo_db=mongo_db, + runs_coll_name=cname, + sql=sql, + out_db_name=dbname, + batch_size=args.batch_size, + logger=logger, + drop_fields=args.runs_drop_field, + ) + else: + dump_generic_collection( + mongo_db=mongo_db, + coll_name=cname, + sql=sql, + out_db_name=dbname, + batch_size=args.batch_size, + logger=logger, + ) + + logger.info("ANALYZE (optional)...") + try: + sql.execute("ANALYZE;") + sql.commit() + except Exception: + logger.warning("ANALYZE failed for rundb.sqlite (continuing)") + + if sql_x is not None: + try: + sql_x.execute("ANALYZE;") + sql_x.commit() + except Exception: + logger.warning("ANALYZE failed for xedocs.sqlite (continuing)") + + logger.info("All done.") + logger.info(f"Offline cache written to: {out_root}") + logger.info(f"rundb.sqlite : {sqlite_path}") + if sql_x is not None: + logger.info(f"xedocs.sqlite: {xedocs_sqlite_path}") + + +if __name__ == "__main__": + main() diff --git a/utilix/rundb.py b/utilix/rundb.py index 37513d2..9d9e8b2 100644 --- a/utilix/rundb.py +++ b/utilix/rundb.py @@ -9,15 +9,14 @@ from warnings import warn import time -from . import uconfig, logger, io - +from . 
import uconfig, logger, io, sqlite_backend +from .sqlite_backend import OfflineSQLiteCollection, SQLiteConfig, _load_sqlite_config # Config the logger: if uconfig is not None: # type: ignore PREFIX = uconfig.get("RunDB", "rundb_api_url", fallback=None) # type: ignore BASE_HEADERS = {"Content-Type": "application/json", "Cache-Control": "no-cache"} - class NewTokenError(Exception): pass @@ -592,8 +591,24 @@ def _collection(experiment, collection, url=None, user=None, password=None, data return db[collection] +def _sqlite_collection(experiment: str, sqlite_config: SQLiteConfig, collection: str = "runs", **kwargs): + database = kwargs.pop("database", None) + if database is None: + database = uconfig.get("RunDB", f"{experiment}_database") + + return OfflineSQLiteCollection( + sqlite_path=sqlite_config.sqlite_path, + db_name=database, + coll_name=collection, + compression=sqlite_config.compression, + ) + def xent_collection(collection="runs", **kwargs): - return _collection("xent", collection, **kwargs) + sqlite_config = _load_sqlite_config() + if sqlite_config.sqlite_active(): + return _sqlite_collection("xent", sqlite_config, collection, **kwargs) + else: + return _collection("xent", collection, **kwargs) def xent_collection_admin(collection="runs", **kwargs): diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py new file mode 100644 index 0000000..a25b201 --- /dev/null +++ b/utilix/sqlite_backend.py @@ -0,0 +1,496 @@ +from __future__ import annotations + +import os +import sqlite3 +import shutil +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Tuple +import os +import traceback +import logging +import pymongo + +OFFLINE_DEBUG = os.environ.get("OFFLINE_DEBUG", "0") not in ("0", "", "false", "False") + +def _env_bool(name: str, default: str = "0") -> bool: + v = os.environ.get(name, default) + return v not in ("0", "", "false", "False", "no", "No", "NO") + +def _dbg(msg): + if OFFLINE_DEBUG: + logging.debug(f"[offline-debug] {msg}") + +def _dbg_stack(tag, n=6): + if OFFLINE_DEBUG: + logging.debug(f"[offline-debug] --- stack ({tag}) ---") + logging.debug("".join(traceback.format_stack(limit=n))) + logging.debug(f"[offline-debug] --- end stack ({tag}) ---") + +def block(msg: str, cfg: SQLiteConfig) -> None: + if cfg.hard: + raise RuntimeError(f"[offline-hard] blocked: {msg}") + _dbg(f"WARNING: {msg}") + _dbg_stack("blocked") + +@dataclass(frozen=True) +class SQLiteConfig: + rundb_sqlite_path: Optional[Path] + xedocs_sqlite_path: Optional[Path] + offline_root: Optional[Path] + compression: str + debug: bool + hard: bool + stack: bool + spy: bool + + def rundb_active(self) -> bool: + return self.rundb_sqlite_path is not None and self.rundb_sqlite_path.exists() + + def xedocs_active(self) -> bool: + return self.xedocs_sqlite_path is not None and self.xedocs_sqlite_path.exists() + + def sqlite_active(self) -> bool: + return self.rundb_active() and self.xedocs_active() + +def _load_sqlite_config() -> SQLiteConfig: + sqp = os.environ.get("RUNDB_SQLITE_PATH", "").strip() + rundb_sqlite_path = Path(sqp).expanduser().resolve() if sqp else None + + xsp = os.environ.get("XEDOCS_SQLITE_PATH", "").strip() + xedocs_sqlite_path = Path(xsp).expanduser().resolve() if xsp else None + + offline_root = rundb_sqlite_path.parent if (rundb_sqlite_path and rundb_sqlite_path.exists()) else None + + debug = _env_bool("OFFLINE_DEBUG") + hard = _env_bool("OFFLINE_HARD") + stack = _env_bool("OFFLINE_STACK") + spy = _env_bool("PYMONGO_SPY") + + return SQLiteConfig( + 
rundb_sqlite_path=rundb_sqlite_path, + xedocs_sqlite_path=xedocs_sqlite_path, + offline_root=offline_root, + compression="zstd", + debug=debug, + hard=hard, + stack=stack, + spy=spy, + ) + + +@dataclass(frozen=True) +class GridFSRow: + db_name: str + file_id: str + config_name: str + md5: str + length: int + uploadDate: int + blob_path: str + + +class OfflineGridFS: + """ + Minimal offline replacement for utilix.mongo_storage.MongoDownloader / APIDownloader behavior: + + - query SQLite table gridfs_files by config_name + - pick the latest by uploadDate + - stage/copy the blob into a local cache folder named by md5 + - return the staged path + """ + + def __init__( + self, + sqlite_path: str | Path, + offline_root: str | Path, + cache_dirs: Tuple[str | Path, ...] = ("./resource_cache", "/tmp/straxen_resource_cache"), + gridfs_db_name: str = "files", + ): + self.sqlite_path = Path(sqlite_path).resolve() + self.offline_root = Path(offline_root).resolve() + self.cache_dirs = tuple(Path(p) for p in cache_dirs) + self.gridfs_db_name = gridfs_db_name + + self.conn = sqlite3.connect(str(self.sqlite_path)) + self.conn.row_factory = sqlite3.Row + + # ----------------- + # cache dir helpers + # ----------------- + def _pick_cache_dir(self) -> Path: + for d in self.cache_dirs: + try: + d.mkdir(parents=True, exist_ok=True) + except Exception: + continue + if os.access(d, os.W_OK): + return d + raise PermissionError(f"Cannot write to any cache dir: {self.cache_dirs}") + + # ----------------- + # sqlite queries + # ----------------- + def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: + row = self.conn.execute( + """ + SELECT db_name, file_id, config_name, md5, length, uploadDate, blob_path + FROM gridfs_files + WHERE db_name = ? AND config_name = ? + ORDER BY uploadDate DESC + LIMIT 1 + """, + (self.gridfs_db_name, config_name), + ).fetchone() + + if row is None: + return None + + # Some older entries might have NULL md5; that's not usable for caching-by-md5. + md5 = row["md5"] + if md5 is None: + raise RuntimeError(f"Found GridFS entry for {config_name} but md5 is NULL in sqlite index") + + return GridFSRow( + db_name=row["db_name"], + file_id=row["file_id"], + config_name=row["config_name"], + md5=str(md5), + length=int(row["length"] or 0), + uploadDate=int(row["uploadDate"] or 0), + blob_path=str(row["blob_path"]), + ) + + # ----------------- + # public API + # ----------------- + def download_single( + self, + config_name: str, + human_readable_file_name: bool = False, + write_to: Optional[str | Path] = None, + ) -> str: + """ + Return absolute path to a staged file. + Default behavior matches utilix: store under md5 in a cache dir. 
+ """ + + _dbg(f"OfflineGridFS.download_single('{config_name}') [SQLITE]") + + entry = self.latest_by_config_name(config_name) + if entry is None: + raise KeyError(f"Config '{config_name}' not found in offline gridfs_files index") + + blob_abs = (self.offline_root / entry.blob_path).resolve() + if not blob_abs.exists(): + raise FileNotFoundError(f"Blob missing on disk: {blob_abs} (from sqlite blob_path)") + + target_dir = Path(write_to).resolve() if write_to else self._pick_cache_dir() + target_dir.mkdir(parents=True, exist_ok=True) + + target_name = config_name if human_readable_file_name else entry.md5 + target_abs = (target_dir / target_name).resolve() + + # If already staged, trust it (fast path) + if target_abs.exists(): + return str(target_abs) + + # Copy in a safe-ish way (atomic replace) + tmp = target_abs.with_suffix(target_abs.suffix + ".tmp") + shutil.copyfile(blob_abs, tmp) + tmp.replace(target_abs) + + return str(target_abs) + + def list_files(self) -> list[str]: + rows = self.conn.execute( + "SELECT DISTINCT config_name FROM gridfs_files WHERE db_name=? ORDER BY config_name", + (self.gridfs_db_name,), + ).fetchall() + return [r["config_name"] for r in rows if r["config_name"] is not None] + + def close(self) -> None: + self.conn.close() + + +def smoke_test( + sqlite_path: str | Path, + offline_root: str | Path, + config_name: str, +) -> None: + g = OfflineGridFS(sqlite_path=sqlite_path, offline_root=offline_root) + p = g.download_single(config_name) + print("[OK] staged:", p) + g.close() + + + + +# ---- OFFLINE RUNDB COLLECTION (SQLite-backed) ---- + +from bson import BSON + +def _decompressor(algo: str): + if algo == "zstd": + import zstandard as zstd # type: ignore + dctx = zstd.ZstdDecompressor() + return dctx.decompress + elif algo == "zlib": + import zlib + return zlib.decompress + else: + raise ValueError(f"Unknown compression algo: {algo}") + + +class OfflineMongoClient: + """Dummy client to satisfy: collection.database.client""" + def close(self): + return + + +@dataclass +class OfflineMongoDatabase: + name: str + client: OfflineMongoClient + + +class OfflineSQLiteCollection: + """ + Minimal pymongo.collection.Collection-like wrapper backed by our sqlite cache. + Provides the attribute chain expected by straxen.storage.rundb.RunDB: + collection.database.client + And a few commonly-used methods: find_one, find, count_documents. + """ + + def __init__( + self, + sqlite_path: str | Path, + db_name: str, + coll_name: str, + compression: str = "zstd", + ): + self.sqlite_path = Path(sqlite_path).resolve() + self.db_name = str(db_name) + self.name = str(coll_name) # pymongo Collection has .name + self._coll_name = str(coll_name) + + self._conn = sqlite3.connect(str(self.sqlite_path)) + self._conn.row_factory = sqlite3.Row + self._decompress = _decompressor(compression) + + # mimic pymongo: collection.database.client + self.database = OfflineMongoDatabase(name=self.db_name, client=OfflineMongoClient()) + + def close(self): + try: + self._conn.close() + except Exception: + pass + + # --- internal helpers --- + + def _decode_row(self, row) -> dict: + raw = self._decompress(row["doc_bson_z"]) + return BSON(raw).decode() + + def _get_by_id(self, doc_id: str) -> dict: + row = self._conn.execute( + "SELECT doc_bson_z FROM kv_collections WHERE db_name=? AND coll_name=? 
AND doc_id=?",
+            (self.db_name, self._coll_name, str(doc_id)),
+        ).fetchone()
+        if row is None:
+            raise KeyError(f"Not found: {self.db_name}.{self._coll_name} _id={doc_id}")
+        return self._decode_row(row)
+
+    # --- pymongo-ish public API ---
+
+    def find_one(self, filter: dict | None = None, *args, **kwargs):
+        """
+        Minimal behavior:
+        - if filter contains _id, return that doc
+        - if this is the runs collection and filter contains number, return that run
+        - else return first doc (used as connectivity test)
+        """
+        filter = filter or {}
+
+        # _id special case
+        if "_id" in filter:
+            try:
+                return self._get_by_id(str(filter["_id"]))
+            except KeyError:
+                return None
+
+        if self._coll_name == "runs" and "number" in filter:
+            number = int(filter["number"])
+            row = self._conn.execute(
+                "SELECT doc_id FROM runs_index WHERE db_name=? AND number=? LIMIT 1",
+                (self.db_name, number),
+            ).fetchone()
+            if row is None:
+                return None
+            return self._get_by_id(row["doc_id"])
+
+        # Fallback: return the first stored doc (connectivity test)
+        row = self._conn.execute(
+            "SELECT doc_bson_z FROM kv_collections WHERE db_name=? AND coll_name=? LIMIT 1",
+            (self.db_name, self._coll_name),
+        ).fetchone()
+        if row is None:
+            return None
+        return self._decode_row(row)
+
+    def find(self, filter: dict | None = None, *args, **kwargs):
+        filter = filter or {}
+
+        # Special-case _id
+        if "_id" in filter:
+            try:
+                doc = self._get_by_id(str(filter["_id"]))
+                return _OfflineCursor([doc]) # small list OK
+            except KeyError:
+                return _OfflineCursor([])
+
+        # Special-case xenonnt.runs by number
+        if self._coll_name == "runs" and "number" in filter:
+            number = int(filter["number"])
+            row = self._conn.execute(
+                "SELECT doc_id FROM runs_index WHERE db_name=? AND number=? LIMIT 1",
+                (self.db_name, number),
+            ).fetchone()
+            if row is None:
+                return _OfflineCursor([])
+            doc = self._get_by_id(row["doc_id"])
+            return _OfflineCursor([doc])
+
+        # Default: streaming cursor over all docs
+        return _OfflineStreamingCursor(self.iter_all())
+
+    def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int:
+        filter = filter or {}
+
+        if "_id" in filter:
+            row = self._conn.execute(
+                "SELECT COUNT(*) AS n FROM kv_collections WHERE db_name=? AND coll_name=? AND doc_id=?",
+                (self.db_name, self._coll_name, str(filter["_id"])),
+            ).fetchone()
+            return int(row["n"]) if row else 0
+
+        if self._coll_name == "runs" and "number" in filter:
+            number = int(filter["number"])
+            row = self._conn.execute(
+                "SELECT COUNT(*) AS n FROM runs_index WHERE db_name=? AND number=?",
+                (self.db_name, number),
+            ).fetchone()
+            return int(row["n"]) if row else 0
+
+        row = self._conn.execute(
+            "SELECT COUNT(*) AS n FROM kv_collections WHERE db_name=? AND coll_name=?",
+            (self.db_name, self._coll_name),
+        ).fetchone()
+        return int(row["n"]) if row else 0
+
+    def iter_all(self):
+        cur = self._conn.execute(
+            "SELECT doc_bson_z FROM kv_collections WHERE db_name=? 
AND coll_name=?", + (self.db_name, self._coll_name), + ) + for row in cur: + yield self._decode_row(row) + + def as_list(self, limit: int | None = None): + out = [] + for i, d in enumerate(self.iter_all()): + out.append(d) + if limit is not None and i + 1 >= limit: + break + return out + +class _OfflineCursor: + """Small in-memory cursor (safe only for tiny result sets).""" + def __init__(self, docs): + self._docs = list(docs) + + def sort(self, key, direction=1): + rev = direction == -1 + self._docs.sort(key=lambda d: d.get(key), reverse=rev) + return self + + def skip(self, n): + self._docs = self._docs[int(n):] + return self + + def limit(self, n): + self._docs = self._docs[:int(n)] + return self + + def __iter__(self): + return iter(self._docs) + + +class _OfflineStreamingCursor: + """Streaming cursor: does NOT materialize docs.""" + def __init__(self, iterator): + self._it = iterator + self._skip = 0 + self._limit = None + self._sort_key = None + self._sort_dir = 1 + + def sort(self, key, direction=1): + # WARNING: true sort requires materialization. + # Keep it conservative: only allow sort if limit is set (small-ish), + # otherwise do nothing or raise. + self._sort_key = key + self._sort_dir = direction + return self + + def skip(self, n): + self._skip = int(n) + return self + + def limit(self, n): + self._limit = int(n) + return self + + def __iter__(self): + it = self._it + + # apply skip + for _ in range(self._skip): + try: + next(it) + except StopIteration: + return iter(()) + + # If no sort requested, stream directly + if self._sort_key is None: + if self._limit is None: + return it + else: + # stream with limit + def gen(): + for i, d in enumerate(it): + if i >= self._limit: + break + yield d + return gen() + + # If sort requested, we must materialize. + # We materialize only up to limit if provided, else this is dangerous. 
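# Illustrative sketch, not part of the diff: intended semantics of the two cursor
# flavours.  _OfflineCursor holds a small list and may sort freely; the streaming
# cursor only sorts when a limit bounds how much it must materialise, otherwise it
# raises (see the check just below).  Assumes utilix with this patch is importable.
from utilix.sqlite_backend import _OfflineCursor, _OfflineStreamingCursor

docs = [{"number": 3}, {"number": 1}, {"number": 2}]

small = _OfflineCursor(docs)
assert [d["number"] for d in small.sort("number", -1).limit(2)] == [3, 2]

streaming = _OfflineStreamingCursor(iter(docs))
assert [d["number"] for d in streaming.sort("number", 1).limit(3)] == [1, 2, 3]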
+ if self._limit is None: + raise RuntimeError("Offline streaming cursor cannot sort without limit (would load everything).") + + docs = [] + for i, d in enumerate(it): + if i >= self._limit: + break + docs.append(d) + + rev = self._sort_dir == -1 + docs.sort(key=lambda d: d.get(self._sort_key), reverse=rev) + return iter(docs) + + + +# Add pymongo spy +_orig_mc = pymongo.MongoClient + +class MongoClientSpy(_orig_mc): + def __init__(self, *args, **kwargs): + cfg = _load_sqlite_config() + if cfg.spy: + block(f"pymongo.MongoClient CREATED args={args} kwargs_keys={list(kwargs.keys())}", cfg) + super().__init__(*args, **kwargs) + +pymongo.MongoClient = MongoClientSpy \ No newline at end of file From c1da901b025d371ca6fff3655d8eaac835360c07 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Jan 2026 10:13:02 +0000 Subject: [PATCH 02/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utilix/mongo_storage.py | 4 +- utilix/mongo_to_sqlite.py | 169 ++++++++++++++++++++++++++++++-------- utilix/rundb.py | 6 +- utilix/sqlite_backend.py | 66 ++++++++++----- 4 files changed, 183 insertions(+), 62 deletions(-) diff --git a/utilix/mongo_storage.py b/utilix/mongo_storage.py index 04b24e1..45e2f6c 100644 --- a/utilix/mongo_storage.py +++ b/utilix/mongo_storage.py @@ -14,6 +14,7 @@ from utilix import uconfig, logger from utilix.sqlite_backend import OfflineGridFS, _load_sqlite_config + class GridFsBase: """Base class for GridFS operations.""" @@ -375,7 +376,6 @@ def download_single(self, config_name: str, human_readable_file_name=False): :return: str, the absolute path of the file requested """ - # Offline path (sqlite-backed GridFS index) if hasattr(self, "_offline"): return self._offline.download_single( @@ -581,7 +581,6 @@ def initialize( gridfs_db_name="files", ) - def download_single( self, config_name: str, @@ -589,7 +588,6 @@ def download_single( human_readable_file_name: bool = False, ) -> str: """Download the config_name if it exists.""" - # Offline path (sqlite-backed GridFS index) if hasattr(self, "_offline"): return self._offline.download_single( diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index b0d6fe7..55c351e 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -""" -Dump selected MongoDB collections + GridFS into local SQLite(s). +"""Dump selected MongoDB collections + GridFS into local SQLite(s). 
NEW: - xedocs:* is dumped into a separate SQLite file (xedocs.sqlite) with @@ -12,6 +11,7 @@ files:GRIDFS xedocs:ALL corrections:ALL + """ import argparse @@ -34,6 +34,7 @@ # Compression helpers # ------------------------- + def _compressor(): try: import zstandard as zstd # type: ignore @@ -67,6 +68,7 @@ def decompress(b: bytes) -> bytes: # Spec parsing # ------------------------- + @dataclass(frozen=True) class SpecItem: db: str @@ -93,10 +95,12 @@ def parse_spec_lines(lines: Iterable[str]) -> List[SpecItem]: # Mongo connection (utilix-friendly) # ------------------------- + def get_utilix_mongo_uri(experiment: str) -> str: - """ - Mirrors utilix._collection style: - mongodb://{user}:{password}@{url} + """Mirrors utilix._collection style: + + mongodb://{user}:{password}@{url} + """ from utilix import uconfig # type: ignore @@ -188,14 +192,14 @@ def get_mongo_client(experiment: str, uri_override: Optional[str] = None) -> pym def _schema_sql_xedocs_table(table: str, extra_label_cols: List[str]) -> str: - """ - Create one table per xedocs collection. + """Create one table per xedocs collection. We keep a stable set of "core" columns (id/version/time/value/full doc), and *also* create additional TEXT columns for any label fields we discover from sampling documents in that collection. Note: extra label columns are quoted to tolerate odd names. + """ def q(name: str) -> str: @@ -215,7 +219,17 @@ def q(name: str) -> str: # Discovered label columns (TEXT) for c in extra_label_cols: - if c in {'_id', 'version', 'time_ns', 'time_left_ns', 'time_right_ns', 'created_date_ns', 'value_num', 'value_json', 'doc_bson_z'}: + if c in { + "_id", + "version", + "time_ns", + "time_left_ns", + "time_right_ns", + "created_date_ns", + "value_num", + "value_json", + "doc_bson_z", + }: continue cols.append(f"{q(c)} TEXT") @@ -233,12 +247,12 @@ def q(name: str) -> str: # Optional label indexes (keep this small to avoid DB bloat) preferred = [ - 'algorithm', - 'config_name', - 'detector', - 'source', - 'pmt', - 'gain_model', + "algorithm", + "config_name", + "detector", + "source", + "pmt", + "gain_model", ] present = set(extra_label_cols) @@ -268,6 +282,7 @@ def q(name: str) -> str: # Utilities # ------------------------- + def ensure_dir(p: Path) -> None: p.mkdir(parents=True, exist_ok=True) @@ -299,6 +314,7 @@ def to_utc_ns(dtobj) -> Optional[int]: # treat naive as UTC if getattr(dtobj, "tzinfo", None) is None: import datetime as dt + dtobj = dtobj.replace(tzinfo=dt.timezone.utc) return int(dtobj.timestamp() * 1_000_000_000) except Exception: @@ -328,6 +344,7 @@ def list_collection_names_safe(db: pymongo.database.Database) -> List[str]: # Dump logic (generic -> rundb.sqlite kv_collections) # ------------------------- + def dump_generic_collection( mongo_db: pymongo.database.Database, coll_name: str, @@ -389,7 +406,9 @@ def dump_xenonnt_runs_index( cur = coll.find({}, no_cursor_timeout=True, batch_size=batch_size) n = 0 buf_kv: List[Tuple[str, str, str, bytes]] = [] - buf_idx: List[Tuple[str, str, Optional[int], Optional[str], Optional[int], Optional[int], Optional[str]]] = [] + buf_idx: List[ + Tuple[str, str, Optional[int], Optional[str], Optional[int], Optional[int], Optional[str]] + ] = [] ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) VALUES (?,?,?,?)" ins_idx = """ @@ -409,7 +428,12 @@ def dump_xenonnt_runs_index( name = doc.get("name") or doc.get("run_name") or doc.get("runName") - start = doc.get("start") or doc.get("start_time") or doc.get("startTime") or 
doc.get("starttime") + start = ( + doc.get("start") + or doc.get("start_time") + or doc.get("startTime") + or doc.get("starttime") + ) end = doc.get("end") or doc.get("end_time") or doc.get("endTime") or doc.get("endtime") start_u = to_unix_seconds(start) @@ -431,7 +455,17 @@ def dump_xenonnt_runs_index( blob = pack_and_compress(doc) buf_kv.append((out_db_name, runs_coll_name, doc_id, blob)) - buf_idx.append((out_db_name, doc_id, number_i, str(name) if name is not None else None, start_u, end_u, tags_json)) + buf_idx.append( + ( + out_db_name, + doc_id, + number_i, + str(name) if name is not None else None, + start_u, + end_u, + tags_json, + ) + ) n += 1 if len(buf_kv) >= batch_size: @@ -521,7 +555,9 @@ def dump_gridfs_db( with tmp_path.open("wb") as out_f: expected_n = 0 - ch_cur = chunks_coll.find({"files_id": file_id}, no_cursor_timeout=True).sort("n", 1) + ch_cur = chunks_coll.find({"files_id": file_id}, no_cursor_timeout=True).sort( + "n", 1 + ) wrote = 0 for ch in ch_cur: n_chunk = int(ch["n"]) @@ -579,7 +615,6 @@ def dump_gridfs_db( def _xedocs_extract(doc: dict, label_cols: List[str]) -> Dict[str, Any]: """Extract core xedocs fields + discovered label columns.""" - out: Dict[str, Any] = {} out["_id"] = oid_to_str(doc.get("_id")) @@ -617,7 +652,16 @@ def _xedocs_extract(doc: dict, label_cols: List[str]) -> Dict[str, Any]: # discovered labels (TEXT) for k in label_cols: - if k in ("_id", "version", "time", "created_date", "createdDate", "value", "comments", "reviews"): + if k in ( + "_id", + "version", + "time", + "created_date", + "createdDate", + "value", + "comments", + "reviews", + ): continue val = doc.get(k, None) if val is None: @@ -637,7 +681,6 @@ def _xedocs_extract(doc: dict, label_cols: List[str]) -> Dict[str, Any]: return out - def dump_xedocs_collection_to_tables( mongo_db: pymongo.database.Database, coll_name: str, @@ -647,11 +690,12 @@ def dump_xedocs_collection_to_tables( sample_n: int = 1000, ) -> int: """Dump xedocs. 
into xedocs.sqlite table with auto-discovered label columns.""" - coll = mongo_db[coll_name] table = coll_name - logger.info(f"[mongo] dumping xedocs.{coll_name} -> xedocs.sqlite table '{table}' (auto-discover labels)") + logger.info( + f"[mongo] dumping xedocs.{coll_name} -> xedocs.sqlite table '{table}' (auto-discover labels)" + ) # --------- # 1) Discover label columns from a sample of docs @@ -668,7 +712,9 @@ def dump_xedocs_collection_to_tables( label_cols_set = set() try: - sample_cursor = coll.find({}, no_cursor_timeout=True, batch_size=min(batch_size, 500)).limit(sample_n) + sample_cursor = coll.find( + {}, no_cursor_timeout=True, batch_size=min(batch_size, 500) + ).limit(sample_n) for d in sample_cursor: for k in d.keys(): if k in skip_keys: @@ -677,7 +723,9 @@ def dump_xedocs_collection_to_tables( # (it will be ignored if duplicated) label_cols_set.add(k) except Exception as e: - logger.warning(f"[mongo] xedocs label discovery failed for {coll_name}: {type(e).__name__}: {e}") + logger.warning( + f"[mongo] xedocs label discovery failed for {coll_name}: {type(e).__name__}: {e}" + ) # Deterministic order label_cols = sorted(label_cols_set) @@ -708,7 +756,22 @@ def dump_xedocs_collection_to_tables( # Only keep label columns that are not core columns and are valid SQL identifiers when quoted # (we always quote, so any name is okay) - extra_cols = [c for c in label_cols if c not in {"_id", "version", "time_ns", "time_left_ns", "time_right_ns", "created_date_ns", "value_num", "value_json", "doc_bson_z"}] + extra_cols = [ + c + for c in label_cols + if c + not in { + "_id", + "version", + "time_ns", + "time_left_ns", + "time_right_ns", + "created_date_ns", + "value_num", + "value_json", + "doc_bson_z", + } + ] all_cols = core_cols + extra_cols + ["doc_bson_z"] @@ -747,6 +810,7 @@ def q(name: str) -> str: # Main # ------------------------- + def setup_logger(verbosity: int) -> logging.Logger: lvl = logging.INFO if verbosity == 0 else (logging.DEBUG if verbosity >= 1 else logging.INFO) logger = logging.getLogger("dump_mongo_offline") @@ -764,13 +828,36 @@ def setup_logger(verbosity: int) -> logging.Logger: def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--out", required=True, help="Output directory for offline cache") - ap.add_argument("--experiment", default="xent", choices=["xent", "xe1t"], help="utilix experiment") - ap.add_argument("--mongo-uri", default=None, help="Override Mongo URI (otherwise uses utilix uconfig)") - ap.add_argument("--spec", required=True, help="Spec file with lines like 'xenonnt:runs', 'xedocs:ALL', 'files:GRIDFS'") - ap.add_argument("--sqlite-name", default="rundb.sqlite", help="SQLite filename under --out for runs/gridfs/kv") - ap.add_argument("--xedocs-sqlite-name", default="xedocs.sqlite", help="SQLite filename under --out for xedocs tables") - ap.add_argument("--batch-size", type=int, default=2000, help="Batch size for Mongo cursor and SQLite inserts") - ap.add_argument("-v", "--verbose", action="count", default=0, help="Increase logging verbosity (-v/-vv)") + ap.add_argument( + "--experiment", default="xent", choices=["xent", "xe1t"], help="utilix experiment" + ) + ap.add_argument( + "--mongo-uri", default=None, help="Override Mongo URI (otherwise uses utilix uconfig)" + ) + ap.add_argument( + "--spec", + required=True, + help="Spec file with lines like 'xenonnt:runs', 'xedocs:ALL', 'files:GRIDFS'", + ) + ap.add_argument( + "--sqlite-name", + default="rundb.sqlite", + help="SQLite filename under --out for runs/gridfs/kv", + ) + 
ap.add_argument( + "--xedocs-sqlite-name", + default="xedocs.sqlite", + help="SQLite filename under --out for xedocs tables", + ) + ap.add_argument( + "--batch-size", + type=int, + default=2000, + help="Batch size for Mongo cursor and SQLite inserts", + ) + ap.add_argument( + "-v", "--verbose", action="count", default=0, help="Increase logging verbosity (-v/-vv)" + ) ap.add_argument( "--runs-drop-field", @@ -779,7 +866,11 @@ def main() -> None: help="Drop a field from xenonnt:runs docs before storing (repeatable).", ) - ap.add_argument("--gridfs-only-filenames", default=None, help="Text file with one filename per line to dump from GridFS") + ap.add_argument( + "--gridfs-only-filenames", + default=None, + help="Text file with one filename per line to dump from GridFS", + ) args = ap.parse_args() logger = setup_logger(args.verbose) @@ -790,7 +881,9 @@ def main() -> None: spec_path = Path(args.spec).resolve() spec_items = parse_spec_lines(spec_path.read_text().splitlines()) - logger.info(f"Connecting to Mongo (experiment={args.experiment}, uri_override={bool(args.mongo_uri)})") + logger.info( + f"Connecting to Mongo (experiment={args.experiment}, uri_override={bool(args.mongo_uri)})" + ) client = get_mongo_client(args.experiment, uri_override=args.mongo_uri) # rundb.sqlite @@ -806,7 +899,11 @@ def main() -> None: gridfs_only = None if args.gridfs_only_filenames: - gridfs_only = [ln.strip() for ln in Path(args.gridfs_only_filenames).read_text().splitlines() if ln.strip()] + gridfs_only = [ + ln.strip() + for ln in Path(args.gridfs_only_filenames).read_text().splitlines() + if ln.strip() + ] manifest = { "format": "offline-mongo-sqlite-v2", diff --git a/utilix/rundb.py b/utilix/rundb.py index 9d9e8b2..0fe2b2d 100644 --- a/utilix/rundb.py +++ b/utilix/rundb.py @@ -17,6 +17,7 @@ PREFIX = uconfig.get("RunDB", "rundb_api_url", fallback=None) # type: ignore BASE_HEADERS = {"Content-Type": "application/json", "Cache-Control": "no-cache"} + class NewTokenError(Exception): pass @@ -591,7 +592,9 @@ def _collection(experiment, collection, url=None, user=None, password=None, data return db[collection] -def _sqlite_collection(experiment: str, sqlite_config: SQLiteConfig, collection: str = "runs", **kwargs): +def _sqlite_collection( + experiment: str, sqlite_config: SQLiteConfig, collection: str = "runs", **kwargs +): database = kwargs.pop("database", None) if database is None: database = uconfig.get("RunDB", f"{experiment}_database") @@ -603,6 +606,7 @@ def _sqlite_collection(experiment: str, sqlite_config: SQLiteConfig, collection: compression=sqlite_config.compression, ) + def xent_collection(collection="runs", **kwargs): sqlite_config = _load_sqlite_config() if sqlite_config.sqlite_active(): diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index a25b201..5b0cde5 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -13,26 +13,31 @@ OFFLINE_DEBUG = os.environ.get("OFFLINE_DEBUG", "0") not in ("0", "", "false", "False") + def _env_bool(name: str, default: str = "0") -> bool: v = os.environ.get(name, default) return v not in ("0", "", "false", "False", "no", "No", "NO") + def _dbg(msg): if OFFLINE_DEBUG: logging.debug(f"[offline-debug] {msg}") + def _dbg_stack(tag, n=6): if OFFLINE_DEBUG: logging.debug(f"[offline-debug] --- stack ({tag}) ---") logging.debug("".join(traceback.format_stack(limit=n))) logging.debug(f"[offline-debug] --- end stack ({tag}) ---") + def block(msg: str, cfg: SQLiteConfig) -> None: if cfg.hard: raise RuntimeError(f"[offline-hard] blocked: {msg}") 
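# Illustrative sketch, not part of the diff: how the guard flags interact.
# PYMONGO_SPY=1 makes the patched pymongo.MongoClient call block() on creation,
# and OFFLINE_HARD=1 turns that from a logged warning into a hard error.
# Assumes utilix with this patch is importable; the URI is a placeholder and is
# never contacted because block() raises before any connection attempt.
import os

os.environ["PYMONGO_SPY"] = "1"
os.environ["OFFLINE_HARD"] = "1"

import pymongo
import utilix.sqlite_backend  # module import installs MongoClientSpy over pymongo.MongoClient

try:
    pymongo.MongoClient("mongodb://placeholder.invalid:27017")
except RuntimeError as err:
    print("blocked:", err)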
_dbg(f"WARNING: {msg}") _dbg_stack("blocked") + @dataclass(frozen=True) class SQLiteConfig: rundb_sqlite_path: Optional[Path] @@ -53,6 +58,7 @@ def xedocs_active(self) -> bool: def sqlite_active(self) -> bool: return self.rundb_active() and self.xedocs_active() + def _load_sqlite_config() -> SQLiteConfig: sqp = os.environ.get("RUNDB_SQLITE_PATH", "").strip() rundb_sqlite_path = Path(sqp).expanduser().resolve() if sqp else None @@ -60,12 +66,14 @@ def _load_sqlite_config() -> SQLiteConfig: xsp = os.environ.get("XEDOCS_SQLITE_PATH", "").strip() xedocs_sqlite_path = Path(xsp).expanduser().resolve() if xsp else None - offline_root = rundb_sqlite_path.parent if (rundb_sqlite_path and rundb_sqlite_path.exists()) else None + offline_root = ( + rundb_sqlite_path.parent if (rundb_sqlite_path and rundb_sqlite_path.exists()) else None + ) debug = _env_bool("OFFLINE_DEBUG") - hard = _env_bool("OFFLINE_HARD") + hard = _env_bool("OFFLINE_HARD") stack = _env_bool("OFFLINE_STACK") - spy = _env_bool("PYMONGO_SPY") + spy = _env_bool("PYMONGO_SPY") return SQLiteConfig( rundb_sqlite_path=rundb_sqlite_path, @@ -91,13 +99,14 @@ class GridFSRow: class OfflineGridFS: - """ - Minimal offline replacement for utilix.mongo_storage.MongoDownloader / APIDownloader behavior: + """Minimal offline replacement for utilix.mongo_storage.MongoDownloader / APIDownloader + behavior: - query SQLite table gridfs_files by config_name - pick the latest by uploadDate - stage/copy the blob into a local cache folder named by md5 - return the staged path + """ def __init__( @@ -139,7 +148,8 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC LIMIT 1 - """, + """\ + , (self.gridfs_db_name, config_name), ).fetchone() @@ -149,7 +159,9 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: # Some older entries might have NULL md5; that's not usable for caching-by-md5. md5 = row["md5"] if md5 is None: - raise RuntimeError(f"Found GridFS entry for {config_name} but md5 is NULL in sqlite index") + raise RuntimeError( + f"Found GridFS entry for {config_name} but md5 is NULL in sqlite index" + ) return GridFSRow( db_name=row["db_name"], @@ -170,11 +182,11 @@ def download_single( human_readable_file_name: bool = False, write_to: Optional[str | Path] = None, ) -> str: - """ - Return absolute path to a staged file. + """Return absolute path to a staged file. + Default behavior matches utilix: store under md5 in a cache dir. - """ + """ _dbg(f"OfflineGridFS.download_single('{config_name}') [SQLITE]") entry = self.latest_by_config_name(config_name) @@ -224,26 +236,28 @@ def smoke_test( g.close() - - # ---- OFFLINE RUNDB COLLECTION (SQLite-backed) ---- from bson import BSON + def _decompressor(algo: str): if algo == "zstd": import zstandard as zstd # type: ignore + dctx = zstd.ZstdDecompressor() return dctx.decompress elif algo == "zlib": import zlib + return zlib.decompress else: raise ValueError(f"Unknown compression algo: {algo}") class OfflineMongoClient: - """Dummy client to satisfy: collection.database.client""" + """Dummy client to satisfy: collection.database.client.""" + def close(self): return @@ -255,11 +269,12 @@ class OfflineMongoDatabase: class OfflineSQLiteCollection: - """ - Minimal pymongo.collection.Collection-like wrapper backed by our sqlite cache. + """Minimal pymongo.collection.Collection-like wrapper backed by our sqlite cache. 
+ Provides the attribute chain expected by straxen.storage.rundb.RunDB: collection.database.client And a few commonly-used methods: find_one, find, count_documents. + """ def __init__( @@ -271,7 +286,7 @@ def __init__( ): self.sqlite_path = Path(sqlite_path).resolve() self.db_name = str(db_name) - self.name = str(coll_name) # pymongo Collection has .name + self.name = str(coll_name) # pymongo Collection has .name self._coll_name = str(coll_name) self._conn = sqlite3.connect(str(self.sqlite_path)) @@ -337,7 +352,7 @@ def find(self, filter: dict | None = None, *args, **kwargs): if "_id" in filter: try: doc = self._get_by_id(str(filter["_id"])) - return _OfflineCursor([doc]) # small list OK + return _OfflineCursor([doc]) # small list OK except KeyError: return _OfflineCursor([]) @@ -396,8 +411,10 @@ def as_list(self, limit: int | None = None): break return out + class _OfflineCursor: """Small in-memory cursor (safe only for tiny result sets).""" + def __init__(self, docs): self._docs = list(docs) @@ -407,11 +424,11 @@ def sort(self, key, direction=1): return self def skip(self, n): - self._docs = self._docs[int(n):] + self._docs = self._docs[int(n) :] return self def limit(self, n): - self._docs = self._docs[:int(n)] + self._docs = self._docs[: int(n)] return self def __iter__(self): @@ -420,6 +437,7 @@ def __iter__(self): class _OfflineStreamingCursor: """Streaming cursor: does NOT materialize docs.""" + def __init__(self, iterator): self._it = iterator self._skip = 0 @@ -464,12 +482,15 @@ def gen(): if i >= self._limit: break yield d + return gen() # If sort requested, we must materialize. # We materialize only up to limit if provided, else this is dangerous. if self._limit is None: - raise RuntimeError("Offline streaming cursor cannot sort without limit (would load everything).") + raise RuntimeError( + "Offline streaming cursor cannot sort without limit (would load everything)." 
+ ) docs = [] for i, d in enumerate(it): @@ -482,10 +503,10 @@ def gen(): return iter(docs) - # Add pymongo spy _orig_mc = pymongo.MongoClient + class MongoClientSpy(_orig_mc): def __init__(self, *args, **kwargs): cfg = _load_sqlite_config() @@ -493,4 +514,5 @@ def __init__(self, *args, **kwargs): block(f"pymongo.MongoClient CREATED args={args} kwargs_keys={list(kwargs.keys())}", cfg) super().__init__(*args, **kwargs) -pymongo.MongoClient = MongoClientSpy \ No newline at end of file + +pymongo.MongoClient = MongoClientSpy From 5ad5ad7b2e1da8157a8553977eb81a3ed7b35973 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Mon, 19 Jan 2026 04:46:05 -0600 Subject: [PATCH 03/34] precommit --- utilix/mongo_to_sqlite.py | 25 +++++++++++++++++-------- utilix/rundb.py | 2 +- utilix/sqlite_backend.py | 14 ++++++-------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index 55c351e..929ab73 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -17,7 +17,6 @@ import argparse import json import logging -import os import sqlite3 import sys import time @@ -241,8 +240,12 @@ def q(name: str) -> str: # - time interval lookup: version + interval # - common labels (if present) index_sql = [ - f"CREATE INDEX IF NOT EXISTS {q('idx_' + table + '_version_time')} ON {q(table)}({q('version')}, {q('time_ns')});", - f"CREATE INDEX IF NOT EXISTS {q('idx_' + table + '_version_interval')} ON {q(table)}({q('version')}, {q('time_left_ns')}, {q('time_right_ns')});", + f"CREATE INDEX IF NOT EXISTS \ + {q('idx_' + table + '_version_time')} \ + ON {q(table)}({q('version')}, {q('time_ns')});", + f"CREATE INDEX IF NOT EXISTS \ + {q('idx_' + table + '_version_interval')} \ + ON {q(table)}({q('version')}, {q('time_left_ns')}, {q('time_right_ns')});", ] # Optional label indexes (keep this small to avoid DB bloat) @@ -260,7 +263,9 @@ def q(name: str) -> str: for lab in preferred: if lab in present: index_sql.append( - f"CREATE INDEX IF NOT EXISTS {q('idx_' + table + '_version_' + lab)} ON {q(table)}({q('version')}, {q(lab)});" + f"CREATE INDEX IF NOT EXISTS \ + {q('idx_' + table + '_version_' + lab)} \ + ON {q(table)}({q('version')}, {q(lab)});" ) n_extra += 1 if n_extra >= 6: @@ -365,7 +370,8 @@ def dump_generic_collection( n = 0 buf: List[Tuple[str, str, str, bytes]] = [] - insert_sql = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) VALUES (?,?,?,?)" + insert_sql = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) \ + VALUES (?,?,?,?)" for doc in cur: _id = doc.get("_id") @@ -410,7 +416,8 @@ def dump_xenonnt_runs_index( Tuple[str, str, Optional[int], Optional[str], Optional[int], Optional[int], Optional[str]] ] = [] - ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) VALUES (?,?,?,?)" + ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z)\ + VALUES (?,?,?,?)" ins_idx = """ INSERT OR REPLACE INTO runs_index(db_name, doc_id, number, name, start, end, tags_json) VALUES (?,?,?,?,?,?,?) 
@@ -694,7 +701,7 @@ def dump_xedocs_collection_to_tables( table = coll_name logger.info( - f"[mongo] dumping xedocs.{coll_name} -> xedocs.sqlite table '{table}' (auto-discover labels)" + f"[mongo] dumping xedocs.{coll_name} -> xedocs.sqlite table '{table}' (auto-discover)" ) # --------- @@ -779,7 +786,9 @@ def q(name: str) -> str: return '"' + name.replace('"', '""') + '"' placeholders = ",".join(["?"] * len(all_cols)) - ins = f"INSERT OR REPLACE INTO {q(table)}({','.join(q(c) for c in all_cols)}) VALUES ({placeholders})" + ins = f"INSERT OR REPLACE INTO \ + {q(table)}({','.join(q(c) for c in all_cols)}) \ + VALUES ({placeholders})" cur = coll.find({}, no_cursor_timeout=True, batch_size=batch_size) diff --git a/utilix/rundb.py b/utilix/rundb.py index 0fe2b2d..209bbf9 100644 --- a/utilix/rundb.py +++ b/utilix/rundb.py @@ -9,7 +9,7 @@ from warnings import warn import time -from . import uconfig, logger, io, sqlite_backend +from . import uconfig, logger, io from .sqlite_backend import OfflineSQLiteCollection, SQLiteConfig, _load_sqlite_config # Config the logger: diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 5b0cde5..4c58ee9 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -6,10 +6,10 @@ from dataclasses import dataclass from pathlib import Path from typing import Optional, Tuple -import os import traceback import logging import pymongo +from bson import BSON OFFLINE_DEBUG = os.environ.get("OFFLINE_DEBUG", "0") not in ("0", "", "false", "False") @@ -148,8 +148,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC LIMIT 1 - """\ - , + """, (self.gridfs_db_name, config_name), ).fetchone() @@ -238,8 +237,6 @@ def smoke_test( # ---- OFFLINE RUNDB COLLECTION (SQLite-backed) ---- -from bson import BSON - def _decompressor(algo: str): if algo == "zstd": @@ -376,7 +373,8 @@ def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int: if "_id" in filter: row = self._conn.execute( - "SELECT COUNT(*) AS n FROM kv_collections WHERE db_name=? AND coll_name=? AND doc_id=?", + "SELECT COUNT(*) AS n FROM kv_collections \ + WHERE db_name=? AND coll_name=? 
AND doc_id=?", (self.db_name, self._coll_name, str(filter["_id"])), ).fetchone() return int(row["n"]) if row else 0 @@ -424,11 +422,11 @@ def sort(self, key, direction=1): return self def skip(self, n): - self._docs = self._docs[int(n) :] + self._docs = self._docs[int(n):] return self def limit(self, n): - self._docs = self._docs[: int(n)] + self._docs = self._docs[:int(n)] return self def __iter__(self): From 973e768d4446579bef7cad62a4e90749916728af Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Jan 2026 10:46:20 +0000 Subject: [PATCH 04/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utilix/mongo_to_sqlite.py | 6 ++++-- utilix/sqlite_backend.py | 10 ++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index 929ab73..5aed0a5 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -370,7 +370,8 @@ def dump_generic_collection( n = 0 buf: List[Tuple[str, str, str, bytes]] = [] - insert_sql = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) \ + insert_sql =\ + "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) \ VALUES (?,?,?,?)" for doc in cur: @@ -416,7 +417,8 @@ def dump_xenonnt_runs_index( Tuple[str, str, Optional[int], Optional[str], Optional[int], Optional[int], Optional[str]] ] = [] - ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z)\ + ins_kv =\ + "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z)\ VALUES (?,?,?,?)" ins_idx = """ INSERT OR REPLACE INTO runs_index(db_name, doc_id, number, name, start, end, tags_json) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 4c58ee9..946c917 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -148,7 +148,8 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC LIMIT 1 - """, + """\ + , (self.gridfs_db_name, config_name), ).fetchone() @@ -374,7 +375,8 @@ def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int: if "_id" in filter: row = self._conn.execute( "SELECT COUNT(*) AS n FROM kv_collections \ - WHERE db_name=? AND coll_name=? AND doc_id=?", + WHERE db_name=? AND coll_name=? 
AND doc_id=?"\ + , (self.db_name, self._coll_name, str(filter["_id"])), ).fetchone() return int(row["n"]) if row else 0 @@ -422,11 +424,11 @@ def sort(self, key, direction=1): return self def skip(self, n): - self._docs = self._docs[int(n):] + self._docs = self._docs[int(n) :] return self def limit(self, n): - self._docs = self._docs[:int(n)] + self._docs = self._docs[: int(n)] return self def __iter__(self): From 9c89c45061c04fa0dd12a21389790299bf3cfffe Mon Sep 17 00:00:00 2001 From: cfuselli Date: Mon, 19 Jan 2026 04:49:09 -0600 Subject: [PATCH 05/34] precommit --- utilix/mongo_to_sqlite.py | 4 ++-- utilix/sqlite_backend.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index 929ab73..ab17fad 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -371,7 +371,7 @@ def dump_generic_collection( buf: List[Tuple[str, str, str, bytes]] = [] insert_sql = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) \ - VALUES (?,?,?,?)" + VALUES (?,?,?,?)" for doc in cur: _id = doc.get("_id") @@ -417,7 +417,7 @@ def dump_xenonnt_runs_index( ] = [] ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z)\ - VALUES (?,?,?,?)" + VALUES (?,?,?,?)" ins_idx = """ INSERT OR REPLACE INTO runs_index(db_name, doc_id, number, name, start, end, tags_json) VALUES (?,?,?,?,?,?,?) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 4c58ee9..cf52cd5 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -148,7 +148,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC LIMIT 1 - """, + """, (self.gridfs_db_name, config_name), ).fetchone() From a1d8b05ba4dbcf48f1cc6109b3db841ea0d9ccbe Mon Sep 17 00:00:00 2001 From: cfuselli Date: Mon, 19 Jan 2026 04:54:28 -0600 Subject: [PATCH 06/34] precommit --- utilix/mongo_to_sqlite.py | 9 +++++---- utilix/sqlite_backend.py | 11 ++++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index ab17fad..573e6ff 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -370,8 +370,8 @@ def dump_generic_collection( n = 0 buf: List[Tuple[str, str, str, bytes]] = [] - insert_sql = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z) \ - VALUES (?,?,?,?)" + insert_sql = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z)" + insert_sql += " VALUES (?,?,?,?)" for doc in cur: _id = doc.get("_id") @@ -416,8 +416,9 @@ def dump_xenonnt_runs_index( Tuple[str, str, Optional[int], Optional[str], Optional[int], Optional[int], Optional[str]] ] = [] - ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z)\ - VALUES (?,?,?,?)" + ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z)" + ins_kv += " VALUES (?,?,?,?)" + ins_idx = """ INSERT OR REPLACE INTO runs_index(db_name, doc_id, number, name, start, end, tags_json) VALUES (?,?,?,?,?,?,?) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index e14dad2..8e3368a 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -147,9 +147,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? 
ORDER BY uploadDate DESC - LIMIT 1 - """, - (self.gridfs_db_name, config_name), + LIMIT 1""", (self.gridfs_db_name, config_name), ).fetchone() if row is None: @@ -374,8 +372,7 @@ def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int: if "_id" in filter: row = self._conn.execute( "SELECT COUNT(*) AS n FROM kv_collections \ - WHERE db_name=? AND coll_name=? AND doc_id=?"\ - , + WHERE db_name=? AND coll_name=? AND doc_id=?", (self.db_name, self._coll_name, str(filter["_id"])), ).fetchone() return int(row["n"]) if row else 0 @@ -423,11 +420,11 @@ def sort(self, key, direction=1): return self def skip(self, n): - self._docs = self._docs[int(n) :] + self._docs = self._docs[int(n):] return self def limit(self, n): - self._docs = self._docs[: int(n)] + self._docs = self._docs[:int(n)] return self def __iter__(self): From 29fc4d8cebea26e2fc0f01a5bcf13d2c4f56bd97 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Jan 2026 10:58:24 +0000 Subject: [PATCH 07/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utilix/mongo_to_sqlite.py | 2 +- utilix/sqlite_backend.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index 573e6ff..3d70687 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -418,7 +418,7 @@ def dump_xenonnt_runs_index( ins_kv = "INSERT OR REPLACE INTO kv_collections(db_name, coll_name, doc_id, doc_bson_z)" ins_kv += " VALUES (?,?,?,?)" - + ins_idx = """ INSERT OR REPLACE INTO runs_index(db_name, doc_id, number, name, start, end, tags_json) VALUES (?,?,?,?,?,?,?) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 8e3368a..cfbd119 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -147,7 +147,9 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC - LIMIT 1""", (self.gridfs_db_name, config_name), + LIMIT 1"""\ + , + (self.gridfs_db_name, config_name), ).fetchone() if row is None: @@ -372,7 +374,8 @@ def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int: if "_id" in filter: row = self._conn.execute( "SELECT COUNT(*) AS n FROM kv_collections \ - WHERE db_name=? AND coll_name=? AND doc_id=?", + WHERE db_name=? AND coll_name=? AND doc_id=?"\ + , (self.db_name, self._coll_name, str(filter["_id"])), ).fetchone() return int(row["n"]) if row else 0 @@ -420,11 +423,11 @@ def sort(self, key, direction=1): return self def skip(self, n): - self._docs = self._docs[int(n):] + self._docs = self._docs[int(n) :] return self def limit(self, n): - self._docs = self._docs[:int(n)] + self._docs = self._docs[: int(n)] return self def __iter__(self): From e9a588d462469f1022681894fdbf4804716503f7 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 12:58:15 +0100 Subject: [PATCH 08/34] Fix critical bugs: correct sqlite_path references and complete find_one() logic - Fix AttributeError: replace sqlite_config.sqlite_path with rundb_sqlite_path in rundb.py and mongo_storage.py (3 locations) - Fix NameError in OfflineSQLiteCollection.find_one(): add proper _id handling and default query case - These bugs would cause immediate crashes when offline mode is activated Fixes ensure offline SQLite backend actually works when both rundb and xedocs SQLite files are present. 
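For reference, the offline read path that these fixes unblock (a sketch; 12345 is just a placeholder run number):

    from utilix import xent_collection

    runs = xent_collection("runs")           # OfflineSQLiteCollection when both SQLite files are present
    doc = runs.find_one({"number": 12345})   # resolved via runs_index, then decoded from kv_collections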
--- utilix/mongo_storage.py | 4 ++-- utilix/rundb.py | 2 +- utilix/sqlite_backend.py | 13 ++++++++++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/utilix/mongo_storage.py b/utilix/mongo_storage.py index 45e2f6c..304a86e 100644 --- a/utilix/mongo_storage.py +++ b/utilix/mongo_storage.py @@ -328,7 +328,7 @@ def initialize(self, store_files_at=None, *args, **kwargs): if sqlite_active: self._offline = OfflineGridFS( - sqlite_path=sqlite_cfg.sqlite_path, + sqlite_path=sqlite_cfg.rundb_sqlite_path, offline_root=sqlite_cfg.offline_root, cache_dirs=tuple(self.storage_options), gridfs_db_name="files", @@ -575,7 +575,7 @@ def initialize( if sqlite_active: self._offline = OfflineGridFS( - sqlite_path=sqlite_cfg.sqlite_path, + sqlite_path=sqlite_cfg.rundb_sqlite_path, offline_root=sqlite_cfg.offline_root, cache_dirs=tuple(self.storage_options), gridfs_db_name="files", diff --git a/utilix/rundb.py b/utilix/rundb.py index 209bbf9..2769648 100644 --- a/utilix/rundb.py +++ b/utilix/rundb.py @@ -600,7 +600,7 @@ def _sqlite_collection( database = uconfig.get("RunDB", f"{experiment}_database") return OfflineSQLiteCollection( - sqlite_path=sqlite_config.sqlite_path, + sqlite_path=sqlite_config.rundb_sqlite_path, db_name=database, coll_name=collection, compression=sqlite_config.compression, diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index cfbd119..728d59e 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -320,14 +320,19 @@ def find_one(self, filter: dict | None = None, *args, **kwargs): """ Minimal behavior: - if filter contains _id, return that doc + - if filter contains 'number' (for runs collection), look it up - else return first doc (used as connectivity test) """ filter = filter or {} # _id special case if "_id" in filter: - ... + try: + return self._get_by_id(str(filter["_id"])) + except KeyError: + return None + # Special case for runs collection with number filter if self._coll_name == "runs" and "number" in filter: number = int(filter["number"]) row = self._conn.execute( @@ -338,6 +343,12 @@ def find_one(self, filter: dict | None = None, *args, **kwargs): return None return self._get_by_id(row["doc_id"]) + # Default: return first doc (connectivity test) + row = self._conn.execute( + "SELECT doc_bson_z FROM kv_collections WHERE db_name=? AND coll_name=? LIMIT 1", + (self.db_name, self._coll_name), + ).fetchone() + if row is None: return None return self._decode_row(row) From c354f2acb281792344b5392bc435dffd9532cb52 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 12:58:55 +0100 Subject: [PATCH 09/34] Apply black formatting to sqlite_backend.py --- utilix/sqlite_backend.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 728d59e..1a77cb6 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -147,8 +147,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC - LIMIT 1"""\ - , + LIMIT 1""", (self.gridfs_db_name, config_name), ).fetchone() @@ -348,7 +347,7 @@ def find_one(self, filter: dict | None = None, *args, **kwargs): "SELECT doc_bson_z FROM kv_collections WHERE db_name=? AND coll_name=? 
LIMIT 1", (self.db_name, self._coll_name), ).fetchone() - + if row is None: return None return self._decode_row(row) @@ -385,8 +384,7 @@ def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int: if "_id" in filter: row = self._conn.execute( "SELECT COUNT(*) AS n FROM kv_collections \ - WHERE db_name=? AND coll_name=? AND doc_id=?"\ - , + WHERE db_name=? AND coll_name=? AND doc_id=?", (self.db_name, self._coll_name, str(filter["_id"])), ).fetchone() return int(row["n"]) if row else 0 From f82dc1d4a6c2276438040c6553c590c0ef0a61d6 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:01:33 +0100 Subject: [PATCH 10/34] Add comprehensive tests for offline SQLite backend Tests cover: - SQLiteConfig loading and activation logic - OfflineGridFS file operations (list, download) - OfflineSQLiteCollection queries (find_one, find, count_documents) - xent_collection() fallback behavior (SQLite vs MongoDB) - Edge cases and error handling All 13 tests pass successfully. --- tests/test_offline_sqlite.py | 386 +++++++++++++++++++++++++++++++++++ 1 file changed, 386 insertions(+) create mode 100644 tests/test_offline_sqlite.py diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py new file mode 100644 index 0000000..e69d87e --- /dev/null +++ b/tests/test_offline_sqlite.py @@ -0,0 +1,386 @@ +"""Tests for SQLite offline backend functionality.""" +import os +import sqlite3 +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch, MagicMock + +from bson import BSON + + +class TestSQLiteConfig(unittest.TestCase): + """Test SQLiteConfig dataclass and configuration loading.""" + + def test_load_config_from_env(self): + """Test loading SQLite config from environment variables.""" + from utilix.sqlite_backend import _load_sqlite_config + + with tempfile.TemporaryDirectory() as tmpdir: + rundb_path = Path(tmpdir) / "rundb.sqlite" + xedocs_path = Path(tmpdir) / "xedocs.sqlite" + + # Create empty files + rundb_path.touch() + xedocs_path.touch() + + with patch.dict( + os.environ, + { + "RUNDB_SQLITE_PATH": str(rundb_path), + "XEDOCS_SQLITE_PATH": str(xedocs_path), + }, + ): + cfg = _load_sqlite_config() + + self.assertIsNotNone(cfg.rundb_sqlite_path) + self.assertIsNotNone(cfg.xedocs_sqlite_path) + # Use resolve() on both sides to handle symlinks (e.g., /var -> /private/var on macOS) + self.assertEqual(cfg.rundb_sqlite_path.resolve(), rundb_path.resolve()) + self.assertEqual(cfg.xedocs_sqlite_path.resolve(), xedocs_path.resolve()) + self.assertTrue(cfg.rundb_active()) + self.assertTrue(cfg.xedocs_active()) + self.assertTrue(cfg.sqlite_active()) + + def test_sqlite_active_requires_both_files(self): + """Test that sqlite_active() requires both files to exist.""" + from utilix.sqlite_backend import _load_sqlite_config + + with tempfile.TemporaryDirectory() as tmpdir: + rundb_path = Path(tmpdir) / "rundb.sqlite" + xedocs_path = Path(tmpdir) / "xedocs.sqlite" + + # Only create rundb file + rundb_path.touch() + + with patch.dict( + os.environ, + { + "RUNDB_SQLITE_PATH": str(rundb_path), + "XEDOCS_SQLITE_PATH": str(xedocs_path), + }, + ): + cfg = _load_sqlite_config() + + self.assertTrue(cfg.rundb_active()) + self.assertFalse(cfg.xedocs_active()) + self.assertFalse(cfg.sqlite_active()) # Requires BOTH + + def test_sqlite_active_false_when_no_env_vars(self): + """Test that sqlite_active() is False without environment variables.""" + from utilix.sqlite_backend import _load_sqlite_config + + with patch.dict(os.environ, {}, clear=True): + # 
Remove RUNDB_SQLITE_PATH and XEDOCS_SQLITE_PATH if present + os.environ.pop("RUNDB_SQLITE_PATH", None) + os.environ.pop("XEDOCS_SQLITE_PATH", None) + + cfg = _load_sqlite_config() + + self.assertFalse(cfg.rundb_active()) + self.assertFalse(cfg.xedocs_active()) + self.assertFalse(cfg.sqlite_active()) + + +class TestOfflineGridFS(unittest.TestCase): + """Test OfflineGridFS for file operations.""" + + def setUp(self): + """Create temporary directory and mock SQLite database.""" + self.tmpdir = tempfile.TemporaryDirectory() + self.tmppath = Path(self.tmpdir.name) + + # Create mock SQLite database with gridfs_files table + self.db_path = self.tmppath / "rundb.sqlite" + self.blob_path = self.tmppath / "test_blob.txt" + + # Write test blob + self.blob_path.write_text("test content") + + # Create database with gridfs_files table + conn = sqlite3.connect(str(self.db_path)) + conn.execute( + """ + CREATE TABLE gridfs_files ( + db_name TEXT, + file_id TEXT, + config_name TEXT, + md5 TEXT, + length INTEGER, + uploadDate INTEGER, + blob_path TEXT + ) + """ + ) + conn.execute( + """ + INSERT INTO gridfs_files + (db_name, file_id, config_name, md5, length, uploadDate, blob_path) + VALUES (?, ?, ?, ?, ?, ?, ?) + """, + ( + "files", + "test_id", + "test_config", + "abc123", + 12, + 1234567890, + "test_blob.txt", + ), + ) + conn.commit() + conn.close() + + def tearDown(self): + """Clean up temporary directory.""" + self.tmpdir.cleanup() + + def test_offline_gridfs_list_files(self): + """Test listing files from offline GridFS.""" + from utilix.sqlite_backend import OfflineGridFS + + gfs = OfflineGridFS( + sqlite_path=self.db_path, + offline_root=self.tmppath, + cache_dirs=(self.tmppath / "cache",), + ) + + files = gfs.list_files() + self.assertIn("test_config", files) + gfs.close() + + def test_offline_gridfs_download_single(self): + """Test downloading a single file from offline GridFS.""" + from utilix.sqlite_backend import OfflineGridFS + + cache_dir = self.tmppath / "cache" + gfs = OfflineGridFS( + sqlite_path=self.db_path, + offline_root=self.tmppath, + cache_dirs=(cache_dir,), + ) + + # Download file + result_path = gfs.download_single("test_config") + + # Should be cached by md5 + self.assertTrue(Path(result_path).exists()) + self.assertIn("abc123", result_path) # md5 in filename + + gfs.close() + + def test_offline_gridfs_missing_config_raises(self): + """Test that missing config raises KeyError.""" + from utilix.sqlite_backend import OfflineGridFS + + gfs = OfflineGridFS( + sqlite_path=self.db_path, + offline_root=self.tmppath, + cache_dirs=(self.tmppath / "cache",), + ) + + with self.assertRaises(KeyError): + gfs.download_single("nonexistent_config") + + gfs.close() + + +class TestOfflineSQLiteCollection(unittest.TestCase): + """Test OfflineSQLiteCollection for database queries.""" + + def setUp(self): + """Create temporary SQLite database with test data.""" + self.tmpdir = tempfile.TemporaryDirectory() + self.db_path = Path(self.tmpdir.name) / "rundb.sqlite" + + # Create database with kv_collections and runs_index tables + conn = sqlite3.connect(str(self.db_path)) + + conn.execute( + """ + CREATE TABLE kv_collections ( + db_name TEXT, + coll_name TEXT, + doc_id TEXT, + doc_bson_z BLOB + ) + """ + ) + + conn.execute( + """ + CREATE TABLE runs_index ( + db_name TEXT, + number INTEGER, + doc_id TEXT + ) + """ + ) + + # Insert test document + import zlib + test_doc = {"_id": "test_id_123", "number": 12345, "name": "test_run"} + bson_data = BSON.encode(test_doc) + compressed = zlib.compress(bson_data, 
level=6) + + conn.execute( + "INSERT INTO kv_collections (db_name, coll_name, doc_id, doc_bson_z) VALUES (?, ?, ?, ?)", + ("xenonnt", "runs", "test_id_123", compressed), + ) + + conn.execute( + "INSERT INTO runs_index (db_name, number, doc_id) VALUES (?, ?, ?)", + ("xenonnt", 12345, "test_id_123"), + ) + + conn.commit() + conn.close() + + def tearDown(self): + """Clean up temporary directory.""" + self.tmpdir.cleanup() + + def test_find_one_by_id(self): + """Test find_one with _id filter.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + doc = coll.find_one({"_id": "test_id_123"}) + self.assertIsNotNone(doc) + self.assertEqual(doc["_id"], "test_id_123") + self.assertEqual(doc["number"], 12345) + + coll.close() + + def test_find_one_by_number(self): + """Test find_one with number filter for runs collection.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + doc = coll.find_one({"number": 12345}) + self.assertIsNotNone(doc) + self.assertEqual(doc["number"], 12345) + self.assertEqual(doc["_id"], "test_id_123") + + coll.close() + + def test_find_one_default_returns_first_doc(self): + """Test find_one without filter returns first document.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + doc = coll.find_one() + self.assertIsNotNone(doc) + self.assertEqual(doc["_id"], "test_id_123") + + coll.close() + + def test_count_documents(self): + """Test count_documents method.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + count = coll.count_documents({}) + self.assertEqual(count, 1) + + count = coll.count_documents({"number": 12345}) + self.assertEqual(count, 1) + + count = coll.count_documents({"number": 99999}) + self.assertEqual(count, 0) + + coll.close() + + def test_find_returns_cursor(self): + """Test find method returns iterable cursor.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + cursor = coll.find({"number": 12345}) + docs = list(cursor) + + self.assertEqual(len(docs), 1) + self.assertEqual(docs[0]["number"], 12345) + + coll.close() + + +class TestXentCollectionOffline(unittest.TestCase): + """Test xent_collection() function with offline mode.""" + + def test_xent_collection_uses_sqlite_when_active(self): + """Test that xent_collection uses SQLite when offline is active.""" + from utilix.sqlite_backend import SQLiteConfig, OfflineSQLiteCollection + from utilix.rundb import xent_collection + + with tempfile.TemporaryDirectory() as tmpdir: + rundb_path = Path(tmpdir) / "rundb.sqlite" + xedocs_path = Path(tmpdir) / "xedocs.sqlite" + rundb_path.touch() + xedocs_path.touch() + + with patch.dict( + os.environ, + { + "RUNDB_SQLITE_PATH": str(rundb_path), + "XEDOCS_SQLITE_PATH": str(xedocs_path), + }, + ): + with patch("utilix.rundb.uconfig") as mock_config: + mock_config.get.return_value = "xenonnt" + + coll = xent_collection("runs") + + # Should return 
OfflineSQLiteCollection when offline is active + self.assertIsInstance(coll, OfflineSQLiteCollection) + coll.close() + + def test_xent_collection_uses_mongodb_when_offline_inactive(self): + """Test that xent_collection uses MongoDB when offline is not active.""" + from utilix.rundb import xent_collection + + with patch.dict(os.environ, {}, clear=True): + os.environ.pop("RUNDB_SQLITE_PATH", None) + os.environ.pop("XEDOCS_SQLITE_PATH", None) + + with patch("utilix.rundb._collection") as mock_collection: + mock_collection.return_value = MagicMock() + + coll = xent_collection("runs") + + # Should call _collection (MongoDB) when offline is not active + mock_collection.assert_called_once() + + +if __name__ == "__main__": + unittest.main() From 22559116e965310d1cd1b99703962f8d2fee9fb9 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:07:26 +0100 Subject: [PATCH 11/34] Add comprehensive documentation for offline SQLite backend - Added detailed section in README.md covering setup, usage, and troubleshooting - Documented environment variables and their purposes - Added examples for generating SQLite dumps and using offline mode - Included limitations and performance considerations - Enhanced module docstring in sqlite_backend.py with usage examples --- README.md | 137 +++++++++++++++++++++++++++++++++++++++ utilix/sqlite_backend.py | 36 ++++++++++ 2 files changed, 173 insertions(+) diff --git a/README.md b/README.md index c2c0274..1f0660d 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,143 @@ If you need to use different databases or do not want to use the information lis >>> xe1t_coll, xe1t_db, xe1t_user, xe1t_pw, xe1t_url = [ask someone] >>> xe1t_collection = pymongo_collection(xe1t_coll, database=xe1t_coll, user=xe1t_user, password=xe1t_pw, url=xe1t_url) +## Offline SQLite Backend + +For scenarios where network connectivity to MongoDB is unavailable (e.g., working on compute nodes without internet access, traveling, or during database outages), `utilix` provides an **offline SQLite backend** that allows you to continue working with local copies of the database and GridFS files. + +### Overview + +The offline backend consists of two main components: +1. **OfflineSQLiteCollection**: A pymongo-compatible wrapper around SQLite databases containing BSON-compressed documents +2. **OfflineGridFS**: A GridFS-compatible interface for accessing cached files + +When properly configured, utilix will automatically use the SQLite backend when both database files are available, and seamlessly fall back to MongoDB when they're not. + +### Setup + +#### 1. Generate SQLite Database Files + +First, you need to create SQLite dumps of the MongoDB collections you need. The `mongo_to_sqlite.py` script (included in utilix) handles this: + +```bash +# Create a spec file listing what to dump +cat > dump_spec.txt << EOF +xenonnt:runs +files:GRIDFS +xedocs:ALL +corrections:ALL +EOF + +# Run the dump (requires MongoDB access) +python -m utilix.mongo_to_sqlite \ + --spec dump_spec.txt \ + --rundb-out /path/to/rundb.sqlite \ + --xedocs-out /path/to/xedocs.sqlite +``` + +This will create two SQLite files: +- `rundb.sqlite`: Contains runs collection, GridFS file index, and file blobs +- `xedocs.sqlite`: Contains corrections and other xedocs collections + +**Note**: The dump process can take significant time depending on data size. Plan accordingly. + +#### 2. 
Configure Environment Variables + +Set the following environment variables to enable offline mode: + +```bash +export RUNDB_SQLITE_PATH="/path/to/rundb.sqlite" +export XEDOCS_SQLITE_PATH="/path/to/xedocs.sqlite" + +# Optional: Set compression algorithm (default: zstd) +export OFFLINE_COMP="zstd" # or "zlib" + +# Optional: Enable debug logging +export OFFLINE_DEBUG="1" +``` + +**Important**: Both SQLite files must exist and be accessible for offline mode to activate. If either is missing, utilix will fall back to MongoDB automatically. + +#### 3. Use Normally + +Once configured, your existing code works without modification: + +```python +from utilix import xent_collection + +# Automatically uses SQLite if files are present, MongoDB otherwise +runs = xent_collection("runs") +doc = runs.find_one({"number": 12345}) + +# GridFS downloads also work offline +from utilix.mongo_storage import MongoDownloader +downloader = MongoDownloader() +path = downloader.download_single("my_config") +``` + +### Features and Limitations + +#### Supported Operations +- ✅ `find_one()` with `_id`, `number`, or no filter +- ✅ `find()` with basic filters +- ✅ `count_documents()` +- ✅ GridFS file listing and downloads +- ✅ Automatic MD5-based file caching +- ✅ BSON compression (zstd or zlib) + +#### Limitations +- ⚠️ Complex queries (aggregations, regex, etc.) may not work +- ⚠️ Cursor operations like `sort()` without `limit()` will raise errors to prevent loading entire collections +- ⚠️ Write operations are not supported (read-only) +- ⚠️ The offline database is a snapshot; it won't reflect new data added to MongoDB + +### Performance Considerations + +- SQLite databases are compressed with zstd (or zlib as fallback), typically achieving 5-10x compression +- First-time file access requires decompression; subsequent accesses benefit from OS caching +- For large result sets, queries may be slower than MongoDB due to BSON decompression overhead +- GridFS files are cached by MD5 hash to avoid re-downloading + +### Updating Your Offline Database + +The SQLite files are static snapshots. To refresh them with new data: + +```bash +# Re-run the dump script +python -m utilix.mongo_to_sqlite \ + --spec dump_spec.txt \ + --rundb-out /path/to/rundb.sqlite \ + --xedocs-out /path/to/xedocs.sqlite \ + --overwrite # Add this flag to replace existing files +``` + +### Troubleshooting + +**Problem**: `AttributeError: 'SQLiteConfig' object has no attribute 'sqlite_path'` +- **Solution**: Update to the latest version of utilix. This was a bug in early versions. + +**Problem**: Offline mode not activating +- **Check**: Both environment variables are set: `echo $RUNDB_SQLITE_PATH $XEDOCS_SQLITE_PATH` +- **Check**: Both files exist: `ls -lh $RUNDB_SQLITE_PATH $XEDOCS_SQLITE_PATH` +- **Check**: Set `OFFLINE_DEBUG=1` to see debug messages + +**Problem**: `KeyError: Config 'xyz' not found in offline gridfs_files index` +- **Solution**: The file wasn't included in the dump. Re-dump with the file added to your spec. + +**Problem**: Queries return different results than MongoDB +- **Note**: This shouldn't happen for supported queries. Please report as a bug with example code. 
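+
+If offline mode does not seem to activate, a quick sanity check is to inspect the parsed configuration directly. This is a minimal sketch using only the helpers added in this change (`_load_sqlite_config` and the `SQLiteConfig` activation methods):
+
+```python
+from utilix.sqlite_backend import _load_sqlite_config
+
+cfg = _load_sqlite_config()
+print("rundb active :", cfg.rundb_active())    # RUNDB_SQLITE_PATH set and file exists
+print("xedocs active:", cfg.xedocs_active())   # XEDOCS_SQLITE_PATH set and file exists
+print("offline mode :", cfg.sqlite_active())   # True only when both are available
+```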
+ +### Environment Variable Reference + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `RUNDB_SQLITE_PATH` | Yes | - | Path to RunDB SQLite file | +| `XEDOCS_SQLITE_PATH` | Yes | - | Path to xedocs SQLite file | +| `OFFLINE_COMP` | No | `zstd` | Compression algorithm (`zstd` or `zlib`) | +| `OFFLINE_DEBUG` | No | `0` | Enable debug logging (`1` or `0`) | +| `OFFLINE_HARD` | No | `0` | Raise errors instead of warnings on unsupported ops | +| `PYMONGO_SPY` | No | `0` | Log when pymongo.MongoClient is created (for debugging) | + ## Data processing requests You may find yourself missing some data which requires a large amount of resources to process. In these cases, you can submit a processing request to the computing team. diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 1a77cb6..5eefc84 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -1,5 +1,41 @@ from __future__ import annotations +"""SQLite offline backend for utilix. + +This module provides offline access to XENON RunDB and GridFS data using local +SQLite databases. It allows analysis to continue when MongoDB is unreachable. + +Usage: + 1. Generate SQLite files using mongo_to_sqlite.py (requires MongoDB access) + 2. Set environment variables: + export RUNDB_SQLITE_PATH="/path/to/rundb.sqlite" + export XEDOCS_SQLITE_PATH="/path/to/xedocs.sqlite" + 3. Use utilix normally - offline mode activates automatically + +Example: + >>> import os + >>> os.environ["RUNDB_SQLITE_PATH"] = "/data/rundb.sqlite" + >>> os.environ["XEDOCS_SQLITE_PATH"] = "/data/xedocs.sqlite" + >>> + >>> from utilix import xent_collection + >>> runs = xent_collection("runs") # Uses SQLite if files exist + >>> doc = runs.find_one({"number": 12345}) + +Environment Variables: + RUNDB_SQLITE_PATH: Path to RunDB SQLite file (required) + XEDOCS_SQLITE_PATH: Path to xedocs SQLite file (required) + OFFLINE_COMP: Compression algorithm, 'zstd' or 'zlib' (default: 'zstd') + OFFLINE_DEBUG: Enable debug logging, '1' or '0' (default: '0') + +Classes: + SQLiteConfig: Configuration dataclass for offline mode + OfflineGridFS: GridFS-compatible offline file access + OfflineSQLiteCollection: pymongo-compatible offline collection access + +Functions: + _load_sqlite_config: Load configuration from environment variables +""" + import os import sqlite3 import shutil From 9a9cd93307d50ecb67f04206e2a07a779e324db4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:13:43 +0000 Subject: [PATCH 12/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_offline_sqlite.py | 126 ++++++++++++++++------------------- utilix/sqlite_backend.py | 8 ++- 2 files changed, 62 insertions(+), 72 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index e69d87e..d33b1a2 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -1,4 +1,5 @@ """Tests for SQLite offline backend functionality.""" + import os import sqlite3 import tempfile @@ -19,7 +20,7 @@ def test_load_config_from_env(self): with tempfile.TemporaryDirectory() as tmpdir: rundb_path = Path(tmpdir) / "rundb.sqlite" xedocs_path = Path(tmpdir) / "xedocs.sqlite" - + # Create empty files rundb_path.touch() xedocs_path.touch() @@ -49,7 +50,7 @@ def test_sqlite_active_requires_both_files(self): with tempfile.TemporaryDirectory() as tmpdir: rundb_path = Path(tmpdir) / 
"rundb.sqlite" xedocs_path = Path(tmpdir) / "xedocs.sqlite" - + # Only create rundb file rundb_path.touch() @@ -74,7 +75,7 @@ def test_sqlite_active_false_when_no_env_vars(self): # Remove RUNDB_SQLITE_PATH and XEDOCS_SQLITE_PATH if present os.environ.pop("RUNDB_SQLITE_PATH", None) os.environ.pop("XEDOCS_SQLITE_PATH", None) - + cfg = _load_sqlite_config() self.assertFalse(cfg.rundb_active()) @@ -89,35 +90,28 @@ def setUp(self): """Create temporary directory and mock SQLite database.""" self.tmpdir = tempfile.TemporaryDirectory() self.tmppath = Path(self.tmpdir.name) - + # Create mock SQLite database with gridfs_files table self.db_path = self.tmppath / "rundb.sqlite" self.blob_path = self.tmppath / "test_blob.txt" - + # Write test blob self.blob_path.write_text("test content") - + # Create database with gridfs_files table conn = sqlite3.connect(str(self.db_path)) conn.execute( - """ - CREATE TABLE gridfs_files ( - db_name TEXT, - file_id TEXT, - config_name TEXT, - md5 TEXT, - length INTEGER, - uploadDate INTEGER, - blob_path TEXT - ) + """CREATE TABLE gridfs_files ( db_name TEXT, file_id TEXT, config_name TEXT, md5 TEXT, + length INTEGER, uploadDate INTEGER, + + blob_path TEXT ) + """ ) conn.execute( - """ - INSERT INTO gridfs_files - (db_name, file_id, config_name, md5, length, uploadDate, blob_path) - VALUES (?, ?, ?, ?, ?, ?, ?) - """, + """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, + blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" + , ( "files", "test_id", @@ -144,7 +138,7 @@ def test_offline_gridfs_list_files(self): offline_root=self.tmppath, cache_dirs=(self.tmppath / "cache",), ) - + files = gfs.list_files() self.assertIn("test_config", files) gfs.close() @@ -159,14 +153,14 @@ def test_offline_gridfs_download_single(self): offline_root=self.tmppath, cache_dirs=(cache_dir,), ) - + # Download file result_path = gfs.download_single("test_config") - + # Should be cached by md5 self.assertTrue(Path(result_path).exists()) self.assertIn("abc123", result_path) # md5 in filename - + gfs.close() def test_offline_gridfs_missing_config_raises(self): @@ -178,10 +172,10 @@ def test_offline_gridfs_missing_config_raises(self): offline_root=self.tmppath, cache_dirs=(self.tmppath / "cache",), ) - + with self.assertRaises(KeyError): gfs.download_single("nonexistent_config") - + gfs.close() @@ -192,47 +186,41 @@ def setUp(self): """Create temporary SQLite database with test data.""" self.tmpdir = tempfile.TemporaryDirectory() self.db_path = Path(self.tmpdir.name) / "rundb.sqlite" - + # Create database with kv_collections and runs_index tables conn = sqlite3.connect(str(self.db_path)) - - conn.execute( - """ - CREATE TABLE kv_collections ( - db_name TEXT, - coll_name TEXT, - doc_id TEXT, - doc_bson_z BLOB - ) + .execute( + """CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, + + doc_bson_z BLOB ) + """ ) - conn.execute( - """ - CREATE TABLE runs_index ( - db_name TEXT, - number INTEGER, - doc_id TEXT - ) + """CREATE TABLE runs_index ( db_name TEXT, number INTEGER, + + doc_id TEXT ) + """ ) - + # Insert test document import zlib + test_doc = {"_id": "test_id_123", "number": 12345, "name": "test_run"} bson_data = BSON.encode(test_doc) compressed = zlib.compress(bson_data, level=6) - + conn.execute( "INSERT INTO kv_collections (db_name, coll_name, doc_id, doc_bson_z) VALUES (?, ?, ?, ?)", ("xenonnt", "runs", "test_id_123", compressed), ) - + conn.execute( "INSERT INTO runs_index (db_name, number, doc_id) VALUES (?, ?, ?)", ("xenonnt", 12345, 
"test_id_123"), ) - + conn.commit() conn.close() @@ -250,12 +238,12 @@ def test_find_one_by_id(self): coll_name="runs", compression="zlib", ) - + doc = coll.find_one({"_id": "test_id_123"}) self.assertIsNotNone(doc) self.assertEqual(doc["_id"], "test_id_123") self.assertEqual(doc["number"], 12345) - + coll.close() def test_find_one_by_number(self): @@ -268,12 +256,12 @@ def test_find_one_by_number(self): coll_name="runs", compression="zlib", ) - + doc = coll.find_one({"number": 12345}) self.assertIsNotNone(doc) self.assertEqual(doc["number"], 12345) self.assertEqual(doc["_id"], "test_id_123") - + coll.close() def test_find_one_default_returns_first_doc(self): @@ -286,11 +274,11 @@ def test_find_one_default_returns_first_doc(self): coll_name="runs", compression="zlib", ) - + doc = coll.find_one() self.assertIsNotNone(doc) self.assertEqual(doc["_id"], "test_id_123") - + coll.close() def test_count_documents(self): @@ -303,16 +291,16 @@ def test_count_documents(self): coll_name="runs", compression="zlib", ) - + count = coll.count_documents({}) self.assertEqual(count, 1) - + count = coll.count_documents({"number": 12345}) self.assertEqual(count, 1) - + count = coll.count_documents({"number": 99999}) self.assertEqual(count, 0) - + coll.close() def test_find_returns_cursor(self): @@ -325,13 +313,13 @@ def test_find_returns_cursor(self): coll_name="runs", compression="zlib", ) - + cursor = coll.find({"number": 12345}) docs = list(cursor) - + self.assertEqual(len(docs), 1) self.assertEqual(docs[0]["number"], 12345) - + coll.close() @@ -342,13 +330,13 @@ def test_xent_collection_uses_sqlite_when_active(self): """Test that xent_collection uses SQLite when offline is active.""" from utilix.sqlite_backend import SQLiteConfig, OfflineSQLiteCollection from utilix.rundb import xent_collection - + with tempfile.TemporaryDirectory() as tmpdir: rundb_path = Path(tmpdir) / "rundb.sqlite" xedocs_path = Path(tmpdir) / "xedocs.sqlite" rundb_path.touch() xedocs_path.touch() - + with patch.dict( os.environ, { @@ -358,9 +346,9 @@ def test_xent_collection_uses_sqlite_when_active(self): ): with patch("utilix.rundb.uconfig") as mock_config: mock_config.get.return_value = "xenonnt" - + coll = xent_collection("runs") - + # Should return OfflineSQLiteCollection when offline is active self.assertIsInstance(coll, OfflineSQLiteCollection) coll.close() @@ -368,16 +356,16 @@ def test_xent_collection_uses_sqlite_when_active(self): def test_xent_collection_uses_mongodb_when_offline_inactive(self): """Test that xent_collection uses MongoDB when offline is not active.""" from utilix.rundb import xent_collection - + with patch.dict(os.environ, {}, clear=True): os.environ.pop("RUNDB_SQLITE_PATH", None) os.environ.pop("XEDOCS_SQLITE_PATH", None) - + with patch("utilix.rundb._collection") as mock_collection: mock_collection.return_value = MagicMock() - + coll = xent_collection("runs") - + # Should call _collection (MongoDB) when offline is not active mock_collection.assert_called_once() diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 5eefc84..257e2f4 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -16,7 +16,7 @@ >>> import os >>> os.environ["RUNDB_SQLITE_PATH"] = "/data/rundb.sqlite" >>> os.environ["XEDOCS_SQLITE_PATH"] = "/data/xedocs.sqlite" - >>> + >>> >>> from utilix import xent_collection >>> runs = xent_collection("runs") # Uses SQLite if files exist >>> doc = runs.find_one({"number": 12345}) @@ -183,7 +183,8 @@ def latest_by_config_name(self, config_name: str) -> 
Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC - LIMIT 1""", + LIMIT 1"""\ + , (self.gridfs_db_name, config_name), ).fetchone() @@ -420,7 +421,8 @@ def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int: if "_id" in filter: row = self._conn.execute( "SELECT COUNT(*) AS n FROM kv_collections \ - WHERE db_name=? AND coll_name=? AND doc_id=?", + WHERE db_name=? AND coll_name=? AND doc_id=?"\ + , (self.db_name, self._coll_name, str(filter["_id"])), ).fetchone() return int(row["n"]) if row else 0 From 3e7b04382cb2683101e4e32cf495ecf13acdd9e1 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:23:54 +0100 Subject: [PATCH 13/34] Fix pre-commit issues: syntax error and flake8 violations --- tests/test_offline_sqlite.py | 28 ++++++++++++---------------- utilix/sqlite_backend.py | 8 +++----- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index d33b1a2..278d0dc 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -110,8 +110,7 @@ def setUp(self): ) conn.execute( """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, - blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" - , + blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""", ( "files", "test_id", @@ -189,20 +188,17 @@ def setUp(self): # Create database with kv_collections and runs_index tables conn = sqlite3.connect(str(self.db_path)) - .execute( - """CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, - - doc_bson_z BLOB ) - - """ - ) - conn.execute( - """CREATE TABLE runs_index ( db_name TEXT, number INTEGER, - - doc_id TEXT ) - - """ - ) + conn.execute("""CREATE TABLE kv_collections ( + db_name TEXT, + coll_name TEXT, + doc_id TEXT, + doc_bson_z BLOB + )""") + conn.execute("""CREATE TABLE runs_index ( + db_name TEXT, + number INTEGER, + doc_id TEXT + )""") # Insert test document import zlib diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 257e2f4..ea23e0f 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -183,8 +183,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC - LIMIT 1"""\ - , + LIMIT 1""", (self.gridfs_db_name, config_name), ).fetchone() @@ -420,9 +419,8 @@ def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int: if "_id" in filter: row = self._conn.execute( - "SELECT COUNT(*) AS n FROM kv_collections \ - WHERE db_name=? AND coll_name=? AND doc_id=?"\ - , + "SELECT COUNT(*) AS n FROM kv_collections " + "WHERE db_name=? AND coll_name=? 
AND doc_id=?", (self.db_name, self._coll_name, str(filter["_id"])), ).fetchone() return int(row["n"]) if row else 0 From cd129350bbaddcb7aa9e090bef2a31169fddf8b4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:24:08 +0000 Subject: [PATCH 14/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_offline_sqlite.py | 28 ++++++++++++++++------------ utilix/sqlite_backend.py | 3 ++- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index 278d0dc..3065cdf 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -110,7 +110,8 @@ def setUp(self): ) conn.execute( """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, - blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""", + blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" + , ( "files", "test_id", @@ -188,17 +189,20 @@ def setUp(self): # Create database with kv_collections and runs_index tables conn = sqlite3.connect(str(self.db_path)) - conn.execute("""CREATE TABLE kv_collections ( - db_name TEXT, - coll_name TEXT, - doc_id TEXT, - doc_bson_z BLOB - )""") - conn.execute("""CREATE TABLE runs_index ( - db_name TEXT, - number INTEGER, - doc_id TEXT - )""") + conn.execute( + """CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, + + doc_bson_z BLOB ) + + """ + ) + conn.execute( + """CREATE TABLE runs_index ( db_name TEXT, number INTEGER, + + doc_id TEXT ) + + """ + ) # Insert test document import zlib diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index ea23e0f..0e65e5f 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -183,7 +183,8 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? 
ORDER BY uploadDate DESC - LIMIT 1""", + LIMIT 1"""\ + , (self.gridfs_db_name, config_name), ).fetchone() From 1cba0767e8b423d82c126e0e40c3493e10b300e2 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:25:54 +0100 Subject: [PATCH 15/34] Fix flake8 issues in test file: line length and unused variables --- tests/test_offline_sqlite.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index 3065cdf..e0e4663 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -36,7 +36,7 @@ def test_load_config_from_env(self): self.assertIsNotNone(cfg.rundb_sqlite_path) self.assertIsNotNone(cfg.xedocs_sqlite_path) - # Use resolve() on both sides to handle symlinks (e.g., /var -> /private/var on macOS) + # Use resolve() to handle symlinks (e.g., /var -> /private/var) self.assertEqual(cfg.rundb_sqlite_path.resolve(), rundb_path.resolve()) self.assertEqual(cfg.xedocs_sqlite_path.resolve(), xedocs_path.resolve()) self.assertTrue(cfg.rundb_active()) @@ -212,7 +212,8 @@ def setUp(self): compressed = zlib.compress(bson_data, level=6) conn.execute( - "INSERT INTO kv_collections (db_name, coll_name, doc_id, doc_bson_z) VALUES (?, ?, ?, ?)", + "INSERT INTO kv_collections " + "(db_name, coll_name, doc_id, doc_bson_z) VALUES (?, ?, ?, ?)", ("xenonnt", "runs", "test_id_123", compressed), ) @@ -328,7 +329,7 @@ class TestXentCollectionOffline(unittest.TestCase): def test_xent_collection_uses_sqlite_when_active(self): """Test that xent_collection uses SQLite when offline is active.""" - from utilix.sqlite_backend import SQLiteConfig, OfflineSQLiteCollection + from utilix.sqlite_backend import OfflineSQLiteCollection from utilix.rundb import xent_collection with tempfile.TemporaryDirectory() as tmpdir: @@ -364,7 +365,7 @@ def test_xent_collection_uses_mongodb_when_offline_inactive(self): with patch("utilix.rundb._collection") as mock_collection: mock_collection.return_value = MagicMock() - coll = xent_collection("runs") + _result = xent_collection("runs") # noqa: F841 # Should call _collection (MongoDB) when offline is not active mock_collection.assert_called_once() From dd89821e312732c68b519458f4c090327e6cff8f Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:29:25 +0100 Subject: [PATCH 16/34] Fix mypy type errors and flake8 whitespace issues --- .DS_Store | Bin 0 -> 6148 bytes tests/test_offline_sqlite.py | 17 ++++++----------- utilix/mongo_storage.py | 6 ++++++ utilix/rundb.py | 1 + utilix/sqlite_backend.py | 5 ++--- 5 files changed, 15 insertions(+), 14 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..948a180f0f8545b3f10a20ed5e9bc12f71afb3da GIT binary patch literal 6148 zcmeHKu}%U(5S_(SPK^nrvD(%`lly~%9F>g;KVT38CXgcm)bhT-*xAVs_zkv}@(K3Z z+xljA!Cek4jnSE8=H2bg>^>g3+gl=1jY%gVsu58PXN-0bRv72mOIC3s+^l$xem7~I z=YwIXQU*i;QQ$W!U>F5ICy^Jt0@Ra^V}fMvUWdjV=R6={J4LAc&>-}&bRqt z&wDp)zoM0@^SP-<6KV&mQ}w!B;d7Yo&6;nP!_qv1{ENx@s(=q^=Ces-x}Xu_DCBZ_ z5N3TIPe%(LUcb*_bw#-T9k0Iy6~_1<`tX~8L8X4XocB4rB+V12MKFiYVqHzZqcUS2 zV*f({?AdJbu0@HWfG8jeEEM4PgN8Gv4hxIs)q!A+0Kf>t#&FG_6x^d7rVa~>$iS3s z1=?2Sju^_eV_dts)L~)Kwv%#)59L-??u4Su>X=`fa#E>9iK2ig5Gt@?Z)?2&H`ky4 z!zQ^C1w?^ArGSdER@T5za(ip($MIgP;b(9*t}85>76i8)n+x8G_u Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? 
ORDER BY uploadDate DESC - LIMIT 1"""\ - , + LIMIT 1""", (self.gridfs_db_name, config_name), ).fetchone() @@ -552,7 +551,7 @@ def gen(): _orig_mc = pymongo.MongoClient -class MongoClientSpy(_orig_mc): +class MongoClientSpy(_orig_mc): # type: ignore[misc,valid-type] def __init__(self, *args, **kwargs): cfg = _load_sqlite_config() if cfg.spy: From 9602232ea7b3af57904d484cba25fe56a0c97eea Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:30:01 +0000 Subject: [PATCH 17/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_offline_sqlite.py | 17 +++++++++++------ utilix/sqlite_backend.py | 3 ++- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index cec4ae5..58ed492 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -109,8 +109,9 @@ def setUp(self): """ ) conn.execute( - """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, - uploadDate, blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""", + """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, + blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" + , ( "files", "test_id", @@ -188,16 +189,20 @@ def setUp(self): # Create database with kv_collections and runs_index tables conn = sqlite3.connect(str(self.db_path)) - conn.execute("""CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, + conn.execute( + """CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, doc_bson_z BLOB ) - """) - conn.execute("""CREATE TABLE runs_index ( db_name TEXT, number INTEGER, + """ + ) + conn.execute( + """CREATE TABLE runs_index ( db_name TEXT, number INTEGER, doc_id TEXT ) - """) + """ + ) # Insert test document import zlib diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 4680d86..5a95643 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -183,7 +183,8 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? 
ORDER BY uploadDate DESC - LIMIT 1""", + LIMIT 1"""\ + , (self.gridfs_db_name, config_name), ).fetchone() From 85733de22e6adb4e7b6bb3c3b06e5b079f420cab Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:38:35 +0100 Subject: [PATCH 18/34] Fix pre-existing flake8 and mypy issues in mongo_to_sqlite.py --- utilix/mongo_to_sqlite.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index 3d70687..778eeb5 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -28,7 +28,6 @@ from bson import BSON from bson.objectid import ObjectId - # ------------------------- # Compression helpers # ------------------------- @@ -240,12 +239,12 @@ def q(name: str) -> str: # - time interval lookup: version + interval # - common labels (if present) index_sql = [ - f"CREATE INDEX IF NOT EXISTS \ - {q('idx_' + table + '_version_time')} \ - ON {q(table)}({q('version')}, {q('time_ns')});", - f"CREATE INDEX IF NOT EXISTS \ - {q('idx_' + table + '_version_interval')} \ - ON {q(table)}({q('version')}, {q('time_left_ns')}, {q('time_right_ns')});", + f"CREATE INDEX IF NOT EXISTS " + f"{q('idx_' + table + '_version_time')} " + f"ON {q(table)}({q('version')}, {q('time_ns')});", + f"CREATE INDEX IF NOT EXISTS " + f"{q('idx_' + table + '_version_interval')} " + f"ON {q(table)}({q('version')}, {q('time_left_ns')}, {q('time_right_ns')});", ] # Optional label indexes (keep this small to avoid DB bloat) @@ -262,11 +261,9 @@ def q(name: str) -> str: n_extra = 0 for lab in preferred: if lab in present: - index_sql.append( - f"CREATE INDEX IF NOT EXISTS \ + index_sql.append(f"CREATE INDEX IF NOT EXISTS \ {q('idx_' + table + '_version_' + lab)} \ - ON {q(table)}({q('version')}, {q(lab)});" - ) + ON {q(table)}({q('version')}, {q(lab)});") n_extra += 1 if n_extra >= 6: break @@ -787,9 +784,11 @@ def q(name: str) -> str: return '"' + name.replace('"', '""') + '"' placeholders = ",".join(["?"] * len(all_cols)) - ins = f"INSERT OR REPLACE INTO \ - {q(table)}({','.join(q(c) for c in all_cols)}) \ - VALUES ({placeholders})" + ins = ( + f"INSERT OR REPLACE INTO " + f"{q(table)}({','.join(q(c) for c in all_cols)}) " + f"VALUES ({placeholders})" + ) cur = coll.find({}, no_cursor_timeout=True, batch_size=batch_size) @@ -797,8 +796,8 @@ def q(name: str) -> str: buf: List[Tuple[Any, ...]] = [] for doc in cur: - e = _xedocs_extract(doc, label_cols=extra_cols) - row = tuple(e.get(c) for c in all_cols) + extracted = _xedocs_extract(doc, label_cols=extra_cols) + row = tuple(extracted.get(c) for c in all_cols) buf.append(row) n += 1 From 88603c332b25fa1f793965ed4f293ae4382d3d40 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:39:23 +0000 Subject: [PATCH 19/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_offline_sqlite.py | 2 +- utilix/mongo_to_sqlite.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index 58ed492..e0e4663 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -111,7 +111,7 @@ def setUp(self): conn.execute( """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" - , + , ( "files", "test_id", diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index 
778eeb5..6aa8b1e 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -261,9 +261,11 @@ def q(name: str) -> str: n_extra = 0 for lab in preferred: if lab in present: - index_sql.append(f"CREATE INDEX IF NOT EXISTS \ + index_sql.append( + f"CREATE INDEX IF NOT EXISTS \ {q('idx_' + table + '_version_' + lab)} \ - ON {q(table)}({q('version')}, {q(lab)});") + ON {q(table)}({q('version')}, {q(lab)});" + ) n_extra += 1 if n_extra >= 6: break From 30487949a72d781c1dd60eb0ddc47e7331e40a8d Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:42:04 +0100 Subject: [PATCH 20/34] Add noqa comments for unavoidable black/flake8 conflicts --- tests/test_offline_sqlite.py | 2 +- utilix/sqlite_backend.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index e0e4663..703e01f 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -111,7 +111,7 @@ def setUp(self): conn.execute( """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" - , + , # noqa: E203,E131 ( "files", "test_id", diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 5a95643..64c9c20 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -184,7 +184,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC LIMIT 1"""\ - , + , # noqa: E502,E203 (self.gridfs_db_name, config_name), ).fetchone() @@ -469,7 +469,7 @@ def sort(self, key, direction=1): return self def skip(self, n): - self._docs = self._docs[int(n) :] + self._docs = self._docs[int(n) :] # noqa: E203 return self def limit(self, n): From 598a265373afb6930d69c8b6530972c6b0024f5b Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:43:47 +0100 Subject: [PATCH 21/34] Move noqa comment to correct line in test file --- tests/test_offline_sqlite.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index 703e01f..4d37646 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -110,8 +110,7 @@ def setUp(self): ) conn.execute( """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, - blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" - , # noqa: E203,E131 + blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""", # noqa: E203,E131 ( "files", "test_id", From 08349dbcf86a99099eb5236d1827d9489f121969 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:44:23 +0000 Subject: [PATCH 22/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_offline_sqlite.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index 4d37646..703e01f 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -110,7 +110,8 @@ def setUp(self): ) conn.execute( """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, - blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""", # noqa: E203,E131 + blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" + , # noqa: E203,E131 ( "files", "test_id", From 9357e3036223ef7e4e64c2aca10656bf0ab52f43 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:45:54 +0100 Subject: [PATCH 23/34] 
Rewrite SQL as single-line strings to avoid black/flake8 conflict --- tests/test_offline_sqlite.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index 703e01f..113367b 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -109,9 +109,9 @@ def setUp(self): """ ) conn.execute( - """INSERT INTO gridfs_files (db_name, file_id, config_name, md5, length, uploadDate, - blob_path) VALUES (?, ?, ?, ?, ?, ?, ?)""" - , # noqa: E203,E131 + "INSERT INTO gridfs_files " + "(db_name, file_id, config_name, md5, length, uploadDate, blob_path) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", ( "files", "test_id", @@ -189,20 +189,16 @@ def setUp(self): # Create database with kv_collections and runs_index tables conn = sqlite3.connect(str(self.db_path)) - conn.execute( - """CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, + conn.execute("""CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, doc_bson_z BLOB ) - """ - ) - conn.execute( - """CREATE TABLE runs_index ( db_name TEXT, number INTEGER, + """) + conn.execute("""CREATE TABLE runs_index ( db_name TEXT, number INTEGER, doc_id TEXT ) - """ - ) + """) # Insert test document import zlib From 83d083f8d62ae3ecdabe102ea0059e8c9db20bf3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:47:45 +0000 Subject: [PATCH 24/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_offline_sqlite.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py index 113367b..f4f568b 100644 --- a/tests/test_offline_sqlite.py +++ b/tests/test_offline_sqlite.py @@ -189,16 +189,20 @@ def setUp(self): # Create database with kv_collections and runs_index tables conn = sqlite3.connect(str(self.db_path)) - conn.execute("""CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, + conn.execute( + """CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, doc_bson_z BLOB ) - """) - conn.execute("""CREATE TABLE runs_index ( db_name TEXT, number INTEGER, + """ + ) + conn.execute( + """CREATE TABLE runs_index ( db_name TEXT, number INTEGER, doc_id TEXT ) - """) + """ + ) # Insert test document import zlib From 8e75fedf674d35c86bc9f1919293eef0a5f8552e Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 13:55:05 +0100 Subject: [PATCH 25/34] Allow long first line in SQL query with noqa --- utilix/sqlite_backend.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 64c9c20..d421906 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -183,8 +183,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? 
ORDER BY uploadDate DESC - LIMIT 1"""\ - , # noqa: E502,E203 + LIMIT 1""", # noqa: E502,E203 (self.gridfs_db_name, config_name), ).fetchone() From 2d0a6eb39568f4e5420c4c5f17963ed438729755 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:55:21 +0000 Subject: [PATCH 26/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utilix/sqlite_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index d421906..64c9c20 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -183,7 +183,8 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC - LIMIT 1""", # noqa: E502,E203 + LIMIT 1"""\ + , # noqa: E502,E203 (self.gridfs_db_name, config_name), ).fetchone() From 388ea0557235dfc0cc6b7c83e8752528e0487d31 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 14:00:38 +0100 Subject: [PATCH 27/34] Remove accidentally committed .DS_Store file --- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 948a180f0f8545b3f10a20ed5e9bc12f71afb3da..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKu}%U(5S_(SPK^nrvD(%`lly~%9F>g;KVT38CXgcm)bhT-*xAVs_zkv}@(K3Z z+xljA!Cek4jnSE8=H2bg>^>g3+gl=1jY%gVsu58PXN-0bRv72mOIC3s+^l$xem7~I z=YwIXQU*i;QQ$W!U>F5ICy^Jt0@Ra^V}fMvUWdjV=R6={J4LAc&>-}&bRqt z&wDp)zoM0@^SP-<6KV&mQ}w!B;d7Yo&6;nP!_qv1{ENx@s(=q^=Ces-x}Xu_DCBZ_ z5N3TIPe%(LUcb*_bw#-T9k0Iy6~_1<`tX~8L8X4XocB4rB+V12MKFiYVqHzZqcUS2 zV*f({?AdJbu0@HWfG8jeEEM4PgN8Gv4hxIs)q!A+0Kf>t#&FG_6x^d7rVa~>$iS3s z1=?2Sju^_eV_dts)L~)Kwv%#)59L-??u4Su>X=`fa#E>9iK2ig5Gt@?Z)?2&H`ky4 z!zQ^C1w?^ArGSdER@T5za(ip($MIgP;b(9*t}85>76i8)n+x8G_u Date: Thu, 12 Feb 2026 14:00:54 +0100 Subject: [PATCH 28/34] Add .DS_Store to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index dc94e0b..a65cabe 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ dist/ build/ .venv *.log +.DS_Store From 3c706f31c31c939adf571541fb963846b0e82daa Mon Sep 17 00:00:00 2001 From: Carlo Fuselli Date: Thu, 12 Feb 2026 14:02:07 +0100 Subject: [PATCH 29/34] Fix critical bugs, add tests, and documentation for SQLite backend (#177) * Fix critical bugs: correct sqlite_path references and complete find_one() logic - Fix AttributeError: replace sqlite_config.sqlite_path with rundb_sqlite_path in rundb.py and mongo_storage.py (3 locations) - Fix NameError in OfflineSQLiteCollection.find_one(): add proper _id handling and default query case - These bugs would cause immediate crashes when offline mode is activated Fixes ensure offline SQLite backend actually works when both rundb and xedocs SQLite files are present. * Apply black formatting to sqlite_backend.py * Add comprehensive tests for offline SQLite backend Tests cover: - SQLiteConfig loading and activation logic - OfflineGridFS file operations (list, download) - OfflineSQLiteCollection queries (find_one, find, count_documents) - xent_collection() fallback behavior (SQLite vs MongoDB) - Edge cases and error handling All 13 tests pass successfully. 
* Add comprehensive documentation for offline SQLite backend - Added detailed section in README.md covering setup, usage, and troubleshooting - Documented environment variables and their purposes - Added examples for generating SQLite dumps and using offline mode - Included limitations and performance considerations - Enhanced module docstring in sqlite_backend.py with usage examples * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix pre-commit issues: syntax error and flake8 violations * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix flake8 issues in test file: line length and unused variables * Fix mypy type errors and flake8 whitespace issues * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix pre-existing flake8 and mypy issues in mongo_to_sqlite.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add noqa comments for unavoidable black/flake8 conflicts * Move noqa comment to correct line in test file * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Rewrite SQL as single-line strings to avoid black/flake8 conflict * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Allow long first line in SQL query with noqa * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove accidentally committed .DS_Store file * Add .DS_Store to gitignore --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .gitignore | 1 + README.md | 137 +++++++++++++ tests/test_offline_sqlite.py | 375 +++++++++++++++++++++++++++++++++++ utilix/mongo_storage.py | 10 +- utilix/mongo_to_sqlite.py | 25 +-- utilix/rundb.py | 3 +- utilix/sqlite_backend.py | 60 +++++- 7 files changed, 589 insertions(+), 22 deletions(-) create mode 100644 tests/test_offline_sqlite.py diff --git a/.gitignore b/.gitignore index dc94e0b..a65cabe 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ dist/ build/ .venv *.log +.DS_Store diff --git a/README.md b/README.md index c2c0274..1f0660d 100644 --- a/README.md +++ b/README.md @@ -175,6 +175,143 @@ If you need to use different databases or do not want to use the information lis >>> xe1t_coll, xe1t_db, xe1t_user, xe1t_pw, xe1t_url = [ask someone] >>> xe1t_collection = pymongo_collection(xe1t_coll, database=xe1t_coll, user=xe1t_user, password=xe1t_pw, url=xe1t_url) +## Offline SQLite Backend + +For scenarios where network connectivity to MongoDB is unavailable (e.g., working on compute nodes without internet access, traveling, or during database outages), `utilix` provides an **offline SQLite backend** that allows you to continue working with local copies of the database and GridFS files. + +### Overview + +The offline backend consists of two main components: +1. **OfflineSQLiteCollection**: A pymongo-compatible wrapper around SQLite databases containing BSON-compressed documents +2. **OfflineGridFS**: A GridFS-compatible interface for accessing cached files + +When properly configured, utilix will automatically use the SQLite backend when both database files are available, and seamlessly fall back to MongoDB when they're not. + +### Setup + +#### 1. Generate SQLite Database Files + +First, you need to create SQLite dumps of the MongoDB collections you need. 
The `mongo_to_sqlite.py` script (included in utilix) handles this: + +```bash +# Create a spec file listing what to dump +cat > dump_spec.txt << EOF +xenonnt:runs +files:GRIDFS +xedocs:ALL +corrections:ALL +EOF + +# Run the dump (requires MongoDB access) +python -m utilix.mongo_to_sqlite \ + --spec dump_spec.txt \ + --rundb-out /path/to/rundb.sqlite \ + --xedocs-out /path/to/xedocs.sqlite +``` + +This will create two SQLite files: +- `rundb.sqlite`: Contains runs collection, GridFS file index, and file blobs +- `xedocs.sqlite`: Contains corrections and other xedocs collections + +**Note**: The dump process can take significant time depending on data size. Plan accordingly. + +#### 2. Configure Environment Variables + +Set the following environment variables to enable offline mode: + +```bash +export RUNDB_SQLITE_PATH="/path/to/rundb.sqlite" +export XEDOCS_SQLITE_PATH="/path/to/xedocs.sqlite" + +# Optional: Set compression algorithm (default: zstd) +export OFFLINE_COMP="zstd" # or "zlib" + +# Optional: Enable debug logging +export OFFLINE_DEBUG="1" +``` + +**Important**: Both SQLite files must exist and be accessible for offline mode to activate. If either is missing, utilix will fall back to MongoDB automatically. + +#### 3. Use Normally + +Once configured, your existing code works without modification: + +```python +from utilix import xent_collection + +# Automatically uses SQLite if files are present, MongoDB otherwise +runs = xent_collection("runs") +doc = runs.find_one({"number": 12345}) + +# GridFS downloads also work offline +from utilix.mongo_storage import MongoDownloader +downloader = MongoDownloader() +path = downloader.download_single("my_config") +``` + +### Features and Limitations + +#### Supported Operations +- ✅ `find_one()` with `_id`, `number`, or no filter +- ✅ `find()` with basic filters +- ✅ `count_documents()` +- ✅ GridFS file listing and downloads +- ✅ Automatic MD5-based file caching +- ✅ BSON compression (zstd or zlib) + +#### Limitations +- ⚠️ Complex queries (aggregations, regex, etc.) may not work +- ⚠️ Cursor operations like `sort()` without `limit()` will raise errors to prevent loading entire collections +- ⚠️ Write operations are not supported (read-only) +- ⚠️ The offline database is a snapshot; it won't reflect new data added to MongoDB + +### Performance Considerations + +- SQLite databases are compressed with zstd (or zlib as fallback), typically achieving 5-10x compression +- First-time file access requires decompression; subsequent accesses benefit from OS caching +- For large result sets, queries may be slower than MongoDB due to BSON decompression overhead +- GridFS files are cached by MD5 hash to avoid re-downloading + +### Updating Your Offline Database + +The SQLite files are static snapshots. To refresh them with new data: + +```bash +# Re-run the dump script +python -m utilix.mongo_to_sqlite \ + --spec dump_spec.txt \ + --rundb-out /path/to/rundb.sqlite \ + --xedocs-out /path/to/xedocs.sqlite \ + --overwrite # Add this flag to replace existing files +``` + +### Troubleshooting + +**Problem**: `AttributeError: 'SQLiteConfig' object has no attribute 'sqlite_path'` +- **Solution**: Update to the latest version of utilix. This was a bug in early versions. 
+ +**Problem**: Offline mode not activating +- **Check**: Both environment variables are set: `echo $RUNDB_SQLITE_PATH $XEDOCS_SQLITE_PATH` +- **Check**: Both files exist: `ls -lh $RUNDB_SQLITE_PATH $XEDOCS_SQLITE_PATH` +- **Check**: Set `OFFLINE_DEBUG=1` to see debug messages + +**Problem**: `KeyError: Config 'xyz' not found in offline gridfs_files index` +- **Solution**: The file wasn't included in the dump. Re-dump with the file added to your spec. + +**Problem**: Queries return different results than MongoDB +- **Note**: This shouldn't happen for supported queries. Please report as a bug with example code. + +### Environment Variable Reference + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `RUNDB_SQLITE_PATH` | Yes | - | Path to RunDB SQLite file | +| `XEDOCS_SQLITE_PATH` | Yes | - | Path to xedocs SQLite file | +| `OFFLINE_COMP` | No | `zstd` | Compression algorithm (`zstd` or `zlib`) | +| `OFFLINE_DEBUG` | No | `0` | Enable debug logging (`1` or `0`) | +| `OFFLINE_HARD` | No | `0` | Raise errors instead of warnings on unsupported ops | +| `PYMONGO_SPY` | No | `0` | Log when pymongo.MongoClient is created (for debugging) | + ## Data processing requests You may find yourself missing some data which requires a large amount of resources to process. In these cases, you can submit a processing request to the computing team. diff --git a/tests/test_offline_sqlite.py b/tests/test_offline_sqlite.py new file mode 100644 index 0000000..f4f568b --- /dev/null +++ b/tests/test_offline_sqlite.py @@ -0,0 +1,375 @@ +"""Tests for SQLite offline backend functionality.""" + +import os +import sqlite3 +import tempfile +import unittest +from pathlib import Path +from unittest.mock import patch, MagicMock + +from bson import BSON + + +class TestSQLiteConfig(unittest.TestCase): + """Test SQLiteConfig dataclass and configuration loading.""" + + def test_load_config_from_env(self): + """Test loading SQLite config from environment variables.""" + from utilix.sqlite_backend import _load_sqlite_config + + with tempfile.TemporaryDirectory() as tmpdir: + rundb_path = Path(tmpdir) / "rundb.sqlite" + xedocs_path = Path(tmpdir) / "xedocs.sqlite" + + # Create empty files + rundb_path.touch() + xedocs_path.touch() + + with patch.dict( + os.environ, + { + "RUNDB_SQLITE_PATH": str(rundb_path), + "XEDOCS_SQLITE_PATH": str(xedocs_path), + }, + ): + cfg = _load_sqlite_config() + + self.assertIsNotNone(cfg.rundb_sqlite_path) + self.assertIsNotNone(cfg.xedocs_sqlite_path) + # Use resolve() to handle symlinks (e.g., /var -> /private/var) + self.assertEqual(cfg.rundb_sqlite_path.resolve(), rundb_path.resolve()) + self.assertEqual(cfg.xedocs_sqlite_path.resolve(), xedocs_path.resolve()) + self.assertTrue(cfg.rundb_active()) + self.assertTrue(cfg.xedocs_active()) + self.assertTrue(cfg.sqlite_active()) + + def test_sqlite_active_requires_both_files(self): + """Test that sqlite_active() requires both files to exist.""" + from utilix.sqlite_backend import _load_sqlite_config + + with tempfile.TemporaryDirectory() as tmpdir: + rundb_path = Path(tmpdir) / "rundb.sqlite" + xedocs_path = Path(tmpdir) / "xedocs.sqlite" + + # Only create rundb file + rundb_path.touch() + + with patch.dict( + os.environ, + { + "RUNDB_SQLITE_PATH": str(rundb_path), + "XEDOCS_SQLITE_PATH": str(xedocs_path), + }, + ): + cfg = _load_sqlite_config() + + self.assertTrue(cfg.rundb_active()) + self.assertFalse(cfg.xedocs_active()) + self.assertFalse(cfg.sqlite_active()) # Requires BOTH + + def 
test_sqlite_active_false_when_no_env_vars(self): + """Test that sqlite_active() is False without environment variables.""" + from utilix.sqlite_backend import _load_sqlite_config + + with patch.dict(os.environ, {}, clear=True): + # Remove RUNDB_SQLITE_PATH and XEDOCS_SQLITE_PATH if present + os.environ.pop("RUNDB_SQLITE_PATH", None) + os.environ.pop("XEDOCS_SQLITE_PATH", None) + + cfg = _load_sqlite_config() + + self.assertFalse(cfg.rundb_active()) + self.assertFalse(cfg.xedocs_active()) + self.assertFalse(cfg.sqlite_active()) + + +class TestOfflineGridFS(unittest.TestCase): + """Test OfflineGridFS for file operations.""" + + def setUp(self): + """Create temporary directory and mock SQLite database.""" + self.tmpdir = tempfile.TemporaryDirectory() + self.tmppath = Path(self.tmpdir.name) + + # Create mock SQLite database with gridfs_files table + self.db_path = self.tmppath / "rundb.sqlite" + self.blob_path = self.tmppath / "test_blob.txt" + + # Write test blob + self.blob_path.write_text("test content") + + # Create database with gridfs_files table + conn = sqlite3.connect(str(self.db_path)) + conn.execute( + """CREATE TABLE gridfs_files ( db_name TEXT, file_id TEXT, config_name TEXT, md5 TEXT, + length INTEGER, uploadDate INTEGER, + + blob_path TEXT ) + + """ + ) + conn.execute( + "INSERT INTO gridfs_files " + "(db_name, file_id, config_name, md5, length, uploadDate, blob_path) " + "VALUES (?, ?, ?, ?, ?, ?, ?)", + ( + "files", + "test_id", + "test_config", + "abc123", + 12, + 1234567890, + "test_blob.txt", + ), + ) + conn.commit() + conn.close() + + def tearDown(self): + """Clean up temporary directory.""" + self.tmpdir.cleanup() + + def test_offline_gridfs_list_files(self): + """Test listing files from offline GridFS.""" + from utilix.sqlite_backend import OfflineGridFS + + gfs = OfflineGridFS( + sqlite_path=self.db_path, + offline_root=self.tmppath, + cache_dirs=(self.tmppath / "cache",), + ) + + files = gfs.list_files() + self.assertIn("test_config", files) + gfs.close() + + def test_offline_gridfs_download_single(self): + """Test downloading a single file from offline GridFS.""" + from utilix.sqlite_backend import OfflineGridFS + + cache_dir = self.tmppath / "cache" + gfs = OfflineGridFS( + sqlite_path=self.db_path, + offline_root=self.tmppath, + cache_dirs=(cache_dir,), + ) + + # Download file + result_path = gfs.download_single("test_config") + + # Should be cached by md5 + self.assertTrue(Path(result_path).exists()) + self.assertIn("abc123", result_path) # md5 in filename + + gfs.close() + + def test_offline_gridfs_missing_config_raises(self): + """Test that missing config raises KeyError.""" + from utilix.sqlite_backend import OfflineGridFS + + gfs = OfflineGridFS( + sqlite_path=self.db_path, + offline_root=self.tmppath, + cache_dirs=(self.tmppath / "cache",), + ) + + with self.assertRaises(KeyError): + gfs.download_single("nonexistent_config") + + gfs.close() + + +class TestOfflineSQLiteCollection(unittest.TestCase): + """Test OfflineSQLiteCollection for database queries.""" + + def setUp(self): + """Create temporary SQLite database with test data.""" + self.tmpdir = tempfile.TemporaryDirectory() + self.db_path = Path(self.tmpdir.name) / "rundb.sqlite" + + # Create database with kv_collections and runs_index tables + conn = sqlite3.connect(str(self.db_path)) + conn.execute( + """CREATE TABLE kv_collections ( db_name TEXT, coll_name TEXT, doc_id TEXT, + + doc_bson_z BLOB ) + + """ + ) + conn.execute( + """CREATE TABLE runs_index ( db_name TEXT, number INTEGER, + + doc_id TEXT ) + 
+ """ + ) + + # Insert test document + import zlib + + test_doc = {"_id": "test_id_123", "number": 12345, "name": "test_run"} + bson_data = BSON.encode(test_doc) + compressed = zlib.compress(bson_data, level=6) + + conn.execute( + "INSERT INTO kv_collections " + "(db_name, coll_name, doc_id, doc_bson_z) VALUES (?, ?, ?, ?)", + ("xenonnt", "runs", "test_id_123", compressed), + ) + + conn.execute( + "INSERT INTO runs_index (db_name, number, doc_id) VALUES (?, ?, ?)", + ("xenonnt", 12345, "test_id_123"), + ) + + conn.commit() + conn.close() + + def tearDown(self): + """Clean up temporary directory.""" + self.tmpdir.cleanup() + + def test_find_one_by_id(self): + """Test find_one with _id filter.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + doc = coll.find_one({"_id": "test_id_123"}) + self.assertIsNotNone(doc) + self.assertEqual(doc["_id"], "test_id_123") + self.assertEqual(doc["number"], 12345) + + coll.close() + + def test_find_one_by_number(self): + """Test find_one with number filter for runs collection.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + doc = coll.find_one({"number": 12345}) + self.assertIsNotNone(doc) + self.assertEqual(doc["number"], 12345) + self.assertEqual(doc["_id"], "test_id_123") + + coll.close() + + def test_find_one_default_returns_first_doc(self): + """Test find_one without filter returns first document.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + doc = coll.find_one() + self.assertIsNotNone(doc) + self.assertEqual(doc["_id"], "test_id_123") + + coll.close() + + def test_count_documents(self): + """Test count_documents method.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + count = coll.count_documents({}) + self.assertEqual(count, 1) + + count = coll.count_documents({"number": 12345}) + self.assertEqual(count, 1) + + count = coll.count_documents({"number": 99999}) + self.assertEqual(count, 0) + + coll.close() + + def test_find_returns_cursor(self): + """Test find method returns iterable cursor.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + + coll = OfflineSQLiteCollection( + sqlite_path=self.db_path, + db_name="xenonnt", + coll_name="runs", + compression="zlib", + ) + + cursor = coll.find({"number": 12345}) + docs = list(cursor) + + self.assertEqual(len(docs), 1) + self.assertEqual(docs[0]["number"], 12345) + + coll.close() + + +class TestXentCollectionOffline(unittest.TestCase): + """Test xent_collection() function with offline mode.""" + + def test_xent_collection_uses_sqlite_when_active(self): + """Test that xent_collection uses SQLite when offline is active.""" + from utilix.sqlite_backend import OfflineSQLiteCollection + from utilix.rundb import xent_collection + + with tempfile.TemporaryDirectory() as tmpdir: + rundb_path = Path(tmpdir) / "rundb.sqlite" + xedocs_path = Path(tmpdir) / "xedocs.sqlite" + rundb_path.touch() + xedocs_path.touch() + + with patch.dict( + os.environ, + { + "RUNDB_SQLITE_PATH": str(rundb_path), + "XEDOCS_SQLITE_PATH": 
str(xedocs_path), + }, + ): + with patch("utilix.rundb.uconfig") as mock_config: + mock_config.get.return_value = "xenonnt" + + coll = xent_collection("runs") + + # Should return OfflineSQLiteCollection when offline is active + self.assertIsInstance(coll, OfflineSQLiteCollection) + coll.close() + + def test_xent_collection_uses_mongodb_when_offline_inactive(self): + """Test that xent_collection uses MongoDB when offline is not active.""" + from utilix.rundb import xent_collection + + with patch.dict(os.environ, {}, clear=True): + os.environ.pop("RUNDB_SQLITE_PATH", None) + os.environ.pop("XEDOCS_SQLITE_PATH", None) + + with patch("utilix.rundb._collection") as mock_collection: + mock_collection.return_value = MagicMock() + + _result = xent_collection("runs") # noqa: F841 + + # Should call _collection (MongoDB) when offline is not active + mock_collection.assert_called_once() + + +if __name__ == "__main__": + unittest.main() diff --git a/utilix/mongo_storage.py b/utilix/mongo_storage.py index 45e2f6c..f8301c0 100644 --- a/utilix/mongo_storage.py +++ b/utilix/mongo_storage.py @@ -327,8 +327,11 @@ def initialize(self, store_files_at=None, *args, **kwargs): sqlite_active = False if sqlite_active: + assert sqlite_cfg is not None # for mypy + assert sqlite_cfg.rundb_sqlite_path is not None # for mypy + assert sqlite_cfg.offline_root is not None # for mypy self._offline = OfflineGridFS( - sqlite_path=sqlite_cfg.sqlite_path, + sqlite_path=sqlite_cfg.rundb_sqlite_path, offline_root=sqlite_cfg.offline_root, cache_dirs=tuple(self.storage_options), gridfs_db_name="files", @@ -574,8 +577,11 @@ def initialize( sqlite_active = False if sqlite_active: + assert sqlite_cfg is not None # for mypy + assert sqlite_cfg.rundb_sqlite_path is not None # for mypy + assert sqlite_cfg.offline_root is not None # for mypy self._offline = OfflineGridFS( - sqlite_path=sqlite_cfg.sqlite_path, + sqlite_path=sqlite_cfg.rundb_sqlite_path, offline_root=sqlite_cfg.offline_root, cache_dirs=tuple(self.storage_options), gridfs_db_name="files", diff --git a/utilix/mongo_to_sqlite.py b/utilix/mongo_to_sqlite.py index 3d70687..6aa8b1e 100644 --- a/utilix/mongo_to_sqlite.py +++ b/utilix/mongo_to_sqlite.py @@ -28,7 +28,6 @@ from bson import BSON from bson.objectid import ObjectId - # ------------------------- # Compression helpers # ------------------------- @@ -240,12 +239,12 @@ def q(name: str) -> str: # - time interval lookup: version + interval # - common labels (if present) index_sql = [ - f"CREATE INDEX IF NOT EXISTS \ - {q('idx_' + table + '_version_time')} \ - ON {q(table)}({q('version')}, {q('time_ns')});", - f"CREATE INDEX IF NOT EXISTS \ - {q('idx_' + table + '_version_interval')} \ - ON {q(table)}({q('version')}, {q('time_left_ns')}, {q('time_right_ns')});", + f"CREATE INDEX IF NOT EXISTS " + f"{q('idx_' + table + '_version_time')} " + f"ON {q(table)}({q('version')}, {q('time_ns')});", + f"CREATE INDEX IF NOT EXISTS " + f"{q('idx_' + table + '_version_interval')} " + f"ON {q(table)}({q('version')}, {q('time_left_ns')}, {q('time_right_ns')});", ] # Optional label indexes (keep this small to avoid DB bloat) @@ -787,9 +786,11 @@ def q(name: str) -> str: return '"' + name.replace('"', '""') + '"' placeholders = ",".join(["?"] * len(all_cols)) - ins = f"INSERT OR REPLACE INTO \ - {q(table)}({','.join(q(c) for c in all_cols)}) \ - VALUES ({placeholders})" + ins = ( + f"INSERT OR REPLACE INTO " + f"{q(table)}({','.join(q(c) for c in all_cols)}) " + f"VALUES ({placeholders})" + ) cur = coll.find({}, no_cursor_timeout=True, 
batch_size=batch_size) @@ -797,8 +798,8 @@ def q(name: str) -> str: buf: List[Tuple[Any, ...]] = [] for doc in cur: - e = _xedocs_extract(doc, label_cols=extra_cols) - row = tuple(e.get(c) for c in all_cols) + extracted = _xedocs_extract(doc, label_cols=extra_cols) + row = tuple(extracted.get(c) for c in all_cols) buf.append(row) n += 1 diff --git a/utilix/rundb.py b/utilix/rundb.py index 209bbf9..e223058 100644 --- a/utilix/rundb.py +++ b/utilix/rundb.py @@ -599,8 +599,9 @@ def _sqlite_collection( if database is None: database = uconfig.get("RunDB", f"{experiment}_database") + assert sqlite_config.rundb_sqlite_path is not None # for mypy return OfflineSQLiteCollection( - sqlite_path=sqlite_config.sqlite_path, + sqlite_path=sqlite_config.rundb_sqlite_path, db_name=database, coll_name=collection, compression=sqlite_config.compression, diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index cfbd119..64c9c20 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -1,5 +1,41 @@ from __future__ import annotations +"""SQLite offline backend for utilix. + +This module provides offline access to XENON RunDB and GridFS data using local +SQLite databases. It allows analysis to continue when MongoDB is unreachable. + +Usage: + 1. Generate SQLite files using mongo_to_sqlite.py (requires MongoDB access) + 2. Set environment variables: + export RUNDB_SQLITE_PATH="/path/to/rundb.sqlite" + export XEDOCS_SQLITE_PATH="/path/to/xedocs.sqlite" + 3. Use utilix normally - offline mode activates automatically + +Example: + >>> import os + >>> os.environ["RUNDB_SQLITE_PATH"] = "/data/rundb.sqlite" + >>> os.environ["XEDOCS_SQLITE_PATH"] = "/data/xedocs.sqlite" + >>> + >>> from utilix import xent_collection + >>> runs = xent_collection("runs") # Uses SQLite if files exist + >>> doc = runs.find_one({"number": 12345}) + +Environment Variables: + RUNDB_SQLITE_PATH: Path to RunDB SQLite file (required) + XEDOCS_SQLITE_PATH: Path to xedocs SQLite file (required) + OFFLINE_COMP: Compression algorithm, 'zstd' or 'zlib' (default: 'zstd') + OFFLINE_DEBUG: Enable debug logging, '1' or '0' (default: '0') + +Classes: + SQLiteConfig: Configuration dataclass for offline mode + OfflineGridFS: GridFS-compatible offline file access + OfflineSQLiteCollection: pymongo-compatible offline collection access + +Functions: + _load_sqlite_config: Load configuration from environment variables +""" + import os import sqlite3 import shutil @@ -148,7 +184,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC LIMIT 1"""\ - , + , # noqa: E502,E203 (self.gridfs_db_name, config_name), ).fetchone() @@ -320,14 +356,19 @@ def find_one(self, filter: dict | None = None, *args, **kwargs): """ Minimal behavior: - if filter contains _id, return that doc + - if filter contains 'number' (for runs collection), look it up - else return first doc (used as connectivity test) """ filter = filter or {} # _id special case if "_id" in filter: - ... 
+ try: + return self._get_by_id(str(filter["_id"])) + except KeyError: + return None + # Special case for runs collection with number filter if self._coll_name == "runs" and "number" in filter: number = int(filter["number"]) row = self._conn.execute( @@ -338,6 +379,12 @@ def find_one(self, filter: dict | None = None, *args, **kwargs): return None return self._get_by_id(row["doc_id"]) + # Default: return first doc (connectivity test) + row = self._conn.execute( + "SELECT doc_bson_z FROM kv_collections WHERE db_name=? AND coll_name=? LIMIT 1", + (self.db_name, self._coll_name), + ).fetchone() + if row is None: return None return self._decode_row(row) @@ -373,9 +420,8 @@ def count_documents(self, filter: dict | None = None, *args, **kwargs) -> int: if "_id" in filter: row = self._conn.execute( - "SELECT COUNT(*) AS n FROM kv_collections \ - WHERE db_name=? AND coll_name=? AND doc_id=?"\ - , + "SELECT COUNT(*) AS n FROM kv_collections " + "WHERE db_name=? AND coll_name=? AND doc_id=?", (self.db_name, self._coll_name, str(filter["_id"])), ).fetchone() return int(row["n"]) if row else 0 @@ -423,7 +469,7 @@ def sort(self, key, direction=1): return self def skip(self, n): - self._docs = self._docs[int(n) :] + self._docs = self._docs[int(n) :] # noqa: E203 return self def limit(self, n): @@ -506,7 +552,7 @@ def gen(): _orig_mc = pymongo.MongoClient -class MongoClientSpy(_orig_mc): +class MongoClientSpy(_orig_mc): # type: ignore[misc,valid-type] def __init__(self, *args, **kwargs): cfg = _load_sqlite_config() if cfg.spy: From eb77f744aad7a2332fe379cb4f7c035921a7c19b Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 14:08:45 +0100 Subject: [PATCH 30/34] try it --- utilix/sqlite_backend.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 64c9c20..d421906 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -183,8 +183,7 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC - LIMIT 1"""\ - , # noqa: E502,E203 + LIMIT 1""", # noqa: E502,E203 (self.gridfs_db_name, config_name), ).fetchone() From 60453eed9c5b9ef9167fc0cf5b63c51060bac8e3 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 14:11:12 +0100 Subject: [PATCH 31/34] try it --- utilix/sqlite_backend.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index d421906..ae782a4 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -177,15 +177,15 @@ def _pick_cache_dir(self) -> Path: # sqlite queries # ----------------- def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: - row = self.conn.execute( - """ - SELECT db_name, file_id, config_name, md5, length, uploadDate, blob_path - FROM gridfs_files - WHERE db_name = ? AND config_name = ? - ORDER BY uploadDate DESC - LIMIT 1""", # noqa: E502,E203 - (self.gridfs_db_name, config_name), - ).fetchone() + row = self.conn.execute( + """ + SELECT db_name, file_id, config_name, md5, length, uploadDate, blob_path + FROM gridfs_files + WHERE db_name = ? AND config_name = ? 
+ ORDER BY uploadDate DESC + LIMIT 1""", + (self.gridfs_db_name, config_name), + ).fetchone() if row is None: return None From e30baa9fe1df41fa691666f6a4d013e5e7373e2d Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 14:14:51 +0100 Subject: [PATCH 32/34] Resolve merge conflict in sqlite_backend.py --- utilix/sqlite_backend.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index d6b7fa7..c187cf8 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -177,28 +177,15 @@ def _pick_cache_dir(self) -> Path: # sqlite queries # ----------------- def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: -<<<<<<< HEAD - row = self.conn.execute( - """ - SELECT db_name, file_id, config_name, md5, length, uploadDate, blob_path - FROM gridfs_files - WHERE db_name = ? AND config_name = ? - ORDER BY uploadDate DESC - LIMIT 1""", - (self.gridfs_db_name, config_name), - ).fetchone() -======= row = self.conn.execute( """ SELECT db_name, file_id, config_name, md5, length, uploadDate, blob_path FROM gridfs_files WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC - LIMIT 1"""\ - , # noqa: E502,E203 + LIMIT 1""", (self.gridfs_db_name, config_name), ).fetchone() ->>>>>>> 4f06c10a1c32706945a4021fca563bffdfb55344 if row is None: return None From 519c0a248a6760bdb53ac8f316b2b6d4e50a4b34 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 13:15:06 +0000 Subject: [PATCH 33/34] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utilix/sqlite_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index c187cf8..87fe216 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -183,7 +183,8 @@ def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: FROM gridfs_files WHERE db_name = ? AND config_name = ? ORDER BY uploadDate DESC - LIMIT 1""", + LIMIT 1"""\ + , (self.gridfs_db_name, config_name), ).fetchone() From ed661c7dfce02272fad2dec46c653d17ab812ec3 Mon Sep 17 00:00:00 2001 From: cfuselli Date: Thu, 12 Feb 2026 14:16:53 +0100 Subject: [PATCH 34/34] Rewrite SQL query to single-line string to fix black/docformatter conflict --- utilix/sqlite_backend.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/utilix/sqlite_backend.py b/utilix/sqlite_backend.py index 87fe216..2524644 100644 --- a/utilix/sqlite_backend.py +++ b/utilix/sqlite_backend.py @@ -178,13 +178,9 @@ def _pick_cache_dir(self) -> Path: # ----------------- def latest_by_config_name(self, config_name: str) -> Optional[GridFSRow]: row = self.conn.execute( - """ - SELECT db_name, file_id, config_name, md5, length, uploadDate, blob_path - FROM gridfs_files - WHERE db_name = ? AND config_name = ? - ORDER BY uploadDate DESC - LIMIT 1"""\ - , + "SELECT db_name, file_id, config_name, md5, length, uploadDate, blob_path " # noqa: E501 + "FROM gridfs_files WHERE db_name = ? AND config_name = ? " + "ORDER BY uploadDate DESC LIMIT 1", (self.gridfs_db_name, config_name), ).fetchone()