diff --git a/.gitignore b/.gitignore index 33c1d12..b62ee78 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,9 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +# Data Folder +data/* +# TODO: Remove when you need the cut background file anymore +# Raw data folder excluding only the background cut file +data/raw/ \ No newline at end of file diff --git a/README.md b/README.md index 58ff8bc..172e332 100644 --- a/README.md +++ b/README.md @@ -1,101 +1,217 @@ -# cosiflow +# Cosiflow -The COSI SDOC pipeline based on Apache Airflow +Cosiflow provides an Airflow-based orchestration environment for managing and monitoring scientific pipelines for COSI. -## Build the cosiflow docker +--- -We assume that the cosiflow repository is in your $HOME directory. +### 1. REQUIREMENTS -```bash -cd $HOME/cosiflow/env -``` +#### PREPARE THE ENVIRONMENT FILE -Mac: +1. Copy and rename the `.env.example` file as `.env`: + ```bash + cd env + cp .env.example .env + ``` -```bash -docker build --platform linux/arm64 -t airflow:1.0.0 -f Dockerfile . -``` +2. Open the `.env` file and set a secure password for Airflow: + ```bash + AIRFLOW_ADMIN_PASSWORD= + ``` -Linux: +3. Manually bootstrap user and group IDs for the container: -```bash -docker build -t airflow:1.1.0 -f Dockerfile . -``` + ```bash + id -u + # Copy the output as YOUR_USER_ID -## Execute the docker compose to start containers + id -g + # Copy the output as YOUR_GROUP_ID + ``` -```bash -docker compose up -d -``` +4. Open `.env` and modify the following environment variables: -If you want to enter into the postgre docker container: `docker compose exec postgres bash` + ```bash + UID= + GID= + ``` -If you want to enter into the postgre docker container: `docker compose exec airflow bash` +#### PREPARE THE DOCKERFILE -## Connect to the web server using a browser +Open the `Dockerfile.airflow` file and paste the same values: -localhost:8080 +```dockerfile +ARG UID= +ARG GID= +``` + +#### PREPARE THE FOLDER FOR STORING POSTGRESS DATA +```bash +cd .. +mkdir -p data/postgres_data +``` -Note: if you use a remote server you can change the `docker-compose.yaml` file to use another port. +--- -For example: - - ```yaml - ports: - - "28080:8080" - ``` +### 2. BUILD THE COMPOSE -then from your local pc you can forward the port in this way: +Build all containers defined in `docker-compose.yml`: ```bash -ssh -N -L 28080:localhost:28080 [user]@[remote machine] +cd env +docker compose build ``` -and open the airflow webpace from your local pc at `localhost:28080` +⏱ Estimated build time: **~490 seconds** + +--- -Login with username: `admin` password: `` +### 3. RUN THE CONTAINER -To obtain the password `` execute this command after the initialization of the containers +To run with logs visible: ```bash -docker compose logs | grep pass +docker compose up ``` -### Shutdown the dockers +To run in detached mode (no logs): ```bash -docker compose down -v +docker compose up -d ``` -## Test the cosipy DAG +--- + +### 4. ENTER THE CONTAINER -Enter in the docker airflow +To open a terminal inside the running Airflow container: ```bash docker compose exec airflow bash ``` -First download the data file from wasabi. +--- -```bash -cd /shared_dir/pipeline -source activate cosipy -python initialize_pipeline.py -``` +### 5. CONNECT TO THE AIRFLOW WEB UI + +1. 
Open your web browser and go to: -This script downloads the input file from wasabi and move it in `/home/gamma/workspace/data` + [http://localhost:8080/home](http://localhost:8080/home) -Now we must activate the DAG named `"cosipt_test_v0"` from the airflow website +2. Insert the user credentials: + ```text + user: admin + password: + ``` -Then we have to copy the file in the input directory to trigger the DAG +--- + +### 6. STOP THE CONTAINER + +To stop and remove all running containers, networks, and volumes: ```bash -cd /home/gamma/workspace/data -cp GalacticScan.inc1.id1.crab2hr.extracted.tra.gz input +docker compose down -v ``` -We should see that the DAG started to process the data. +--- + +### 7. CONFIGURATIONS + +Below is the list of environment variables defined in `.env` with their purpose: + +| Variable | Description | +|-----------|--------------| +| **UID** | User ID for container bootstrap | +| **GID** | Group ID for container bootstrap | +| **DISPLAY** | Display variable for X11 forwarding (optional) | +| **AIRFLOW_ADMIN_USERNAME** | Default Airflow Web UI username | +| **AIRFLOW_ADMIN_EMAIL** | Email associated with Airflow admin user | +| **AIRFLOW_ADMIN_PASSWORD** | Secure password for Airflow Web UI | +| **ALERT_USERS_LIST_PATH** | Path to YAML file containing user alert configurations | +| **ALERT_SMTP_SERVER** | SMTP server used for alert notifications | +| **ALERT_SMTP_PORT** | Port of the SMTP server | +| **ALERT_EMAIL_SENDER** | Email address used as sender for system alerts | +| **ALERT_LOG_PATH** | Path to Airflow log file monitored by alert system | +| **AIRFLOW__SMTP__SMTP_STARTTLS** | Enables/disables STARTTLS (default: False) | +| **AIRFLOW__SMTP__SMTP_SSL** | Enables/disables SMTP over SSL (default: False) | +| **MAILHOG_WEBUI_URL** | URL for MailHog web interface (for testing alerts) | +| **COSI_DATA_DIR** | Root directory for COSI data | +| **COSI_INPUT_DIR** | Directory for COSI input data | +| **COSI_LOG_DIR** | Directory for COSI log files | +| **COSI_OBS_DIR** | Directory for observation data | +| **COSI_TRANSIENT_DIR** | Directory for transient event data | +| **COSI_TRIGGER_DIR** | Directory for trigger event data | +| **COSI_MAPS_DIR** | Directory for map data products | +| **COSI_SOURCE_DIR** | Directory for source-level data products | + +--- + +### NOTES + +- Make sure `.env` and `docker-compose.yml` are located in the same directory. +- Do **not** commit your personal `.env` file to version control. +- To inspect container logs, use: + ```bash + docker compose logs -f airflow + ``` + +--- + +**Cosiflow environment ready for use.** + +--- + +## What is COSIDAG + +A **COSIDAG** (COSI DAG) is a structured abstraction built on top of Apache Airflow DAGs. + +It provides a **standardized workflow layout** for scientific pipelines, reducing boilerplate and enforcing consistent patterns across different analyses. + +In particular, a COSIDAG: + +* defines a common execution skeleton (input resolution, optional monitoring, result handling) +* encapsulates best practices for: + + * file discovery + * parameter propagation + * XCom-based communication +* allows developers to focus only on **scientific tasks**, while orchestration logic is handled automatically + +COSIDAGs are used for all production scientific pipelines (e.g. Light Curve, TS Map), while standard DAGs are reserved for orchestration, testing, or utilities. + +**How to write and customize a COSIDAG** is explained in detail in the [tutorial section](modules/README.md). 
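As a concrete illustration of the contract above, the sketch below mirrors the pattern used by the example DAGs later in this diff (`cosidag_example.py`, `cosidag_helloworld.py`): a `build_custom(dag)` function attaches the scientific tasks, and the `COSIDAG` context manager wires the standard skeleton around them. The `dag_id`, monitored folder, and task name here are placeholders, not files of the repository.

```python
from datetime import datetime
import sys

sys.path.append("/home/gamma/airflow/modules")  # location of the COSIDAG module inside the container

from cosidag import COSIDAG
from airflow.operators.python import PythonOperator


def build_custom(dag):
    """Attach custom tasks to the COSIDAG skeleton."""

    def _process(ti):
        # 'detected_folder' is published via XCom by the COSIDAG framework
        folder = ti.xcom_pull(task_ids="check_new_file", key="detected_folder")
        print(f"Processing folder: {folder}")

    PythonOperator(task_id="my_science_task", python_callable=_process, dag=dag)


with COSIDAG(
    dag_id="cosidag_my_pipeline",                                   # placeholder DAG id
    start_date=datetime(2025, 1, 1),
    schedule_interval=None,
    catchup=False,
    monitoring_folders=["/home/gamma/workspace/data/my_pipeline"],  # placeholder folder
    build_custom=build_custom,
    tags=["cosidag", "example"],
) as dag:
    pass
```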
+ +--- + +## Tutorials and developer guide + +A complete, step-by-step guide on how to: + +* understand the COSIDAG execution model +* write new COSIDAGs +* add custom tasks +* use XCom correctly +* integrate external Python environments + +is available in: + +[tutorial section](tutorials/README.md). + +This is the **recommended starting point for developers**. + +--- + +## Available DAGs and COSIDAGs + +A complete and up-to-date list of all DAGs and COSIDAGs implemented in this repository — including: + +* workflow purpose +* inputs and outputs +* task structure +* operators used +* XCom usage -This directory `/home/gamma/workspace/heasarc/dl0` contains several folders with this format `2025-01-24_14-31-56`. +is documented in: [DAG and COSIDAG LIST README](dags/README.md) -Inside the folder we have the results of the analysis. +This document serves as the **catalog and reference** for all workflows available in Cosiflow. \ No newline at end of file diff --git a/callbacks/__init__.py b/callbacks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/callbacks/on_failure_callback.py b/callbacks/on_failure_callback.py new file mode 100644 index 0000000..3868b40 --- /dev/null +++ b/callbacks/on_failure_callback.py @@ -0,0 +1,82 @@ +import yaml +import os +from airflow.utils.email import send_email +import urllib.parse + +ALERT_CONFIG_PATH = os.getenv("ALERT_USERS_LIST_PATH", "/home/gamma/env/alert_users.yaml") + + +def load_alert_config(): + with open(ALERT_CONFIG_PATH, "r") as f: + return yaml.safe_load(f) + + +def get_recipients(keyword: str) -> list[str]: + config = load_alert_config() + matched_groups = set() + + for rule in config.get("rules", []): + if rule["pattern"] == keyword: + matched_groups.update(rule["notify"]) + + emails = set() + for group in matched_groups: + group_data = config["groups"].get(group) + if group_data: + emails.update(group_data.get("emails", [])) + return sorted(emails) + + + +def notify_email(context): + task = context["task_instance"] + dag_id = task.dag_id + task_id = task.task_id + run_id = task.run_id + execution_date = context.get("execution_date") + + # URL-encode i parametri per sicurezza + base_url = "http://localhost:8080" + query = urllib.parse.urlencode({ + "execution_date": execution_date.isoformat(), + "tab": "logs", + "dag_run_id": run_id, + "task_id": task_id + }) + log_url = f"{base_url}/dags/{dag_id}/grid?{query}" + + # Percorso log locale (personalizzabile) + log_path = f"/home/gamma/airflow/logs/dag_id={dag_id}/run_id={run_id}/task_id={task_id}/attempt=1.log" + if not os.path.exists(log_path): + log_preview = "⚠️ Log file not found." + else: + with open(log_path, "r") as f: + lines = f.readlines()[-30:] # Ultime 30 righe + log_preview = "".join(lines) + log_preview = log_preview.replace("<", "<").replace(">", ">") # Escaping HTML + + recipients = get_recipients("ALERT_FAIL") + if not recipients: + return # no recipients, skip + + subject = f"[ALERT] Task {task.task_id} in DAG {task.dag_id} has failed" + html_content = f""" + + +

+    <html>
+      <body>
+        <h2>⚠️ Task Failure Alert</h2>
+        <table>
+          <tr><td><b>DAG:</b></td><td>{dag_id}</td></tr>
+          <tr><td><b>Task:</b></td><td>{task_id}</td></tr>
+          <tr><td><b>Execution Time:</b></td><td>{execution_date}</td></tr>
+          <tr><td><b>Log URL:</b></td><td><a href="{log_url}">{log_url}</a></td></tr>
+        </table>
+        <h3>🔍 Log Preview</h3>
+        <pre>
+{log_preview}
+        </pre>
+        <p>Full log available at the link above.</p>
+      </body>
+    </html>

+ + + """ + + send_email(to=recipients, subject=subject, html_content=html_content) diff --git a/dags/README.md b/dags/README.md new file mode 100644 index 0000000..0de2a4a --- /dev/null +++ b/dags/README.md @@ -0,0 +1,563 @@ +# DAG & COSIDAG Catalog + +This document describes all **DAGs** and **COSIDAGs** available in this repository, including: + +* DAG title +* workflow type (DAG vs COSIDAG) +* purpose +* inputs / outputs +* number of tasks (and task layout) +* operator types +* XCom usage (inter-task communication) + +--- + +## COSIDAG framework (module) + +### File + +`cosidag.py` + +### What it is + +This is **not a DAG**. +It defines the **`COSIDAG`** convenience class used by multiple pipelines. + +### Standard COSIDAG layout + +A COSIDAG wires a common pattern: + +1. `check_new_file` *(optional)* +2. `automatic_retrig` *(optional)* +3. `resolve_inputs` *(optional)* +4. `[custom tasks]` +5. `show_results` *(always)* + +### Operators used internally + +* `PythonOperator` +* `EmptyOperator` +* `TriggerDagRunOperator` +* (also sensors / utilities internally, depending on configuration) + +### XCom contract (key design) + +* `detected_folder` is pushed by `check_new_file` (or by a fallback setter task when monitoring is disabled) +* resolved inputs are pushed by `resolve_inputs` using keys from `file_patterns` +* `show_results` reads `detected_folder` from XCom and prints a results URL (if configured) + +✅ This file is the **contract** that all COSIDAG-based pipelines rely on. + +--- + +# Entry-point DAGs + +## `init_pipelines` + +### File + +`cosipipe_simdata.py` (header indicates: `# dags/init_pipelines.py`) + +### Type + +**Standard DAG** (entry-point / initializer) + +### Workflow purpose + +Single entry point that: + +* prepares/stages raw inputs +* resolves configuration +* creates the run/products directory +* optionally performs a background cut +* creates symlinks to standardized locations + +This is the DAG you trigger from the Airflow UI to bootstrap a pipeline run. + +### Inputs + +Airflow UI params / configuration (conceptually): + +* destination selector (where to save products) +* paths for source/background/orientation/response (or folders from which they can be resolved) +* optional “date/selection policy” style filters + +### Outputs + +* standardized run folder (products directory) +* staged inputs (symlinked or copied, depending on logic) +* background cut output (if enabled) + +### Number of tasks + +**6 tasks** + +* `prepare_raw_dirs` +* `resolve_config` +* `stage_all_files` +* `create_products_dir` +* `background_cut` +* `create_symlinks` + +### Operators used + +* `PythonOperator` +* `ExternalPythonOperator` (used for scientific steps executed in the `cosipy` conda env) + +### XCom usage + +✅ **Yes** +Used to propagate resolved configuration and output paths between tasks. + +--- + +# Scientific COSIDAG pipelines + +## `cosidag_tsmap` + +### File + +`cosidag_tsmap.py` + +### Type + +**COSIDAG** + +### Workflow purpose + +TS Map computation pipeline (binned GRB + background → TS map products). + +### Inputs + +Resolved via COSIDAG `file_patterns` (search under detected folder): + +* `grb_file`: `GRB*_unbinned_*.fits*` +* `background_file`: `Total_BG*_unbinned_*.fits*` +* `orientation_file`: `*.ori` +* `response_file`: `Response*.h5` + +Plus COSIDAG parameters (monitoring folders, date queries, selection policy, etc.). 
+ +### Outputs + +* binned GRB file +* binned background file +* TS map products (standard and multi-resolution) +* results stored in the run/products directory used by COSIDAG + +### Number of tasks + +**~8 total** (COSIDAG base tasks + custom tasks) + +**Base COSIDAG tasks (created by framework):** + +* `check_new_file` +* `automatic_retrig` *(depends on config)* +* `resolve_inputs` +* `show_results` + +**Custom tasks in this pipeline:** + +* `bin_grb_source` +* `bin_background` +* `ts_map_computation` +* `ts_map_mulres_computation` + +### Operators used + +* `ExternalPythonOperator` (science steps) +* plus COSIDAG internal operators (see `cosidag.py`) + +### XCom usage + +✅ **Yes** + +* COSIDAG publishes resolved input paths (`resolve_inputs`) +* custom tasks read paths using templated `ti.xcom_pull(...)` (return_value pattern is used) + +--- + +## `cosidag_lcurve` + +### File + +`cosidag_lcurve.py` + +### Type + +**COSIDAG** + +### Workflow purpose + +Light Curve plotting pipeline (binned GRB + background → light curve products). + +### Inputs + +Resolved via COSIDAG `file_patterns`: + +* `grb_file`: `GRB*_unbinned_*.fits*` +* `background_file`: `Total_BG*_unbinned_*.fits*` +* `orientation_file`: `*.ori` +* `response_file`: `Response*.h5` + +### Outputs + +* binned GRB file +* binned background file +* light curve plot(s) / products saved into the pipeline output folder + +### Number of tasks + +**~7 total** (COSIDAG base tasks + custom tasks) + +**Base COSIDAG tasks:** + +* `check_new_file` +* `automatic_retrig` *(enabled in config in this file)* +* `resolve_inputs` +* `show_results` + +**Custom tasks in this pipeline:** + +* `bin_grb_source` +* `bin_background` +* `plot_lightcurve` + +### Operators used + +* `ExternalPythonOperator` +* plus COSIDAG internal operators + +### XCom usage + +✅ **Yes** +Uses XCom for: + +* detected folder +* resolved inputs +* passing file paths between binning and plotting + +--- + +# Tutorial / example COSIDAGs + +## `cosidag_example` + +### File + +`cosidag_example.py` + +### Type + +**COSIDAG (example)** + +### Workflow purpose + +Demonstrates how to attach a custom task to a COSIDAG and how to: + +* consume `detected_folder` via XCom +* search files inside the detected folder via helper `dag.find_file_by_pattern(...)` + +### Inputs + +* `monitoring_folders` points to a sample location (example uses `/home/gamma/workspace/data/tsmap`) +* COSIDAG detection parameters (depth, date queries, etc.) + +### Outputs + +* logs + demonstration of resolved file path (printed) +* depends on your custom implementation + +### Number of tasks + +**~5–6 total** (COSIDAG base tasks + 1 custom task) +Custom task: + +* `custom_process` + +### Operators used + +* `PythonOperator` +* plus COSIDAG internal operators + +### XCom usage + +✅ **Yes** (pulls `detected_folder`) + +--- + +## `cosidag_helloworld` + +### File + +`cosidag_helloworld.py` + +### Type + +**COSIDAG (minimal tutorial)** + +### Workflow purpose + +Minimal “hello world” COSIDAG showing: + +* how to define `build_custom(dag)` +* how to chain a single custom task +* how to run even with `monitoring_folders=None` (no folder detection) + +### Inputs + +None (demo-style). In this file `monitoring_folders=None` and `auto_retrig=False`. + +### Outputs + +Logs only. 
+ +### Number of tasks + +**~2–3 total** + +* COSIDAG “detected folder setter” task may exist when monitoring is disabled (implementation detail) +* custom task: `hello_world` + +### Operators used + +* `PythonOperator` +* `BashOperator` +* plus COSIDAG internal operators + +### XCom usage + +✅ **Yes** (used to keep the COSIDAG contract consistent, even in no-monitoring mode) + +--- + +## `cosidag_tutorial_a_svd` + +### File + +`cosidag_a.py` + +### Type + +**COSIDAG (tutorial A)** + +### Workflow purpose + +Tutorial A: + +* build a binary text matrix +* factorize via SVD +* save factor outputs and diagnostic plots + +### Inputs + +* demo parameters (TEXT, SIZE, FONT_SIZE, RANK) +* output base directory: `/home/gamma/workspace/data/tutorials/a_b_factor` + +### Outputs + +In `BASE_DIR`: + +* `factors.pkl` +* `factor_L.png` +* `factor_R.png` + (and any other artifacts generated by the tutorial) + +### Number of tasks + +**~3–4 total** + +* COSIDAG “set detected folder” step (since this tutorial doesn’t rely on monitoring) +* custom task: + + * `a_factorize_text_matrix` + +### Operators used + +* `PythonOperator` (setup / set folder) +* `ExternalPythonOperator` (SVD + plots in external env) + +### XCom usage + +✅ **Yes** +Uses XCom to propagate base folder / run folder into the external step. + +--- + +## `cosidag_tutorial_b_reconstruct` + +### File + +`cosidag_b.py` + +### Type + +**COSIDAG (tutorial B)** + +### Workflow purpose + +Tutorial B: + +* load SVD factors generated in tutorial A +* reconstruct the matrix (float + binarized) +* save reconstruction plots + +### Inputs + +From `BASE_DIR`: + +* `factors.pkl` + Parameters: +* `bin_thr` threshold + +### Outputs + +In `BASE_DIR`: + +* `reconstruction_float.png` +* `reconstruction_binary.png` + +### Number of tasks + +**~3–4 total** + +* COSIDAG “set detected folder” step (no monitoring) +* custom task: + + * `b_reconstruct_and_plot` + +### Operators used + +* `PythonOperator` +* `ExternalPythonOperator` + +### XCom usage + +✅ **Yes** +Folder/path handoff via XCom. + +--- + +# Test / utility DAGs + +## `dag_parallel_test_1` + +### File + +`dag_parallel_test_1.py` + +### Type + +**Standard DAG (test)** + +### Workflow purpose + +Simple parallelism test: two independent sleep tasks. + +### Inputs / Outputs + +None (logs only). + +### Number of tasks + +**2** + +* `sleep_a` +* `sleep_b` + +### Operators used + +* `BashOperator` + +### XCom usage + +❌ No + +--- + +## `dag_parallel_test_2` + +### File + +`dag_parallel_test_2.py` + +### Type + +**Standard DAG (test)** + +### Workflow purpose + +Second parallelism test DAG. + +### Inputs / Outputs + +None (logs only). + +### Number of tasks + +**2** + +* `sleep_c` +* `sleep_d` + +### Operators used + +* `BashOperator` + +### XCom usage + +❌ No + +--- + +## `dag_with_email_alert` + +### File + +`fail_task.py` + +### Type + +**Standard DAG (test / failure path)** + +### Workflow purpose + +Intentional failure DAG used to test: + +* failure handling +* alerting / callbacks (depending on Airflow configuration) + +### Inputs / Outputs + +None (it fails on purpose). 
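A minimal sketch of how such a failure-test task can be wired to the `notify_email` callback defined in `callbacks/on_failure_callback.py` earlier in this diff; the import path and error message are assumptions for illustration, not the exact contents of `fail_task.py`:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

# Assumed import path: the callbacks/ package must be importable from the Airflow environment
from callbacks.on_failure_callback import notify_email


def _always_fail():
    # Fails on purpose so that the on_failure_callback fires and an alert e-mail is sent
    raise RuntimeError("Intentional failure to exercise the alerting path")


with DAG(
    dag_id="dag_with_email_alert",
    start_date=datetime(2025, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:
    PythonOperator(
        task_id="failing_task",
        python_callable=_always_fail,
        on_failure_callback=notify_email,
    )
```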
+ +### Number of tasks + +**1** + +* `failing_task` + +### Operators used + +* `PythonOperator` + +### XCom usage + +❌ No + +--- + +# Summary table + +| DAG ID / Name | Type | Purpose (short) | Tasks (approx) | Operators (main) | XCom | +| -------------------------------- | ------- | --------------------------------------- | -------------- | --------------------------------------- | ---- | +| `init_pipelines` | DAG | stage/init run + background cut + links | 6 | PythonOperator, ExternalPythonOperator | Yes | +| `cosidag_tsmap` | COSIDAG | TS map products | ~8 | ExternalPythonOperator (+ COSIDAG core) | Yes | +| `cosidag_lcurve` | COSIDAG | light curve products | ~7 | ExternalPythonOperator (+ COSIDAG core) | Yes | +| `cosidag_example` | COSIDAG | example: detected_folder + file search | ~5–6 | PythonOperator (+ COSIDAG core) | Yes | +| `cosidag_helloworld` | COSIDAG | minimal tutorial | ~2–3 | PythonOperator, BashOperator | Yes | +| `cosidag_tutorial_a_svd` | COSIDAG | SVD factorization tutorial | ~3–4 | PythonOperator, ExternalPythonOperator | Yes | +| `cosidag_tutorial_b_reconstruct` | COSIDAG | reconstruction tutorial | ~3–4 | PythonOperator, ExternalPythonOperator | Yes | +| `dag_parallel_test_1` | DAG | parallelism test | 2 | BashOperator | No | +| `dag_parallel_test_2` | DAG | parallelism test | 2 | BashOperator | No | +| `dag_with_email_alert` | DAG | intentional failure / alert test | 1 | PythonOperator | No | diff --git a/dags/cosidag_a.py b/dags/cosidag_a.py new file mode 100644 index 0000000..fda1c39 --- /dev/null +++ b/dags/cosidag_a.py @@ -0,0 +1,194 @@ +# cosidag_tutorial_a.py +from datetime import datetime +import sys + +sys.path.append("/home/gamma/airflow/modules") + +from cosidag import COSIDAG +from airflow.operators.python import PythonOperator, ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +# Defaults for the demo +TEXT = "DAGs\n ARE\nCOOL!" +SIZE = [48, 48] +FONT_SIZE = 6 +RANK = 12 +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" + + +def build_custom(dag): + """ + Tutorial A: + build a binary text matrix, factorize via SVD, save A/B and plots. + """ + + # ------------------------------------------------- + # 0) Declare the result folder for COSIDAG + # ------------------------------------------------- + def _set_detected_folder(ti): + ti.xcom_push(key="detected_folder", value=BASE_DIR) + return BASE_DIR + + set_detected_folder = PythonOperator( + task_id="set_detected_folder", + python_callable=_set_detected_folder, + dag=dag, + ) + + # ------------------------------------------------- + # 1) Factorize text matrix (external env) + # ------------------------------------------------- + def _a_make_factors(base_dir: str, text: str, size: list, font_size: int, rank: int): + """Run entirely in the external 'cosipy' interpreter. + Robustly measure multiline text size across Pillow versions (no draw.textsize). 
+ """ + from pathlib import Path + import pickle + import numpy as np + import matplotlib + matplotlib.use("Agg") # safe non-interactive backend + import matplotlib.pyplot as plt + from PIL import Image, ImageDraw, ImageFont + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + pkl_path = base / "factors.pkl" + img_L = base / "factor_L.png" + img_R = base / "factor_R.png" + + W, H = int(size[0]), int(size[1]) + + # -- Load a mono font if available, otherwise default fallback + try: + font = ImageFont.truetype("DejaVuSansMono.ttf", font_size) + except Exception: + font = ImageFont.load_default() + + # -- Helper: robust multiline text bounding box across Pillow versions + def measure_multiline(draw: ImageDraw.ImageDraw, txt: str, font: ImageFont.ImageFont): + """Return (w, h) for multiline text. Tries modern APIs first, falls back gracefully.""" + if hasattr(draw, "multiline_textbbox"): + left, top, right, bottom = draw.multiline_textbbox((0, 0), txt, font=font, align="center") + return (right - left, bottom - top) + if hasattr(draw, "textbbox"): + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + if line == "": + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + widths.append(0) + heights.append(lh) + else: + l, t, r, b = draw.textbbox((0, 0), line, font=font) + widths.append(r - l) + heights.append(b - t) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + # Fallback + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + try: + w_line = draw.textlength(line, font=font) + except Exception: + w_line = max(1, int(len(line) * font.size * 0.6)) + widths.append(int(w_line)) + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + heights.append(lh) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + + # -- 1) Render text -> binary matrix (0 white, 1 black) + img = Image.new("L", (W, H), color=255) + draw = ImageDraw.Draw(img) + + w, h = measure_multiline(draw, text, font) + x = (W - w) // 2 + y = (H - h) // 2 + + if hasattr(draw, "multiline_text"): + draw.multiline_text((x, y), text, fill=0, font=font, align="center") + else: + lines = text.splitlines() or [text] + cur_y = y + for line in lines: + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + draw.text((x, cur_y), line, fill=0, font=font) + cur_y += lh + + arr = np.array(img) + X = (arr < 128).astype(float) # binary 0/1 as float + + # -- 2) SVD factorization: X ≈ (U_k sqrt(S)) (sqrt(S) V_k^T) + U, s, Vt = np.linalg.svd(X, full_matrices=False) + k = max(1, min(int(rank), len(s))) + Uk = U[:, :k] + Sk = np.diag(s[:k]) + Vk = Vt[:k, :] + Ssqrt = np.sqrt(Sk) + L = Uk @ Ssqrt + R = Ssqrt @ Vk + + # -- 3) Persist factors + with open(pkl_path, "wb") as f: + pickle.dump( + { + "L": L.astype("float32"), + "R": R.astype("float32"), + "meta": {"rank": int(k), "size": [W, H], "text": text}, + }, + f, + ) + + # -- 4) Visualize L and R (not binary) + def _plot_matrix(M, out_path, title): + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title(title) + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(out_path) + plt.close() + + _plot_matrix(L, img_L, f"L factor ({W}×{k})") + _plot_matrix(R, img_R, f"R factor ({k}×{H})") + + a_factorize = ExternalPythonOperator( + task_id="a_factorize_text_matrix", + 
python=EXTERNAL_PYTHON, + python_callable=_a_make_factors, # IDENTICA al tutorial originale + op_kwargs={ + "base_dir": BASE_DIR, + "text": TEXT, + "size": SIZE, + "font_size": FONT_SIZE, + "rank": RANK, + }, + dag=dag, + ) + + set_detected_folder >> a_factorize + + +with COSIDAG( + dag_id="cosidag_tutorial_a_svd", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=None, + auto_retrig=False, + build_custom=build_custom, + tags=["cosidag", "tutorial", "external-python"], +) as dag: + pass diff --git a/dags/cosidag_b.py b/dags/cosidag_b.py new file mode 100644 index 0000000..c141172 --- /dev/null +++ b/dags/cosidag_b.py @@ -0,0 +1,104 @@ +# cosidag_tutorial_b.py +from datetime import datetime +import sys + +sys.path.append("/home/gamma/airflow/modules") + +from cosidag import COSIDAG +from airflow.operators.python import PythonOperator, ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" +PKL_PATH = f"{BASE_DIR}/factors.pkl" +BIN_THR = 0.5 + + +def build_custom(dag): + """ + Tutorial B: + reconstruct matrix from A/B factors and produce plots. + """ + + # ------------------------------------------------- + # 0) Declare the result folder (same as A) + # ------------------------------------------------- + def _set_detected_folder(ti): + ti.xcom_push(key="detected_folder", value=BASE_DIR) + return BASE_DIR + + set_detected_folder = PythonOperator( + task_id="set_detected_folder", + python_callable=_set_detected_folder, + dag=dag, + ) + + # ------------------------------------------------- + # 1) Reconstruction + plots (external env) + # ------------------------------------------------- + def _b_reconstruct_and_plot(base_dir: str, pkl_path: str, bin_thr: float): + """Run in external interpreter. 
Load L,R -> M=L@R; save float & binarized reconstructions.""" + from pathlib import Path + import pickle + import numpy as np + import matplotlib.pyplot as plt + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + img_rec_float = base / "reconstruction_float.png" + img_rec_bin = base / "reconstruction_binary.png" + + with open(pkl_path, "rb") as f: + payload = pickle.load(f) + + L = np.asarray(payload["L"], dtype=float) # (32×k) + R = np.asarray(payload["R"], dtype=float) # (k×32) + + # 1) Reconstruct + M = L @ R + + # 2) Save float heatmap + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title("Reconstruction (float)") + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(img_rec_float) + plt.close() + + # 3) Save binarized heatmap (to match Alice's binary look) + M_bin = (M >= bin_thr).astype(int) + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M_bin, cmap="gray_r", interpolation="nearest") + plt.title(f"Reconstruction (binary, thr={bin_thr})") + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(img_rec_bin) + plt.close() + + b_reconstruct = ExternalPythonOperator( + task_id="b_reconstruct_and_plot", + python=EXTERNAL_PYTHON, + python_callable=_b_reconstruct_and_plot, # IDENTICA al tutorial originale + op_kwargs={ + "base_dir": BASE_DIR, + "pkl_path": PKL_PATH, + "bin_thr": BIN_THR, + }, + dag=dag, + ) + + set_detected_folder >> b_reconstruct + + +with COSIDAG( + dag_id="cosidag_tutorial_b_reconstruct", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=None, + auto_retrig=False, + build_custom=build_custom, + tags=["cosidag", "tutorial", "reconstruction", "external-python"], +) as dag: + pass diff --git a/dags/cosidag_example.py b/dags/cosidag_example.py new file mode 100644 index 0000000..bca593e --- /dev/null +++ b/dags/cosidag_example.py @@ -0,0 +1,41 @@ +from datetime import datetime +# add the path to the cosiflow module +import sys +sys.path.append("/home/gamma/airflow/modules") +from cosidag import COSIDAG +from airflow.operators.python import PythonOperator +# +def build_custom(dag): + # Example custom task consuming the detected folder via XCom + def _process_folder(folder_path: str): + # Do your science here + print(f"Processing folder: {folder_path}") + # search for the file by pattern + file_path = dag.find_file_by_pattern(r".*\.fits.*", folder_path) + print(f"Found file: {file_path}") +# + PythonOperator( + task_id="custom_process", + python_callable=lambda ti, **_: _process_folder( + ti.xcom_pull(task_ids="check_new_file", key="detected_folder") + ), + dag=dag, + ) +# +with COSIDAG( + dag_id="cosidag_example", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=["/home/gamma/workspace/data/tsmap"], + level=3, + date_queries=f"=={datetime.now().strftime("%Y%m%d")}", + build_custom=build_custom, + idle_seconds=5, + min_files=1, + ready_marker=None, + only_basename="products", + tags=["cosidag", "example"], + #ready_marker="_SUCCESS", +) as dag: + pass diff --git a/dags/cosidag_helloworld.py b/dags/cosidag_helloworld.py new file mode 100644 index 0000000..89d4645 --- /dev/null +++ b/dags/cosidag_helloworld.py @@ -0,0 +1,53 @@ +from datetime import datetime +from pydoc import describe +import sys + +# Add cosiflow modules path +sys.path.append("/home/gamma/airflow/modules") + +from cosidag import COSIDAG +from airflow.operators.bash import BashOperator +from airflow.operators.python import 
PythonOperator + + +def build_custom(dag): + """ + Minimal COSIDAG example. + Shows how to attach a single custom task to the COSIDAG lifecycle. + """ + + def _set_detected_folder(ti): + ti.xcom_push( + key="detected_folder", + value="/home/gamma/workspace/data/tutorials" + ) + return "/home/gamma/workspace/data/tutorials" + + set_detected_folder = PythonOperator( + task_id="set_detected_folder", + # Set the detected folder for the DAG, it is used by the COSIDAG to the folder for the show_results task. + # This task is not part of the COSIDAG workflow, it is used to set the detected folder for the DAG. + python_callable=_set_detected_folder, + dag=dag, + ) + + hello_world = BashOperator( + task_id="hello_world", + bash_command="echo 'Hello from COSIDAG' > /home/gamma/workspace/data/tutorials/hello_world.txt", + dag=dag, + ) + + set_detected_folder >> hello_world + +with COSIDAG( + dag_id="cosidag_helloworld", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=None, # dummy folder for demo + auto_retrig=False, + level=0, + build_custom=build_custom, + tags=["cosidag", "example", "helloworld", "tutorial"], +) as dag: + pass diff --git a/dags/cosidag_lcurve.py b/dags/cosidag_lcurve.py new file mode 100644 index 0000000..8bd4a25 --- /dev/null +++ b/dags/cosidag_lcurve.py @@ -0,0 +1,112 @@ +from datetime import datetime +import sys +sys.path.append("/home/gamma/airflow/modules") + +from cosidag import COSIDAG +from cosidag import cfg +from airflow.operators.python import ExternalPythonOperator +from airflow.models import Variable + + +def build_custom(dag): + + EXTERNAL_PYTHON = cfg("EXTERNAL_PYTHON", "/home/gamma/.conda/envs/cosipy/bin/python") + LIB_DIR = cfg("TSMAP_LIB_DIR", "/home/gamma/airflow/pipeline/lcurve") + + # ----- Python callables executed in the external interpreter ----- + def _bin_grb(run_dir: str, lib_dir: str, grb_file: str) -> str: + import sys + sys.path.insert(0, lib_dir) + from cosipipe_lc_ops_cosidag import bin_grb_source + return bin_grb_source(grb_file, run_dir) + + def _bin_bkg(run_dir: str, lib_dir: str, background_file: str) -> str: + import sys + sys.path.insert(0, lib_dir) + from cosipipe_lc_ops_cosidag import bin_background_data + return bin_background_data(background_file, run_dir) + + def _plot_lightcurve(run_dir: str, lib_dir: str, + grb_binned_file: str, background_binned_file: str, + orientation_file: str, response_file: str) -> str: + import sys + sys.path.insert(0, lib_dir) + from cosipipe_lc_ops_cosidag import plot_lightcurve_from_cells + return plot_lightcurve_from_cells(grb_binned_file, background_binned_file, orientation_file, response_file, run_dir) + + # ----- Operators (IMPORTANT: pass dag=dag) ----- + + # Pull the run_dir produced by COSIDAG's sensor: + # key='detected_folder' from task_id='check_new_file' + RUN_DIR_JINJA = "{{ ti.xcom_pull(task_ids='check_new_file', key='detected_folder') }}" + RUN_DIR = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='run_dir') }}" + GRB_FILE = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='grb_file') }}" + BKG_FILE = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='background_file') }}" + ORI_FILE = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='orientation_file') }}" + RSP_FILE = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='response_file') }}" + + bin_grb = ExternalPythonOperator( + task_id="bin_grb_source", + python=EXTERNAL_PYTHON, + python_callable=_bin_grb, + op_kwargs={"run_dir": RUN_DIR, "lib_dir": LIB_DIR, "grb_file": GRB_FILE}, + dag=dag, # <<< 
IMPORTANT + ) + + bin_bkg = ExternalPythonOperator( + task_id="bin_background", + python=EXTERNAL_PYTHON, + python_callable=_bin_bkg, + op_kwargs={"run_dir": RUN_DIR, "lib_dir": LIB_DIR, "background_file": BKG_FILE}, + dag=dag, # <<< IMPORTANT + ) + + GRB_BINNED_FILE = "{{ ti.xcom_pull(task_ids='bin_grb_source', key='return_value') }}" + BKG_BINNED_FILE = "{{ ti.xcom_pull(task_ids='bin_background', key='return_value') }}" + + plot_lightcurve = ExternalPythonOperator( + task_id="plot_lightcurve", + python=EXTERNAL_PYTHON, + python_callable=_plot_lightcurve, + op_kwargs={"run_dir": RUN_DIR, + "lib_dir": LIB_DIR, + "grb_binned_file": GRB_BINNED_FILE, + "background_binned_file": BKG_BINNED_FILE, + "orientation_file": ORI_FILE, + "response_file": RSP_FILE}, + dag=dag, # <<< IMPORTANT + ) + + # [bin_grb, bin_bkg] >> aggregate >> [plot_lightcurve] + [bin_grb, bin_bkg] >> plot_lightcurve + +with COSIDAG( + dag_id="cosidag_lcurve", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=["/home/gamma/workspace/data/lcurve"], + level=3, + # Let the sensor accept only the deepest-level leaf (products) + only_basename="products", + # Robustness against partial writes + idle_seconds=5, + min_files=1, + # ready_marker="_SUCCESS", # enable if you create a sentinel at end-of-write + # controlled parallelism: + max_active_runs=2, # up to 2 DAG runs in parallel + max_active_tasks=8, # up to 8 tasks in parallel in the DAG + concurrency=8, # local alternative limit (Airflow <2.7) + date_queries=f"=={datetime.now().strftime("%Y%m%d")}", + select_policy="latest_mtime", # or "first" + file_patterns={ + "grb_file": "GRB*_unbinned_*.fits*", + "background_file": "Total_BG*_unbinned_*_window.fits*", + "orientation_file": "*.ori", + "response_file": "Response*.h5", + }, + auto_retrig=True, # enable automatic retrigger + build_custom=build_custom, + tags=["cosidag", "lcurve"], +) as dag: + pass diff --git a/dags/cosidag_tsmap.py b/dags/cosidag_tsmap.py new file mode 100644 index 0000000..40d858c --- /dev/null +++ b/dags/cosidag_tsmap.py @@ -0,0 +1,133 @@ +from datetime import datetime +import sys +sys.path.append("/home/gamma/airflow/modules") + +from cosidag import COSIDAG +from cosidag import cfg +from airflow.operators.python import ExternalPythonOperator +from airflow.models import Variable + + +def build_custom(dag): + + EXTERNAL_PYTHON = cfg("EXTERNAL_PYTHON", "/home/gamma/.conda/envs/cosipy/bin/python") + LIB_DIR = cfg("TSMAP_LIB_DIR", "/home/gamma/airflow/pipeline/ts_map") + + # ----- Python callables executed in the external interpreter ----- + + def _bin_grb(run_dir: str, lib_dir: str, grb_file: str) -> str: + import sys + sys.path.insert(0, lib_dir) + from cosipipe_tsmap_ops_cosidag import bin_grb_data + return bin_grb_data(grb_file, run_dir) + + def _bin_bkg(run_dir: str, lib_dir: str, background_file: str) -> str: + import sys + sys.path.insert(0, lib_dir) + from cosipipe_tsmap_ops_cosidag import bin_background_data + return bin_background_data(background_file, run_dir) + + def _ts_map(run_dir: str, lib_dir: str, grb_file: str, background_file: str, + orientation_file: str, response_file: str) -> str: + import sys + sys.path.insert(0, lib_dir) + from cosipipe_tsmap_ops_cosidag import compute_ts_map + return compute_ts_map(grb_file, background_file, orientation_file, response_file, run_dir) + + def _ts_map_mulres(run_dir: str, lib_dir: str, grb_file: str, background_file: str, + orientation_file: str, response_file: str) -> str: + import sys + sys.path.insert(0, 
lib_dir) + from cosipipe_tsmap_ops_cosidag import compute_ts_map_mulres + return compute_ts_map_mulres(grb_file, background_file, orientation_file, response_file, run_dir) + + # ----- Operators (IMPORTANT: pass dag=dag) ----- + + # Pull the run_dir produced by COSIDAG's sensor: + # key='detected_folder' from task_id='check_new_file' + RUN_DIR_JINJA = "{{ ti.xcom_pull(task_ids='check_new_file', key='detected_folder') }}" + RUN_DIR = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='run_dir') }}" + GRB_FILE = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='grb_file') }}" + BKG_FILE = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='background_file') }}" + ORI_FILE = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='orientation_file') }}" + RSP_FILE = "{{ ti.xcom_pull(task_ids='resolve_inputs', key='response_file') }}" + + bin_grb = ExternalPythonOperator( + task_id="bin_grb_source", + python=EXTERNAL_PYTHON, + python_callable=_bin_grb, + op_kwargs={"run_dir": RUN_DIR, "lib_dir": LIB_DIR, "grb_file": GRB_FILE}, + dag=dag, # <<< IMPORTANT + ) + + bin_bkg = ExternalPythonOperator( + task_id="bin_background", + python=EXTERNAL_PYTHON, + python_callable=_bin_bkg, + op_kwargs={"run_dir": RUN_DIR, "lib_dir": LIB_DIR, "background_file": BKG_FILE}, + dag=dag, # <<< IMPORTANT + ) + + GRB_BINNED_FILE = "{{ ti.xcom_pull(task_ids='bin_grb_source', key='return_value') }}" + BKG_BINNED_FILE = "{{ ti.xcom_pull(task_ids='bin_background', key='return_value') }}" + + ts_map = ExternalPythonOperator( + task_id="ts_map_computation", + python=EXTERNAL_PYTHON, + python_callable=_ts_map, + op_kwargs={"run_dir": RUN_DIR, + "lib_dir": LIB_DIR, + "grb_file": GRB_BINNED_FILE, + "background_file": BKG_BINNED_FILE, + "orientation_file": ORI_FILE, + "response_file": RSP_FILE}, + dag=dag, # <<< IMPORTANT + ) + + ts_map_mulres = ExternalPythonOperator( + task_id="ts_map_mulres_computation", + python=EXTERNAL_PYTHON, + python_callable=_ts_map_mulres, + op_kwargs={"run_dir": RUN_DIR, + "lib_dir": LIB_DIR, + "grb_file": GRB_BINNED_FILE, + "background_file": BKG_BINNED_FILE, + "orientation_file": ORI_FILE, + "response_file": RSP_FILE}, + dag=dag, # <<< IMPORTANT + ) + + # [bin_grb, bin_bkg] >> aggregate >> [ts_map, ts_map_mulres] + bin_grb >> [ts_map, ts_map_mulres] + bin_bkg >> [ts_map, ts_map_mulres] + + +with COSIDAG( + dag_id="cosidag_tsmap", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + monitoring_folders=["/home/gamma/workspace/data/tsmap"], + level=3, + # Let the sensor accept only the deepest-level leaf (products) + only_basename="products", + # Robustness against partial writes + idle_seconds=5, + min_files=1, + # ready_marker="_SUCCESS", # enable if you create a sentinel at end-of-write + # controlled parallelism: + max_active_runs=2, # up to 2 DAG runs in parallel + max_active_tasks=8, # up to 8 tasks in parallel in the DAG + concurrency=8, # local alternative limit (Airflow <2.7) + date_queries=f"=={datetime.now().strftime("%Y%m%d")}", + file_patterns={ + "grb_file": "GRB*_unbinned_*.fits*", + "background_file": "Total_BG*_unbinned_*_window.fits*", + "orientation_file": "*.ori", + "response_file": "Response*.h5", + }, + select_policy="latest_mtime", # oppure "first" + build_custom=build_custom, + tags=["cosidag", "tsmap"] +) as dag: + pass diff --git a/dags/cosipipe_cosipy.py b/dags/cosipipe_cosipy.py deleted file mode 100644 index db87c80..0000000 --- a/dags/cosipipe_cosipy.py +++ /dev/null @@ -1,148 +0,0 @@ -from airflow import DAG -from airflow.operators.python import 
PythonOperator -from airflow.operators.bash_operator import BashOperator -import os -import time -import datetime -import logging -from logging.handlers import RotatingFileHandler -from inotify_simple import INotify, flags -from airflow.exceptions import AirflowSkipException -from airflow.operators.dagrun_operator import TriggerDagRunOperator - -# Import necessary Airflow classes and standard libraries - -# Define a data pipeline class for monitoring, ingesting, and storing DL0 files -class DataPipeline: - def __init__(self): - # Define directory paths for input, processed data (heasarc), and logs - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - # Set up inotify to watch the input directory for file-close-write events - self.inotify = INotify() - self.watch_flags = flags.CLOSE_WRITE - self.inotify.add_watch(f'{self.base_dir}/input', self.watch_flags) - - # Configure logger with both file rotation and console output - self.logger = logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # File handler for logging to a file - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Console handler for logging to the console - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Adding handlers to the logger - # Avoid duplicate logger handlers - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - # Monitor input directory for new files and return the oldest file when available - def check_new_file_sensor(self, **kwargs): - ti = kwargs['ti'] - self.logger.info("Daemon process started for continuous file monitoring...") - - # Start infinite polling loop to check for input files - while True: - input_directory = os.path.join(self.base_dir, 'input') - input_files = os.listdir(input_directory) - - # Check if there are any files - if input_files: - # Find and return the path to the oldest file in the input directory - oldest_file = min([f"{pipeline.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if os.path.exists(oldest_file): - # Log and push to XCom - self.logger.info(f"New file detected: {oldest_file}") - # Push file path to XCom for downstream tasks - ti.xcom_push(key='new_file_path', value=oldest_file) - # Allow subsequent tasks to run - return True - - # Sleep between checks to reduce CPU usage - time.sleep(5) - - # Move detected input file into a timestamped subdirectory inside heasarc - # Store and push the new path for downstream tasks - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - # Retrieve the input file path from XCom - input_files = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file_sensor_task') - if input_files: - # Check that the file exists and move it into a new timestamped subfolder - if not os.path.exists(input_files): - raise FileNotFoundError(f"Input file {input_files} does not exist.") - self.logger.info(f"Processing DL0 file: {input_files}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - 
timestamp_utc = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d_%H-%M-%S') - new_dir = f'{self.heasarc_dir}/dl0/{timestamp_utc}' - os.makedirs(new_dir, exist_ok=True) - stored_file_path = f"{new_dir}/{os.path.basename(input_files)}" - os.rename(input_files, stored_file_path) - self.logger.info(f"Stored DL0 file: {stored_file_path}") - # Push the new file path to XCom for further use - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - # Handle missing file or other unexpected exceptions gracefully - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - -pipeline = DataPipeline() - -# Define the Airflow DAG to orchestrate DL0 file monitoring, ingestion, and plotting -with DAG('cosipy_test_v0', default_args={'owner': 'airflow'}, schedule=None, - max_active_tasks=5, # Maximum number of tasks that can be executed simultaneously per DAG - max_active_runs=4 # Maximum number of DAG instances that can be executed simultaneously - ) as dag: - - # Task to detect the arrival of new files in the input directory - wait_for_new_file_sensor_task = PythonOperator( - task_id='wait_for_new_file_sensor_task', - python_callable=pipeline.check_new_file_sensor, - dag=dag - ) - - # Task to move and organize the newly detected file - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - ) - - # Task to generate plots using an external script in the cosipy environment - trigger_next_run = TriggerDagRunOperator( - task_id="trigger_next_run", - trigger_dag_id="cosipy_test_v0", # Stesso DAG - dag=dag, - ) - - # Task to trigger the same DAG again for continuous processing - generate_plots = BashOperator( - task_id='generate_plots', - bash_command=""" - source activate cosipy && - python /shared_dir/pipeline/generate_plot.py "{{ task_instance.xcom_pull(task_ids='ingest_and_store_dl0_sensor', key='stored_dl0_file') }}" - """, - dag=dag, - ) - - wait_for_new_file_sensor_task >> ingest_and_store_dl0_task_sensor >> generate_plots >> trigger_next_run diff --git a/dags/cosipipe_cosipy_external_python.py b/dags/cosipipe_cosipy_external_python.py deleted file mode 100644 index b103318..0000000 --- a/dags/cosipipe_cosipy_external_python.py +++ /dev/null @@ -1,185 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator, ExternalPythonOperator -from airflow.operators.bash_operator import BashOperator -import os -import time -import datetime -import logging -from logging.handlers import RotatingFileHandler -from airflow.exceptions import AirflowSkipException -from airflow.operators.dagrun_operator import TriggerDagRunOperator -from airflow.decorators import task, dag - -# Import required modules and operators from Airflow and standard Python libraries - -# Define a class to encapsulate the data ingestion and logging logic -class DataPipeline: - def __init__(self): - # Set base directories for input, output, and logs - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - # Configure logger for both file and console output - self.logger = 
logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # Add rotating file handler to limit log file size - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Add console stream handler for real-time feedback - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Avoid adding handlers multiple times - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - # Continuously monitor the input directory for new files - def check_new_file_sensor(self, **kwargs): - ti = kwargs['ti'] - self.logger.info("Daemon process started for continuous file monitoring...") - - while True: - # List files in the input directory - input_directory = os.path.join(self.base_dir, 'input') - input_files = os.listdir(input_directory) - - # Select the oldest file available - if input_files: - oldest_file = min([f"{pipeline.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if os.path.exists(oldest_file): - # Push file path to XCom for downstream tasks - self.logger.info(f"New file detected: {oldest_file}") - ti.xcom_push(key='new_file_path', value=oldest_file) - # Wait a short time before the next polling iteration - return True - - # Sleep before next check to avoid high CPU usage - time.sleep(5) - - # Move detected input file to a timestamped directory and store the path - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - # Retrieve file path from XCom - input_files = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file_sensor_task') - if input_files: - # Check if the file exists before proceeding - if not os.path.exists(input_files): - raise FileNotFoundError(f"Input file {input_files} does not exist.") - self.logger.info(f"Processing DL0 file: {input_files}") - # Create directory structure for storing the file - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - timestamp_utc = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%d_%H-%M-%S') - new_dir = f'{self.heasarc_dir}/dl0/{timestamp_utc}' - os.makedirs(new_dir, exist_ok=True) - # Rename (move) the file to the new directory - stored_file_path = f"{new_dir}/{os.path.basename(input_files)}" - os.rename(input_files, stored_file_path) - # Push the new file path to XCom - self.logger.info(f"Stored DL0 file: {stored_file_path}") - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - -pipeline = DataPipeline() - -# Generate plots and summary data from the DL0 file using the cosipy library -def generate_plots_task(file_path): - - import sys, os - from cosipy.util import fetch_wasabi_file - from cosipy import BinnedData - from pathlib import Path - - # Define the directory and create the input YAML configuration file - print("test") - print(file_path) - dir_name = os.path.dirname(file_path) - - content_to_write = f"""#----------# - # Data I/O: - - # data files available on the COSI Sharepoint: https://drive.google.com/drive/folders/1UdLfuLp9Fyk4dNussn1wt7WEOsTWrlQ6 - data_file: {file_path} # full path - ori_file: "NA" # full path - unbinned_output: 'hdf5' # 'fits' or 'hdf5' - time_bins: 60 # time bin size in seconds. Takes int, float, or list of bin edges. - energy_bins: [100., 200., 500., 1000., 2000., 5000.] # Takes list. Needs to match response. - phi_pix_size: 6 # binning of Compton scattering anlge [deg] - nside: 8 # healpix binning of psi chi local - scheme: 'ring' # healpix binning of psi chi local - tmin: 1835478000.0 # Min time cut in seconds. - tmax: 1835485200.0 # Max time cut in seconds. - #----------# - """ - - dir_name_path = Path(dir_name) - - # Open the file in write mode and write the content - with open(dir_name_path / "inputs.yaml", "w") as file: - file.write(content_to_write) - - # Run analysis steps: read .tra file, bin data, create spectrum and light curve - analysis = BinnedData(dir_name_path / "inputs.yaml") - analysis.read_tra(output_name=dir_name_path / "unbinned_data") - analysis.get_binned_data() - analysis.get_raw_spectrum(output_name=file_path.replace(".crab2hr.extracted.tra.gz", "")) - analysis.get_raw_lightcurve(output_name=file_path.replace(".crab2hr.extracted.tra.gz", "")) - - -# Define the DAG and the task pipeline for DL0 processing and plotting -with DAG('cosipy_external_python_v2', default_args={'owner': 'airflow'}, schedule=None, - #start_date=datetime.now(), - max_active_tasks=5, # Maximum number of tasks that can be executed simultaneously per DAG - max_active_runs=4 # Maximum number of DAG instances that can be executed simultaneously - ) as dag: - - # Wait for new file to appear in input directory - wait_for_new_file_sensor_task = PythonOperator( - task_id='wait_for_new_file_sensor_task', - python_callable=pipeline.check_new_file_sensor, - dag=dag - ) - - # Move the file and store it in the appropriate location - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - ) - - # Trigger the same DAG again to run continuously - trigger_next_run = TriggerDagRunOperator( - task_id="trigger_next_run", - trigger_dag_id="cosipy_external_python_v2", - dag=dag, - ) - - # Run the plot generation script in an external Python environment - generate_plots = ExternalPythonOperator( - task_id='generate_plots', - python_callable=generate_plots_task, - python="/home/gamma/.conda/envs/cosipy/bin/python", - op_args=["{{ task_instance.xcom_pull(task_ids='ingest_and_store_dl0_sensor', key='stored_dl0_file') }}"], - dag=dag, - ) - - wait_for_new_file_sensor_task >> ingest_and_store_dl0_task_sensor >> generate_plots >> trigger_next_run \ No newline at end of file diff --git a/dags/cosipipe_simdata.py b/dags/cosipipe_simdata.py new file mode 100644 index 0000000..efa1ded --- /dev/null +++ b/dags/cosipipe_simdata.py @@ -0,0 +1,361 @@ +# dags/init_pipelines.py +# Airflow 2.x — Initialize and stage COSI pipeline data +# - staged raw 
subfolders (source/background/orientation/response) +# - staging & background cut executed in Conda 'cosipy' via ExternalPythonOperator (no Airflow context inside) +# - run folder under DEST_MAP/YYYY_MM/YYMMDDXXX/products with symlinks and cut result + +from __future__ import annotations +import json, os, re, shutil +from datetime import datetime +from pathlib import Path +from typing import Dict, List + +from airflow import DAG +from airflow.models.param import Param +from airflow.operators.python import PythonOperator, ExternalPythonOperator + +# === External env/interpreter === +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" +BKG_CUT_SCRIPT = "/home/gamma/airflow/pipeline/bkg_cut.py" + +# === Paths === +RAW_ROOT = Path("/home/gamma/workspace/data/raw") +RAW_SUBDIRS = { + "source": RAW_ROOT / "source", + "background": RAW_ROOT / "background", + "orientation": RAW_ROOT / "orientation", + "response": RAW_ROOT / "response", +} + +DEST_MAP = { + "lcurve": Path("/home/gamma/workspace/data/lcurve"), + "tsmap": Path("/home/gamma/workspace/data/tsmap"), +} + +# === Default Wasabi keys === +WASABI_DEFAULTS = { + "response": "Responses/ResponseContinuum.o3.e100_10000.b10log.s10396905069491.m2284.filtered.nonsparse.binnedimaging.imagingresponse_nside8.area.good_chunks.h5.zip", + "orientation": "Orientation/DC3_final_530km_3_month_with_slew_1sbins_GalacticEarth_SAA.ori", + "source": "Sources/GRB_bn081207680_3months_unbinned_data_filtered_with_SAAcut.fits.gz", + "background": "Backgrounds/Ge/Total_BG_with_SAAcomponent_3months_unbinned_data_filtered_with_SAAcut.fits.gz", +} + +# === Helpers === +def ensure_dir(p: Path) -> Path: + p.mkdir(parents=True, exist_ok=True) + return p + +def next_run_products_dir(base_dest: Path) -> Path: + # Determines the next run directory based on date and incremental index + now = datetime.now() + yyyy_mm = now.strftime("%Y_%m") + yymmdd = now.strftime("%y%m%d") + base_month_dir = ensure_dir(base_dest / yyyy_mm) + pat = re.compile(rf"^{yymmdd}(\d{{3}})$") + max_idx = -1 + for d in base_month_dir.iterdir(): + if d.is_dir() and pat.match(d.name): + max_idx = max(max_idx, int(pat.match(d.name).group(1))) + run_dir = ensure_dir(base_month_dir / f"{yymmdd}{max_idx+1:03d}") + return ensure_dir(run_dir / "products") + +def make_symlinks(target_dir: Path, files_by_kind: Dict[str, List[Path]]) -> Dict[str, List[str]]: + result = {} + for kind, paths in files_by_kind.items(): + result[kind] = [] + for p in paths: + link = target_dir / p.name + if not link.exists(): + link.symlink_to(p) + print(f"[symlink] {link} -> {p}") + # append anyway, even if it already existed + result[kind].append(str(link)) + return result + +# === ExternalPythonOperator callables === +def stage_all_files_external( + *, + inputs_json: str, + response_dir: str, + orientation_dir: str, + source_dir: str, + background_dir: str, +) -> Dict[str, List[str]]: + """ + Executed in cosipy env. Avoids re-downloading/re-extracting files that are already ready. + Detects corrupted .gz/.zip files (redownloads) and validates the background FITS if already present. 
+ """ + import json, os, shutil, gzip, zipfile + from zipfile import BadZipFile + from pathlib import Path + + inputs = json.loads(inputs_json) + + def ensure_dir(p: Path) -> Path: + p.mkdir(parents=True, exist_ok=True) + return p + + def ready_files(target_dir: Path) -> list[Path]: + return [p for p in target_dir.iterdir() if p.is_file() and not p.name.endswith((".zip", ".gz"))] + + def ready_files_exist(target_dir: Path) -> bool: + return len(ready_files(target_dir)) > 0 + + def validate_background_fits(f: Path) -> bool: + """Tries to open HDU[1] to catch truncated files.""" + try: + from astropy.io import fits + with fits.open(f, memmap=True) as hdul: + # minimal access to force reading of the table block + _ = hdul[1].data.shape # noqa + return True + except Exception as e: + print(f"[stage] Background FITS validation failed: {f} ({e})") + return False + + def gunzip_to_same_dir(gz_path: Path) -> Path: + ready = gz_path.with_suffix("") + if ready.exists(): + return ready + with gzip.open(gz_path, "rb") as gz_f, open(ready, "wb") as out_f: + shutil.copyfileobj(gz_f, out_f) + gz_path.unlink(missing_ok=True) + return ready + + def fetch_wasabi(remote_key: str, out_path: Path) -> None: + from cosipy.util import fetch_wasabi_file + ensure_dir(out_path.parent) + fetch_wasabi_file(f"COSI-SMEX/DC3/Data/{remote_key}", output=out_path) + + def redownload(remote_key: str, out_path: Path) -> None: + try: + out_path.unlink(missing_ok=True) + except Exception: + pass + fetch_wasabi(remote_key, out_path) + + def download_or_use_one(remote_or_local: str, target_dir: Path, *, kind: str) -> list[Path]: + ensure_dir(target_dir) + + # If ready files exist, validate the FITS first for BACKGROUND + if ready_files_exist(target_dir): + files = ready_files(target_dir) + if kind == "background": + # validate the first FITS file + bg_fits = next((p for p in files if p.suffix.lower() == ".fits" or p.name.endswith(".fits")), None) + if bg_fits and not validate_background_fits(bg_fits): + print(f"[stage] Detected corrupted ready background in {target_dir}, cleaning and re-fetching…") + for p in target_dir.iterdir(): + if p.is_file(): + p.unlink(missing_ok=True) + else: + print(f"[stage] Ready files already in {target_dir}, skipping.") + return files + + else: + print(f"[stage] Ready files already in {target_dir}, skipping.") + return files + + src = Path(remote_or_local) + + # --- Absolute local path --- + if src.is_absolute() and src.exists(): + dst = target_dir / src.name + if not dst.exists(): + try: + os.link(src, dst) + except OSError: + shutil.copy2(src, dst) + if dst.suffix == ".gz": + ready = gunzip_to_same_dir(dst) + return [ready] + if dst.suffix == ".zip": + try: + with zipfile.ZipFile(dst, "r") as zf: + zf.testzip() + zf.extractall(target_dir) + return ready_files(target_dir) + except BadZipFile as e: + raise RuntimeError(f"[stage] Local .zip seems corrupted: {dst} ({e})") + # validate for local background + if kind == "background" and dst.suffix.lower() == ".fits" and not validate_background_fits(dst): + raise RuntimeError(f"[stage] Local background FITS seems corrupted: {dst}") + return [dst] + + # --- Wasabi --- + out_path = target_dir / src.name + + # download if missing + if not out_path.exists(): + fetch_wasabi(remote_or_local, out_path) + + if out_path.suffix == ".gz": + try: + ready = gunzip_to_same_dir(out_path) + except Exception: + print(f"[stage] Corrupted .gz at {out_path}, re-downloading…") + redownload(remote_or_local, out_path) + # Remove any partial decompressed file so the fresh archive is actually re-extracted + out_path.with_suffix("").unlink(missing_ok=True) + ready = gunzip_to_same_dir(out_path)
+ if kind == "background" and not validate_background_fits(ready): + print(f"[stage] Re-downloading background FITS after failed validation…") + redownload(remote_or_local, out_path) + # Remove the stale invalid FITS so gunzip_to_same_dir does not short-circuit on the old file + ready.unlink(missing_ok=True) + ready = gunzip_to_same_dir(out_path) + if not validate_background_fits(ready): + raise RuntimeError(f"[stage] Background FITS still invalid after re-download: {ready}") + return [ready] + + if out_path.suffix == ".zip": + try: + with zipfile.ZipFile(out_path, "r") as zf: + zf.testzip() + zf.extractall(target_dir) + except BadZipFile: + print(f"[stage] Corrupted .zip at {out_path}, re-downloading…") + redownload(remote_or_local, out_path) + with zipfile.ZipFile(out_path, "r") as zf: + zf.testzip() + zf.extractall(target_dir) + files = ready_files(target_dir) + return files + + # simple file + if kind == "background" and out_path.suffix.lower() == ".fits" and not validate_background_fits(out_path): + print(f"[stage] Background FITS invalid at {out_path}, re-downloading…") + redownload(remote_or_local, out_path) + if not validate_background_fits(out_path): + raise RuntimeError(f"[stage] Background FITS still invalid after re-download: {out_path}") + return [out_path] + + staged = { + "response": [str(p) for p in download_or_use_one(inputs["response"], Path(response_dir), kind="response")], + "orientation": [str(p) for p in download_or_use_one(inputs["orientation"], Path(orientation_dir), kind="orientation")], + "source": [str(p) for p in download_or_use_one(inputs["source"], Path(source_dir), kind="source")], + "background": [str(p) for p in download_or_use_one(inputs["background"], Path(background_dir), kind="background")], + } + print(json.dumps(staged, indent=2)) + return staged + +def run_bkg_cut_and_move(*, script: str, products_dir: str, source_link_path: str, background_link_path: str, eps_time: float): + """ + Runs bkg_cut in the products folder using SYMLINKs as input. + Does not move anything: the script must generate output directly in products/. + Returns the list of new files created in products. + The cut script is invoked as "<python> <script> <source_link_path> <background_link_path> --eps_time <eps_time>" with the products directory as working directory, so relative output paths land in products/.
+ """ + import sys, subprocess, time + from pathlib import Path + + products = Path(products_dir); products.mkdir(parents=True, exist_ok=True) + before = {p.name for p in products.glob("*.fits*")} + cmd = [sys.executable, script, source_link_path, background_link_path, "--eps_time", str(eps_time)] + print(f"[bkg_cut] CWD={products} CMD: {' '.join(cmd)}") + subprocess.run(cmd, check=True, cwd=str(products)) + + # new files created in products + after = list(products.glob("*.fits*")) + new_files = [str(p) for p in after if p.name not in before] + print(f"[bkg_cut] new files in products/: {new_files}") + return {"created": new_files} + +# === DAG === +with DAG( + dag_id="init_pipelines", + start_date=datetime(2024, 1, 1), + schedule=None, + catchup=False, + tags=["cosiflow", "init"], + description="Initialize and stage COSI pipeline data (optimized with skip logic)", + params={ + "response_path": Param(default=WASABI_DEFAULTS["response"], type="string"), + "orientation_path": Param(default=WASABI_DEFAULTS["orientation"], type="string"), + "source_path": Param(default=WASABI_DEFAULTS["source"], type="string"), + "background_path": Param(default=WASABI_DEFAULTS["background"], type="string"), + "destination": Param(default="tsmap", enum=["lcurve", "tsmap"]), + "eps_time": Param(default=1e-9, type="number"), + }, +) as dag: + + def prepare_raw_dirs(): + # Ensure that all raw data subdirectories exist + for d in RAW_SUBDIRS.values(): + ensure_dir(d) + return {k: str(v) for k, v in RAW_SUBDIRS.items()} + + t_prepare = PythonOperator(task_id="prepare_raw_dirs", python_callable=prepare_raw_dirs) + + def resolve_config(**context): + # Resolve parameters and input paths + p = context["params"] + return { + "destination_root": str(DEST_MAP[p["destination"]]), + "eps_time": float(p["eps_time"]), + "inputs": { + "response": p["response_path"], + "orientation": p["orientation_path"], + "source": p["source_path"], + "background": p["background_path"], + }, + } + + t_resolve = PythonOperator(task_id="resolve_config", python_callable=resolve_config) + + t_stage = ExternalPythonOperator( + task_id="stage_all_files", + python=EXTERNAL_PYTHON, + python_callable=stage_all_files_external, + op_kwargs={ + "inputs_json": "{{ ti.xcom_pull(task_ids='resolve_config')['inputs'] | tojson }}", + "response_dir": str(RAW_SUBDIRS["response"]), + "orientation_dir": str(RAW_SUBDIRS["orientation"]), + "source_dir": str(RAW_SUBDIRS["source"]), + "background_dir": str(RAW_SUBDIRS["background"]), + }, + ) + + def create_products_dir(ti): + # Create the unique products directory for this run + cfg = ti.xcom_pull(task_ids="resolve_config") + pdir = next_run_products_dir(Path(cfg["destination_root"])) + return {"products_dir": str(pdir)} + + t_products = PythonOperator(task_id="create_products_dir", python_callable=create_products_dir) + + t_bkgcut = ExternalPythonOperator( + task_id="background_cut", + python=EXTERNAL_PYTHON, + python_callable=run_bkg_cut_and_move, + op_kwargs={ + "script": BKG_CUT_SCRIPT, + "products_dir": "{{ ti.xcom_pull(task_ids='create_products_dir')['products_dir'] }}", + "source_link_path": "{{ ti.xcom_pull(task_ids='create_symlinks')['source'][0] }}", + "background_link_path": "{{ ti.xcom_pull(task_ids='create_symlinks')['background'][0] }}", + "eps_time": "{{ ti.xcom_pull(task_ids='resolve_config')['eps_time'] }}", + }, + ) + + + def create_symlinks(ti): + """ + Creates symlinks in products/ for: + - source file + - response file(s) (can be more than one after unzip) + - orientation file + - (optional) 
original background + The 'cut' background file(s) are already in products/ from the previous task, + so no symlinks are needed for those. + """ + staged = ti.xcom_pull(task_ids="stage_all_files") + products_dir = Path(ti.xcom_pull(task_ids="create_products_dir")["products_dir"]) + + # If multiple files exist (e.g. response after unzip), link them all + files_by_kind = { + "source": [Path(p) for p in staged.get("source", [])], + "response": [Path(p) for p in staged.get("response", [])], + "orientation": [Path(p) for p in staged.get("orientation", [])], + "background": [Path(p) for p in staged.get("background", [])], + } + + return make_symlinks(products_dir, files_by_kind) + + t_link = PythonOperator(task_id="create_symlinks", python_callable=create_symlinks) + + t_prepare >> t_resolve >> t_stage >> t_products >> t_link >> t_bkgcut diff --git a/dags/cosipipev0.py b/dags/cosipipev0.py deleted file mode 100644 index af82ab3..0000000 --- a/dags/cosipipev0.py +++ /dev/null @@ -1,236 +0,0 @@ -from airflow import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago -import os -import time -import csv -import random -import logging -from datetime import datetime, timedelta -from airflow.exceptions import AirflowSkipException - -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - current_time = datetime.now().strftime("%Y%m%d_%H%M%S_%f") - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}_{current_time}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - dl0_file = kwargs['ti'].xcom_pull(key='stored_dl0_file', task_ids='ingest_and_store_dl0') - if dl0_file: - self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - - def generate_dl1b(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1a') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1a/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1c(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1b') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1b/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl2(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1c') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1c/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl2', 'dl2') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl2') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl2/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_two(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/fast_transient_stage_1') - if input_files: - latest_file = max([f"{self.heasarc_dir}/fast_transient_stage_1/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def fast_transient_stage_three(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/fast_transient_stage_2') - if input_files: - latest_file = max([f"{self.heasarc_dir}/fast_transient_stage_2/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self): - input_files = os.listdir(f'{self.base_dir}/input') - self.logger.info(f"Checking for new files. Current files: {input_files}") - return bool(input_files) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline', default_args={'owner': 'airflow'}, schedule_interval=None, - start_date=datetime.now(), - concurrency=5, # Numero massimo di task eseguibili contemporaneamente per DAG - max_active_runs=4 # Numero massimo di istanze del DAG che possono essere eseguite contemporaneamente - ) as dag: - - wait_for_new_file = PythonSensor( - task_id='wait_for_new_file', - python_callable=pipeline.check_new_file, - poke_interval=1, - timeout=600 - ) - - ingest_and_store_dl0_task = PythonOperator( - task_id='ingest_and_store_dl0', - python_callable=pipeline.ingest_and_store_dl0, - provide_context=True - ) - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - notify_completion_task = PythonOperator( - task_id='notify_completion', - python_callable=pipeline.notify_completion - ) - - wait_for_new_file >> ingest_and_store_dl0_task >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> fast_transient_stage_one_task >> fast_transient_stage_two_task >> fast_transient_stage_three_task >> notify_completion_task - diff --git a/dags/cosipipev1.py b/dags/cosipipev1.py deleted file mode 100644 index 86a098f..0000000 --- a/dags/cosipipev1.py +++ /dev/null @@ -1,235 +0,0 @@ -from airflow import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago -import os 
-import time -import csv -import random -import logging -from datetime import datetime, timedelta -from airflow.exceptions import AirflowSkipException - -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - current_time = datetime.now().strftime("%Y%m%d_%H%M%S_%f") - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}_{current_time}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - dl0_file = kwargs['ti'].xcom_pull(key='stored_dl0_file', task_ids='ingest_and_store_dl0') - if dl0_file: - self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - - def generate_dl1b(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1a') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1a/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1c(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1b') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1b/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl2(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl1c') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl1c/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/dl2', 'dl2') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/dl2') - if input_files: - latest_file = max([f"{self.heasarc_dir}/dl2/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_two(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/fast_transient_stage_1') - if input_files: - latest_file = max([f"{self.heasarc_dir}/fast_transient_stage_1/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_three(self): - try: - input_files = os.listdir(f'{self.heasarc_dir}/fast_transient_stage_2') - if input_files: - latest_file = max([f"{self.heasarc_dir}/fast_transient_stage_2/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(latest_file): - raise FileNotFoundError(f"Input file {latest_file} does not exist.") - self.generate_placeholder_file(latest_file, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self): - input_files = os.listdir(f'{self.base_dir}/input') - self.logger.info(f"Checking for new files. Current files: {input_files}") - return bool(input_files) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline_v1', default_args={'owner': 'airflow'}, schedule_interval=None, - start_date=datetime.now(), - concurrency=5, # Numero massimo di task eseguibili contemporaneamente per DAG - max_active_runs=4 # Numero massimo di istanze del DAG che possono essere eseguite contemporaneamente - ) as dag: - - wait_for_new_file = PythonSensor( - task_id='wait_for_new_file', - python_callable=pipeline.check_new_file, - poke_interval=1, - timeout=600 - ) - - ingest_and_store_dl0_task = PythonOperator( - task_id='ingest_and_store_dl0', - python_callable=pipeline.ingest_and_store_dl0, - provide_context=True - ) - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - notify_completion_task = PythonOperator( - task_id='notify_completion', - python_callable=pipeline.notify_completion - ) - - wait_for_new_file >> ingest_and_store_dl0_task >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> fast_transient_stage_one_task >> fast_transient_stage_two_task >> fast_transient_stage_three_task >> notify_completion_task diff --git a/dags/cosipipev2.py b/dags/cosipipev2.py deleted file mode 100644 index 3caef7d..0000000 --- a/dags/cosipipev2.py +++ /dev/null @@ -1,362 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago, timezone -import os -import time -import csv -import random -import logging -from logging.handlers import RotatingFileHandler -from inotify_simple import INotify, flags -from airflow.exceptions import AirflowSkipException -from datetime import datetime, timedelta - -#AIRFLOW -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - self.inotify = INotify() - self.watch_flags = flags.CLOSE_WRITE - self.inotify.add_watch(f'{self.base_dir}/input', self.watch_flags) - - # Logger setup for both Celery and the pipeline - self.logger = 
logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # File handler for logging to a file - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Console handler for logging to the console - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Adding handlers to the logger - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - new_file_path = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file') - if new_file_path: - if not os.path.exists(new_file_path): - raise FileNotFoundError(f"Input file {new_file_path} does not exist.") - self.logger.info(f"Oldest DL0 file: {new_file_path}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - stored_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(new_file_path)}" - os.rename(new_file_path, stored_file_path) - self.logger.info(f"Stored DL0 file: {stored_file_path}") - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - try: - ti = kwargs['ti'] - dl0_file = ti.xcom_pull(key='stored_dl0_file') - if dl0_file: - if not os.path.exists(dl0_file): - raise FileNotFoundError(f"DL0 file {dl0_file} does not exist. It may have been processed by another instance.") - filename = self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - ti.xcom_push(key='stored_dl1a_file', value=filename) - else: - self.logger.warning("No DL0 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL0 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1b(self, **kwargs): - try: - ti = kwargs['ti'] - dl1a_file = ti.xcom_pull(key='stored_dl1a_file', task_ids='generate_dl1a') - if dl1a_file: - if not os.path.exists(dl1a_file): - raise FileNotFoundError(f"DL1a file {dl1a_file} does not exist.") - filename = self.generate_placeholder_file(dl1a_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - ti.xcom_push(key='stored_dl1b_file', value=filename) - else: - self.logger.warning("No DL1a file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1a file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1c(self, **kwargs): - try: - ti = kwargs['ti'] - dl1b_file = ti.xcom_pull(key='stored_dl1b_file', task_ids='generate_dl1b') - if dl1b_file: - if not os.path.exists(dl1b_file): - raise FileNotFoundError(f"DL1b file {dl1b_file} does not exist.") - filename = self.generate_placeholder_file(dl1b_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - ti.xcom_push(key='stored_dl1c_file', value=filename) - else: - self.logger.warning("No DL1b file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1b file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl2(self, **kwargs): - try: - ti = kwargs['ti'] - dl1c_file = ti.xcom_pull(key='stored_dl1c_file', task_ids='generate_dl1c') - if dl1c_file: - if not os.path.exists(dl1c_file): - raise FileNotFoundError(f"DL1c file {dl1c_file} does not exist.") - filename = self.generate_placeholder_file(dl1c_file, f'{self.heasarc_dir}/dl2', 'dl2') - ti.xcom_push(key='stored_dl2_file', value=filename) - else: - self.logger.warning("No DL1c file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1c file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - ti.xcom_push(key='stored_stage1_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_two(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - ti.xcom_push(key='stored_stage2_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - - def fast_transient_stage_three(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = ti.xcom_pull(key='stored_stage2_file', task_ids='fast_transient_stage_two') - if not os.path.exists(input_files): - raise FileNotFoundError(f"stage 2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(input_files, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): # Wait for 1 second for an event - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - file_path = f"{self.base_dir}/input/{event.name}" - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=file_path) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=event.name) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file_sensor(self): - input_files = os.listdir(f'{self.base_dir}/input') - self.logger.info(f"Checking for new files. Current files: {input_files}") - return bool(input_files) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline_v2', default_args={'owner': 'airflow'}, schedule=None, - start_date=datetime.now(), - max_active_tasks=5, # Numero massimo di task eseguibili contemporaneamente per DAG - max_active_runs=4 # Numero massimo di istanze del DAG che possono essere eseguite contemporaneamente - ) as dag: - - #wait_for_new_file = PythonOperator( - # task_id='wait_for_new_file', - # python_callable=pipeline.check_new_file, - # provide_context=True - #) - - # ingest_and_store_dl0_task = PythonOperator( - # task_id='ingest_and_store_dl0', - # python_callable=pipeline.ingest_and_store_dl0, - # provide_context=True - # ) - - wait_for_new_file_sensor = PythonSensor( - task_id='wait_for_new_file_sensor', - python_callable=pipeline.check_new_file_sensor, - poke_interval=1, - timeout=600 - ) - - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - provide_context=True - ) - - - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = 
PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - wait_for_new_file_sensor >> ingest_and_store_dl0_task_sensor >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> [fast_transient_stage_one_task, fast_transient_stage_two_task] - fast_transient_stage_two_task >> fast_transient_stage_three_task diff --git a/dags/cosipipev3.py b/dags/cosipipev3.py deleted file mode 100644 index 2a51cd6..0000000 --- a/dags/cosipipev3.py +++ /dev/null @@ -1,394 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator -from airflow.sensors.python import PythonSensor -from airflow.utils.dates import days_ago, timezone -import os -import time -import csv -import random -import logging -from logging.handlers import RotatingFileHandler -from inotify_simple import INotify, flags -from airflow.exceptions import AirflowSkipException -from datetime import datetime, timedelta -from airflow.operators.dagrun_operator import TriggerDagRunOperator - -#airflow dags trigger cosi_data_analysis_pipeline_v3 -#airflow dags list-runs -d cosi_data_analysis_pipeline_v3 --state running - - -#AIRFLOW -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - self.inotify = INotify() - self.watch_flags = flags.CLOSE_WRITE - self.inotify.add_watch(f'{self.base_dir}/input', self.watch_flags) - - # Logger setup for both Celery and the pipeline - self.logger = logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # File handler for logging to a file - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Console handler for logging to the console - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Adding handlers to the logger - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - new_file_path = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file') - if new_file_path: - if not os.path.exists(new_file_path): - raise FileNotFoundError(f"Input file {new_file_path} does not exist.") - self.logger.info(f"Oldest DL0 file: {new_file_path}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - stored_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(new_file_path)}" - os.rename(new_file_path, stored_file_path) - self.logger.info(f"Stored DL0 file: {stored_file_path}") - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - try: - ti = kwargs['ti'] - dl0_file = ti.xcom_pull(key='stored_dl0_file') - if dl0_file: - if not os.path.exists(dl0_file): - raise FileNotFoundError(f"DL0 file {dl0_file} does not exist. It may have been processed by another instance.") - filename = self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - ti.xcom_push(key='stored_dl1a_file', value=filename) - else: - self.logger.warning("No DL0 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL0 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1b(self, **kwargs): - try: - ti = kwargs['ti'] - dl1a_file = ti.xcom_pull(key='stored_dl1a_file', task_ids='generate_dl1a') - if dl1a_file: - if not os.path.exists(dl1a_file): - raise FileNotFoundError(f"DL1a file {dl1a_file} does not exist.") - filename = self.generate_placeholder_file(dl1a_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - ti.xcom_push(key='stored_dl1b_file', value=filename) - else: - self.logger.warning("No DL1a file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1a file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl1c(self, **kwargs): - try: - ti = kwargs['ti'] - dl1b_file = ti.xcom_pull(key='stored_dl1b_file', task_ids='generate_dl1b') - if dl1b_file: - if not os.path.exists(dl1b_file): - raise FileNotFoundError(f"DL1b file {dl1b_file} does not exist.") - filename = self.generate_placeholder_file(dl1b_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - ti.xcom_push(key='stored_dl1c_file', value=filename) - else: - self.logger.warning("No DL1b file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1b file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl2(self, **kwargs): - try: - ti = kwargs['ti'] - dl1c_file = ti.xcom_pull(key='stored_dl1c_file', task_ids='generate_dl1c') - if dl1c_file: - if not os.path.exists(dl1c_file): - raise FileNotFoundError(f"DL1c file {dl1c_file} does not exist.") - filename = self.generate_placeholder_file(dl1c_file, f'{self.heasarc_dir}/dl2', 'dl2') - ti.xcom_push(key='stored_dl2_file', value=filename) - else: - self.logger.warning("No DL1c file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1c file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - ti.xcom_push(key='stored_stage1_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_two(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - ti.xcom_push(key='stored_stage2_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - - def fast_transient_stage_three(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = ti.xcom_pull(key='stored_stage2_file', task_ids='fast_transient_stage_two') - if not os.path.exists(input_files): - raise FileNotFoundError(f"stage 2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(input_files, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): # Wait for 1 second for an event - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - file_path = f"{self.base_dir}/input/{event.name}" - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=file_path) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=event.name) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. 
Continuing to monitor...") - return False - - def check_new_file_sensor(self, **kwargs): - ti = kwargs['ti'] - pipeline.logger.info("Daemon process started for continuous file monitoring...") - - while True: - input_files = os.listdir(f'{pipeline.base_dir}/input') - - # Check if there are any files - if input_files: - # Get the oldest file - oldest_file = min([f"{pipeline.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - - if os.path.exists(oldest_file): - # Log and push to XCom - pipeline.logger.info(f"New file detected: {oldest_file}") - ti.xcom_push(key='new_file_path', value=oldest_file) - - # Allow subsequent tasks to run - return True - - # Sleep before next check to avoid high CPU usage - time.sleep(5) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline_v3', default_args={'owner': 'airflow'}, schedule=None, - start_date=datetime.now(), - max_active_tasks=5, # Maximum number of tasks that can be executed simultaneously per DAG - max_active_runs=4 # Maximum number of DAG instances that can be executed simultaneously - ) as dag: - - #wait_for_new_file = PythonOperator( - # task_id='wait_for_new_file', - # python_callable=pipeline.check_new_file, - # provide_context=True - #) - - # ingest_and_store_dl0_task = PythonOperator( - # task_id='ingest_and_store_dl0', - # python_callable=pipeline.ingest_and_store_dl0, - # provide_context=True - # ) - - wait_for_new_file_sensor_task = PythonOperator( - task_id='wait_for_new_file_sensor_task', - python_callable=pipeline.check_new_file_sensor, - provide_context=True, - dag=dag - ) - - - - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - provide_context=True - ) - - - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - # Definisci il task per triggerare il DAG stesso - trigger_next_run = TriggerDagRunOperator( - task_id="trigger_next_run", - trigger_dag_id="cosi_data_analysis_pipeline_v3", # Stesso DAG - dag=dag, - ) - - wait_for_new_file_sensor_task >> ingest_and_store_dl0_task_sensor >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> [fast_transient_stage_one_task, fast_transient_stage_two_task] - fast_transient_stage_two_task >> fast_transient_stage_three_task >> trigger_next_run diff --git a/dags/cosipipev3o.py b/dags/cosipipev3o.py deleted file mode 100644 index 782836e..0000000 --- a/dags/cosipipev3o.py +++ /dev/null @@ -1,382 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator -from airflow.sensors.python import PythonSensor -from 
airflow.utils.dates import days_ago, timezone -import os -import time -import csv -import random -import logging -from logging.handlers import RotatingFileHandler -from inotify_simple import INotify, flags -from airflow.exceptions import AirflowSkipException -from datetime import datetime, timedelta - -#AIRFLOW -class DataPipeline: - def __init__(self): - self.base_dir = '/home/gamma/workspace/data' - self.heasarc_dir = '/home/gamma/workspace/heasarc' - self.logger_dir = '/home/gamma/workspace/log' - - self.inotify = INotify() - self.watch_flags = flags.CLOSE_WRITE - self.inotify.add_watch(f'{self.base_dir}/input', self.watch_flags) - - # Logger setup for both Celery and the pipeline - self.logger = logging.getLogger('data_pipeline_logger') - self.logger.setLevel(logging.DEBUG) - - # File handler for logging to a file - file_handler = RotatingFileHandler('/home/gamma/workspace/data_pipeline.log', maxBytes=5*1024*1024, backupCount=3) - file_handler.setLevel(logging.DEBUG) - file_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - file_handler.setFormatter(file_formatter) - - # Console handler for logging to the console - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.DEBUG) - console_formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - console_handler.setFormatter(console_formatter) - - # Adding handlers to the logger - if not self.logger.hasHandlers(): - self.logger.addHandler(file_handler) - self.logger.addHandler(console_handler) - - self.logger.propagate = False - - self.logger = logging.getLogger(__name__) - - def ingest_and_store_dl0(self, **kwargs): - try: - ti = kwargs['ti'] - new_file_path = ti.xcom_pull(key='new_file_path', task_ids='wait_for_new_file') - if new_file_path: - if not os.path.exists(new_file_path): - raise FileNotFoundError(f"Input file {new_file_path} does not exist.") - self.logger.info(f"Oldest DL0 file: {new_file_path}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - stored_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(new_file_path)}" - os.rename(new_file_path, stored_file_path) - self.logger.info(f"Stored DL0 file: {stored_file_path}") - ti.xcom_push(key='stored_dl0_file', value=stored_file_path) - else: - self.logger.warning("No input files found in the directory. Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def ingest_and_store_dl0_sensor(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = os.listdir(f'{self.base_dir}/input') - if input_files: - oldest_file = min([f"{self.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - if not os.path.exists(oldest_file): - raise FileNotFoundError(f"Input file {oldest_file} does not exist.") - self.logger.info(f"Oldest DL0 file: {oldest_file}") - os.makedirs(f'{self.heasarc_dir}/dl0', exist_ok=True) - new_file_path = f"{self.heasarc_dir}/dl0/{os.path.basename(oldest_file)}" - os.rename(oldest_file, new_file_path) - self.logger.info(f"Stored DL0 file: {new_file_path}") - ti.xcom_push(key='stored_dl0_file', value=new_file_path) - else: - self.logger.warning("No input files found in the directory. 
Exiting task gracefully.") - raise AirflowSkipException("No input files found, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_placeholder_file(self, input_file, output_dir, stage): - try: - if not os.path.exists(input_file): - raise FileNotFoundError(f"Input file {input_file} does not exist.") - os.makedirs(output_dir, exist_ok=True) - filename = f"{output_dir}/{stage}_{os.path.basename(input_file)}" - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - writer.writerow(["parameter1", "parameter2", "parameter3"]) - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - self.logger.info(f"Generated placeholder file: {filename}") - return filename - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1a(self, **kwargs): - try: - ti = kwargs['ti'] - dl0_file = ti.xcom_pull(key='stored_dl0_file') - if dl0_file: - if not os.path.exists(dl0_file): - raise FileNotFoundError(f"DL0 file {dl0_file} does not exist. It may have been processed by another instance.") - filename = self.generate_placeholder_file(dl0_file, f'{self.heasarc_dir}/dl1a', 'dl1a') - ti.xcom_push(key='stored_dl1a_file', value=filename) - else: - self.logger.warning("No DL0 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL0 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Exiting task gracefully.") - raise AirflowSkipException(f"File not found: {e}") - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1b(self, **kwargs): - try: - ti = kwargs['ti'] - dl1a_file = ti.xcom_pull(key='stored_dl1a_file', task_ids='generate_dl1a') - if dl1a_file: - if not os.path.exists(dl1a_file): - raise FileNotFoundError(f"DL1a file {dl1a_file} does not exist.") - filename = self.generate_placeholder_file(dl1a_file, f'{self.heasarc_dir}/dl1b', 'dl1b') - ti.xcom_push(key='stored_dl1b_file', value=filename) - else: - self.logger.warning("No DL1a file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1a file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def generate_dl1c(self, **kwargs): - try: - ti = kwargs['ti'] - dl1b_file = ti.xcom_pull(key='stored_dl1b_file', task_ids='generate_dl1b') - if dl1b_file: - if not os.path.exists(dl1b_file): - raise FileNotFoundError(f"DL1b file {dl1b_file} does not exist.") - filename = self.generate_placeholder_file(dl1b_file, f'{self.heasarc_dir}/dl1c', 'dl1c') - ti.xcom_push(key='stored_dl1c_file', value=filename) - else: - self.logger.warning("No DL1b file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1b file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def generate_dl2(self, **kwargs): - try: - ti = kwargs['ti'] - dl1c_file = ti.xcom_pull(key='stored_dl1c_file', task_ids='generate_dl1c') - if dl1c_file: - if not os.path.exists(dl1c_file): - raise FileNotFoundError(f"DL1c file {dl1c_file} does not exist.") - filename = self.generate_placeholder_file(dl1c_file, f'{self.heasarc_dir}/dl2', 'dl2') - ti.xcom_push(key='stored_dl2_file', value=filename) - else: - self.logger.warning("No DL1c file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL1c file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_one(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_1', 'stage1') - ti.xcom_push(key='stored_stage1_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - def fast_transient_stage_two(self, **kwargs): - try: - ti = kwargs['ti'] - dl2_file = ti.xcom_pull(key='stored_dl2_file', task_ids='generate_dl2') - if dl2_file: - if not os.path.exists(dl2_file): - raise FileNotFoundError(f"DL2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(dl2_file, f'{self.heasarc_dir}/fast_transient_stage_2', 'stage2') - ti.xcom_push(key='stored_stage2_file', value=filename) - else: - self.logger.warning("No DL2 file found in XCom. Exiting task gracefully.") - raise AirflowSkipException("No DL2 file found in XCom, skipping task.") - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. Stopping pipeline.") - raise - - - def fast_transient_stage_three(self, **kwargs): - try: - ti = kwargs['ti'] - input_files = ti.xcom_pull(key='stored_stage2_file', task_ids='fast_transient_stage_two') - if not os.path.exists(input_files): - raise FileNotFoundError(f"stage 2 file {dl2_file} does not exist.") - filename = self.generate_placeholder_file(input_files, f'{self.heasarc_dir}/fast_transient_stage_3', 'stage3') - except FileNotFoundError as e: - self.logger.error(f"Error: {e}. Stopping pipeline.") - raise - except Exception as e: - self.logger.error(f"Unexpected error: {e}. 
Stopping pipeline.") - raise - - def notify_completion(self): - self.logger.info("Pipeline has completed successfully.") - - def log_performance_metric(self, task_id, start_time): - end_time = time.time() - duration = end_time - start_time - self.logger.info(f"Task {task_id} took {duration} seconds to start after receiving its input.") - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): # Wait for 1 second for an event - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - file_path = f"{self.base_dir}/input/{event.name}" - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=file_path) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file(self, **kwargs): - try: - ti = kwargs['ti'] - for event in self.inotify.read(timeout=100000): - if flags.CLOSE_WRITE in flags.from_mask(event.mask): - self.logger.info(f"File {event.name} has been written and closed in the input directory.") - ti.xcom_push(key='new_file_path', value=event.name) - return True - except Exception as e: - self.logger.error(f"Unexpected error while monitoring directory: {e}") - raise - self.logger.info("No new file events detected. Continuing to monitor...") - return False - - def check_new_file_sensor(self, **kwargs): - ti = kwargs['ti'] - pipeline.logger.info("Daemon process started for continuous file monitoring...") - - while True: - input_files = os.listdir(f'{pipeline.base_dir}/input') - - # Check if there are any files - if input_files: - # Get the oldest file - oldest_file = min([f"{pipeline.base_dir}/input/{f}" for f in input_files], key=os.path.getctime) - - if os.path.exists(oldest_file): - # Log and push to XCom - pipeline.logger.info(f"New file detected: {oldest_file}") - ti.xcom_push(key='new_file_path', value=oldest_file) - - # Allow subsequent tasks to run - return True - - # Sleep before next check to avoid high CPU usage - time.sleep(5) - -pipeline = DataPipeline() - -# DAG for processing DL0 and subsequent steps -with DAG('cosi_data_analysis_pipeline_v3o', default_args={'owner': 'airflow'}, schedule=None, - start_date=datetime.now(), - max_active_tasks=5, # Numero massimo di task eseguibili contemporaneamente per DAG - max_active_runs=4 # Numero massimo di istanze del DAG che possono essere eseguite contemporaneamente - ) as dag: - - #wait_for_new_file = PythonOperator( - # task_id='wait_for_new_file', - # python_callable=pipeline.check_new_file, - # provide_context=True - #) - - # ingest_and_store_dl0_task = PythonOperator( - # task_id='ingest_and_store_dl0', - # python_callable=pipeline.ingest_and_store_dl0, - # provide_context=True - # ) - - wait_for_new_file_sensor_task = PythonOperator( - task_id='wait_for_new_file_sensor_task', - python_callable=pipeline.check_new_file_sensor, - provide_context=True, - dag=dag - ) - - - - ingest_and_store_dl0_task_sensor = PythonOperator( - task_id='ingest_and_store_dl0_sensor', - python_callable=pipeline.ingest_and_store_dl0_sensor, - provide_context=True - ) - - - - generate_dl1a_task = PythonOperator( - task_id='generate_dl1a', - python_callable=pipeline.generate_dl1a, - provide_context=True - ) - - generate_dl1b_task = PythonOperator( - task_id='generate_dl1b', - python_callable=pipeline.generate_dl1b - ) - - generate_dl1c_task = 
PythonOperator( - task_id='generate_dl1c', - python_callable=pipeline.generate_dl1c - ) - - generate_dl2_task = PythonOperator( - task_id='generate_dl2', - python_callable=pipeline.generate_dl2 - ) - - fast_transient_stage_one_task = PythonOperator( - task_id='fast_transient_stage_one', - python_callable=pipeline.fast_transient_stage_one - ) - - fast_transient_stage_two_task = PythonOperator( - task_id='fast_transient_stage_two', - python_callable=pipeline.fast_transient_stage_two - ) - - fast_transient_stage_three_task = PythonOperator( - task_id='fast_transient_stage_three', - python_callable=pipeline.fast_transient_stage_three - ) - - wait_for_new_file_sensor_task >> ingest_and_store_dl0_task_sensor >> generate_dl1a_task >> generate_dl1b_task >> generate_dl1c_task >> generate_dl2_task >> [fast_transient_stage_one_task, fast_transient_stage_two_task] - fast_transient_stage_two_task >> fast_transient_stage_three_task diff --git a/dags/dag_parallel_test_1.py b/dags/dag_parallel_test_1.py new file mode 100644 index 0000000..9da55aa --- /dev/null +++ b/dags/dag_parallel_test_1.py @@ -0,0 +1,16 @@ +from airflow import DAG +from airflow.operators.bash import BashOperator +from datetime import datetime + +with DAG( + dag_id="dag_parallel_test_1", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + max_active_runs=2, + concurrency=3, + tags=["test", "parallel"] +) as dag: + + BashOperator(task_id="sleep_a", bash_command="sleep 60") + BashOperator(task_id="sleep_b", bash_command="sleep 60") diff --git a/dags/dag_parallel_test_2.py b/dags/dag_parallel_test_2.py new file mode 100644 index 0000000..0daec63 --- /dev/null +++ b/dags/dag_parallel_test_2.py @@ -0,0 +1,16 @@ +from airflow import DAG +from airflow.operators.bash import BashOperator +from datetime import datetime + +with DAG( + dag_id="dag_parallel_test_2", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + max_active_runs=2, + concurrency=3, + tags=["test", "parallel"] +) as dag: + + BashOperator(task_id="sleep_c", bash_command="sleep 60") + BashOperator(task_id="sleep_d", bash_command="sleep 60") diff --git a/dags/fail_task.py b/dags/fail_task.py new file mode 100644 index 0000000..4fbf44f --- /dev/null +++ b/dags/fail_task.py @@ -0,0 +1,32 @@ +from airflow import DAG +from airflow.operators.python import PythonOperator +from datetime import datetime + +import sys +import os +airflow_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow") +sys.path.append(os.path.join(airflow_home, "callbacks")) +from on_failure_callback import notify_email + +def failing_task(): + raise ValueError("This task fails.") + +with DAG( + 'dag_with_email_alert', + default_args={ + 'owner': 'airflow', + 'depends_on_past': False, + 'email_on_failure': True, + 'email_on_retry': False, + 'on_failure_callback': notify_email, + 'retries': 0, + }, + schedule_interval=None, + start_date=datetime(2025, 1, 1), + catchup=False, + tags=["test", "failure", "email_alert"] +) as dag: + fail = PythonOperator( + task_id='failing_task', + python_callable=failing_task + ) diff --git a/dags/gendl0.py b/dags/gendl0.py deleted file mode 100644 index 91f7f9b..0000000 --- a/dags/gendl0.py +++ /dev/null @@ -1,54 +0,0 @@ -from airflow import DAG -from airflow.operators.python_operator import PythonOperator -from airflow.utils.dates import days_ago -from datetime import timedelta -import os -import time -import csv -import random - -# Define a class to encapsulate data generation logic -class DataPipeline: - def __init__(self): - # 
Base directory for storing input data - self.base_dir = '/home/gamma/workspace/data' - - def generate_dl0_file(self): - # Create the input directory if it doesn't exist - input_directory = os.path.join(self.base_dir, 'input') - if not os.path.exists(input_directory): - os.makedirs(input_directory) - # Generate a filename using the current timestamp - filename = os.path.join(input_directory, f"dl0_{int(time.time())}.csv") - # Write random data to the CSV file - with open(filename, 'w', newline='') as file: - writer = csv.writer(file) - # Write header row - writer.writerow(["parameter1", "parameter2", "parameter3"]) - # Write 100 rows of random float values - for _ in range(100): - writer.writerow([random.random() for _ in range(3)]) - -# Instantiate the data pipeline -pipeline = DataPipeline() - -# Define default arguments for the Airflow DAG -default_args = { - 'owner': 'airflow', - 'start_date': days_ago(1), - 'retries': 1, - 'retry_delay': timedelta(minutes=1), -} - -# Define a DAG that periodically generates DL0 data -with DAG( - 'generate_dl0_data', - default_args=default_args, - schedule_interval=timedelta(seconds=10), # Runs every 10 seconds - catchup=False, -) as generate_dl0_dag: - # Create a task that calls the generate_dl0_file function - generate_dl0_task = PythonOperator( - task_id='generate_dl0_file', - python_callable=pipeline.generate_dl0_file - ) diff --git a/env/.env.example b/env/.env.example new file mode 100644 index 0000000..261e00d --- /dev/null +++ b/env/.env.example @@ -0,0 +1,37 @@ +# BOOTSTRAP ID user for container +UID= +GID= +DISPLAY= + +#-------- AIRFLOW Environment Variables +AIRFLOW_ADMIN_USERNAME=admin +AIRFLOW_ADMIN_EMAIL=admin@localhost +# Write here the secure password for airflow Web UI +AIRFLOW_ADMIN_PASSWORD= + +#-------- AIRFLOW Alert Settings +ALERT_USERS_LIST_PATH=/home/gamma/env/alert_users.yaml + +#-------- SMTP Settings +ALERT_SMTP_SERVER=mailhog +ALERT_SMTP_PORT=1025 +ALERT_EMAIL_SENDER=donotreply@cosiflow.alert.errors.it +ALERT_LOG_PATH=/home/gamma/workspace/log/data_pipeline.log +AIRFLOW__SMTP__SMTP_STARTTLS=False +AIRFLOW__SMTP__SMTP_SSL=False + +#-------- SMTP Settings for MailHog +MAILHOG_WEBUI_URL=http://localhost:8025 + +#-------- COSI Directory Structure +#-------- Base directories +COSI_DATA_DIR=/home/gamma/workspace/data +COSI_INPUT_DIR=/home/gamma/workspace/data/input +COSI_LOG_DIR=/home/gamma/workspace/log + +#-------- Main data type directories +COSI_OBS_DIR=/home/gamma/workspace/data/obs +COSI_TRANSIENT_DIR=/home/gamma/workspace/data/transient +COSI_TRIGGER_DIR=/home/gamma/workspace/data/trigger +COSI_MAPS_DIR=/home/gamma/workspace/data/maps +COSI_SOURCE_DIR=/home/gamma/workspace/data/source \ No newline at end of file diff --git a/env/Dockerfile b/env/Dockerfile deleted file mode 100644 index f6dac44..0000000 --- a/env/Dockerfile +++ /dev/null @@ -1,77 +0,0 @@ -FROM oraclelinux:8 AS oracle8 - -# ---------------------------------- Installing dependencies as root ---------------------------------- -RUN dnf install -y wget epel-release git cmake3 gcc-c++ gcc binutils \ -compat-openssl10 libX11-devel libXpm-devel libXft-devel libXext-devel \ -gsl-devel openssl-devel wget bzip2-devel libffi-devel xz-devel sqlite-devel \ -ncurses ncurses-devel make xz libzstd libzstd-devel which rsync \ -nmap-ncat chrony - -RUN dnf install -y oracle-epel-release-el8 -RUN dnf config-manager --enable ol8_codeready_builder -RUN dnf install -y hdf5 hdf5-devel - -# ---------------------------------- Create gamma ---------------------------------- -RUN 
useradd gamma -USER gamma -WORKDIR /home/gamma -RUN mkdir -p /home/gamma/env -COPY environment.yml /home/gamma/env -COPY requirements.txt /home/gamma/env -SHELL ["/bin/bash", "--login", "-c"] - -USER root - -# ARM processors (Mac) -# Definisci la variabile per il file Miniconda -ARG MINICONDA=Miniconda3-latest-Linux-aarch64.sh -# INTEL/AMD processors -ARG MINICONDA=Miniconda3-latest-Linux-x86_64.sh - -# Scarica Miniconda utilizzando la variabile -RUN wget https://repo.anaconda.com/miniconda/$MINICONDA \ - && chmod +x $MINICONDA \ - && ./$MINICONDA -b -p /opt/conda \ - && rm $MINICONDA - -RUN chown -R gamma:gamma /home/gamma - -USER gamma - -COPY entrypoint.sh /home/gamma/entrypoint.sh - -RUN export PATH=$PATH:/opt/conda/bin && conda config --append channels conda-forge && conda config --set channel_priority strict - -RUN export PATH=$PATH:/opt/conda/bin && conda create -n gamma python=3.12 pip psycopg2 jupyter numpy scipy matplotlib pandas plotly scikit-learn tensorboard tensorflow inotify_simple -y - -RUN export PATH=$PATH:/opt/conda/bin && source activate gamma && export AIRFLOW_VERSION=2.10.3 && \ - export PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" && \ - export CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" && \ - pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" - - -RUN export PATH=$PATH:/opt/conda/bin && conda create -n cosipy python=3.10 pip jupyter notebook && source activate cosipy && git clone https://github.com/cositools/cosipy.git && cd cosipy && git checkout develop && pip install -e . - -RUN mkdir -p ${HOME}/airflow - -USER root -RUN mkdir /shared_dir -RUN chown -R gamma:gamma /shared_dir -RUN mkdir /data01 -RUN chown -R gamma:gamma /data01 -RUN mkdir /data02 -RUN chown -R gamma:gamma /data02 -RUN chmod +x /home/gamma/entrypoint.sh - -USER gamma -RUN mkdir /home/gamma/workspace -#dir to run pipeline -RUN mkdir /home/gamma/workspace/data -RUN mkdir /home/gamma/workspace/data/input -RUN mkdir /home/gamma/workspace/heasarc -RUN mkdir /home/gamma/workspace/log - - -ENV PATH="/opt/conda/bin:$PATH" - -#ENTRYPOINT ["/home/gamma/entrypoint.sh"] diff --git a/env/Dockerfile.airflow b/env/Dockerfile.airflow new file mode 100644 index 0000000..71e2060 --- /dev/null +++ b/env/Dockerfile.airflow @@ -0,0 +1,127 @@ +# ============================================================================= +# Dockerfile.airflow — Miniconda (multi-arch), silent ToS, robust UID/GID +# Base: OracleLinux 8 | Works on amd64 and arm64 (Apple Silicon) +# ============================================================================= + +FROM oraclelinux:8 AS oracle8 + +# ------------------------------ Base dependencies (installed as root) ------------------------------ +# Install core system packages and scientific libraries required by Airflow and COSIPY. 
+RUN set -eux; \ + dnf install -y \ + wget curl ca-certificates gnupg2 tar bzip2 xz unzip which rsync \ + git cmake3 gcc-c++ gcc make binutils \ + compat-openssl10 openssl-devel \ + libX11-devel libXpm-devel libXft-devel libXext-devel \ + gsl-devel bzip2-devel libffi-devel xz-devel sqlite-devel \ + ncurses ncurses-devel xz libzstd libzstd-devel \ + nmap-ncat chrony; \ + dnf install -y oracle-epel-release-el8; \ + dnf config-manager --enable ol8_codeready_builder; \ + dnf install -y hdf5 hdf5-devel; \ + dnf clean all; rm -rf /var/cache/dnf/* + +# ------------------------------ Non-root user (robust with existing UID/GID) ------------------------------ +# Create a user called "gamma" that matches the host UID/GID to avoid permission issues when mounting volumes. +ARG UID= +ARG GID= +# On macOS, use: --build-arg UID=501 --build-arg GID=20 +RUN set -eux; \ + if getent group "${GID}" >/dev/null; then \ + GN="$(getent group "${GID}" | cut -d: -f1)"; \ + else \ + groupadd -g "${GID}" gamma; GN=gamma; \ + fi; \ + if id -u gamma >/dev/null 2>&1; then \ + usermod -u "${UID}" -g "${GID}" gamma; \ + else \ + useradd -m -u "${UID}" -g "${GID}" -s /bin/bash gamma; \ + fi; \ + mkdir -p /home/gamma && chown -R "${UID}:${GID}" /home/gamma +ENV HOME=/home/gamma +WORKDIR /home/gamma + +# ------------------------------ Copy environment/config files (if available) ------------------------------ +# These are optional configuration or dependency files mounted or baked into the image. +RUN mkdir -p /home/gamma/env +COPY environment.yml /home/gamma/env/ +COPY requirements.txt /home/gamma/env/ +COPY alert_users.yaml /home/gamma/env/ +COPY .env /home/gamma/env/ + +# ------------------------------ Install Miniconda (multi-architecture support) ------------------------------ +# Automatically download the right Miniconda installer for amd64 or arm64 architectures. +ARG TARGETARCH +RUN set -eux; \ + case "${TARGETARCH}" in \ + amd64) MINICONDA=Miniconda3-latest-Linux-x86_64.sh ;; \ + arm64) MINICONDA=Miniconda3-latest-Linux-aarch64.sh ;; \ + *) echo "Unsupported TARGETARCH: ${TARGETARCH}"; exit 1 ;; \ + esac; \ + wget https://repo.anaconda.com/miniconda/$MINICONDA \ + && chmod +x $MINICONDA \ + && ./$MINICONDA -b -p /opt/conda \ + && rm $MINICONDA + +# Fix ownership of the gamma home directory after installation +RUN chown -R ${UID}:${GID} /home/gamma + +# Switch to non-root user for all subsequent steps +USER gamma + +# ------------------------------ Copy entrypoint script ------------------------------ +# This script initializes Airflow (creates admin user, DB, starts webserver, etc.) +COPY entrypoint-airflow.sh /home/gamma/entrypoint-airflow.sh + +# ------------------------------ Conda configuration ------------------------------ +# Add conda-forge channel and set strict channel priority for reproducible builds. +RUN export PATH=$PATH:/opt/conda/bin && conda config --append channels conda-forge && conda config --set channel_priority strict + +# ------------------------------ Accept Conda Terms of Service ------------------------------ +# Required to use the official Anaconda repositories. Done non-interactively. +RUN export PATH=$PATH:/opt/conda/bin && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main +RUN export PATH=$PATH:/opt/conda/bin && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r + +# ------------------------------ Create main "gamma" environment ------------------------------ +# Main Python environment for Airflow and related tools. 
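# Runtime note (illustrative): the entrypoint script activates this environment
# with `source activate gamma`; the same pattern works from an interactive shell
# inside the running container, e.g.
#   docker compose exec airflow bash -lc "source activate gamma && airflow version"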
+RUN export PATH=$PATH:/opt/conda/bin && conda create -n gamma python=3.12 pip psycopg2 jupyter numpy scipy matplotlib pandas plotly scikit-learn tensorboard tensorflow inotify_simple -y + +# ------------------------------ Install Apache Airflow ------------------------------ +# Install Airflow using official constraint files to ensure version compatibility. +RUN export PATH=$PATH:/opt/conda/bin && source activate gamma && export AIRFLOW_VERSION=2.10.3 && \ + export PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" && \ + export CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" && \ + pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" + +# ------------------------------ Create secondary "cosipy" environment ------------------------------ +# Dedicated environment for COSIPY tools and dependencies. +RUN export PATH=$PATH:/opt/conda/bin && conda create -n cosipy python=3.10 pip jupyter notebook && source activate cosipy && pip install py7zr && git clone https://github.com/cositools/cosipy.git && cd cosipy && git checkout v0.3.x && pip install -e . + +# ------------------------------ Create "cosipy_develop" environment ------------------------------ +# Development environment for COSIPY tools and dependencies (develop branch). +RUN export PATH=$PATH:/opt/conda/bin && conda create -n cosipy_develop python=3.10 pip jupyter notebook -y && source activate cosipy_develop && pip install py7zr && git clone https://github.com/cositools/cosipy.git cosipy_develop && cd cosipy_develop && git checkout develop && pip install -e . + +# ------------------------------ Prepare Airflow home directory ------------------------------ +RUN mkdir -p ${HOME}/airflow + +# ------------------------------ Create shared directories and fix permissions ------------------------------ +USER root +RUN mkdir /shared_dir +RUN chown -R ${UID}:${GID} /shared_dir +RUN mkdir /data01 +RUN chown -R ${UID}:${GID} /data01 +RUN mkdir /data02 +RUN chown -R ${UID}:${GID} /data02 +RUN chown -R ${UID}:${GID} /home/gamma/entrypoint-airflow.sh +RUN chmod +x /home/gamma/entrypoint-airflow.sh + +# ------------------------------ Switch back to non-root user ------------------------------ +USER gamma +RUN mkdir /home/gamma/workspace + +# ------------------------------ Environment variables and PATH ------------------------------ +ENV PATH="/opt/conda/bin:$PATH" + +# ------------------------------ Default entrypoint (disabled) ------------------------------ +# Uncomment to automatically start Airflow when the container runs +# ENTRYPOINT ["/home/gamma/entrypoint-airflow.sh"] diff --git a/env/airflow.cfg.postgresql b/env/airflow.cfg.postgresql index 99bf743..e0cfc1b 100644 --- a/env/airflow.cfg.postgresql +++ b/env/airflow.cfg.postgresql @@ -499,7 +499,7 @@ alembic_ini_file_path = alembic.ini #sql_alchemy_conn = sqlite:////shared_dir/airflow/airflow.db #for LocalExecutor -sql_alchemy_conn = postgresql+psycopg2://airflow_user:secure_password@cosi_postgres/airflow_db +sql_alchemy_conn = postgresql+psycopg2://airflow_user:secure_password@postgres:5432/airflow_db # Extra engine specific keyword args passed to SQLAlchemy's create_engine, as a JSON-encoded value diff --git a/env/alert_users.yaml b/env/alert_users.yaml new file mode 100644 index 0000000..22859bd --- /dev/null +++ b/env/alert_users.yaml @@ -0,0 +1,20 @@ +groups: + pipeline_dev: + description: Pipeline developers team + emails: 
+ - riccardo.falco@inaf.it + - admin@localhost + # - dev2@cosi.it + + qa_team: + description: Quality Assurance + emails: + - riccardo.falco@inaf.it + # - qa@cosi.it + +rules: + - pattern: "ALERT_FAIL" + notify: ["pipeline_dev"] + + - pattern: "TriggerDagRunOperator" + notify: ["qa_team"] diff --git a/env/bootstrap.sh b/env/bootstrap.sh new file mode 100755 index 0000000..38185ba --- /dev/null +++ b/env/bootstrap.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -euo pipefail + +if [ $# -ne 0 ]; then + printf "\n\033[31mUsage: bootstrap.sh (no arguments)\033[0m\n\n" + exit 1 +fi + +MY_UID="$(id -u)" +MY_GID="$(id -g)" + +echo "${MY_UID}" +echo "${MY_GID}" + +echo -e "\n[INFO] Starting containers with UID=${MY_UID} and GID=${MY_GID}..." + +docker compose build --build-arg UID=${MY_UID} --build-arg GID=${MY_GID} +# docker compose up \ No newline at end of file diff --git a/env/docker-compose.yaml b/env/docker-compose.yaml index 542c4ce..9bf3306 100644 --- a/env/docker-compose.yaml +++ b/env/docker-compose.yaml @@ -1,35 +1,122 @@ - +version: "3.9" services: + # --------------------------------------------------------------------------- + # POSTGRES DATABASE SERVICE + # --------------------------------------------------------------------------- postgres: - image: postgres + image: postgres:15 container_name: cosi_postgres + # The default Postgres image runs as user "postgres". + # You usually should NOT override this with your local UID/GID. + # user: "${UID}:${GID}" + environment: + # These values initialize the PostgreSQL database cluster. + # The first initialization only happens if the data volume is empty. + - UID=${UID} + - GID=${GID} + - POSTGRES_USER=airflow_user # Username for Airflow DB + - POSTGRES_DB=airflow_db # Database name for Airflow + - POSTGRES_PASSWORD=secure_password # Password for Airflow DB + + # Health check to verify that PostgreSQL is accepting connections + healthcheck: + test: ["CMD-SHELL", "pg_isready -h 127.0.0.1 -p 5432 -U $${POSTGRES_USER} -d $${POSTGRES_DB}"] + interval: 5s # Run every 5 seconds + timeout: 5s # Fail if command takes longer than 5 seconds + retries: 10 # Mark container as unhealthy after 10 failed checks + + volumes: + # Local directory that persists the database files + # Commented line for a user-level data directory: + # - ${HOME}/postgres_data:/var/lib/postgresql/data + - ../data/postgres_data:/var/lib/postgresql/data + + # restart: always # Uncomment for auto-restart on failure + # --------------------------------------------------------------------------- + # AIRFLOW SERVICE + # --------------------------------------------------------------------------- airflow: - image: airflow:1.1.0 + image: cosiflow-airflow:native + build: + context: . # Build context for Dockerfile + dockerfile: Dockerfile.airflow container_name: cosi_airflow + + # Run the container as the "gamma" user created inside the image. + # This matches the user configured in your Dockerfile.
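    # Quick check (illustrative): `docker compose exec airflow id` should report
    # the "gamma" user created in Dockerfile.airflow.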
+ user: "gamma" + environment: + # User and group IDs passed to match host permissions (optional) + - UID=${UID} + - GID=${GID} + + # Airflow configuration - AIRFLOW_HOME=/home/gamma/airflow - - DISPLAY=${DISPLAY} - - AIRFLOW__CORE__LOAD_EXAMPLES=False + - DISPLAY=${DISPLAY} # Needed if you run GUI apps (e.g., plots) + - AIRFLOW__CORE__LOAD_EXAMPLES=False # Disable default example DAGs + + # Mail alert system (MailHog) + - ALERT_SMTP_SERVER=mailhog + - ALERT_SMTP_PORT=1025 + - ALERT_EMAIL_SENDER=donotreply@cosiflow.alert.errors.it + + # Extra environment variables loaded from .env file in this directory + env_file: + - .env + volumes: - - ../dags:/home/gamma//airflow/dags + # DAGs, plugins, and pipeline directories are mounted so you can edit + # them live without rebuilding the image. + - ../dags:/home/gamma/airflow/dags + - ../plugins:/home/gamma/airflow/plugins + - ../pipeline:/home/gamma/airflow/pipeline + - ../callbacks:/home/gamma/airflow/callbacks + - ../modules:/home/gamma/airflow/modules + + # Mount the PostgreSQL-specific Airflow configuration - ./airflow.cfg.postgresql:/home/gamma/airflow/airflow.cfg + + # X11 socket for graphical display forwarding (if needed) - /tmp/.X11-unix:/tmp/.X11-unix:rw - - ${HOME}/cosiflow:/shared_dir + + # Shared directory between host and container + - ..:/shared_dir + + # Data workspace directory for processing large files + - ../data:/home/gamma/workspace/data + + ports: + - "8080:8080" # Airflow Web UI + - "28888:28888" # Jupyter Notebook (if enabled) + + depends_on: + postgres: + # Wait until the Postgres healthcheck passes before starting Airflow + condition: service_healthy + + # restart: always # Uncomment if you want automatic restarts + + # Custom entrypoint script that initializes Airflow DB and starts services + entrypoint: ["bash", "/home/gamma/entrypoint-airflow.sh"] + # Alternative entrypoint for debugging: + # entrypoint: ["tail", "-f", "/dev/null"] + + # --------------------------------------------------------------------------- + # MAILHOG SERVICE (Fake SMTP server for local testing) + # --------------------------------------------------------------------------- + mailhog: + image: mailhog/mailhog + container_name: cosi_mailhog ports: - - "8080:8080" - - "28888:28888" #jupyter notebook - #restart: always - entrypoint: ["bash", "/home/gamma/entrypoint.sh"] - #entrypoint: ["tail", "-f", "/dev/null"] + - "1025:1025" # SMTP port for Airflow alerts + - "8025:8025" # Web UI → http://localhost:8025 +# --------------------------------------------------------------------------- +# NAMED VOLUMES (Optional - not used directly since host paths are mounted) +# --------------------------------------------------------------------------- volumes: postgres_data: diff --git a/env/entrypoint-airflow.sh b/env/entrypoint-airflow.sh new file mode 100644 index 0000000..a266c15 --- /dev/null +++ b/env/entrypoint-airflow.sh @@ -0,0 +1,109 @@ +#!/bin/bash +#set -euo pipefail + +cd /home/gamma + +ENV_FILE="/home/gamma/env/.env" + +if [ ! 
-f "$ENV_FILE" ]; then + echo "❌ Missing .env file at $ENV_FILE" + echo "👉 Please create the file with the following structure:" + echo "" + echo "AIRFLOW_ADMIN_USERNAME=admin" + echo "AIRFLOW_ADMIN_EMAIL=admin@localhost" + echo "AIRFLOW_ADMIN_PASSWORD=yourpassword" + echo "" + echo "ALERT_SMTP_SERVER=mailhog" + echo "ALERT_SMTP_PORT=1025" + echo "ALERT_EMAIL_SENDER=donotreply@cosiflow.alert.errors.it" + exit 1 +fi + +# Load environment variables +set -o allexport +source "$ENV_FILE" +set +o allexport + +# Check required variables +if [ -z "${AIRFLOW_ADMIN_USERNAME:-}" ] || [ -z "${AIRFLOW_ADMIN_EMAIL:-}" ] || [ -z "${AIRFLOW_ADMIN_PASSWORD:-}" ]; then + echo "❌ Missing one or more required environment variables in $ENV_FILE" + exit 1 +fi + +# Export SMTP settings for Airflow if present +if [ -n "${ALERT_SMTP_SERVER:-}" ]; then + export AIRFLOW__SMTP__SMTP_HOST="$ALERT_SMTP_SERVER" +fi + +if [ -n "${ALERT_SMTP_PORT:-}" ]; then + export AIRFLOW__SMTP__SMTP_PORT="$ALERT_SMTP_PORT" +fi + +if [ -n "${ALERT_EMAIL_SENDER:-}" ]; then + export AIRFLOW__SMTP__SMTP_MAIL_FROM="$ALERT_EMAIL_SENDER" +fi + +# Always use this email backend +export AIRFLOW__EMAIL__EMAIL_BACKEND=airflow.utils.email.send_email_smtp + +# Export COSI directory structure environment variables if present +if [ -n "${COSI_DATA_DIR:-}" ]; then + export COSI_DATA_DIR="$COSI_DATA_DIR" +fi + +if [ -n "${COSI_OBS_DIR:-}" ]; then + export COSI_OBS_DIR="$COSI_OBS_DIR" +fi + +if [ -n "${COSI_TRANSIENT_DIR:-}" ]; then + export COSI_TRANSIENT_DIR="$COSI_TRANSIENT_DIR" +fi + +if [ -n "${COSI_TRIGGER_DIR:-}" ]; then + export COSI_TRIGGER_DIR="$COSI_TRIGGER_DIR" +fi + +if [ -n "${COSI_MAPS_DIR:-}" ]; then + export COSI_MAPS_DIR="$COSI_MAPS_DIR" +fi + +if [ -n "${COSI_SOURCE_DIR:-}" ]; then + export COSI_SOURCE_DIR="$COSI_SOURCE_DIR" +fi + +if [ -n "${COSI_INPUT_DIR:-}" ]; then + export COSI_INPUT_DIR="$COSI_INPUT_DIR" +fi + +if [ -n "${COSI_LOG_DIR:-}" ]; then + export COSI_LOG_DIR="$COSI_LOG_DIR" +fi + +# Create COSI directory structure if not present +mkdir -p $COSI_DATA_DIR/{obs,transient,trigger,maps,source} + +# Activate conda environment +source activate gamma +export PATH="$PATH:~/.local/bin" +echo "✅ Environment activated." + +# Initialize Airflow DB +airflow db init + +# Create admin user if not present +if ! airflow users list | grep -q "$AIRFLOW_ADMIN_USERNAME"; then + airflow users create \ + --username "$AIRFLOW_ADMIN_USERNAME" \ + --firstname COSI \ + --lastname Admin \ + --role Admin \ + --email "$AIRFLOW_ADMIN_EMAIL" \ + --password "$AIRFLOW_ADMIN_PASSWORD" + echo "✅ Admin user created." +else + echo "ℹ️ Admin user already exists. Skipping creation." 
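    # Existing accounts can be listed at any time with `airflow users list`
    # (the same command used by the check above).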
+fi + +# Start webserver (in background) and scheduler +airflow webserver --port 8080 & +airflow scheduler diff --git a/env/entrypoint.sh b/env/entrypoint.sh deleted file mode 100644 index 63ffe6e..0000000 --- a/env/entrypoint.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -cd /home/gamma -source activate gamma -export PATH="$PATH:~/.local/bin" -echo $PATH -airflow standalone diff --git a/env/environment.yml b/env/environment.yml index 070c95c..de1971b 100644 --- a/env/environment.yml +++ b/env/environment.yml @@ -6,6 +6,6 @@ dependencies: - root=6.26 - root_base=6.26 - pip - - python=3.10 # Specifica la versione di Python se necessario + - python=3.10 # Specify the Python version if needed - pip: - cosipy diff --git a/modules/README.md b/modules/README.md new file mode 100644 index 0000000..b8069cb --- /dev/null +++ b/modules/README.md @@ -0,0 +1,269 @@ +# CosiDAG + +A high-level reactive DAG template for filesystem-driven scientific workflows + +## Overview + +`COSIDAG` is a convenience subclass of Airflow’s `DAG` designed to simplify the creation of **file-driven scientific pipelines**. +It encapsulates a standard five-step workflow pattern: + +1. **check_new_file** + Monitors one or more folders, searching for new subdirectories. + Handles date filtering, basename filtering, depth traversal, and stability checks. + Pushes the detected folder path into XCom and tracks processed folders via Airflow Variables. + +2. **automatic_retrig** + Immediately triggers a new run of the same DAG, so the sensor keeps watching for fresh data. + This enables near-real-time reactive pipelines. + +3. **resolve_inputs** *(optional)* + If `file_patterns` is provided, the module automatically scans the detected folder, resolves filenames, and pushes the results to XCom using user-defined keys. + +4. **custom tasks** + A user-provided `build_custom(dag)` function can attach any processing tasks (analysis, binning, model execution, visualization). + These tasks use the values produced by steps 1–3 via `xcom_pull`. + +5. **show_results** + Logs a homepage URL (read from an environment variable) and optionally builds a deep link referencing the detected folder. + +This structure removes 80–90% of the boilerplate typically involved in writing dynamic pipelines while ensuring consistency across future COSI workflows (e.g. **TSMap**, **Light Curve**). + +--- + +## Why use CosiDAG? + +CosiDAG solves common problems in scientific workflows: + +* **Dynamic file discovery** — your pipeline reacts automatically to new folders dropped on disk. +* **No hard-coded filenames** — files are resolved automatically using regex patterns. +* **Unified behavior** — TSMap, Light Curve, and other pipelines share the same structure. +* **Clean separation of infrastructure vs science code** — COSIDAG handles monitoring, deduplication, and XCom logic; the user only implements the scientific tasks. +* **Consistent task orchestration** — every pipeline follows the same five-step DAG layout. + +Additionally, CosiDAG integrates seamlessly with the **MailHog link plugin**, which captures and exposes exception emails directly in the Airflow UI. +This means that mailbox-based alerting and debugging works **out-of-the-box** with all CosiDAG-derived workflows, without requiring extra configuration. 
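Putting these pieces together, the layout that `COSIDAG` wires for you can be pictured as follows (a conceptual sketch based on the five-step list above; optional steps simply disappear when they are disabled):

```python
# Task layout created automatically by COSIDAG (conceptual, not user code):
#
#   check_new_file >> automatic_retrig >> resolve_inputs >> [your custom tasks] >> show_results
#
# check_new_file exists only when monitoring folders are configured,
# automatic_retrig only when auto-retriggering is enabled, and resolve_inputs
# only when file_patterns is provided; the chain adapts to the tasks that exist.
```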
+ +--- + +## Minimal Example + +```python +from datetime import datetime +from cosidag import COSIDAG +from airflow.operators.python import ExternalPythonOperator + +def build_custom(dag): + + # Pull runtime-discovered folder and file paths + RUN_DIR = "{{ ti.xcom_pull('check_new_file', key='detected_folder') }}" + RESPONSE = "{{ ti.xcom_pull('resolve_inputs', key='response_file') }}" + + def compute(run_dir: str, response_file: str): + print("Running analysis...", run_dir, response_file) + + compute_task = ExternalPythonOperator( + task_id="compute_step", + python="/path/to/external/env/bin/python", + python_callable=compute, + op_kwargs={"run_dir": RUN_DIR, "response_file": RESPONSE}, + dag=dag, + ) + + return [compute_task] + +with COSIDAG( + dag_id="example_cosidag", + schedule_interval=None, + start_date=datetime(2025, 1, 1), + monitoring_folders=["/data/incoming"], + file_patterns={ + "response_file": r"Response.*\.h5" + }, + select_policy="latest_mtime", + only_basename="products", + prefer_deepest=True, + idle_seconds=5, + build_custom=build_custom, + tags=["example"], +): + pass +``` + +--- + +## How CosiDAG Passes Data Between Tasks + +CosiDAG uses **XCom** to pass runtime-discovered paths to the user-defined tasks. + +* `check_new_file` pushes the detected folder: + + ``` + key="detected_folder" + ``` +* `resolve_inputs` pushes files matched by regex patterns, using the corresponding keys: + + ``` + "response_file": "/path/to/Response_003.h5" + ``` + +User tasks retrieve them via: + +```python +"{{ ti.xcom_pull('check_new_file', key='detected_folder') }}" +"{{ ti.xcom_pull('resolve_inputs', key='response_file') }}" +``` + +This allows pipelines to be fully dynamic and independent of hard-coded paths. + +--- + +## Configuration Parameters + +| Parameter | Type | Description | +| ----------------------------- | --------------------------- | ------------------------------------------------------------ | +| `monitoring_folders` | list[str] | Folders to scan for new data. | +| `level` | int | Directory depth to scan. | +| `date` | str/int | Accept only folders with this date. | +| `date_queries` | str | Query expression for date filtering (e.g. `==20251119`). | +| `only_basename` | str | Accept only folders with this basename (e.g., `"products"`). | +| `prefer_deepest` | bool | Selects deepest matching subfolder. | +| `min_files` | int | Minimum number of files required before accepting a folder. | +| `idle_seconds` | int | Seconds to wait for the folder to “settle” (no live writes). | +| `ready_marker` | str | Marker file required for folder acceptance. | +| `home_env_var` | str | ENV var containing the UI base URL. | +| `file_patterns` | dict[str, str] | Mapping XCom key → regex pattern for auto file resolution. | +| `select_policy` | `"latest_mtime"`, `"first"` | Strategy for resolving multiple matches. | +| `default_args_extra` | dict | Additional default args for tasks. | +| `tags` | list[str] | Airflow UI tags. | +| `auto_retrig` | bool | Enables real-time monitoring. | +| `processed_variable` | str | Name of the Airflow Variable storing processed paths. | +| `builder_fn` / `build_custom` | callable | Function that attaches user-defined tasks. | +| `xcom_detected_key` | str | XCom key for detected folder path. | + +--- + +## Configuring and Running a CosiDAG from the Airflow UI + +Once a CosiDAG script is defined, **you do not need to modify the Python file** to run the pipeline on different datasets. 
+Instead, Airflow’s Trigger UI allows you to dynamically set: + +* monitoring folders +* date filters +* file patterns +* selection policy +* any custom configuration values defined in your DAG parameters + +This makes CosiDAG pipelines fully reusable: **the same code can be triggered dozens of times with different inputs**, without editing the script. + +To run a CosiDAG on new data: + +1. Open the DAG in the Airflow Web UI +2. Click **Trigger DAG** +3. Fill in the configuration form (folder paths, patterns, etc.) +4. Click **Trigger** + +Every run will process a different dataset with identical logic. + +--- + +## Processed Folder Tracking (Airflow Variable) + +Every CosiDAG keeps track of previously processed folders using an Airflow Variable named: + +``` +COSIDAG_PROCESSED::<dag_id> +``` + +This prevents the pipeline from reprocessing the same folder unless explicitly requested. + +### Viewing the stored folders + +From the CLI: + +```bash +airflow variables get COSIDAG_PROCESSED::<dag_id> +``` + +### Clearing the list (e.g. to reprocess everything) + +```bash +airflow variables set COSIDAG_PROCESSED::<dag_id> "[]" +``` + +### Removing the variable entirely + +```bash +airflow variables delete COSIDAG_PROCESSED::<dag_id> +``` + +These commands allow you to “reset” the monitoring history at any time. + +--- + +## Disabling Automatic Retrigger + +By default, CosiDAG enables **automatic retriggering** (`auto_retrig=True`), meaning the DAG keeps running in a loop to continuously watch for new folders. + +You can disable this behavior by setting: + +```python +auto_retrig=False +``` + +or by exposing it as a configurable parameter and turning it off in the Airflow UI. + +When retriggering is disabled: + +* The DAG will **not** restart automatically +* You can manually rerun the pipeline on a folder that was already processed +* This is useful for **re-analysis**, debugging, or running multiple configurations on the same dataset + +Disabling retriggering and clearing the processed-variable list lets you fully reprocess any folder without modifying the DAG code. + +--- + +## MailHog Link Plugin (Exception Visibility) + +CosiDAG integrates cleanly with the Airflow **MailHog link plugin**, a small extension that: + +* intercepts exception emails generated by Airflow, +* displays a direct link to the captured email next to the failed task in the Airflow UI. + +While you don’t need to configure anything manually, it is useful to know that: + +* When a CosiDAG-based pipeline fails, any email alerts triggered by Airflow’s email backend will appear in the MailHog UI. +* COSIFLOW’s environment already includes MailHog and the plugin, so notifications are automatically routed and linked. +* This provides faster debugging and lowers the cost of diagnosing failed tasks. + +You do **not** need to interact with MailHog directly; CosiDAG DAGs simply benefit from it.
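Besides the Trigger UI form, the same runtime overrides can be passed from the command line. A hypothetical trigger of the `example_cosidag` DAG from the minimal example, using conf keys that `check_new_file` reads at runtime (`monitoring_folders`, `date_queries`, `only_basename`), could look like this:

```bash
# Illustrative only: the dag_id and paths are the ones used in the Minimal Example above.
airflow dags trigger example_cosidag \
  --conf '{"monitoring_folders": ["/data/incoming"], "date_queries": ">=2025-11-01", "only_basename": "products"}'
```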
+ +--- + +## When to Use CosiDAG + +CosiDAG is ideal when your workflow: + +* **should react to new data** appearing in a filesystem, +* requires robust **folder validation**, +* should **run analysis scripts in external Python environments**, +* must be: + - **easy to maintain** + - **easy to extend** + - **easy to reuse across different scientific pipelines** + +Examples include: + +* TSMap pipeline +* Light Curve pipeline +* SimData ingestion +* Any workflow triggered by incoming instrument data + +--- + +## Summary + +* CosiDAG pipelines are reusable and configurable directly from the Airflow UI +* No need to modify the DAG script to process new datasets +* Processed folders are stored in `COSIDAG_PROCESSED::<dag_id>` +* CLI commands allow viewing, clearing, or deleting this history +* The automatic retrigger step can be disabled to purposely re-run old folders \ No newline at end of file diff --git a/modules/cosidag.py b/modules/cosidag.py new file mode 100644 index 0000000..16a2c0f --- /dev/null +++ b/modules/cosidag.py @@ -0,0 +1,741 @@ +""" +COSIDAG — a convenience DAG subclass that wires a standard layout: + + 1) check_new_file -> 2) automatic_retrig -> 3) resolve_inputs -> 4) [custom tasks] -> 5) show_results + +This implementation supports disabling optional steps: + +- check_new_file is NOT created if monitoring_folders is empty / None. +- automatic_retrig is NOT created if auto_retrig is False. + +The chaining logic adapts automatically depending on which tasks exist. + +Notes +------ +* Requires Airflow 2.x. +* Environment: define COSIFLOW_HOME_URL (in your .env) to point to the web UI homepage. +* State: a Variable named f"COSIDAG_PROCESSED::{dag_id}" is used to track processed + folder paths across runs, to avoid reprocessing the same folder. + * To clear the processed folder paths, reset the Variable to an empty list with: + airflow variables set COSIDAG_PROCESSED::{dag_id} "[]" +* Date queries: use date_queries (e.g. '>=2025-11-01' or ['>=2025-11-01','<=2025-11-05']). +* Only basename: if only_basename is provided, it only accepts subfolders with the given basename. +* Prefer deepest: if prefer_deepest is True, it prefers the deepest subfolder. +* File patterns: if file_patterns is provided, it searches for files matching the given patterns + using glob recursion and selects according to select_policy. +* Path helper: if available, the module cosiflow.modules.path is used to parse/build + URL fragments from detected folders. The code degrades gracefully if not found.
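* Example (illustrative, mirroring the README minimal example): with
    file_patterns={"response_file": r"Response.*\.h5"} and select_policy="latest_mtime",
    the newest file matching the pattern inside the detected folder is pushed to
    XCom under the key "response_file".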
+""" +from __future__ import annotations + +import os +import json +import re +import time +from datetime import datetime +from typing import Callable, Iterable, Optional, Sequence + +from airflow import DAG +from airflow.models import Variable +from airflow.operators.empty import EmptyOperator +from airflow.operators.python import PythonOperator +from airflow.sensors.python import PythonSensor +from airflow.utils.trigger_rule import TriggerRule +from airflow.operators.trigger_dagrun import TriggerDagRunOperator + +# ---- Optional path utils -------------------------------------------------------- +try: + from cosiflow.modules.path import PathInfo, build_url_fragment # type: ignore +except Exception: + PathInfo = None # type: ignore + build_url_fragment = None # type: ignore + +# ---- Import on-failure callback ------------------------------------------------- +import sys + +airflow_home = os.environ.get("AIRFLOW_HOME", "/opt/airflow") +sys.path.append(os.path.join(airflow_home, "callbacks")) +sys.path.append(os.path.join(airflow_home, "modules")) +from on_failure_callback import notify_email # type: ignore + +from date_helper import _looks_like_date_folder, _parse_date_string, _apply_date_queries # type: ignore + +_BASE_DEFAULT_ARGS = { + "owner": "cosiflow", + "email_on_failure": True, + "on_failure_callback": notify_email, # from callbacks/on_failure_callback.py +} + +# --- Public config helpers (Airflow Variable -> ENV -> default) ----------------- +try: + from airflow.models import Variable as _AFVariable +except Exception: + _AFVariable = None + + +def cfg(key: str, default=None): + """Read config from Airflow Variable, then ENV, else default.""" + val = None + if _AFVariable is not None: + try: + val = _AFVariable.get(key) + except Exception: + val = None + if val is None: + val = os.environ.get(key, default) + return val + + +def cfg_int(key: str, default: int) -> int: + v = cfg(key, default) + try: + return int(v) + except Exception: + return default + + +def cfg_float(key: str, default: float) -> float: + v = cfg(key, default) + try: + return float(v) + except Exception: + return default + + +def cfg_bool(key: str, default: bool = False) -> bool: + v = cfg(key, None) + if isinstance(v, bool): + return v + if v is None: + return default + return str(v).strip().lower() in {"1", "true", "t", "yes", "y", "on"} + + +# ----- Helper functions (MUST stay at module top-level) -------------------------- + + +def _dir_stats(path: str): + """Return (count, total_size_bytes, latest_mtime) across all files under path.""" + count = 0 + total = 0 + latest = 0.0 + for root, _, files in os.walk(path): + for fn in files: + fp = os.path.join(root, fn) + try: + st = os.stat(fp) + except FileNotFoundError: + continue + count += 1 + total += st.st_size + if st.st_mtime > latest: + latest = st.st_mtime + return count, total, latest + + +def _is_dir_stable(path: str, idle_seconds: int, min_files: int) -> bool: + """True if dir has >= min_files and last write is older than idle_seconds.""" + count, _, latest = _dir_stats(path) + if count < min_files: + return False + return (time.time() - latest) >= idle_seconds + + +def _normalize_folders(monitoring_folders: Iterable[str]) -> Sequence[str]: + """Return absolute existing directories; ignore non-existing.""" + if isinstance(monitoring_folders, (str, os.PathLike)): + candidates = [str(monitoring_folders)] + else: + candidates = [str(p) for p in monitoring_folders] + out = [] + for p in candidates: + ap = os.path.abspath(os.path.expanduser(p)) + if 
os.path.isdir(ap): + out.append(ap) + return out + + +def _iter_subfolders(root: str, max_depth: int) -> Iterable[str]: + """Yield subfolders under root up to max_depth (depth 1 = direct children).""" + root_depth = root.rstrip(os.sep).count(os.sep) + for current_root, dirs, _ in os.walk(root): + current_depth = current_root.rstrip(os.sep).count(os.sep) - root_depth + if current_depth > max_depth: + dirs[:] = [] + continue + if current_depth >= 1: + yield current_root + + +def _date_filter_ok(path: str, date_queries) -> bool: + """ + Accept path if its 'reference date' (folder name or mtime) satisfies ALL queries. + + date_queries can be: + - None -> always True + - string like '>=2025-11-01' + - list of strings ['>=2025-11-01', '<=2025-11-05'] + """ + print(f"[COSIDAG] _date_filter_ok: path={path}, date_queries={date_queries}") + if not date_queries: + return True + + last = os.path.basename(os.path.normpath(path)) + + # 1) Try parsing date from folder name (YYYYMMDD[_...] or YYYY-MM-DD[_...]) + ref_date = None + if _looks_like_date_folder(last): + ds = last.split("_")[0] + try: + ref_date = _parse_date_string(ds) + except Exception as e: + print(f"[COSIDAG] _date_filter_ok: failed to parse folder date {ds!r}: {e}") + + # 2) Fallback to mtime date + if ref_date is None: + try: + ref_date = datetime.fromtimestamp(os.stat(path).st_mtime).date() + except Exception as e: + print(f"[COSIDAG] _date_filter_ok: failed to get mtime for {path}: {e}") + # If we cannot determine a reference date, do not filter out for safety. + return True + + return _apply_date_queries(ref_date, date_queries) + + +def _load_processed_set(dag_id: str) -> set: + """Load processed paths set from Airflow Variable.""" + key = f"COSIDAG_PROCESSED::{dag_id}" + raw = Variable.get(key, default_var="[]") + try: + return set(json.loads(raw)) + except Exception: + return set() + + +def _save_processed_set(dag_id: str, processed: set) -> None: + """Save processed paths set to Airflow Variable.""" + key = f"COSIDAG_PROCESSED::{dag_id}" + Variable.set(key, json.dumps(sorted(processed))) + + +def _find_new_folder( + monitoring_folders: Iterable[str], + level: int, + dag_id: str, + date_queries: Optional[str | list[str]] = None, + only_basename: Optional[str] = None, + prefer_deepest: bool = True, +) -> Optional[str]: + """Return the first new folder across roots (filtered & depth-limited).""" + print( + "[COSIDAG] _find_new_folder: searching for new folders " + f"(dag_id={dag_id}, level={level}, date_queries={date_queries}, only_basename={only_basename})" + ) + roots = _normalize_folders(monitoring_folders) + if not roots: + print("[COSIDAG] _find_new_folder: no valid monitoring folders found") + return None + + print(f"[COSIDAG] _find_new_folder: monitoring {len(roots)} root folder(s): {', '.join(roots)}") + processed = _load_processed_set(dag_id) + print(f"[COSIDAG] _find_new_folder: loaded {len(processed)} already processed folder(s)") + + candidates: list[str] = [] + for root in sorted(roots): + subfolders = list(_iter_subfolders(root, max_depth=level)) # materialize once + print(f"[COSIDAG] _find_new_folder: found {len(subfolders)} subfolder(s) in {root} (max_depth={level})") + for sub in subfolders: + if only_basename and os.path.basename(sub) != only_basename: + continue + if _date_filter_ok(sub, date_queries): + candidates.append(sub) + + if not candidates: + print("[COSIDAG] _find_new_folder: no candidates found after filtering") + return None + + print(f"[COSIDAG] _find_new_folder: {len(candidates)} candidate folder(s) 
after filtering") + + # Prefer deeper paths first + if prefer_deepest: + candidates.sort(key=lambda p: (p.count(os.sep), p), reverse=True) + print("[COSIDAG] _find_new_folder: sorted candidates by depth (deepest first)") + else: + candidates.sort() + print("[COSIDAG] _find_new_folder: sorted candidates alphabetically") + + for path in candidates: + if path not in processed: + print(f"[COSIDAG] _find_new_folder: found new folder: {path}") + return path + + print(f"[COSIDAG] _find_new_folder: all {len(candidates)} candidate(s) already processed") + return None + + +# ---- COSIDAG -------------------------------------------------------------------- + + +class COSIDAG(DAG): + """ + DAG subclass that wires: + check_new_file -> automatic_retrig -> resolve_inputs -> [custom] -> show_results + + Optional steps can be disabled: + - check_new_file is not created if monitoring_folders is empty. + - automatic_retrig is not created if auto_retrig is False. + """ + + def __init__( + self, + monitoring_folders, + level: int = 1, + date: Optional[str] = None, + date_queries: Optional[str | list[str]] = None, + build_custom: Optional[Callable[[DAG], None]] = None, + sensor_poke_seconds: int = 30, + sensor_timeout_seconds: int = 60 * 60 * 6, + home_env_var: str = "COSIFLOW_HOME_URL", + idle_seconds: int = 20, + min_files: int = 1, + ready_marker: Optional[str] = None, + only_basename: Optional[str] = None, + prefer_deepest: bool = True, + file_patterns: Optional[dict] = None, # {"xcom_key": "glob_pattern", ...} + select_policy: str = "first", # "first" | "latest_mtime" + tags: Optional[list[str]] = None, + default_args_extra: Optional[dict] = None, + auto_retrig: bool = True, + *args, + **kwargs, + ) -> None: + # --- merge default_args --- + # priority: kwargs.default_args < _BASE_DEFAULT_ARGS < default_args_extra + base = dict(_BASE_DEFAULT_ARGS) + if "default_args" in kwargs and kwargs["default_args"]: + base.update(kwargs["default_args"]) # allows override from caller + if default_args_extra: + base.update(default_args_extra) # extensions/override requested + + # ensure that DAG receives the final default_args + kwargs["default_args"] = base + + # --- merge tags --- + existing_tags = list(kwargs.get("tags", []) or []) + merged_tags = sorted(set((tags or []) + existing_tags)) + if merged_tags: + kwargs["tags"] = merged_tags + + super().__init__(*args, **kwargs) + + # Decide whether monitoring is enabled (task existence, not just runtime behavior). 
+ self.has_monitoring = bool(monitoring_folders) + + # Base params (can be overridden by dag_run.conf at runtime) + self.params.update( + { + "monitoring_folders": monitoring_folders, + "level": int(level), + "date": date, + "date_queries": date_queries, + "home_env_var": home_env_var, + "idle_seconds": int(idle_seconds), + "min_files": int(min_files), + "ready_marker": ready_marker, + "only_basename": only_basename, + "prefer_deepest": bool(prefer_deepest), + "file_patterns": file_patterns, + "select_policy": select_policy, + "max_active_runs": int(kwargs.get("max_active_runs", 2)), + "max_active_tasks": int(kwargs.get("max_active_tasks", 8)), + "concurrency": int(kwargs.get("concurrency", 8)), + "auto_retrig": bool(auto_retrig), + } + ) + + self.auto_retrig = bool(auto_retrig) + + print( + "[COSIDAG] enabled: " + f"check_new_file={self.has_monitoring}, " + f"automatic_retrig={self.auto_retrig}, " + f"resolve_inputs={bool(file_patterns)}" + ) + + # --------------------------------------------------------------------- + # 1) check_new_file — PythonSensor (optional) + # --------------------------------------------------------------------- + + def _sensor_poke(ti, **context): + conf = (context.get("dag_run").conf or {}) if context.get("dag_run") else {} + monitoring = conf.get("monitoring_folders", self.params["monitoring_folders"]) + level_val = int(conf.get("level", self.params["level"])) + + # Date queries: runtime conf has precedence. + conf_date_queries = conf.get("date_queries", None) + if conf_date_queries is None: + # fallback: use the optional "date" as '==date' + conf_date = conf.get("date", self.params.get("date")) + if conf_date: + conf_date_queries = f"=={conf_date}" + else: + conf_date_queries = self.params.get("date_queries") + + idle_s = int(conf.get("idle_seconds", self.params.get("idle_seconds", 20))) + min_f = int(conf.get("min_files", self.params.get("min_files", 1))) + marker = conf.get("ready_marker", self.params.get("ready_marker")) + only_bn = conf.get("only_basename", self.params.get("only_basename")) + prefer_deep = bool(conf.get("prefer_deepest", self.params.get("prefer_deepest", True))) + + new_path = _find_new_folder( + monitoring_folders=monitoring, + level=level_val, + date_queries=conf_date_queries, + dag_id=self.dag_id, + only_basename=only_bn, + prefer_deepest=prefer_deep, + ) + + print(f"[COSIDAG] _sensor_poke: new_path={new_path}") + if not new_path: + return False + + print(f"[COSIDAG] _sensor_poke: marker={marker}") + if marker: + marker_path = os.path.join(new_path, marker) + if not os.path.exists(marker_path): + return False + + print(f"[COSIDAG] _sensor_poke: idle_seconds={idle_s}, min_files={min_f}") + if not _is_dir_stable(new_path, idle_seconds=idle_s, min_files=min_f): + return False + + print("[COSIDAG] _sensor_poke: pushing detected_folder to XCom") + ti.xcom_push(key="detected_folder", value=new_path) + processed = _load_processed_set(self.dag_id) + processed.add(new_path) + _save_processed_set(self.dag_id, processed) + return True + + check_new_file = None + if self.has_monitoring: + check_new_file = PythonSensor( + task_id="check_new_file", + poke_interval=sensor_poke_seconds, + timeout=sensor_timeout_seconds, + mode="poke", + python_callable=_sensor_poke, + dag=self, + ) + self.check_new_file = check_new_file + else: + print("[COSIDAG] monitoring_folders empty → check_new_file disabled") + self.check_new_file = None + + # --------------------------------------------------------------------- + # 2) automatic_retrig — Trigger this same DAG 
again (optional) + # --------------------------------------------------------------------- + + def _unique_run_id() -> str: + ts = datetime.utcnow().strftime("%Y%m%dT%H%M%S%fZ") + return f"auto::{self.dag_id}::{ts}" + + automatic_retrig = None + if self.auto_retrig: + import inspect + + trig_kwargs = { + "task_id": "automatic_retrig", + "trigger_dag_id": self.dag_id, + "reset_dag_run": False, + "wait_for_completion": False, + "dag": self, + } + + # Propagate conf from previous run — must be valid JSON. + trig_kwargs["conf"] = "{{ dag_run.conf | tojson if dag_run and dag_run.conf else '{}' }}" + + # Airflow version differences + params = inspect.signature(TriggerDagRunOperator.__init__).parameters + if "trigger_run_id" in params: + trig_kwargs["trigger_run_id"] = _unique_run_id() + elif "run_id" in params: + trig_kwargs["run_id"] = _unique_run_id() + + automatic_retrig = TriggerDagRunOperator(**trig_kwargs) + self.automatic_retrig = automatic_retrig + else: + self.automatic_retrig = None + + # --------------------------------------------------------------------- + # 3) resolve_inputs (optional) + # --------------------------------------------------------------------- + resolve_inputs = None + if file_patterns: + import glob + from airflow.exceptions import AirflowFailException + + def _resolve_inputs(**context): + ti = context["ti"] + dag_run = context.get("dag_run") + conf = (dag_run.conf or {}) if dag_run else {} + + # Prefer XCom from check_new_file, but allow manual runs by passing detected_folder in conf. + run_dir = None + if check_new_file is not None: + run_dir = ti.xcom_pull(task_ids="check_new_file", key="detected_folder") + if not run_dir: + run_dir = conf.get("detected_folder") + + if not run_dir or not os.path.isdir(run_dir): + raise AirflowFailException(f"[resolve_inputs] invalid run_dir: {run_dir}") + + def pick_one(paths: list[str]) -> Optional[str]: + if not paths: + return None + if select_policy == "first": + return sorted(paths)[0] + if select_policy == "latest_mtime": + return max(paths, key=lambda p: os.stat(p).st_mtime) + return sorted(paths)[0] + + for key, pattern in file_patterns.items(): + matches = sorted(glob.glob(os.path.join(run_dir, "**", pattern), recursive=True)) + chosen = pick_one(matches) + if not chosen: + raise AirflowFailException( + f"[resolve_inputs] no file for key={key!r} pattern={pattern!r} under {run_dir}" + ) + ti.xcom_push(key=key, value=chosen) + print(f"[resolve_inputs] {key} = {chosen}") + + # Also republish run_dir for convenience. 
+ ti.xcom_push(key="run_dir", value=run_dir) + print(f"[resolve_inputs] run_dir = {run_dir}") + + resolve_inputs = PythonOperator( + task_id="resolve_inputs", + python_callable=_resolve_inputs, + dag=self, + ) + + # --------------------------------------------------------------------- + # 4) [custom] — Let users append their tasks (optional) + # --------------------------------------------------------------------- + + before_tasks = set(self.task_dict.keys()) + if callable(build_custom): + build_custom(self) + after_tasks = set(self.task_dict.keys()) + new_ids = sorted(after_tasks - before_tasks) + + if new_ids: + new_set = set(new_ids) + new_tasks = [self.task_dict[t] for t in new_ids] + + roots, leaves = [], [] + for t in new_tasks: + ups = {u.task_id for u in t.upstream_list} + if ups.isdisjoint(new_set): + roots.append(t) + for t in new_tasks: + downs = {d.task_id for d in t.downstream_list} + if downs.isdisjoint(new_set): + leaves.append(t) + + last_custom = EmptyOperator(task_id="custom_anchor", dag=self) + for t in leaves: + t >> last_custom + else: + # No custom tasks created + roots = [] + last_custom = EmptyOperator(task_id="custom_placeholder", dag=self) + + # --------------------------------------------------------------------- + # 5) show_results — Log homepage and optional deep link + # --------------------------------------------------------------------- + + def _show_results(**context): + ti = context["ti"] + dag_run = context.get("dag_run") + conf = (dag_run.conf or {}) if dag_run else {} + + # ------------------------------------------------- + # 1) Retrieve detected folder + # ------------------------------------------------- + detected = None + + if check_new_file is not None: + detected = ti.xcom_pull( + task_ids="check_new_file", + key="detected_folder" + ) + else: + detected = ti.xcom_pull( + key="detected_folder" + ) + + # Allow manual runs + if not detected: + detected = conf.get("detected_folder") + + if not detected: + print( + "[COSIDAG] No detected folder available " + "(monitoring disabled and no dag_run.conf['detected_folder'])" + ) + return None + + # ------------------------------------------------- + # 2) Build deep-link URL (if possible) + # ------------------------------------------------- + homepage = os.environ.get( + self.params.get("home_env_var", "COSIFLOW_HOME_URL") + ) or os.environ.get("COSIFLOW_HOME_URL") + + url = None + if homepage: + # ⚠️ Adapt this base path to your filesystem layout + DATA_ROOT = "/home/gamma/workspace/data" + rel = detected.replace(DATA_ROOT, "").lstrip("/") + url = f"{homepage.rstrip('/')}/folder/{rel}" + + # ------------------------------------------------- + # 3) Push structured result to XCom (canonical output) + # ------------------------------------------------- + result = { + "folder": detected, + "url": url, + } + + ti.xcom_push( + key="cosidag_result", + value=result, + ) + + # ------------------------------------------------- + # 4) Human-friendly logs + # ------------------------------------------------- + print("=" * 80) + print("📂 COSIDAG RESULT") + print(f"Folder: {detected}") + if url: + print(f"URL: {url}") + else: + print("URL: ") + print("=" * 80) + + # ------------------------------------------------- + # 5) Optional deep-link via PathInfo (fallback / enrichment) + # ------------------------------------------------- + if not url and PathInfo is not None: + try: + info = PathInfo.from_path(detected) # type: ignore[attr-defined] + if callable(build_url_fragment) and homepage: + frag = 
build_url_fragment(info) # type: ignore + deep = f"{homepage.rstrip('/')}/{frag.lstrip('/')}" + print(f"[COSIDAG] Result page (PathInfo): {deep}") + except Exception as e: + print(f"[COSIDAG] Deep-linking via PathInfo failed: {e}") + + # ------------------------------------------------- + # 6) Return value (kept for backward compatibility) + # ------------------------------------------------- + return result + + + show_results = PythonOperator( + task_id="show_results", + python_callable=_show_results, + trigger_rule=TriggerRule.ALL_DONE, + dag=self, + ) + + # --------------------------------------------------------------------- + # Wiring — Build a robust chain depending on what exists. + # --------------------------------------------------------------------- + + # Start anchor: the last "pre-custom" task that exists. + anchor = None + + if check_new_file is not None and automatic_retrig is not None: + check_new_file >> automatic_retrig + anchor = automatic_retrig + elif check_new_file is not None: + anchor = check_new_file + elif automatic_retrig is not None: + # Note: without check_new_file, retrigger is still allowed (manual DAG that loops), + # but it is usually not recommended unless you pass detected_folder in conf. + anchor = automatic_retrig + + # Optional resolve_inputs comes after anchor if anchor exists; otherwise it can run standalone. + if resolve_inputs is not None: + if anchor is not None: + anchor >> resolve_inputs + anchor = resolve_inputs + + # Attach custom roots after anchor if possible. + if roots and anchor is not None: + for t in roots: + anchor >> t + + # Close chain into show_results + if anchor is not None: + anchor >> last_custom >> show_results + else: + # No monitoring, no retrigger, no resolve_inputs: run custom (or placeholder) then show results. + last_custom >> show_results + + # Expose handles + self.show_results = show_results + + def find_file_by_pattern(self, pattern: str, detected_folder: str) -> Optional[str]: + """Find the first file matching the given regex pattern under detected_folder.""" + print(f"[COSIDAG] find_file_by_pattern: pattern={pattern}, detected_folder={detected_folder}") + rx = re.compile(pattern) + for root, _, files in os.walk(detected_folder): + for fname in files: + if rx.search(fname): + return os.path.join(root, fname) + return None + + +# ------------------------------ Example usage ------------------------------------ +# Put the following into your DAG file under the Airflow 'dags/' directory. 
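+# The optional file_patterns argument is omitted below for brevity: it maps XCom keys to glob
+# patterns resolved under the detected folder, e.g. file_patterns={"grb_signal": "*_unbinned_*.fits*"}
+# (pattern shown for illustration only). For each key, resolve_inputs pushes the selected match to
+# XCom according to select_policy ("first" or "latest_mtime").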
+
+#
+# from datetime import datetime
+# from cosiflow.cosidag import COSIDAG
+# from airflow.operators.python import PythonOperator
+#
+# def build_custom(dag):
+#     def _process_folder(folder_path: str):
+#         print(f"Processing folder: {folder_path}")
+#
+#     PythonOperator(
+#         task_id="custom_process",
+#         python_callable=lambda ti, **_: _process_folder(
+#             ti.xcom_pull(task_ids="check_new_file", key="detected_folder")
+#         ),
+#         dag=dag,
+#     )
+#
+# with COSIDAG(
+#     dag_id="cosipipe_example",
+#     start_date=datetime(2025, 1, 1),
+#     schedule_interval=None,
+#     catchup=False,
+#     monitoring_folders=["/data/incoming", "/data/alt"],
+#     level=3,
+#     only_basename="products",
+#     idle_seconds=30,
+#     min_files=1,
+#     date=None,
+#     auto_retrig=False,  # disable retrigger
+#     build_custom=build_custom,
+# ) as dag:
+#     pass
+#
+# Manual run without monitoring:
+#     monitoring_folders=[]
+#     and trigger with dag_run.conf = {"detected_folder": "/path/to/process"}
diff --git a/modules/date_helper.py b/modules/date_helper.py
new file mode 100644
index 0000000..d4579d4
--- /dev/null
+++ b/modules/date_helper.py
@@ -0,0 +1,61 @@
+from datetime import datetime, date
+import re
+import os
+
+
+def _looks_like_date_folder(name: str) -> bool:
+    return bool(
+        re.match(r"^\d{8}(?:_|$)", name) or re.match(r"^\d{4}-\d{2}-\d{2}(?:_|$)", name)
+    )
+
+def _parse_date_string(s: str) -> date:
+    """Parse 'YYYYMMDD' or 'YYYY-MM-DD' to datetime.date."""
+    s = s.strip()
+    if re.match(r"^\d{8}$", s):
+        return datetime.strptime(s, "%Y%m%d").date()
+    return datetime.strptime(s, "%Y-%m-%d").date()
+
+
+def _parse_date_query(q: str):
+    """
+    Parse a query like '>=2025-11-01' to (op, date).
+    op ∈ {'==', '>=', '<=', '>', '<'}; if missing → '=='.
+    """
+    q = q.strip()
+    op = "=="
+    for candidate in ("==", ">=", "<=", ">", "<"):
+        if q.startswith(candidate):
+            op = candidate
+            q = q[len(candidate):].strip()
+            break
+    d = _parse_date_string(q)
+    return op, d
+
+
+def _apply_date_queries(d: date, queries) -> bool:
+    """Return True if the date d satisfies ALL queries."""
+    if not queries:
+        return True
+
+    if isinstance(queries, str):
+        queries = [queries]
+
+    for q in queries:
+        try:
+            op, target = _parse_date_query(q)
+        except Exception as e:
+            print(f"[COSIDAG] _apply_date_queries: parse error for {q!r}: {e}; ignoring condition")
+            continue
+
+        if op == "==" and not (d == target):
+            return False
+        if op == ">=" and not (d >= target):
+            return False
+        if op == "<=" and not (d <= target):
+            return False
+        if op == ">" and not (d > target):
+            return False
+        if op == "<" and not (d < target):
+            return False
+
+    return True
diff --git a/modules/paths.py b/modules/paths.py
new file mode 100644
index 0000000..96ad355
--- /dev/null
+++ b/modules/paths.py
@@ -0,0 +1,148 @@
+from __future__ import annotations
+import os
+import re
+from dataclasses import dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Optional, Tuple, Iterable
+
+# === Config ===
+DATA_ROOT = Path(os.getenv("COSIFLOW_DATA_ROOT", "cosi/data"))
+YYYY_MM_RE = re.compile(r"^(?P<year>\d{4})_(?P<month>0[1-9]|1[0-2])$")
+
+# === Enums ===
+class Domain(str, Enum):
+    obs = "obs"
+    transient = "transient"
+    trigger = "trigger"
+    maps = "maps"
+    source = "source"
+
+class ObsLeaf(str, Enum):
+    auxil = "auxil"
+    compton = "compton"
+    acs = "acs"
+    bto = "bto"
+
+class CommonLeaf(str, Enum):
+    plots = "plots"
+    products = "products"
+
+# === Dataclass ===
+@dataclass(frozen=True)
+class PathInfo:
+    domain: Domain
+    year: Optional[int] = None
+    month: Optional[int] = None
+    entity_id: 
Optional[str] = None
+    leaf: Optional[str] = None
+    remainder: Tuple[str, ...] = ()
+
+# === Helpers ===
+def _ym(year: int, month: int) -> str:
+    if not (1 <= month <= 12):
+        raise ValueError(f"Invalid month: {month}")
+    return f"{year:04d}_{month:02d}"
+
+def ensure_dir(p: Path) -> Path:
+    p.mkdir(parents=True, exist_ok=True)
+    return p
+
+def list_files(p: Path, glob: str = "*") -> Iterable[Path]:
+    return p.glob(glob)
+
+# === Specific builders ===
+def obs_leaf_path(year: int, month: int, obs_id: str, leaf: ObsLeaf, *rel: str) -> Path:
+    return DATA_ROOT / "obs" / _ym(year, month) / obs_id / leaf.value / Path(*rel)
+
+def transient_path(year: int, month: int, transient_id: str, leaf: CommonLeaf, *rel: str) -> Path:
+    return DATA_ROOT / "transient" / _ym(year, month) / transient_id / leaf.value / Path(*rel)
+
+def trigger_path(year: int, month: int, trigger_id: str, leaf: CommonLeaf, *rel: str) -> Path:
+    return DATA_ROOT / "trigger" / _ym(year, month) / trigger_id / leaf.value / Path(*rel)
+
+def maps_path(year: int, month: int, *rel: str) -> Path:
+    return DATA_ROOT / "maps" / _ym(year, month) / Path(*rel)
+
+def source_path(src_id: str, year: int, month: int, leaf: CommonLeaf, *rel: str) -> Path:
+    return DATA_ROOT / "source" / src_id / _ym(year, month) / leaf.value / Path(*rel)
+
+# === Generic builder ===
+def build_path(
+    domain: Domain,
+    *,
+    year: Optional[int] = None,
+    month: Optional[int] = None,
+    entity_id: Optional[str] = None,
+    leaf: Optional[str] = None,
+    rel: Tuple[str, ...] = (),
+) -> Path:
+    """Build a path conforming to the directory schema shown in the figure."""
+    if domain == Domain.obs:
+        if None in (year, month, entity_id, leaf):
+            raise ValueError("obs requires year, month, obs_id and leaf")
+        return obs_leaf_path(year, month, entity_id, ObsLeaf(leaf), *rel)
+
+    if domain in (Domain.transient, Domain.trigger):
+        if None in (year, month, entity_id, leaf):
+            raise ValueError(f"{domain.value} requires year, month, id and leaf")
+        fn = transient_path if domain == Domain.transient else trigger_path
+        return fn(year, month, entity_id, CommonLeaf(leaf), *rel)
+
+    if domain == Domain.maps:
+        if None in (year, month):
+            raise ValueError("maps requires year and month")
+        return maps_path(year, month, *rel)
+
+    if domain == Domain.source:
+        if None in (entity_id, year, month, leaf):
+            raise ValueError("source requires src_id, year, month, leaf")
+        return source_path(entity_id, year, month, CommonLeaf(leaf), *rel)
+
+    raise ValueError(f"Unsupported domain: {domain}")
+
+# === Parser ===
+def parse_path(p: Path) -> PathInfo:
+    """Parse a path and return a PathInfo with domain, year, month, id and leaf."""
+    parts = p.parts[p.parts.index("data")+1:] if "data" in p.parts else p.parts
+    domain = Domain(parts[0])
+
+    if domain == Domain.obs:
+        ym, entity_id, leaf, *rem = parts[1:]
+        m = YYYY_MM_RE.match(ym)
+        return PathInfo(domain, int(m["year"]), int(m["month"]), entity_id, leaf, tuple(rem))
+
+    if domain in (Domain.transient, Domain.trigger):
+        ym, entity_id, leaf, *rem = parts[1:]
+        m = YYYY_MM_RE.match(ym)
+        return PathInfo(domain, int(m["year"]), int(m["month"]), entity_id, leaf, tuple(rem))
+
+    if domain == Domain.maps:
+        ym, *rem = parts[1:]
+        m = YYYY_MM_RE.match(ym)
+        return PathInfo(domain, int(m["year"]), int(m["month"]), None, None, tuple(rem))
+
+    if domain == Domain.source:
+        src_id, ym, leaf, *rem = parts[1:]
+        m = YYYY_MM_RE.match(ym)
+        return PathInfo(domain, int(m["year"]), int(m["month"]), src_id, leaf, tuple(rem))
+
+    raise ValueError(f"Unsupported domain: 
{domain}") + +# === Utility extra === +def file_path(domain: Domain, *, year=None, month=None, entity_id=None, leaf=None, filename: str) -> Path: + """Restituisce il path completo di un file con nome arbitrario.""" + dirpath = build_path(domain, year=year, month=month, entity_id=entity_id, leaf=leaf) + ensure_dir(dirpath) + return dirpath / filename + +def first_match(domain: Domain, *, year=None, month=None, entity_id=None, leaf=None, pattern: str = "*") -> Optional[Path]: + """Restituisce il primo file che combacia con un pattern nella directory canonica.""" + dirpath = build_path(domain, year=year, month=month, entity_id=entity_id, leaf=leaf) + return next(dirpath.glob(pattern), None) + +def route_dest_for(src: Path, domain: Domain, *, year=None, month=None, entity_id=None, leaf=None) -> Path: + """Costruisce il percorso di destinazione mantenendo il nome originale del file.""" + dirpath = build_path(domain, year=year, month=month, entity_id=entity_id, leaf=leaf) + ensure_dir(dirpath) + return dirpath / src.name diff --git a/pipeline/README.md b/pipeline/README.md new file mode 100644 index 0000000..9bd0636 --- /dev/null +++ b/pipeline/README.md @@ -0,0 +1,90 @@ +# Pipeline + +## Overview + +All example pipelines (Light Curve, TSMap, etc.) are initialized and triggered **through the Airflow Web UI** using the **`init_pipelines` DAG**. + +Each scientific pipeline is implemented as a dedicated **COSIDAG** (e.g. Light Curve, TSMap), while `init_pipelines` acts as a **single entry point** responsible for initialization and routing. + +The `init_pipelines` DAG is responsible for: + +* staging all required inputs (response, orientation, source, background) +* validating paths and configuration +* selecting which scientific pipeline to run (`lcurve`, `tsmap`, …) +* defining **where outputs are stored** (Light Curve or TSMap folders) +* triggering the appropriate downstream COSIDAG automatically + +**No manual scripts are required anymore to start individual pipelines.** + +--- + +## Starting a pipeline (NEW workflow) + +### Step 1 — Enable the scientific COSIDAG + +Before triggering any pipeline, make sure that the **target COSIDAG** is enabled in the Airflow UI: + +* enable `cosipipe_lightcurve` for Light Curve products +* enable `cosipipe_tsmap` for TS Map products + +This is required only once (or after a DAG refresh). + +--- + +### Step 2 — Use the `init_pipelines` DAG + +To start **any pipeline**, follow these steps: + +1. Open the **Airflow Web UI** + +2. Enable the DAG named **`init_pipelines`** + +3. Click **Trigger DAG** + +4. Fill in the required parameters: + + * `response_path` + * `orientation_path` + * `source_path` + * `background_path` + * `destination` → choose where outputs will be saved: + + * `lcurve` → results stored in the Light Curve pipeline folder + * `tsmap` → results stored in the TS Map pipeline folder + * other optional parameters (e.g. time windows) + +5. Click **Trigger** + +That’s it. +The selected COSIDAG will be instantiated and executed automatically, with outputs routed to the chosen destination. + +--- + +## Supported pipelines + +| Destination value | Pipeline started | Output location | +| ----------------- | -------------------- | ------------------ | +| `lcurve` | Light Curve plotting | Light Curve folder | +| `tsmap` | TS Map generation | TS Map folder | + +More destinations can be added without changing the user workflow. 
+ +--- + +## What changed (important) + +### Old workflow (deprecated) + +* Manual scripts such as: + + * `start_lcurvepipe.sh` + * `start_tsmappipe.sh` +* Manual triggering of individual pipeline DAGs + +### New workflow (current) + +* **Single entry point:** `init_pipelines` +* Explicit activation of the target COSIDAG +* Fully UI-driven configuration +* Output destination selected at trigger time +* Cleaner, reproducible, and closer to production usage \ No newline at end of file diff --git a/pipeline/binning_script/Bin_DC3_GRB_data_O3/bin_grb.py b/pipeline/binning_script/Bin_DC3_GRB_data_O3/bin_grb.py new file mode 100644 index 0000000..0a65d4d --- /dev/null +++ b/pipeline/binning_script/Bin_DC3_GRB_data_O3/bin_grb.py @@ -0,0 +1,7 @@ +from cosipy import BinnedData + +analysis = BinnedData("inputs.yaml") + +analysis.get_binned_data(unbinned_data = "GRB_bn081207680_3months_unbinned_data_filtered_with_SAAcut.fits.gz", + output_name = "GRB_bn081207680_binned_O3", + psichi_binning = "local") diff --git a/pipeline/binning_script/Bin_DC3_GRB_data_O3/bin_grb__galactic.py b/pipeline/binning_script/Bin_DC3_GRB_data_O3/bin_grb__galactic.py new file mode 100644 index 0000000..d90c70b --- /dev/null +++ b/pipeline/binning_script/Bin_DC3_GRB_data_O3/bin_grb__galactic.py @@ -0,0 +1,7 @@ +from cosipy import BinnedData + +analysis = BinnedData("inputs__galactic.yaml") + +analysis.get_binned_data(unbinned_data = "GRB_bn081207680_3months_unbinned_data_filtered_with_SAAcut.fits", + output_name = "GRB_bn081207680_binned_O3", + psichi_binning = "local") diff --git a/pipeline/binning_script/Bin_DC3_GRB_data_O3/inputs.yaml b/pipeline/binning_script/Bin_DC3_GRB_data_O3/inputs.yaml new file mode 100644 index 0000000..9bcdba5 --- /dev/null +++ b/pipeline/binning_script/Bin_DC3_GRB_data_O3/inputs.yaml @@ -0,0 +1,15 @@ +#----------# +# Data I/O: + +# data files available on the COSI Sharepoint: https://drive.google.com/drive/folders/1UdLfuLp9Fyk4dNussn1wt7WEOsTWrlQ6 +data_file: "GRB_bn081207680_3months_unbinned_data_filtered_with_SAAcut.fits.gz" # full path +ori_file: "NA" # full path +unbinned_output: 'fits' # 'fits' or 'hdf5' +time_bins: 1 # time bin size in seconds. Takes int, float, or list of bin edges. +energy_bins: [100., 158.489, 251.189, 398.107, 630.957, 1000., 1584.89, 2511.89, 3981.07, 6309.57, 10000.] # Takes list. Needs to match response. +phi_pix_size: 6 # binning of Compton scattering anlge [deg] +nside: 8 # healpix binning of psi chi local +scheme: 'ring' # healpix binning of psi chi local +tmin: 1836496300.00 # Min time cut in seconds. +tmax: 1836496389.00 # Max time cut in seconds. +#----------# diff --git a/pipeline/binning_script/Bin_DC3_GRB_data_O3/inputs__galactic.yaml b/pipeline/binning_script/Bin_DC3_GRB_data_O3/inputs__galactic.yaml new file mode 100644 index 0000000..9bcdba5 --- /dev/null +++ b/pipeline/binning_script/Bin_DC3_GRB_data_O3/inputs__galactic.yaml @@ -0,0 +1,15 @@ +#----------# +# Data I/O: + +# data files available on the COSI Sharepoint: https://drive.google.com/drive/folders/1UdLfuLp9Fyk4dNussn1wt7WEOsTWrlQ6 +data_file: "GRB_bn081207680_3months_unbinned_data_filtered_with_SAAcut.fits.gz" # full path +ori_file: "NA" # full path +unbinned_output: 'fits' # 'fits' or 'hdf5' +time_bins: 1 # time bin size in seconds. Takes int, float, or list of bin edges. +energy_bins: [100., 158.489, 251.189, 398.107, 630.957, 1000., 1584.89, 2511.89, 3981.07, 6309.57, 10000.] # Takes list. Needs to match response. 
+phi_pix_size: 6 # binning of Compton scattering anlge [deg] +nside: 8 # healpix binning of psi chi local +scheme: 'ring' # healpix binning of psi chi local +tmin: 1836496300.00 # Min time cut in seconds. +tmax: 1836496389.00 # Max time cut in seconds. +#----------# diff --git a/pipeline/binning_script/Bin_DC3_background_O3/bin_bg.py b/pipeline/binning_script/Bin_DC3_background_O3/bin_bg.py new file mode 100644 index 0000000..2024f94 --- /dev/null +++ b/pipeline/binning_script/Bin_DC3_background_O3/bin_bg.py @@ -0,0 +1,7 @@ +from cosipy import BinnedData + +analysis = BinnedData("inputs.yaml") + +analysis.get_binned_data(unbinned_data = "/project/majello/astrohe/yong/COSI/Data_Challenges/DC3/Unbinned_raw_data/Total_BG_with_SAAcomponent_3months_unbinned_data_filtered_with_SAAcut.fits.gz", + output_name = "Total_BG_continuum_O3_binned", + psichi_binning = "local") diff --git a/pipeline/binning_script/Bin_DC3_background_O3/bin_bg__galactic.py b/pipeline/binning_script/Bin_DC3_background_O3/bin_bg__galactic.py new file mode 100644 index 0000000..0d88e65 --- /dev/null +++ b/pipeline/binning_script/Bin_DC3_background_O3/bin_bg__galactic.py @@ -0,0 +1,7 @@ +from cosipy import BinnedData + +analysis = BinnedData("inputs__galactic.yaml") + +analysis.get_binned_data(unbinned_data = "Unbinned_raw_data/Total_BG_with_SAAcomponent_3months_unbinned_data_filtered_with_SAAcut.fits.gz", + output_name = "Total_BG_continuum_O3_binned", + psichi_binning = "local") diff --git a/pipeline/binning_script/Bin_DC3_background_O3/inputs.yaml b/pipeline/binning_script/Bin_DC3_background_O3/inputs.yaml new file mode 100644 index 0000000..529909e --- /dev/null +++ b/pipeline/binning_script/Bin_DC3_background_O3/inputs.yaml @@ -0,0 +1,15 @@ +#----------# +# Data I/O: + +# data files available on the COSI Sharepoint: https://drive.google.com/drive/folders/1UdLfuLp9Fyk4dNussn1wt7WEOsTWrlQ6 +data_file: "/project/majello/astrohe/yong/COSI/Data_Challenges/DC3/Unbinned_raw_data/Total_BG_with_SAAcomponent_3months_unbinned_data_filtered_with_SAAcut.fits.gz" # full path +ori_file: "NA" # full path +unbinned_output: 'fits' # 'fits' or 'hdf5' +time_bins: 1 # time bin size in seconds. Takes int, float, or list of bin edges. +energy_bins: [100., 158.489, 251.189, 398.107, 630.957, 1000., 1584.89, 2511.89, 3981.07, 6309.57, 10000.] # Takes list. Needs to match response. +phi_pix_size: 6 # binning of Compton scattering anlge [deg] +nside: 8 # healpix binning of psi chi local +scheme: 'ring' # healpix binning of psi chi local +tmin: 1835487300.0 # Min time cut in seconds. +tmax: 1843467255.0 # Max time cut in seconds. +#----------# diff --git a/pipeline/binning_script/Bin_DC3_background_O3/inputs__galactic.yaml b/pipeline/binning_script/Bin_DC3_background_O3/inputs__galactic.yaml new file mode 100644 index 0000000..529909e --- /dev/null +++ b/pipeline/binning_script/Bin_DC3_background_O3/inputs__galactic.yaml @@ -0,0 +1,15 @@ +#----------# +# Data I/O: + +# data files available on the COSI Sharepoint: https://drive.google.com/drive/folders/1UdLfuLp9Fyk4dNussn1wt7WEOsTWrlQ6 +data_file: "/project/majello/astrohe/yong/COSI/Data_Challenges/DC3/Unbinned_raw_data/Total_BG_with_SAAcomponent_3months_unbinned_data_filtered_with_SAAcut.fits.gz" # full path +ori_file: "NA" # full path +unbinned_output: 'fits' # 'fits' or 'hdf5' +time_bins: 1 # time bin size in seconds. Takes int, float, or list of bin edges. +energy_bins: [100., 158.489, 251.189, 398.107, 630.957, 1000., 1584.89, 2511.89, 3981.07, 6309.57, 10000.] # Takes list. 
Needs to match response. +phi_pix_size: 6 # binning of Compton scattering anlge [deg] +nside: 8 # healpix binning of psi chi local +scheme: 'ring' # healpix binning of psi chi local +tmin: 1835487300.0 # Min time cut in seconds. +tmax: 1843467255.0 # Max time cut in seconds. +#----------# diff --git a/pipeline/bkg_cut.py b/pipeline/bkg_cut.py new file mode 100644 index 0000000..0e1f5ce --- /dev/null +++ b/pipeline/bkg_cut.py @@ -0,0 +1,85 @@ +from astropy.io import fits +from astropy.table import Table +import os +import numpy as np +import argparse + +""" +This script is used to cut the background window from the background file. +""" + +def check_file(path): + """Check if the file exists""" + print(f"Checking file: {path}") + # Check if the file exists + if not os.path.exists(path): + raise FileNotFoundError(f"File not found: {path}") + print(f"File found: {path}") + return True + +def open_background(background_path): + """Open the background file""" + check_file(background_path) + # Open the background file + print(f"Opening background file: {background_path}") + hdul = fits.open(background_path) + bkg_full = hdul[1].data + return bkg_full + +def get_times_source(source_path): + """Get the starting and ending time tag of the GRB""" + check_file(source_path) + # Read the GRB signal + signal = fits.open(source_path) + # get the starting and ending time tag of the GRB + times = signal[1].data["TimeTags"] + grb_tmin = float(np.min(times)) + grb_tmax = float(np.max(times)) + grb_duration = grb_tmax - grb_tmin + return grb_tmin, grb_tmax, grb_duration + +def extract_bkg_window(background_path, source_path, eps_time = 0.000000001): + """Extract the background window from the background file. + The eps_time is the adding time in seconds to add before and after the source time + to extract the background. 
It is a safety margin to avoid edge effects.""" + bkg_full = open_background(background_path) + grb_tmin, grb_tmax, _ = get_times_source(source_path) + mask = (bkg_full['TimeTags'] >= grb_tmin - eps_time) & (bkg_full['TimeTags'] <= grb_tmax + eps_time) + bkg_cut = bkg_full[mask] + return bkg_cut + +def save_bkg_window(background_path, source_path, eps_time): + """Save the background window to a file""" + # Reuse the same name of the background file but with the word "cut" added + # output_path = background_path.replace(".fits.gz", "_cut.fits") + base, ext = os.path.splitext(background_path) + output_path = base + "_window.fits" + bkg_cut = extract_bkg_window(background_path, source_path, eps_time) + # Save the background window to a file + print(f"Saving background window to: {output_path}") + t = Table(bkg_cut) + t.write(output_path, format="fits", overwrite=True) + print(f"Background window saved to: {output_path}") + return True + +def main(): + """Main function to cut the background window from the background file based on GRB time range + Args: + grb_source_path: Path to the GRB source FITS file + total_background_path: Path to the total background FITS file (can be compressed .fits.gz) + --eps_time : Time margin in seconds to add before and after source time (default: 1e-9) + Returns: + None + """ + parser = argparse.ArgumentParser(description="Cut background window from background file based on GRB time range") + parser.add_argument("grb_source_path", help="Path to the GRB source FITS file (can be compressed .fits.gz)") + parser.add_argument("total_background_path", help="Path to the total background FITS file (can be compressed .fits.gz)") + parser.add_argument("--eps_time", type=float, default=0.000000001, + help="Time margin in seconds to add before and after source time (default: 1e-9)") + + args = parser.parse_args() + + save_bkg_window(args.total_background_path, args.grb_source_path, args.eps_time) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pipeline/download_data.py b/pipeline/download_data.py new file mode 100644 index 0000000..9a22962 --- /dev/null +++ b/pipeline/download_data.py @@ -0,0 +1,56 @@ + +from cosipy.util import fetch_wasabi_file +from pathlib import Path +import subprocess +import zipfile +import tarfile +from datetime import datetime + +# === Setup download paths === +data_folder = Path("/home/gamma/workspace/data/raw") +data_folder.mkdir(exist_ok=True) + +wasabi = { + # NOTE: The original background file is extremely large, and the complete data directory can reach about 52 GB. + # To avoid heavy storage usage and slowdowns for experiments, we trimmed the background around the source window, + # producing a much smaller background file (~1MB) for use here. 
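+    # If needed, a trimmed file can be regenerated from the full background with pipeline/bkg_cut.py,
+    # which keeps only the GRB time window (plus a small --eps_time margin) from the original file.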
+ # + # "background": "Backgrounds/Ge/Total_BG_with_SAAcomponent_3months_unbinned_data_filtered_with_SAAcut.fits.gz", + "response": "Responses/ResponseContinuum.o3.e100_10000.b10log.s10396905069491.m2284.filtered.nonsparse.binnedimaging.imagingresponse_nside8.area.good_chunks.h5.zip", + "orientation": "Orientation/DC3_final_530km_3_month_with_slew_1sbins_GalacticEarth_SAA.ori", + "source": "Sources/GRB_bn081207680_3months_unbinned_data_filtered_with_SAAcut.fits.gz", +} + +paths = {key: data_folder / Path(val).name for key, val in wasabi.items()} + +# === Download and prepare files === +for key, remote in wasabi.items(): + out = paths[key] + ready = out.with_suffix('') if out.suffix == ".gz" else out + if not ready.exists(): + print(f"Downloading {key} from Wasabi...") + fetch_wasabi_file(f"COSI-SMEX/DC3/Data/{remote}", output=out) + if out.suffix == ".gz": + subprocess.run(["gunzip", "-f", str(out)], check=True) + elif out.suffix == ".zip": + print(f"Unzipping {out.name}...") + with zipfile.ZipFile(out, 'r') as zip_ref: + zip_ref.extractall(data_folder) + print("Unzip complete") + else: + print(f"[Ready] {key}") + +# === Compress all files into a tar.gz archive === +print("\nCreating compressed archive...") +archive_name = f"raw.tar.gz" +archive_path = data_folder / archive_name + +with tarfile.open(archive_path, "w:gz") as tar: + for file_path in data_folder.iterdir(): + if file_path.is_file() and not file_path.name.endswith(".zip") and not file_path.name.endswith(".gz"): + tar.add(file_path, arcname=file_path.name) + print(f"Added {file_path.name} to archive") + +print(f"Archive created: {archive_path}") +print(f"Archive size: {archive_path.stat().st_size / (1024*1024):.2f} MB") + diff --git a/pipeline/generate_plot.py b/pipeline/generate_plot.py deleted file mode 100644 index 5b2194f..0000000 --- a/pipeline/generate_plot.py +++ /dev/null @@ -1,42 +0,0 @@ -import sys,os -from cosipy.util import fetch_wasabi_file -from cosipy import BinnedData -from pathlib import Path - -#/home/gamma/workspace/heasarc/dl0/2025-01-24_14-16-50/GalacticScan.inc1.id1.crab2hr.extracted.tra.gz - -# create the inputs.yaml file to process the data. -print("test") -print(sys.argv[1]) -file_path = sys.argv[1] -dir_name = os.path.dirname(file_path) - -content_to_write = f"""#----------# -# Data I/O: - -# data files available on the COSI Sharepoint: https://drive.google.com/drive/folders/1UdLfuLp9Fyk4dNussn1wt7WEOsTWrlQ6 -data_file: {file_path} # full path -ori_file: "NA" # full path -unbinned_output: 'hdf5' # 'fits' or 'hdf5' -time_bins: 60 # time bin size in seconds. Takes int, float, or list of bin edges. -energy_bins: [100., 200., 500., 1000., 2000., 5000.] # Takes list. Needs to match response. -phi_pix_size: 6 # binning of Compton scattering anlge [deg] -nside: 8 # healpix binning of psi chi local -scheme: 'ring' # healpix binning of psi chi local -tmin: 1835478000.0 # Min time cut in seconds. -tmax: 1835485200.0 # Max time cut in seconds. 
-#----------# -""" - -dir_name_path = Path(dir_name) - -# Open the file in write mode and write the content -with open(dir_name_path / "inputs.yaml", "w") as file: - file.write(content_to_write) - - -analysis = BinnedData(dir_name_path / "inputs.yaml") -analysis.read_tra(output_name = dir_name_path / "unbinned_data") -analysis.get_binned_data() -analysis.get_raw_spectrum(output_name = file_path.replace(".crab2hr.extracted.tra.gz","")) -analysis.get_raw_lightcurve(output_name = file_path.replace(".crab2hr.extracted.tra.gz","")) diff --git a/pipeline/initialize_pipeline.py b/pipeline/initialize_pipeline.py deleted file mode 100644 index d2a15eb..0000000 --- a/pipeline/initialize_pipeline.py +++ /dev/null @@ -1,18 +0,0 @@ -from cosipy.util import fetch_wasabi_file -import os -import shutil -from pathlib import Path - -# This script must be executed the first time we install this airflow app to obtain a file used to test the DAG - -home_dir = Path(os.environ['HOME']) -new_path = os.path.join(home_dir, "workspace", "data", "GalacticScan.inc1.id1.crab2hr.extracted.tra.gz") - -# Check if the file already exists -if os.path.exists(new_path): - print(f"File {new_path} already exists. Removing it to fetch a new one.") - # If the file exists, remove it - os.remove(new_path) - -fetch_wasabi_file(file='ComptonSphere/mini-DC2/GalacticScan.inc1.id1.crab2hr.extracted.tra.gz', - output=new_path) diff --git a/pipeline/lcurve/cosipipe_lc_ops_cosidag.py b/pipeline/lcurve/cosipipe_lc_ops_cosidag.py new file mode 100644 index 0000000..b0af762 --- /dev/null +++ b/pipeline/lcurve/cosipipe_lc_ops_cosidag.py @@ -0,0 +1,295 @@ +# /home/gamma/airflow/dags/_lib/cosipipe_lc_ops.py +# Utility functions used by the cosipipe_lightcurve DAG with ExternalPythonOperator. +# All functions are self-contained and do not rely on DAG-level globals. + +from __future__ import annotations + +from pathlib import Path +import os +import yaml + +# ----------------------------- +# Binning (source & background) +# ----------------------------- +def bin_grb_source(unbinned_file_path: str, data_folder: str) -> str: + """ + Bin GRB data source based on the bin_grb.py script + """ + print(f"[bin_grb_source] Found GRB unbinned fits file: {unbinned_file_path}") + + # Define output file path + extension = unbinned_file_path.split(".")[-1] + binned_file_name = unbinned_file_path.replace("_unbinned_", "_binned_").replace("."+extension, "") + binned_file_path = os.path.join(data_folder, f"{binned_file_name}.hdf5") + + print(f"[bin_grb_source] Expected output file: {binned_file_path}") + + # Check if binned file already exists + if os.path.exists(binned_file_path): + print(f"[bin_grb_source] Binned GRB file already exists: {binned_file_path}") + print("[bin_grb_source] Skipping GRB binning step.") + return binned_file_path + + print("[bin_grb_source] Binned GRB file not found. 
Proceeding with binning process...") + + # Construct paths + # Create inputs.yaml configuration for binning + print("[bin_grb_source] Creating GRB binning configuration...") + config = { + "data_file": unbinned_file_path, + "ori_file": "NA", + "unbinned_output": "fits", + "time_bins": 1, + "energy_bins": [100., 158.489, 251.189, 398.107, 630.957, 1000., 1584.89, 2511.89, 3981.07, 6309.57, 10000.], + "phi_pix_size": 6, + "nside": 8, + "scheme": "ring", + "tmin": 1836496300.00, + "tmax": 1836496389.00 + } + + # Write inputs.yaml to the data folder + inputs_path = os.path.join(data_folder, "inputs_grb.yaml") + with open(inputs_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False) + + print(f"[bin_grb_source] ✓ Created GRB binning configuration: {inputs_path}") + print(f"[bin_grb_source] Configuration details:") + print(f"[bin_grb_source] - Time bins: {config['time_bins']} seconds") + print(f"[bin_grb_source] - Energy bins: {len(config['energy_bins'])-1} bins from {config['energy_bins'][0]} to {config['energy_bins'][-1]} keV") + print(f"[bin_grb_source] - Phi pixel size: {config['phi_pix_size']} degrees") + print(f"[bin_grb_source] - Nside: {config['nside']}") + print(f"[bin_grb_source] - Scheme: {config['scheme']}") + print(f"[bin_grb_source] - Time range: {config['tmin']} to {config['tmax']}") + print(f"[bin_grb_source] - GRB duration: {config['tmax'] - config['tmin']} seconds") + + # Execute the binning process + print("[bin_grb_source] Initializing COSIpy BinnedData analysis...") + from cosipy import BinnedData + + analysis = BinnedData(inputs_path) + print("[bin_grb_source] ✓ BinnedData analysis object created successfully") + + print(f"[bin_grb_source] Starting binning process for output: {binned_file_name}") + print("[bin_grb_source] This may take several minutes depending on data size...") + + analysis.get_binned_data( + unbinned_data=unbinned_file_path, + output_name=binned_file_name, + psichi_binning="galactic" + ) + + print(f"[bin_grb_source] GRB data binning completed successfully!") + print(f"[bin_grb_source] Output file created: {binned_file_path}") + + # Verify the output file was created + if os.path.exists(binned_file_path): + file_size = os.path.getsize(binned_file_path) / (1024 * 1024) # Size in MB + print(f"[bin_grb_source] Output file verified: {file_size:.2f} MB") + else: + print(f"[bin_grb_source] Warning: Expected output file not found: {binned_file_path}") + + return binned_file_path + + +def bin_background_data(unbinned_file_path: str, data_folder: str) -> str: + """ + Bin background data based on the bin_bg.py script + """ + print(f"[bin_background_data] Found background unbinned fits file: {unbinned_file_path}") + + # Define output file path + extension = unbinned_file_path.split(".")[-1] + print(f"[bin_background_data] Extension: {extension}") + binned_file_name = unbinned_file_path.replace("_unbinned_", "_binned_").replace("."+extension, "") + print(f"[bin_background_data] Binned file name: {binned_file_name}") + binned_file_path = os.path.join(data_folder, f"{binned_file_name}.hdf5") + + print(f"[bin_background_data] Expected output file: {binned_file_path}") + + # Check if binned file already exists + if os.path.exists(binned_file_path): + print(f"[bin_background_data] Binned background file already exists: {binned_file_path}") + print("[bin_background_data] Skipping background binning step.") + return binned_file_path + + print("[bin_background_data] Binned background file not found. 
Proceeding with binning process...") + + # Construct paths + # Create inputs.yaml configuration for binning + print("[bin_background_data] Creating background binning configuration...") + config = { + "data_file": unbinned_file_path, + "ori_file": "NA", + "unbinned_output": "fits", + "time_bins": 1, + "energy_bins": [100., 158.489, 251.189, 398.107, 630.957, 1000., 1584.89, 2511.89, 3981.07, 6309.57, 10000.], + "phi_pix_size": 6, + "nside": 8, + "scheme": "ring", + "tmin": 1835487300.0, + "tmax": 1843467255.0 + } + + # Write inputs.yaml to the data folder + inputs_path = os.path.join(data_folder, "inputs_bg.yaml") + with open(inputs_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False) + + print(f"[bin_background_data] ✓ Created background binning configuration: {inputs_path}") + print(f"[bin_background_data] Configuration details:") + print(f"[bin_background_data] - Time bins: {config['time_bins']} seconds") + print(f"[bin_background_data] - Energy bins: {len(config['energy_bins'])-1} bins from {config['energy_bins'][0]} to {config['energy_bins'][-1]} keV") + print(f"[bin_background_data] - Phi pixel size: {config['phi_pix_size']} degrees") + print(f"[bin_background_data] - Nside: {config['nside']}") + print(f"[bin_background_data] - Scheme: {config['scheme']}") + print(f"[bin_background_data] - Time range: {config['tmin']} to {config['tmax']}") + + # Execute the binning process + print("[bin_background_data] Initializing COSIpy BinnedData analysis...") + from cosipy import BinnedData + + analysis = BinnedData(inputs_path) + print("[bin_background_data] ✓ BinnedData analysis object created successfully") + + print(f"[bin_background_data] Starting binning process for output: {binned_file_name}") + print("[bin_background_data] This may take several minutes depending on data size...") + + analysis.get_binned_data( + unbinned_data=unbinned_file_path, + # Use output_file without the extension + output_name=binned_file_name, + psichi_binning="galactic" + ) + + print(f"[bin_background_data] Background data binning completed successfully!") + print(f"[bin_background_data] Output file created: {binned_file_path}") + + # Verify the output file was created + if os.path.exists(binned_file_path): + file_size = os.path.getsize(binned_file_path) / (1024 * 1024) # Size in MB + print(f"[bin_background_data] ✓ Output file verified: {file_size:.2f} MB") + else: + print(f"[bin_background_data] Warning: Expected output file not found: {binned_file_path}") + + return binned_file_path + + +# ----------------------------- +# Light curve plotting (full cell logic) +# ----------------------------- +def plot_lightcurve_from_cells(grb_signal_path: str, + background_path: str, + orientation_path: str, + response_path: str, + data_dir: str) -> str: + """ + Plot the light curve based on the grb_signal_path, background_path, orientation_path, and response_path + """ + + # Imports used by the notebook cells + import numpy as np + import matplotlib.pyplot as plt + import astropy.units as u + from astropy.coordinates import SkyCoord + from histpy import Histogram + from cosipy.background_estimation import ContinuumEstimation + from cosipy.spacecraftfile import SpacecraftFile + from cosipy.response import FullDetectorResponse + + # ---- helpers (as in notebook) ---- + def load_projected_psr(psr_file): + instance = ContinuumEstimation() + psr_hist = instance.load_psr_from_file(psr_file) + projected = psr_hist.project(['Em', 'Phi', 'PsiChi']) + data = projected.contents.value + return projected, data + + def 
mask_from_cumdist_vectorized(psr_map, containment=0.4): + psr_norm = psr_map / np.sum(psr_map, axis=-1, keepdims=True) + sort_idx = np.argsort(psr_norm, axis=-1)[..., ::-1] + sorted_vals = np.take_along_axis(psr_norm, sort_idx, axis=-1) + cumsum_vals = np.cumsum(sorted_vals, axis=-1) + mask_sorted = (cumsum_vals < containment).astype(float) + mask = np.empty_like(mask_sorted) + np.put_along_axis(mask, sort_idx, mask_sorted, axis=-1) + # NOTE: like in your cell, we do NOT invert the mask. + return mask + + def create_psr(l, b, ori_file: str, response_file: str): + ori = SpacecraftFile.parse_from_file(ori_file) + coord = SkyCoord(l=l * u.deg, b=b * u.deg, frame="galactic") + scatt_map = ori.get_scatt_map(coord, nside=16, coordsys='galactic') + with FullDetectorResponse.open(response_file) as response: + psr = response.get_point_source_response(coord=coord, scatt_map=scatt_map) + print("Works!") + return psr + + def get_signal_window(signal): + grb_tmin = signal.axes["Time"].edges.min() + grb_tmax = signal.axes["Time"].edges.max() + print(f"The GRB duration is {grb_tmax - grb_tmin} from {grb_tmin} to {grb_tmax}") + return grb_tmin, grb_tmax + + def load_data(signal_full, bkg_full, tstart, tstop, window_start, window_stop): + # Slice & project Signal only if within GRB window + if tstart >= window_start.value and tstop <= window_stop.value: + signal_tmin_idx = np.where(signal_full.axes['Time'].edges.value == tstart)[0][0] + signal_tmax_idx = np.where(signal_full.axes["Time"].edges.value == tstop)[0][0] + signal = signal_full.slice[signal_tmin_idx:signal_tmax_idx, :] + signal = signal.project(['Em', 'Phi', 'PsiChi']) + # Background always sliced on the same interval + bkg_tmin_idx = np.where(bkg_full.axes['Time'].edges.value == tstart)[0][0] + bkg_tmax_idx = np.where(bkg_full.axes["Time"].edges.value == tstop)[0][0] + bkg = bkg_full.slice[bkg_tmin_idx:bkg_tmax_idx, :] + bkg = bkg.project(['Em', 'Phi', 'PsiChi']) + # Add signal only inside window + if tstart >= window_start.value and tstop <= window_stop.value: + data = signal + bkg + else: + data = bkg + return data, data.contents.todense() + + # ---- build PSR + mask ---- + psr_map = create_psr(171.56, -4.780, ori_file=orientation_path, response_file=response_path) + input_psr = psr_map.project(['Em', 'Phi', 'PsiChi']).contents.value + mask_map = mask_from_cumdist_vectorized(input_psr, containment=0.5) + + # ---- open histograms ---- + signal_full = Histogram.open(grb_signal_path) + bkg_full = Histogram.open(background_path) + + # ---- time scan around the GRB window ---- + window_start, window_stop = get_signal_window(signal_full) + + counts = [] + bin_size = 5 # seconds + tstart = window_start.value - 20 + while tstart < window_stop.value + 20: + tstop = tstart + bin_size + _, data_map = load_data(signal_full, bkg_full, tstart, tstop, window_start, window_stop) + masked_data = mask_map * data_map + counts.append(masked_data.sum()) + tstart = tstop + + # Build bin edges like in the notebook + N = int(((window_stop.value + 20) - (window_start.value - 20)) / bin_size) + bins = np.linspace(window_start.value - 20, window_stop.value + 20, N + 1) + + # ---- plot ---- + plt.figure(figsize=(10, 4)) + plt.step(bins, counts) + plt.xlabel("Time (s)") + plt.ylabel("Counts") + plt.title(f"GRB light curve (bin = {bin_size}s)") + plt.axvline(x=window_start.value, linestyle='--', linewidth=1.5) + out_png = Path(data_dir) / "lightcurve.png" + plt.tight_layout() + plt.savefig(out_png, dpi=150) + plt.close() + + print(f"Light curve saved to: {out_png}") + + 
return str(out_png) + + diff --git a/pipeline/ts_map/cosipipe_tsmap_ops_cosidag.py b/pipeline/ts_map/cosipipe_tsmap_ops_cosidag.py new file mode 100644 index 0000000..2b8bb7f --- /dev/null +++ b/pipeline/ts_map/cosipipe_tsmap_ops_cosidag.py @@ -0,0 +1,430 @@ +""" +Utility functions used by the cosipipe_tsmap DAG with ExternalPythonOperator. +All functions are self-contained and do not rely on DAG-level globals. + +- bin_grb_data: bin the GRB data source based on the bin_grb.py script +- bin_background_data: bin the background data based on the bin_bg.py script +- compute_ts_map: compute the TS map based on the ts_map.py script +- compute_ts_map_mulres: compute the TS map based on the ts_map_mulres.py script +""" +from __future__ import annotations + +import os +from pathlib import Path + +# ===================================================================== +# =============== Inlined pipeline step implementations =============== +# ===================================================================== + + +# ---[ from 2_binGRBdatasource.py ]--- +import os +import sys +import yaml +from pathlib import Path + +def bin_grb_data(unbinned_file_path: str, data_folder: str) -> str: + """ + Bin GRB data source based on the bin_grb.py script + """ + print(f"[bin_grb_data] Found GRB unbinned fits file: {unbinned_file_path}") + + # Define output file path + extension = unbinned_file_path.split(".")[-1] + binned_file_name = unbinned_file_path.replace("_unbinned_", "_binned_").replace("."+extension, "") + binned_file_path = os.path.join(data_folder, f"{binned_file_name}.hdf5") + + print(f"[bin_grb_data] Expected output file: {binned_file_path}") + + # Check if binned file already exists + if os.path.exists(binned_file_path): + print(f"[bin_grb_data] Binned GRB file already exists: {binned_file_path}") + print("[bin_grb_data] Skipping GRB binning step.") + return binned_file_path + + print("[bin_grb_data] Binned GRB file not found. 
Proceeding with binning process...") + + # Construct paths + # Create inputs.yaml configuration for binning + print("[bin_grb_data] Creating GRB binning configuration...") + config = { + "data_file": unbinned_file_path, + "ori_file": "NA", + "unbinned_output": "fits", + "time_bins": 1, + "energy_bins": [100., 158.489, 251.189, 398.107, 630.957, 1000., 1584.89, 2511.89, 3981.07, 6309.57, 10000.], + "phi_pix_size": 6, + "nside": 8, + "scheme": "ring", + "tmin": 1836496300.00, + "tmax": 1836496389.00 + } + + # Write inputs.yaml to the data folder + inputs_path = os.path.join(data_folder, "inputs_grb.yaml") + with open(inputs_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False) + + print(f"[bin_grb_data] ✓ Created GRB binning configuration: {inputs_path}") + print(f"[bin_grb_data] Configuration details:") + print(f"[bin_grb_data] - Time bins: {config['time_bins']} seconds") + print(f"[bin_grb_data] - Energy bins: {len(config['energy_bins'])-1} bins from {config['energy_bins'][0]} to {config['energy_bins'][-1]} keV") + print(f"[bin_grb_data] - Phi pixel size: {config['phi_pix_size']} degrees") + print(f"[bin_grb_data] - Nside: {config['nside']}") + print(f"[bin_grb_data] - Scheme: {config['scheme']}") + print(f"[bin_grb_data] - Time range: {config['tmin']} to {config['tmax']}") + print(f"[bin_grb_data] - GRB duration: {config['tmax'] - config['tmin']} seconds") + + # Execute the binning process + print("[bin_grb_data] Initializing COSIpy BinnedData analysis...") + from cosipy import BinnedData + + analysis = BinnedData(inputs_path) + print("[bin_grb_data] ✓ BinnedData analysis object created successfully") + + print(f"[bin_grb_data] Starting binning process for output: {binned_file_name}") + print("[bin_grb_data] This may take several minutes depending on data size...") + + analysis.get_binned_data( + unbinned_data=unbinned_file_path, + output_name=binned_file_name, + psichi_binning="local" + ) + + print(f"[bin_grb_data] GRB data binning completed successfully!") + print(f"[bin_grb_data] Output file created: {binned_file_path}") + + # Verify the output file was created + if os.path.exists(binned_file_path): + file_size = os.path.getsize(binned_file_path) / (1024 * 1024) # Size in MB + print(f"[bin_grb_data] Output file verified: {file_size:.2f} MB") + else: + print(f"[bin_grb_data] Warning: Expected output file not found: {binned_file_path}") + + return binned_file_path + + +# ---[ from 3_binBackground.py ]--- +import os +import sys +import yaml +from pathlib import Path + +def bin_background_data(unbinned_file_path: str, data_folder: str) -> str: + """ + Bin background data based on the bin_bg.py script + """ + print(f"[bin_background_data] Found background unbinned fits file: {unbinned_file_path}") + + # Define output file path + extension = unbinned_file_path.split(".")[-1] + print(f"[bin_background_data] Extension: {extension}") + binned_file_name = unbinned_file_path.replace("_unbinned_", "_binned_").replace("."+extension, "") + print(f"[bin_background_data] Binned file name: {binned_file_name}") + binned_file_path = os.path.join(data_folder, f"{binned_file_name}.hdf5") + + print(f"[bin_background_data] Expected output file: {binned_file_path}") + + # Check if binned file already exists + if os.path.exists(binned_file_path): + print(f"[bin_background_data] Binned background file already exists: {binned_file_path}") + print("[bin_background_data] Skipping background binning step.") + return binned_file_path + + print("[bin_background_data] Binned background file not found. 
Proceeding with binning process...") + + # Construct paths + # Create inputs.yaml configuration for binning + print("[bin_background_data] Creating background binning configuration...") + config = { + "data_file": unbinned_file_path, + "ori_file": "NA", + "unbinned_output": "fits", + "time_bins": 1, + "energy_bins": [100., 158.489, 251.189, 398.107, 630.957, 1000., 1584.89, 2511.89, 3981.07, 6309.57, 10000.], + "phi_pix_size": 6, + "nside": 8, + "scheme": "ring", + "tmin": 1835487300.0, + "tmax": 1843467255.0 + } + + # Write inputs.yaml to the data folder + inputs_path = os.path.join(data_folder, "inputs_bg.yaml") + with open(inputs_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False) + + print(f"[bin_background_data] ✓ Created background binning configuration: {inputs_path}") + print(f"[bin_background_data] Configuration details:") + print(f"[bin_background_data] - Time bins: {config['time_bins']} seconds") + print(f"[bin_background_data] - Energy bins: {len(config['energy_bins'])-1} bins from {config['energy_bins'][0]} to {config['energy_bins'][-1]} keV") + print(f"[bin_background_data] - Phi pixel size: {config['phi_pix_size']} degrees") + print(f"[bin_background_data] - Nside: {config['nside']}") + print(f"[bin_background_data] - Scheme: {config['scheme']}") + print(f"[bin_background_data] - Time range: {config['tmin']} to {config['tmax']}") + + # Execute the binning process + print("[bin_background_data] Initializing COSIpy BinnedData analysis...") + from cosipy import BinnedData + + analysis = BinnedData(inputs_path) + print("[bin_background_data] ✓ BinnedData analysis object created successfully") + + print(f"[bin_background_data] Starting binning process for output: {binned_file_name}") + print("[bin_background_data] This may take several minutes depending on data size...") + + analysis.get_binned_data( + unbinned_data=unbinned_file_path, + # Use output_file without the extension + output_name=binned_file_name, + psichi_binning="local" + ) + + print(f"[bin_background_data] Background data binning completed successfully!") + print(f"[bin_background_data] Output file created: {binned_file_path}") + + # Verify the output file was created + if os.path.exists(binned_file_path): + file_size = os.path.getsize(binned_file_path) / (1024 * 1024) # Size in MB + print(f"[bin_background_data] ✓ Output file verified: {file_size:.2f} MB") + else: + print(f"[bin_background_data] Warning: Expected output file not found: {binned_file_path}") + + return binned_file_path + + +# ---[ from 4_tsmapcomputation.py ]--- +import os +import sys +import pickle +import matplotlib +matplotlib.use('Agg') # Use non-interactive backend for server environment +import matplotlib.pyplot as plt + +def compute_ts_map(grb_signal_path: str, + background_path: str, + orientation_path: str, + response_path: str, + data_folder: str) -> dict: + """ + Compute the final TS map and generate plots + """ + # Load the aggregated data from the previous step + # In a real implementation, you might want to use a more robust data passing mechanism + # For now, we'll recreate the FastTSMap object + + # Check if files exist + for path, name in [(grb_signal_path, "GRB signal"), + (background_path, "background"), + (orientation_path, "orientation"), + (response_path, "response")]: + if not os.path.exists(path): + raise FileNotFoundError(f"{name} file not found: {path}") + + print(f"[compute_ts_map] Recreating FastTSMap object for TS computation...") + + # Import required modules + import gc + import numpy as np + from 
astropy.time import Time + from astropy.coordinates import SkyCoord + import astropy.units as u + from cosipy import SpacecraftFile, FastTSMap + from histpy import Histogram + from threeML import Powerlaw + + # Read the GRB signal + signal = Histogram.open(grb_signal_path) + grb_tmin = signal.axes["Time"].edges.min() + grb_tmax = signal.axes["Time"].edges.max() + signal = signal.project(['Em', 'PsiChi', 'Phi']) + + # Load background data + bkg_full = Histogram.open(background_path) + bkg_tmin_idx = np.where(bkg_full.axes['Time'].edges.value == grb_tmin.value)[0][0] + bkg_tmax_idx = np.where(bkg_full.axes["Time"].edges.value == grb_tmax.value)[0][0] + bkg = bkg_full.slice[bkg_tmin_idx:bkg_tmax_idx,:] + bkg = bkg.project(['Em', 'PsiChi', 'Phi']) + + # Assemble data + data = bkg + signal + + # Create background model + bkg_full_duration = (bkg_full.axes['Time'].edges.max() - bkg_full.axes['Time'].edges.min()) + bkg_model = bkg_full/(bkg_full_duration/40) + bkg_model = bkg_model.project(['Em', 'PsiChi', 'Phi']) + + # Process orientation + ori_full = SpacecraftFile.parse_from_file(orientation_path) + grb_ori = ori_full.source_interval(Time(grb_tmin, format = "unix"), Time(grb_tmax, format = "unix")) + + # Clear memory + del bkg_full + del ori_full + _ = gc.collect() + + # Create FastTSMap object + ts = FastTSMap(data = data, bkg_model = bkg_model, orientation = grb_ori, + response_path = response_path, cds_frame = "local", scheme = "RING") + + # Define the true location of the GRB + coord = SkyCoord(l = 93, b = -53, unit = (u.deg, u.deg), frame = "galactic") + + # get a list of hypothesis coordinates to fit. The models will be put on these locations for get the expected counts from the source spectrum. + # note that this nside is also the nside of the final TS map + hypothesis_coords = FastTSMap.get_hypothesis_coords(nside = 16) + print(f"[compute_ts_map] Computing TS map...") + + # Define spectrum + index = -2.2 + K = 10 / u.cm / u.cm / u.s / u.keV + piv = 100 * u.keV + spectrum = Powerlaw() + spectrum.index.value = index + spectrum.K.value = K.value + spectrum.piv.value = piv.value + spectrum.K.unit = K.unit + spectrum.piv.unit = piv.unit + + # Generate TS map plots + try: + ts_results = ts.parallel_ts_fit(hypothesis_coords=hypothesis_coords, + energy_channel = [2,3], + spectrum=spectrum, + ts_scheme="RING", + cpu_cores=56) + # plots the raw TS values, which is also an image of the GRB. However, + # for the purpose of localization, we are more interested in the confidence + # level of the imaged GRB. Thus, you can plot the 90% containment level of + # the GRB location by setting `containment` parameter to the percetage you + # want to plot. However, because the strength of the GRB signal is very + # very strong, the ts map looks the same under different containment levels. 
+ ts.plot_ts(save_plot = True, save_dir = data_folder, save_name = "ts_map.png") + + ts.plot_ts(containment = 0.9, save_plot = True, save_dir = data_folder, save_name = "ts_map_90containment.png") + print(f"[compute_ts_map] TS map data saved") + + print(f"[compute_ts_map] TS map computation completed successfully!") + return data_folder + + except Exception as e: + print(f"[compute_ts_map] Error during TS map computation: {str(e)}") + raise e + + +# ---[ from 4_tsmapmulres_computation.py ]--- +import os +import sys +import pickle +import matplotlib +matplotlib.use('Agg') # Use non-interactive backend for server environment +import matplotlib.pyplot as plt + +def compute_ts_map_mulres(grb_signal_path: str, + background_path: str, + orientation_path: str, + response_path: str, + data_folder: str) -> dict: + """ + Compute the final TS map and generate plots + """ + # Load the aggregated data from the previous step + # In a real implementation, you might want to use a more robust data passing mechanism + # For now, we'll recreate the FastTSMap object + + print(f"[compute_ts_map_mulres] Recreating FastTSMap object for TS computation...") + + # Import required modules + import gc + import numpy as np + from astropy.time import Time + from astropy.coordinates import SkyCoord + import astropy.units as u + from cosipy import SpacecraftFile, MOCTSMap + from histpy import Histogram + from threeML import Powerlaw + + # Read the GRB signal + signal = Histogram.open(grb_signal_path) + grb_tmin = signal.axes["Time"].edges.min() + grb_tmax = signal.axes["Time"].edges.max() + signal = signal.project(['Em', 'PsiChi', 'Phi']) + + # Load background data + bkg_full = Histogram.open(background_path) + bkg_tmin_idx = np.where(bkg_full.axes['Time'].edges.value == grb_tmin.value)[0][0] + bkg_tmax_idx = np.where(bkg_full.axes["Time"].edges.value == grb_tmax.value)[0][0] + bkg = bkg_full.slice[bkg_tmin_idx:bkg_tmax_idx,:] + bkg = bkg.project(['Em', 'PsiChi', 'Phi']) + + # Assemble data + data = bkg + signal + + # Create background model + bkg_full_duration = (bkg_full.axes['Time'].edges.max() - bkg_full.axes['Time'].edges.min()) + bkg_model = bkg_full/(bkg_full_duration/40) + bkg_model = bkg_model.project(['Em', 'PsiChi', 'Phi']) + + # Process orientation + ori_full = SpacecraftFile.parse_from_file(orientation_path) + grb_ori = ori_full.source_interval(Time(grb_tmin, format = "unix"), Time(grb_tmax, format = "unix")) + + # Clear memory + del bkg_full + del ori_full + _ = gc.collect() + + # Here we will us MOCTSMap instead of FastTSMap, the parameters are same + moc_fit = MOCTSMap(data = data, + bkg_model = bkg_model, + response_path = response_path, + orientation = grb_ori, # we don't need orientation since we are using the precomputed galactic reaponse + cds_frame = "local") + + # Define the true location of the GRB + coord = SkyCoord(l = 93, b = -53, unit = (u.deg, u.deg), frame = "galactic") + + print(f"[compute_ts_map_mulres] Computing TS map...") + + # Define spectrum + index = -2.2 + K = 10 / u.cm / u.cm / u.s / u.keV + piv = 100 * u.keV + spectrum = Powerlaw() + spectrum.index.value = index + spectrum.K.value = K.value + spectrum.piv.value = piv.value + spectrum.K.unit = K.unit + spectrum.piv.unit = piv.unit + + # get a list of hypothesis coordinates to fit. The models will be put on these locations for get the expected counts from the source spectrum. 
+    # note that this nside is also the nside of the final TS map
+
+    # Generate TS map plots
+    try:
+        # here we need to give the maximum map order at which the fit stops and the number of top-likelihood pixels used to upscale the resolution
+        moc_map = moc_fit.moc_ts_fit(max_moc_order = 4,       # this is the maximum order of the final map
+                                     top_number = 8,          # In each iteration, only the pixels with the top 8 likelihood values will be split in the next iteration
+                                     energy_channel = [2,3],  # The energy channel used to perform the fit.
+                                     spectrum = spectrum)
+
+        # plot the raw ts values
+        moc_fit.plot_ts(dpi = 300, save_plot = True, save_dir = data_folder, save_name = "ts_map_multires.png")
+
+        # plot the 90% confidence region
+        # You can see from the plot below, we recover the same 90% containment region as we did in Example 3
+        moc_fit.plot_ts(dpi = 300, containment = 0.9, save_plot = True, save_dir = data_folder, save_name = "ts_map_multires_90containment.png")
+
+        print(f"[compute_ts_map_mulres] TS map data saved")
+
+        print(f"[compute_ts_map_mulres] TS map computation completed successfully!")
+        return data_folder
+
+    except Exception as e:
+        print(f"[compute_ts_map_mulres] Error during TS map computation: {str(e)}")
+        raise e
\ No newline at end of file
diff --git a/plugins/__init__.py b/plugins/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/plugins/data_explorer/data_explorer_plugin.py b/plugins/data_explorer/data_explorer_plugin.py
new file mode 100644
index 0000000..5a837fe
--- /dev/null
+++ b/plugins/data_explorer/data_explorer_plugin.py
@@ -0,0 +1,245 @@
+import os
+import traceback
+import base64
+import mimetypes
+from pathlib import Path
+from airflow.plugins_manager import AirflowPlugin
+from airflow.models import BaseOperator
+from flask import Blueprint, render_template, send_from_directory, redirect, url_for, session, jsonify, abort
+from jinja2 import Environment
+from flask_login import login_required, current_user
+
+# Read the data directory path from the COSI_DATA_DIR environment variable; if it is not set, use the default path
+DL0_FOLDER = os.environ.get("COSI_DATA_DIR", "/home/gamma/workspace/data")
+
+# Define the absolute path to the plugin folder
+plugin_folder = os.path.dirname(os.path.abspath(__file__))
+
+# Blueprint with absolute paths to the templates and static folders
+heasarc_explorer_bp = Blueprint(
+    "heasarc_explorer_bp",
+    __name__,
+    template_folder=os.path.join(plugin_folder, "templates"),
+    static_folder=os.path.join(plugin_folder, "static"),
+    url_prefix='/heasarcbrowser'
+)
+
+@heasarc_explorer_bp.route('/')
+def explorer_home():
+    if not current_user.is_authenticated:
+        return redirect('/login/?next=/heasarcbrowser/')
+    try:
+        folders = sorted([f for f in os.listdir(DL0_FOLDER) if os.path.isdir(os.path.join(DL0_FOLDER, f))])
+        return render_template("explorer.html", folders=folders, current_path=DL0_FOLDER)
+    except PermissionError:
+        abort(403)
+    except Exception as e:
+        error_traceback = traceback.format_exc()
+        return f"Error loading folders: {e}\n\nTraceback:\n{error_traceback}", 500
+
+@heasarc_explorer_bp.route('/folder/') +@login_required +def explorer_folder(foldername): + try: + folder_path = os.path.join(DL0_FOLDER, foldername) + + # Check if the folder path is within the allowed directory + if not os.path.commonpath([DL0_FOLDER, folder_path]).startswith(DL0_FOLDER): + abort(403) + + # Check if the directory exists + if not os.path.exists(folder_path): + return render_template("explorer.html", + folders=[], + files=[], + foldername=foldername, + current_path=folder_path, + error_message=f"Directory '{foldername}' does not exist.") + + # Check if the path is actually a directory + if not os.path.isdir(folder_path): + return render_template("explorer.html", + folders=[], + files=[], + foldername=foldername, + current_path=folder_path, + error_message=f"'{foldername}' is not a directory.") + + # Show all files in the folder, not only pdfs + files = sorted([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]) + folders = sorted([f for f in os.listdir(folder_path) if not os.path.isfile(os.path.join(folder_path, f))]) + + # Add parent directory to folders list if we're not at root + if foldername: + parent_path = os.path.dirname(foldername) + if parent_path and parent_path != foldername: # Not at root + folders.insert(0, "..") # Add parent directory indicator + + return render_template("explorer.html", folders=folders, files=files, foldername=foldername, current_path=folder_path) + except PermissionError: + abort(403) + except Exception as e: + error_traceback = traceback.format_exc() + return f"Error loading files: {e}\n\nTraceback:\n{error_traceback}", 500 + +@heasarc_explorer_bp.route('/download/') +@login_required +def download_file(filepath): + abs_path = os.path.join(DL0_FOLDER, filepath) + folder, filename = os.path.split(abs_path) + return send_from_directory(folder, filename, as_attachment=True) + +@heasarc_explorer_bp.route('/preview/') +@login_required +def preview_file(filepath): + print(f"Preview request for: {filepath}") # Debug logging + try: + abs_path = os.path.join(DL0_FOLDER, filepath) + + # Security check - ensure path is within allowed directory + if not os.path.commonpath([DL0_FOLDER, abs_path]).startswith(DL0_FOLDER): + return jsonify({"error": "Access denied"}), 403 + + # Check if file exists + if not os.path.exists(abs_path): + return jsonify({"error": "File not found"}), 404 + + if not os.path.isfile(abs_path): + return jsonify({"error": "Not a file"}), 400 + + # Get file info + file_size = os.path.getsize(abs_path) + mime_type, _ = mimetypes.guess_type(abs_path) + + # Determine content type + content_type = get_content_type(abs_path, mime_type) + + # For large files, don't load them + if file_size > 10 * 1024 * 1024: # 10MB limit + return jsonify({ + "content_type": "binary", + "size": file_size, + "mime_type": mime_type or "application/octet-stream" + }) + + if content_type == "image": + # Load image as base64 + with open(abs_path, 'rb') as f: + content = base64.b64encode(f.read()).decode('utf-8') + return jsonify({ + "content_type": "image", + "content": content, + "mime_type": mime_type or "application/octet-stream", + "size": file_size + }) + + elif content_type == "text": + # Load text file + try: + with open(abs_path, 'r', encoding='utf-8') as f: + content = f.read() + # Limit text preview to first 50KB + if len(content) > 50000: + content = content[:50000] + "\n... 
(truncated)" + return jsonify({ + "content_type": "text", + "content": content, + "mime_type": mime_type or "text/plain", + "size": file_size + }) + except UnicodeDecodeError: + return jsonify({ + "content_type": "binary", + "size": file_size, + "mime_type": mime_type or "application/octet-stream" + }) + + else: + return jsonify({ + "content_type": "binary", + "size": file_size, + "mime_type": mime_type or "application/octet-stream" + }) + + except Exception as e: + return jsonify({"error": f"Error loading file: {str(e)}"}), 500 + +def get_content_type(filepath, mime_type): + """Determine content type based on file extension and mime type""" + if not mime_type: + mime_type = "application/octet-stream" + + # Image types + if mime_type.startswith('image/'): + return "image" + + # Text types + if (mime_type.startswith('text/') or + mime_type in ['application/json', 'application/xml', 'application/javascript']): + return "text" + + # Check by extension for common text files + ext = os.path.splitext(filepath)[1].lower() + text_extensions = ['.txt', '.py', '.js', '.html', '.css', '.json', '.xml', '.yaml', '.yml', '.md', '.log', '.csv'] + if ext in text_extensions: + return "text" + + # Check by extension for common image files + image_extensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.webp'] + if ext in image_extensions: + return "image" + + return "binary" + + +# Register template function +@heasarc_explorer_bp.app_template_global() +def get_file_icon(filename): + """Get appropriate icon for file type - template global function""" + ext = os.path.splitext(filename)[1].lower() + + # Image files + if ext in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.webp']: + return "🖼️" + + # Text files + if ext in ['.txt', '.md', '.log']: + return "📄" + + # Code files + if ext in ['.py']: + return "🐍" + elif ext in ['.js']: + return "📜" + elif ext in ['.html', '.htm']: + return "🌐" + elif ext in ['.css']: + return "🎨" + elif ext in ['.json']: + return "📋" + elif ext in ['.xml', '.yaml', '.yml']: + return "⚙️" + + # Data files + if ext in ['.csv']: + return "📊" + elif ext in ['.hdf5', '.h5']: + return "🗃️" + elif ext in ['.fits', '.fit']: + return "🔭" + + # Archive files + if ext in ['.zip', '.tar', '.gz', '.rar']: + return "📦" + + # Default + return "📄" + +class DummyOperator(BaseOperator): + def execute(self, context): + pass + +class heasarcExplorerPlugin(AirflowPlugin): + name = "heasarc_explorer_plugin" + operators = [DummyOperator] + flask_blueprints = [heasarc_explorer_bp] \ No newline at end of file diff --git a/plugins/data_explorer/data_explorer_view_plugin.py b/plugins/data_explorer/data_explorer_view_plugin.py new file mode 100644 index 0000000..f02debf --- /dev/null +++ b/plugins/data_explorer/data_explorer_view_plugin.py @@ -0,0 +1,20 @@ +from airflow.plugins_manager import AirflowPlugin +from flask import redirect +from flask_appbuilder import BaseView, expose + +class HEASARCExplorerView(BaseView): + default_view = "redirect_to_heasarc" + + @expose("/") + def redirect_to_heasarc(self): + return redirect("/heasarcbrowser/") + +class HEASARCExplorerViewPlugin(AirflowPlugin): + name = "heasarc_explorer_view_plugin" + appbuilder_views = [ + { + "name": "heasarc Browser", + "category": "Results Browser", + "view": HEASARCExplorerView() + } + ] diff --git a/plugins/data_explorer/templates/explorer.html b/plugins/data_explorer/templates/explorer.html new file mode 100644 index 0000000..b41db0c --- /dev/null +++ b/plugins/data_explorer/templates/explorer.html @@ -0,0 +1,526 @@ + + 
+<!-- explorer.html: "COSI Data Explorer" page with a "Data File Browser" header, an optional
+     "Error: {{ error_message }}" banner, a "Folders:" list, a "Files in {{ foldername }}:" list with
+     per-file download and preview links, and a "File Preview" panel ("Click on a file to preview its
+     contents" / "Select a file to preview its contents"). -->
+ + + + + \ No newline at end of file diff --git a/plugins/mailhog_link/mailhog_link_plugin.py b/plugins/mailhog_link/mailhog_link_plugin.py new file mode 100644 index 0000000..a1eaef2 --- /dev/null +++ b/plugins/mailhog_link/mailhog_link_plugin.py @@ -0,0 +1,15 @@ +from flask import Blueprint, redirect +import os + +# Blueprint empty (no custom routing, we use only the link) +mailhog_bp = Blueprint( + "mailhog_bp", + __name__, + url_prefix="/mailhog" +) + +@mailhog_bp.route('/') +def redirect_to_mailhog(): + # use the environment variable MAILHOG_WEBUI_URL if it is set, otherwise use the default value + mail_server = os.environ.get('MAILHOG_WEBUI_URL', 'http://localhost:8025') + return redirect(mail_server, code=302) \ No newline at end of file diff --git a/plugins/mailhog_link/mailhog_link_view_plugin.py b/plugins/mailhog_link/mailhog_link_view_plugin.py new file mode 100644 index 0000000..12bd730 --- /dev/null +++ b/plugins/mailhog_link/mailhog_link_view_plugin.py @@ -0,0 +1,22 @@ +from airflow.plugins_manager import AirflowPlugin +from flask import redirect +from flask_appbuilder import BaseView, expose +import os + +class MailhogView(BaseView): + default_view = "redirect_to_mailhog" + + @expose("/") + def redirect_to_mailhog(self): + mail_server = os.environ.get('MAILHOG_WEBUI_URL', '"http://localhost:8025"') + return redirect(mail_server) + +class MailhogViewPlugin(AirflowPlugin): + name = "mailhog_view_plugin" + appbuilder_views = [ + { + "name": "Mailhog", + "category": "Develop tools", + "view": MailhogView() + } + ] \ No newline at end of file diff --git a/tutorials/README.md b/tutorials/README.md new file mode 100644 index 0000000..14aa9bc --- /dev/null +++ b/tutorials/README.md @@ -0,0 +1,199 @@ +# COSIfest Mini‑Tutorial — Building Two DAGs in Airflow +Cosiflow / COSI-AIRFLOW + +--- + +## 0) What we’ll build +- **Exercise 1 — Hello World DAG** + - `BashOperator` → create folder & `result.txt` + - `PythonOperator` → append `"Hello Wolrd!"` to the file +- **Exercise 2 — A & B** + - Two DAGs running “in parallel”, communicating via **filesystem** + - **A (ExternalPythonOperator in `cosipy`)**: render 48×48 text → factorize via SVD → save `A`, `B` and plots + - **B (ExternalPythonOperator + PythonSensor)**: wait for `factors.pkl` → reconstruct `A@B` → save plots + +--- + +## 1) Airflow basics recap +- **DAG**: Directed Acyclic Graph — a workflow +- **Task**: a node in the DAG (atomic step) +- **Operators**: + - `BashOperator`: run shell commands + - `PythonOperator`: run a Python callable in the **Airflow** runtime + - `ExternalPythonOperator`: run a Python callable in an **external interpreter** (Conda env) +- **Sensors**: tasks that **wait** for a condition (file exists, external task completes, etc.) + +--- + +## 2) Exercise 1 — Folder & file (BashOperator) +**Goal**: create `/home/gamma/workspace/data/tutorials/result.txt` +```python +from airflow.operators.bash import BashOperator + +make_file = BashOperator( + task_id="make_folder_and_file", + bash_command=( + "mkdir -p /home/gamma/workspace/data/tutorials && " + "touch /home/gamma/workspace/data/tutorials/result.txt" + ), +) +``` +**Tip**: use `mkdir -p` to be idempotent. + +--- + +## 3) Exercise 1 — Append text (PythonOperator) +**Goal**: append `"Hello Wolrd!"` (typo kept) into the file. 
+```python +from airflow.operators.python import PythonOperator +from pathlib import Path + +BASE = Path("/home/gamma/workspace/data/tutorials") +RESULT = BASE / "result.txt" + +def write_hello(): + with open(RESULT, "a", encoding="utf-8") as f: + f.write("Hello Wolrd!\n") + +with DAG( + dag_id="hello_world_dag", + default_args=default_args, + description="Minimal example: Bash touch + Python writes text", + start_date=datetime(2025, 1, 1), + schedule_interval=None, # run on-demand + catchup=False, + tags=["cosifest", "handson", "tutorials"], +) as dag: + + write_text = PythonOperator( + task_id="write_text", + python_callable=write_hello, + ) +``` +**Flow**: `make_file >> write_text` + +--- + +## 4) Run & verify Exercise 1 +- Trigger **hello_world_dag** +- Verify on the host/container: +``` +cat /home/gamma/workspace/data/tutorials/result.txt +``` +- You should see the line `Hello Wolrd!` appended. + +--- + +## 5) Why ExternalPythonOperator for Exercise 2? +- Isolate scientific dependencies in **Conda env** (here: `cosipy`): + - `EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python"` +- Clean separation between orchestration (Airflow) and heavy libs +- Caveat: **no Airflow context** inside external process (fine for this demo) + +--- + +## 6) Exercise 2 — Architecture +**A** (producer): +- Render multiline text into a tiny canvas → 0/1 matrix `X` +- SVD factorization: `X ≈ L @ R` +- Save: `factors.pkl` + plots `factor_L.png`, `factor_R.png` + +**B** (consumer): +- `PythonSensor` waits for `factors.pkl` +- Load `L`, `R`; compute `M = L @ R` +- Save `reconstruction_float.png` and `reconstruction_binary.png` + +Communication: **filesystem** at +`/home/gamma/workspace/data/tutorials/a_b_factor/` + +--- + +## 7) Exercise 2 — A DAG (key pattern) +Create a new DAG in `cosiflow/dags` path. + +```python +from airflow.operators.python import ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +with DAG( + dag_id="a_dag", + default_args=default_args, + description="A: make 32×32 text matrix, factorize via SVD into A,B and save them", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + tags=["cosifest", "handson", "tutorial", "cosipy", "producer", "linalg"], +) as dag: + + a_factorize = ExternalPythonOperator( + task_id="a_factorize_text_matrix", + python=EXTERNAL_PYTHON, + python_callable=_a_make_factors, # defined in the DAG file + op_kwargs={ + "base_dir": "/home/gamma/workspace/data/tutorials/a_b_factor", + "text": "DAGs\n ARE\nCOOL!", + "size": [48, 48], + "font_size": 6, + "rank": 12, + }, + ) +``` +**Rule**: pass everything via **`op_kwargs`** to avoid global‑scope issues. + +Now copy paste the code contained in `cosiflow/tutorials/functions/a_standalone.py` + +--- + +## 8) Exercise 2 — B DAG (sensor + external python) +Create a new DAG in `cosiflow/dags` path. 
+ +```python +from airflow.sensors.python import PythonSensor +from airflow.operators.python import ExternalPythonOperator + +with DAG( + dag_id="b_dag", + default_args=default_args, + description="B: wait for L,R factors, reconstruct L@R and re-plot the original matrix", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + tags=["cosifest", "handson", "tutorial", "consumer", "linalg"], +) as dag: + + wait_for_factors = PythonSensor( + task_id="wait_for_factors_pickle", + python_callable=_file_exists, + op_kwargs={"pkl_path": "/.../factors.pkl"}, + poke_interval=10, timeout=3600, + ) + + b_reconstruct = ExternalPythonOperator( + task_id="b_reconstruct_and_plot", + python=EXTERNAL_PYTHON, + python_callable=_b_reconstruct_and_plot, + op_kwargs={ + "base_dir": "/.../a_b_factor", + "pkl_path": "/.../a_b_factor/factors.pkl", + "bin_thr": 0.5, + }, + ) + + wait_for_factors >> b_reconstruct +``` + +Now copy paste the code contained in `cosiflow/tutorials/functions/b_standalone.py` + +--- + +## 9) Demo flow +1. Trigger **B** first → observe the Sensor waiting +2. Trigger **A** → produces factors & plots +3. B continues → produces reconstructions +4. Show files in the shared folder + +Cleanup (optional): +``` +rm -f /home/gamma/workspace/data/tutorials/a_b_factor/* +``` diff --git a/tutorials/dags/dag_a.py b/tutorials/dags/dag_a.py new file mode 100644 index 0000000..be72913 --- /dev/null +++ b/tutorials/dags/dag_a.py @@ -0,0 +1,175 @@ +# a_dag.py +# Airflow 2.x — Alice: build 32x32 binary text matrix, factorize via SVD, save A,B and plots +from datetime import datetime + +from airflow import DAG +from airflow.operators.python import ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +# Defaults for the demo +TEXT = "DAGs\n ARE\nCOOL!" +SIZE = [48, 48] # pass lists in op_kwargs (safer JSON-serializable) +FONT_SIZE = 6 +RANK = 12 +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" + +def _a_make_factors(base_dir: str, text: str, size: list, font_size: int, rank: int): + """Run entirely in the external 'cosipy' interpreter. + Robustly measure multiline text size across Pillow versions (no draw.textsize). + """ + from pathlib import Path + import pickle + import numpy as np + import matplotlib + matplotlib.use("Agg") # safe non-interactive backend + import matplotlib.pyplot as plt + from PIL import Image, ImageDraw, ImageFont + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + pkl_path = base / "factors.pkl" + img_L = base / "factor_L.png" + img_R = base / "factor_R.png" + + W, H = int(size[0]), int(size[1]) + + # -- Load a mono font if available, otherwise default fallback + try: + font = ImageFont.truetype("DejaVuSansMono.ttf", font_size) + except Exception: + font = ImageFont.load_default() + + # -- Helper: robust multiline text bounding box across Pillow versions + def measure_multiline(draw: ImageDraw.ImageDraw, txt: str, font: ImageFont.ImageFont): + """Return (w, h) for multiline text. 
Tries modern APIs first, falls back gracefully.""" + if hasattr(draw, "multiline_textbbox"): + left, top, right, bottom = draw.multiline_textbbox((0, 0), txt, font=font, align="center") + return (right - left, bottom - top) + if hasattr(draw, "textbbox"): + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + if line == "": + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + widths.append(0) + heights.append(lh) + else: + l, t, r, b = draw.textbbox((0, 0), line, font=font) + widths.append(r - l) + heights.append(b - t) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + # Fallback + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + try: + w_line = draw.textlength(line, font=font) + except Exception: + w_line = max(1, int(len(line) * font.size * 0.6)) + widths.append(int(w_line)) + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + heights.append(lh) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + + # -- 1) Render text -> binary matrix (0 white, 1 black) + img = Image.new("L", (W, H), color=255) + draw = ImageDraw.Draw(img) + + w, h = measure_multiline(draw, text, font) + x = (W - w) // 2 + y = (H - h) // 2 + + if hasattr(draw, "multiline_text"): + draw.multiline_text((x, y), text, fill=0, font=font, align="center") + else: + lines = text.splitlines() or [text] + cur_y = y + for line in lines: + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + draw.text((x, cur_y), line, fill=0, font=font) + cur_y += lh + + arr = np.array(img) + X = (arr < 128).astype(float) # binary 0/1 as float + + # -- 2) SVD factorization: X ≈ (U_k sqrt(S)) (sqrt(S) V_k^T) + U, s, Vt = np.linalg.svd(X, full_matrices=False) + k = max(1, min(int(rank), len(s))) + Uk = U[:, :k] + Sk = np.diag(s[:k]) + Vk = Vt[:k, :] + Ssqrt = np.sqrt(Sk) + L = Uk @ Ssqrt + R = Ssqrt @ Vk + + # -- 3) Persist factors + with open(pkl_path, "wb") as f: + pickle.dump( + { + "L": L.astype("float32"), + "R": R.astype("float32"), + "meta": {"rank": int(k), "size": [W, H], "text": text}, + }, + f, + ) + + # -- 4) Visualize L and R (not binary) + def _plot_matrix(M, out_path, title): + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title(title) + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(out_path) + plt.close() + + _plot_matrix(L, img_L, f"L factor ({W}×{k})") + _plot_matrix(R, img_R, f"R factor ({k}×{H})") + + + + +default_args = { + "owner": "gamma", + "depends_on_past": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, +} + + +with DAG( + dag_id="a_dag", + default_args=default_args, + description="A: make 32×32 text matrix, factorize via SVD into A,B and save them", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + tags=["cosifest", "handson", "tutorial", "cosipy", "producer", "linalg"], +) as dag: + + a_factorize = ExternalPythonOperator( + task_id="a_factorize_text_matrix", + python=EXTERNAL_PYTHON, # interpreter in cosipy env + python_callable=_a_make_factors, # callable executed in external PY + op_kwargs={ + "base_dir": BASE_DIR, + "text": TEXT, + "size": SIZE, + "font_size": FONT_SIZE, + "rank": RANK, + }, + ) diff --git a/tutorials/dags/dag_b.py b/tutorials/dags/dag_b.py new file mode 100644 index 0000000..2b9581e --- /dev/null +++ 
b/tutorials/dags/dag_b.py @@ -0,0 +1,98 @@ +# b_dag.py +# Airflow 2.x — Bob: wait for factors.pkl, reconstruct L@R, plot float and binary images +from datetime import datetime + +from airflow import DAG +from airflow.sensors.python import PythonSensor +from airflow.operators.python import ExternalPythonOperator + +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" +PKL_PATH = f"{BASE_DIR}/factors.pkl" +BIN_THR = 0.5 # threshold to binarize reconstruction + +def _file_exists(pkl_path: str) -> bool: + """Sensor callable: returns True when the pickle file exists.""" + import os + return os.path.exists(pkl_path) + +def _b_reconstruct_and_plot(base_dir: str, pkl_path: str, bin_thr: float): + """Run in external interpreter. Load L,R -> M=L@R; save float & binarized reconstructions.""" + from pathlib import Path + import pickle + import numpy as np + import matplotlib.pyplot as plt + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + img_rec_float = base / "reconstruction_float.png" + img_rec_bin = base / "reconstruction_binary.png" + + with open(pkl_path, "rb") as f: + payload = pickle.load(f) + + L = np.asarray(payload["L"], dtype=float) # (32×k) + R = np.asarray(payload["R"], dtype=float) # (k×32) + + # 1) Reconstruct + M = L @ R + + # 2) Save float heatmap + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title("Reconstruction (float)") + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(img_rec_float) + plt.close() + + # 3) Save binarized heatmap (to match Alice's binary look) + M_bin = (M >= bin_thr).astype(int) + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M_bin, cmap="gray_r", interpolation="nearest") + plt.title(f"Reconstruction (binary, thr={bin_thr})") + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(img_rec_bin) + plt.close() + +default_args = { + "owner": "gamma", + "depends_on_past": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, +} + +with DAG( + dag_id="b_dag", + default_args=default_args, + description="B: wait for L,R factors, reconstruct L@R and re-plot the original matrix", + start_date=datetime(2025, 1, 1), + schedule_interval=None, + catchup=False, + tags=["cosifest", "handson", "tutorial", "consumer", "linalg"], +) as dag: + + wait_for_factors = PythonSensor( + task_id="wait_for_factors_pickle", + python_callable=_file_exists, + op_kwargs={"pkl_path": PKL_PATH}, + poke_interval=10, # seconds + timeout=60 * 60, # 1 hour + mode="poke", + ) + + b_reconstruct = ExternalPythonOperator( + task_id="b_reconstruct_and_plot", + python=EXTERNAL_PYTHON, + python_callable=_b_reconstruct_and_plot, + op_kwargs={ + "base_dir": BASE_DIR, + "pkl_path": PKL_PATH, + "bin_thr": BIN_THR, + }, + ) + + wait_for_factors >> b_reconstruct \ No newline at end of file diff --git a/tutorials/dags/dag_helloworld.py b/tutorials/dags/dag_helloworld.py new file mode 100644 index 0000000..8ffed2b --- /dev/null +++ b/tutorials/dags/dag_helloworld.py @@ -0,0 +1,49 @@ +# hello_world_dag.py +# Airflow 2.x +from datetime import datetime +from pathlib import Path + +from airflow import DAG +from airflow.operators.bash import BashOperator +from airflow.operators.python import PythonOperator + +BASE_DIR = Path("/home/gamma/workspace/data/tutorials") +RESULT_FILE = BASE_DIR / "result.txt" + +def write_hello(): + """Append 'Hello Wolrd!' into result.txt. 
+ Note: the folder/file is guaranteed to exist from the Bash task.""" + with open(RESULT_FILE, "a", encoding="utf-8") as f: + f.write("Hello Wolrd!\n") # intentionally keeping the requested typo + +# Default arguments for the DAG +default_args = { + 'owner': 'gamma', +} + +with DAG( + dag_id="hello_world_dag", + default_args=default_args, + description="Minimal example: Bash touch + Python writes text", + start_date=datetime(2025, 1, 1), + schedule_interval=None, # run on-demand + catchup=False, + tags=["cosifest", "handson", "tutorials"], +) as dag: + + make_file = BashOperator( + task_id="make_folder_and_file", + bash_command=( + f"mkdir -p {BASE_DIR} && " + f"touch {RESULT_FILE}" + ), + # Good practice: fail if any piece fails + env={}, + ) + + write_text = PythonOperator( + task_id="write_text", + python_callable=write_hello, + ) + + make_file >> write_text diff --git a/tutorials/functions/a_standalone.py b/tutorials/functions/a_standalone.py new file mode 100644 index 0000000..7fb4916 --- /dev/null +++ b/tutorials/functions/a_standalone.py @@ -0,0 +1,139 @@ +# a_standalone.py +# Airflow 2.x — Alice: build 32x32 binary text matrix, factorize via SVD, save L,R and plots + +# =========[ ALICE: CONFIG ]========= +# External Python interpreter (your cosipy conda env) +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +# Defaults for the demo +TEXT = "DAGs\n ARE\nCOOL!" +SIZE = [48, 48] # pass lists in op_kwargs (safer JSON-serializable) +FONT_SIZE = 6 +RANK = 12 +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" + +# =========[ ALICE: TASK CALLABLES ]========= +def _a_make_factors(base_dir: str, text: str, size: list, font_size: int, rank: int): + """Run entirely in the external 'cosipy' interpreter. + Robustly measure multiline text size across Pillow versions (no draw.textsize). + """ + from pathlib import Path + import pickle + import numpy as np + import matplotlib + matplotlib.use("Agg") # safe non-interactive backend + import matplotlib.pyplot as plt + from PIL import Image, ImageDraw, ImageFont + + base = Path(base_dir) + base.mkdir(parents=True, exist_ok=True) + pkl_path = base / "factors.pkl" + img_L = base / "factor_L.png" + img_R = base / "factor_R.png" + + W, H = int(size[0]), int(size[1]) + + # -- Load a mono font if available, otherwise default fallback + try: + font = ImageFont.truetype("DejaVuSansMono.ttf", font_size) + except Exception: + font = ImageFont.load_default() + + # -- Helper: robust multiline text bounding box across Pillow versions + def measure_multiline(draw: ImageDraw.ImageDraw, txt: str, font: ImageFont.ImageFont): + """Return (w, h) for multiline text. 
Tries modern APIs first, falls back gracefully.""" + if hasattr(draw, "multiline_textbbox"): + left, top, right, bottom = draw.multiline_textbbox((0, 0), txt, font=font, align="center") + return (right - left, bottom - top) + if hasattr(draw, "textbbox"): + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + if line == "": + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + widths.append(0) + heights.append(lh) + else: + l, t, r, b = draw.textbbox((0, 0), line, font=font) + widths.append(r - l) + heights.append(b - t) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + # Fallback + lines = txt.splitlines() or [txt] + widths, heights = [], [] + for line in lines: + try: + w_line = draw.textlength(line, font=font) + except Exception: + w_line = max(1, int(len(line) * font.size * 0.6)) + widths.append(int(w_line)) + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + heights.append(lh) + return (max(widths) if widths else 0, sum(heights) if heights else 0) + + # -- 1) Render text -> binary matrix (0 white, 1 black) + img = Image.new("L", (W, H), color=255) + draw = ImageDraw.Draw(img) + + w, h = measure_multiline(draw, text, font) + x = (W - w) // 2 + y = (H - h) // 2 + + if hasattr(draw, "multiline_text"): + draw.multiline_text((x, y), text, fill=0, font=font, align="center") + else: + lines = text.splitlines() or [text] + cur_y = y + for line in lines: + try: + ascent, descent = font.getmetrics() + lh = ascent + descent + except Exception: + lh = font.size + draw.text((x, cur_y), line, fill=0, font=font) + cur_y += lh + + arr = np.array(img) + X = (arr < 128).astype(float) # binary 0/1 as float + + # -- 2) SVD factorization: X ≈ (U_k sqrt(S)) (sqrt(S) V_k^T) + U, s, Vt = np.linalg.svd(X, full_matrices=False) + k = max(1, min(int(rank), len(s))) + Uk = U[:, :k] + Sk = np.diag(s[:k]) + Vk = Vt[:k, :] + Ssqrt = np.sqrt(Sk) + L = Uk @ Ssqrt + R = Ssqrt @ Vk + + # -- 3) Persist factors + with open(pkl_path, "wb") as f: + pickle.dump( + { + "L": L.astype("float32"), + "R": R.astype("float32"), + "meta": {"rank": int(k), "size": [W, H], "text": text}, + }, + f, + ) + + # -- 4) Visualize L and R (not binary) + def _plot_matrix(M, out_path, title): + plt.figure(figsize=(4, 4), dpi=120) + plt.imshow(M, cmap="gray_r", interpolation="nearest") + plt.title(title) + plt.axis("off") + plt.tight_layout(pad=0.2) + plt.savefig(out_path) + plt.close() + + _plot_matrix(L, img_L, f"L factor ({W}×{k})") + _plot_matrix(R, img_R, f"R factor ({k}×{H})") diff --git a/tutorials/functions/b_standalone.py b/tutorials/functions/b_standalone.py new file mode 100644 index 0000000..74e4678 --- /dev/null +++ b/tutorials/functions/b_standalone.py @@ -0,0 +1,58 @@ +# b_standalone.py +# Airflow 2.x — Bob: wait for factors.pkl, reconstruct L@R, plot float and binary images + +# =========[ BOB: RECONSTRUCT AND PLOT ]========= +# External Python interpreter (your cosipy conda env) +EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python" + +# Defaults for the demo +BASE_DIR = "/home/gamma/workspace/data/tutorials/a_b_factor" +PKL_PATH = f"{BASE_DIR}/factors.pkl" +BIN_THR = 0.5 # threshold to binarize reconstruction + +# =========[ BOB: SENSOR ]========= +def _file_exists(pkl_path: str) -> bool: + """Sensor callable: returns True when the pickle file exists.""" + import os + return os.path.exists(pkl_path) + +# =========[ BOB: RECONSTRUCT AND PLOT ]========= +def 
_b_reconstruct_and_plot(base_dir: str, pkl_path: str, bin_thr: float):
+    """Run in external interpreter. Load L,R -> M=L@R; save float & binarized reconstructions."""
+    from pathlib import Path
+    import pickle
+    import numpy as np
+    import matplotlib.pyplot as plt
+
+    base = Path(base_dir)
+    base.mkdir(parents=True, exist_ok=True)
+    img_rec_float = base / "reconstruction_float.png"
+    img_rec_bin = base / "reconstruction_binary.png"
+
+    with open(pkl_path, "rb") as f:
+        payload = pickle.load(f)
+
+    L = np.asarray(payload["L"], dtype=float)  # (32×k)
+    R = np.asarray(payload["R"], dtype=float)  # (k×32)
+
+    # 1) Reconstruct
+    M = L @ R
+
+    # 2) Save float heatmap
+    plt.figure(figsize=(4, 4), dpi=120)
+    plt.imshow(M, cmap="gray_r", interpolation="nearest")
+    plt.title("Reconstruction (float)")
+    plt.axis("off")
+    plt.tight_layout(pad=0.2)
+    plt.savefig(img_rec_float)
+    plt.close()
+
+    # 3) Save binarized heatmap (to match Alice's binary look)
+    M_bin = (M >= bin_thr).astype(int)
+    plt.figure(figsize=(4, 4), dpi=120)
+    plt.imshow(M_bin, cmap="gray_r", interpolation="nearest")
+    plt.title(f"Reconstruction (binary, thr={bin_thr})")
+    plt.axis("off")
+    plt.tight_layout(pad=0.2)
+    plt.savefig(img_rec_bin)
+    plt.close()
diff --git a/tutorials/test/example_paths_usage.py b/tutorials/test/example_paths_usage.py
new file mode 100644
index 0000000..00ca012
--- /dev/null
+++ b/tutorials/test/example_paths_usage.py
@@ -0,0 +1,15 @@
+from cosiflow.paths import build_path, file_path, first_match, parse_path, Domain
+
+# Build a canonical path for an output file
+p = file_path(Domain.trigger, year=2027, month=7, entity_id="trg_001",
+              leaf="plots", filename="tsmap_2deg.png")
+print(p)
+# -> cosi/data/trigger/2027_07/trg_001/plots/tsmap_2deg.png
+
+# Find the first file that matches
+found = first_match(Domain.obs, year=2027, month=8,
+                    entity_id="obs_123", leaf="compton", pattern="*.fits")
+
+# Reverse parsing
+info = parse_path(p)
+# -> PathInfo(domain='trigger', year=2027, month=7, entity_id='trg_001', leaf='plots', remainder=('tsmap_2deg.png',))
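
The ops module `pipeline/ts_map/cosipipe_tsmap_ops_cosidag.py` is meant to be called from a DAG through `ExternalPythonOperator`, but the `cosipipe_tsmap` DAG itself is not included in this diff. The sketch below shows one way the first steps could be wired together, following the same `EXTERNAL_PYTHON` / `op_kwargs` pattern used in the tutorial DAGs. The `dag_id`, data folder, input, orientation, and response file names, and the import path of the ops module are illustrative assumptions, not the production configuration.

```python
# Illustrative sketch only: dag_id, paths, and file names are assumptions.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import ExternalPythonOperator

# Assumes the ops module is importable as a package from the DAGs folder.
from pipeline.ts_map.cosipipe_tsmap_ops_cosidag import (
    bin_grb_data,
    bin_background_data,
    compute_ts_map,
)

EXTERNAL_PYTHON = "/home/gamma/.conda/envs/cosipy/bin/python"
DATA_FOLDER = "/home/gamma/workspace/data/ts_map"          # assumed working folder
GRB_UNBINNED = f"{DATA_FOLDER}/grb_unbinned_data.fits.gz"  # assumed input name
BKG_UNBINNED = f"{DATA_FOLDER}/bkg_unbinned_data.fits.gz"  # assumed input name
ORI_FILE = f"{DATA_FOLDER}/spacecraft.ori"                 # assumed orientation file
RESPONSE_FILE = f"{DATA_FOLDER}/response.h5"               # assumed response file

with DAG(
    dag_id="cosipipe_tsmap_sketch",    # not the production dag_id
    start_date=datetime(2025, 1, 1),
    schedule_interval=None,
    catchup=False,
) as dag:

    bin_grb = ExternalPythonOperator(
        task_id="bin_grb_data",
        python=EXTERNAL_PYTHON,
        python_callable=bin_grb_data,
        op_kwargs={"unbinned_file_path": GRB_UNBINNED, "data_folder": DATA_FOLDER},
    )

    bin_bkg = ExternalPythonOperator(
        task_id="bin_background_data",
        python=EXTERNAL_PYTHON,
        python_callable=bin_background_data,
        op_kwargs={"unbinned_file_path": BKG_UNBINNED, "data_folder": DATA_FOLDER},
    )

    ts_map = ExternalPythonOperator(
        task_id="compute_ts_map",
        python=EXTERNAL_PYTHON,
        python_callable=compute_ts_map,
        op_kwargs={
            # binned file paths returned by the two binning tasks, pulled from XCom
            "grb_signal_path": "{{ ti.xcom_pull(task_ids='bin_grb_data') }}",
            "background_path": "{{ ti.xcom_pull(task_ids='bin_background_data') }}",
            "orientation_path": ORI_FILE,
            "response_path": RESPONSE_FILE,
            "data_folder": DATA_FOLDER,
        },
    )

    [bin_grb, bin_bkg] >> ts_map
```

Because the ops functions import their heavy dependencies (`cosipy`, `histpy`, `threeML`) inside the function body and take all inputs via `op_kwargs`, they can run in the external `cosipy` interpreter without relying on DAG-level globals, which is the same constraint highlighted in the tutorial DAGs.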