From 2f7f39c56bc24f714d38257b9f494f9c31ec05d1 Mon Sep 17 00:00:00 2001
From: Chenxi
Date: Wed, 24 Jan 2024 09:56:45 +0000
Subject: [PATCH] replicate

---
 README.md  |  1 +
 cog.yaml   | 16 ++++++++++
 predict.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 cog.yaml
 create mode 100644 predict.py

diff --git a/README.md b/README.md
index a6bfa099..1e08bcd2 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
 Paper PDF
 Project Page
+[![Replicate](https://replicate.com/cjwbw/depth-anything/badge)](https://replicate.com/cjwbw/depth-anything)
 This work presents Depth Anything, a highly practical solution for robust monocular depth estimation by training on a combination of 1.5M labeled images and **62M+ unlabeled images**.
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 00000000..346b3278
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,16 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  system_packages:
+    - "libgl1-mesa-glx"
+    - "libglib2.0-0"
+  python_version: "3.11"
+  python_packages:
+    - "opencv-python==4.9.0.80"
+    - "torch==2.0.1"
+    - "torchvision==0.15.2"
+    - "tqdm==4.66.1"
+    - "huggingface_hub==0.20.3"
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 00000000..a26a25ae
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,85 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torchvision.transforms import Compose
+from cog import BasePredictor, Input, Path
+
+from depth_anything.dpt import DepthAnything
+from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+        encoder_options = ["vits", "vitb", "vitl"]
+        self.device = "cuda:0"
+        model_cache = "model_cache"
+        self.models = {
+            k: DepthAnything.from_pretrained(
+                f"LiheYoung/depth_anything_{k}14", cache_dir=model_cache
+            ).to(self.device)
+            for k in encoder_options
+        }
+        self.total_params = {
+            k: sum(param.numel() for param in self.models[k].parameters())
+            for k in encoder_options
+        }
+
+        self.transform = Compose(
+            [
+                Resize(
+                    width=518,
+                    height=518,
+                    resize_target=False,
+                    keep_aspect_ratio=True,
+                    ensure_multiple_of=14,
+                    resize_method="lower_bound",
+                    image_interpolation_method=cv2.INTER_CUBIC,
+                ),
+                NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+                PrepareForNet(),
+            ]
+        )
+
+    def predict(
+        self,
+        image: Path = Input(description="Input image"),
+        encoder: str = Input(
+            description="Choose an encoder.",
+            default="vitl",
+            choices=["vits", "vitb", "vitl"],
+        ),
+    ) -> Path:
+        """Run a single prediction on the model"""
+        depth_anything = self.models[encoder]
+        total_params = self.total_params[encoder]
+        print("Total parameters: {:.2f}M".format(total_params / 1e6))
+
+        depth_anything.eval()
+
+        raw_image = cv2.imread(str(image))
+        image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
+
+        h, w = image.shape[:2]
+
+        image = self.transform({"image": image})["image"]
+        image = torch.from_numpy(image).unsqueeze(0).to(self.device)
+
+        with torch.no_grad():
+            depth = depth_anything(image)
+
+        depth = F.interpolate(
+            depth[None], (h, w), mode="bilinear", align_corners=False
+        )[0, 0]
+        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+
+        depth = depth.cpu().numpy().astype(np.uint8)
+        depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
+        output_path = "/tmp/out.png"
+        cv2.imwrite(output_path, depth_color)
+
+        return Path(output_path)
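
Once the patch is applied, the usual way to exercise the new predictor locally is through the Cog CLI, e.g. `cog predict -i image=@some_image.png -i encoder=vitl`. The sketch below instead drives the `Predictor` class directly from Python as a minimal smoke test; it is not part of the patch, and it assumes a CUDA GPU, the dependencies pinned in `cog.yaml`, an importable `depth_anything` package from this repo, and a placeholder image path.

```python
# Minimal, hypothetical smoke test for the new predict.py (not part of the patch).
# Assumes a CUDA device, the packages pinned in cog.yaml, and that the
# depth_anything package from this repository is importable; the image path
# below is a placeholder.
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads and caches all three encoders from the Hugging Face Hub

# Run the smallest encoder on a sample image; the result is the path to an
# INFERNO-colormapped depth map written to /tmp/out.png.
output = predictor.predict(image="assets/example.png", encoder="vits")
print(output)
```

Loading all three encoders up front in `setup()` costs a slower cold start and more GPU memory, but it lets one deployed model switch between `vits`, `vitb`, and `vitl` per request without reloading weights.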