From 2f7f39c56bc24f714d38257b9f494f9c31ec05d1 Mon Sep 17 00:00:00 2001
From: Chenxi
Date: Wed, 24 Jan 2024 09:56:45 +0000
Subject: [PATCH] replicate

---
 README.md  |  1 +
 cog.yaml   | 16 ++++++++++
 predict.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100644 cog.yaml
 create mode 100644 predict.py

diff --git a/README.md b/README.md
index a6bfa099..1e08bcd2 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
 Paper PDF
 Project Page
+[![Replicate](https://replicate.com/cjwbw/depth-anything/badge)](https://replicate.com/cjwbw/depth-anything)
 This work presents Depth Anything, a highly practical solution for robust monocular depth estimation by training on a combination of 1.5M labeled images and **62M+ unlabeled images**.
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 00000000..346b3278
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,16 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+  system_packages:
+    - "libgl1-mesa-glx"
+    - "libglib2.0-0"
+  python_version: "3.11"
+  python_packages:
+    - "opencv-python==4.9.0.80"
+    - "torch==2.0.1"
+    - "torchvision==0.15.2"
+    - "tqdm==4.66.1"
+    - "huggingface_hub==0.20.3"
+predict: "predict.py:Predictor"
diff --git a/predict.py b/predict.py
new file mode 100644
index 00000000..a26a25ae
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,85 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torchvision.transforms import Compose
+from cog import BasePredictor, Input, Path
+
+from depth_anything.dpt import DepthAnything
+from depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
+
+
+class Predictor(BasePredictor):
+    def setup(self) -> None:
+        """Load the model into memory to make running multiple predictions efficient"""
+        encoder_options = ["vits", "vitb", "vitl"]
+        self.device = "cuda:0"
+        model_cache = "model_cache"
+        self.models = {
+            k: DepthAnything.from_pretrained(
+                f"LiheYoung/depth_anything_{k}14", cache_dir=model_cache
+            ).to(self.device)
+            for k in encoder_options
+        }
+        self.total_params = {
+            k: sum(param.numel() for param in self.models[k].parameters())
+            for k in encoder_options
+        }
+
+        self.transform = Compose(
+            [
+                Resize(
+                    width=518,
+                    height=518,
+                    resize_target=False,
+                    keep_aspect_ratio=True,
+                    ensure_multiple_of=14,
+                    resize_method="lower_bound",
+                    image_interpolation_method=cv2.INTER_CUBIC,
+                ),
+                NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+                PrepareForNet(),
+            ]
+        )
+
+    def predict(
+        self,
+        image: Path = Input(description="Input image"),
+        encoder: str = Input(
+            description="Choose an encoder.",
+            default="vitl",
+            choices=["vits", "vitb", "vitl"],
+        ),
+    ) -> Path:
+        """Run a single prediction on the model"""
+        depth_anything = self.models[encoder]
+        total_params = self.total_params[encoder]
+        print("Total parameters: {:.2f}M".format(total_params / 1e6))
+
+        depth_anything.eval()
+
+        raw_image = cv2.imread(str(image))
+        image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
+
+        h, w = image.shape[:2]
+
+        image = self.transform({"image": image})["image"]
+        image = torch.from_numpy(image).unsqueeze(0).to(self.device)
+
+        with torch.no_grad():
+            depth = depth_anything(image)
+
+        depth = F.interpolate(
+            depth[None], (h, w), mode="bilinear", align_corners=False
+        )[0, 0]
+        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
+
+        depth = depth.cpu().numpy().astype(np.uint8)
+        depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
+        output_path = "/tmp/out.png"
+        cv2.imwrite(output_path, depth_color)
+
+        return Path(output_path)
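
Once the patch is applied, the usual way to exercise the new predictor locally is through the Cog CLI, e.g. `cog predict -i image=@some_image.png -i encoder=vitl`. The sketch below instead drives the `Predictor` class directly from Python as a minimal smoke test; it is not part of the patch, and it assumes a CUDA GPU, the dependencies pinned in `cog.yaml`, an importable `depth_anything` package from this repo, and a placeholder image path.

```python
# Minimal, hypothetical smoke test for the new predict.py (not part of the patch).
# Assumes a CUDA device, the packages pinned in cog.yaml, and that the
# depth_anything package from this repository is importable; the image path
# below is a placeholder.
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads and caches all three encoders from the Hugging Face Hub

# Run the smallest encoder on a sample image; the result is the path to an
# INFERNO-colormapped depth map written to /tmp/out.png.
output = predictor.predict(image="assets/example.png", encoder="vits")
print(output)
```

Loading all three encoders up front in `setup()` costs a slower cold start and more GPU memory, but it lets one deployed model switch between `vits`, `vitb`, and `vitl` per request without reloading weights.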