diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/BFM/.gitkeep b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/BFM/BFM_exp_idx.mat b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/BFM_exp_idx.mat new file mode 100644 index 00000000..1146e4e9 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/BFM_exp_idx.mat differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/BFM/BFM_front_idx.mat b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/BFM_front_idx.mat new file mode 100644 index 00000000..b9d7b095 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/BFM_front_idx.mat differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/BFM/facemodel_info.mat b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/facemodel_info.mat new file mode 100644 index 00000000..3e516ec7 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/facemodel_info.mat differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/BFM/select_vertex_id.mat b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/select_vertex_id.mat new file mode 100644 index 00000000..5b8b2200 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/select_vertex_id.mat differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/BFM/similarity_Lm3D_all.mat b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/similarity_Lm3D_all.mat new file mode 100644 index 00000000..a0e23588 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/similarity_Lm3D_all.mat differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/BFM/std_exp.txt b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/std_exp.txt new file mode 100644 index 00000000..767b8de4 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/BFM/std_exp.txt @@ -0,0 +1 @@ +453980 257264 263068 211890 135873 184721 47055.6 72732 62787.4 106226 56708.5 51439.8 34887.1 44378.7 51813.4 31030.7 23354.9 23128.1 19400 21827.6 22767.7 22057.4 19894.3 16172.8 17142.7 10035.3 14727.5 12972.5 10763.8 8953.93 8682.62 8941.81 
6342.3 5205.3 7065.65 6083.35 6678.88 4666.63 5082.89 5134.76 4908.16 3964.93 3739.95 3180.09 2470.45 1866.62 1624.71 2423.74 1668.53 1471.65 1194.52 782.102 815.044 835.782 834.937 744.496 575.146 633.76 705.685 753.409 620.306 673.326 766.189 619.866 559.93 357.264 396.472 556.849 455.048 460.592 400.735 326.702 279.428 291.535 326.584 305.664 287.816 283.642 276.19 \ No newline at end of file diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/README.md b/dreamtalk/Deep3DFaceRecon_pytorch/README.md new file mode 100644 index 00000000..cfaeb3eb --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/README.md @@ -0,0 +1,168 @@ +### Deep3DFaceRecon_pytorch + +||| +|:--:|:--:| +| **[论文网址](https://arxiv.org/abs/1903.08527)** | **[GitHub](https://github.com/sicxu/Deep3DFaceRecon_pytorch)** | + + +### 提取 3DMM 参数 + +#### 环境配置 + +`nvdiffrast` 需要与底层交互,环境依赖于 CUDA 驱动 ( 而不是 pytorch 安装的 cuda-tookit ), docker 环境也需要参考 linux 或 WSL 安装的驱动 ( WSL 调用的实际是 windows 的显卡驱动 ) + +> 这个 [issue](https://github.com/sicxu/Deep3DFaceRecon_pytorch/issues/12) 下有 `nvdiffrast` 的替代方案 `pytorch3d` 或许能作为其他解决方案的参考 + +> 或者可以通过这个[项目](https://github.com/yfeng95/DECA)来提取 3DMM 参数 + +> 如果提示 pytorch 支持的算力和当前显卡不匹配需要升级 pytorch + +1. 克隆项目到本地后进行基本配置 + +```bash + conda create -n DP python=3.9 + conda activate DP + conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia + pip install face_alignment + pip install ffmpeg-python + pip install kornia + pip install trimesh + pip install Ninja + + # nvdiffrast 经过个人修改,能在 WSL 支持使用 CUDA 而不是 OpenGL 进行渲染 + # nvdiffrast 在 0.3.0 版本后均支持 CUDA,不过我没找到 3DMM 怎么向 nvdiffrast 传参,或许可以通过附加参数使用其 CUDA 而不是变更代码 + # WSL 上 OpenGL 存在版本限制,无法满足 nvdiffrast 的版本需求,通过微软的强制指令升级后也会与 CUDA 驱动冲突 + # 而项目只要检测到 OpenGL 就会使用 OpenGL 而不是 CUDA + # 因此我修改了 nvdiffrast 的源码,将其改为仅使用 CUDA 进行渲染 + # 在 Linux 主机下使用正常的 nvdiffrast 包 + # git clone --branch v0.3.0 --single-branch https://github.com/NVlabs/nvdiffrast + # WSL 下运行建议直接使用我修改后的包 + cd nvdiffrast + pip install . 
+``` + +按照[教程](https://blog.csdn.net/Sihang_Xie/article/details/127347139)安装 cuda 官网给的安装包后添加环境变量 + +> 很详细的博客,WSL 照着做就是了,云服务器需要选择 cuda=12.4 的预装驱动,要不还要自己升级很麻烦 + +安装 Arcface Pytorch + +```bash + cd .. # ./Deep3DFaceRecon_pytorch + git clone https://github.com/deepinsight/insightface.git + cp -r ./insightface/recognition/arcface_torch ./models/ +``` + +#### 下载预训练模型 + + +下载 [01_MorphableModel.mat](https://faces.dmi.unibas.ch/bfm/main.php?nav=1-2&id=downloads), 需要填信息获得访问权限 + +下载 [Exp_Pca.bin](https://drive.google.com/file/d/1bw5Xf8C12pWmcMhNEu6PtsYVZkVucEN6/view) + +``` +Deep3DFaceRecon_pytorch +│ +└─── BFM + │ + └─── 01_MorphableModel.mat + │ + └─── Exp_Pca.bin + | + └─── ... +``` + +下载 [epoch_20.pth](https://drive.google.com/drive/folders/1liaIxn9smpudjjqMaWWRpP0mXRW_qRPP) + +``` +Deep3DFaceRecon_pytorch +│ +└─── checkpoints + │ + └─── test + │ + └─── epoch_20.pth +``` + +#### 运行 + +将视频放入对应文件夹 `input_dir` 运行即可,我在 16G/i9/4060 下使用的参数如下,在 `face_recon_videos.py` 可以进行第二段命令的参数调整: + +> 需要较多的内存,WSL 环境需要修改 .wslconfig 文件中的限制,评估视频的使用的最多内存大概在 26G 左右 + +```bash +python extract_kp_videos.py --input_dir data/input --output_dir data/keypoint --device_ids 0 --workers 6 +python face_recon_videos.py --input_dir data/input --keypoint_dir data/keypoint --output_dir data/output --inference_batch_size 200 --name=test --epoch=20 --model facerecon +``` + +### 遇到的问题 + +**如图所示** + +![](./img/comment.jpeg) + ++ 项目 `README.md` 给的 nvdiffrast 的拉取链接是错的,已经更新成新版了 + + 修改为 + ```bash + git clone --branch v0.3.0 --single-branch https://github.com/NVlabs/nvdiffrast + ``` + ++ `numpy` 没有属性 `VisibleDeprecationWarning`, 过时版本。。 + + 修改 util/preprocess.py 19 行为(或者删去该行) + ```python + warnings.filterwarnings("ignore") + ``` ++ 按照步骤进行 `conda env create -f environment.yml` 时会出现 "No module named pip" + + 猜测是较新版本 conda 的问题或者是 pip 版本和 python 版本的问题,修复方法参考我在这个 [issue](https://github.com/sicxu/Deep3DFaceRecon_pytorch/issues/184) 下的回答 + + 不过我最终是自己改了环境,因此不会出现这个问题,pass + ++ 忘了具体内容了,反正就是 torch 不支持 sm_89,但是我是 40 系的所以用不了 + + 
自己配置环境,pass + ++ 在 `extract_kp_videos.py` 中 `face_alignment.LandmarksType` 没有._2D 类型 + + 由于是自行下载的 face_alignment ( 忘了是个什么问题所以直接 pip install ),版本较新,PIRender 提供的脚本过时,修改对应位置为 + ```python + self.detector = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D) + ``` + + 顺便把脚本优化了一下 + + ![](./img/pir.jpeg) + ++ ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part + + 数组形状的问题,可能和 python 版本有关( 可能以前这样是合法的,虽然我开始接触 python 就一直是 3.9 所以显然不合法hh ),在 `/util/preprocess.py` 202 行中修复了此错误,如下所示: + + ```python + trans_params = np.array([w0, h0, s, t[0][0], t[1][0]]) + ``` + ++ EGL/egl.h: No such file or directory + + 缺少相应头文件,egl 和 gl 是老问题了, `apt-get install libegl1-mesa-dev` + ++ RuntimeError: OpenGL 4.4 or later is required + + WSL 上的问题,被限制了 OpenGL 的版本,因此 nvdiffrast 无法正常工作,即便按照微软的[官方教程](https://devblogs.microsoft.com/commandline/d3d12-gpu-video-acceleration-in-the-windows-subsystem-for-linux-now-available/)将 WSL 上的 OpenGL 版本升级, 依旧会出现 OpenGL 和 CUDA 冲突的问题。 + + 按照开发者在以往问题下的回答,我得知 nvdiffrast 从 0.3.0 其实就开始支持 CUDA 进行渲染,但是没有找到 3DMM 这个项目使用 nvdiffrast 的传参点,因此我修改了 nvdiffrast 的源代码,将其中涉及 OpenGL 的部分全部修改为 CUDA 进行渲染,最后重新安装 nvdiffrast 库 + + ```bash + cd nvdiffrast + pip install . + ``` + ++ CUDA_HOME environment variable is not set + + 问题来自于 `pytorch/extension-cpp`,使用 pytorch 的这个模组必须安装完整的 CUDA 接受直接底层调用,因此这和预装的驱动 CUDA 息息相关。 + 在之前的 [issue](https://github.com/pytorch/extension-cpp/issues/26) 中找到回答:cudatoolkit probably won't work for you, it doesn't provide access to low level c++ apis. 
You'd need to install CUDA using the official method., 因此需要安装 CUDA 驱动,pytoch 的版本也由此决定,在我的电脑上,环境是pytorch-cuda=12.4 + ++ to be continued + + 记不住了,总之一堆问题,浪费一整天 diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/coeff_detector.py b/dreamtalk/Deep3DFaceRecon_pytorch/coeff_detector.py new file mode 100644 index 00000000..d2f28fdf --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/coeff_detector.py @@ -0,0 +1,118 @@ +import os +import glob +import numpy as np +from os import makedirs, name +from PIL import Image +from tqdm import tqdm + +import torch +import torch.nn as nn + +from options.inference_options import InferenceOptions +from models import create_model +from util.preprocess import align_img +from util.load_mats import load_lm3d +from extract_kp_videos import KeypointExtractor + + +class CoeffDetector(nn.Module): + def __init__(self, opt): + super().__init__() + + self.model = create_model(opt) + self.model.setup(opt) + self.model.device = 'cuda' + self.model.parallelize() + self.model.eval() + + self.lm3d_std = load_lm3d(opt.bfm_folder) + + def forward(self, img, lm): + + img, trans_params = self.image_transform(img, lm) + + data_input = { + 'imgs': img[None], + } + self.model.set_input(data_input) + self.model.test() + pred_coeff = {key:self.model.pred_coeffs_dict[key].cpu().numpy() for key in self.model.pred_coeffs_dict} + pred_coeff = np.concatenate([ + pred_coeff['id'], + pred_coeff['exp'], + pred_coeff['tex'], + pred_coeff['angle'], + pred_coeff['gamma'], + pred_coeff['trans'], + trans_params[None], + ], 1) + + return {'coeff_3dmm':pred_coeff, + 'crop_img': Image.fromarray((img.cpu().permute(1, 2, 0).numpy()*255).astype(np.uint8))} + + def image_transform(self, images, lm): + """ + param: + images: -- PIL image + lm: -- numpy array + """ + W,H = images.size + if np.mean(lm) == -1: + lm = (self.lm3d_std[:, :2]+1)/2. 
+ lm = np.concatenate( + [lm[:, :1]*W, lm[:, 1:2]*H], 1 + ) + else: + lm[:, -1] = H - 1 - lm[:, -1] + + trans_params, img, lm, _ = align_img(images, lm, self.lm3d_std) + img = torch.tensor(np.array(img)/255., dtype=torch.float32).permute(2, 0, 1) + trans_params = np.array([float(item) for item in np.hsplit(trans_params, 5)]) + trans_params = torch.tensor(trans_params.astype(np.float32)) + return img, trans_params + +def get_data_path(root, keypoint_root): + filenames = list() + keypoint_filenames = list() + + IMAGE_EXTENSIONS_LOWERCASE = {'jpg', 'png', 'jpeg', 'webp'} + IMAGE_EXTENSIONS = IMAGE_EXTENSIONS_LOWERCASE.union({f.upper() for f in IMAGE_EXTENSIONS_LOWERCASE}) + extensions = IMAGE_EXTENSIONS + + for ext in extensions: + filenames += glob.glob(f'{root}/*.{ext}', recursive=True) + filenames = sorted(filenames) + for filename in filenames: + name = os.path.splitext(os.path.basename(filename))[0] + keypoint_filenames.append( + os.path.join(keypoint_root, name + '.txt') + ) + return filenames, keypoint_filenames + + +if __name__ == "__main__": + opt = InferenceOptions().parse() + coeff_detector = CoeffDetector(opt) + kp_extractor = KeypointExtractor() + image_names, keypoint_names = get_data_path(opt.input_dir, opt.keypoint_dir) + makedirs(opt.keypoint_dir, exist_ok=True) + makedirs(opt.output_dir, exist_ok=True) + + for image_name, keypoint_name in tqdm(zip(image_names, keypoint_names)): + image = Image.open(image_name) + if not os.path.isfile(keypoint_name): + lm = kp_extractor.extract_keypoint(image, keypoint_name) + else: + lm = np.loadtxt(keypoint_name).astype(np.float32) + lm = lm.reshape([-1, 2]) + predicted = coeff_detector(image, lm) + name = os.path.splitext(os.path.basename(image_name))[0] + np.savetxt( + "{}/{}_3dmm_coeff.txt".format(opt.output_dir, name), + predicted['coeff_3dmm'].reshape(-1)) + + + + + + \ No newline at end of file diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/data/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/data/__init__.py 
new file mode 100644 index 00000000..56fe2126 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/data/__init__.py @@ -0,0 +1,116 @@ +"""This package includes all the modules related to data loading and preprocessing + + To add a custom dataset class called 'dummy', you need to add a file called 'dummy_dataset.py' and define a subclass 'DummyDataset' inherited from BaseDataset. + You need to implement four functions: + -- <__init__>: initialize the class, first call BaseDataset.__init__(self, opt). + -- <__len__>: return the size of dataset. + -- <__getitem__>: get a data point from data loader. + -- : (optionally) add dataset-specific options and set default options. + +Now you can use the dataset class by specifying flag '--dataset_mode dummy'. +See our template dataset class 'template_dataset.py' for more details. +""" +import numpy as np +import importlib +import torch.utils.data +from data.base_dataset import BaseDataset + + +def find_dataset_using_name(dataset_name): + """Import the module "data/[dataset_name]_dataset.py". + + In the file, the class called DatasetNameDataset() will + be instantiated. It has to be a subclass of BaseDataset, + and it is case-insensitive. + """ + dataset_filename = "data." + dataset_name + "_dataset" + datasetlib = importlib.import_module(dataset_filename) + + dataset = None + target_dataset_name = dataset_name.replace('_', '') + 'dataset' + for name, cls in datasetlib.__dict__.items(): + if name.lower() == target_dataset_name.lower() \ + and issubclass(cls, BaseDataset): + dataset = cls + + if dataset is None: + raise NotImplementedError("In %s.py, there should be a subclass of BaseDataset with class name that matches %s in lowercase." 
% (dataset_filename, target_dataset_name)) + + return dataset + + +def get_option_setter(dataset_name): + """Return the static method of the dataset class.""" + dataset_class = find_dataset_using_name(dataset_name) + return dataset_class.modify_commandline_options + + +def create_dataset(opt, rank=0): + """Create a dataset given the option. + + This function wraps the class CustomDatasetDataLoader. + This is the main interface between this package and 'train.py'/'test.py' + + Example: + >>> from data import create_dataset + >>> dataset = create_dataset(opt) + """ + data_loader = CustomDatasetDataLoader(opt, rank=rank) + dataset = data_loader.load_data() + return dataset + +class CustomDatasetDataLoader(): + """Wrapper class of Dataset class that performs multi-threaded data loading""" + + def __init__(self, opt, rank=0): + """Initialize this class + + Step 1: create a dataset instance given the name [dataset_mode] + Step 2: create a multi-threaded data loader. + """ + self.opt = opt + dataset_class = find_dataset_using_name(opt.dataset_mode) + self.dataset = dataset_class(opt) + self.sampler = None + print("rank %d %s dataset [%s] was created" % (rank, self.dataset.name, type(self.dataset).__name__)) + if opt.use_ddp and opt.isTrain: + world_size = opt.world_size + self.sampler = torch.utils.data.distributed.DistributedSampler( + self.dataset, + num_replicas=world_size, + rank=rank, + shuffle=not opt.serial_batches + ) + self.dataloader = torch.utils.data.DataLoader( + self.dataset, + sampler=self.sampler, + num_workers=int(opt.num_threads / world_size), + batch_size=int(opt.batch_size / world_size), + drop_last=True) + else: + self.dataloader = torch.utils.data.DataLoader( + self.dataset, + batch_size=opt.batch_size, + shuffle=(not opt.serial_batches) and opt.isTrain, + num_workers=int(opt.num_threads), + drop_last=True + ) + + def set_epoch(self, epoch): + self.dataset.current_epoch = epoch + if self.sampler is not None: + self.sampler.set_epoch(epoch) + + def 
load_data(self): + return self + + def __len__(self): + """Return the number of data in the dataset""" + return min(len(self.dataset), self.opt.max_dataset_size) + + def __iter__(self): + """Return a batch of data""" + for i, data in enumerate(self.dataloader): + if i * self.opt.batch_size >= self.opt.max_dataset_size: + break + yield data diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/data/base_dataset.py b/dreamtalk/Deep3DFaceRecon_pytorch/data/base_dataset.py new file mode 100644 index 00000000..1275d606 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/data/base_dataset.py @@ -0,0 +1,131 @@ +"""This module implements an abstract base class (ABC) 'BaseDataset' for datasets. + +It also includes common transformation functions (e.g., get_transform, __scale_width), which can be later used in subclasses. +""" +import random +import numpy as np +import torch.utils.data as data +from PIL import Image +try: + from PIL.Image import Resampling + RESAMPLING_METHOD = Resampling.BICUBIC +except ImportError: + from PIL.Image import BICUBIC + RESAMPLING_METHOD = BICUBIC +import torchvision.transforms as transforms +from abc import ABC, abstractmethod + + +class BaseDataset(data.Dataset, ABC): + """This class is an abstract base class (ABC) for datasets. + + To create a subclass, you need to implement the following four functions: + -- <__init__>: initialize the class, first call BaseDataset.__init__(self, opt). + -- <__len__>: return the size of dataset. + -- <__getitem__>: get a data point. + -- : (optionally) add dataset-specific options and set default options. 
+ """ + + def __init__(self, opt): + """Initialize the class; save the options in the class + + Parameters: + opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions + """ + self.opt = opt + # self.root = opt.dataroot + self.current_epoch = 0 + + @staticmethod + def modify_commandline_options(parser, is_train): + """Add new dataset-specific options, and rewrite default values for existing options. + + Parameters: + parser -- original option parser + is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options. + + Returns: + the modified parser. + """ + return parser + + @abstractmethod + def __len__(self): + """Return the total number of images in the dataset.""" + return 0 + + @abstractmethod + def __getitem__(self, index): + """Return a data point and its metadata information. + + Parameters: + index - - a random integer for data indexing + + Returns: + a dictionary of data with their names. It ususally contains the data itself and its metadata information. 
+ """ + pass + + +def get_transform(grayscale=False): + transform_list = [] + if grayscale: + transform_list.append(transforms.Grayscale(1)) + transform_list += [transforms.ToTensor()] + return transforms.Compose(transform_list) + +def get_affine_mat(opt, size): + shift_x, shift_y, scale, rot_angle, flip = 0., 0., 1., 0., False + w, h = size + + if 'shift' in opt.preprocess: + shift_pixs = int(opt.shift_pixs) + shift_x = random.randint(-shift_pixs, shift_pixs) + shift_y = random.randint(-shift_pixs, shift_pixs) + if 'scale' in opt.preprocess: + scale = 1 + opt.scale_delta * (2 * random.random() - 1) + if 'rot' in opt.preprocess: + rot_angle = opt.rot_angle * (2 * random.random() - 1) + rot_rad = -rot_angle * np.pi/180 + if 'flip' in opt.preprocess: + flip = random.random() > 0.5 + + shift_to_origin = np.array([1, 0, -w//2, 0, 1, -h//2, 0, 0, 1]).reshape([3, 3]) + flip_mat = np.array([-1 if flip else 1, 0, 0, 0, 1, 0, 0, 0, 1]).reshape([3, 3]) + shift_mat = np.array([1, 0, shift_x, 0, 1, shift_y, 0, 0, 1]).reshape([3, 3]) + rot_mat = np.array([np.cos(rot_rad), np.sin(rot_rad), 0, -np.sin(rot_rad), np.cos(rot_rad), 0, 0, 0, 1]).reshape([3, 3]) + scale_mat = np.array([scale, 0, 0, 0, scale, 0, 0, 0, 1]).reshape([3, 3]) + shift_to_center = np.array([1, 0, w//2, 0, 1, h//2, 0, 0, 1]).reshape([3, 3]) + + affine = shift_to_center @ scale_mat @ rot_mat @ shift_mat @ flip_mat @ shift_to_origin + affine_inv = np.linalg.inv(affine) + return affine, affine_inv, flip + +def apply_img_affine(img, affine_inv, method=RESAMPLING_METHOD): + return img.transform(img.size, Image.AFFINE, data=affine_inv.flatten()[:6], resample=RESAMPLING_METHOD) + +def apply_lm_affine(landmark, affine, flip, size): + _, h = size + lm = landmark.copy() + lm[:, 1] = h - 1 - lm[:, 1] + lm = np.concatenate((lm, np.ones([lm.shape[0], 1])), -1) + lm = lm @ np.transpose(affine) + lm[:, :2] = lm[:, :2] / lm[:, 2:] + lm = lm[:, :2] + lm[:, 1] = h - 1 - lm[:, 1] + if flip: + lm_ = lm.copy() + lm_[:17] = 
lm[16::-1] + lm_[17:22] = lm[26:21:-1] + lm_[22:27] = lm[21:16:-1] + lm_[31:36] = lm[35:30:-1] + lm_[36:40] = lm[45:41:-1] + lm_[40:42] = lm[47:45:-1] + lm_[42:46] = lm[39:35:-1] + lm_[46:48] = lm[41:39:-1] + lm_[48:55] = lm[54:47:-1] + lm_[55:60] = lm[59:54:-1] + lm_[60:65] = lm[64:59:-1] + lm_[65:68] = lm[67:64:-1] + lm = lm_ + return lm diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/data/flist_dataset.py b/dreamtalk/Deep3DFaceRecon_pytorch/data/flist_dataset.py new file mode 100644 index 00000000..c0b6945c --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/data/flist_dataset.py @@ -0,0 +1,125 @@ +"""This script defines the custom dataset for Deep3DFaceRecon_pytorch +""" + +import os.path +from data.base_dataset import BaseDataset, get_transform, get_affine_mat, apply_img_affine, apply_lm_affine +from data.image_folder import make_dataset +from PIL import Image +import random +import util.util as util +import numpy as np +import json +import torch +from scipy.io import loadmat, savemat +import pickle +from util.preprocess import align_img, estimate_norm +from util.load_mats import load_lm3d + + +def default_flist_reader(flist): + """ + flist format: impath label\nimpath label\n ...(same to caffe's filelist) + """ + imlist = [] + with open(flist, 'r') as rf: + for line in rf.readlines(): + impath = line.strip() + imlist.append(impath) + + return imlist + +def jason_flist_reader(flist): + with open(flist, 'r') as fp: + info = json.load(fp) + return info + +def parse_label(label): + return torch.tensor(np.array(label).astype(np.float32)) + + +class FlistDataset(BaseDataset): + """ + It requires one directories to host training images '/path/to/data/train' + You can train the model with the dataset flag '--dataroot /path/to/data'. + """ + + def __init__(self, opt): + """Initialize this dataset class. 
+ + Parameters: + opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions + """ + BaseDataset.__init__(self, opt) + + self.lm3d_std = load_lm3d(opt.bfm_folder) + + msk_names = default_flist_reader(opt.flist) + self.msk_paths = [os.path.join(opt.data_root, i) for i in msk_names] + + self.size = len(self.msk_paths) + self.opt = opt + + self.name = 'train' if opt.isTrain else 'val' + if '_' in opt.flist: + self.name += '_' + opt.flist.split(os.sep)[-1].split('_')[0] + + + def __getitem__(self, index): + """Return a data point and its metadata information. + + Parameters: + index (int) -- a random integer for data indexing + + Returns a dictionary that contains A, B, A_paths and B_paths + img (tensor) -- an image in the input domain + msk (tensor) -- its corresponding attention mask + lm (tensor) -- its corresponding 3d landmarks + im_paths (str) -- image paths + aug_flag (bool) -- a flag used to tell whether its raw or augmented + """ + msk_path = self.msk_paths[index % self.size] # make sure index is within then range + img_path = msk_path.replace('mask/', '') + lm_path = '.'.join(msk_path.replace('mask', 'landmarks').split('.')[:-1]) + '.txt' + + raw_img = Image.open(img_path).convert('RGB') + raw_msk = Image.open(msk_path).convert('RGB') + raw_lm = np.loadtxt(lm_path).astype(np.float32) + + _, img, lm, msk = align_img(raw_img, raw_lm, self.lm3d_std, raw_msk) + + aug_flag = self.opt.use_aug and self.opt.isTrain + if aug_flag: + img, lm, msk = self._augmentation(img, lm, self.opt, msk) + + _, H = img.size + M = estimate_norm(lm, H) + transform = get_transform() + img_tensor = transform(img) + msk_tensor = transform(msk)[:1, ...] 
+ lm_tensor = parse_label(lm) + M_tensor = parse_label(M) + + + return {'imgs': img_tensor, + 'lms': lm_tensor, + 'msks': msk_tensor, + 'M': M_tensor, + 'im_paths': img_path, + 'aug_flag': aug_flag, + 'dataset': self.name} + + def _augmentation(self, img, lm, opt, msk=None): + affine, affine_inv, flip = get_affine_mat(opt, img.size) + img = apply_img_affine(img, affine_inv) + lm = apply_lm_affine(lm, affine, flip, img.size) + if msk is not None: + msk = apply_img_affine(msk, affine_inv, method=Image.BILINEAR) + return img, lm, msk + + + + + def __len__(self): + """Return the total number of images in the dataset. + """ + return self.size diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/data/image_folder.py b/dreamtalk/Deep3DFaceRecon_pytorch/data/image_folder.py new file mode 100644 index 00000000..efadc2ec --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/data/image_folder.py @@ -0,0 +1,66 @@ +"""A modified image folder class + +We modify the official PyTorch image folder (https://github.com/pytorch/vision/blob/master/torchvision/datasets/folder.py) +so that this class can load images from both current directory and its subdirectories. 
+""" +import numpy as np +import torch.utils.data as data + +from PIL import Image +import os +import os.path + +IMG_EXTENSIONS = [ + '.jpg', '.JPG', '.jpeg', '.JPEG', + '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', + '.tif', '.TIF', '.tiff', '.TIFF', +] + + +def is_image_file(filename): + return any(filename.endswith(extension) for extension in IMG_EXTENSIONS) + + +def make_dataset(dir, max_dataset_size=float("inf")): + images = [] + assert os.path.isdir(dir) or os.path.islink(dir), '%s is not a valid directory' % dir + + for root, _, fnames in sorted(os.walk(dir, followlinks=True)): + for fname in fnames: + if is_image_file(fname): + path = os.path.join(root, fname) + images.append(path) + return images[:min(max_dataset_size, len(images))] + + +def default_loader(path): + return Image.open(path).convert('RGB') + + +class ImageFolder(data.Dataset): + + def __init__(self, root, transform=None, return_paths=False, + loader=default_loader): + imgs = make_dataset(root) + if len(imgs) == 0: + raise(RuntimeError("Found 0 images in: " + root + "\n" + "Supported image extensions are: " + ",".join(IMG_EXTENSIONS))) + + self.root = root + self.imgs = imgs + self.transform = transform + self.return_paths = return_paths + self.loader = loader + + def __getitem__(self, index): + path = self.imgs[index] + img = self.loader(path) + if self.transform is not None: + img = self.transform(img) + if self.return_paths: + return img, path + else: + return img + + def __len__(self): + return len(self.imgs) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/data/input/demo2.mp4 b/dreamtalk/Deep3DFaceRecon_pytorch/data/input/demo2.mp4 new file mode 100644 index 00000000..45711ed9 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/data/input/demo2.mp4 differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/data/template_dataset.py b/dreamtalk/Deep3DFaceRecon_pytorch/data/template_dataset.py new file mode 100644 index 00000000..bfdf16be --- /dev/null +++ 
b/dreamtalk/Deep3DFaceRecon_pytorch/data/template_dataset.py @@ -0,0 +1,75 @@ +"""Dataset class template + +This module provides a template for users to implement custom datasets. +You can specify '--dataset_mode template' to use this dataset. +The class name should be consistent with both the filename and its dataset_mode option. +The filename should be _dataset.py +The class name should be Dataset.py +You need to implement the following functions: + -- : Add dataset-specific options and rewrite default values for existing options. + -- <__init__>: Initialize this dataset class. + -- <__getitem__>: Return a data point and its metadata information. + -- <__len__>: Return the number of images. +""" +from data.base_dataset import BaseDataset, get_transform +# from data.image_folder import make_dataset +# from PIL import Image + + +class TemplateDataset(BaseDataset): + """A template dataset class for you to implement custom datasets.""" + @staticmethod + def modify_commandline_options(parser, is_train): + """Add new dataset-specific options, and rewrite default values for existing options. + + Parameters: + parser -- original option parser + is_train (bool) -- whether training phase or test phase. You can use this flag to add training-specific or test-specific options. + + Returns: + the modified parser. + """ + parser.add_argument('--new_dataset_option', type=float, default=1.0, help='new dataset option') + parser.set_defaults(max_dataset_size=10, new_dataset_option=2.0) # specify dataset-specific default values + return parser + + def __init__(self, opt): + """Initialize this dataset class. + + Parameters: + opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions + + A few things can be done here. + - save the options (have been done in BaseDataset) + - get image paths and meta information of the dataset. + - define the image transformation. 
+ """ + # save the option and dataset root + BaseDataset.__init__(self, opt) + # get the image paths of your dataset; + self.image_paths = [] # You can call sorted(make_dataset(self.root, opt.max_dataset_size)) to get all the image paths under the directory self.root + # define the default transform function. You can use ; You can also define your custom transform function + self.transform = get_transform(opt) + + def __getitem__(self, index): + """Return a data point and its metadata information. + + Parameters: + index -- a random integer for data indexing + + Returns: + a dictionary of data with their names. It usually contains the data itself and its metadata information. + + Step 1: get a random image path: e.g., path = self.image_paths[index] + Step 2: load your data from the disk: e.g., image = Image.open(path).convert('RGB'). + Step 3: convert your data to a PyTorch tensor. You can use helpder functions such as self.transform. e.g., data = self.transform(image) + Step 4: return a data point as a dictionary. 
+ """ + path = 'temp' # needs to be a string + data_A = None # needs to be a tensor + data_B = None # needs to be a tensor + return {'data_A': data_A, 'data_B': data_B, 'path': path} + + def __len__(self): + """Return the total number of images.""" + return len(self.image_paths) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/data_preparation.py b/dreamtalk/Deep3DFaceRecon_pytorch/data_preparation.py new file mode 100644 index 00000000..6ffc79d3 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/data_preparation.py @@ -0,0 +1,45 @@ +"""This script is the data preparation script for Deep3DFaceRecon_pytorch +""" + +import os +import numpy as np +import argparse +from util.detect_lm68 import detect_68p,load_lm_graph +from util.skin_mask import get_skin_mask +from util.generate_list import check_list, write_list +import warnings +warnings.filterwarnings("ignore") + +parser = argparse.ArgumentParser() +parser.add_argument('--data_root', type=str, default='datasets', help='root directory for training data') +parser.add_argument('--img_folder', nargs="+", required=True, help='folders of training images') +parser.add_argument('--mode', type=str, default='train', help='train or val') +opt = parser.parse_args() + +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + +def data_prepare(folder_list,mode): + + lm_sess,input_op,output_op = load_lm_graph('./checkpoints/lm_model/68lm_detector.pb') # load a tensorflow version 68-landmark detector + + for img_folder in folder_list: + detect_68p(img_folder,lm_sess,input_op,output_op) # detect landmarks for images + get_skin_mask(img_folder) # generate skin attention mask for images + + # create files that record path to all training data + msks_list = [] + for img_folder in folder_list: + path = os.path.join(img_folder, 'mask') + msks_list += ['/'.join([img_folder, 'mask', i]) for i in sorted(os.listdir(path)) if 'jpg' in i or + 'png' in i or 'jpeg' in i or 'PNG' in i] + + imgs_list = [i.replace('mask/', '') for i in msks_list] + lms_list = 
[i.replace('mask', 'landmarks') for i in msks_list] + lms_list = ['.'.join(i.split('.')[:-1]) + '.txt' for i in lms_list] + + lms_list_final, imgs_list_final, msks_list_final = check_list(lms_list, imgs_list, msks_list) # check if the path is valid + write_list(lms_list_final, imgs_list_final, msks_list_final, mode=mode) # save files + +if __name__ == '__main__': + print('Datasets:',opt.img_folder) + data_prepare([os.path.join(opt.data_root,folder) for folder in opt.img_folder],opt.mode) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/extract_kp_videos.py b/dreamtalk/Deep3DFaceRecon_pytorch/extract_kp_videos.py new file mode 100644 index 00000000..4801b660 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/extract_kp_videos.py @@ -0,0 +1,106 @@ +import os +import cv2 +import time +import glob +import argparse +import face_alignment +import numpy as np +from PIL import Image +from tqdm import tqdm +from itertools import cycle + +from torch.multiprocessing import Pool, Process, set_start_method + +class KeypointExtractor(): + def __init__(self): + self.detector = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D) + +class KeypointExtractor(): + def __init__(self): + self.detector = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D) + # Replaced outdated call methods + # self.detector = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D) + + def extract_keypoint(self, images, name=None): + if isinstance(images, list): + keypoints = [] + for image in images: + current_kp = self.extract_keypoint(image) + if np.mean(current_kp) == -1 and keypoints: + keypoints.append(keypoints[-1]) + else: + keypoints.append(current_kp[None]) + + keypoints = np.concatenate(keypoints, 0) + np.savetxt(os.path.splitext(name)[0]+'.txt', keypoints.reshape(-1)) + return keypoints + else: + while True: + try: + keypoints = self.detector.get_landmarks_from_image(np.array(images))[0] + break + except RuntimeError as e: + if str(e).startswith('CUDA'): + 
print("Warning: out of memory, sleep for 1s") + time.sleep(1) + else: + print(e) + break + except TypeError: + print('No face detected in this image') + shape = [68, 2] + keypoints = -1. * np.ones(shape) + break + if name is not None: + np.savetxt(os.path.splitext(name)[0]+'.txt', keypoints.reshape(-1)) + return keypoints + + +def read_video(filename): + frames = [] + cap = cv2.VideoCapture(filename) + while cap.isOpened(): + ret, frame = cap.read() + if ret: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = Image.fromarray(frame) + frames.append(frame) + else: + break + cap.release() + return frames + +def run(data): + filename, opt, device = data + os.environ['CUDA_VISIBLE_DEVICES'] = device + kp_extractor = KeypointExtractor() + images = read_video(filename) + name = os.path.basename(filename).split('.')[0] # More robust path handling + output_dir = os.path.join(opt.output_dir, name) + os.makedirs(output_dir, exist_ok=True) + kp_extractor.extract_keypoint( + images, + name=os.path.join(output_dir, name) + ) + +if __name__ == '__main__': + set_start_method('spawn') + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--input_dir', type=str, help='the folder of the input files') + parser.add_argument('--output_dir', type=str, help='the folder of the output files') + parser.add_argument('--device_ids', type=str, default='0,1') + parser.add_argument('--workers', type=int, default=4) + + opt = parser.parse_args() + filenames = [] + VIDEO_EXTENSIONS = {'mp4'} + for ext in VIDEO_EXTENSIONS: + filenames.extend(sorted(glob.glob(f'{opt.input_dir}/**/*.{ext}', recursive=True))) + + print('Total number of videos:', len(filenames)) + pool = Pool(opt.workers) + args_list = cycle([opt]) + device_ids = opt.device_ids.split(",") + device_ids = cycle(device_ids) + for data in tqdm(pool.imap_unordered(run, zip(filenames, args_list, device_ids))): + None diff --git 
a/dreamtalk/Deep3DFaceRecon_pytorch/face_recon_videos.py b/dreamtalk/Deep3DFaceRecon_pytorch/face_recon_videos.py new file mode 100644 index 00000000..addd4cd8 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/face_recon_videos.py @@ -0,0 +1,156 @@ +import os +import cv2 +import glob +import numpy as np +from PIL import Image +from tqdm import tqdm +from scipy.io import savemat + +import torch + +from models import create_model +from options.inference_options import InferenceOptions +from util.preprocess import align_img +from util.load_mats import load_lm3d +from util.util import mkdirs, tensor2im, save_image + + +def get_data_path(root, keypoint_root): + filenames = list() + keypoint_filenames = list() + + VIDEO_EXTENSIONS_LOWERCASE = {'mp4'} + VIDEO_EXTENSIONS = VIDEO_EXTENSIONS_LOWERCASE.union({f.upper() for f in VIDEO_EXTENSIONS_LOWERCASE}) + extensions = VIDEO_EXTENSIONS + + for ext in extensions: + filenames += glob.glob(f'{root}/**/*.{ext}', recursive=True) + filenames = sorted(filenames) + keypoint_filenames = sorted(glob.glob(f'{keypoint_root}/**/*.txt', recursive=True)) + assert len(filenames) == len(keypoint_filenames) + + return filenames, keypoint_filenames + +class VideoPathDataset(torch.utils.data.Dataset): + def __init__(self, filenames, txt_filenames, bfm_folder): + self.filenames = filenames + self.txt_filenames = txt_filenames + self.lm3d_std = load_lm3d(bfm_folder) + + def __len__(self): + return len(self.filenames) + + def __getitem__(self, index): + filename = self.filenames[index] + txt_filename = self.txt_filenames[index] + frames = self.read_video(filename) + lm = np.loadtxt(txt_filename).astype(np.float32) + lm = lm.reshape([len(frames), -1, 2]) + out_images, out_trans_params = list(), list() + for i in range(len(frames)): + out_img, _, out_trans_param \ + = self.image_transform(frames[i], lm[i]) + out_images.append(out_img[None]) + out_trans_params.append(out_trans_param[None]) + return { + 'imgs': torch.cat(out_images, 0), + 
'trans_param':torch.cat(out_trans_params, 0), + 'filename': filename + } + + def read_video(self, filename): + frames = list() + cap = cv2.VideoCapture(filename) + while cap.isOpened(): + ret, frame = cap.read() + if ret: + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frame = Image.fromarray(frame) + frames.append(frame) + else: + break + cap.release() + return frames + + def image_transform(self, images, lm): + W,H = images.size + if np.mean(lm) == -1: + lm = (self.lm3d_std[:, :2]+1)/2. + lm = np.concatenate( + [lm[:, :1]*W, lm[:, 1:2]*H], 1 + ) + else: + lm[:, -1] = H - 1 - lm[:, -1] + + trans_params, img, lm, _ = align_img(images, lm, self.lm3d_std) + img = torch.tensor(np.array(img)/255., dtype=torch.float32).permute(2, 0, 1) + lm = torch.tensor(lm) + trans_params = np.array([float(item) for item in np.hsplit(trans_params, 5)]) + trans_params = torch.tensor(trans_params.astype(np.float32)) + return img, lm, trans_params + +def main(opt, model): + import torch.multiprocessing + torch.multiprocessing.set_sharing_strategy('file_system') + filenames, keypoint_filenames = get_data_path(opt.input_dir, opt.keypoint_dir) + dataset = VideoPathDataset(filenames, keypoint_filenames, opt.bfm_folder) + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=1, # can noly set to one here! 
+ shuffle=False, + drop_last=False, + num_workers=8, + ) + batch_size = opt.inference_batch_size + for data in tqdm(dataloader): + num_batch = data['imgs'][0].shape[0] // batch_size + 1 + pred_coeffs = list() + for index in range(num_batch): + data_input = { + 'imgs': data['imgs'][0,index*batch_size:(index+1)*batch_size], + } + model.set_input(data_input) + model.test() + pred_coeff = {key:model.pred_coeffs_dict[key].cpu().numpy() for key in model.pred_coeffs_dict} + pred_coeff = np.concatenate([ + pred_coeff['id'], + pred_coeff['exp'], + pred_coeff['tex'], + pred_coeff['angle'], + pred_coeff['gamma'], + pred_coeff['trans']], 1) + pred_coeffs.append(pred_coeff) + visuals = model.get_current_visuals() # get image results + if False: # debug + for name in visuals: + images = visuals[name] + for i in range(images.shape[0]): + image_numpy = tensor2im(images[i]) + save_image( + image_numpy, + os.path.join( + opt.output_dir, + os.path.basename(data['filename'][0])+str(i).zfill(5)+'.jpg') + ) + exit() + + pred_coeffs = np.concatenate(pred_coeffs, 0) + pred_trans_params = data['trans_param'][0].cpu().numpy() + name = data['filename'][0].split('/')[-2:] + name[-1] = os.path.splitext(name[-1])[0] + '.mat' + os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True) + savemat( + os.path.join(opt.output_dir, name[-2], name[-1]), + {'coeff':pred_coeffs, 'transform_params':pred_trans_params} + ) + +if __name__ == '__main__': + opt = InferenceOptions().parse() # get test options + model = create_model(opt) + model.setup(opt) + model.device = 'cuda:0' + model.parallelize() + model.eval() + + main(opt, model) + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/img/comment.jpeg b/dreamtalk/Deep3DFaceRecon_pytorch/img/comment.jpeg new file mode 100644 index 00000000..d2dfafa8 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/img/comment.jpeg differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/img/pir.jpeg b/dreamtalk/Deep3DFaceRecon_pytorch/img/pir.jpeg new file 
mode 100644 index 00000000..e5daa177 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/img/pir.jpeg differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/models/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/models/__init__.py new file mode 100644 index 00000000..fc01113d --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/models/__init__.py @@ -0,0 +1,67 @@ +"""This package contains modules related to objective functions, optimizations, and network architectures. + +To add a custom model class called 'dummy', you need to add a file called 'dummy_model.py' and define a subclass DummyModel inherited from BaseModel. +You need to implement the following five functions: + -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). + -- : unpack data from dataset and apply preprocessing. + -- : produce intermediate results. + -- : calculate loss, gradients, and update network weights. + -- : (optionally) add model-specific options and set default options. + +In the function <__init__>, you need to define four lists: + -- self.loss_names (str list): specify the training losses that you want to plot and save. + -- self.model_names (str list): define networks used in our training. + -- self.visual_names (str list): specify the images that you want to display and save. + -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an usage. + +Now you can use the model class by specifying flag '--model dummy'. +See our template model class 'template_model.py' for more details. +""" + +import importlib +from models.base_model import BaseModel + + +def find_model_using_name(model_name): + """Import the module "models/[model_name]_model.py". + + In the file, the class called DatasetNameModel() will + be instantiated. 
It has to be a subclass of BaseModel, + and it is case-insensitive. + """ + model_filename = "models." + model_name + "_model" + modellib = importlib.import_module(model_filename) + model = None + target_model_name = model_name.replace('_', '') + 'model' + for name, cls in modellib.__dict__.items(): + if name.lower() == target_model_name.lower() \ + and issubclass(cls, BaseModel): + model = cls + + if model is None: + print("In %s.py, there should be a subclass of BaseModel with class name that matches %s in lowercase." % (model_filename, target_model_name)) + exit(0) + + return model + + +def get_option_setter(model_name): + """Return the static method of the model class.""" + model_class = find_model_using_name(model_name) + return model_class.modify_commandline_options + + +def create_model(opt): + """Create a model given the option. + + This function warps the class CustomDatasetDataLoader. + This is the main interface between this package and 'train.py'/'test.py' + + Example: + >>> from models import create_model + >>> model = create_model(opt) + """ + model = find_model_using_name(opt.model) + instance = model(opt) + print("model [%s] was created" % type(instance).__name__) + return instance diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/models/base_model.py b/dreamtalk/Deep3DFaceRecon_pytorch/models/base_model.py new file mode 100644 index 00000000..2a05d3a0 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/models/base_model.py @@ -0,0 +1,316 @@ +"""This script defines the base network model for Deep3DFaceRecon_pytorch +""" + +import os +import numpy as np +import torch +from collections import OrderedDict +from abc import ABC, abstractmethod +from . import networks + + +class BaseModel(ABC): + """This class is an abstract base class (ABC) for models. + To create a subclass, you need to implement the following five functions: + -- <__init__>: initialize the class; first call BaseModel.__init__(self, opt). 
+ -- : unpack data from dataset and apply preprocessing. + -- : produce intermediate results. + -- : calculate losses, gradients, and update network weights. + -- : (optionally) add model-specific options and set default options. + """ + + def __init__(self, opt): + """Initialize the BaseModel class. + + Parameters: + opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions + + When creating your custom class, you need to implement your own initialization. + In this fucntion, you should first call + Then, you need to define four lists: + -- self.loss_names (str list): specify the training losses that you want to plot and save. + -- self.model_names (str list): specify the images that you want to display and save. + -- self.visual_names (str list): define networks used in our training. + -- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example. + """ + self.opt = opt + self.isTrain = opt.isTrain + self.device = torch.device('cpu') + self.save_dir = os.path.join(opt.checkpoints_dir, opt.name) # save all the checkpoints to save_dir + self.loss_names = [] + self.model_names = [] + self.visual_names = [] + self.parallel_names = [] + self.optimizers = [] + self.image_paths = [] + self.metric = 0 # used for learning rate policy 'plateau' + + @staticmethod + def dict_grad_hook_factory(add_func=lambda x: x): + saved_dict = dict() + + def hook_gen(name): + def grad_hook(grad): + saved_vals = add_func(grad) + saved_dict[name] = saved_vals + return grad_hook + return hook_gen, saved_dict + + @staticmethod + def modify_commandline_options(parser, is_train): + """Add new model-specific options, and rewrite default values for existing options. + + Parameters: + parser -- original option parser + is_train (bool) -- whether training phase or test phase. 
You can use this flag to add training-specific or test-specific options. + + Returns: + the modified parser. + """ + return parser + + @abstractmethod + def set_input(self, input): + """Unpack input data from the dataloader and perform necessary pre-processing steps. + + Parameters: + input (dict): includes the data itself and its metadata information. + """ + pass + + @abstractmethod + def forward(self): + """Run forward pass; called by both functions and .""" + pass + + @abstractmethod + def optimize_parameters(self): + """Calculate losses, gradients, and update network weights; called in every training iteration""" + pass + + def setup(self, opt): + """Load and print networks; create schedulers + + Parameters: + opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions + """ + if self.isTrain: + self.schedulers = [networks.get_scheduler(optimizer, opt) for optimizer in self.optimizers] + + if not self.isTrain or opt.continue_train: + load_suffix = opt.epoch + self.load_networks(load_suffix) + + + # self.print_networks(opt.verbose) + + def parallelize(self, convert_sync_batchnorm=True): + if not self.opt.use_ddp: + for name in self.parallel_names: + if isinstance(name, str): + module = getattr(self, name) + setattr(self, name, module.to(self.device)) + else: + for name in self.model_names: + if isinstance(name, str): + module = getattr(self, name) + if convert_sync_batchnorm: + module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module) + setattr(self, name, torch.nn.parallel.DistributedDataParallel(module.to(self.device), + device_ids=[self.device.index], + find_unused_parameters=True, broadcast_buffers=True)) + + # DistributedDataParallel is not needed when a module doesn't have any parameter that requires a gradient. 
+ for name in self.parallel_names: + if isinstance(name, str) and name not in self.model_names: + module = getattr(self, name) + setattr(self, name, module.to(self.device)) + + # put state_dict of optimizer to gpu device + if self.opt.phase != 'test': + if self.opt.continue_train: + for optim in self.optimizers: + for state in optim.state.values(): + for k, v in state.items(): + if isinstance(v, torch.Tensor): + state[k] = v.to(self.device) + + def data_dependent_initialize(self, data): + pass + + def train(self): + """Make models train mode""" + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + net.train() + + def eval(self): + """Make models eval mode""" + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + net.eval() + + def test(self): + """Forward function used in test time. + + This function wraps function in no_grad() so we don't save intermediate steps for backprop + It also calls to produce additional visualization results + """ + with torch.no_grad(): + self.forward() + self.compute_visuals() + + def compute_visuals(self): + """Calculate additional output images for visdom and HTML visualization""" + pass + + def get_image_paths(self, name='A'): + """ Return image paths that are used to load current data""" + return self.image_paths if name =='A' else self.image_paths_B + + def update_learning_rate(self): + """Update learning rates for all the networks; called at the end of every epoch""" + for scheduler in self.schedulers: + if self.opt.lr_policy == 'plateau': + scheduler.step(self.metric) + else: + scheduler.step() + + lr = self.optimizers[0].param_groups[0]['lr'] + print('learning rate = %.7f' % lr) + + def get_current_visuals(self): + """Return visualization images. 
train.py will display these images with visdom, and save the images to a HTML""" + visual_ret = OrderedDict() + for name in self.visual_names: + if isinstance(name, str): + visual_ret[name] = getattr(self, name)[:, :3, ...] + return visual_ret + + def get_current_losses(self): + """Return traning losses / errors. train.py will print out these errors on console, and save them to a file""" + errors_ret = OrderedDict() + for name in self.loss_names: + if isinstance(name, str): + errors_ret[name] = float(getattr(self, 'loss_' + name)) # float(...) works for both scalar tensor and float number + return errors_ret + + def save_networks(self, epoch): + """Save all the networks to the disk. + + Parameters: + epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) + """ + if not os.path.isdir(self.save_dir): + os.makedirs(self.save_dir) + + save_filename = 'epoch_%s.pth' % (epoch) + save_path = os.path.join(self.save_dir, save_filename) + + save_dict = {} + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + if isinstance(net, torch.nn.DataParallel) or isinstance(net, + torch.nn.parallel.DistributedDataParallel): + net = net.module + save_dict[name] = net.state_dict() + + + for i, optim in enumerate(self.optimizers): + save_dict['opt_%02d'%i] = optim.state_dict() + + for i, sched in enumerate(self.schedulers): + save_dict['sched_%02d'%i] = sched.state_dict() + + torch.save(save_dict, save_path) + + def __patch_instance_norm_state_dict(self, state_dict, module, keys, i=0): + """Fix InstanceNorm checkpoints incompatibility (prior to 0.4)""" + key = keys[i] + if i + 1 == len(keys): # at the end, pointing to a parameter/buffer + if module.__class__.__name__.startswith('InstanceNorm') and \ + (key == 'running_mean' or key == 'running_var'): + if getattr(module, key) is None: + state_dict.pop('.'.join(keys)) + if module.__class__.__name__.startswith('InstanceNorm') and \ + (key == 'num_batches_tracked'): + 
state_dict.pop('.'.join(keys)) + else: + self.__patch_instance_norm_state_dict(state_dict, getattr(module, key), keys, i + 1) + + def load_networks(self, epoch): + """Load all the networks from the disk. + + Parameters: + epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name) + """ + if self.opt.isTrain and self.opt.pretrained_name is not None: + load_dir = os.path.join(self.opt.checkpoints_dir, self.opt.pretrained_name) + else: + load_dir = self.save_dir + load_filename = 'epoch_%s.pth' % (epoch) + load_path = os.path.join(load_dir, load_filename) + state_dict = torch.load(load_path, map_location=self.device) + print('loading the model from %s' % load_path) + + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + if isinstance(net, torch.nn.DataParallel): + net = net.module + net.load_state_dict(state_dict[name]) + + if self.opt.phase != 'test': + if self.opt.continue_train: + print('loading the optim from %s' % load_path) + for i, optim in enumerate(self.optimizers): + optim.load_state_dict(state_dict['opt_%02d'%i]) + + try: + print('loading the sched from %s' % load_path) + for i, sched in enumerate(self.schedulers): + sched.load_state_dict(state_dict['sched_%02d'%i]) + except: + print('Failed to load schedulers, set schedulers according to epoch count manually') + for i, sched in enumerate(self.schedulers): + sched.last_epoch = self.opt.epoch_count - 1 + + + + + def print_networks(self, verbose): + """Print the total number of parameters in the network and (if verbose) network architecture + + Parameters: + verbose (bool) -- if verbose: print the network architecture + """ + print('---------- Networks initialized -------------') + for name in self.model_names: + if isinstance(name, str): + net = getattr(self, name) + num_params = 0 + for param in net.parameters(): + num_params += param.numel() + if verbose: + print(net) + print('[Network %s] Total number of parameters : %.3f M' % (name, num_params 
/ 1e6)) + print('-----------------------------------------------') + + def set_requires_grad(self, nets, requires_grad=False): + """Set requies_grad=Fasle for all the networks to avoid unnecessary computations + Parameters: + nets (network list) -- a list of networks + requires_grad (bool) -- whether the networks require gradients or not + """ + if not isinstance(nets, list): + nets = [nets] + for net in nets: + if net is not None: + for param in net.parameters(): + param.requires_grad = requires_grad + + def generate_visuals_for_evaluation(self, data, mode): + return {} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/models/bfm.py b/dreamtalk/Deep3DFaceRecon_pytorch/models/bfm.py new file mode 100644 index 00000000..33a27e7d --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/models/bfm.py @@ -0,0 +1,299 @@ +"""This script defines the parametric 3d face model for Deep3DFaceRecon_pytorch +""" + +import numpy as np +import torch +import torch.nn.functional as F +from scipy.io import loadmat +from util.load_mats import transferBFM09 +import os + +def perspective_projection(focal, center): + # return p.T (N, 3) @ (3, 3) + return np.array([ + focal, 0, center, + 0, focal, center, + 0, 0, 1 + ]).reshape([3, 3]).astype(np.float32).transpose() + +class SH: + def __init__(self): + self.a = [np.pi, 2 * np.pi / np.sqrt(3.), 2 * np.pi / np.sqrt(8.)] + self.c = [1/np.sqrt(4 * np.pi), np.sqrt(3.) / np.sqrt(4 * np.pi), 3 * np.sqrt(5.) / np.sqrt(12 * np.pi)] + + + +class ParametricFaceModel: + def __init__(self, + bfm_folder='./BFM', + recenter=True, + camera_distance=10., + init_lit=np.array([ + 0.8, 0, 0, 0, 0, 0, 0, 0, 0 + ]), + focal=1015., + center=112., + is_train=True, + default_name='BFM_model_front.mat'): + + if not os.path.isfile(os.path.join(bfm_folder, default_name)): + transferBFM09(bfm_folder) + model = loadmat(os.path.join(bfm_folder, default_name)) + # mean face shape. [3*N,1] + self.mean_shape = model['meanshape'].astype(np.float32) + # identity basis. 
[3*N,80] + self.id_base = model['idBase'].astype(np.float32) + # expression basis. [3*N,64] + self.exp_base = model['exBase'].astype(np.float32) + # mean face texture. [3*N,1] (0-255) + self.mean_tex = model['meantex'].astype(np.float32) + # texture basis. [3*N,80] + self.tex_base = model['texBase'].astype(np.float32) + # face indices for each vertex that lies in. starts from 0. [N,8] + self.point_buf = model['point_buf'].astype(np.int64) - 1 + # vertex indices for each face. starts from 0. [F,3] + self.face_buf = model['tri'].astype(np.int64) - 1 + # vertex indices for 68 landmarks. starts from 0. [68,1] + self.keypoints = np.squeeze(model['keypoints']).astype(np.int64) - 1 + + if is_train: + # vertex indices for small face region to compute photometric error. starts from 0. + self.front_mask = np.squeeze(model['frontmask2_idx']).astype(np.int64) - 1 + # vertex indices for each face from small face region. starts from 0. [f,3] + self.front_face_buf = model['tri_mask2'].astype(np.int64) - 1 + # vertex indices for pre-defined skin region to compute reflectance loss + self.skin_mask = np.squeeze(model['skinmask']) + + if recenter: + mean_shape = self.mean_shape.reshape([-1, 3]) + mean_shape = mean_shape - np.mean(mean_shape, axis=0, keepdims=True) + self.mean_shape = mean_shape.reshape([-1, 1]) + + self.persc_proj = perspective_projection(focal, center) + self.device = 'cpu' + self.camera_distance = camera_distance + self.SH = SH() + self.init_lit = init_lit.reshape([1, 1, -1]).astype(np.float32) + + + def to(self, device): + self.device = device + for key, value in self.__dict__.items(): + if type(value).__module__ == np.__name__: + setattr(self, key, torch.tensor(value).to(device)) + + + def compute_shape(self, id_coeff, exp_coeff): + """ + Return: + face_shape -- torch.tensor, size (B, N, 3) + + Parameters: + id_coeff -- torch.tensor, size (B, 80), identity coeffs + exp_coeff -- torch.tensor, size (B, 64), expression coeffs + """ + batch_size = id_coeff.shape[0] + 
id_part = torch.einsum('ij,aj->ai', self.id_base, id_coeff) + exp_part = torch.einsum('ij,aj->ai', self.exp_base, exp_coeff) + face_shape = id_part + exp_part + self.mean_shape.reshape([1, -1]) + return face_shape.reshape([batch_size, -1, 3]) + + + def compute_texture(self, tex_coeff, normalize=True): + """ + Return: + face_texture -- torch.tensor, size (B, N, 3), in RGB order, range (0, 1.) + + Parameters: + tex_coeff -- torch.tensor, size (B, 80) + """ + batch_size = tex_coeff.shape[0] + face_texture = torch.einsum('ij,aj->ai', self.tex_base, tex_coeff) + self.mean_tex + if normalize: + face_texture = face_texture / 255. + return face_texture.reshape([batch_size, -1, 3]) + + + def compute_norm(self, face_shape): + """ + Return: + vertex_norm -- torch.tensor, size (B, N, 3) + + Parameters: + face_shape -- torch.tensor, size (B, N, 3) + """ + + v1 = face_shape[:, self.face_buf[:, 0]] + v2 = face_shape[:, self.face_buf[:, 1]] + v3 = face_shape[:, self.face_buf[:, 2]] + e1 = v1 - v2 + e2 = v2 - v3 + face_norm = torch.cross(e1, e2, dim=-1) + face_norm = F.normalize(face_norm, dim=-1, p=2) + face_norm = torch.cat([face_norm, torch.zeros(face_norm.shape[0], 1, 3).to(self.device)], dim=1) + + vertex_norm = torch.sum(face_norm[:, self.point_buf], dim=2) + vertex_norm = F.normalize(vertex_norm, dim=-1, p=2) + return vertex_norm + + + def compute_color(self, face_texture, face_norm, gamma): + """ + Return: + face_color -- torch.tensor, size (B, N, 3), range (0, 1.) + + Parameters: + face_texture -- torch.tensor, size (B, N, 3), from texture model, range (0, 1.) 
+ face_norm -- torch.tensor, size (B, N, 3), rotated face normal + gamma -- torch.tensor, size (B, 27), SH coeffs + """ + batch_size = gamma.shape[0] + v_num = face_texture.shape[1] + a, c = self.SH.a, self.SH.c + gamma = gamma.reshape([batch_size, 3, 9]) + gamma = gamma + self.init_lit + gamma = gamma.permute(0, 2, 1) + Y = torch.cat([ + a[0] * c[0] * torch.ones_like(face_norm[..., :1]).to(self.device), + -a[1] * c[1] * face_norm[..., 1:2], + a[1] * c[1] * face_norm[..., 2:], + -a[1] * c[1] * face_norm[..., :1], + a[2] * c[2] * face_norm[..., :1] * face_norm[..., 1:2], + -a[2] * c[2] * face_norm[..., 1:2] * face_norm[..., 2:], + 0.5 * a[2] * c[2] / np.sqrt(3.) * (3 * face_norm[..., 2:] ** 2 - 1), + -a[2] * c[2] * face_norm[..., :1] * face_norm[..., 2:], + 0.5 * a[2] * c[2] * (face_norm[..., :1] ** 2 - face_norm[..., 1:2] ** 2) + ], dim=-1) + r = Y @ gamma[..., :1] + g = Y @ gamma[..., 1:2] + b = Y @ gamma[..., 2:] + face_color = torch.cat([r, g, b], dim=-1) * face_texture + return face_color + + + def compute_rotation(self, angles): + """ + Return: + rot -- torch.tensor, size (B, 3, 3) pts @ trans_mat + + Parameters: + angles -- torch.tensor, size (B, 3), radian + """ + + batch_size = angles.shape[0] + ones = torch.ones([batch_size, 1]).to(self.device) + zeros = torch.zeros([batch_size, 1]).to(self.device) + x, y, z = angles[:, :1], angles[:, 1:2], angles[:, 2:], + + rot_x = torch.cat([ + ones, zeros, zeros, + zeros, torch.cos(x), -torch.sin(x), + zeros, torch.sin(x), torch.cos(x) + ], dim=1).reshape([batch_size, 3, 3]) + + rot_y = torch.cat([ + torch.cos(y), zeros, torch.sin(y), + zeros, ones, zeros, + -torch.sin(y), zeros, torch.cos(y) + ], dim=1).reshape([batch_size, 3, 3]) + + rot_z = torch.cat([ + torch.cos(z), -torch.sin(z), zeros, + torch.sin(z), torch.cos(z), zeros, + zeros, zeros, ones + ], dim=1).reshape([batch_size, 3, 3]) + + rot = rot_z @ rot_y @ rot_x + return rot.permute(0, 2, 1) + + + def to_camera(self, face_shape): + face_shape[..., -1] = 
self.camera_distance - face_shape[..., -1] + return face_shape + + def to_image(self, face_shape): + """ + Return: + face_proj -- torch.tensor, size (B, N, 2), y direction is opposite to v direction + + Parameters: + face_shape -- torch.tensor, size (B, N, 3) + """ + # to image_plane + face_proj = face_shape @ self.persc_proj + face_proj = face_proj[..., :2] / face_proj[..., 2:] + + return face_proj + + + def transform(self, face_shape, rot, trans): + """ + Return: + face_shape -- torch.tensor, size (B, N, 3) pts @ rot + trans + + Parameters: + face_shape -- torch.tensor, size (B, N, 3) + rot -- torch.tensor, size (B, 3, 3) + trans -- torch.tensor, size (B, 3) + """ + return face_shape @ rot + trans.unsqueeze(1) + + + def get_landmarks(self, face_proj): + """ + Return: + face_lms -- torch.tensor, size (B, 68, 2) + + Parameters: + face_proj -- torch.tensor, size (B, N, 2) + """ + return face_proj[:, self.keypoints] + + def split_coeff(self, coeffs): + """ + Return: + coeffs_dict -- a dict of torch.tensors + + Parameters: + coeffs -- torch.tensor, size (B, 256) + """ + id_coeffs = coeffs[:, :80] + exp_coeffs = coeffs[:, 80: 144] + tex_coeffs = coeffs[:, 144: 224] + angles = coeffs[:, 224: 227] + gammas = coeffs[:, 227: 254] + translations = coeffs[:, 254:] + return { + 'id': id_coeffs, + 'exp': exp_coeffs, + 'tex': tex_coeffs, + 'angle': angles, + 'gamma': gammas, + 'trans': translations + } + def compute_for_render(self, coeffs): + """ + Return: + face_vertex -- torch.tensor, size (B, N, 3), in camera coordinate + face_color -- torch.tensor, size (B, N, 3), in RGB order + landmark -- torch.tensor, size (B, 68, 2), y direction is opposite to v direction + Parameters: + coeffs -- torch.tensor, size (B, 257) + """ + coef_dict = self.split_coeff(coeffs) + face_shape = self.compute_shape(coef_dict['id'], coef_dict['exp']) + rotation = self.compute_rotation(coef_dict['angle']) + + + face_shape_transformed = self.transform(face_shape, rotation, coef_dict['trans']) + 
face_vertex = self.to_camera(face_shape_transformed) + + face_proj = self.to_image(face_vertex) + landmark = self.get_landmarks(face_proj) + + face_texture = self.compute_texture(coef_dict['tex']) + face_norm = self.compute_norm(face_shape) + face_norm_roted = face_norm @ rotation + face_color = self.compute_color(face_texture, face_norm_roted, coef_dict['gamma']) + + return face_vertex, face_texture, face_color, landmark diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/models/facerecon_model.py b/dreamtalk/Deep3DFaceRecon_pytorch/models/facerecon_model.py new file mode 100644 index 00000000..dfaaea92 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/models/facerecon_model.py @@ -0,0 +1,228 @@ +"""This script defines the face reconstruction model for Deep3DFaceRecon_pytorch +""" + +import numpy as np +import torch +from .base_model import BaseModel +from . import networks +from .bfm import ParametricFaceModel +from .losses import perceptual_loss, photo_loss, reg_loss, reflectance_loss, landmark_loss +from util import util +from util.nvdiffrast import MeshRenderer +from util.preprocess import estimate_norm_torch + +import trimesh +from scipy.io import savemat + +class FaceReconModel(BaseModel): + + @staticmethod + def modify_commandline_options(parser, is_train=True): + """ Configures options specific for CUT model + """ + # net structure and parameters + parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], help='network structure') + parser.add_argument('--init_path', type=str, default='checkpoints/init_model/resnet50-0676ba61.pth') + parser.add_argument('--use_last_fc', type=util.str2bool, nargs='?', const=True, default=False, help='zero initialize the last fc') + parser.add_argument('--bfm_folder', type=str, default='BFM') + parser.add_argument('--bfm_model', type=str, default='BFM_model_front.mat', help='bfm model') + + # renderer parameters + parser.add_argument('--focal', type=float, default=1015.) 
+ parser.add_argument('--center', type=float, default=112.)
+ parser.add_argument('--camera_d', type=float, default=10.)
+ parser.add_argument('--z_near', type=float, default=5.)
+ parser.add_argument('--z_far', type=float, default=15.)
+ parser.add_argument('--use_opengl', type=util.str2bool, nargs='?', const=True, default=True, help='use opengl context or not')
+
+ if is_train:
+ # training parameters
+ parser.add_argument('--net_recog', type=str, default='r50', choices=['r18', 'r34', 'r50'], help='face recog network structure')
+ parser.add_argument('--net_recog_path', type=str, default='checkpoints/recog_model/ms1mv3_arcface_r50_fp16/backbone.pth')
+ parser.add_argument('--use_crop_face', type=util.str2bool, nargs='?', const=True, default=False, help='use crop mask for photo loss')
+ parser.add_argument('--use_predef_M', type=util.str2bool, nargs='?', const=True, default=False, help='use predefined M for predicted face')
+
+
+ # augmentation parameters
+ parser.add_argument('--shift_pixs', type=float, default=10., help='shift pixels')
+ parser.add_argument('--scale_delta', type=float, default=0.1, help='delta scale factor')
+ parser.add_argument('--rot_angle', type=float, default=10., help='rot angles, degree')
+
+ # loss weights
+ parser.add_argument('--w_feat', type=float, default=0.2, help='weight for feat loss')
+ parser.add_argument('--w_color', type=float, default=1.92, help='weight for color loss')
+ parser.add_argument('--w_reg', type=float, default=3.0e-4, help='weight for reg loss')
+ parser.add_argument('--w_id', type=float, default=1.0, help='weight for id_reg loss')
+ parser.add_argument('--w_exp', type=float, default=0.8, help='weight for exp_reg loss')
+ parser.add_argument('--w_tex', type=float, default=1.7e-2, help='weight for tex_reg loss')
+ parser.add_argument('--w_gamma', type=float, default=10.0, help='weight for gamma loss')
+ parser.add_argument('--w_lm', type=float, default=1.6e-3, help='weight for lm loss')
+
parser.add_argument('--w_reflc', type=float, default=5.0, help='weight for reflc loss') + + + + opt, _ = parser.parse_known_args() + parser.set_defaults( + focal=1015., center=112., camera_d=10., use_last_fc=False, z_near=5., z_far=15. + ) + if is_train: + parser.set_defaults( + use_crop_face=True, use_predef_M=False + ) + return parser + + def __init__(self, opt): + """Initialize this model class. + + Parameters: + opt -- training/test options + + A few things can be done here. + - (required) call the initialization function of BaseModel + - define loss function, visualization images, model names, and optimizers + """ + BaseModel.__init__(self, opt) # call the initialization method of BaseModel + + self.visual_names = ['output_vis'] + self.model_names = ['net_recon'] + self.parallel_names = self.model_names + ['renderer'] + + self.net_recon = networks.define_net_recon( + net_recon=opt.net_recon, use_last_fc=opt.use_last_fc, init_path=opt.init_path + ) + + self.facemodel = ParametricFaceModel( + bfm_folder=opt.bfm_folder, camera_distance=opt.camera_d, focal=opt.focal, center=opt.center, + is_train=self.isTrain, default_name=opt.bfm_model + ) + + fov = 2 * np.arctan(opt.center / opt.focal) * 180 / np.pi + self.renderer = MeshRenderer( + rasterize_fov=fov, znear=opt.z_near, zfar=opt.z_far, rasterize_size=int(2 * opt.center), use_opengl=opt.use_opengl + ) + + if self.isTrain: + self.loss_names = ['all', 'feat', 'color', 'lm', 'reg', 'gamma', 'reflc'] + + self.net_recog = networks.define_net_recog( + net_recog=opt.net_recog, pretrained_path=opt.net_recog_path + ) + # loss func name: (compute_%s_loss) % loss_name + self.compute_feat_loss = perceptual_loss + self.comupte_color_loss = photo_loss + self.compute_lm_loss = landmark_loss + self.compute_reg_loss = reg_loss + self.compute_reflc_loss = reflectance_loss + + self.optimizer = torch.optim.Adam(self.net_recon.parameters(), lr=opt.lr) + self.optimizers = [self.optimizer] + self.parallel_names += ['net_recog'] + # Our 
program will automatically call to define schedulers, load networks, and print networks + + def set_input(self, input): + """Unpack input data from the dataloader and perform necessary pre-processing steps. + + Parameters: + input: a dictionary that contains the data itself and its metadata information. + """ + self.input_img = input['imgs'].to(self.device) + self.atten_mask = input['msks'].to(self.device) if 'msks' in input else None + self.gt_lm = input['lms'].to(self.device) if 'lms' in input else None + self.trans_m = input['M'].to(self.device) if 'M' in input else None + self.image_paths = input['im_paths'] if 'im_paths' in input else None + + def forward(self): + output_coeff = self.net_recon(self.input_img) + self.facemodel.to(self.device) + self.pred_vertex, self.pred_tex, self.pred_color, self.pred_lm = \ + self.facemodel.compute_for_render(output_coeff) + self.pred_mask, _, self.pred_face = self.renderer( + self.pred_vertex, self.facemodel.face_buf, feat=self.pred_color) + + self.pred_coeffs_dict = self.facemodel.split_coeff(output_coeff) + + + def compute_losses(self): + """Calculate losses, gradients, and update network weights; called in every training iteration""" + + assert self.net_recog.training == False + trans_m = self.trans_m + if not self.opt.use_predef_M: + trans_m = estimate_norm_torch(self.pred_lm, self.input_img.shape[-2]) + + pred_feat = self.net_recog(self.pred_face, trans_m) + gt_feat = self.net_recog(self.input_img, self.trans_m) + self.loss_feat = self.opt.w_feat * self.compute_feat_loss(pred_feat, gt_feat) + + face_mask = self.pred_mask + if self.opt.use_crop_face: + face_mask, _, _ = self.renderer(self.pred_vertex, self.facemodel.front_face_buf) + + face_mask = face_mask.detach() + self.loss_color = self.opt.w_color * self.comupte_color_loss( + self.pred_face, self.input_img, self.atten_mask * face_mask) + + loss_reg, loss_gamma = self.compute_reg_loss(self.pred_coeffs_dict, self.opt) + self.loss_reg = self.opt.w_reg * loss_reg + 
self.loss_gamma = self.opt.w_gamma * loss_gamma + + self.loss_lm = self.opt.w_lm * self.compute_lm_loss(self.pred_lm, self.gt_lm) + + self.loss_reflc = self.opt.w_reflc * self.compute_reflc_loss(self.pred_tex, self.facemodel.skin_mask) + + self.loss_all = self.loss_feat + self.loss_color + self.loss_reg + self.loss_gamma \ + + self.loss_lm + self.loss_reflc + + + def optimize_parameters(self, isTrain=True): + self.forward() + self.compute_losses() + """Update network weights; it will be called in every training iteration.""" + if isTrain: + self.optimizer.zero_grad() + self.loss_all.backward() + self.optimizer.step() + + def compute_visuals(self): + with torch.no_grad(): + input_img_numpy = 255. * self.input_img.detach().cpu().permute(0, 2, 3, 1).numpy() + output_vis = self.pred_face * self.pred_mask + (1 - self.pred_mask) * self.input_img + output_vis_numpy_raw = 255. * output_vis.detach().cpu().permute(0, 2, 3, 1).numpy() + + if self.gt_lm is not None: + gt_lm_numpy = self.gt_lm.cpu().numpy() + pred_lm_numpy = self.pred_lm.detach().cpu().numpy() + output_vis_numpy = util.draw_landmarks(output_vis_numpy_raw, gt_lm_numpy, 'b') + output_vis_numpy = util.draw_landmarks(output_vis_numpy, pred_lm_numpy, 'r') + + output_vis_numpy = np.concatenate((input_img_numpy, + output_vis_numpy_raw, output_vis_numpy), axis=-2) + else: + output_vis_numpy = np.concatenate((input_img_numpy, + output_vis_numpy_raw), axis=-2) + + self.output_vis = torch.tensor( + output_vis_numpy / 255., dtype=torch.float32 + ).permute(0, 3, 1, 2).to(self.device) + + def save_mesh(self, name): + + recon_shape = self.pred_vertex # get reconstructed shape + recon_shape[..., -1] = 10 - recon_shape[..., -1] # from camera space to world space + recon_shape = recon_shape.cpu().numpy()[0] + recon_color = self.pred_color + recon_color = recon_color.cpu().numpy()[0] + tri = self.facemodel.face_buf.cpu().numpy() + mesh = trimesh.Trimesh(vertices=recon_shape, faces=tri, vertex_colors=np.clip(255. 
* recon_color, 0, 255).astype(np.uint8), process=False)
+ mesh.export(name)
+
+ def save_coeff(self,name):
+
+ pred_coeffs = {key:self.pred_coeffs_dict[key].cpu().numpy() for key in self.pred_coeffs_dict}
+ pred_lm = self.pred_lm.cpu().numpy()
+ pred_lm = np.stack([pred_lm[:,:,0],self.input_img.shape[2]-1-pred_lm[:,:,1]],axis=2) # transfer to image coordinate
+ pred_coeffs['lm68'] = pred_lm
+ savemat(name,pred_coeffs)
+
+
+ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/models/losses.py b/dreamtalk/Deep3DFaceRecon_pytorch/models/losses.py new file mode 100644 index 00000000..fbacb63b --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/models/losses.py @@ -0,0 +1,113 @@ +import numpy as np
+import torch
+import torch.nn as nn
+from kornia.geometry import warp_affine
+import torch.nn.functional as F
+
+def resize_n_crop(image, M, dsize=112):
+ # image: (b, c, h, w)
+ # M : (b, 2, 3)
+ return warp_affine(image, M, dsize=(dsize, dsize))
+
+### perceptual level loss
+class PerceptualLoss(nn.Module):
+ def __init__(self, recog_net, input_size=112):
+ super(PerceptualLoss, self).__init__()
+ self.recog_net = recog_net
+ self.preprocess = lambda x: 2 * x - 1
+ self.input_size=input_size
+ def forward(self, imageA, imageB, M):
+ """
+ 1 - cosine distance
+ Parameters:
+ imageA --torch.tensor (B, 3, H, W), range (0, 1) , RGB order
+ imageB --same as imageA
+ """
+
+ imageA = self.preprocess(resize_n_crop(imageA, M, self.input_size))
+ imageB = self.preprocess(resize_n_crop(imageB, M, self.input_size))
+
+ # freeze bn
+ self.recog_net.eval()
+
+ id_featureA = F.normalize(self.recog_net(imageA), dim=-1, p=2)
+ id_featureB = F.normalize(self.recog_net(imageB), dim=-1, p=2)
+ cosine_d = torch.sum(id_featureA * id_featureB, dim=-1)
+ # assert torch.sum((cosine_d > 1).float()) == 0
+ return torch.sum(1 - cosine_d) / cosine_d.shape[0]
+
+def perceptual_loss(id_featureA, id_featureB):
+ cosine_d = torch.sum(id_featureA * id_featureB, dim=-1)
+ # assert torch.sum((cosine_d > 1).float())
== 0
+ return torch.sum(1 - cosine_d) / cosine_d.shape[0]
+
+### image level loss
+def photo_loss(imageA, imageB, mask, eps=1e-6):
+ """
+ l2 norm (with sqrt, to ensure backward stability, use eps, otherwise Nan may occur)
+ Parameters:
+ imageA --torch.tensor (B, 3, H, W), range (0, 1), RGB order
+ imageB --same as imageA
+ """
+ loss = torch.sqrt(eps + torch.sum((imageA - imageB) ** 2, dim=1, keepdims=True)) * mask
+ loss = torch.sum(loss) / torch.max(torch.sum(mask), torch.tensor(1.0).to(mask.device))
+ return loss
+
+def landmark_loss(predict_lm, gt_lm, weight=None):
+ """
+ weighted mse loss
+ Parameters:
+ predict_lm --torch.tensor (B, 68, 2)
+ gt_lm --torch.tensor (B, 68, 2)
+ weight --numpy.array (1, 68)
+ """
+ if weight is None:
+ weight = np.ones([68])
+ weight[28:31] = 20
+ weight[-8:] = 20
+ weight = np.expand_dims(weight, 0)
+ weight = torch.tensor(weight).to(predict_lm.device)
+ loss = torch.sum((predict_lm - gt_lm)**2, dim=-1) * weight
+ loss = torch.sum(loss) / (predict_lm.shape[0] * predict_lm.shape[1])
+ return loss
+
+
+### regularization
+def reg_loss(coeffs_dict, opt=None):
+ """
+ l2 norm without the sqrt, from yu's implementation (mse)
+ tf.nn.l2_loss https://www.tensorflow.org/api_docs/python/tf/nn/l2_loss
+ Parameters:
+ coeffs_dict -- a dict of torch.tensors , keys: id, exp, tex, angle, gamma, trans
+
+ """
+ # coefficient regularization to ensure plausible 3d faces
+ if opt:
+ w_id, w_exp, w_tex = opt.w_id, opt.w_exp, opt.w_tex
+ else:
+ w_id, w_exp, w_tex = 1, 1, 1
+ creg_loss = w_id * torch.sum(coeffs_dict['id'] ** 2) + \
+ w_exp * torch.sum(coeffs_dict['exp'] ** 2) + \
+ w_tex * torch.sum(coeffs_dict['tex'] ** 2)
+ creg_loss = creg_loss / coeffs_dict['id'].shape[0]
+
+ # gamma regularization to ensure a nearly-monochromatic light
+ gamma = coeffs_dict['gamma'].reshape([-1, 3, 9])
+ gamma_mean = torch.mean(gamma, dim=1, keepdims=True)
+ gamma_loss = torch.mean((gamma - gamma_mean) ** 2)
+
+ return creg_loss, gamma_loss
+
+def
reflectance_loss(texture, mask): + """ + minimize texture variance (mse), albedo regularization to ensure an uniform skin albedo + Parameters: + texture --torch.tensor, (B, N, 3) + mask --torch.tensor, (N), 1 or 0 + + """ + mask = mask.reshape([1, mask.shape[0], 1]) + texture_mean = torch.sum(mask * texture, dim=1, keepdims=True) / torch.sum(mask) + loss = torch.sum(((texture - texture_mean) * mask)**2) / (texture.shape[0] * torch.sum(mask)) + return loss + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/models/networks.py b/dreamtalk/Deep3DFaceRecon_pytorch/models/networks.py new file mode 100644 index 00000000..40ce9f99 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/models/networks.py @@ -0,0 +1,521 @@ +"""This script defines deep neural networks for Deep3DFaceRecon_pytorch +""" + +import os +import numpy as np +import torch.nn.functional as F +from torch.nn import init +import functools +from torch.optim import lr_scheduler +import torch +from torch import Tensor +import torch.nn as nn +try: + from torch.hub import load_state_dict_from_url +except ImportError: + from torch.utils.model_zoo import load_url as load_state_dict_from_url +from typing import Type, Any, Callable, Union, List, Optional +from .arcface_torch.backbones import get_model +from kornia.geometry import warp_affine + +def resize_n_crop(image, M, dsize=112): + # image: (b, c, h, w) + # M : (b, 2, 3) + return warp_affine(image, M, dsize=(dsize, dsize)) + +def filter_state_dict(state_dict, remove_name='fc'): + new_state_dict = {} + for key in state_dict: + if remove_name in key: + continue + new_state_dict[key] = state_dict[key] + return new_state_dict + +def get_scheduler(optimizer, opt): + """Return a learning rate scheduler + + Parameters: + optimizer -- the optimizer of the network + opt (option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions.  
+ opt.lr_policy is the name of learning rate policy: linear | step | plateau | cosine
+
+ For other schedulers (step, plateau, and cosine), we use the default PyTorch schedulers.
+ See https://pytorch.org/docs/stable/optim.html for more details.
+ """
+ if opt.lr_policy == 'linear':
+ def lambda_rule(epoch):
+ lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.n_epochs) / float(opt.n_epochs + 1)
+ return lr_l
+ scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
+ elif opt.lr_policy == 'step':
+ scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_epochs, gamma=0.2)
+ elif opt.lr_policy == 'plateau':
+ scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2, threshold=0.01, patience=5)
+ elif opt.lr_policy == 'cosine':
+ scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.n_epochs, eta_min=0)
+ else:
+ raise NotImplementedError('learning rate policy [%s] is not implemented' % opt.lr_policy)
+ return scheduler
+
+
+def define_net_recon(net_recon, use_last_fc=False, init_path=None):
+ return ReconNetWrapper(net_recon, use_last_fc=use_last_fc, init_path=init_path)
+
+def define_net_recog(net_recog, pretrained_path=None):
+ net = RecogNetWrapper(net_recog=net_recog, pretrained_path=pretrained_path)
+ net.eval()
+ return net
+
+class ReconNetWrapper(nn.Module):
+ fc_dim=257
+ def __init__(self, net_recon, use_last_fc=False, init_path=None):
+ super(ReconNetWrapper, self).__init__()
+ self.use_last_fc = use_last_fc
+ if net_recon not in func_dict:
+ raise NotImplementedError('network [%s] is not implemented' % net_recon)
+ func, last_dim = func_dict[net_recon]
+ backbone = func(use_last_fc=use_last_fc, num_classes=self.fc_dim)
+ if init_path and os.path.isfile(init_path):
+ state_dict = filter_state_dict(torch.load(init_path, map_location='cpu'))
+ backbone.load_state_dict(state_dict)
+ print("loading init net_recon %s from %s" %(net_recon, init_path))
+ self.backbone = backbone
+ if not use_last_fc:
+
self.final_layers = nn.ModuleList([ + conv1x1(last_dim, 80, bias=True), # id layer + conv1x1(last_dim, 64, bias=True), # exp layer + conv1x1(last_dim, 80, bias=True), # tex layer + conv1x1(last_dim, 3, bias=True), # angle layer + conv1x1(last_dim, 27, bias=True), # gamma layer + conv1x1(last_dim, 2, bias=True), # tx, ty + conv1x1(last_dim, 1, bias=True) # tz + ]) + for m in self.final_layers: + nn.init.constant_(m.weight, 0.) + nn.init.constant_(m.bias, 0.) + + def forward(self, x): + x = self.backbone(x) + if not self.use_last_fc: + output = [] + for layer in self.final_layers: + output.append(layer(x)) + x = torch.flatten(torch.cat(output, dim=1), 1) + return x + + +class RecogNetWrapper(nn.Module): + def __init__(self, net_recog, pretrained_path=None, input_size=112): + super(RecogNetWrapper, self).__init__() + net = get_model(name=net_recog, fp16=False) + if pretrained_path: + state_dict = torch.load(pretrained_path, map_location='cpu') + net.load_state_dict(state_dict) + print("loading pretrained net_recog %s from %s" %(net_recog, pretrained_path)) + for param in net.parameters(): + param.requires_grad = False + self.net = net + self.preprocess = lambda x: 2 * x - 1 + self.input_size=input_size + + def forward(self, image, M): + image = self.preprocess(resize_n_crop(image, M, self.input_size)) + id_feature = F.normalize(self.net(image), dim=-1, p=2) + return id_feature + + +# adapted from https://github.com/pytorch/vision/edit/master/torchvision/models/resnet.py +__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', + 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', + 'wide_resnet50_2', 'wide_resnet101_2'] + + +model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-f37072fd.pth', + 'resnet34': 'https://download.pytorch.org/models/resnet34-b627a593.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-0676ba61.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-63fe2227.pth', + 'resnet152': 
'https://download.pytorch.org/models/resnet152-394f9c45.pth', + 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth', + 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth', + 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth', + 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth', +} + + +def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + + +def conv1x1(in_planes: int, out_planes: int, stride: int = 1, bias: bool = False) -> nn.Conv2d: + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=bias) + + +class BasicBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = 
self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) + # while original implementation places the stride at the first 1x1 convolution(self.conv1) + # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. + # This variant is also known as ResNet V1.5 and improves accuracy according to + # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. + + expansion: int = 4 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x: Tensor) -> Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__( + self, + block: Type[Union[BasicBlock, Bottleneck]], + layers: 
List[int], + num_classes: int = 1000, + zero_init_residual: bool = False, + use_last_fc: bool = False, + groups: int = 1, + width_per_group: int = 64, + replace_stride_with_dilation: Optional[List[bool]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None + ) -> None: + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.use_last_fc = use_last_fc + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, + dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + if self.use_last_fc: + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + + + # Zero-initialize the last BN in each residual branch, + # so that 
the residual branch starts with zeros, and each residual block behaves like an identity. + # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] + + def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int, + stride: int = 1, dilate: bool = False) -> nn.Sequential: + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def _forward_impl(self, x: Tensor) -> Tensor: + # See note [TorchScript super()] + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + if self.use_last_fc: + x = torch.flatten(x, 1) + x = self.fc(x) + return x + + def forward(self, x: Tensor) -> Tensor: + return self._forward_impl(x) + + +def _resnet( + arch: str, + block: Type[Union[BasicBlock, Bottleneck]], + layers: List[int], + pretrained: bool, + progress: bool, + **kwargs: Any +) -> ResNet: + model = ResNet(block, layers, **kwargs) + if pretrained: + state_dict = 
load_state_dict_from_url(model_urls[arch], + progress=progress) + model.load_state_dict(state_dict) + return model + + +def resnet18(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-18 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, + **kwargs) + + +def resnet34(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-50 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, + **kwargs) + + +def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-101 model from + `"Deep Residual Learning for Image Recognition" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, + **kwargs) + + +def resnet152(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNet-152 model from + `"Deep Residual Learning for Image Recognition" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, + **kwargs) + + +def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNeXt-50 32x4d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 4 + return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""ResNeXt-101 32x8d model from + `"Aggregated Residual Transformation for Deep Neural Networks" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['groups'] = 32 + kwargs['width_per_group'] = 8 + return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) + + +def wide_resnet50_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""Wide ResNet-50-2 model from + `"Wide Residual Networks" `_. 
+ + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], + pretrained, progress, **kwargs) + + +def wide_resnet101_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: + r"""Wide ResNet-101-2 model from + `"Wide Residual Networks" `_. + + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + kwargs['width_per_group'] = 64 * 2 + return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], + pretrained, progress, **kwargs) + + +func_dict = { + 'resnet18': (resnet18, 512), + 'resnet50': (resnet50, 2048) +} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/models/template_model.py b/dreamtalk/Deep3DFaceRecon_pytorch/models/template_model.py new file mode 100644 index 00000000..dac7b33d --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/models/template_model.py @@ -0,0 +1,100 @@ +"""Model class template + +This module provides a template for users to implement custom models. +You can specify '--model template' to use this model. +The class name should be consistent with both the filename and its model option. 
+The filename should be _dataset.py +The class name should be Dataset.py +It implements a simple image-to-image translation baseline based on regression loss. +Given input-output pairs (data_A, data_B), it learns a network netG that can minimize the following L1 loss: + min_ ||netG(data_A) - data_B||_1 +You need to implement the following functions: + : Add model-specific options and rewrite default values for existing options. + <__init__>: Initialize this model class. + : Unpack input data and perform data pre-processing. + : Run forward pass. This will be called by both and . + : Update network weights; it will be called in every training iteration. +""" +import numpy as np +import torch +from .base_model import BaseModel +from . import networks + + +class TemplateModel(BaseModel): + @staticmethod + def modify_commandline_options(parser, is_train=True): + """Add new model-specific options and rewrite default values for existing options. + + Parameters: + parser -- the option parser + is_train -- if it is training phase or test phase. You can use this flag to add training-specific or test-specific options. + + Returns: + the modified parser. + """ + parser.set_defaults(dataset_mode='aligned') # You can rewrite default values for this model. For example, this model usually uses aligned dataset as its dataset. + if is_train: + parser.add_argument('--lambda_regression', type=float, default=1.0, help='weight for the regression loss') # You can define new arguments for this model. + + return parser + + def __init__(self, opt): + """Initialize this model class. + + Parameters: + opt -- training/test options + + A few things can be done here. + - (required) call the initialization function of BaseModel + - define loss function, visualization images, model names, and optimizers + """ + BaseModel.__init__(self, opt) # call the initialization method of BaseModel + # specify the training losses you want to print out. 
The program will call base_model.get_current_losses to plot the losses to the console and save them to the disk. + self.loss_names = ['loss_G'] + # specify the images you want to save and display. The program will call base_model.get_current_visuals to save and display these images. + self.visual_names = ['data_A', 'data_B', 'output'] + # specify the models you want to save to the disk. The program will call base_model.save_networks and base_model.load_networks to save and load networks. + # you can use opt.isTrain to specify different behaviors for training and test. For example, some networks will not be used during test, and you don't need to load them. + self.model_names = ['G'] + # define networks; you can use opt.isTrain to specify different behaviors for training and test. + self.netG = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf, opt.netG, gpu_ids=self.gpu_ids) + if self.isTrain: # only defined during training time + # define your loss functions. You can use losses provided by torch.nn such as torch.nn.L1Loss. + # We also provide a GANLoss class "networks.GANLoss". self.criterionGAN = networks.GANLoss().to(self.device) + self.criterionLoss = torch.nn.L1Loss() + # define and initialize optimizers. You can define one optimizer for each network. + # If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example. + self.optimizer = torch.optim.Adam(self.netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) + self.optimizers = [self.optimizer] + + # Our program will automatically call to define schedulers, load networks, and print networks + + def set_input(self, input): + """Unpack input data from the dataloader and perform necessary pre-processing steps. + + Parameters: + input: a dictionary that contains the data itself and its metadata information. 
+ """ + AtoB = self.opt.direction == 'AtoB' # use to swap data_A and data_B + self.data_A = input['A' if AtoB else 'B'].to(self.device) # get image data A + self.data_B = input['B' if AtoB else 'A'].to(self.device) # get image data B + self.image_paths = input['A_paths' if AtoB else 'B_paths'] # get image paths + + def forward(self): + """Run forward pass. This will be called by both functions and .""" + self.output = self.netG(self.data_A) # generate output image given the input data_A + + def backward(self): + """Calculate losses, gradients, and update network weights; called in every training iteration""" + # caculate the intermediate results if necessary; here self.output has been computed during function + # calculate loss given the input and intermediate results + self.loss_G = self.criterionLoss(self.output, self.data_B) * self.opt.lambda_regression + self.loss_G.backward() # calculate gradients of network G w.r.t. loss_G + + def optimize_parameters(self): + """Update network weights; it will be called in every training iteration.""" + self.forward() # first call forward to calculate intermediate results + self.optimizer.zero_grad() # clear network G's existing gradients + self.backward() # calculate gradients for network G + self.optimizer.step() # update gradients for network G diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/LICENSE.txt b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/LICENSE.txt new file mode 100644 index 00000000..26a070a4 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/LICENSE.txt @@ -0,0 +1,97 @@ +Copyright (c) 2020, NVIDIA Corporation. All rights reserved. + + +Nvidia Source Code License (1-Way Commercial) + +======================================================================= + +1. Definitions + +"Licensor" means any person or entity that distributes its Work. + +"Software" means the original work of authorship made available under +this License. 
+ +"Work" means the Software and any additions to or derivative works of +the Software that are made available under this License. + +The terms "reproduce," "reproduction," "derivative works," and +"distribution" have the meaning as provided under U.S. copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work. + +Works, including the Software, are "made available" under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License. + +2. License Grants + + 2.1 Copyright Grant. Subject to the terms and conditions of this + License, each Licensor grants to you a perpetual, worldwide, + non-exclusive, royalty-free, copyright license to reproduce, + prepare derivative works of, publicly display, publicly perform, + sublicense and distribute its Work and any resulting derivative + works in any form. + +3. Limitations + + 3.1 Redistribution. You may reproduce or distribute the Work only + if (a) you do so under this License, (b) you include a complete + copy of this License with your distribution, and (c) you retain + without modification any copyright, patent, trademark, or + attribution notices that are present in the Work. + + 3.2 Derivative Works. You may specify that additional or different + terms apply to the use, reproduction, and distribution of your + derivative works of the Work ("Your Terms") only if (a) Your Terms + provide that the use limitation in Section 3.3 applies to your + derivative works, and (b) you identify the specific derivative + works that are subject to Your Terms. Notwithstanding Your Terms, + this License (including the redistribution requirements in Section + 3.1) will continue to apply to the Work itself. + + 3.3 Use Limitation. 
The Work and any derivative works thereof only + may be used or intended for use non-commercially. The Work or + derivative works thereof may be used or intended for use by Nvidia + or its affiliates commercially or non-commercially. As used herein, + "non-commercially" means for research or evaluation purposes only + and not for any direct or indirect monetary gain. + + 3.4 Patent Claims. If you bring or threaten to bring a patent claim + against any Licensor (including any claim, cross-claim or + counterclaim in a lawsuit) to enforce any patents that you allege + are infringed by any Work, then your rights under this License from + such Licensor (including the grant in Section 2.1) will terminate + immediately. + + 3.5 Trademarks. This License does not grant any rights to use any + Licensor's or its affiliates' names, logos, or trademarks, except + as necessary to reproduce the notices described in this License. + + 3.6 Termination. If you violate any term of this License, then your + rights under this License (including the grant in Section 2.1) will + terminate immediately. + +4. Disclaimer of Warranty. + +THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE. + +5. Limitation of Liability. 
+ +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. + +======================================================================= diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/README.md b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/README.md new file mode 100644 index 00000000..3eeb4115 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/README.md @@ -0,0 +1,42 @@ +## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering + +![Teaser image](./docs/img/teaser.png) + +**Modular Primitives for High-Performance Differentiable Rendering**
+Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila
+[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277) + +Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering. +Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) ☜☜ for more information. + +## Licenses + +Copyright © 2020–2024, NVIDIA Corporation. All rights reserved. + +This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt). + +For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/) + +We do not currently accept outside code contributions in the form of pull requests. + +Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine +[sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap) +originally shared under +[MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md). +Mesh and texture stored as part of `samples/data/earth.npz` are derived from +[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125) +model originally made available under +[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license). 
+ +## Citation + +``` +@article{Laine2020diffrast, + title = {Modular Primitives for High-Performance Differentiable Rendering}, + author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila}, + journal = {ACM Transactions on Graphics}, + year = {2020}, + volume = {39}, + number = {6} +} +``` diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/__init__.py new file mode 100644 index 00000000..fd28a087 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +__version__ = '0.3.3' diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/antialias.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/antialias.cu new file mode 100644 index 00000000..95cc3bab --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/antialias.cu @@ -0,0 +1,558 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#include "antialias.h" + +//------------------------------------------------------------------------ +// Helpers. + +#define F32_MAX (3.402823466e+38f) +static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; } +static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); } +static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2) +{ + bool g10 = rational_gt(n1, n0, d1, d0); + bool g20 = rational_gt(n2, n0, d2, d0); + bool g21 = rational_gt(n2, n1, d2, d1); + if (g20 && g21) return 2; + if (g10) return 1; + return 0; +} + +//------------------------------------------------------------------------ +// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4. + +struct AAWorkItem +{ + enum + { + EDGE_MASK = 3, // Edge index in lowest bits. + FLAG_DOWN_BIT = 2, // Down instead of right. + FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle. + }; + + int px, py; // Pixel x, y. + unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags. + float alpha; // Antialiasing alpha value. Zero if no AA. +}; + +//------------------------------------------------------------------------ +// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html + +#define JENKINS_MAGIC (0x9e3779b9u) +static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c) +{ + a -= b; a -= c; a ^= (c>>13); + b -= c; b -= a; b ^= (a<<8); + c -= a; c -= b; c ^= (b>>13); + a -= b; a -= c; a ^= (c>>12); + b -= c; b -= a; b ^= (a<<16); + c -= a; c -= b; c ^= (b>>5); + a -= b; a -= c; a ^= (c>>3); + b -= c; b -= a; b ^= (a<<10); + c -= a; c -= b; c ^= (b>>15); +} + +// Helper class for hash index iteration. 
Implements simple odd-skip linear probing with a key-dependent skip. +class HashIndex +{ +public: + __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key) + { + m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824. + m_idx = (uint32_t)(key & 0xffffffffu); + m_skip = (uint32_t)(key >> 32); + uint32_t dummy = JENKINS_MAGIC; + jenkins_mix(m_idx, m_skip, dummy); + m_idx &= m_mask; + m_skip &= m_mask; + m_skip |= 1; + } + __device__ __forceinline__ int get(void) const { return m_idx; } + __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; } +private: + uint32_t m_idx, m_skip, m_mask; +}; + +static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v) +{ + HashIndex idx(p, key); + while(1) + { + uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key); + if (prev == 0 || prev == key) + break; + idx.next(); + } + int* q = (int*)&p.evHash[idx.get()]; + int a = atomicCAS(q+2, 0, v); + if (a != 0 && a != v) + atomicCAS(q+3, 0, v); +} + +static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key) +{ + HashIndex idx(p, key); + while(1) + { + uint4 entry = p.evHash[idx.get()]; + uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32); + if (k == key || k == 0) + return make_int2((int)entry.z, (int)entry.w); + idx.next(); + } +} + +static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn) +{ + if (va == vb) + return; + + uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order + uint64_t v1 = (uint32_t)max(va, vb) + 1; + uint64_t vk = v0 | (v1 << 32); // hash key + hash_insert(p, vk, vn + 1); +} + +static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr) +{ + if (va == vb) + return -1; 
+ + uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order + uint64_t v1 = (uint32_t)max(va, vb) + 1; + uint64_t vk = v0 | (v1 << 32); // hash key + int2 vn = hash_find(p, vk) - 1; + if (vn.x == vr) return vn.y; + if (vn.y == vr) return vn.x; + return -1; +} + +//------------------------------------------------------------------------ +// Mesh analysis kernel. + +__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= p.numTriangles) + return; + + int v0 = p.tri[idx * 3 + 0]; + int v1 = p.tri[idx * 3 + 1]; + int v2 = p.tri[idx * 3 + 2]; + + if (v0 < 0 || v0 >= p.numVertices || + v1 < 0 || v1 >= p.numVertices || + v2 < 0 || v2 >= p.numVertices) + return; + + if (v0 == v1 || v1 == v2 || v2 == v0) + return; + + evhash_insert_vertex(p, v1, v2, v0); + evhash_insert_vertex(p, v2, v0, v1); + evhash_insert_vertex(p, v0, v1, v2); +} + +//------------------------------------------------------------------------ +// Discontinuity finder kernel. + +__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x; + int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.n) + return; + + // Pointer to our TriIdx and fetch. + int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3; + float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other. + + // Look right, clamp at edge. + int pidx1 = pidx0; + if (px < p.width - 1) + pidx1 += 4; + float tri1 = p.rasterOut[pidx1]; + + // Look down, clamp at edge. + int pidx2 = pidx0; + if (py < p.height - 1) + pidx2 += p.width << 2; + float tri2 = p.rasterOut[pidx2]; + + // Determine amount of work. 
+ int count = 0; + if (tri1 != tri0) count = 1; + if (tri2 != tri0) count += 1; + if (!count) + return; // Exit warp. + + // Coalesce work counter update to once per CTA. + __shared__ int s_temp; + s_temp = 0; + __syncthreads(); + int idx = atomicAdd(&s_temp, count); + __syncthreads(); + if (idx == 0) + { + int base = atomicAdd(&p.workBuffer[0].x, s_temp); + s_temp = base + 1; // don't clobber the counters in first slot. + } + __syncthreads(); + idx += s_temp; + + // Write to memory. + if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0); + if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0); +} + +//------------------------------------------------------------------------ +// Forward analysis kernel. + +__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p) +{ + __shared__ int s_base; + int workCount = p.workBuffer[0].x; + for(;;) + { + // Persistent threads work fetcher. + __syncthreads(); + if (threadIdx.x == 0) + s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK); + __syncthreads(); + int thread_idx = s_base + threadIdx.x; + if (thread_idx >= workCount) + return; + + int4* pItem = p.workBuffer + thread_idx + 1; + int4 item = *pItem; + int px = item.x; + int py = item.y; + int pz = (int)(((unsigned int)item.z) >> 16); + int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; + + int pixel0 = px + p.width * (py + p.height * pz); + int pixel1 = pixel0 + (d ? p.width : 1); + float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1]; + float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1]; + int tri0 = float_to_triidx(zt0.y) - 1; + int tri1 = float_to_triidx(zt1.y) - 1; + + // Select triangle based on background / depth. + int tri = (tri0 >= 0) ? tri0 : tri1; + if (tri0 >= 0 && tri1 >= 0) + tri = (zt0.x < zt1.x) ? tri0 : tri1; + if (tri == tri1) + { + // Calculate with respect to neighbor pixel if chose that triangle. 
+ px += 1 - d; + py += d; + } + + // Bail out if triangle index is corrupt. + if (tri < 0 || tri >= p.numTriangles) + continue; + + // Fetch vertex indices. + int vi0 = p.tri[tri * 3 + 0]; + int vi1 = p.tri[tri * 3 + 1]; + int vi2 = p.tri[tri * 3 + 2]; + + // Bail out if vertex indices are corrupt. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + continue; + + // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists. + int op0 = evhash_find_vertex(p, vi2, vi1, vi0); + int op1 = evhash_find_vertex(p, vi0, vi2, vi1); + int op2 = evhash_find_vertex(p, vi1, vi0, vi2); + + // Instance mode: Adjust vertex indices based on minibatch index. + if (p.instance_mode) + { + int vbase = pz * p.numVertices; + vi0 += vbase; + vi1 += vbase; + vi2 += vbase; + if (op0 >= 0) op0 += vbase; + if (op1 >= 0) op1 += vbase; + if (op2 >= 0) op2 += vbase; + } + + // Fetch vertex positions. + float4 p0 = ((float4*)p.pos)[vi0]; + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0]; + float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1]; + float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2]; + + // Project vertices to pixel space. 
+ float w0 = 1.f / p0.w; + float w1 = 1.f / p1.w; + float w2 = 1.f / p2.w; + float ow0 = 1.f / o0.w; + float ow1 = 1.f / o1.w; + float ow2 = 1.f / o2.w; + float fx = (float)px + .5f - p.xh; + float fy = (float)py + .5f - p.yh; + float x0 = p0.x * w0 * p.xh - fx; + float y0 = p0.y * w0 * p.yh - fy; + float x1 = p1.x * w1 * p.xh - fx; + float y1 = p1.y * w1 * p.yh - fy; + float x2 = p2.x * w2 * p.xh - fx; + float y2 = p2.y * w2 * p.yh - fy; + float ox0 = o0.x * ow0 * p.xh - fx; + float oy0 = o0.y * ow0 * p.yh - fy; + float ox1 = o1.x * ow1 * p.xh - fx; + float oy1 = o1.y * ow1 * p.yh - fy; + float ox2 = o2.x * ow2 * p.xh - fx; + float oy2 = o2.y * ow2 * p.yh - fy; + + // Signs to kill non-silhouette edges. + float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself. + float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings. + float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1); + float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2); + + // If no matching signs anywhere, skip the rest. + if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb)) + { + // XY flip for horizontal edges. + if (d) + { + swap(x0, y0); + swap(x1, y1); + swap(x2, y2); + } + + float dx0 = x2 - x1; + float dx1 = x0 - x2; + float dx2 = x1 - x0; + float dy0 = y2 - y1; + float dy1 = y0 - y2; + float dy2 = y1 - y0; + + // Check if an edge crosses between us and the neighbor pixel. + float dc = -F32_MAX; + float ds = (tri == tri0) ? 
1.f : -1.f; + float d0 = ds * (x1*dy0 - y1*dx0); + float d1 = ds * (x2*dy1 - y2*dx1); + float d2 = ds * (x0*dy2 - y0*dx2); + + if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f; + if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f; + if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f; + + int di = max_idx3(d0, d1, d2, dy0, dy1, dy2); + if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0; + if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1; + if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2; + float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy. + + // Adjust output image if a suitable edge was found. + if (dc > -eps && dc < 1.f + eps) + { + dc = fminf(fmaxf(dc, 0.f), 1.f); + float alpha = ds * (.5f - dc); + const float* pColor0 = p.color + pixel0 * p.channels; + const float* pColor1 = p.color + pixel1 * p.channels; + float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels; + for (int i=0; i < p.channels; i++) + atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i])); + + // Rewrite the work item's flags and alpha. Keep original px, py. + unsigned int flags = pz << 16; + flags |= di; + flags |= d << AAWorkItem::FLAG_DOWN_BIT; + flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT; + ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha)); + } + } + } +} + +//------------------------------------------------------------------------ +// Gradient kernel. + +__global__ void AntialiasGradKernel(const AntialiasKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK); + __shared__ int s_base; // Work counter communication across entire CTA. + + int workCount = p.workBuffer[0].x; + + for(;;) + { + // Persistent threads work fetcher. 
+ __syncthreads(); + if (threadIdx.x == 0) + s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK); + __syncthreads(); + int thread_idx = s_base + threadIdx.x; + if (thread_idx >= workCount) + return; + + // Read work item filled out by forward kernel. + int4 item = p.workBuffer[thread_idx + 1]; + unsigned int amask = __ballot_sync(0xffffffffu, item.w); + if (item.w == 0) + continue; // No effect. + + // Unpack work item and replicate setup from forward analysis kernel. + int px = item.x; + int py = item.y; + int pz = (int)(((unsigned int)item.z) >> 16); + int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; + float alpha = __int_as_float(item.w); + int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1; + int di = item.z & AAWorkItem::EDGE_MASK; + float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31)); + int pixel0 = px + p.width * (py + p.height * pz); + int pixel1 = pixel0 + (d ? p.width : 1); + int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1; + if (tri1) + { + px += 1 - d; + py += d; + } + + // Bail out if triangle index is corrupt. + bool triFail = (tri < 0 || tri >= p.numTriangles); + amask = __ballot_sync(amask, !triFail); + if (triFail) + continue; + + // Outgoing color gradients. + float* pGrad0 = p.gradColor + pixel0 * p.channels; + float* pGrad1 = p.gradColor + pixel1 * p.channels; + + // Incoming color gradients. + const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels; + + // Position gradient weight based on colors and incoming gradients. + float dd = 0.f; + const float* pColor0 = p.color + pixel0 * p.channels; + const float* pColor1 = p.color + pixel1 * p.channels; + + // Loop over channels and accumulate. + for (int i=0; i < p.channels; i++) + { + float dy = pDy[i]; + if (dy != 0.f) + { + // Update position gradient weight. + dd += dy * (pColor1[i] - pColor0[i]); + + // Update color gradients. No coalescing because all have different targets. 
+ float v = alpha * dy; + atomicAdd(&pGrad0[i], -v); + atomicAdd(&pGrad1[i], v); + } + } + + // If position weight is zero, skip the rest. + bool noGrad = (dd == 0.f); + amask = __ballot_sync(amask, !noGrad); + if (noGrad) + continue; + + // Fetch vertex indices of the active edge and their positions. + int i1 = (di < 2) ? (di + 1) : 0; + int i2 = (i1 < 2) ? (i1 + 1) : 0; + int vi1 = p.tri[3 * tri + i1]; + int vi2 = p.tri[3 * tri + i2]; + + // Bail out if vertex indices are corrupt. + bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices); + amask = __ballot_sync(amask, !vtxFail); + if (vtxFail) + continue; + + // Instance mode: Adjust vertex indices based on minibatch index. + if (p.instance_mode) + { + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Fetch vertex positions. + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + + // Project vertices to pixel space. + float pxh = p.xh; + float pyh = p.yh; + float fx = (float)px + .5f - pxh; + float fy = (float)py + .5f - pyh; + + // XY flip for horizontal edges. + if (d) + { + swap(p1.x, p1.y); + swap(p2.x, p2.y); + swap(pxh, pyh); + swap(fx, fy); + } + + // Gradient calculation setup. + float w1 = 1.f / p1.w; + float w2 = 1.f / p2.w; + float x1 = p1.x * w1 * pxh - fx; + float y1 = p1.y * w1 * pyh - fy; + float x2 = p2.x * w2 * pxh - fx; + float y2 = p2.y * w2 * pyh - fy; + float dx = x2 - x1; + float dy = y2 - y1; + float db = x1*dy - y1*dx; + + // Compute inverse delta-y with epsilon. + float ep = copysignf(1e-3f, dy); // ~1/1000 pixel. + float iy = 1.f / (dy + ep); + + // Compute position gradients. + float dby = db * iy; + float iw1 = -w1 * iy * dd; + float iw2 = w2 * iy * dd; + float gp1x = iw1 * pxh * y2; + float gp2x = iw2 * pxh * y1; + float gp1y = iw1 * pyh * (dby - x2); + float gp2y = iw2 * pyh * (dby - x1); + float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1; + float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2; + + // XY flip the gradients. 
+ if (d) + { + swap(gp1x, gp1y); + swap(gp2x, gp2y); + } + + // Kill position gradients if alpha was saturated. + if (fabsf(alpha) >= 0.5f) + { + gp1x = gp1y = gp1w = 0.f; + gp2x = gp2y = gp2w = 0.f; + } + + // Initialize coalesced atomics. Match both triangle ID and edge index. + // Also note that some threads may be inactive. + CA_SET_GROUP_MASK(tri ^ (di << 30), amask); + + // Accumulate gradients. + caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w); + caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w); + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/antialias.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/antialias.h new file mode 100644 index 00000000..a324f2f2 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/antialias.h @@ -0,0 +1,50 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "common.h" + +//------------------------------------------------------------------------ +// Constants and helpers. + +#define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32 +#define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8 +#define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256 +#define AA_MESH_KERNEL_THREADS_PER_BLOCK 256 +#define AA_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 4 : 8) // With more than 16777216 triangles (alloc >= 33554432) use smallest possible value of 4 to conserve memory, otherwise use 8 for fewer collisions. 
+#define AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 2 : 3) +#define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct AntialiasKernelParams +{ + const float* color; // Incoming color buffer. + const float* rasterOut; // Incoming rasterizer output buffer. + const int* tri; // Incoming triangle buffer. + const float* pos; // Incoming position buffer. + float* output; // Output buffer of forward kernel. + const float* dy; // Incoming gradients. + float* gradColor; // Output buffer, color gradient. + float* gradPos; // Output buffer, position gradient. + int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters. + uint4* evHash; // Edge-vertex hash. + int allocTriangles; // Number of triangles accommodated by evHash. Always power of two. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int width; // Input width. + int height; // Input height. + int n; // Minibatch size. + int channels; // Channel count in color input. + float xh, yh; // Transfer to pixel space. + int instance_mode; // 0=normal, 1=instance mode. + int tri_const; // 1 if triangle array is known to be constant. +}; + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/common.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/common.cpp new file mode 100644 index 00000000..e566c035 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/common.cpp @@ -0,0 +1,60 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include + +//------------------------------------------------------------------------ +// Block and grid size calculators for kernel launches. + +dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height) +{ + int maxThreads = maxWidth * maxHeight; + if (maxThreads <= 1 || (width * height) <= 1) + return dim3(1, 1, 1); // Degenerate. + + // Start from max size. + int bw = maxWidth; + int bh = maxHeight; + + // Optimizations for weirdly sized buffers. + if (width < bw) + { + // Decrease block width to smallest power of two that covers the buffer width. + while ((bw >> 1) >= width) + bw >>= 1; + + // Maximize height. + bh = maxThreads / bw; + if (bh > height) + bh = height; + } + else if (height < bh) + { + // Halve height and double width until fits completely inside buffer vertically. + while (bh > height) + { + bh >>= 1; + if (bw < width) + bw <<= 1; + } + } + + // Done. + return dim3(bw, bh, 1); +} + +dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth) +{ + dim3 gridSize; + gridSize.x = (width - 1) / blockSize.x + 1; + gridSize.y = (height - 1) / blockSize.y + 1; + gridSize.z = (depth - 1) / blockSize.z + 1; + return gridSize; +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/common.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/common.h new file mode 100644 index 00000000..01ecf9fc --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/common.h @@ -0,0 +1,263 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include +#include + +//------------------------------------------------------------------------ +// C++ helper function prototypes. + +dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height); +dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth); + +//------------------------------------------------------------------------ +// The rest is CUDA device code specific stuff. + +#ifdef __CUDACC__ + +//------------------------------------------------------------------------ +// Helpers for CUDA vector types. + +static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return 
make_float2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); } +static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); } +static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); } +static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); } +static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); } +static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); } +static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); } +static __device__ __forceinline__ float3& operator*= (float3& a, const float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) 
{ return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ float3 operator* (float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, -a.z); } +static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ float4 operator+ (const float4& a, 
const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); } +static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); } +static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); } +static __device__ 
__forceinline__ int2 operator+ (const int2& a, const int2& b) { return make_int2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); } +static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); } +static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); } +static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); } +static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); } +static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); } +static __device__ __forceinline__ int2 operator- (const int2& a) { return make_int2(-a.x, -a.y); } +static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); 
} +static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ int3 operator- (const int3& a) { return make_int3(-a.x, -a.y, -a.z); } +static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return 
make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return make_int4(a - b.x, a - b.y, a - b.z, a - b.w); } +static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); } +static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ uint2 operator+ (const uint2& a, 
const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); } +static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); } +static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); } +static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); } +static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); } +static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); } +static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); } +static 
__device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) 
{ return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); } + +template static __device__ __forceinline__ T zero_value(void); +template<> __device__ __forceinline__ float zero_value (void) { return 0.f; } +template<> __device__ __forceinline__ float2 zero_value(void) { return make_float2(0.f, 0.f); } +template<> __device__ __forceinline__ float4 zero_value(void) { return make_float4(0.f, 0.f, 0.f, 0.f); } +static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); } +static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); } +static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); } +static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, 
a.y, a.z, b); } +static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); } +static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); } +static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); } + +template static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; } + +//------------------------------------------------------------------------ +// Triangle ID <-> float32 conversion functions to support very large triangle IDs. +// +// Values up to and including 16777216 (also, negative values) are converted trivially and retain +// compatibility with previous versions. Larger values are mapped to unique float32 that are not equal to +// the ID. The largest value that converts to float32 and back without generating inf or nan is 889192447. + +static __device__ __forceinline__ int float_to_triidx(float x) { if (x <= 16777216.f) return (int)x; return __float_as_int(x) - 0x4a800000; } +static __device__ __forceinline__ float triidx_to_float(int x) { if (x <= 0x01000000) return (float)x; return __int_as_float(0x4a800000 + x); } + +//------------------------------------------------------------------------ +// Coalesced atomics. These are all done via macros. 
+ +#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher + +#define CA_TEMP _ca_temp +#define CA_TEMP_PARAM float* CA_TEMP +#define CA_DECLARE_TEMP(threads_per_block) \ + __shared__ float CA_TEMP[(threads_per_block)] + +#define CA_SET_GROUP_MASK(group, thread_mask) \ + bool _ca_leader; \ + float* _ca_ptr; \ + do { \ + int tidx = threadIdx.x + blockDim.x * threadIdx.y; \ + int lane = tidx & 31; \ + int warp = tidx >> 5; \ + int tmask = __match_any_sync((thread_mask), (group)); \ + int leader = __ffs(tmask) - 1; \ + _ca_leader = (leader == lane); \ + _ca_ptr = &_ca_temp[((warp << 5) + leader)]; \ + } while(0) + +#define CA_SET_GROUP(group) \ + CA_SET_GROUP_MASK((group), 0xffffffffu) + +#define caAtomicAdd(ptr, value) \ + do { \ + if (_ca_leader) \ + *_ca_ptr = 0.f; \ + atomicAdd(_ca_ptr, (value)); \ + if (_ca_leader) \ + atomicAdd((ptr), *_ca_ptr); \ + } while(0) + +#define caAtomicAdd3_xyw(ptr, x, y, w) \ + do { \ + caAtomicAdd((ptr), (x)); \ + caAtomicAdd((ptr)+1, (y)); \ + caAtomicAdd((ptr)+3, (w)); \ + } while(0) + +#define caAtomicAddTexture(ptr, level, idx, value) \ + do { \ + CA_SET_GROUP((idx) ^ ((level) << 27)); \ + caAtomicAdd((ptr)+(idx), (value)); \ + } while(0) + +//------------------------------------------------------------------------ +// Disable atomic coalescing for compute capability lower than 7.x + +#else // __CUDA_ARCH__ >= 700 +#define CA_TEMP _ca_temp +#define CA_TEMP_PARAM float CA_TEMP +#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM +#define CA_SET_GROUP_MASK(group, thread_mask) +#define CA_SET_GROUP(group) +#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value)) +#define caAtomicAdd3_xyw(ptr, x, y, w) \ + do { \ + atomicAdd((ptr), (x)); \ + atomicAdd((ptr)+1, (y)); \ + atomicAdd((ptr)+3, (w)); \ + } while(0) +#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value)) +#endif // __CUDA_ARCH__ >= 700 + 
+//------------------------------------------------------------------------ +#endif // __CUDACC__ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/CudaRaster.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/CudaRaster.hpp new file mode 100644 index 00000000..3c1c3a7f --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/CudaRaster.hpp @@ -0,0 +1,63 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// This is a slimmed-down and modernized version of the original +// CudaRaster codebase that accompanied the HPG 2011 paper +// "High-Performance Software Rasterization on GPUs" by Laine and Karras. +// Modifications have been made to accommodate post-Volta execution model +// with warp divergence. Support for shading, blending, quad rendering, +// and supersampling have been removed as unnecessary for nvdiffrast. +//------------------------------------------------------------------------ + +namespace CR +{ + +class RasterImpl; + +//------------------------------------------------------------------------ +// Interface class to isolate user from implementation details. +//------------------------------------------------------------------------ + +class CudaRaster +{ +public: + enum + { + RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling. + RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. 
Must have a peel buffer set. + }; + +public: + CudaRaster (void); + ~CudaRaster (void); + + void setBufferSize (int width, int height, int numImages); // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes. + void setViewport (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup. + void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero. + void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles(). + void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w). + void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color). + bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow. + void* getColorBuffer (void); // GPU pointer managed by CudaRaster. + void* getDepthBuffer (void); // GPU pointer managed by CudaRaster. + void swapDepthAndPeel (void); // Swap depth and peeling buffers. + +private: + CudaRaster (const CudaRaster&); // forbidden + CudaRaster& operator= (const CudaRaster&); // forbidden + +private: + RasterImpl* m_impl; // Opaque pointer to implementation. 
+}; + +//------------------------------------------------------------------------ +} // namespace CR + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/BinRaster.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/BinRaster.inl new file mode 100644 index 00000000..deae9d2c --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/BinRaster.inl @@ -0,0 +1,423 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ + +__device__ __inline__ void binRasterImpl(const CRParams p) +{ + __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16]; + __shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR]; + __shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR]; + __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR]; + __shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions + __shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions + __shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer + __shared__ volatile U32 s_batchPos; + __shared__ volatile U32 s_bufCount; + __shared__ volatile U32 s_overTotal; + __shared__ volatile U32 s_allocBase; + + const CRImageParams& ip = getImageParams(p, blockIdx.z); + CRAtomics& atomics = p.atomics[blockIdx.z]; + const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z; + const CRTriangleHeader* triHeader = (const 
CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z; + + S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z; + S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z; + S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z; + S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z; + S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z; + + if (atomics.numSubtris > p.maxSubtris) + return; + + // per-thread state + int thrInBlock = threadIdx.x + threadIdx.y * 32; + int batchPos = 0; + + // first 16 elements of s_broadcast are always zero + if (thrInBlock < 16) + s_broadcast[thrInBlock] = 0; + + // initialize output linked lists and offsets + if (thrInBlock < p.numBins) + { + binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1; + s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE; + s_outTotal[thrInBlock] = 0; + } + + // repeat until done + for(;;) + { + // get batch + if (thrInBlock == 0) + s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize); + __syncthreads(); + batchPos = s_batchPos; + + // all batches done? 
+ if (batchPos >= ip.triCount) + break; + + // per-thread state + int bufIndex = 0; + int bufCount = 0; + int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount); + + // loop over batch as long as we have triangles in it + do + { + // read more triangles + while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd) + { + // get subtriangle count + + int triIdx = batchPos + thrInBlock; + int num = 0; + if (triIdx < batchEnd) + num = triSubtris[triIdx]; + + // cumulative sum of subtriangles within each warp + U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt()); + if (__any_sync(~0u, num > 1)) + { + myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2; + myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4; + } + if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write. + s_broadcast[threadIdx.y + 16] = myIdx + num; + __syncthreads(); + + // cumulative sum of per-warp subtriangle counts + // Note: cannot have more than 32 warps or this needs to sync between each step. 
+ bool act = (thrInBlock < CR_BIN_WARPS); + U32 actMask = __ballot_sync(~0u, act); + if (threadIdx.y == 0 && act) + { + volatile U32* ptr = &s_broadcast[thrInBlock + 16]; + U32 val = *ptr; + #if (CR_BIN_WARPS > 1) + val += ptr[-1]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + #if (CR_BIN_WARPS > 2) + val += ptr[-2]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + #if (CR_BIN_WARPS > 4) + val += ptr[-4]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + #if (CR_BIN_WARPS > 8) + val += ptr[-8]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + #if (CR_BIN_WARPS > 16) + val += ptr[-16]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + + // initially assume that we consume everything + // only last active thread does the writes + if (threadIdx.x == CR_BIN_WARPS - 1) + { + s_batchPos = batchPos + CR_BIN_WARPS * 32; + s_bufCount = bufCount + val; + } + } + __syncthreads(); + + // skip if no subtriangles + if (num) + { + // calculate write position for first subtriangle + U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1]; + + // only write if entire triangle fits + if (pos + num <= CR_ARRAY_SIZE(s_triBuf)) + { + pos += bufIndex; // adjust for current start position + pos &= CR_ARRAY_SIZE(s_triBuf)-1; + if (num == 1) + s_triBuf[pos] = triIdx * 8 + 7; // single triangle + else + { + for (int i=0; i < num; i++) + { + s_triBuf[pos] = triIdx * 8 + i; + pos++; + pos &= CR_ARRAY_SIZE(s_triBuf)-1; + } + } + } else if (pos <= CR_ARRAY_SIZE(s_triBuf)) + { + // this triangle is the first that failed, overwrite total count and triangle count + s_batchPos = batchPos + thrInBlock; + s_bufCount = pos; + } + } + + // update triangle counts + __syncthreads(); + batchPos = s_batchPos; + bufCount = s_bufCount; + } + + // make every warp clear its output buffers + for (int i=threadIdx.x; i < p.numBins; i += 32) + s_outMask[threadIdx.y][i] = 0; + __syncwarp(); + + // choose our triangle + 
uint4 triData = make_uint4(0, 0, 0, 0); + if (thrInBlock < bufCount) + { + U32 triPos = bufIndex + thrInBlock; + triPos &= CR_ARRAY_SIZE(s_triBuf)-1; + + // find triangle + int triIdx = s_triBuf[triPos]; + int dataIdx = triIdx >> 3; + int subtriIdx = triIdx & 7; + if (subtriIdx != 7) + dataIdx = triHeader[dataIdx].misc + subtriIdx; + + // read triangle + + triData = *(((const uint4*)triHeader) + dataIdx); + } + + // setup bounding box and edge functions, and rasterize + S32 lox, loy, hix, hiy; + bool hasTri = (thrInBlock < bufCount); + U32 hasTriMask = __ballot_sync(~0u, hasTri); + if (hasTri) + { + S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1)); + S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1)); + S32 d01x = sub_s16lo_s16lo(triData.y, triData.x); + S32 d01y = sub_s16hi_s16hi(triData.y, triData.x); + S32 d02x = sub_s16lo_s16lo(triData.z, triData.x); + S32 d02y = sub_s16hi_s16hi(triData.z, triData.x); + int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2; + lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1); + loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1); + hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1); + hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1); + + U32 bit = 1 << threadIdx.x; +#if __CUDA_ARCH__ >= 700 + bool multi = (hix != lox || hiy != loy); + if (!__any_sync(hasTriMask, multi)) + { + int binIdx = lox + p.widthBins * loy; + U32 mask = __match_any_sync(hasTriMask, binIdx); + s_outMask[threadIdx.y][binIdx] = mask; + __syncwarp(hasTriMask); + } else +#endif + { + bool complex = (hix > lox+1 || hiy > loy+1); + if (!__any_sync(hasTriMask, complex)) + { + int binIdx = lox + p.widthBins * loy; + atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit); + if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit); + if (hiy > loy) 
atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit); + if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit); + } else + { + S32 d12x = d02x - d01x, d12y = d02y - d01y; + v0x -= lox << binLog, v0y -= loy << binLog; + + S32 t01 = v0x * d01y - v0y * d01x; + S32 t02 = v0y * d02x - v0x * d02y; + S32 t12 = d01x * d12y - d01y * d12x - t01 - t02; + S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0)); + S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0)); + S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0)); + + int width = hix - lox + 1; + d01x += width * d01y; + d02x += width * d02y; + d12x += width * d12y; + + U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins]; + U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins]; + U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins]; + int stride = p.widthBins * 4; + int ptrYInc = stride - width * 4; + + do + { + if (b01 >= 0 && b02 >= 0 && b12 >= 0) + atomicOr((U32*)currPtr, bit); + currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y; + if (currPtr == skipPtr) + currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride; + } + while (currPtr != endPtr); + } + } + } + + // count per-bin contributions + if (thrInBlock == 0) + s_overTotal = 0; // overflow counter + + // ensure that out masks are done + __syncthreads(); + + int overIndex = -1; + bool act = (thrInBlock < p.numBins); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + U8* srcPtr = (U8*)&s_outMask[0][thrInBlock]; + U8* dstPtr = (U8*)&s_outCount[0][thrInBlock]; + int total = 0; + for (int i = 0; i < CR_BIN_WARPS; i++) + { + total += __popc(*(U32*)srcPtr); + *(U32*)dstPtr = total; + srcPtr += (CR_MAXBINS_SQR + 1) * 4; + dstPtr += (CR_MAXBINS_SQR + 1) * 4; + } + + // overflow => request a new segment + int ofs = s_outOfs[thrInBlock]; + bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> 
CR_BIN_SEG_LOG2)); + U32 ovrMask = __ballot_sync(actMask, ovr); + if (ovr) + { + overIndex = __popc(ovrMask & getLaneMaskLt()); + if (overIndex == 0) + s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask)); + __syncwarp(ovrMask); + overIndex += s_broadcast[threadIdx.y + 16]; + s_overIndex[thrInBlock] = overIndex; + } + } + + // sync after overTotal is ready + __syncthreads(); + + // at least one segment overflowed => allocate segments + U32 overTotal = s_overTotal; + U32 allocBase = 0; + if (overTotal > 0) + { + // allocate memory + if (thrInBlock == 0) + { + U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal); + s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0; + } + __syncthreads(); + allocBase = s_allocBase; + + // did my bin overflow? + if (overIndex != -1) + { + // calculate new segment index + int segIdx = allocBase + overIndex; + + // add to linked list + if (s_outOfs[thrInBlock] < 0) + binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx; + else + binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx; + + // defaults + binSegNext [segIdx] = -1; + binSegCount[segIdx] = CR_BIN_SEG_SIZE; + } + } + + // concurrent emission -- each warp handles its own triangle + if (thrInBlock < bufCount) + { + int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1); + int currBin = lox + loy * p.widthBins; + int skipBin = (hix + 1) + loy * p.widthBins; + int endBin = lox + (hiy + 1) * p.widthBins; + int binYInc = p.widthBins - (hix - lox + 1); + + // loop over triangle's bins + do + { + U32 outMask = s_outMask[threadIdx.y][currBin]; + if (outMask & (1< 0) + idx += s_outCount[threadIdx.y-1][currBin]; + + int base = s_outOfs[currBin]; + int free = (-base) & (CR_BIN_SEG_SIZE - 1); + if (idx >= free) + idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free; + else + idx += base; + + binSegData[idx] = s_triBuf[triPos]; + } + + currBin++; + if (currBin == skipBin) + currBin 
+= binYInc, skipBin += p.widthBins; + } + while (currBin != endBin); + } + + // wait all triangles to finish, then replace overflown segment offsets + __syncthreads(); + if (thrInBlock < p.numBins) + { + U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock]; + U32 oldOfs = s_outOfs[thrInBlock]; + if (overIndex == -1) + s_outOfs[thrInBlock] = oldOfs + total; + else + { + int addr = oldOfs + total; + addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1; + addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2; + s_outOfs[thrInBlock] = addr; + } + s_outTotal[thrInBlock] += total; + } + + // these triangles are now done + int count = ::min(bufCount, CR_BIN_WARPS * 32); + bufCount -= count; + bufIndex += count; + bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1; + } + while (bufCount > 0 || batchPos < batchEnd); + + // flush all bins + if (thrInBlock < p.numBins) + { + int ofs = s_outOfs[thrInBlock]; + if (ofs & (CR_BIN_SEG_SIZE-1)) + { + int seg = ofs >> CR_BIN_SEG_LOG2; + binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1); + s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE; + } + } + } + + // output totals + if (thrInBlock < p.numBins) + binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock]; +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Buffer.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Buffer.cpp new file mode 100644 index 00000000..b2cd7b92 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Buffer.cpp @@ -0,0 +1,94 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "../../framework.h" +#include "Buffer.hpp" + +using namespace CR; + +//------------------------------------------------------------------------ +// GPU buffer. +//------------------------------------------------------------------------ + +Buffer::Buffer(void) +: m_gpuPtr(NULL), + m_bytes (0) +{ + // empty +} + +Buffer::~Buffer(void) +{ + if (m_gpuPtr) + cudaFree(m_gpuPtr); // Don't throw an exception. +} + +void Buffer::reset(size_t bytes) +{ + if (bytes == m_bytes) + return; + + if (m_gpuPtr) + { + NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr)); + m_gpuPtr = NULL; + } + + if (bytes > 0) + NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes)); + + m_bytes = bytes; +} + +void Buffer::grow(size_t bytes) +{ + if (bytes > m_bytes) + reset(bytes); +} + +//------------------------------------------------------------------------ +// Host buffer with page-locked memory. +//------------------------------------------------------------------------ + +HostBuffer::HostBuffer(void) +: m_hostPtr(NULL), + m_bytes (0) +{ + // empty +} + +HostBuffer::~HostBuffer(void) +{ + if (m_hostPtr) + cudaFreeHost(m_hostPtr); // Don't throw an exception. 
+} + +void HostBuffer::reset(size_t bytes) +{ + if (bytes == m_bytes) + return; + + if (m_hostPtr) + { + NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr)); + m_hostPtr = NULL; + } + + if (bytes > 0) + NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes)); + + m_bytes = bytes; +} + +void HostBuffer::grow(size_t bytes) +{ + if (bytes > m_bytes) + reset(bytes); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Buffer.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Buffer.hpp new file mode 100644 index 00000000..8a4b38fd --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Buffer.hpp @@ -0,0 +1,55 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#pragma once +#include "Defs.hpp" + +namespace CR +{ +//------------------------------------------------------------------------ + +class Buffer +{ +public: + Buffer (void); + ~Buffer (void); + + void reset (size_t bytes); + void grow (size_t bytes); + void* getPtr (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); } + size_t getSize (void) const { return m_bytes; } + + void setPtr (void* ptr) { m_gpuPtr = ptr; } + +private: + void* m_gpuPtr; + size_t m_bytes; +}; + +//------------------------------------------------------------------------ + +class HostBuffer +{ +public: + HostBuffer (void); + ~HostBuffer (void); + + void reset (size_t bytes); + void grow (size_t bytes); + void* getPtr (void) { return m_hostPtr; } + size_t getSize (void) const { return m_bytes; } + + void setPtr (void* ptr) { m_hostPtr = ptr; } + +private: + void* m_hostPtr; + size_t m_bytes; +}; + +//------------------------------------------------------------------------ +} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl new file mode 100644 index 00000000..a7081c7e --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl @@ -0,0 +1,730 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +//------------------------------------------------------------------------ + +__device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles) +{ + int tileX = tileInBin & (CR_BIN_SIZE - 1); + int tileY = tileInBin >> CR_BIN_LOG2; + return tileX + tileY * widthTiles; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void coarseRasterImpl(const CRParams p) +{ + // Common. + + __shared__ volatile U32 s_workCounter; + __shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB + + // Input. + + __shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB + __shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB + __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB + __shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB + __shared__ volatile S32 s_triQueueWritePos; + __shared__ volatile U32 s_binStreamSelectedOfs; + __shared__ volatile U32 s_binStreamSelectedSize; + + // Output. + + __shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions + __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions + __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning + __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning + __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB + __shared__ volatile U32 s_firstAllocSeg; + __shared__ volatile U32 s_firstActiveIdx; + + // Pointers and constants. 
+ + CRAtomics& atomics = p.atomics[blockIdx.z]; + const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z; + const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z; + const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z; + const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z; + const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z; + const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z; + S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z; + S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z; + S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z; + S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z; + S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z; + + int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2; + int thrInBlock = threadIdx.x + threadIdx.y * 32; + int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles. + + if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs) + return; + + // Initialize sharedmem arrays. + + if (thrInBlock == 0) + { + s_tileEmitPrefixSum[0] = 0; + s_tileAllocPrefixSum[0] = 0; + } + s_scanTemp[threadIdx.y][threadIdx.x] = 0; + + // Sort bins in descending order of triangle count. + + for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32) + { + int count = 0; + for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++) + count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i]; + s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx; + } + + __syncthreads(); + sortShared(s_binOrder, p.numBins); + + // Process each bin by one block. + + for (;;) + { + // Pick a bin for the block. 
+ + if (thrInBlock == 0) + s_workCounter = atomicAdd(&atomics.coarseCounter, 1); + __syncthreads(); + + int workCounter = s_workCounter; + if (workCounter >= p.numBins) + break; + + U32 binOrder = s_binOrder[workCounter]; + bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0); + if (binEmpty && !p.deferredClear) + break; + + int binIdx = binOrder & (CR_MAXBINS_SQR - 1); + + // Initialize input/output streams. + + int triQueueWritePos = 0; + int triQueueReadPos = 0; + + if (thrInBlock < CR_BIN_STREAMS_SIZE) + { + int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock]; + s_binStreamCurrSeg[thrInBlock] = segIdx; + s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2]; + } + + for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32) + s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE; + + // Initialize per-bin state. + + int binY = idiv_fast(binIdx, p.widthBins); + int binX = binIdx - binY * p.widthBins; + int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)); + int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)); + int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1; + int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1; + int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2; + + // Entire block: Merge input streams and process triangles. + + if (!binEmpty) + do + { + //------------------------------------------------------------------------ + // Merge. + //------------------------------------------------------------------------ + + // Entire block: Not enough triangles => merge and queue segments. + // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need. 
+ + while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32) + { + // First warp: Choose the segment with the lowest initial triangle index. + + bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE); + U32 hasStreamMask = __ballot_sync(~0u, hasStream); + if (hasStream) + { + // Find the stream with the lowest triangle index. + + U32 firstTri = s_binStreamFirstTri[thrInBlock]; + U32 t = firstTri; + volatile U32* v = &s_scanTemp[0][thrInBlock + 16]; + + #if (CR_BIN_STREAMS_SIZE > 1) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask); + #endif + #if (CR_BIN_STREAMS_SIZE > 2) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask); + #endif + #if (CR_BIN_STREAMS_SIZE > 4) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask); + #endif + #if (CR_BIN_STREAMS_SIZE > 8) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask); + #endif + #if (CR_BIN_STREAMS_SIZE > 16) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask); + #endif + v[0] = t; __syncwarp(hasStreamMask); + + // Consume and broadcast. + + bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri); + U32 firstMask = __ballot_sync(hasStreamMask, first); + if (first && (firstMask >> threadIdx.x) == 1u) + { + int segIdx = s_binStreamCurrSeg[thrInBlock]; + s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2; + if (segIdx != -1) + { + int segSize = binSegCount[segIdx]; + int segNext = binSegNext[segIdx]; + s_binStreamSelectedSize = segSize; + s_triQueueWritePos = triQueueWritePos + segSize; + s_binStreamCurrSeg[thrInBlock] = segNext; + s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2]; + } + } + } + + // No more segments => break. 
+ + __syncthreads(); + triQueueWritePos = s_triQueueWritePos; + int segOfs = s_binStreamSelectedOfs; + if (segOfs < 0) + break; + + int segSize = s_binStreamSelectedSize; + __syncthreads(); + + // Fetch triangles into the queue. + + for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32) + { + S32 triIdx = binSegData[segOfs + idxInSeg]; + s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx; + } + } + + // All threads: Clear emit masks. + + for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32) + s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0; + + __syncthreads(); + + //------------------------------------------------------------------------ + // Raster. + //------------------------------------------------------------------------ + + // Triangle per thread: Read from the queue. + + int triIdx = -1; + if (triQueueReadPos + thrInBlock < triQueueWritePos) + triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)]; + + uint4 triData = make_uint4(0, 0, 0, 0); + if (triIdx != -1) + { + int dataIdx = triIdx >> 3; + int subtriIdx = triIdx & 7; + if (subtriIdx != 7) + dataIdx = triHeader[dataIdx].misc + subtriIdx; + triData = *((uint4*)triHeader + dataIdx); + } + + // 32 triangles per warp: Record emits (= tile intersections). + + if (__any_sync(~0u, triIdx != -1)) + { + S32 v0x = sub_s16lo_s16lo(triData.x, originX); + S32 v0y = sub_s16hi_s16lo(triData.x, originY); + S32 d01x = sub_s16lo_s16lo(triData.y, triData.x); + S32 d01y = sub_s16hi_s16hi(triData.y, triData.x); + S32 d02x = sub_s16lo_s16lo(triData.z, triData.x); + S32 d02y = sub_s16hi_s16hi(triData.z, triData.x); + + // Compute tile-based AABB. 
+ + int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin); + int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin); + int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin); + int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin); + int sizex = add_sub(hix, 1, lox); + int sizey = add_sub(hiy, 1, loy); + int area = sizex * sizey; + + // Miscellaneous init. + + U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)]; + int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2); + U32 maskBit = 1 << threadIdx.x; + + // Case A: All AABBs are small => record the full AABB using atomics. + + if (__all_sync(~0u, sizex <= 2 && sizey <= 2)) + { + if (triIdx != -1) + { + atomicOr((U32*)currPtr, maskBit); + if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit); + if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit); + if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit); + } + } + else + { + // Compute warp-AABB (scan-32). 
+ + U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy); + if (triIdx == -1) + aabbMask = 0; + + volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16]; + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47]; + + U32 maskX = aabbMask & 0xFFFF; + U32 maskY = aabbMask >> 16; + int wlox = findLeadingOne(maskX ^ (maskX - 1)); + int wloy = findLeadingOne(maskY ^ (maskY - 1)); + int whix = findLeadingOne(maskX); + int whiy = findLeadingOne(maskY); + int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy)); + + // Initialize edge functions. + + S32 d12x = d02x - d01x; + S32 d12y = d02y - d01y; + v0x -= lox << tileLog; + v0y -= loy << tileLog; + + S32 t01 = v0x * d01y - v0y * d01x; + S32 t02 = v0y * d02x - v0x * d02y; + S32 t12 = d01x * d12y - d01y * d12x - t01 - t02; + S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0)); + S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0)); + S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0)); + + d01x += sizex * d01y; + d02x += sizex * d02y; + d12x += sizex * d12y; + + // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots. + if (__any_sync(~0u, warea * 4 <= area * 8)) + { + // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking. 
+ bool act = (triIdx != -1); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + for (int y = wloy; y <= whiy; y++) + { + bool yIn = (y >= loy && y <= hiy); + U32 yMask = __ballot_sync(actMask, yIn); + if (yIn) + { + for (int x = wlox; x <= whix; x++) + { + bool xyIn = (x >= lox && x <= hix); + U32 xyMask = __ballot_sync(yMask, xyIn); + if (xyIn) + { + U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0); + if (threadIdx.x == 31 - __clz(xyMask)) + *(U32*)currPtr = res; + currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y; + } + } + currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x; + } + } + } + } + + // Case C: General case => Check tiles in AABB, record using atomics. + + else + { + if (triIdx != -1) + { + U8* skipPtr = currPtr + (sizex << 2); + U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2)); + do + { + if (b01 >= 0 && b02 >= 0 && b12 >= 0) + atomicOr((U32*)currPtr, maskBit); + currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y; + if (currPtr == skipPtr) + currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4; + } + while (currPtr != endPtr); + } + } + } + } + + __syncthreads(); + + //------------------------------------------------------------------------ + // Count. + //------------------------------------------------------------------------ + + // Tile per thread: Initialize prefix sums. + + for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32) + { + int tileInBin = tileInBin_base + thrInBlock; + bool act = (tileInBin < CR_BIN_SQR); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + // Compute prefix sum of emits over warps. 
+ + U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin]; + U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin]; + int tileEmits = 0; + for (int i = 0; i < CR_COARSE_WARPS; i++) + { + tileEmits += __popc(*(U32*)srcPtr); + *(U32*)dstPtr = tileEmits; + srcPtr += (CR_BIN_SQR + 1) * 4; + dstPtr += (CR_BIN_SQR + 1) * 4; + } + + // Determine the number of segments to allocate. + + int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1); + int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2; + volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1]; + + // All counters within the warp are small => compute prefix sum using ballot. + + if (!__any_sync(actMask, tileEmits >= 2)) + { + U32 m = getLaneMaskLe(); + *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m); + } + + // Otherwise => scan-32 within the warp. + + else + { + U32 sum = (tileEmits << emitShift) | tileAllocs; + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); + } + } + } + + // First warp: Scan-8. 
+ + __syncthreads(); + + bool scan8 = (thrInBlock < CR_BIN_SQR / 32); + U32 scan8Mask = __ballot_sync(~0u, scan8); + if (scan8) + { + int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32]; + volatile U32* v = &s_scanTemp[0][thrInBlock + 16]; + v[0] = sum; __syncwarp(scan8Mask); + #if (CR_BIN_SQR > 1 * 32) + sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + #if (CR_BIN_SQR > 2 * 32) + sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + #if (CR_BIN_SQR > 4 * 32) + sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + } + + __syncthreads(); + + // Tile per thread: Finalize prefix sums. + // Single thread: Allocate segments. + + for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32) + { + int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15]; + int numEmits = sum >> emitShift; + int numAllocs = sum & ((1 << emitShift) - 1); + s_tileEmitPrefixSum[tileInBin + 1] = numEmits; + s_tileAllocPrefixSum[tileInBin + 1] = numAllocs; + + if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0) + { + int t = atomicAdd(&atomics.numTileSegs, numAllocs); + s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0; + } + } + + __syncthreads(); + int firstAllocSeg = s_firstAllocSeg; + int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR]; + int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR]; + + //------------------------------------------------------------------------ + // Emit. + //------------------------------------------------------------------------ + + // Emit per thread: Write triangle index to globalmem. + + for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32) + { + // Find tile in bin. 
+ + U8* tileBase = (U8*)&s_tileEmitPrefixSum[0]; + U8* tilePtr = tileBase; + U8* ptr; + + #if (CR_BIN_SQR > 128) + ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 64) + ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 32) + ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 16) + ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 8) + ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 4) + ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 2) + ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 1) + ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + + int tileInBin = (tilePtr - tileBase) >> 2; + int emitInTile = emitInBin - *(U32*)tilePtr; + + // Find warp in tile. + + int warpStep = (CR_BIN_SQR + 1) * 4; + U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep; + U8* warpPtr = warpBase; + + #if (CR_COARSE_WARPS > 8) + ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr; + #endif + #if (CR_COARSE_WARPS > 4) + ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr; + #endif + #if (CR_COARSE_WARPS > 2) + ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr; + #endif + #if (CR_COARSE_WARPS > 1) + ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr; + #endif + + int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2); + U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum)); + int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask); + + // Find thread in warp. 
+ + int threadInWarp = 0; + int pop = __popc(emitMask & 0xFFFF); + bool pred = (emitInWarp >= pop); + if (pred) emitInWarp -= pop; + if (pred) emitMask >>= 0x10; + if (pred) threadInWarp += 0x10; + + pop = __popc(emitMask & 0xFF); + pred = (emitInWarp >= pop); + if (pred) emitInWarp -= pop; + if (pred) emitMask >>= 0x08; + if (pred) threadInWarp += 0x08; + + pop = __popc(emitMask & 0xF); + pred = (emitInWarp >= pop); + if (pred) emitInWarp -= pop; + if (pred) emitMask >>= 0x04; + if (pred) threadInWarp += 0x04; + + pop = __popc(emitMask & 0x3); + pred = (emitInWarp >= pop); + if (pred) emitInWarp -= pop; + if (pred) emitMask >>= 0x02; + if (pred) threadInWarp += 0x02; + + if (emitInWarp >= (emitMask & 1)) + threadInWarp++; + + // Figure out where to write. + + int currOfs = s_tileStreamCurrOfs[tileInBin]; + int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1); + int outOfs = emitInTile; + + if (outOfs < spaceLeft) + outOfs += currOfs; + else + { + int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin]; + outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft; + } + + // Write. + + int queueIdx = warpInTile * 32 + threadInWarp; + int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)]; + + tileSegData[outOfs] = triIdx; + } + + //------------------------------------------------------------------------ + // Patch. + //------------------------------------------------------------------------ + + // Allocated segment per thread: Initialize next-pointer and count. + + for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32) + { + int segIdx = firstAllocSeg + i; + tileSegNext[segIdx] = segIdx + 1; + tileSegCount[segIdx] = CR_TILE_SEG_SIZE; + } + + // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs. 
+ + __syncthreads(); + for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32) + { + int oldOfs = s_tileStreamCurrOfs[tileInBin]; + int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin]; + int allocLo = s_tileAllocPrefixSum[tileInBin]; + int allocHi = s_tileAllocPrefixSum[tileInBin + 1]; + + if (allocLo != allocHi) + { + S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2]; + if (oldOfs < 0) + nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)]; + *nextPtr = firstAllocSeg + allocLo; + + newOfs--; + newOfs &= CR_TILE_SEG_SIZE - 1; + newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2; + newOfs++; + } + s_tileStreamCurrOfs[tileInBin] = newOfs; + } + + // Advance queue read pointer. + // Queue became empty => bin done. + + triQueueReadPos += CR_COARSE_WARPS * 32; + } + while (triQueueReadPos < triQueueWritePos); + + // Tile per thread: Fix next-pointer and count of the last segment. + // 32 tiles per warp: Count active tiles. 
+ + __syncthreads(); + + for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32) + { + int tileInBin = tileInBin_base + thrInBlock; + bool act = (tileInBin < CR_BIN_SQR); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + int tileX = tileInBin & (CR_BIN_SIZE - 1); + int tileY = tileInBin >> CR_BIN_LOG2; + bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin); + + int ofs = s_tileStreamCurrOfs[tileInBin]; + int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2; + int segCount = ofs & (CR_TILE_SEG_SIZE - 1); + + if (ofs >= 0) + tileSegNext[segIdx] = -1; + else if (force) + { + s_tileStreamCurrOfs[tileInBin] = 0; + tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1; + } + + if (segCount != 0) + tileSegCount[segIdx] = segCount; + + U32 res = __ballot_sync(actMask, ofs >= 0 | force); + if (threadIdx.x == 0) + s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res); + } + } + + // First warp: Scan-8. + // One thread: Allocate space for active tiles. + + __syncthreads(); + + bool scan8 = (thrInBlock < CR_BIN_SQR / 32); + U32 scan8Mask = __ballot_sync(~0u, scan8); + if (scan8) + { + volatile U32* v = &s_scanTemp[0][thrInBlock + 16]; + U32 sum = v[0]; + #if (CR_BIN_SQR > 1 * 32) + sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + #if (CR_BIN_SQR > 2 * 32) + sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + #if (CR_BIN_SQR > 4 * 32) + sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + + if (thrInBlock == CR_BIN_SQR / 32 - 1) + s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum); + } + + // Tile per thread: Output active tiles. 
+ + __syncthreads(); + + for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32) + { + int tileInBin = tileInBin_base + thrInBlock; + bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + int activeIdx = s_firstActiveIdx; + activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15]; + activeIdx += __popc(actMask & getLaneMaskLt()); + activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles); + } + } + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Constants.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Constants.hpp new file mode 100644 index 00000000..916315cd --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Constants.hpp @@ -0,0 +1,73 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ + +#define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize. +#define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize. + +#define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize. +#define CR_BIN_LOG2 4 // BinSize / TileSize. +#define CR_TILE_LOG2 3 // TileSize / PixelSize. + +#define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries. 
+#define CR_FLIPBIT_FLIP_Y 2 +#define CR_FLIPBIT_FLIP_X 3 +#define CR_FLIPBIT_SWAP_XY 4 +#define CR_FLIPBIT_COMPL 5 + +#define CR_BIN_STREAMS_LOG2 4 +#define CR_BIN_SEG_LOG2 9 // 32-bit entries. +#define CR_TILE_SEG_LOG2 5 // 32-bit entries. + +#define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster. +#define CR_COARSE_QUEUE_LOG2 10 // Triangles. + +#define CR_SETUP_WARPS 2 +#define CR_SETUP_OPT_BLOCKS 8 +#define CR_BIN_WARPS 16 +#define CR_COARSE_WARPS 16 // Must be a power of two. +#define CR_FINE_MAX_WARPS 20 + +#define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block. + +//------------------------------------------------------------------------ + +#define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2) +#define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2) +#define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2)) + +#define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2) +#define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2)) +#define CR_BIN_SIZE (1 << CR_BIN_LOG2) +#define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2)) + +#define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2) +#define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2) +#define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2)) +#define CR_TILE_SIZE (1 << CR_TILE_LOG2) +#define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2)) + +#define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2) +#define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2) +#define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2) + +#define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2) +#define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2) + +//------------------------------------------------------------------------ +// When evaluating interpolated Z pixel centers, we may introduce an error +// of (+-CR_LERP_ERROR) ULPs. 
+ +#define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2)) +#define CR_DEPTH_MIN CR_LERP_ERROR(3) +#define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3)) + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp new file mode 100644 index 00000000..db8bf314 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp @@ -0,0 +1,79 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "Defs.hpp" +#include "../CudaRaster.hpp" +#include "RasterImpl.hpp" + +using namespace CR; + +//------------------------------------------------------------------------ +// Stub interface implementation. 
+//------------------------------------------------------------------------ + +CudaRaster::CudaRaster() +{ + m_impl = new RasterImpl(); +} + +CudaRaster::~CudaRaster() +{ + delete m_impl; +} + +void CudaRaster::setBufferSize(int width, int height, int numImages) +{ + m_impl->setBufferSize(Vec3i(width, height, numImages)); +} + +void CudaRaster::setViewport(int width, int height, int offsetX, int offsetY) +{ + m_impl->setViewport(Vec2i(width, height), Vec2i(offsetX, offsetY)); +} + +void CudaRaster::setRenderModeFlags(U32 flags) +{ + m_impl->setRenderModeFlags(flags); +} + +void CudaRaster::deferredClear(U32 clearColor) +{ + m_impl->deferredClear(clearColor); +} + +void CudaRaster::setVertexBuffer(void* vertices, int numVertices) +{ + m_impl->setVertexBuffer(vertices, numVertices); +} + +void CudaRaster::setIndexBuffer(void* indices, int numTriangles) +{ + m_impl->setIndexBuffer(indices, numTriangles); +} + +bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream) +{ + return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream); +} + +void* CudaRaster::getColorBuffer(void) +{ + return m_impl->getColorBuffer(); +} + +void* CudaRaster::getDepthBuffer(void) +{ + return m_impl->getDepthBuffer(); +} + +void CudaRaster::swapDepthAndPeel(void) +{ + m_impl->swapDepthAndPeel(); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Defs.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Defs.hpp new file mode 100644 index 00000000..7aa7774c --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Defs.hpp @@ -0,0 +1,90 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include +#include + +namespace CR +{ +//------------------------------------------------------------------------ + +#ifndef NULL +# define NULL 0 +#endif + +#ifdef __CUDACC__ +# define CR_CUDA 1 +#else +# define CR_CUDA 0 +#endif + +#if CR_CUDA +# define CR_CUDA_FUNC __device__ __inline__ +# define CR_CUDA_CONST __constant__ +#else +# define CR_CUDA_FUNC inline +# define CR_CUDA_CONST static const +#endif + +#define CR_UNREF(X) ((void)(X)) +#define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0]))) + +//------------------------------------------------------------------------ + +typedef uint8_t U8; +typedef uint16_t U16; +typedef uint32_t U32; +typedef uint64_t U64; +typedef int8_t S8; +typedef int16_t S16; +typedef int32_t S32; +typedef int64_t S64; +typedef float F32; +typedef double F64; +typedef void (*FuncPtr)(void); + +//------------------------------------------------------------------------ + +#define CR_U32_MAX (0xFFFFFFFFu) +#define CR_S32_MIN (~0x7FFFFFFF) +#define CR_S32_MAX (0x7FFFFFFF) +#define CR_U64_MAX ((U64)(S64)-1) +#define CR_S64_MIN ((S64)-1 << 63) +#define CR_S64_MAX (~((S64)-1 << 63)) +#define CR_F32_MIN (1.175494351e-38f) +#define CR_F32_MAX (3.402823466e+38f) +#define CR_F64_MIN (2.2250738585072014e-308) +#define CR_F64_MAX (1.7976931348623158e+308) + +//------------------------------------------------------------------------ +// Misc types. 
+ +class Vec2i +{ +public: + Vec2i(int x_, int y_) : x(x_), y(y_) {} + int x, y; +}; + +class Vec3i +{ +public: + Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {} + int x, y, z; +}; + +//------------------------------------------------------------------------ +// CUDA utilities. + +#if CR_CUDA +# define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y))) +#endif + +//------------------------------------------------------------------------ +} // namespace CR diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/FineRaster.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/FineRaster.inl new file mode 100644 index 00000000..720e9997 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/FineRaster.inl @@ -0,0 +1,385 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Utility funcs. +//------------------------------------------------------------------------ + +__device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth) +{ + tileZMax = CR_DEPTH_MAX; + tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax); +} + +__device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp) +{ + // Entry is warp-coherent. 
+ if (__any_sync(~0u, tileZUpd)) + { + U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp(); + temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + tileZMax = temp[47]; + tileZUpd = false; + } +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment) +{ + const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;; + const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z; + const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z; + const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z; + + if (threadIdx.x >= tileSegCount[segment]) + { + triIdx = -1; + dataIdx = -1; + } + else + { + int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x]; + triIdx = subtriIdx >> 3; + dataIdx = triIdx; + subtriIdx &= 7; + if (subtriIdx != 7) + dataIdx = triHeaderPtr[triIdx].misc + subtriIdx; + triHeader = *((uint4*)triHeaderPtr + dataIdx); + } + + // advance to next segment + segment = tileSegNext[segment]; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax) +{ + U32 zmin = triHeader.w & 0xFFFFF000; + return (zmin > tileZMax); +} + 
+//------------------------------------------------------------------------ + +__device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut) +{ + int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1)); + int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1)); + + // extract S16 vertex positions while subtracting tile coordinates + S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX); + S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY); + S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x); + S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x); + S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z); + S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z); + + // extract flipbits + U32 f01 = (triHeader.w >> 6) & 0x3C; + U32 f12 = (triHeader.w >> 2) & 0x3C; + U32 f20 = (triHeader.w << 2) & 0x3C; + + // compute per-edge coverage masks + U64 c01, c12, c20; + c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut); + c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut); + c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut); + + // combine masks + return c01 & c12 & c20; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp) +{ + __syncwarp(); + temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 16]; 
__syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + return value; +} + +__device__ __inline__ volatile const U32& scan32_total(volatile U32* temp) +{ + return temp[47]; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ S32 findBit(U64 mask, int idx) +{ + U32 x = getLo(mask); + int pop = __popc(x); + bool p = (pop <= idx); + if (p) x = getHi(mask); + if (p) idx -= pop; + int bit = p ? 32 : 0; + + pop = __popc(x & 0x0000ffffu); + p = (pop <= idx); + if (p) x >>= 16; + if (p) bit += 16; + if (p) idx -= pop; + + U32 tmp = x & 0x000000ffu; + pop = __popc(tmp); + p = (pop <= idx); + if (p) tmp = x & 0x0000ff00u; + if (p) idx -= pop; + + return findLeadingOne(tmp) + bit - idx; +} + +//------------------------------------------------------------------------ +// Single-sample implementation. +//------------------------------------------------------------------------ + +__device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask) +{ + atomicMin((U32*)pDepth, depth); + __syncwarp(ropMask); + bool act = (depth == *pDepth); + __syncwarp(ropMask); + U32 actMask = __ballot_sync(ropMask, act); + if (act) + { + *pDepth = 0; + __syncwarp(actMask); + atomicMax((U32*)pDepth, threadIdx.x); + __syncwarp(actMask); + if (*pDepth == threadIdx.x) + { + *pDepth = depth; + *pColor = color; + } + __syncwarp(actMask); + } +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void fineRasterImpl(const CRParams p) +{ + // for 20 warps: + __shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB + __shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB + __shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB + __shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB + __shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData 
index + __shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask + __shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index + __shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB + // = 47.25KB total + + CRAtomics& atomics = p.atomics[blockIdx.z]; + const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris; + + const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z; + const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z; + + volatile U32* tileColor = s_tileColor[threadIdx.y]; + volatile U32* tileDepth = s_tileDepth[threadIdx.y]; + volatile U32* tilePeel = s_tilePeel[threadIdx.y]; + volatile U32* triDataIdx = s_triDataIdx[threadIdx.y]; + volatile U64* triangleCov = s_triangleCov[threadIdx.y]; + volatile U32* triangleFrag = s_triangleFrag[threadIdx.y]; + volatile U32* temp = s_temp[threadIdx.y]; + + if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs) + return; + + temp[threadIdx.x] = 0; // first 16 elements of temp are always zero + cover8x8_setupLUT(s_cover8x8_lut); + __syncthreads(); + + // loop over tiles + for (;;) + { + // pick a tile + if (threadIdx.x == 0) + temp[16] = atomicAdd(&atomics.fineCounter, 1); + __syncwarp(); + int activeIdx = temp[16]; + if (activeIdx >= atomics.numActiveTiles) + break; + + int tileIdx = activeTiles[activeIdx]; + S32 segment = tileFirstSeg[tileIdx]; + int tileY = tileIdx / p.widthTiles; + int tileX = tileIdx - tileY * p.widthTiles; + int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1)); + int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2); + + // initialize per-tile state + int triRead = 0, triWrite = 0; + int fragRead = 0, fragWrite = 0; + if (threadIdx.x == 0) + triangleFrag[63] = 0; // "previous triangle" + + // deferred clear => clear tile + if (p.deferredClear) + { + 
tileColor[threadIdx.x] = p.clearColor; + tileDepth[threadIdx.x] = p.clearDepth; + tileColor[threadIdx.x + 32] = p.clearColor; + tileDepth[threadIdx.x + 32] = p.clearDepth; + } + else // otherwise => read tile from framebuffer + { + U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z; + U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z; + tileColor[threadIdx.x] = pColor[px + p.strideX * py]; + tileDepth[threadIdx.x] = pDepth[px + p.strideX * py]; + tileColor[threadIdx.x + 32] = pColor[px + p.strideX * (py + 4)]; + tileDepth[threadIdx.x + 32] = pDepth[px + p.strideX * (py + 4)]; + } + + // read peeling inputs if enabled + if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) + { + U32* pPeel = (U32*)p.peelBuffer + p.strideX * p.strideY * blockIdx.z; + tilePeel[threadIdx.x] = pPeel[px + p.strideX * py]; + tilePeel[threadIdx.x + 32] = pPeel[px + p.strideX * (py + 4)]; + } + + U32 tileZMax; + bool tileZUpd; + initTileZMax(tileZMax, tileZUpd, tileDepth); + + // process fragments + for(;;) + { + // need to queue more fragments? + if (fragWrite - fragRead < 32 && segment >= 0) + { + // update tile z - coherent over warp + updateTileZMax(tileZMax, tileZUpd, tileDepth, temp); + + // read triangles + do + { + // read triangle index and data, advance to next segment + S32 triIdx, dataIdx; + uint4 triHeader; + getTriangle(p, triIdx, dataIdx, triHeader, segment); + + // early z cull + if (triIdx >= 0 && earlyZCull(triHeader, tileZMax)) + triIdx = -1; + + // determine coverage + U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut); + S32 pop = (triIdx == -1) ? 
0 : __popcll(coverage); + + // fragment count scan + U32 frag = scan32_value(pop, temp); + frag += fragWrite; // frag now holds cumulative fragment count + fragWrite += scan32_total(temp); + + // queue non-empty triangles + U32 goodMask = __ballot_sync(~0u, pop != 0); + if (pop != 0) + { + int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63; + triDataIdx [idx] = dataIdx; + triangleFrag[idx] = frag; + triangleCov [idx] = coverage; + } + triWrite += __popc(goodMask); + } + while (fragWrite - fragRead < 32 && segment >= 0); + } + __syncwarp(); + + // end of segment? + if (fragRead == fragWrite) + break; + + // clear triangle boundaries + temp[threadIdx.x + 16] = 0; + __syncwarp(); + + // tag triangle boundaries + if (triRead + threadIdx.x < triWrite) + { + int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead; + if (idx <= 32) + temp[idx + 16 - 1] = 1; + } + __syncwarp(); + + int ropLaneIdx = threadIdx.x; + U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]); + + // distribute fragments + bool hasFragment = (ropLaneIdx < fragWrite - fragRead); + U32 fragmentMask = __ballot_sync(~0u, hasFragment); + if (hasFragment) + { + int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63; + int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]); + U64 coverage = triangleCov[triBufIdx]; + int pixelInTile = findBit(coverage, fragIdx); + int dataIdx = triDataIdx[triBufIdx]; + + // determine pixel position + U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7); + U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3); + + // depth test + U32 depth = 0; + uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4)); + + depth = td.x * pixelX + td.y * pixelY + td.z; + bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]); + if (!zkill) + { + U32 oldDepth = tileDepth[pixelInTile]; + if (depth > oldDepth) + zkill = true; + else if 
(oldDepth == tileZMax) + tileZUpd = true; // we are replacing previous zmax => need to update + } + + U32 ropMask = __ballot_sync(fragmentMask, !zkill); + if (!zkill) + executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask); + } + // no need to sync, as next up is updateTileZMax that does internal warp sync + + // update counters + fragRead = ::min(fragRead + 32, fragWrite); + triRead += __popc(boundaryMask); + } + + // Write tile back to the framebuffer. + if (true) + { + int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1)); + int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2); + U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z; + U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z; + pColor[px + p.strideX * py] = tileColor[threadIdx.x]; + pDepth[px + p.strideX * py] = tileDepth[threadIdx.x]; + pColor[px + p.strideX * (py + 4)] = tileColor[threadIdx.x + 32]; + pDepth[px + p.strideX * (py + 4)] = tileDepth[threadIdx.x + 32]; + } + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp new file mode 100644 index 00000000..26133c97 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp @@ -0,0 +1,153 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#pragma once +#include "Defs.hpp" +#include "Constants.hpp" + +namespace CR +{ +//------------------------------------------------------------------------ +// Projected triangle. +//------------------------------------------------------------------------ + +struct CRTriangleHeader +{ + S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1. + S16 v0y; + S16 v1x; + S16 v1y; + S16 v2x; + S16 v2y; + + U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase) +}; + +//------------------------------------------------------------------------ + +struct CRTriangleData +{ + U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2) + U32 zy; + U32 zb; + U32 id; // Triangle id. +}; + +//------------------------------------------------------------------------ +// Device-side structures. +//------------------------------------------------------------------------ + +struct CRAtomics +{ + // Setup. + S32 numSubtris; // = numTris + + // Bin. + S32 binCounter; // = 0 + S32 numBinSegs; // = 0 + + // Coarse. + S32 coarseCounter; // = 0 + S32 numTileSegs; // = 0 + S32 numActiveTiles; // = 0 + + // Fine. + S32 fineCounter; // = 0 +}; + +//------------------------------------------------------------------------ + +struct CRImageParams +{ + S32 triOffset; // First triangle index to draw. + S32 triCount; // Number of triangles to draw. + S32 binBatchSize; // Number of triangles per batch. +}; + +//------------------------------------------------------------------------ + +struct CRParams +{ + // Common. + + CRAtomics* atomics; // Work counters. Per-image. + S32 numImages; // Batch size. + S32 totalCount; // In range mode, total number of triangles to render. + S32 instanceMode; // 0 = range mode, 1 = instance mode. + + S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode. + S32 numTriangles; // Number of triangles in input buffer. 
+ void* vertexBuffer; // numVertices * float4(x, y, z, w) + void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2) + + S32 widthPixels; // Render buffer size in pixels. Must be multiple of tile size (8x8). + S32 heightPixels; + S32 widthPixelsVp; // Viewport size in pixels. + S32 heightPixelsVp; + S32 widthBins; // widthPixels / CR_BIN_SIZE + S32 heightBins; // heightPixels / CR_BIN_SIZE + S32 numBins; // widthBins * heightBins + + F32 xs; // Vertex position adjustments for tiled rendering. + F32 ys; + F32 xo; + F32 yo; + + S32 widthTiles; // widthPixels / CR_TILE_SIZE + S32 heightTiles; // heightPixels / CR_TILE_SIZE + S32 numTiles; // widthTiles * heightTiles + + U32 renderModeFlags; + S32 deferredClear; // 1 = Clear framebuffer before rendering triangles. + U32 clearColor; + U32 clearDepth; + + // These are uniform across batch. + + S32 maxSubtris; + S32 maxBinSegs; + S32 maxTileSegs; + + // Setup output / bin input. + + void* triSubtris; // maxSubtris * U8 + void* triHeader; // maxSubtris * CRTriangleHeader + void* triData; // maxSubtris * CRTriangleData + + // Bin output / coarse input. + + void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32 + void* binSegNext; // maxBinSegs * S32 + void* binSegCount; // maxBinSegs * S32 + void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none + void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris) + + // Coarse output / fine input. + + void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32 + void* tileSegNext; // maxTileSegs * S32 + void* tileSegCount; // maxTileSegs * S32 + void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx) + void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none + + // Surface buffers. Outer tile offset is baked into pointers. 
+ + void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32 + void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32 + void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled. + S32 strideX; // horizontal size in pixels + S32 strideY; // vertical stride in pixels + + // Per-image parameters for first images are embedded here to avoid extra memcpy for small batches. + + CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS]; + const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS. +}; + +//------------------------------------------------------------------------ +} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp new file mode 100644 index 00000000..f7f05d57 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp @@ -0,0 +1,370 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "../../framework.h" +#include "PrivateDefs.hpp" +#include "Constants.hpp" +#include "RasterImpl.hpp" +#include + +using namespace CR; +using std::min; +using std::max; + +//------------------------------------------------------------------------ +// Kernel prototypes and variables. 
+ +void triangleSetupKernel (const CRParams p); +void binRasterKernel (const CRParams p); +void coarseRasterKernel (const CRParams p); +void fineRasterKernel (const CRParams p); + +//------------------------------------------------------------------------ + +RasterImpl::RasterImpl(void) +: m_renderModeFlags (0), + m_deferredClear (false), + m_clearColor (0), + m_vertexPtr (NULL), + m_indexPtr (NULL), + m_numVertices (0), + m_numTriangles (0), + m_bufferSizesReported (0), + + m_numImages (0), + m_bufferSizePixels (0, 0), + m_bufferSizeVp (0, 0), + m_sizePixels (0, 0), + m_sizeVp (0, 0), + m_offsetPixels (0, 0), + m_sizeBins (0, 0), + m_numBins (0), + m_sizeTiles (0, 0), + m_numTiles (0), + + m_numSMs (1), + m_numCoarseBlocksPerSM (1), + m_numFineBlocksPerSM (1), + m_numFineWarpsPerBlock (1), + + m_maxSubtris (1), + m_maxBinSegs (1), + m_maxTileSegs (1) +{ + // Query relevant device attributes. + + int currentDevice = 0; + NVDR_CHECK_CUDA_ERROR(cudaGetDevice(¤tDevice)); + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice)); + cudaFuncAttributes attr; + NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel)); + m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0)); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0)); + + // Setup functions. 
+ + NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared)); + NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared)); + NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared)); + NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared)); +} + +//------------------------------------------------------------------------ + +RasterImpl::~RasterImpl(void) +{ + // Empty. +} + +//------------------------------------------------------------------------ + +void RasterImpl::setBufferSize(Vec3i size) +{ + // Internal buffer width and height must be divisible by tile size. + int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + + m_bufferSizePixels = Vec2i(w, h); + m_bufferSizeVp = Vec2i(size.x, size.y); + m_numImages = size.z; + + m_colorBuffer.reset(w * h * size.z * sizeof(U32)); + m_depthBuffer.reset(w * h * size.z * sizeof(U32)); +} + +//------------------------------------------------------------------------ + +void RasterImpl::setViewport(Vec2i size, Vec2i offset) +{ + // Offset must be divisible by tile size. + NVDR_CHECK((offset.x & (CR_TILE_SIZE - 1)) == 0 && (offset.y & (CR_TILE_SIZE - 1)) == 0, "invalid viewport offset"); + + // Round internal viewport size to multiples of tile size. 
+ int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + + m_sizePixels = Vec2i(w, h); + m_offsetPixels = offset; + m_sizeVp = Vec2i(size.x, size.y); + m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2; + m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2; + m_numTiles = m_sizeTiles.x * m_sizeTiles.y; + m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2; + m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2; + m_numBins = m_sizeBins.x * m_sizeBins.y; +} + +void RasterImpl::swapDepthAndPeel(void) +{ + m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer. + + void* tmp = m_depthBuffer.getPtr(); + m_depthBuffer.setPtr(m_peelBuffer.getPtr()); + m_peelBuffer.setPtr(tmp); +} + +//------------------------------------------------------------------------ + +bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream) +{ + bool instanceMode = (!ranges); + + int maxSubtrisSlack = 4096; // x 81B = 324KB + int maxBinSegsSlack = 256; // x 2137B = 534KB + int maxTileSegsSlack = 4096; // x 136B = 544KB + + // Resize atomics as needed. + m_crAtomics .grow(m_numImages * sizeof(CRAtomics)); + m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics)); + + // Size of these buffers doesn't depend on input. + m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32)); + m_binTotal .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32)); + m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32)); + m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32)); + + // Construct per-image parameters and determine worst-case buffer sizes. 
+ m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams)); + CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr(); + for (int i=0; i < m_numImages; i++) + { + CRImageParams& ip = imageParams[i]; + + int roundSize = CR_BIN_WARPS * 32; + int minBatches = CR_BIN_STREAMS_SIZE * 2; + int maxRounds = 32; + + ip.triOffset = instanceMode ? 0 : ranges[i].x; + ip.triCount = instanceMode ? m_numTriangles : ranges[i].y; + ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize; + + m_maxSubtris = max(m_maxSubtris, min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE)); + m_maxBinSegs = max(m_maxBinSegs, max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack); + m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack); + } + + // Retry until successful. + + for (;;) + { + // Allocate buffers. + m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8)); + m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader)); + m_triData .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData)); + + m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32)); + m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32)); + m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32)); + + m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32)); + m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32)); + m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32)); + + // Report if buffers grow from last time. + size_t sizesTotal = getTotalBufferSizes(); + if (sizesTotal > m_bufferSizesReported) + { + size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up. + sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age. 
+ LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB"; + m_bufferSizesReported = sizesMB << 20; + } + + // Launch stages. Blocks until everything is done. + launchStages(instanceMode, peel, stream); + + // Peeling iteration cannot fail, so no point checking things further. + if (peel) + break; + + // Atomics after coarse stage are now available. + CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr(); + + // Success? + bool failed = false; + for (int i=0; i < m_numImages; i++) + { + const CRAtomics& a = atomics[i]; + failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs); + } + if (!failed) + break; // Success! + + // If we were already at maximum capacity, no can do. + if (m_maxSubtris == CR_MAXSUBTRIS_SIZE) + return false; + + // Enlarge buffers and try again. + for (int i=0; i < m_numImages; i++) + { + const CRAtomics& a = atomics[i]; + m_maxSubtris = max(m_maxSubtris, min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE)); + m_maxBinSegs = max(m_maxBinSegs, a.numBinSegs + maxBinSegsSlack); + m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack); + } + } + + m_deferredClear = false; + return true; // Success. +} + +//------------------------------------------------------------------------ + +size_t RasterImpl::getTotalBufferSizes(void) const +{ + return + m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params. 
+ m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() + + m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() + + m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize(); +} + +//------------------------------------------------------------------------ + +void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream) +{ + CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr(); + + // Unless peeling, initialize atomics to mostly zero. + CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr(); + if (!peel) + { + memset(atomics, 0, m_numImages * sizeof(CRAtomics)); + for (int i=0; i < m_numImages; i++) + atomics[i].numSubtris = imageParams[i].triCount; + } + + // Copy to device. If peeling, this is the state after coarse raster launch on first iteration. + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream)); + + // Copy per-image parameters if there are more than fits in launch parameter block and we haven't done it already. + if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS) + { + int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS; + m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams)); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream)); + } + + // Set global parameters. + CRParams p; + { + p.atomics = (CRAtomics*)m_crAtomics.getPtr(); + p.numImages = m_numImages; + p.totalCount = 0; // Only relevant in range mode. + p.instanceMode = instanceMode ? 
1 : 0; + + p.numVertices = m_numVertices; + p.numTriangles = m_numTriangles; + p.vertexBuffer = m_vertexPtr; + p.indexBuffer = m_indexPtr; + + p.widthPixels = m_sizePixels.x; + p.heightPixels = m_sizePixels.y; + p.widthPixelsVp = m_sizeVp.x; + p.heightPixelsVp = m_sizeVp.y; + p.widthBins = m_sizeBins.x; + p.heightBins = m_sizeBins.y; + p.numBins = m_numBins; + + p.xs = (float)m_bufferSizeVp.x / (float)m_sizeVp.x; + p.ys = (float)m_bufferSizeVp.y / (float)m_sizeVp.y; + p.xo = (float)(m_bufferSizeVp.x - m_sizeVp.x - 2 * m_offsetPixels.x) / (float)m_sizeVp.x; + p.yo = (float)(m_bufferSizeVp.y - m_sizeVp.y - 2 * m_offsetPixels.y) / (float)m_sizeVp.y; + + p.widthTiles = m_sizeTiles.x; + p.heightTiles = m_sizeTiles.y; + p.numTiles = m_numTiles; + + p.renderModeFlags = m_renderModeFlags; + p.deferredClear = m_deferredClear ? 1 : 0; + p.clearColor = m_clearColor; + p.clearDepth = CR_DEPTH_MAX; + + p.maxSubtris = m_maxSubtris; + p.maxBinSegs = m_maxBinSegs; + p.maxTileSegs = m_maxTileSegs; + + p.triSubtris = m_triSubtris.getPtr(); + p.triHeader = m_triHeader.getPtr(); + p.triData = m_triData.getPtr(); + p.binSegData = m_binSegData.getPtr(); + p.binSegNext = m_binSegNext.getPtr(); + p.binSegCount = m_binSegCount.getPtr(); + p.binFirstSeg = m_binFirstSeg.getPtr(); + p.binTotal = m_binTotal.getPtr(); + p.tileSegData = m_tileSegData.getPtr(); + p.tileSegNext = m_tileSegNext.getPtr(); + p.tileSegCount = m_tileSegCount.getPtr(); + p.activeTiles = m_activeTiles.getPtr(); + p.tileFirstSeg = m_tileFirstSeg.getPtr(); + + size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32); + p.colorBuffer = m_colorBuffer.getPtr(byteOffset); + p.depthBuffer = m_depthBuffer.getPtr(byteOffset); + p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? 
m_peelBuffer.getPtr(byteOffset) : 0; + p.strideX = m_bufferSizePixels.x; + p.strideY = m_bufferSizePixels.y; + + memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams)); + p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr(); + } + + // Setup block sizes. + + dim3 brBlock(32, CR_BIN_WARPS); + dim3 crBlock(32, CR_COARSE_WARPS); + dim3 frBlock(32, m_numFineWarpsPerBlock); + void* args[] = {&p}; + + // Launch stages from setup to coarse and copy atomics to host only if this is not a single-tile peeling iteration. + if (!peel) + { + if (instanceMode) + { + int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream)); + } + else + { + for (int i=0; i < m_numImages; i++) + p.totalCount += imageParams[i].triCount; + int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream)); + } + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream)); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream)); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream)); + } + + // Fine rasterizer is launched always. 
+ NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream)); + NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.cu new file mode 100644 index 00000000..43b1edf0 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.cu @@ -0,0 +1,37 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "../CudaRaster.hpp" +#include "PrivateDefs.hpp" +#include "Constants.hpp" +#include "Util.inl" + +namespace CR +{ + +//------------------------------------------------------------------------ +// Stage implementations. +//------------------------------------------------------------------------ + +#include "TriangleSetup.inl" +#include "BinRaster.inl" +#include "CoarseRaster.inl" +#include "FineRaster.inl" + +} + +//------------------------------------------------------------------------ +// Stage entry points. 
+//------------------------------------------------------------------------ + +__global__ void __launch_bounds__(CR_SETUP_WARPS * 32, CR_SETUP_OPT_BLOCKS) triangleSetupKernel (const CR::CRParams p) { CR::triangleSetupImpl(p); } +__global__ void __launch_bounds__(CR_BIN_WARPS * 32, 1) binRasterKernel (const CR::CRParams p) { CR::binRasterImpl(p); } +__global__ void __launch_bounds__(CR_COARSE_WARPS * 32, 1) coarseRasterKernel (const CR::CRParams p) { CR::coarseRasterImpl(p); } +__global__ void __launch_bounds__(CR_FINE_MAX_WARPS * 32, 1) fineRasterKernel (const CR::CRParams p) { CR::fineRasterImpl(p); } + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp new file mode 100644 index 00000000..d594acdf --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp @@ -0,0 +1,102 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#pragma once +#include "PrivateDefs.hpp" +#include "Buffer.hpp" +#include "../CudaRaster.hpp" + +namespace CR +{ +//------------------------------------------------------------------------ + +class RasterImpl +{ +public: + RasterImpl (void); + ~RasterImpl (void); + + void setBufferSize (Vec3i size); + void setViewport (Vec2i size, Vec2i offset); + void setRenderModeFlags (U32 flags) { m_renderModeFlags = flags; } + void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; } + void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer. + void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer. + bool drawTriangles (const Vec2i* ranges, bool peel, cudaStream_t stream); + void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer. + void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer. + void swapDepthAndPeel (void); + size_t getTotalBufferSizes (void) const; + +private: + void launchStages (bool instanceMode, bool peel, cudaStream_t stream); + + // State. + + unsigned int m_renderModeFlags; + bool m_deferredClear; + unsigned int m_clearColor; + void* m_vertexPtr; + void* m_indexPtr; + int m_numVertices; // Input buffer size. + int m_numTriangles; // Input buffer size. + size_t m_bufferSizesReported; // Previously reported buffer sizes. + + // Surfaces. + + Buffer m_colorBuffer; + Buffer m_depthBuffer; + Buffer m_peelBuffer; + int m_numImages; + Vec2i m_bufferSizePixels; // Internal buffer size. + Vec2i m_bufferSizeVp; // Total viewport size. + Vec2i m_sizePixels; // Internal size at which all computation is done, buffers reserved, etc. + Vec2i m_sizeVp; // Size to which output will be cropped outside, determines viewport size. + Vec2i m_offsetPixels; // Viewport offset for tiled rendering. 
+ Vec2i m_sizeBins; + S32 m_numBins; + Vec2i m_sizeTiles; + S32 m_numTiles; + + // Launch sizes etc. + + S32 m_numSMs; + S32 m_numCoarseBlocksPerSM; + S32 m_numFineBlocksPerSM; + S32 m_numFineWarpsPerBlock; + + // Global intermediate buffers. Individual images have offsets to these. + + Buffer m_crAtomics; + HostBuffer m_crAtomicsHost; + HostBuffer m_crImageParamsHost; + Buffer m_crImageParamsExtra; + Buffer m_triSubtris; + Buffer m_triHeader; + Buffer m_triData; + Buffer m_binFirstSeg; + Buffer m_binTotal; + Buffer m_binSegData; + Buffer m_binSegNext; + Buffer m_binSegCount; + Buffer m_activeTiles; + Buffer m_tileFirstSeg; + Buffer m_tileSegData; + Buffer m_tileSegNext; + Buffer m_tileSegCount; + + // Actual buffer sizes. + + S32 m_maxSubtris; + S32 m_maxBinSegs; + S32 m_maxTileSegs; +}; + +//------------------------------------------------------------------------ +} // namespace CR + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl new file mode 100644 index 00000000..276f0a40 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl @@ -0,0 +1,402 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +//------------------------------------------------------------------------ + +__device__ __inline__ void snapTriangle( + const CRParams& p, + float4 v0, float4 v1, float4 v2, + int2& p0, int2& p1, int2& p2, float3& rcpW, int2& lo, int2& hi) +{ + F32 viewScaleX = (F32)(p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)); + F32 viewScaleY = (F32)(p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)); + rcpW = make_float3(1.0f / v0.w, 1.0f / v1.w, 1.0f / v2.w); + p0 = make_int2(f32_to_s32_sat(v0.x * rcpW.x * viewScaleX), f32_to_s32_sat(v0.y * rcpW.x * viewScaleY)); + p1 = make_int2(f32_to_s32_sat(v1.x * rcpW.y * viewScaleX), f32_to_s32_sat(v1.y * rcpW.y * viewScaleY)); + p2 = make_int2(f32_to_s32_sat(v2.x * rcpW.z * viewScaleX), f32_to_s32_sat(v2.y * rcpW.z * viewScaleY)); + lo = make_int2(min_min(p0.x, p1.x, p2.x), min_min(p0.y, p1.y, p2.y)); + hi = make_int2(max_max(p0.x, p1.x, p2.x), max_max(p0.y, p1.y, p2.y)); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U32 cover8x8_selectFlips(S32 dx, S32 dy) // 10 instr +{ + U32 flips = 0; + if (dy > 0 || (dy == 0 && dx <= 0)) + flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y) ^ (1 << CR_FLIPBIT_COMPL); + if (dx > 0) + flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y); + if (::abs(dx) < ::abs(dy)) + flips ^= (1 << CR_FLIPBIT_SWAP_XY) ^ (1 << CR_FLIPBIT_FLIP_Y); + return flips; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ bool prepareTriangle( + const CRParams& p, + int2 p0, int2 p1, int2 p2, int2 lo, int2 hi, + int2& d1, int2& d2, S32& area) +{ + // Backfacing or degenerate => cull. + + d1 = make_int2(p1.x - p0.x, p1.y - p0.y); + d2 = make_int2(p2.x - p0.x, p2.y - p0.y); + area = d1.x * d2.y - d1.y * d2.x; + + if (area == 0) + return false; // Degenerate. + + if (area < 0 && (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableBackfaceCulling) != 0) + return false; // Backfacing. 
+ + // AABB falls between samples => cull. + + int sampleSize = 1 << CR_SUBPIXEL_LOG2; + int biasX = (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1); + int biasY = (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1); + int lox = (int)add_add(lo.x, sampleSize - 1, biasX) & -sampleSize; + int loy = (int)add_add(lo.y, sampleSize - 1, biasY) & -sampleSize; + int hix = (hi.x + biasX) & -sampleSize; + int hiy = (hi.y + biasY) & -sampleSize; + + if (lox > hix || loy > hiy) + return false; // Between pixels. + + // AABB covers 1 or 2 samples => cull if they are not covered. + + int diff = add_sub(hix, hiy, lox) - loy; + if (diff <= sampleSize) + { + int2 t0 = make_int2(add_sub(p0.x, biasX, lox), add_sub(p0.y, biasY, loy)); + int2 t1 = make_int2(add_sub(p1.x, biasX, lox), add_sub(p1.y, biasY, loy)); + int2 t2 = make_int2(add_sub(p2.x, biasX, lox), add_sub(p2.y, biasY, loy)); + S32 e0 = t0.x * t1.y - t0.y * t1.x; + S32 e1 = t1.x * t2.y - t1.y * t2.x; + S32 e2 = t2.x * t0.y - t2.y * t0.x; + if (area < 0) + { + e0 = -e0; + e1 = -e1; + e2 = -e2; + } + + if (e0 < 0 || e1 < 0 || e2 < 0) + { + if (diff == 0) + return false; // Between pixels. + + t0 = make_int2(add_sub(p0.x, biasX, hix), add_sub(p0.y, biasY, hiy)); + t1 = make_int2(add_sub(p1.x, biasX, hix), add_sub(p1.y, biasY, hiy)); + t2 = make_int2(add_sub(p2.x, biasX, hix), add_sub(p2.y, biasY, hiy)); + e0 = t0.x * t1.y - t0.y * t1.x; + e1 = t1.x * t2.y - t1.y * t2.x; + e2 = t2.x * t0.y - t2.y * t0.x; + if (area < 0) + { + e0 = -e0; + e1 = -e1; + e2 = -e2; + } + + if (e0 < 0 || e1 < 0 || e2 < 0) + return false; // Between pixels. + } + } + + // Otherwise => proceed to output the triangle. + + return true; // Visible. 
+} + +//------------------------------------------------------------------------ + +__device__ __inline__ void setupTriangle( + const CRParams& p, + CRTriangleHeader* th, CRTriangleData* td, int triId, + float v0z, float v1z, float v2z, + int2 p0, int2 p1, int2 p2, float3 rcpW, + int2 d1, int2 d2, S32 area) +{ + // Swap vertices 1 and 2 if area is negative. Only executed if backface culling is + // disabled (if it is enabled, we never come here with area < 0). + + if (area < 0) + { + swap(d1, d2); + swap(p1, p2); + swap(v1z, v2z); + swap(rcpW.y, rcpW.z); + area = -area; + } + + int2 wv0; + wv0.x = p0.x + (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)); + wv0.y = p0.y + (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)); + + // Setup depth plane equation. + + F32 zcoef = (F32)(CR_DEPTH_MAX - CR_DEPTH_MIN) * 0.5f; + F32 zbias = (F32)(CR_DEPTH_MAX + CR_DEPTH_MIN) * 0.5f; + float3 zvert = make_float3( + (v0z * zcoef) * rcpW.x + zbias, + (v1z * zcoef) * rcpW.y + zbias, + (v2z * zcoef) * rcpW.z + zbias + ); + int2 zv0 = make_int2( + wv0.x - (1 << (CR_SUBPIXEL_LOG2 - 1)), + wv0.y - (1 << (CR_SUBPIXEL_LOG2 - 1)) + ); + uint3 zpleq = setupPleq(zvert, zv0, d1, d2, 1.0f / (F32)area); + + U32 zmin = f32_to_u32_sat(fminf(fminf(zvert.x, zvert.y), zvert.z) - (F32)CR_LERP_ERROR(0)); + + // Write CRTriangleData. + + *(uint4*)td = make_uint4(zpleq.x, zpleq.y, zpleq.z, triId); + + // Determine flipbits. + + U32 f01 = cover8x8_selectFlips(d1.x, d1.y); + U32 f12 = cover8x8_selectFlips(d2.x - d1.x, d2.y - d1.y); + U32 f20 = cover8x8_selectFlips(-d2.x, -d2.y); + + // Write CRTriangleHeader. 
+ + *(uint4*)th = make_uint4( + prmt(p0.x, p0.y, 0x5410), + prmt(p1.x, p1.y, 0x5410), + prmt(p2.x, p2.y, 0x5410), + (zmin & 0xfffff000u) | (f01 << 6) | (f12 << 2) | (f20 >> 2)); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void triangleSetupImpl(const CRParams p) +{ + __shared__ F32 s_bary[CR_SETUP_WARPS * 32][18]; + F32* bary = s_bary[threadIdx.x + threadIdx.y * 32]; + + // Compute task and image indices. + + int taskIdx = threadIdx.x + 32 * (threadIdx.y + CR_SETUP_WARPS * blockIdx.x); + int imageIdx = 0; + if (p.instanceMode) + { + imageIdx = blockIdx.z; + if (taskIdx >= p.numTriangles) + return; + } + else + { + while (imageIdx < p.numImages) + { + int count = getImageParams(p, imageIdx).triCount; + if (taskIdx < count) + break; + taskIdx -= count; + imageIdx += 1; + } + if (imageIdx == p.numImages) + return; + } + + // Per-image data structures. + + const CRImageParams& ip = getImageParams(p, imageIdx); + CRAtomics& atomics = p.atomics[imageIdx]; + + const int* indexBuffer = (const int*)p.indexBuffer; + U8* triSubtris = (U8*)p.triSubtris + imageIdx * p.maxSubtris; + CRTriangleHeader* triHeader = (CRTriangleHeader*)p.triHeader + imageIdx * p.maxSubtris; + CRTriangleData* triData = (CRTriangleData*)p.triData + imageIdx * p.maxSubtris; + + // Determine triangle index. + + int triIdx = taskIdx; + if (!p.instanceMode) + triIdx += ip.triOffset; + + // Read vertex indices. + + if ((U32)triIdx >= (U32)p.numTriangles) + { + // Bad triangle index. + triSubtris[taskIdx] = 0; + return; + } + + uint4 vidx; + vidx.x = indexBuffer[triIdx * 3 + 0]; + vidx.y = indexBuffer[triIdx * 3 + 1]; + vidx.z = indexBuffer[triIdx * 3 + 2]; + vidx.w = triIdx + 1; // Triangle index. + + if (vidx.x >= (U32)p.numVertices || + vidx.y >= (U32)p.numVertices || + vidx.z >= (U32)p.numVertices) + { + // Bad vertex index. + triSubtris[taskIdx] = 0; + return; + } + + // Read vertex positions. 
+ + const float4* vertexBuffer = (const float4*)p.vertexBuffer; + if (p.instanceMode) + vertexBuffer += p.numVertices * imageIdx; // Instance offset. + + float4 v0 = vertexBuffer[vidx.x]; + float4 v1 = vertexBuffer[vidx.y]; + float4 v2 = vertexBuffer[vidx.z]; + + // Adjust vertex positions according to current viewport size and offset. + + v0.x = v0.x * p.xs + v0.w * p.xo; + v0.y = v0.y * p.ys + v0.w * p.yo; + v1.x = v1.x * p.xs + v1.w * p.xo; + v1.y = v1.y * p.ys + v1.w * p.yo; + v2.x = v2.x * p.xs + v2.w * p.xo; + v2.y = v2.y * p.ys + v2.w * p.yo; + + // Outside view frustum => cull. + + if (v0.w < fabsf(v0.x) | v0.w < fabsf(v0.y) | v0.w < fabsf(v0.z)) + { + if ((v0.w < +v0.x & v1.w < +v1.x & v2.w < +v2.x) | + (v0.w < -v0.x & v1.w < -v1.x & v2.w < -v2.x) | + (v0.w < +v0.y & v1.w < +v1.y & v2.w < +v2.y) | + (v0.w < -v0.y & v1.w < -v1.y & v2.w < -v2.y) | + (v0.w < +v0.z & v1.w < +v1.z & v2.w < +v2.z) | + (v0.w < -v0.z & v1.w < -v1.z & v2.w < -v2.z)) + { + triSubtris[taskIdx] = 0; + return; + } + } + + // Inside depth range => try to snap vertices. + + if (v0.w >= fabsf(v0.z) & v1.w >= fabsf(v1.z) & v2.w >= fabsf(v2.z)) + { + // Inside S16 range and small enough => fast path. + // Note: aabbLimit comes from the fact that cover8x8 + // does not support guardband with maximal viewport. + + int2 p0, p1, p2, lo, hi; + float3 rcpW; + + snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi); + S32 loxy = ::min(lo.x, lo.y); + S32 hixy = ::max(hi.x, hi.y); + S32 aabbLimit = (1 << (CR_MAXVIEWPORT_LOG2 + CR_SUBPIXEL_LOG2)) - 1; + + if (loxy >= -32768 && hixy <= 32767 && hixy - loxy <= aabbLimit) + { + int2 d1, d2; + S32 area; + bool res = prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area); + triSubtris[taskIdx] = res ? 1 : 0; + + if (res) + setupTriangle( + p, + &triHeader[taskIdx], &triData[taskIdx], vidx.w, + v0.z, v1.z, v2.z, + p0, p1, p2, rcpW, + d1, d2, area); + + return; + } + } + + // Clip to view frustum. 
+ + float4 ov0 = v0; + float4 od1 = make_float4(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z, v1.w - v0.w); + float4 od2 = make_float4(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z, v2.w - v0.w); + int numVerts = clipTriangleWithFrustum(bary, &ov0.x, &v1.x, &v2.x, &od1.x, &od2.x); + + // Count non-culled subtriangles. + + v0.x = ov0.x + od1.x * bary[0] + od2.x * bary[1]; + v0.y = ov0.y + od1.y * bary[0] + od2.y * bary[1]; + v0.z = ov0.z + od1.z * bary[0] + od2.z * bary[1]; + v0.w = ov0.w + od1.w * bary[0] + od2.w * bary[1]; + v1.x = ov0.x + od1.x * bary[2] + od2.x * bary[3]; + v1.y = ov0.y + od1.y * bary[2] + od2.y * bary[3]; + v1.z = ov0.z + od1.z * bary[2] + od2.z * bary[3]; + v1.w = ov0.w + od1.w * bary[2] + od2.w * bary[3]; + float4 tv1 = v1; + + int numSubtris = 0; + for (int i = 2; i < numVerts; i++) + { + v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1]; + v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1]; + v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1]; + v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1]; + + int2 p0, p1, p2, lo, hi, d1, d2; + float3 rcpW; + S32 area; + + snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi); + if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area)) + numSubtris++; + + v1 = v2; + } + + triSubtris[taskIdx] = numSubtris; + + // Multiple subtriangles => allocate. + + int subtriBase = taskIdx; + if (numSubtris > 1) + { + subtriBase = atomicAdd(&atomics.numSubtris, numSubtris); + triHeader[taskIdx].misc = subtriBase; + if (subtriBase + numSubtris > p.maxSubtris) + numVerts = 0; + } + + // Setup subtriangles. 
+ + v1 = tv1; + for (int i = 2; i < numVerts; i++) + { + v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1]; + v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1]; + v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1]; + v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1]; + + int2 p0, p1, p2, lo, hi, d1, d2; + float3 rcpW; + S32 area; + + snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi); + if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area)) + { + setupTriangle( + p, + &triHeader[subtriBase], &triData[subtriBase], vidx.w, + v0.z, v1.z, v2.z, + p0, p1, p2, rcpW, + d1, d2, area); + + subtriBase++; + } + + v1 = v2; + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Util.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Util.inl new file mode 100644 index 00000000..f8faeba7 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/cudaraster/impl/Util.inl @@ -0,0 +1,452 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#include "PrivateDefs.hpp" + +namespace CR +{ +//------------------------------------------------------------------------ + +template __device__ __inline__ void swap(T& a, T& b) { T t = a; a = b; b = t; } + +__device__ __inline__ U32 getLo (U64 a) { return __double2loint(__longlong_as_double(a)); } +__device__ __inline__ S32 getLo (S64 a) { return __double2loint(__longlong_as_double(a)); } +__device__ __inline__ U32 getHi (U64 a) { return __double2hiint(__longlong_as_double(a)); } +__device__ __inline__ S32 getHi (S64 a) { return __double2hiint(__longlong_as_double(a)); } +__device__ __inline__ U64 combineLoHi (U32 lo, U32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); } +__device__ __inline__ S64 combineLoHi (S32 lo, S32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); } +__device__ __inline__ U32 getLaneMaskLt (void) { U32 r; asm("mov.u32 %0, %lanemask_lt;" : "=r"(r)); return r; } +__device__ __inline__ U32 getLaneMaskLe (void) { U32 r; asm("mov.u32 %0, %lanemask_le;" : "=r"(r)); return r; } +__device__ __inline__ U32 getLaneMaskGt (void) { U32 r; asm("mov.u32 %0, %lanemask_gt;" : "=r"(r)); return r; } +__device__ __inline__ U32 getLaneMaskGe (void) { U32 r; asm("mov.u32 %0, %lanemask_ge;" : "=r"(r)); return r; } +__device__ __inline__ int findLeadingOne (U32 v) { U32 r; asm("bfind.u32 %0, %1;" : "=r"(r) : "r"(v)); return r; } +__device__ __inline__ bool singleLane (void) { return ((::__ballot_sync(~0u, true) & getLaneMaskLt()) == 0); } + +__device__ __inline__ void add_add_carry (U32& rlo, U32 alo, U32 blo, U32& rhi, U32 ahi, U32 bhi) { U64 r = combineLoHi(alo, ahi) + combineLoHi(blo, bhi); rlo = getLo(r); rhi = getHi(r); } +__device__ __inline__ S32 f32_to_s32_sat (F32 a) { S32 v; asm("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; } +__device__ __inline__ U32 f32_to_u32_sat (F32 a) { U32 v; asm("cvt.rni.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; } +__device__ __inline__ U32 f32_to_u32_sat_rmi (F32 a) { U32 v; 
asm("cvt.rmi.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; } +__device__ __inline__ U32 f32_to_u8_sat (F32 a) { U32 v; asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; } +__device__ __inline__ S64 f32_to_s64 (F32 a) { S64 v; asm("cvt.rni.s64.f32 %0, %1;" : "=l"(v) : "f"(a)); return v; } +__device__ __inline__ S32 add_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 add_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 add_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 add_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 sub_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 sub_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 sub_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 sub_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 sub_u16lo_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 sub_u16hi_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 sub_u16lo_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ S32 sub_u16hi_u16hi (U32 a, 
U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ U32 add_b0 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b0, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ U32 add_b1 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ U32 add_b2 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b2, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ U32 add_b3 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b3, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ U32 vmad_b0 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 vmad_b1 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 vmad_b2 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 vmad_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 vmad_b0_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 vmad_b1_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 vmad_b2_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 vmad_b3_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 add_mask8 (U32 a, U32 b) { U32 v; U32 z=0; 
asm("vadd.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; } +__device__ __inline__ U32 sub_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vsub.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; } +__device__ __inline__ S32 max_max (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ S32 min_min (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ S32 max_add (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ S32 min_add (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 add_add (U32 a, U32 b, U32 c) { U32 v; asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 sub_add (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 add_sub (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(c), "r"(b)); return v; } +__device__ __inline__ S32 add_clamp_0_x (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ S32 add_clamp_b0 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ S32 add_clamp_b2 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b2, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ U32 prmt (U32 a, U32 b, U32 c) { U32 v; asm("prmt.b32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; 
} +__device__ __inline__ S32 u32lo_sext (U32 a) { U32 v; asm("cvt.s16.u32 %0, %1;" : "=r"(v) : "r"(a)); return v; } +__device__ __inline__ U32 slct (U32 a, U32 b, S32 c) { U32 v; asm("slct.u32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ S32 slct (S32 a, S32 b, S32 c) { S32 v; asm("slct.s32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; } +__device__ __inline__ F32 slct (F32 a, F32 b, S32 c) { F32 v; asm("slct.f32.s32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "r"(c)); return v; } +__device__ __inline__ U32 isetge (S32 a, S32 b) { U32 v; asm("set.ge.u32.s32 %0, %1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; } +__device__ __inline__ F64 rcp_approx (F64 a) { F64 v; asm("rcp.approx.ftz.f64 %0, %1;" : "=d"(v) : "d"(a)); return v; } +__device__ __inline__ F32 fma_rm (F32 a, F32 b, F32 c) { F32 v; asm("fma.rm.f32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "f"(c)); return v; } +__device__ __inline__ U32 idiv_fast (U32 a, U32 b); + +__device__ __inline__ uint3 setupPleq (float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp); + +__device__ __inline__ void cover8x8_setupLUT (volatile U64* lut); +__device__ __inline__ U64 cover8x8_exact_fast (S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut); // Assumes viewport <= 2^11, subpixels <= 2^4, no guardband. +__device__ __inline__ U64 cover8x8_lookupMask (S64 yinit, U32 yinc, U32 flips, volatile const U64* lut); + +__device__ __inline__ U64 cover8x8_exact_noLUT (S32 ox, S32 oy, S32 dx, S32 dy); // optimized reference implementation, does not require look-up table +__device__ __inline__ U64 cover8x8_conservative_noLUT (S32 ox, S32 oy, S32 dx, S32 dy); +__device__ __inline__ U64 cover8x8_generateMask_noLUT (S32 curr, S32 dx, S32 dy); + +template __device__ __inline__ void sortShared(T* ptr, int numItems); // Assumes that numItems <= threadsInBlock. Must sync before & after the call. 
+ +__device__ __inline__ const CRImageParams& getImageParams(const CRParams& p, int idx) +{ + return (idx < CR_EMBED_IMAGE_PARAMS) ? p.imageParamsFirst[idx] : p.imageParamsExtra[idx - CR_EMBED_IMAGE_PARAMS]; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ int clipPolygonWithPlane(F32* baryOut, const F32* baryIn, int numIn, F32 v0, F32 v1, F32 v2) +{ + int numOut = 0; + if (numIn >= 3) + { + int ai = (numIn - 1) * 2; + F32 av = v0 + v1 * baryIn[ai + 0] + v2 * baryIn[ai + 1]; + for (int bi = 0; bi < numIn * 2; bi += 2) + { + F32 bv = v0 + v1 * baryIn[bi + 0] + v2 * baryIn[bi + 1]; + if (av * bv < 0.0f) + { + F32 bc = av / (av - bv); + F32 ac = 1.0f - bc; + baryOut[numOut + 0] = baryIn[ai + 0] * ac + baryIn[bi + 0] * bc; + baryOut[numOut + 1] = baryIn[ai + 1] * ac + baryIn[bi + 1] * bc; + numOut += 2; + } + if (bv >= 0.0f) + { + baryOut[numOut + 0] = baryIn[bi + 0]; + baryOut[numOut + 1] = baryIn[bi + 1]; + numOut += 2; + } + ai = bi; + av = bv; + } + } + return (numOut >> 1); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ int clipTriangleWithFrustum(F32* bary, const F32* v0, const F32* v1, const F32* v2, const F32* d1, const F32* d2) +{ + int num = 3; + bary[0] = 0.0f, bary[1] = 0.0f; + bary[2] = 1.0f, bary[3] = 0.0f; + bary[4] = 0.0f, bary[5] = 1.0f; + + if ((v0[3] < fabsf(v0[0])) | (v1[3] < fabsf(v1[0])) | (v2[3] < fabsf(v2[0]))) + { + F32 temp[18]; + num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[0], d1[3] + d1[0], d2[3] + d2[0]); + num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[0], d1[3] - d1[0], d2[3] - d2[0]); + } + if ((v0[3] < fabsf(v0[1])) | (v1[3] < fabsf(v1[1])) | (v2[3] < fabsf(v2[1]))) + { + F32 temp[18]; + num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[1], d1[3] + d1[1], d2[3] + d2[1]); + num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[1], d1[3] - d1[1], d2[3] - d2[1]); + } + if ((v0[3] < fabsf(v0[2])) | 
(v1[3] < fabsf(v1[2])) | (v2[3] < fabsf(v2[2]))) + { + F32 temp[18]; + num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[2], d1[3] + d1[2], d2[3] + d2[2]); + num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[2], d1[3] - d1[2], d2[3] - d2[2]); + } + return num; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U32 idiv_fast(U32 a, U32 b) +{ + return f32_to_u32_sat_rmi(((F32)a + 0.5f) / (F32)b); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U32 toABGR(float4 color) +{ + // 11 instructions: 4*FFMA, 4*F2I, 3*PRMT + U32 x = f32_to_u32_sat_rmi(fma_rm(color.x, (1 << 24) * 255.0f, (1 << 24) * 0.5f)); + U32 y = f32_to_u32_sat_rmi(fma_rm(color.y, (1 << 24) * 255.0f, (1 << 24) * 0.5f)); + U32 z = f32_to_u32_sat_rmi(fma_rm(color.z, (1 << 24) * 255.0f, (1 << 24) * 0.5f)); + U32 w = f32_to_u32_sat_rmi(fma_rm(color.w, (1 << 24) * 255.0f, (1 << 24) * 0.5f)); + return prmt(prmt(x, y, 0x0073), prmt(z, w, 0x0073), 0x5410); +} + +//------------------------------------------------------------------------ +// v0 = subpixels relative to the bottom-left sampling point + +__device__ __inline__ uint3 setupPleq(float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp) +{ + F32 mx = fmaxf(fmaxf(values.x, values.y), values.z); + int sh = ::min(::max((__float_as_int(mx) >> 23) - (127 + 22), 0), 8); + S32 t0 = (U32)values.x >> sh; + S32 t1 = ((U32)values.y >> sh) - t0; + S32 t2 = ((U32)values.z >> sh) - t0; + + U32 rcpMant = (__float_as_int(areaRcp) & 0x007FFFFF) | 0x00800000; + int rcpShift = (23 + 127) - (__float_as_int(areaRcp) >> 23); + + uint3 pleq; + S64 xc = ((S64)t1 * d2.y - (S64)t2 * d1.y) * rcpMant; + S64 yc = ((S64)t2 * d1.x - (S64)t1 * d2.x) * rcpMant; + pleq.x = (U32)(xc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2))); + pleq.y = (U32)(yc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2))); + + S32 centerX = (v0.x * 2 + min_min(d1.x, d2.x, 0) + max_max(d1.x, d2.x, 0)) >> 
(CR_SUBPIXEL_LOG2 + 1); + S32 centerY = (v0.y * 2 + min_min(d1.y, d2.y, 0) + max_max(d1.y, d2.y, 0)) >> (CR_SUBPIXEL_LOG2 + 1); + S32 vcx = v0.x - (centerX << CR_SUBPIXEL_LOG2); + S32 vcy = v0.y - (centerY << CR_SUBPIXEL_LOG2); + + pleq.z = t0 << sh; + pleq.z -= (U32)(((xc >> 13) * vcx + (yc >> 13) * vcy) >> (rcpShift - (sh + 13))); + pleq.z -= pleq.x * centerX + pleq.y * centerY; + return pleq; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void cover8x8_setupLUT(volatile U64* lut) +{ + for (S32 lutIdx = threadIdx.x + blockDim.x * threadIdx.y; lutIdx < CR_COVER8X8_LUT_SIZE; lutIdx += blockDim.x * blockDim.y) + { + int half = (lutIdx < (12 << 5)) ? 0 : 1; + int yint = (lutIdx >> 5) - half * 12 - 3; + U32 shape = ((lutIdx >> 2) & 7) << (31 - 2); + S32 slctSwapXY = lutIdx << (31 - 1); + S32 slctNegX = lutIdx << (31 - 0); + S32 slctCompl = slctSwapXY ^ slctNegX; + + U64 mask = 0; + int xlo = half * 4; + int xhi = xlo + 4; + for (int x = xlo; x < xhi; x++) + { + int ylo = slct(0, ::max(yint, 0), slctCompl); + int yhi = slct(::min(yint, 8), 8, slctCompl); + for (int y = ylo; y < yhi; y++) + { + int xx = slct(x, y, slctSwapXY); + int yy = slct(y, x, slctSwapXY); + xx = slct(xx, 7 - xx, slctNegX); + mask |= (U64)1 << (xx + yy * 8); + } + yint += shape >> 31; + shape <<= 1; + } + lut[lutIdx] = mask; + } +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_exact_fast(S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut) // 52 instr +{ + F32 yinitBias = (F32)(1 << (31 - CR_MAXVIEWPORT_LOG2 - CR_SUBPIXEL_LOG2 * 2)); + F32 yinitScale = (F32)(1 << (32 - CR_SUBPIXEL_LOG2)); + F32 yincScale = 65536.0f * 65536.0f; + + S32 slctFlipY = flips << (31 - CR_FLIPBIT_FLIP_Y); + S32 slctFlipX = flips << (31 - CR_FLIPBIT_FLIP_X); + S32 slctSwapXY = flips << (31 - CR_FLIPBIT_SWAP_XY); + + // Evaluate cross product. 
+ + S32 t = ox * dy - oy * dx; + F32 det = (F32)slct(t, t - dy * (7 << CR_SUBPIXEL_LOG2), slctFlipX); + if (flips >= (1 << CR_FLIPBIT_COMPL)) + det = -det; + + // Represent Y as a function of X. + + F32 xrcp = 1.0f / (F32)::abs(slct(dx, dy, slctSwapXY)); + F32 yzero = det * yinitScale * xrcp + yinitBias; + S64 yinit = f32_to_s64(slct(yzero, -yzero, slctFlipY)); + U32 yinc = f32_to_u32_sat((F32)::abs(slct(dy, dx, slctSwapXY)) * xrcp * yincScale); + + // Lookup. + + return cover8x8_lookupMask(yinit, yinc, flips, lut); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_lookupMask(S64 yinit, U32 yinc, U32 flips, volatile const U64* lut) +{ + // First half. + + U32 yfrac = getLo(yinit); + U32 shape = add_clamp_0_x(getHi(yinit) + 4, 0, 11); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + int oct = flips & ((1 << CR_FLIPBIT_FLIP_X) | (1 << CR_FLIPBIT_SWAP_XY)); + U64 mask = *(U64*)((U8*)lut + oct + (shape << 5)); + + // Second half. + + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + shape = add_clamp_0_x(getHi(yinit) + 4, __popc(shape & 15), 11); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + mask |= *(U64*)((U8*)lut + oct + (shape << 5) + (12 << 8)); + return (flips >= (1 << CR_FLIPBIT_COMPL)) ? 
~mask : mask; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_exact_noLUT(S32 ox, S32 oy, S32 dx, S32 dy) +{ + S32 curr = ox * dy - oy * dx; + if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive + return cover8x8_generateMask_noLUT(curr, dx, dy); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_conservative_noLUT(S32 ox, S32 oy, S32 dx, S32 dy) +{ + S32 curr = ox * dy - oy * dx; + if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive + curr += (::abs(dx) + ::abs(dy)) << (CR_SUBPIXEL_LOG2 - 1); + return cover8x8_generateMask_noLUT(curr, dx, dy); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_generateMask_noLUT(S32 curr, S32 dx, S32 dy) +{ + curr += (dx - dy) * (7 << CR_SUBPIXEL_LOG2); + S32 stepX = dy << (CR_SUBPIXEL_LOG2 + 1); + S32 stepYorig = -dx - dy * 7; + S32 stepY = stepYorig << (CR_SUBPIXEL_LOG2 + 1); + + U32 hi = isetge(curr, 0); + U32 frac = curr + curr; + for (int i = 62; i >= 32; i--) + add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, hi, hi, hi); + + U32 lo = 0; + for (int i = 31; i >= 0; i--) + add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, lo, lo, lo); + + lo ^= lo >> 1, hi ^= hi >> 1; + lo ^= lo >> 2, hi ^= hi >> 2; + lo ^= lo >> 4, hi ^= hi >> 4; + lo ^= lo >> 8, hi ^= hi >> 8; + lo ^= lo >> 16, hi ^= hi >> 16; + + if (dy < 0) + { + lo ^= 0x55AA55AA; + hi ^= 0x55AA55AA; + } + if (stepYorig < 0) + { + lo ^= 0xFF00FF00; + hi ^= 0x00FF00FF; + } + if ((hi & 1) != 0) + lo = ~lo; + + return combineLoHi(lo, hi); +} + +//------------------------------------------------------------------------ + +template __device__ __inline__ void sortShared(T* ptr, int numItems) +{ + int thrInBlock = threadIdx.x + threadIdx.y * blockDim.x; + int range = 16; + + // Use transposition sort within each 16-wide subrange. 
+ + int base = thrInBlock * 2; + bool act = (base < numItems - 1); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + bool tryOdd = (base < numItems - 2 && (~base & (range - 2)) != 0); + T mid = ptr[base + 1]; + + for (int iter = 0; iter < range; iter += 2) + { + // Evens. + + T tmp = ptr[base + 0]; + if (tmp > mid) + { + ptr[base + 0] = mid; + mid = tmp; + } + __syncwarp(actMask); + + // Odds. + + if (tryOdd) + { + tmp = ptr[base + 2]; + if (mid > tmp) + { + ptr[base + 2] = mid; + mid = tmp; + } + } + __syncwarp(actMask); + } + ptr[base + 1] = mid; + } + + // Multiple subranges => Merge hierarchically. + + for (; range < numItems; range <<= 1) + { + // Assuming that we would insert the current item into the other + // subrange, use binary search to find the appropriate slot. + + __syncthreads(); + + T item; + int slot; + if (thrInBlock < numItems) + { + item = ptr[thrInBlock]; + slot = (thrInBlock & -range) ^ range; + if (slot < numItems) + { + T tmp = ptr[slot]; + bool inclusive = ((thrInBlock & range) != 0); + if (tmp < item || (inclusive && tmp == item)) + { + for (int step = (range >> 1); step != 0; step >>= 1) + { + int probe = slot + step; + if (probe < numItems) + { + tmp = ptr[probe]; + if (tmp < item || (inclusive && tmp == item)) + slot = probe; + } + } + slot++; + } + } + } + + // Store the item at an appropriate place. + + __syncthreads(); + + if (thrInBlock < numItems) + ptr[slot + (thrInBlock & (range * 2 - 1)) - range] = item; + } +} + +//------------------------------------------------------------------------ +} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/framework.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/framework.h new file mode 100644 index 00000000..12d803ca --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/framework.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +// Framework-specific macros to enable code sharing. + +//------------------------------------------------------------------------ +// Tensorflow. + +#ifdef NVDR_TENSORFLOW +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/platform/default/logging.h" +using namespace tensorflow; +using namespace tensorflow::shape_inference; +#define NVDR_CTX_ARGS OpKernelContext* _nvdr_ctx +#define NVDR_CTX_PARAMS _nvdr_ctx +#define NVDR_CHECK(COND, ERR) OP_REQUIRES(_nvdr_ctx, COND, errors::Internal(ERR)) +#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) OP_CHECK_CUDA_ERROR(_nvdr_ctx, CUDA_CALL) +#define NVDR_CHECK_GL_ERROR(GL_CALL) OP_CHECK_GL_ERROR(_nvdr_ctx, GL_CALL) +#endif + +//------------------------------------------------------------------------ +// PyTorch. 
+
+#ifdef NVDR_TORCH
+#ifndef __CUDACC__
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAUtils.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <pybind11/pybind11.h>
+#endif
+#define NVDR_CTX_ARGS int _nvdr_ctx_dummy
+#define NVDR_CTX_PARAMS 0
+#define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0)
+#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(!err, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0)
+#define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0)
+#endif
+
+//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil.cpp new file mode 100644 index 00000000..2af3e931 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil.cpp @@ -0,0 +1,403 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. +
+//------------------------------------------------------------------------
+// Common.
+//------------------------------------------------------------------------
+
+#include "framework.h"
+#include "glutil.h"
+#include <string.h>
+#include <iomanip>
+
+// Create the function pointers.
+#define GLUTIL_EXT(return_type, name, ...) return_type (GLAPIENTRY* name)(__VA_ARGS__) = 0;
+#include "glutil_extlist.h"
+#undef GLUTIL_EXT
+
+// Track initialization status.
+static volatile bool s_glExtInitialized = false;
+
+// Error strings. 
+const char* getGLErrorString(GLenum err) +{ + switch(err) + { + case GL_NO_ERROR: return "GL_NO_ERROR"; + case GL_INVALID_ENUM: return "GL_INVALID_ENUM"; + case GL_INVALID_VALUE: return "GL_INVALID_VALUE"; + case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION"; + case GL_STACK_OVERFLOW: return "GL_STACK_OVERFLOW"; + case GL_STACK_UNDERFLOW: return "GL_STACK_UNDERFLOW"; + case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY"; + case GL_INVALID_FRAMEBUFFER_OPERATION: return "GL_INVALID_FRAMEBUFFER_OPERATION"; + case GL_TABLE_TOO_LARGE: return "GL_TABLE_TOO_LARGE"; + case GL_CONTEXT_LOST: return "GL_CONTEXT_LOST"; + } + return "Unknown error"; +} + +//------------------------------------------------------------------------ +// Windows. +//------------------------------------------------------------------------ + +#ifdef _WIN32 + +static CRITICAL_SECTION getInitializedCriticalSection(void) +{ + CRITICAL_SECTION cs; + InitializeCriticalSection(&cs); + return cs; +} + +static CRITICAL_SECTION s_getProcAddressMutex = getInitializedCriticalSection(); + +static void safeGetProcAddress(const char* name, PROC* pfn) +{ + PROC result = wglGetProcAddress(name); + if (!result) + { + LeaveCriticalSection(&s_getProcAddressMutex); // Prepare for thread exit. + LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'"; + exit(1); // Should never get here but make sure we exit. + } + *pfn = result; +} + +static void initializeGLExtensions(void) +{ + // Use critical section for thread safety. + EnterCriticalSection(&s_getProcAddressMutex); + + // Only dig function pointers if not done already. + if (!s_glExtInitialized) + { + // Generate code to populate the function pointers. +#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROC*)&name); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + + // Mark as initialized. + s_glExtInitialized = true; + } + + // Done. 
+ LeaveCriticalSection(&s_getProcAddressMutex); + return; +} + +void setGLContext(GLContext& glctx) +{ + if (!glctx.hglrc) + LOG(FATAL) << "setGLContext() called with null gltcx"; + if (!wglMakeCurrent(glctx.hdc, glctx.hglrc)) + LOG(FATAL) << "wglMakeCurrent() failed when setting GL context"; + + if (glctx.extInitialized) + return; + initializeGLExtensions(); + glctx.extInitialized = 1; +} + +void releaseGLContext(void) +{ + if (!wglMakeCurrent(NULL, NULL)) + LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context"; +} + +extern "C" int set_gpu(const char*); // In setgpu.lib +GLContext createGLContext(int cudaDeviceIdx) +{ + if (cudaDeviceIdx >= 0) + { + char pciBusId[256] = ""; + LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx; + if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx)) + { + LOG(INFO) << "PCI bus id query failed"; + } + else + { + int res = set_gpu(pciBusId); + LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? 
"failed, expect crash or major slowdown" : "success"); + } + } + + HINSTANCE hInstance = GetModuleHandle(NULL); + WNDCLASS wc = {}; + wc.style = CS_OWNDC; + wc.lpfnWndProc = DefWindowProc; + wc.hInstance = hInstance; + wc.lpszClassName = "__DummyGLClassCPP"; + int res = RegisterClass(&wc); + + HWND hwnd = CreateWindow( + "__DummyGLClassCPP", // lpClassName + "__DummyGLWindowCPP", // lpWindowName + WS_OVERLAPPEDWINDOW, // dwStyle + CW_USEDEFAULT, // x + CW_USEDEFAULT, // y + 0, 0, // nWidth, nHeight + NULL, NULL, // hWndParent, hMenu + hInstance, // hInstance + NULL // lpParam + ); + + PIXELFORMATDESCRIPTOR pfd = {}; + pfd.dwFlags = PFD_SUPPORT_OPENGL; + pfd.iPixelType = PFD_TYPE_RGBA; + pfd.iLayerType = PFD_MAIN_PLANE; + pfd.cColorBits = 32; + pfd.cDepthBits = 24; + pfd.cStencilBits = 8; + + HDC hdc = GetDC(hwnd); + int pixelformat = ChoosePixelFormat(hdc, &pfd); + SetPixelFormat(hdc, pixelformat, &pfd); + + HGLRC hglrc = wglCreateContext(hdc); + LOG(INFO) << std::hex << std::setfill('0') + << "WGL OpenGL context created (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hdc + << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hglrc << ")"; + + GLContext glctx = {hdc, hglrc, 0}; + return glctx; +} + +void destroyGLContext(GLContext& glctx) +{ + if (!glctx.hglrc) + LOG(FATAL) << "destroyGLContext() called with null gltcx"; + + // If this is the current context, release it. 
+ if (wglGetCurrentContext() == glctx.hglrc) + releaseGLContext(); + + HWND hwnd = WindowFromDC(glctx.hdc); + if (!hwnd) + LOG(FATAL) << "WindowFromDC() failed"; + if (!ReleaseDC(hwnd, glctx.hdc)) + LOG(FATAL) << "ReleaseDC() failed"; + if (!wglDeleteContext(glctx.hglrc)) + LOG(FATAL) << "wglDeleteContext() failed"; + if (!DestroyWindow(hwnd)) + LOG(FATAL) << "DestroyWindow() failed"; + + LOG(INFO) << std::hex << std::setfill('0') + << "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc + << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hglrc << ")"; + + memset(&glctx, 0, sizeof(GLContext)); +} + +#endif // _WIN32 + +//------------------------------------------------------------------------ +// Linux. +//------------------------------------------------------------------------ + +#ifdef __linux__ + +static pthread_mutex_t s_getProcAddressMutex; + +typedef void (*PROCFN)(); + +static void safeGetProcAddress(const char* name, PROCFN* pfn) +{ + PROCFN result = eglGetProcAddress(name); + if (!result) + { + pthread_mutex_unlock(&s_getProcAddressMutex); // Prepare for thread exit. + LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'"; + exit(1); // Should never get here but make sure we exit. + } + *pfn = result; +} + +static void initializeGLExtensions(void) +{ + pthread_mutex_lock(&s_getProcAddressMutex); + + // Only dig function pointers if not done already. + if (!s_glExtInitialized) + { + // Generate code to populate the function pointers. +#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROCFN*)&name); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + + // Mark as initialized. 
+ s_glExtInitialized = true; + } + + pthread_mutex_unlock(&s_getProcAddressMutex); + return; +} + +void setGLContext(GLContext& glctx) +{ + if (!glctx.context) + LOG(FATAL) << "setGLContext() called with null gltcx"; + + if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context)) + LOG(ERROR) << "eglMakeCurrent() failed when setting GL context"; + + if (glctx.extInitialized) + return; + initializeGLExtensions(); + glctx.extInitialized = 1; +} + +void releaseGLContext(void) +{ + EGLDisplay display = eglGetCurrentDisplay(); + if (display == EGL_NO_DISPLAY) + LOG(WARNING) << "releaseGLContext() called with no active display"; + if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT)) + LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context"; +} + +static EGLDisplay getCudaDisplay(int cudaDeviceIdx) +{ + typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*); + typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*); + typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*); + + eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT"); + if (!eglQueryDevicesEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed"; + return 0; + } + + eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT"); + if (!eglQueryDeviceAttribEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed"; + return 0; + } + + eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT"); + if (!eglGetPlatformDisplayEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed"; + return 0; + } + + int num_devices = 0; + eglQueryDevicesEXT(0, 0, &num_devices); + if (!num_devices) + return 0; + + EGLDisplay display = 0; + EGLDeviceEXT* devices = 
(EGLDeviceEXT*)malloc(num_devices * sizeof(void*)); + eglQueryDevicesEXT(num_devices, devices, &num_devices); + for (int i=0; i < num_devices; i++) + { + EGLDeviceEXT device = devices[i]; + intptr_t value = -1; + if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx) + { + display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0); + break; + } + } + + free(devices); + return display; +} + +GLContext createGLContext(int cudaDeviceIdx) +{ + EGLDisplay display = 0; + + if (cudaDeviceIdx >= 0) + { + char pciBusId[256] = ""; + LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx; + display = getCudaDisplay(cudaDeviceIdx); + if (!display) + LOG(INFO) << "Failed, falling back to default display"; + } + + if (!display) + { + display = eglGetDisplay(EGL_DEFAULT_DISPLAY); + if (display == EGL_NO_DISPLAY) + LOG(FATAL) << "eglGetDisplay() failed"; + } + + EGLint major; + EGLint minor; + if (!eglInitialize(display, &major, &minor)) + LOG(FATAL) << "eglInitialize() failed"; + + // Choose configuration. + + const EGLint context_attribs[] = { + EGL_RED_SIZE, 8, + EGL_GREEN_SIZE, 8, + EGL_BLUE_SIZE, 8, + EGL_ALPHA_SIZE, 8, + EGL_DEPTH_SIZE, 24, + EGL_STENCIL_SIZE, 8, + EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT, + EGL_SURFACE_TYPE, EGL_PBUFFER_BIT, + EGL_NONE + }; + + EGLConfig config; + EGLint num_config; + if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config)) + LOG(FATAL) << "eglChooseConfig() failed"; + + // Create GL context. + + if (!eglBindAPI(EGL_OPENGL_API)) + LOG(FATAL) << "eglBindAPI() failed"; + + EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL); + if (context == EGL_NO_CONTEXT) + LOG(FATAL) << "eglCreateContext() failed"; + + // Done. + + LOG(INFO) << "EGL " << (int)minor << "." 
<< (int)major << " OpenGL context created (disp: 0x" + << std::hex << std::setfill('0') + << std::setw(16) << (uintptr_t)display + << ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")"; + + GLContext glctx = {display, context, 0}; + return glctx; +} + +void destroyGLContext(GLContext& glctx) +{ + if (!glctx.context) + LOG(FATAL) << "destroyGLContext() called with null gltcx"; + + // If this is the current context, release it. + if (eglGetCurrentContext() == glctx.context) + releaseGLContext(); + + if (!eglDestroyContext(glctx.display, glctx.context)) + LOG(ERROR) << "eglDestroyContext() failed"; + + LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x" + << std::hex << std::setfill('0') + << std::setw(16) << (uintptr_t)glctx.display + << ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")"; + + memset(&glctx, 0, sizeof(GLContext)); +} + +//------------------------------------------------------------------------ + +#endif // __linux__ + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil.h new file mode 100644 index 00000000..e9a3a7d9 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil.h @@ -0,0 +1,113 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Windows-specific headers and types. 
+//------------------------------------------------------------------------ + +#ifdef _WIN32 +#define NOMINMAX +#include // Required by gl.h in Windows. +#define GLAPIENTRY APIENTRY + +struct GLContext +{ + HDC hdc; + HGLRC hglrc; + int extInitialized; +}; + +#endif // _WIN32 + +//------------------------------------------------------------------------ +// Linux-specific headers and types. +//------------------------------------------------------------------------ + +#ifdef __linux__ +#define EGL_NO_X11 // X11/Xlib.h has "#define Status int" which breaks Tensorflow. Avoid it. +#define MESA_EGL_NO_X11_HEADERS +#include +#include +#define GLAPIENTRY + +struct GLContext +{ + EGLDisplay display; + EGLContext context; + int extInitialized; +}; + +#endif // __linux__ + +//------------------------------------------------------------------------ +// OpenGL, CUDA interop, GL extensions. +//------------------------------------------------------------------------ +#define GL_GLEXT_LEGACY +#include +#include + +// Constants. 
// Fallback definitions for GL enum values that old system headers may lack.
// Each group is guarded by the GL version / extension macro that would
// normally supply it, so nothing is redefined on modern headers.
#ifndef GL_VERSION_1_2
#define GL_CLAMP_TO_EDGE 0x812F
#define GL_TEXTURE_3D 0x806F
#endif
#ifndef GL_VERSION_1_5
#define GL_ARRAY_BUFFER 0x8892
#define GL_DYNAMIC_DRAW 0x88E8
#define GL_ELEMENT_ARRAY_BUFFER 0x8893
#endif
#ifndef GL_VERSION_2_0
#define GL_FRAGMENT_SHADER 0x8B30
#define GL_INFO_LOG_LENGTH 0x8B84
#define GL_LINK_STATUS 0x8B82
#define GL_VERTEX_SHADER 0x8B31
#endif
#ifndef GL_VERSION_3_0
#define GL_MAJOR_VERSION 0x821B
#define GL_MINOR_VERSION 0x821C
#define GL_RGBA32F 0x8814
#define GL_TEXTURE_2D_ARRAY 0x8C1A
#endif
#ifndef GL_VERSION_3_2
#define GL_GEOMETRY_SHADER 0x8DD9
#endif
#ifndef GL_ARB_framebuffer_object
#define GL_COLOR_ATTACHMENT0 0x8CE0
#define GL_COLOR_ATTACHMENT1 0x8CE1
#define GL_DEPTH_STENCIL 0x84F9
#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A
#define GL_DEPTH24_STENCIL8 0x88F0
#define GL_FRAMEBUFFER 0x8D40
#define GL_INVALID_FRAMEBUFFER_OPERATION 0x0506
#define GL_UNSIGNED_INT_24_8 0x84FA
#endif
#ifndef GL_ARB_imaging
#define GL_TABLE_TOO_LARGE 0x8031
#endif
#ifndef GL_KHR_robustness
#define GL_CONTEXT_LOST 0x0507
#endif

// Declare function pointers to OpenGL extension functions.
// glutil_extlist.h is an X-macro list; expanding it here with GLUTIL_EXT
// defined as below produces one extern function-pointer declaration per entry.
#define GLUTIL_EXT(return_type, name, ...) extern return_type (GLAPIENTRY* name)(__VA_ARGS__);
#include "glutil_extlist.h"
#undef GLUTIL_EXT

//------------------------------------------------------------------------
// Common functions.
//------------------------------------------------------------------------

// Make glctx current on the calling thread; resolves GL extension function
// pointers the first time a given context is activated.
void setGLContext (GLContext& glctx);
// Detach the current GL context from the calling thread.
void releaseGLContext (void);
// Create a headless GL context, preferring the display that maps to the
// given CUDA device index (negative index = default display).
GLContext createGLContext (int cudaDeviceIdx);
// Release all resources held by glctx and zero the struct.
void destroyGLContext (GLContext& glctx);
// Translate a GL error enum into a human-readable string.
const char* getGLErrorString (GLenum err);

//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil_extlist.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil_extlist.h
new file mode 100644
index 00000000..afa08f39
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/glutil_extlist.h
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

// X-macro list of the GL entry points loaded at runtime. Each entry is only
// listed when the system GL headers do not already declare the function, as
// indicated by the corresponding version/extension guard macro.
#ifndef GL_VERSION_1_2
GLUTIL_EXT(void, glTexImage3D, GLenum target, GLint level, GLint internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels);
#endif
#ifndef GL_VERSION_1_5
GLUTIL_EXT(void, glBindBuffer, GLenum target, GLuint buffer);
GLUTIL_EXT(void, glBufferData, GLenum target, ptrdiff_t size, const void* data, GLenum usage);
GLUTIL_EXT(void, glGenBuffers, GLsizei n, GLuint* buffers);
#endif
#ifndef GL_VERSION_2_0
GLUTIL_EXT(void, glAttachShader, GLuint program, GLuint shader);
GLUTIL_EXT(void, glCompileShader, GLuint shader);
GLUTIL_EXT(GLuint, glCreateProgram, void);
GLUTIL_EXT(GLuint, glCreateShader, GLenum type);
GLUTIL_EXT(void, glDrawBuffers, GLsizei n, const GLenum* bufs);
GLUTIL_EXT(void, glEnableVertexAttribArray, GLuint index);
GLUTIL_EXT(void, glGetProgramInfoLog, GLuint program, GLsizei bufSize, GLsizei* length, char* infoLog);
GLUTIL_EXT(void, glGetProgramiv, GLuint program, GLenum pname, GLint* param);
GLUTIL_EXT(void, glLinkProgram, GLuint program);
GLUTIL_EXT(void, glShaderSource, GLuint shader, GLsizei count, const char *const* string, const GLint* length);
GLUTIL_EXT(void, glUniform1f, GLint location, GLfloat v0);
GLUTIL_EXT(void, glUniform2f, GLint location, GLfloat v0, GLfloat v1);
GLUTIL_EXT(void, glUseProgram, GLuint program);
GLUTIL_EXT(void, glVertexAttribPointer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void* pointer);
#endif
#ifndef GL_VERSION_3_2
GLUTIL_EXT(void, glFramebufferTexture, GLenum target, GLenum attachment, GLuint texture, GLint level);
#endif
#ifndef GL_ARB_framebuffer_object
GLUTIL_EXT(void, glBindFramebuffer, GLenum target, GLuint framebuffer);
GLUTIL_EXT(void, glGenFramebuffers, GLsizei n, GLuint* framebuffers);
#endif
#ifndef GL_ARB_vertex_array_object
GLUTIL_EXT(void, glBindVertexArray, GLuint array);
GLUTIL_EXT(void, glGenVertexArrays, GLsizei n, GLuint* arrays);
#endif
#ifndef GL_ARB_multi_draw_indirect
GLUTIL_EXT(void, glMultiDrawElementsIndirect, GLenum mode, GLenum type, const void *indirect, GLsizei primcount, GLsizei stride);
#endif

//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/interpolate.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/interpolate.cu
new file mode 100644
index 00000000..3bd2a7a7
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/interpolate.cu
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include "common.h"
#include "interpolate.h"

//------------------------------------------------------------------------
// Forward kernel: interpolates vertex attributes across rasterized pixels
// using the barycentrics produced by the rasterizer, optionally emitting
// attribute pixel differentials (ENABLE_DA).
//
// NOTE(review): the template parameter lists in this file were lost during
// text extraction ("template" with empty angle brackets); they are restored
// here as <bool ENABLE_DA> to match the ENABLE_DA uses in the bodies and the
// plain/Da specialization pairs at the bottom of each kernel.

template <bool ENABLE_DA>
static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width || py >= p.height || pz >= p.depth)
        return;

    // Pixel index.
    int pidx = px + p.width * (py + p.height * pz);

    // Output ptrs.
    float* out = p.out + pidx * p.numAttr;
    float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0;

    // Fetch rasterizer output.
    float4 r = ((float4*)p.rast)[pidx];
    int triIdx = float_to_triidx(r.w) - 1;
    bool triValid = (triIdx >= 0 && triIdx < p.numTriangles);

    // If no geometry in entire warp, zero the output and exit.
    // Otherwise force barys to zero and output with live threads.
    if (__all_sync(0xffffffffu, !triValid))
    {
        for (int i=0; i < p.numAttr; i++)
            out[i] = 0.f;
        if (ENABLE_DA)
            for (int i=0; i < p.numDiffAttr; i++)
                outDA[i] = make_float2(0.f, 0.f);
        return;
    }

    // Fetch vertex indices.
    int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0;
    int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0;
    int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0;

    // Bail out if corrupt indices.
    if (vi0 < 0 || vi0 >= p.numVertices ||
        vi1 < 0 || vi1 >= p.numVertices ||
        vi2 < 0 || vi2 >= p.numVertices)
        return;

    // In instance mode, adjust vertex indices by minibatch index unless broadcasting.
    if (p.instance_mode && !p.attrBC)
    {
        vi0 += pz * p.numVertices;
        vi1 += pz * p.numVertices;
        vi2 += pz * p.numVertices;
    }

    // Pointers to attributes.
    const float* a0 = p.attr + vi0 * p.numAttr;
    const float* a1 = p.attr + vi1 * p.numAttr;
    const float* a2 = p.attr + vi2 * p.numAttr;

    // Barys. If no triangle, force all to zero -> output is zero.
    float b0 = triValid ? r.x : 0.f;
    float b1 = triValid ? r.y : 0.f;
    float b2 = triValid ? (1.f - r.x - r.y) : 0.f;

    // Interpolate and write attributes.
    for (int i=0; i < p.numAttr; i++)
        out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i];

    // No diff attrs? Exit.
    if (!ENABLE_DA)
        return;

    // Read bary pixel differentials if we have a triangle.
    float4 db = make_float4(0.f, 0.f, 0.f, 0.f);
    if (triValid)
        db = ((float4*)p.rastDB)[pidx];

    // Unpack a bit.
    float dudx = db.x;
    float dudy = db.y;
    float dvdx = db.z;
    float dvdy = db.w;

    // Calculate the pixel differentials of chosen attributes.
    for (int i=0; i < p.numDiffAttr; i++)
    {
        // Input attribute index.
        int j = p.diff_attrs_all ? i : p.diffAttrs[i];
        if (j < 0)
            j += p.numAttr; // Python-style negative indices.

        // Zero output if invalid index.
        float dsdx = 0.f;
        float dsdy = 0.f;
        if (j >= 0 && j < p.numAttr)
        {
            // Chain rule: attribute differential w.r.t. screen position via
            // the barycentric differentials (du/dx etc.) from the rasterizer.
            float s0 = a0[j];
            float s1 = a1[j];
            float s2 = a2[j];
            float dsdu = s0 - s2;
            float dsdv = s1 - s2;
            dsdx = dudx*dsdu + dvdx*dsdv;
            dsdy = dudy*dsdu + dvdy*dsdv;
        }

        // Write.
        outDA[i] = make_float2(dsdx, dsdy);
    }
}

// Template specializations (restored <false>/<true> arguments lost in
// extraction; the Da variant enables attribute pixel differentials).
__global__ void InterpolateFwdKernel  (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<false>(p); }
__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate<true>(p); }

//------------------------------------------------------------------------
// Gradient kernel: backpropagates gradients of the interpolated attributes
// (and optionally of their pixel differentials) into attribute gradients
// and rasterizer-output gradients.

template <bool ENABLE_DA>
static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p)
{
    // Temporary space for coalesced atomics.
    CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT);

    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width || py >= p.height || pz >= p.depth)
        return;

    // Pixel index.
    int pidx = px + p.width * (py + p.height * pz);

    // Fetch triangle ID. If none, output zero bary/db gradients and exit.
    float4 r = ((float4*)p.rast)[pidx];
    int triIdx = float_to_triidx(r.w) - 1;
    if (triIdx < 0 || triIdx >= p.numTriangles)
    {
        ((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
        if (ENABLE_DA)
            ((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f);
        return;
    }

    // Fetch vertex indices.
    int vi0 = p.tri[triIdx * 3 + 0];
    int vi1 = p.tri[triIdx * 3 + 1];
    int vi2 = p.tri[triIdx * 3 + 2];

    // Bail out if corrupt indices.
    if (vi0 < 0 || vi0 >= p.numVertices ||
        vi1 < 0 || vi1 >= p.numVertices ||
        vi2 < 0 || vi2 >= p.numVertices)
        return;

    // In instance mode, adjust vertex indices by minibatch index unless broadcasting.
    if (p.instance_mode && !p.attrBC)
    {
        vi0 += pz * p.numVertices;
        vi1 += pz * p.numVertices;
        vi2 += pz * p.numVertices;
    }

    // Initialize coalesced atomics.
    CA_SET_GROUP(triIdx);

    // Pointers to inputs.
    const float* a0 = p.attr + vi0 * p.numAttr;
    const float* a1 = p.attr + vi1 * p.numAttr;
    const float* a2 = p.attr + vi2 * p.numAttr;
    const float* pdy = p.dy + pidx * p.numAttr;

    // Pointers to outputs.
    float* ga0 = p.gradAttr + vi0 * p.numAttr;
    float* ga1 = p.gradAttr + vi1 * p.numAttr;
    float* ga2 = p.gradAttr + vi2 * p.numAttr;

    // Barys and bary gradient accumulators.
    float b0 = r.x;
    float b1 = r.y;
    float b2 = 1.f - r.x - r.y;
    float gb0 = 0.f;
    float gb1 = 0.f;

    // Loop over attributes and accumulate attribute gradients.
    for (int i=0; i < p.numAttr; i++)
    {
        float y = pdy[i];
        float s0 = a0[i];
        float s1 = a1[i];
        float s2 = a2[i];
        gb0 += y * (s0 - s2);
        gb1 += y * (s1 - s2);
        caAtomicAdd(ga0 + i, b0 * y);
        caAtomicAdd(ga1 + i, b1 * y);
        caAtomicAdd(ga2 + i, b2 * y);
    }

    // Write the bary gradients.
    ((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f);

    // If pixel differentials disabled, we're done.
    if (!ENABLE_DA)
        return;

    // Calculate gradients based on attribute pixel differentials.
    const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr;
    float gdudx = 0.f;
    float gdudy = 0.f;
    float gdvdx = 0.f;
    float gdvdy = 0.f;

    // Read bary pixel differentials.
    float4 db = ((float4*)p.rastDB)[pidx];
    float dudx = db.x;
    float dudy = db.y;
    float dvdx = db.z;
    float dvdy = db.w;

    for (int i=0; i < p.numDiffAttr; i++)
    {
        // Input attribute index.
        int j = p.diff_attrs_all ? i : p.diffAttrs[i];
        if (j < 0)
            j += p.numAttr; // Python-style negative indices.

        // Check that index is valid.
        if (j >= 0 && j < p.numAttr)
        {
            float2 dsdxy = dda[i];
            float dsdx = dsdxy.x;
            float dsdy = dsdxy.y;

            float s0 = a0[j];
            float s1 = a1[j];
            float s2 = a2[j];

            // Gradients of db.
            float dsdu = s0 - s2;
            float dsdv = s1 - s2;
            gdudx += dsdu * dsdx;
            gdudy += dsdu * dsdy;
            gdvdx += dsdv * dsdx;
            gdvdy += dsdv * dsdy;

            // Gradients of attributes.
            float du = dsdx*dudx + dsdy*dudy;
            float dv = dsdx*dvdx + dsdy*dvdy;
            caAtomicAdd(ga0 + j, du);
            caAtomicAdd(ga1 + j, dv);
            caAtomicAdd(ga2 + j, -du - dv);
        }
    }

    // Write.
    ((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy);
}

// Template specializations (restored <false>/<true> arguments lost in extraction).
__global__ void InterpolateGradKernel  (const InterpolateKernelParams p) { InterpolateGradKernelTemplate<false>(p); }
__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate<true>(p); }

//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/interpolate.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/interpolate.h
new file mode 100644
index 00000000..d35d8388
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/interpolate.h
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#pragma once

//------------------------------------------------------------------------
// Constants and helpers.
// Thread-block dimensions for the interpolation kernels, and the maximum
// number of attributes that can be selected for pixel differentials.
#define IP_FWD_MAX_KERNEL_BLOCK_WIDTH 8
#define IP_FWD_MAX_KERNEL_BLOCK_HEIGHT 8
#define IP_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8
#define IP_MAX_DIFF_ATTRS 32

//------------------------------------------------------------------------
// CUDA kernel params.

// Parameter block shared by the interpolation forward and gradient kernels.
// All pointers are device buffers; gradient fields are only written by the
// gradient kernel, output fields only by the forward kernel.
struct InterpolateKernelParams
{
    const int*   tri;                       // Incoming triangle buffer.
    const float* attr;                      // Incoming attribute buffer.
    const float* rast;                      // Incoming rasterizer output buffer.
    const float* rastDB;                    // Incoming rasterizer output buffer for bary derivatives.
    const float* dy;                        // Incoming attribute gradients.
    const float* dda;                       // Incoming attr diff gradients.
    float*       out;                       // Outgoing interpolated attributes.
    float*       outDA;                     // Outgoing attribute pixel differentials (dsdx, dsdy pairs).
    float*       gradAttr;                  // Outgoing attribute gradients.
    float*       gradRaster;                // Outgoing rasterizer gradients.
    float*       gradRasterDB;              // Outgoing rasterizer bary diff gradients.
    int          numTriangles;              // Number of triangles.
    int          numVertices;               // Number of vertices.
    int          numAttr;                   // Number of total vertex attributes.
    int          numDiffAttr;               // Number of attributes to differentiate.
    int          width;                     // Image width.
    int          height;                    // Image height.
    int          depth;                     // Minibatch size.
    int          attrBC;                    // 0=normal, 1=attr is broadcast.
    int          instance_mode;             // 0=normal, 1=instance mode.
    int          diff_attrs_all;            // 0=normal, 1=produce pixel differentials for all attributes.
    int          diffAttrs[IP_MAX_DIFF_ATTRS]; // List of attributes to differentiate (used when diff_attrs_all==0).
};

//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cu
new file mode 100644
index 00000000..455aca3e
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize.cu
// Copyright (c) 2020, NVIDIA CORPORATION.
// All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include "common.h"
#include "rasterize.h"

//------------------------------------------------------------------------
// Cuda forward rasterizer pixel shader kernel.
// For each output pixel, looks up the covering triangle index produced by
// the rasterizer, evaluates perspective-correct barycentrics and depth at
// the pixel center, and emits (b0, b1, z/w, triIdx+1) plus the barycentric
// screen-space differentials (du/dx, du/dy, dv/dx, dv/dy).

__global__ void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width_out || py >= p.height_out || pz >= p.depth)
        return;

    // Pixel indices. Input (coverage) and output buffers may have different sizes.
    int pidx_in = px + p.width_in * (py + p.height_in * pz);
    int pidx_out = px + p.width_out * (py + p.height_out * pz);

    // Fetch triangle idx. Coverage buffer stores triIdx+1; 0 means background.
    int triIdx = p.in_idx[pidx_in] - 1;
    if (triIdx < 0 || triIdx >= p.numTriangles)
    {
        // No or corrupt triangle.
        ((float4*)p.out)[pidx_out] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out.
        ((float4*)p.out_db)[pidx_out] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out_db.
        return;
    }

    // Fetch vertex indices.
    int vi0 = p.tri[triIdx * 3 + 0];
    int vi1 = p.tri[triIdx * 3 + 1];
    int vi2 = p.tri[triIdx * 3 + 2];

    // Bail out if vertex indices are corrupt.
    if (vi0 < 0 || vi0 >= p.numVertices ||
        vi1 < 0 || vi1 >= p.numVertices ||
        vi2 < 0 || vi2 >= p.numVertices)
        return;

    // In instance mode, adjust vertex indices by minibatch index.
    if (p.instance_mode)
    {
        vi0 += pz * p.numVertices;
        vi1 += pz * p.numVertices;
        vi2 += pz * p.numVertices;
    }

    // Fetch vertex positions (clip-space xyzw).
    float4 p0 = ((float4*)p.pos)[vi0];
    float4 p1 = ((float4*)p.pos)[vi1];
    float4 p2 = ((float4*)p.pos)[vi2];

    // Evaluate edge functions at the pixel's clip-space position (fx, fy).
    float fx = p.xs * (float)px + p.xo;
    float fy = p.ys * (float)py + p.yo;
    float p0x = p0.x - fx * p0.w;
    float p0y = p0.y - fy * p0.w;
    float p1x = p1.x - fx * p1.w;
    float p1y = p1.y - fy * p1.w;
    float p2x = p2.x - fx * p2.w;
    float p2y = p2.y - fy * p2.w;
    float a0 = p1x*p2y - p1y*p2x;
    float a1 = p2x*p0y - p2y*p0x;
    float a2 = p0x*p1y - p0y*p1x;

    // Perspective correct, normalized barycentrics.
    float iw = 1.f / (a0 + a1 + a2);
    float b0 = a0 * iw;
    float b1 = a1 * iw;

    // Compute z/w for depth buffer.
    float z = p0.z * a0 + p1.z * a1 + p2.z * a2;
    float w = p0.w * a0 + p1.w * a1 + p2.w * a2;
    float zw = z / w;

    // Clamps to avoid NaNs.
    b0 = __saturatef(b0); // Clamp to [+0.0, 1.0].
    b1 = __saturatef(b1); // Clamp to [+0.0, 1.0].
    zw = fmaxf(fminf(zw, 1.f), -1.f);

    // Emit output.
    ((float4*)p.out)[pidx_out] = make_float4(b0, b1, zw, triidx_to_float(triIdx + 1));

    // Calculate bary pixel differentials (analytic derivatives of the
    // normalized edge functions w.r.t. screen x and y).
    float dfxdx = p.xs * iw;
    float dfydy = p.ys * iw;
    float da0dx = p2.y*p1.w - p1.y*p2.w;
    float da0dy = p1.x*p2.w - p2.x*p1.w;
    float da1dx = p0.y*p2.w - p2.y*p0.w;
    float da1dy = p2.x*p0.w - p0.x*p2.w;
    float da2dx = p1.y*p0.w - p0.y*p1.w;
    float da2dy = p0.x*p1.w - p1.x*p0.w;
    float datdx = da0dx + da1dx + da2dx;
    float datdy = da0dy + da1dy + da2dy;
    float dudx = dfxdx * (b0 * datdx - da0dx);
    float dudy = dfydy * (b0 * datdy - da0dy);
    float dvdx = dfxdx * (b1 * datdx - da1dx);
    float dvdy = dfydy * (b1 * datdy - da1dy);

    // Emit bary pixel differentials.
    ((float4*)p.out_db)[pidx_out] = make_float4(dudx, dudy, dvdx, dvdy);
}

//------------------------------------------------------------------------
// Gradient Cuda kernel.
// Backpropagates gradients of the rasterizer output (and optionally of the
// bary pixel differentials, ENABLE_DB) into clip-space vertex position
// gradients, accumulated with coalesced atomics.
//
// NOTE(review): the template parameter list was lost in extraction
// ("template" with empty angle brackets); restored as <bool ENABLE_DB> to
// match the ENABLE_DB uses in the body and the plain/Db specializations below.
template <bool ENABLE_DB>
static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p)
{
    // Temporary space for coalesced atomics.
    CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT);

    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width || py >= p.height || pz >= p.depth)
        return;

    // Pixel index.
    int pidx = px + p.width * (py + p.height * pz);

    // Read triangle idx and dy.
    float2 dy = ((float2*)p.dy)[pidx * 2];
    float4 ddb = ENABLE_DB ? ((float4*)p.ddb)[pidx] : make_float4(0.f, 0.f, 0.f, 0.f);
    int triIdx = float_to_triidx(((float*)p.out)[pidx * 4 + 3]) - 1;

    // Exit if nothing to do.
    if (triIdx < 0 || triIdx >= p.numTriangles)
        return; // No or corrupt triangle.
    int grad_all_dy = __float_as_int(dy.x) | __float_as_int(dy.y); // Bitwise OR of all incoming gradients.
    int grad_all_ddb = 0;
    if (ENABLE_DB)
        grad_all_ddb = __float_as_int(ddb.x) | __float_as_int(ddb.y) | __float_as_int(ddb.z) | __float_as_int(ddb.w);
    if (((grad_all_dy | grad_all_ddb) << 1) == 0)
        return; // All incoming gradients are +0/-0.

    // Fetch vertex indices.
    int vi0 = p.tri[triIdx * 3 + 0];
    int vi1 = p.tri[triIdx * 3 + 1];
    int vi2 = p.tri[triIdx * 3 + 2];

    // Bail out if vertex indices are corrupt.
    if (vi0 < 0 || vi0 >= p.numVertices ||
        vi1 < 0 || vi1 >= p.numVertices ||
        vi2 < 0 || vi2 >= p.numVertices)
        return;

    // In instance mode, adjust vertex indices by minibatch index.
    if (p.instance_mode)
    {
        vi0 += pz * p.numVertices;
        vi1 += pz * p.numVertices;
        vi2 += pz * p.numVertices;
    }

    // Initialize coalesced atomics.
    CA_SET_GROUP(triIdx);

    // Fetch vertex positions.
    float4 p0 = ((float4*)p.pos)[vi0];
    float4 p1 = ((float4*)p.pos)[vi1];
    float4 p2 = ((float4*)p.pos)[vi2];

    // Evaluate edge functions.
    float fx = p.xs * (float)px + p.xo;
    float fy = p.ys * (float)py + p.yo;
    float p0x = p0.x - fx * p0.w;
    float p0y = p0.y - fy * p0.w;
    float p1x = p1.x - fx * p1.w;
    float p1y = p1.y - fy * p1.w;
    float p2x = p2.x - fx * p2.w;
    float p2y = p2.y - fy * p2.w;
    float a0 = p1x*p2y - p1y*p2x;
    float a1 = p2x*p0y - p2y*p0x;
    float a2 = p0x*p1y - p0y*p1x;

    // Compute inverse area with epsilon.
    float at = a0 + a1 + a2;
    float ep = copysignf(1e-6f, at); // ~1 pixel in 1k x 1k image.
    float iw = 1.f / (at + ep);

    // Perspective correct, normalized barycentrics.
    float b0 = a0 * iw;
    float b1 = a1 * iw;

    // Position gradients.
    float gb0 = dy.x * iw;
    float gb1 = dy.y * iw;
    float gbb = gb0 * b0 + gb1 * b1;
    float gp0x = gbb * (p2y - p1y) - gb1 * p2y;
    float gp1x = gbb * (p0y - p2y) + gb0 * p2y;
    float gp2x = gbb * (p1y - p0y) - gb0 * p1y + gb1 * p0y;
    float gp0y = gbb * (p1x - p2x) + gb1 * p2x;
    float gp1y = gbb * (p2x - p0x) - gb0 * p2x;
    float gp2y = gbb * (p0x - p1x) + gb0 * p1x - gb1 * p0x;
    float gp0w = -fx * gp0x - fy * gp0y;
    float gp1w = -fx * gp1x - fy * gp1y;
    float gp2w = -fx * gp2x - fy * gp2y;

    // Bary differential gradients.
    if (ENABLE_DB && ((grad_all_ddb) << 1) != 0)
    {
        float dfxdX = p.xs * iw;
        float dfydY = p.ys * iw;
        ddb.x *= dfxdX;
        ddb.y *= dfydY;
        ddb.z *= dfxdX;
        ddb.w *= dfydY;

        float da0dX = p1.y * p2.w - p2.y * p1.w;
        float da1dX = p2.y * p0.w - p0.y * p2.w;
        float da2dX = p0.y * p1.w - p1.y * p0.w;
        float da0dY = p2.x * p1.w - p1.x * p2.w;
        float da1dY = p0.x * p2.w - p2.x * p0.w;
        float da2dY = p1.x * p0.w - p0.x * p1.w;
        float datdX = da0dX + da1dX + da2dX;
        float datdY = da0dY + da1dY + da2dY;

        float x01 = p0.x - p1.x;
        float x12 = p1.x - p2.x;
        float x20 = p2.x - p0.x;
        float y01 = p0.y - p1.y;
        float y12 = p1.y - p2.y;
        float y20 = p2.y - p0.y;
        float w01 = p0.w - p1.w;
        float w12 = p1.w - p2.w;
        float w20 = p2.w - p0.w;

        float a0p1 = fy * p2.x - fx * p2.y;
        float a0p2 = fx * p1.y - fy * p1.x;
        float a1p0 = fx * p2.y - fy * p2.x;
        float a1p2 = fy * p0.x - fx * p0.y;

        float wdudX = 2.f * b0 * datdX - da0dX;
        float wdudY = 2.f * b0 * datdY - da0dY;
        float wdvdX = 2.f * b1 * datdX - da1dX;
        float wdvdY = 2.f * b1 * datdY - da1dY;

        float c0 = iw * (ddb.x * wdudX + ddb.y * wdudY + ddb.z * wdvdX + ddb.w * wdvdY);
        float cx = c0 * fx - ddb.x * b0 - ddb.z * b1;
        float cy = c0 * fy - ddb.y * b0 - ddb.w * b1;
        float cxy = iw * (ddb.x * datdX + ddb.y * datdY);
        float czw = iw * (ddb.z * datdX + ddb.w * datdY);

        gp0x += c0 * y12 - cy * w12 + czw * p2y + ddb.w * p2.w;
        gp1x += c0 * y20 - cy * w20 - cxy * p2y - ddb.y * p2.w;
        gp2x += c0 * y01 - cy * w01 + cxy * p1y - czw * p0y + ddb.y * p1.w - ddb.w * p0.w;
        gp0y += cx * w12 - c0 * x12 - czw * p2x - ddb.z * p2.w;
        gp1y += cx * w20 - c0 * x20 + cxy * p2x + ddb.x * p2.w;
        gp2y += cx * w01 - c0 * x01 - cxy * p1x + czw * p0x - ddb.x * p1.w + ddb.z * p0.w;
        gp0w += cy * x12 - cx * y12 - czw * a1p0 + ddb.z * p2.y - ddb.w * p2.x;
        gp1w += cy * x20 - cx * y20 - cxy * a0p1 - ddb.x * p2.y + ddb.y * p2.x;
        gp2w += cy * x01 - cx * y01 - cxy * a0p2 - czw * a1p2 + ddb.x * p1.y - ddb.y * p1.x - ddb.z * p0.y + ddb.w * p0.x;
    }

    // Accumulate using coalesced atomics.
    caAtomicAdd3_xyw(p.grad + 4 * vi0, gp0x, gp0y, gp0w);
    caAtomicAdd3_xyw(p.grad + 4 * vi1, gp1x, gp1y, gp1w);
    caAtomicAdd3_xyw(p.grad + 4 * vi2, gp2x, gp2y, gp2w);
}

// Template specializations (restored <false>/<true> arguments lost in
// extraction; the Db variant also backpropagates bary differential gradients).
__global__ void RasterizeGradKernel  (const RasterizeGradParams p) { RasterizeGradKernelTemplate<false>(p); }
__global__ void RasterizeGradKernelDb(const RasterizeGradParams p) { RasterizeGradKernelTemplate<true>(p); }

//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize.h
new file mode 100644
index 00000000..cb3104fa
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize.h
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#pragma once

//------------------------------------------------------------------------
// Constants and helpers.

#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH 8
#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8
#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8

//------------------------------------------------------------------------
// CUDA forward rasterizer shader kernel params.

struct RasterizeCudaFwdShaderParams
{
    const float* pos;   // Vertex positions.
    const int* tri;     // Triangle indices.
    const int* in_idx;  // Triangle idx buffer from rasterizer.
    float* out;         // Main output buffer.
    float* out_db;      // Bary pixel gradient output buffer.
    int numTriangles;   // Number of triangles.
    int numVertices;    // Number of vertices.
    int width_in;       // Input image width.
    int height_in;      // Input image height.
    int width_out;      // Output image width.
    int height_out;     // Output image height.
    int depth;          // Size of minibatch.
    int instance_mode;  // 1 if in instance rendering mode.
    float xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
};

//------------------------------------------------------------------------
// Gradient CUDA kernel params.

struct RasterizeGradParams
{
    const float* pos;   // Incoming position buffer.
    const int* tri;     // Incoming triangle buffer.
    const float* out;   // Rasterizer output buffer.
    const float* dy;    // Incoming gradients of rasterizer output buffer.
    const float* ddb;   // Incoming gradients of bary diff output buffer.
    float* grad;        // Outgoing position gradients.
    int numTriangles;   // Number of triangles.
    int numVertices;    // Number of vertices.
    int width;          // Image width.
    int height;         // Image height.
    int depth;          // Size of minibatch.
    int instance_mode;  // 1 if in instance rendering mode.
    float xs, xo, ys, yo; // Pixel position to clip-space x, y transform.
};

//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize_gl.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize_gl.cpp
new file mode 100644
index 00000000..ac71ccd8
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize_gl.cpp
// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto.
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "rasterize_gl.h" +#include "glutil.h" +#include +#define STRINGIFY_SHADER_SOURCE(x) #x + +//------------------------------------------------------------------------ +// Helpers. + +#define ROUND_UP(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +static int ROUND_UP_BITS(uint32_t x, uint32_t y) +{ + // Round x up so that it has at most y bits of mantissa. + if (x < (1u << y)) + return x; + uint32_t m = 0; + while (x & ~m) + m = (m << 1) | 1u; + m >>= y; + if (!(x & m)) + return x; + return (x | m) + 1u; +} + +//------------------------------------------------------------------------ +// Draw command struct used by rasterizer. + +struct GLDrawCmd +{ + uint32_t count; + uint32_t instanceCount; + uint32_t firstIndex; + uint32_t baseVertex; + uint32_t baseInstance; +}; + +//------------------------------------------------------------------------ +// GL helpers. + +static void compileGLShader(NVDR_CTX_ARGS, const RasterizeGLState& s, GLuint* pShader, GLenum shaderType, const char* src_buf) +{ + std::string src(src_buf); + + // Set preprocessor directives. + int n = src.find('\n') + 1; // After first line containing #version directive. 
+ if (s.enableZModify) + src.insert(n, "#define IF_ZMODIFY(x) x\n"); + else + src.insert(n, "#define IF_ZMODIFY(x)\n"); + + const char *cstr = src.c_str(); + *pShader = 0; + NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType)); + NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &cstr, 0)); + NVDR_CHECK_GL_ERROR(glCompileShader(*pShader)); +} + +static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader) +{ + *pProgram = 0; + + GLuint glProgram = 0; + NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram()); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader)); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader)); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader)); + NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram)); + + GLint linkStatus = 0; + NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus)); + if (!linkStatus) + { + GLint infoLen = 0; + NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen)); + if (infoLen) + { + const char* hdr = "glLinkProgram() failed:\n"; + std::vector info(strlen(hdr) + infoLen); + strcpy(&info[0], hdr); + NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)])); + NVDR_CHECK(0, &info[0]); + } + NVDR_CHECK(0, "glLinkProgram() failed"); + } + + *pProgram = glProgram; +} + +//------------------------------------------------------------------------ +// Shared C++ functions. + +void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx) +{ + // Create GL context and set it current. + s.glctx = createGLContext(cudaDeviceIdx); + setGLContext(s.glctx); + + // Version check. + GLint vMajor = 0; + GLint vMinor = 0; + glGetIntegerv(GL_MAJOR_VERSION, &vMajor); + glGetIntegerv(GL_MINOR_VERSION, &vMinor); + glGetError(); // Clear possible GL_INVALID_ENUM error in version query. + LOG(INFO) << "OpenGL version reported as " << vMajor << "." 
<< vMinor; + NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required"); + + // Enable depth modification workaround on A100 and later. + int capMajor = 0; + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&capMajor, cudaDevAttrComputeCapabilityMajor, cudaDeviceIdx)); + s.enableZModify = (capMajor >= 8); + + // Number of output buffers. + int num_outputs = s.enableDB ? 2 : 1; + + // Set up vertex shader. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glVertexShader, GL_VERTEX_SHADER, + "#version 330\n" + "#extension GL_ARB_shader_draw_parameters : enable\n" + STRINGIFY_SHADER_SOURCE( + layout(location = 0) in vec4 in_pos; + out int v_layer; + out int v_offset; + void main() + { + int layer = gl_DrawIDARB; + gl_Position = in_pos; + v_layer = layer; + v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here. + } + ) + ); + + // Geometry and fragment shaders depend on if bary differential output is enabled or not. + if (s.enableDB) + { + // Set up geometry shader. Calculation of per-pixel bary differentials is based on: + // u = (u/w) / (1/w) + // --> du/dX = d((u/w) / (1/w))/dX + // --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w + // and we know both d(u/w)/dX and d(1/w)/dX are constant over triangle. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + layout(triangles) in; + layout(triangle_strip, max_vertices=3) out; + layout(location = 0) uniform vec2 vp_scale; + in int v_layer[]; + in int v_offset[]; + out vec4 var_uvzw; + out vec4 var_db; + void main() + { + // Plane equations for bary differentials. 
+ float w0 = gl_in[0].gl_Position.w; + float w1 = gl_in[1].gl_Position.w; + float w2 = gl_in[2].gl_Position.w; + vec2 p0 = gl_in[0].gl_Position.xy; + vec2 p1 = gl_in[1].gl_Position.xy; + vec2 p2 = gl_in[2].gl_Position.xy; + vec2 e0 = p0*w2 - p2*w0; + vec2 e1 = p1*w2 - p2*w1; + float a = e0.x*e1.y - e0.y*e1.x; + + // Clamp area to an epsilon to avoid arbitrarily high bary differentials. + float eps = 1e-6f; // ~1 pixel in 1k x 1k image. + float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign. + float ia = 1.f / ca; // Inverse area. + + vec2 ascl = ia * vp_scale; + float dudx = e1.y * ascl.x; + float dudy = -e1.x * ascl.y; + float dvdx = -e0.y * ascl.x; + float dvdy = e0.x * ascl.y; + + float duwdx = w2 * dudx; + float dvwdx = w2 * dvdx; + float duvdx = w0 * dudx + w1 * dvdx; + float duwdy = w2 * dudy; + float dvwdy = w2 * dvdy; + float duvdy = w0 * dudy + w1 * dvdy; + + vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy); + vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy); + vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy); + + int layer_id = v_layer[0]; + int prim_id = gl_PrimitiveIDIn + v_offset[0]; + + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex(); + } + ) + ); + + // Set up fragment 
shader. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in vec4 var_db; + layout(location = 0) out vec4 out_raster; + layout(location = 1) out vec4 out_db; + IF_ZMODIFY( + layout(location = 1) uniform float in_dummy; + ) + void main() + { + int id_int = gl_PrimitiveID + 1; + float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int); + + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float); + out_db = var_db * var_uvzw.w; + IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;) + } + ) + ); + + // Set up fragment shader for depth peeling. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in vec4 var_db; + layout(binding = 0) uniform sampler2DArray out_prev; + layout(location = 0) out vec4 out_raster; + layout(location = 1) out vec4 out_db; + IF_ZMODIFY( + layout(location = 1) uniform float in_dummy; + ) + void main() + { + int id_int = gl_PrimitiveID + 1; + float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int); + + vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0); + float depth_new = var_uvzw.z / var_uvzw.w; + if (prev.w == 0 || depth_new <= prev.z) + discard; + out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, id_float); + out_db = var_db * var_uvzw.w; + IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;) + } + ) + ); + } + else + { + // Geometry shader without bary differential output. 
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER, + "#version 330\n" + STRINGIFY_SHADER_SOURCE( + layout(triangles) in; + layout(triangle_strip, max_vertices=3) out; + in int v_layer[]; + in int v_offset[]; + out vec4 var_uvzw; + void main() + { + int layer_id = v_layer[0]; + int prim_id = gl_PrimitiveIDIn + v_offset[0]; + + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex(); + } + ) + ); + + // Fragment shader without bary differential output. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + layout(location = 0) out vec4 out_raster; + IF_ZMODIFY( + layout(location = 1) uniform float in_dummy; + ) + void main() + { + int id_int = gl_PrimitiveID + 1; + float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int); + + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float); + IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;) + } + ) + ); + + // Depth peeling variant of fragment shader. 
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + layout(binding = 0) uniform sampler2DArray out_prev; + layout(location = 0) out vec4 out_raster; + IF_ZMODIFY( + layout(location = 1) uniform float in_dummy; + ) + void main() + { + int id_int = gl_PrimitiveID + 1; + float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int); + + vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0); + float depth_new = var_uvzw.z / var_uvzw.w; + if (prev.w == 0 || depth_new <= prev.z) + discard; + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float); + IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;) + } + ) + ); + } + + // Finalize programs. + constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader); + constructGLProgram(NVDR_CTX_PARAMS, &s.glProgramDP, s.glVertexShader, s.glGeometryShader, s.glFragmentShaderDP); + + // Construct main fbo and bind permanently. + NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO)); + NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO)); + + // Enable two color attachments. + GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 }; + NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers)); + + // Construct vertex array object. + NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO)); + NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO)); + + // Construct position buffer, bind permanently, enable, set ptr. + NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer)); + NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer)); + NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0)); + NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0)); + + // Construct index buffer and bind permanently. 
+ NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer)); + NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer)); + + // Set up depth test. + NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST)); + NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS)); + NVDR_CHECK_GL_ERROR(glClearDepth(1.0)); + + // Create and bind output buffers. Storage is allocated later. + NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer)); + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i])); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0)); + } + + // Create and bind depth/stencil buffer. Storage is allocated later. + NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0)); + + // Create texture name for previous output buffer (depth peeling). + NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer)); +} + +void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth) +{ + changes = false; + + // Resize vertex buffer? + if (posCount > s.posCount) + { + if (s.cudaPosBuffer) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer)); + s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64; + LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32"; + NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard)); + changes = true; + } + + // Resize triangle buffer? 
+ if (triCount > s.triCount) + { + if (s.cudaTriBuffer) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer)); + s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64; + LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32"; + NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard)); + changes = true; + } + + // Resize framebuffer? + if (width > s.width || height > s.height || depth > s.depth) + { + int num_outputs = s.enableDB ? 2 : 1; + if (s.cudaColorBuffer[0]) + for (int i=0; i < num_outputs; i++) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i])); + + if (s.cudaPrevOutBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer)); + s.cudaPrevOutBuffer = 0; + } + + // New framebuffer size. + s.width = (width > s.width) ? width : s.width; + s.height = (height > s.height) ? height : s.height; + s.depth = (depth > s.depth) ? depth : s.depth; + s.width = ROUND_UP(s.width, 32); + s.height = ROUND_UP(s.height, 32); + LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")"; + + // Allocate color buffers. 
+ for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i])); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + } + + // Allocate depth/stencil buffer. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0)); + + // (Re-)register all GL buffers into Cuda. + for (int i=0; i < num_outputs; i++) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly)); + + changes = true; + } +} + +void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx) +{ + // Only copy inputs if we are on first iteration of depth peeling or not doing it at all. + if (peeling_idx < 1) + { + if (triPtr) + { + // Copy both position and triangle buffers. 
+ void* glPosPtr = NULL; + void* glTriPtr = NULL; + size_t posBytes = 0; + size_t triBytes = 0; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer)); + NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch"); + NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch"); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream)); + } + else + { + // Copy position buffer only. Triangles are already copied and known to be constant. + void* glPosPtr = NULL; + size_t posBytes = 0; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer)); + NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch"); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream)); + } + } + + // Select program based on whether we have a depth peeling input or not. + if (peeling_idx < 1) + { + // Normal case: No peeling, or peeling disabled. + NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram)); + } + else + { + // If we don't have a third buffer yet, create one. 
+ if (!s.cudaPrevOutBuffer) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer)); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaPrevOutBuffer, s.glPrevOutBuffer, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly)); + } + + // Swap the GL buffers. + GLuint glTempBuffer = s.glPrevOutBuffer; + s.glPrevOutBuffer = s.glColorBuffer[0]; + s.glColorBuffer[0] = glTempBuffer; + + // Swap the Cuda buffers. + cudaGraphicsResource_t cudaTempBuffer = s.cudaPrevOutBuffer; + s.cudaPrevOutBuffer = s.cudaColorBuffer[0]; + s.cudaColorBuffer[0] = cudaTempBuffer; + + // Bind the new output buffer. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[0])); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, s.glColorBuffer[0], 0)); + + // Bind old buffer as the input texture. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer)); + + // Activate the correct program. + NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgramDP)); + } + + // Set viewport, clear color buffer(s) and depth/stencil buffer. 
+ NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height)); + NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT)); + + // If outputting bary differentials, set resolution uniform + if (s.enableDB) + NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height)); + + // Set the dummy uniform if depth modification workaround is active. + if (s.enableZModify) + NVDR_CHECK_GL_ERROR(glUniform1f(1, 0.f)); + + // Render the meshes. + if (depth == 1 && !rangesPtr) + { + // Trivial case. + NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0)); + } + else + { + // Populate a buffer for draw commands and execute it. + std::vector drawCmdBuffer(depth); + + if (!rangesPtr) + { + // Fill in range array to instantiate the same triangles for each output layer. + // Triangle IDs starts at zero (i.e., one) for each layer, so they correspond to + // the first dimension in addressing the triangle array. + for (int i=0; i < depth; i++) + { + GLDrawCmd& cmd = drawCmdBuffer[i]; + cmd.firstIndex = 0; + cmd.count = triCount; + cmd.baseVertex = vtxPerInstance * i; + cmd.baseInstance = 0; + cmd.instanceCount = 1; + } + } + else + { + // Fill in the range array according to user-given ranges. Triangle IDs point + // to the input triangle array, NOT index within range, so they correspond to + // the first dimension in addressing the triangle array. + for (int i=0, j=0; i < depth; i++) + { + GLDrawCmd& cmd = drawCmdBuffer[i]; + int first = rangesPtr[j++]; + int count = rangesPtr[j++]; + NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values"); + NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer"); + cmd.firstIndex = first * 3; + cmd.count = count * 3; + cmd.baseVertex = 0; + cmd.baseInstance = first; + cmd.instanceCount = 1; + } + } + + // Draw! 
+ NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &drawCmdBuffer[0], depth, sizeof(GLDrawCmd))); + } +} + +void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth) +{ + // Copy color buffers to output tensors. + cudaArray_t array = 0; + cudaChannelFormatDesc arrayDesc = {}; // For error checking. + cudaExtent arrayExt = {}; // For error checking. + int num_outputs = s.enableDB ? 2 : 1; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream)); + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0)); + NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array)); + NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch"); + NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch"); + NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch"); + cudaMemcpy3DParms p = {0}; + p.srcArray = array; + p.dstPtr.ptr = outputPtr[i]; + p.dstPtr.pitch = width * 4 * sizeof(float); + p.dstPtr.xsize = width; + p.dstPtr.ysize = height; + p.extent.width = width; + p.extent.height = height; + p.extent.depth = depth; + p.kind = cudaMemcpyDeviceToDevice; + NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream)); + } + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream)); +} + +void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s) +{ + int num_outputs = s.enableDB ? 
2 : 1; + + if (s.cudaPosBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer)); + s.cudaPosBuffer = 0; + } + + if (s.cudaTriBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer)); + s.cudaTriBuffer = 0; + } + + for (int i=0; i < num_outputs; i++) + { + if (s.cudaColorBuffer[i]) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i])); + s.cudaColorBuffer[i] = 0; + } + } + + if (s.cudaPrevOutBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer)); + s.cudaPrevOutBuffer = 0; + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize_gl.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize_gl.h new file mode 100644 index 00000000..27537c56 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/rasterize_gl.h @@ -0,0 +1,60 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Do not try to include OpenGL stuff when compiling CUDA kernels for torch. + +#if !(defined(NVDR_TORCH) && defined(__CUDACC__)) +#include "framework.h" +#include "glutil.h" + +//------------------------------------------------------------------------ +// OpenGL-related persistent state for forward op. + +struct RasterizeGLState // Must be initializable by memset to zero. 
+{ + int width; // Allocated frame buffer width. + int height; // Allocated frame buffer height. + int depth; // Allocated frame buffer depth. + int posCount; // Allocated position buffer in floats. + int triCount; // Allocated triangle buffer in ints. + GLContext glctx; + GLuint glFBO; + GLuint glColorBuffer[2]; + GLuint glPrevOutBuffer; + GLuint glDepthStencilBuffer; + GLuint glVAO; + GLuint glTriBuffer; + GLuint glPosBuffer; + GLuint glProgram; + GLuint glProgramDP; + GLuint glVertexShader; + GLuint glGeometryShader; + GLuint glFragmentShader; + GLuint glFragmentShaderDP; + cudaGraphicsResource_t cudaColorBuffer[2]; + cudaGraphicsResource_t cudaPrevOutBuffer; + cudaGraphicsResource_t cudaPosBuffer; + cudaGraphicsResource_t cudaTriBuffer; + int enableDB; + int enableZModify; // Modify depth in shader, workaround for a rasterization issue on A100. +}; + +//------------------------------------------------------------------------ +// Shared C++ code prototypes. + +void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx); +void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth); +void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx); +void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth); +void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s); + +//------------------------------------------------------------------------ +#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__)) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.cpp new file mode 100644 
index 00000000..51633e10 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.cpp @@ -0,0 +1,104 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "framework.h" +#include "texture.h" + +//------------------------------------------------------------------------ +// Mip stack construction and access helpers. + +void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p) +{ + char buf[1024]; + int bufsz = 1024; + + std::string msg = "Mip-map size error - cannot downsample an odd extent greater than 1. Resize the texture so that both spatial extents are powers of two, or limit the number of mip maps using max_mip_level argument.\n"; + + int w = p.texWidth; + int h = p.texHeight; + bool ew = false; + bool eh = false; + + msg += "Attempted mip stack construction:\n"; + msg += "level width height\n"; + msg += "----- ----- ------\n"; + snprintf(buf, bufsz, "base %5d %5d\n", w, h); + msg += buf; + + int mipTotal = 0; + int level = 0; + while ((w|h) > 1 && !(ew || eh)) // Stop at first impossible size. + { + // Current level. + level += 1; + + // Determine if downsampling fails. + ew = ew || (w > 1 && (w & 1)); + eh = eh || (h > 1 && (h & 1)); + + // Downsample. + if (w > 1) w >>= 1; + if (h > 1) h >>= 1; + + // Append level size to error message. 
+ snprintf(buf, bufsz, "mip %-2d ", level); + msg += buf; + if (ew) snprintf(buf, bufsz, " err "); + else snprintf(buf, bufsz, "%5d ", w); + msg += buf; + if (eh) snprintf(buf, bufsz, " err\n"); + else snprintf(buf, bufsz, "%5d\n", h); + msg += buf; + } + + NVDR_CHECK(0, msg); +} + +int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets) +{ + // No levels at all? + if (p.mipLevelLimit == 0) + { + p.mipLevelMax = 0; + return 0; + } + + // Current level size. + int w = p.texWidth; + int h = p.texHeight; + + int mipTotal = 0; + int level = 0; + int c = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE) ? (p.channels * 6) : p.channels; + mipOffsets[0] = 0; + while ((w|h) > 1) + { + // Current level. + level += 1; + + // Quit if cannot downsample. + if ((w > 1 && (w & 1)) || (h > 1 && (h & 1))) + raiseMipSizeError(NVDR_CTX_PARAMS, p); + + // Downsample. + if (w > 1) w >>= 1; + if (h > 1) h >>= 1; + + mipOffsets[level] = mipTotal; // Store the mip offset (#floats). + mipTotal += w * h * p.texDepth * c; + + // Hit the level limit? + if (p.mipLevelLimit >= 0 && level == p.mipLevelLimit) + break; + } + + p.mipLevelMax = level; + return mipTotal; +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.cu new file mode 100644 index 00000000..490b8d68 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.cu @@ -0,0 +1,1156 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "texture.h" + +//------------------------------------------------------------------------ +// Memory access and math helpers. + +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; } +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; } +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; } +static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; } +static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; } +static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; } +static __device__ __forceinline__ bool isfinite_vec3(const float3& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z); } +static __device__ __forceinline__ bool isfinite_vec4(const float4& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z) && isfinite(a.w); } +template static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); } +template static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); } + +//------------------------------------------------------------------------ +// Cube map wrapping for smooth filtering across edges and corners. At corners, +// one of the texture coordinates will be negative. 
For correct interpolation, +// the missing texel must take the average color of the other three. + +static __constant__ uint32_t c_cubeWrapMask1[48] = +{ + 0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01, + 0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01, + 0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219, + 0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219, + 0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11, + 0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11, +}; + +static __constant__ uint8_t c_cubeWrapMask2[48] = +{ + 0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40, + 0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64, + 0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44, +}; + +static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w) +{ + // Calculate case number. + int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1; + int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3; + int c = cx + cy; + if (c >= 5) + c--; + c = (face << 3) + c; + + // Compute coordinates and faces. + unsigned int m = c_cubeWrapMask1[c]; + int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0; + int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0; + int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1; + int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1; + int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0; + int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0; + int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1; + int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 
0 : (y3 == 1) ? ix1 : iy1; + int f0 = ((m >> 16) & 15) - 1; + int f1 = ((m >> 20) & 15) - 1; + int f2 = ((m >> 24) & 15) - 1; + int f3 = ((m >> 28) ) - 1; + + // Flips. + unsigned int f = c_cubeWrapMask2[c]; + int w1 = w - 1; + if (f & 0x01) x0 = w1 - x0; + if (f & 0x02) x1 = w1 - x1; + if (f & 0x04) x2 = w1 - x2; + if (f & 0x08) x3 = w1 - x3; + if (f & 0x10) y0 = w1 - y0; + if (f & 0x20) y1 = w1 - y1; + if (f & 0x40) y2 = w1 - y2; + if (f & 0x80) y3 = w1 - y3; + + // Done. + int4 tcOut; + tcOut.x = x0 + (y0 + f0 * w) * w; + tcOut.y = x1 + (y1 + f1 * w) * w; + tcOut.z = x2 + (y2 + f2 * w) * w; + tcOut.w = x3 + (y3 + f3 * w) * w; + return tcOut; +} + +//------------------------------------------------------------------------ +// Cube map indexing and gradient functions. + +// Map a 3D lookup vector into an (s,t) face coordinates (returned in first . +// two parameters) and face index. +static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z) +{ + float ax = fabsf(x); + float ay = fabsf(y); + float az = fabsf(z); + int idx; + float c; + if (az > fmaxf(ax, ay)) { idx = 4; c = z; } + else if (ay > ax) { idx = 2; c = y; y = z; } + else { idx = 0; c = x; x = z; } + if (c < 0.f) idx += 1; + float m = __frcp_rz(fabsf(c)) * .5; + float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31)); + float m1 = (idx != 2) ? -m : m; + x = x * m0 + .5; + y = y * m1 + .5; + if (!isfinite(x) || !isfinite(y)) + return -1; // Invalid uv. + x = fminf(fmaxf(x, 0.f), 1.f); + y = fminf(fmaxf(y, 0.f), 1.f); + return idx; +} + +// Based on dA/d{s,t}, compute dA/d{x,y,z} at a given 3D lookup vector. 
+static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c; + float c0 = gu; + float c1 = gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; } + else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; } + if (c < 0.f) idx += idx; + float m = __frcp_rz(fabsf(c)); + c0 = (idx & 0x34) ? -c0 : c0; + c1 = (idx & 0x2e) ? -c1 : c1; + float gl = (c0 + c1) * m; + float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu; + float gy = (idx & 0x0c) ? gl : -gv; + float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv; + gz = (idx & 0x09) ? -gz : gz; + float3 res = make_float3(gx, gy, gz) * (m * .5f); + if (!isfinite_vec3(res)) + return make_float3(0.f, 0.f, 0.f); // Invalid uv. + return res; +} + +// Based on dL/d(d{s,t}/s{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two +// indexCubeMapGrad() functions rolled together. +static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, c0, c1; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; } + else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; } + if (c < 0.f) idx += idx; + float m = __frcp_rz(fabsf(c)); + c0 = (idx & 0x34) ? -c0 : c0; + c1 = (idx & 0x2e) ? -c1 : c1; + float gl0 = (dw.x * c0 + dw.z * c1) * m; + float gl1 = (dw.y * c0 + dw.w * c1) * m; + float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x; + float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y; + float gy0 = (idx & 0x0c) ? gl0 : -dw.z; + float gy1 = (idx & 0x0c) ? gl1 : -dw.w; + float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z; + float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? 
dw.y : dw.w; + if (idx & 0x09) + { + gz0 = -gz0; + gz1 = -gz1; + } + g0 = make_float3(gx0, gy0, gz0) * (m * .5f); + g1 = make_float3(gx1, gy1, gz1) * (m * .5f); + if (!isfinite_vec3(g0) || !isfinite_vec3(g1)) + { + g0 = make_float3(0.f, 0.f, 0.f); // Invalid uv. + g1 = make_float3(0.f, 0.f, 0.f); + } +} + +// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector. +// Result is (ds/dX, ds/dY, dt/dX, dt/dY). +static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, gu, gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } + else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } + if (c < 0.f) idx += idx; + if (idx & 0x09) + { + dvdX.z = -dvdX.z; + dvdY.z = -dvdY.z; + } + float m = __frcp_rz(fabsf(c)); + float dm = m * .5f; + float mm = m * dm; + gu *= (idx & 0x34) ? -mm : mm; + gv *= (idx & 0x2e) ? -mm : mm; + + float4 res; + if (idx & 0x03) + { + res = make_float4(gu * dvdX.x + dm * dvdX.z, + gu * dvdY.x + dm * dvdY.z, + gv * dvdX.x - dm * dvdX.y, + gv * dvdY.x - dm * dvdY.y); + } + else if (idx & 0x0c) + { + res = make_float4(gu * dvdX.y + dm * dvdX.x, + gu * dvdY.y + dm * dvdY.x, + gv * dvdX.y + dm * dvdX.z, + gv * dvdY.y + dm * dvdY.z); + } + else // (idx & 0x30) + { + res = make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x, + gu * dvdY.z + copysignf(dm, c) * dvdY.x, + gv * dvdX.z - dm * dvdX.y, + gv * dvdY.z - dm * dvdY.y); + } + + if (!isfinite_vec4(res)) + return make_float4(0.f, 0.f, 0.f, 0.f); + + return res; +} + +// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face +// coordinates change w.r.t. 
3D texture coordinate vector, returned as follows: +// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx | +// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy | +// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz | +static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, gu, gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } + else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } + if (c < 0.f) idx += idx; + + if (idx & 0x09) + { + dvdX.z = -dvdX.z; + dvdY.z = -dvdY.z; + } + + float m = __frcp_rz(c); + float dm = -m * fabsf(m) * .5; + float mm = m * m * .5; + float mu = (idx & 0x34) ? -mm : mm; + float mv = (idx & 0x2e) ? -mm : mm; + gu *= -2.0 * m * mu; + gv *= -2.0 * m * mv; + + if (idx & 0x03) + { + dx.x = gu * dvdX.x + dm * dvdX.z; + dx.y = gu * dvdY.x + dm * dvdY.z; + dx.z = gv * dvdX.x - dm * dvdX.y; + dx.w = gv * dvdY.x - dm * dvdY.y; + dy.x = 0.f; + dy.y = 0.f; + dy.z = mv * dvdX.x; + dy.w = mv * dvdY.x; + dz.x = mu * dvdX.x; + dz.y = mu * dvdY.x; + dz.z = 0.f; + dz.w = 0.f; + } + else if (idx & 0x0c) + { + dx.x = mu * dvdX.y; + dx.y = mu * dvdY.y; + dx.z = 0.f; + dx.w = 0.f; + dy.x = gu * dvdX.y + dm * dvdX.x; + dy.y = gu * dvdY.y + dm * dvdY.x; + dy.z = gv * dvdX.y + dm * dvdX.z; + dy.w = gv * dvdY.y + dm * dvdY.z; + dz.x = 0.f; + dz.y = 0.f; + dz.z = mv * dvdX.y; + dz.w = mv * dvdY.y; + } + else // (idx & 0x30) + { + dx.x = mu * dvdX.z; + dx.y = mu * dvdY.z; + dx.z = 0.f; + dx.w = 0.f; + dy.x = 0.f; + dy.y = 0.f; + dy.z = mv * dvdX.z; + dy.w = mv * dvdY.z; + dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x; + dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x; + dz.z = gv * dvdX.z - dm * dvdX.y; + dz.w = gv * dvdY.z - dm * dvdY.y; + } +} + 
+//------------------------------------------------------------------------ +// General texture indexing. + +template +static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz) +{ + int w = p.texWidth; + int h = p.texHeight; + float u = uv.x; + float v = uv.y; + + // Cube map indexing. + if (CUBE_MODE) + { + // No wrap. Fold face index into tz right away. + int idx = indexCubeMap(u, v, uv.z); // Rewrites u, v. + if (idx < 0) + return -1; // Invalid uv. + tz = 6 * tz + idx; + } + else + { + // Handle boundary. + if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + u = u - (float)__float2int_rd(u); + v = v - (float)__float2int_rd(v); + } + } + + u = u * (float)w; + v = v * (float)h; + + int iu = __float2int_rd(u); + int iv = __float2int_rd(v); + + // In zero boundary mode, return texture address -1. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) + { + if (iu < 0 || iu >= w || iv < 0 || iv >= h) + return -1; + } + + // Otherwise clamp and calculate the coordinate properly. + iu = min(max(iu, 0), w-1); + iv = min(max(iv, 0), h-1); + return iu + w * (iv + tz * h); +} + +template +static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level) +{ + // Mip level size. + int2 sz = mipLevelSize(p, level); + int w = sz.x; + int h = sz.y; + + // Compute texture-space u, v. + float u = uv.x; + float v = uv.y; + bool clampU = false; + bool clampV = false; + + // Cube map indexing. + int face = 0; + if (CUBE_MODE) + { + // Neither clamp or wrap. + face = indexCubeMap(u, v, uv.z); // Rewrites u, v. + if (face < 0) + { + tcOut.x = tcOut.y = tcOut.z = tcOut.w = -1; // Invalid uv. + return make_float2(0.f, 0.f); + } + u = u * (float)w - 0.5f; + v = v * (float)h - 0.5f; + } + else + { + if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + // Wrap. + u = u - (float)__float2int_rd(u); + v = v - (float)__float2int_rd(v); + } + + // Move to texel space. 
+ u = u * (float)w - 0.5f; + v = v * (float)h - 0.5f; + + if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP) + { + // Clamp to center of edge texels. + u = fminf(fmaxf(u, 0.f), w - 1.f); + v = fminf(fmaxf(v, 0.f), h - 1.f); + clampU = (u == 0.f || u == w - 1.f); + clampV = (v == 0.f || v == h - 1.f); + } + } + + // Compute texel coordinates and weights. + int iu0 = __float2int_rd(u); + int iv0 = __float2int_rd(v); + int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients with clamped. + int iv1 = iv0 + (clampV ? 0 : 1); + u -= (float)iu0; + v -= (float)iv0; + + // Cube map wrapping. + bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h); + if (cubeWrap) + { + tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w); + tcOut += 6 * tz * w * h; // Bring in tz. + return make_float2(u, v); // Done. + } + + // Fold cube map face into tz. + if (CUBE_MODE) + tz = 6 * tz + face; + + // Wrap overflowing texel indices. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + if (iu0 < 0) iu0 += w; + if (iv0 < 0) iv0 += h; + if (iu1 >= w) iu1 -= w; + if (iv1 >= h) iv1 -= h; + } + + // Coordinates with tz folded in. + int iu0z = iu0 + tz * w * h; + int iu1z = iu1 + tz * w * h; + tcOut.x = iu0z + w * iv0; + tcOut.y = iu1z + w * iv0; + tcOut.z = iu0z + w * iv1; + tcOut.w = iu1z + w * iv1; + + // Invalidate texture addresses outside unit square if we are in zero mode. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) + { + bool iu0_out = (iu0 < 0 || iu0 >= w); + bool iu1_out = (iu1 < 0 || iu1 >= w); + bool iv0_out = (iv0 < 0 || iv0 >= h); + bool iv1_out = (iv1 < 0 || iv1 >= h); + if (iu0_out || iv0_out) tcOut.x = -1; + if (iu1_out || iv0_out) tcOut.y = -1; + if (iu0_out || iv1_out) tcOut.z = -1; + if (iu1_out || iv1_out) tcOut.w = -1; + } + + // All done. + return make_float2(u, v); +} + +//------------------------------------------------------------------------ +// Mip level calculation. 
+ +template +static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv) +{ + // Do nothing if mips not in use. + if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR) + return; + + // Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero. + if (!BIAS_ONLY) + { + // Get pixel derivatives of texture coordinates. + float4 uvDA; + float3 dvdX, dvdY; // Gradients use these later. + if (CUBE_MODE) + { + // Fetch. + float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0]; + float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1]; + float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2]; + + // Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}. + dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX + dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY + uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y} + } + else + { + // Fetch. + uvDA = ((const float4*)p.uvDA)[pidx]; + } + + // Scaling factors. + float uscl = p.texWidth; + float vscl = p.texHeight; + + // d[s,t]/d[X,Y]. + float dsdx = uvDA.x * uscl; + float dsdy = uvDA.y * uscl; + float dtdx = uvDA.z * vscl; + float dtdy = uvDA.w * vscl; + + // Calculate footprint axis lengths. + float A = dsdx*dsdx + dtdx*dtdx; + float B = dsdy*dsdy + dtdy*dtdy; + float C = dsdx*dsdy + dtdx*dtdy; + float l2b = 0.5 * (A + B); + float l2n = 0.25 * (A-B)*(A-B) + C*C; + float l2a = sqrt(l2n); + float lenMinorSqr = fmaxf(0.0, l2b - l2a); + float lenMajorSqr = l2b + l2a; + + // Footprint vs. mip level gradient. + if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2). 
+ float AB = dw * .5f * (A - B); + float Cw = dw * C; + float l2aw = dw * l2a; + float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw); + float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw); + float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw); + float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw); + + float4 d_f_dw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY); + if (!CUBE_MODE) + *pdw = isfinite_vec4(d_f_dw) ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f); + + // In cube maps, there is also a texture coordinate vs. mip level gradient. + // Only output nonzero vectors if both are free of inf/Nan garbage. + if (CUBE_MODE) + { + float4 dx, dy, dz; + indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz); + float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x); + float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y); + float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z); + float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w); + + float3 d_f_dv = make_float3(0.f, 0.f, 0.f); + d_f_dv += d_dsdX_dv * d_f_ddsdX; + d_f_dv += d_dsdY_dv * d_f_ddsdY; + d_f_dv += d_dtdX_dv * d_f_ddtdX; + d_f_dv += d_dtdY_dv * d_f_ddtdY; + + bool finite = isfinite_vec4(d_f_dw) && isfinite_vec3(d_f_dv); + *pdw = finite ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f); + *pdfdv = finite ? d_f_dv : make_float3(0.f, 0.f, 0.f); + } + } + + // Finally, calculate mip level. + flevel = .5f * __log2f(lenMajorSqr); // May be inf/NaN, but clamp fixes it. + } + + // Bias the mip level and clamp. + if (p.mipLevelBias) + flevel += p.mipLevelBias[pidx]; + flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax); + + // Calculate levels depending on filter mode. + level0 = __float2int_rd(flevel); + + // Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode. + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f) + { + level1 = min(level0 + 1, p.mipLevelMax); + flevel -= level0; // Fractional part. Zero if clamped on last level. 
+ } +} + +//------------------------------------------------------------------------ +// Texel fetch and accumulator helpers that understand cube map corners. + +template +static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner) +{ + // For invalid cube map uv, tc will be all negative, and all texel values will be zero. + if (corner) + { + T avg = zero_value(); + if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x])); + if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y])); + if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z])); + if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w])); + avg *= 0.33333333f; + if (tc.x < 0) a00 = avg; + if (tc.y < 0) a10 = avg; + if (tc.z < 0) a01 = avg; + if (tc.w < 0) a11 = avg; + } + else + { + a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value(); + a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value(); + a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value(); + a11 = (tc.w >= 0) ? *((const T*)&pIn[tc.w]) : zero_value(); + } +} + +static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM) +{ + // For invalid cube map uv, tc will be all negative, and no accumulation will take place. 
+ if (corner) + { + float cb; + if (tc.x < 0) cb = c.x; + if (tc.y < 0) cb = c.y; + if (tc.z < 0) cb = c.z; + if (tc.w < 0) cb = c.w; + cb *= 0.33333333f; + if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb); + if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb); + if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb); + if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb); + } + else + { + if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x); + if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y); + if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z); + if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w); + } +} + +//------------------------------------------------------------------------ +// Mip builder kernel. + +template +static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p) +{ + // Sizes. + int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1); + int2 sz_out = mipLevelSize(p, p.mipLevelOut); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= sz_out.x || py >= sz_out.y) + return; + + // Pixel indices. + int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y)); + int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down. + int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz)); + + // Input and output pointers. + const float* pin = p.tex[p.mipLevelOut - 1]; + float* pout = (float*)p.tex[p.mipLevelOut]; + + // Special case: Input texture height or width is 1. + if (sz_in.x == 1 || sz_in.y == 1) + { + if (sz_in.y == 1) + pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right. 
+ + for (int i=0; i < p.channels; i += C) + { + T v0 = *((const T*)&pin[pidx_in0 + i]); + T v1 = *((const T*)&pin[pidx_in1 + i]); + T avg = .5f * (v0 + v1); +#if TEX_DEBUG_MIP_RETAIN_VARIANCE + avg = (avg - .5f) * 1.41421356f + .5f; +#endif + *((T*)&pout[pidx_out + i]) = avg; + } + + return; + } + + for (int i=0; i < p.channels; i += C) + { + T v0 = *((const T*)&pin[pidx_in0 + i]); + T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]); + T v2 = *((const T*)&pin[pidx_in1 + i]); + T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]); + T avg = .25f * (v0 + v1 + v2 + v3); +#if TEX_DEBUG_MIP_RETAIN_VARIANCE + avg = (avg - .5f) * 2.f + .5f; +#endif + *((T*)&pout[pidx_out + i]) = avg; + } +} + +// Template specializations. +__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate(p); } +__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate(p); } +__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Forward kernel. + +template +static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + int tz = (p.texDepth == 1) ? 0 : pz; + if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) + return; + + // Pixel index. + int pidx = px + p.imgWidth * (py + p.imgHeight * pz); + + // Output ptr. + float* pOut = p.out + pidx * p.channels; + + // Get UV. + float3 uv; + if (CUBE_MODE) + uv = ((const float3*)p.uv)[pidx]; + else + uv = make_float3(((const float2*)p.uv)[pidx], 0.f); + + // Nearest mode. + if (FILTER_MODE == TEX_MODE_NEAREST) + { + int tc = indexTextureNearest(p, uv, tz); + tc *= p.channels; + const float* pIn = p.tex[0]; + + // Copy if valid tc, otherwise output zero. 
+ for (int i=0; i < p.channels; i += C) + *((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value(); + + return; // Exit. + } + + // Calculate mip level. In 'linear' mode these will all stay zero. + float flevel = 0.f; // Fractional level. + int level0 = 0; // Discrete level 0. + int level1 = 0; // Discrete level 1. + calculateMipLevel(level0, level1, flevel, p, pidx, uv, 0, 0); + + // Get texel indices and pointer for level 0. + int4 tc0 = make_int4(0, 0, 0, 0); + float2 uv0 = indexTextureLinear(p, uv, tz, tc0, level0); + const float* pIn0 = p.tex[level0]; + bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); + tc0 *= p.channels; + + // Bilinear fetch. + if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) + { + // Interpolate. + for (int i=0; i < p.channels; i += C, tc0 += C) + { + T a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + *((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0); + } + return; // Exit. + } + + // Get texel indices and pointer for level 1. + int4 tc1 = make_int4(0, 0, 0, 0); + float2 uv1 = indexTextureLinear(p, uv, tz, tc1, level1); + const float* pIn1 = p.tex[level1]; + bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); + tc1 *= p.channels; + + // Trilinear fetch. + for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C) + { + // First level. + T a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + T a = bilerp(a00, a10, a01, a11, uv0); + + // Second level unless in magnification mode. + if (flevel > 0.f) + { + T b00, b10, b01, b11; + fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1); + T b = bilerp(b00, b10, b01, b11, uv1); + a = lerp(a, b, flevel); // Interpolate between levels. + } + + // Write. + *((T*)&pOut[i]) = a; + } +} + +// Template specializations. 
+__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void 
TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { 
TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient mip puller kernel. + +template +static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.texWidth || py >= p.texHeight) + return; + + // Number of wide elements. + int c = p.channels; + if (C == 2) c >>= 1; + if (C == 4) c >>= 2; + + // Dynamically allocated shared memory for holding a texel. + extern __shared__ float s_texelAccum[]; + int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x; + int sharedStride = blockDim.x * blockDim.y; +# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride)) + + // Clear the texel. + for (int i=0; i < p.channels; i++) + *TEXEL_ACCUM(i) = 0.f; + + // Track texel position and accumulation weight over the mip stack. + int x = px; + int y = py; + float w = 1.f; + + // Pull gradients from all levels. + int2 sz = mipLevelSize(p, 0); // Previous level size. + for (int level=1; level <= p.mipLevelMax; level++) + { + // Weight decay depends on previous level size. + if (sz.x > 1) w *= .5f; + if (sz.y > 1) w *= .5f; + + // Current level size and coordinates. + sz = mipLevelSize(p, level); + x >>= 1; + y >>= 1; + + T* pIn = (T*)(p.gradTex[level] + (x + sz.x * (y + sz.y * pz)) * p.channels); + for (int i=0; i < c; i++) + accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w); + } + + // Add to main texture gradients. + T* pOut = (T*)(p.gradTex[0] + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels); + for (int i=0; i < c; i++) + accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride); +} + +// Template specializations. 
+__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate(p); } +__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate(p); } +__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient kernel. + +template +static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + int tz = (p.texDepth == 1) ? 0 : pz; + if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) + return; + + // Pixel index. + int pidx = px + p.imgWidth * (py + p.imgHeight * pz); + + // Early exit if output gradients are zero. + const float* pDy = p.dy + pidx * p.channels; + unsigned int dmax = 0u; + if ((p.channels & 3) == 0) + { + for (int i=0; i < p.channels; i += 4) + { + uint4 dy = *((const uint4*)&pDy[i]); + dmax |= (dy.x | dy.y | dy.z | dy.w); + } + } + else + { + for (int i=0; i < p.channels; i++) + dmax |= __float_as_uint(pDy[i]); + } + + // Store zeros and exit. 
+ if (__uint_as_float(dmax) == 0.f) + { + if (CUBE_MODE) + { + if (FILTER_MODE != TEX_MODE_NEAREST) + ((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f); + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + if (p.gradUVDA) + { + ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f); + ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f); + ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f); + } + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = 0.f; + } + } + else + { + if (FILTER_MODE != TEX_MODE_NEAREST) + ((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f); + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + if (p.gradUVDA) + ((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = 0.f; + } + } + return; + } + + // Get UV. + float3 uv; + if (CUBE_MODE) + uv = ((const float3*)p.uv)[pidx]; + else + uv = make_float3(((const float2*)p.uv)[pidx], 0.f); + + // Nearest mode - texture gradients only. + if (FILTER_MODE == TEX_MODE_NEAREST) + { + int tc = indexTextureNearest(p, uv, tz); + if (tc < 0) + return; // Outside texture. + + tc *= p.channels; + float* pOut = p.gradTex[0]; + + // Accumulate texture gradients. + for (int i=0; i < p.channels; i++) + caAtomicAddTexture(pOut, 0, tc + i, pDy[i]); + + return; // Exit. + } + + // Calculate mip level. In 'linear' mode these will all stay zero. + float4 dw = make_float4(0.f, 0.f, 0.f, 0.f); + float3 dfdv = make_float3(0.f, 0.f, 0.f); + float flevel = 0.f; // Fractional level. + int level0 = 0; // Discrete level 0. + int level1 = 0; // Discrete level 1. + calculateMipLevel(level0, level1, flevel, p, pidx, uv, &dw, &dfdv); + + // UV gradient accumulators. + float gu = 0.f; + float gv = 0.f; + + // Get texel indices and pointers for level 0. 
+ int4 tc0 = make_int4(0, 0, 0, 0); + float2 uv0 = indexTextureLinear(p, uv, tz, tc0, level0); + const float* pIn0 = p.tex[level0]; + float* pOut0 = p.gradTex[level0]; + bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); + tc0 *= p.channels; + + // Texel weights. + float uv011 = uv0.x * uv0.y; + float uv010 = uv0.x - uv011; + float uv001 = uv0.y - uv011; + float uv000 = 1.f - uv0.x - uv001; + float4 tw0 = make_float4(uv000, uv010, uv001, uv011); + + // Attribute weights. + int2 sz0 = mipLevelSize(p, level0); + float sclu0 = (float)sz0.x; + float sclv0 = (float)sz0.y; + + // Bilinear mode - texture and uv gradients. + if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) + { + for (int i=0; i < p.channels; i++, tc0 += 1) + { + float dy = pDy[i]; + accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP); + + float a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + float ad = (a11 + a00 - a10 - a01); + gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0; + gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0; + } + + // Store UV gradients and exit. + if (CUBE_MODE) + ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv); + else + ((float2*)p.gradUV)[pidx] = make_float2(gu, gv); + + return; + } + + // Accumulate fractional mip level gradient. + float df = 0; // dL/df. + + // Get texel indices and pointers for level 1. + int4 tc1 = make_int4(0, 0, 0, 0); + float2 uv1 = indexTextureLinear(p, uv, tz, tc1, level1); + const float* pIn1 = p.tex[level1]; + float* pOut1 = p.gradTex[level1]; + bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); + tc1 *= p.channels; + + // Texel weights. + float uv111 = uv1.x * uv1.y; + float uv110 = uv1.x - uv111; + float uv101 = uv1.y - uv111; + float uv100 = 1.f - uv1.x - uv101; + float4 tw1 = make_float4(uv100, uv110, uv101, uv111); + + // Attribute weights. 
+ int2 sz1 = mipLevelSize(p, level1); + float sclu1 = (float)sz1.x; + float sclv1 = (float)sz1.y; + + // Trilinear mode. + for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1) + { + float dy = pDy[i]; + float dy0 = (1.f - flevel) * dy; + accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP); + + // UV gradients for first level. + float a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + float ad = (a11 + a00 - a10 - a01); + gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0; + gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0; + + // Second level unless in magnification mode. + if (flevel > 0.f) + { + // Texture gradients for second level. + float dy1 = flevel * dy; + accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP); + + // UV gradients for second level. + float b00, b10, b01, b11; + fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1); + float bd = (b11 + b00 - b10 - b01); + gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1; + gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1; + + // Mip level gradient. + float a = bilerp(a00, a10, a01, a11, uv0); + float b = bilerp(b00, b10, b01, b11, uv1); + df += (b-a) * dy; + } + } + + // Store UV gradients. + if (CUBE_MODE) + ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df); + else + ((float2*)p.gradUV)[pidx] = make_float2(gu, gv); + + // Store mip level bias gradient. + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = df; + + // Store UV pixel differential gradients. + if (!BIAS_ONLY) + { + // Final gradients. + dw *= df; // dL/(d{s,y}/d{X,Y}) = df/(d{s,y}/d{X,Y}) * dL/df. + + // Store them. + if (CUBE_MODE) + { + // Remap from dL/(d{s,t}/s{X,Y}) to dL/(d{x,y,z}/d{X,Y}). 
+ float3 g0, g1; + indexCubeMapGrad4(uv, dw, g0, g1); + ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x); + ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y); + ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z); + } + else + ((float4*)p.gradUVDA)[pidx] = dw; + } +} + +// Template specializations. +__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.h new file 
mode 100644 index 00000000..f79b600f --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/common/texture.h @@ -0,0 +1,78 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "framework.h" + +//------------------------------------------------------------------------ +// Constants. + +#define TEX_DEBUG_MIP_RETAIN_VARIANCE 0 // For debugging +#define TEX_FWD_MAX_KERNEL_BLOCK_WIDTH 8 +#define TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH 8 +#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8 +#define TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH 8 +#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8 +#define TEX_MAX_MIP_LEVEL 16 // Currently a texture cannot be larger than 2 GB because we use 32-bit indices everywhere. +#define TEX_MODE_NEAREST 0 // Nearest on base level. +#define TEX_MODE_LINEAR 1 // Bilinear on base level. +#define TEX_MODE_LINEAR_MIPMAP_NEAREST 2 // Bilinear on nearest mip level. +#define TEX_MODE_LINEAR_MIPMAP_LINEAR 3 // Trilinear. +#define TEX_MODE_COUNT 4 +#define TEX_BOUNDARY_MODE_CUBE 0 // Cube map mode. +#define TEX_BOUNDARY_MODE_WRAP 1 // Wrap (u, v). +#define TEX_BOUNDARY_MODE_CLAMP 2 // Clamp (u, v). +#define TEX_BOUNDARY_MODE_ZERO 3 // Pad with zeros. +#define TEX_BOUNDARY_MODE_COUNT 4 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct TextureKernelParams +{ + const float* tex[TEX_MAX_MIP_LEVEL]; // Incoming texture buffer with mip levels. 
+ const float* uv; // Incoming texcoord buffer. + const float* uvDA; // Incoming uv pixel diffs or NULL. + const float* mipLevelBias; // Incoming mip level bias or NULL. + const float* dy; // Incoming output gradient. + float* out; // Outgoing texture data. + float* gradTex[TEX_MAX_MIP_LEVEL]; // Outgoing texture gradients with mip levels. + float* gradUV; // Outgoing texcoord gradient. + float* gradUVDA; // Outgoing texcoord pixel differential gradient. + float* gradMipLevelBias; // Outgoing mip level bias gradient. + int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor. + int filterMode; // One of the TEX_MODE_ constants. + int boundaryMode; // One of the TEX_BOUNDARY_MODE_ contants. + int texConst; // If true, texture is known to be constant. + int mipLevelLimit; // Mip level limit coming from the op. + int channels; // Number of texture channels. + int imgWidth; // Image width. + int imgHeight; // Image height. + int texWidth; // Texture width. + int texHeight; // Texture height. + int texDepth; // Texture depth. + int n; // Minibatch size. + int mipLevelMax; // Maximum mip level index. Zero if mips disabled. + int mipLevelOut; // Mip level being calculated in builder kernel. +}; + +//------------------------------------------------------------------------ +// C++ helper function prototypes. + +void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p); +int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets); + +//------------------------------------------------------------------------ +// Macros. + +#define mipLevelSize(p, i) make_int2(((p).texWidth >> (i)) > 1 ? ((p).texWidth >> (i)) : 1, ((p).texHeight >> (i)) > 1 ? 
((p).texHeight >> (i)) : 1) + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/__init__.py new file mode 100644 index 00000000..cf62df87 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +from .ops import rasterize, interpolate, texture, antialias +from .plugin_loader import set_cache_dir + +__all__ = ["rasterize", "interpolate", "texture", "antialias", "set_cache_dir"] diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/ops.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/ops.py new file mode 100644 index 00000000..be51deef --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/ops.py @@ -0,0 +1,303 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import tensorflow as tf +import numpy as np +import os +from . 
import plugin_loader + +#---------------------------------------------------------------------------- +# Helpers. +#---------------------------------------------------------------------------- + +# OpenGL-related linker options depending on platform. +def _get_gl_opts(): + libs = { + 'posix': ['GL', 'EGL'], + 'nt': ['gdi32', 'opengl32', 'user32', 'setgpu'], + } + return ['-l' + x for x in libs[os.name]] + +# Load the cpp plugin. +def _get_plugin(): + fn = os.path.join(os.path.dirname(__file__), 'tf_all.cu') + return plugin_loader.get_plugin(fn, extra_nvcc_options=_get_gl_opts() + ['-DNVDR_TENSORFLOW']) + +# Convert parameter to a numpy array if possible. +def _get_constant(x, dtype): + try: + return np.asarray(x, dtype=dtype) + except (TypeError, ValueError): + return None + +# Tests for a construction-time constantness instead of tf.constant node because +# the latter can be overridden in Session.run() feed_dict at evaluation time. +def _is_constant(x, dtype): + if isinstance(x, np.ndarray): + return np.can_cast(x.dtype, dtype, 'unsafe') + else: + return _get_constant(x, dtype) is not None + +#---------------------------------------------------------------------------- +# Rasterize. +#---------------------------------------------------------------------------- + +def rasterize(pos, tri, resolution, ranges=None, tri_const=False, output_db=True, grad_db=True): + assert tri_const is True or tri_const is False + assert output_db is True or output_db is False + + # Known constant resolution? + resolution_c = _get_constant(resolution, np.int32) + + # Known constant triangles? + tri_const = tri_const or _is_constant(tri, np.int32) + + # Convert all inputs to tensors / base types. 
+ tri_const = 1 if tri_const else 0 + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + pos = tf.convert_to_tensor(pos, dtype=tf.float32) + resolution = tf.convert_to_tensor(resolution, dtype=tf.int32) + if ranges is None: + ranges = tf.convert_to_tensor(np.zeros(shape=[0, 2], dtype=np.int32)) # Empty tensor. + else: + ranges = tf.convert_to_tensor(ranges, dtype=tf.int32) # Convert input to tensor. + + # Infer as much about the output shape as possible. + out_shape = [None, None, None, 4] + if pos.shape.rank == 3: # Instanced mode. + out_shape[0] = pos.shape[0].value + elif pos.shape.rank == 2: # Range mode. + if ranges.shape.rank not in [None, 0]: + out_shape[0] = ranges.shape[0].value + if resolution_c is not None: + assert resolution_c.shape == (2,) + out_shape[1], out_shape[2] = resolution_c + + # Output pixel differentials. + @tf.custom_gradient + def func_db(pos): + out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 1, tri_const) + out.set_shape(out_shape) + out_db.set_shape(out_shape) + def grad(dy, ddb): + if grad_db: + return _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb) + else: + return _get_plugin().rasterize_grad(pos, tri, out, dy) + return (out, out_db), grad + + # Do not output pixel differentials. + @tf.custom_gradient + def func(pos): + out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 0, tri_const) + out.set_shape(out_shape) + out_db.set_shape(out_shape[:-1] + [0]) # Zero channels in out_db. + def grad(dy, _): + return _get_plugin().rasterize_grad(pos, tri, out, dy) + return (out, out_db), grad + + # Choose stub. + if output_db: + return func_db(pos) + else: + return func(pos) + +#---------------------------------------------------------------------------- +# Interpolate. +#---------------------------------------------------------------------------- + +def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None): + # Sanitize the list of pixel differential attributes. 
+ if diff_attrs is None: + diff_attrs = [] + elif diff_attrs != 'all': + diff_attrs = _get_constant(diff_attrs, np.int32) + assert (diff_attrs is not None) and len(diff_attrs.shape) == 1 + diff_attrs = diff_attrs.tolist() + + # Convert all inputs to tensors. + attr = tf.convert_to_tensor(attr, dtype=tf.float32) + rast = tf.convert_to_tensor(rast, dtype=tf.float32) + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + if diff_attrs: + rast_db = tf.convert_to_tensor(rast_db, dtype=tf.float32) + + # Infer output shape. + out_shape = [None, None, None, None] + if rast.shape.rank is not None: + out_shape = [rast.shape[0].value, rast.shape[1].value, rast.shape[2].value, None] + if attr.shape.rank in [2, 3]: + out_shape[3] = attr.shape[-1].value + + # Output pixel differentials for at least some attributes. + @tf.custom_gradient + def func_da(attr, rast, rast_db): + diff_attrs_all = int(diff_attrs == 'all') + diff_attrs_list = [] if diff_attrs_all else diff_attrs + out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + + # Infer number of channels in out_da. + if not diff_attrs_all: + da_channels = 2 * len(diff_attrs) + if (attr.shape.rank in [2, 3]) and (attr.shape[-1].value is not None): + da_channels = 2 * attr.shape[-1].value + else: + da_channels = None + + # Set output shapes. + out.set_shape(out_shape) + out_da.set_shape([out_shape[0], out_shape[1], out_shape[2], da_channels]) + + def grad(dy, dda): + return _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list) + return (out, out_da), grad + + # No pixel differentials for any attribute. + @tf.custom_gradient + def func(attr, rast): + out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri) + out.set_shape(out_shape) + out_da.set_shape(out_shape[:-1] + [0]) # Zero channels in out_da. + def grad(dy, _): + return _get_plugin().interpolate_grad(attr, rast, tri, dy) + return (out, out_da), grad + + # Choose stub. 
+ if diff_attrs: + return func_da(attr, rast, rast_db) + else: + return func(attr, rast) + +#---------------------------------------------------------------------------- +# Texture. +#---------------------------------------------------------------------------- + +def texture(tex, uv, uv_da=None, filter_mode='auto', boundary_mode='wrap', tex_const=False, max_mip_level=None): + assert tex_const is True or tex_const is False + + # Default filter mode. + if filter_mode == 'auto': + filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear' + + # Known constant texture? + tex_const = tex_const or _is_constant(tex, np.float32) + + # Sanitize inputs. + tex_const = 1 if tex_const else 0 + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + + # Convert inputs to tensors. + tex = tf.convert_to_tensor(tex, dtype=tf.float32) + uv = tf.convert_to_tensor(uv, dtype=tf.float32) + if 'mipmap' in filter_mode: + uv_da = tf.convert_to_tensor(uv_da, dtype=tf.float32) + + # Infer output shape. + out_shape = [None, None, None, None] + if uv.shape.rank is not None: + assert uv.shape.rank == 4 + out_shape = [uv.shape[0].value, uv.shape[1].value, uv.shape[2].value, None] + if tex.shape.rank is not None: + assert tex.shape.rank == (5 if boundary_mode == 'cube' else 4) + out_shape[-1] = tex.shape[-1].value + + # If mipping disabled via max level=0, we may as well use simpler filtering internally. + if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']: + filter_mode = 'linear' + + # Convert filter mode to internal enumeration. + filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3} + filter_mode_enum = filter_mode_dict[filter_mode] + + # Convert boundary mode to internal enumeration. 
+ boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3} + boundary_mode_enum = boundary_mode_dict[boundary_mode] + + # Linear-mipmap-linear: Mipmaps enabled, all gradients active. + @tf.custom_gradient + def func_linear_mipmap_linear(tex, uv, uv_da): + out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level) + return out, grad + + # Linear-mipmap-nearest: Mipmaps enabled, no gradients to uv_da. + @tf.custom_gradient + def func_linear_mipmap_nearest(tex, uv): + out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level) + return out, grad + + # Linear: Mipmaps disabled, no uv_da, no gradients to uv_da. + @tf.custom_gradient + def func_linear(tex, uv): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return out, grad + + # Nearest: Mipmaps disabled, no uv_da, no gradients to uv_da or uv. + @tf.custom_gradient + def func_nearest(tex): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return out, grad + + # Choose stub. 
+ if filter_mode == 'linear-mipmap-linear': + return func_linear_mipmap_linear(tex, uv, uv_da) + elif filter_mode == 'linear-mipmap-nearest': + return func_linear_mipmap_nearest(tex, uv) + elif filter_mode == 'linear': + return func_linear(tex, uv) + elif filter_mode == 'nearest': + return func_nearest(tex) + +#---------------------------------------------------------------------------- +# Antialias. +#---------------------------------------------------------------------------- + +def antialias(color, rast, pos, tri, tri_const=False, pos_gradient_boost=1.0): + assert tri_const is True or tri_const is False + + # Known constant triangles? + tri_const = tri_const or _is_constant(tri, np.int32) + + # Convert inputs to tensors. + color = tf.convert_to_tensor(color, dtype=tf.float32) + rast = tf.convert_to_tensor(rast, dtype=tf.float32) + pos = tf.convert_to_tensor(pos, dtype=tf.float32) + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + + # Sanitize inputs. + tri_const = 1 if tri_const else 0 + + @tf.custom_gradient + def func(color, pos): + color_out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, tri_const) + color_out.set_shape(color.shape) + def grad(dy): + grad_color, grad_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer) + if pos_gradient_boost != 1.0: + grad_pos = grad_pos * pos_gradient_boost + return grad_color, grad_pos + return color_out, grad + + return func(color, pos) + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/plugin_loader.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/plugin_loader.py new file mode 100644 index 00000000..3918aecd --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/plugin_loader.py @@ -0,0 +1,219 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import glob +import os +import re +import uuid +import hashlib +import tempfile +import shutil +import tensorflow as tf +from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module + +#---------------------------------------------------------------------------- +# Global options. + +_nvdiffrast_cache_dir = None + +def set_cache_dir(path: str) -> None: + '''Set CUDA kernel compilation temp dir. + + If `set_cache_dir` is not called, the cache directory will default to + one of the below: + + - Value of NVDIFFRAST_CACHE_DIR env var, if set + - $HOME/.cache/nvdiffrast if HOME env var is set + - $USERPROFILE/.cache/nvdiffrast if USERPROFILE is set. + + Args: + path: Where to save CUDA kernel build temporaries + ''' + global _nvdiffrast_cache_dir + _nvdiffrast_cache_dir = path + +def make_cache_dir_path(*paths: str) -> str: + if _nvdiffrast_cache_dir is not None: + return os.path.join(_nvdiffrast_cache_dir, *paths) + if 'NVDIFFRAST_CACHE_DIR' in os.environ: + return os.path.join(os.environ['NVDIFFRAST_CACHE_DIR'], *paths) + if 'HOME' in os.environ: + return os.path.join(os.environ['HOME'], '.cache', 'nvdiffrast', *paths) + if 'USERPROFILE' in os.environ: + return os.path.join(os.environ['USERPROFILE'], '.cache', 'nvdiffrast', *paths) + return os.path.join(tempfile.gettempdir(), '.cache', 'nvdiffrast', *paths) + +cuda_cache_version_tag = 'v1' +do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe! +verbose = True # Print status messages to stdout. 
+ +#---------------------------------------------------------------------------- +# Internal helper funcs. + +def _find_compiler_bindir(): + hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + vc_bin_dir = 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin' + if os.path.isdir(vc_bin_dir): + return vc_bin_dir + return None + +def _get_compute_cap(device): + caps_str = device.physical_device_desc + m = re.search('compute capability: (\\d+).(\\d+)', caps_str) + major = m.group(1) + 
minor = m.group(2)
+    return (major, minor)
+
+def _get_cuda_gpu_arch_string():
+    # Return an nvcc --gpu-architecture string such as 'sm_75' for the first
+    # GPU that TensorFlow can see. Raises RuntimeError if no GPU is visible.
+    gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU']
+    if len(gpus) == 0:
+        raise RuntimeError('No GPU devices found')
+    (major, minor) = _get_compute_cap(gpus[0])
+    return 'sm_%s%s' % (major, minor)
+
+def _run_cmd(cmd):
+    # Run a shell command, capturing its combined output, and raise with the
+    # full command line and log if it exited with a non-zero status.
+    # NOTE(review): os.popen is legacy; subprocess.run would be the modern
+    # equivalent — left untouched here.
+    with os.popen(cmd) as pipe:
+        output = pipe.read()
+        status = pipe.close()
+    if status is not None:
+        raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output))
+
+def _prepare_nvcc_cli(opts):
+    # Assemble an nvcc command line: caller-supplied options plus
+    # TensorFlow's header search paths, an optional host-compiler override,
+    # and stderr redirected into stdout so _run_cmd captures everything.
+    cmd = 'nvcc ' + opts.strip()
+    cmd += ' --disable-warnings'
+    cmd += ' --include-path "%s"' % tf.sysconfig.get_include()
+    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src')
+    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl')
+    cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive')
+
+    compiler_bindir = _find_compiler_bindir()
+    if compiler_bindir is None:
+        # Require that _find_compiler_bindir succeeds on Windows. Allow
+        # nvcc to use whatever is the default on Linux.
+        if os.name == 'nt':
+            raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check compiler_bindir_search_path list in "%s".' % __file__)
+    else:
+        cmd += ' --compiler-bindir "%s"' % compiler_bindir
+    cmd += ' 2>&1'
+    return cmd
+
+#----------------------------------------------------------------------------
+# Main entry point.
+
+# Process-wide cache: maps a .cu source path to its loaded op library so each
+# plugin is compiled/loaded at most once per process.
+_plugin_cache = dict()
+
+def get_plugin(cuda_file, extra_nvcc_options=[]):
+    """Compile the given CUDA source into a TensorFlow op library and load it.
+
+    The binary is cached on disk under a name derived from an MD5 of the
+    source file, its preprocessed headers, and the build configuration, so
+    unchanged sources are not recompiled. Loaded plugins are additionally
+    memoized in _plugin_cache.
+
+    NOTE(review): `extra_nvcc_options=[]` is a mutable default argument; it
+    is only iterated here so there is no sharing bug today, but `None` plus
+    an in-body default would be safer.
+    NOTE(review): `verbose`, `do_not_hash_included_headers`,
+    `cuda_cache_version_tag` and `make_cache_dir_path` are presumably
+    module-level settings defined above this excerpt — confirm.
+    """
+    cuda_file_base = os.path.basename(cuda_file)
+    cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base)
+
+    # Already in cache?
+    if cuda_file in _plugin_cache:
+        return _plugin_cache[cuda_file]
+
+    # Setup plugin.
+    if verbose:
+        print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True)
+    try:
+        # Hash CUDA source.
+        md5 = hashlib.md5()
+        with open(cuda_file, 'rb') as f:
+            md5.update(f.read())
+        md5.update(b'\n')
+
+        # Hash headers included by the CUDA code by running it through the preprocessor.
+        if not do_not_hash_included_headers:
+            if verbose:
+                print('Preprocessing... ', end='', flush=True)
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext)
+                _run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)))
+                with open(tmp_file, 'rb') as f:
+                    bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8') # __FILE__ in error check macros
+                    good_file_str = ('"' + cuda_file_base + '"').encode('utf-8')
+                    for ln in f:
+                        if not ln.startswith(b'# ') and not ln.startswith(b'#line '): # ignore line number pragmas
+                            ln = ln.replace(bad_file_str, good_file_str)
+                            md5.update(ln)
+                    md5.update(b'\n')
+
+        # Select compiler options.
+        compile_opts = ''
+        if os.name == 'nt':
+            compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib')
+            compile_opts += ' --library-path="%s"' % (os.path.dirname(__file__) + r"\..\lib") # Find libraries during compilation.
+        elif os.name == 'posix':
+            compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so')
+            compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\''
+        else:
+            assert False # not Windows or Linux, w00t?
+        compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string()
+        compile_opts += ' --use_fast_math'
+        for opt in extra_nvcc_options:
+            compile_opts += ' ' + opt
+        nvcc_cmd = _prepare_nvcc_cli(compile_opts)
+
+        # Hash build configuration.
+        md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n')
+        md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n')
+        md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n')
+
+        # Compile if not already compiled.
+        bin_file_ext = '.dll' if os.name == 'nt' else '.so'
+        cuda_cache_path = make_cache_dir_path()
+        bin_file = os.path.join(make_cache_dir_path(), cuda_file_name + '_' + md5.hexdigest() + bin_file_ext)
+        if not os.path.isfile(bin_file):
+            if verbose:
+                print('Compiling... ', end='', flush=True)
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext)
+                _run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))
+                os.makedirs(cuda_cache_path, exist_ok=True)
+                # Copy to a unique temp name first, then rename into place so
+                # concurrent processes never observe a half-written binary.
+                intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext)
+                shutil.copyfile(tmp_file, intermediate_file)
+                os.rename(intermediate_file, bin_file) # atomic
+
+        # Load.
+        if verbose:
+            print('Loading... ', end='', flush=True)
+        plugin = tf.load_op_library(bin_file)
+
+        # Add to cache.
+        _plugin_cache[cuda_file] = plugin
+        if verbose:
+            print('Done.', flush=True)
+        return plugin
+
+    except:
+        if verbose:
+            print('Failed!', flush=True)
+        raise
+
+#----------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_all.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_all.cu
new file mode 100644
index 00000000..8eefcfbd
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_all.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+// TF-specific helpers.
+
+// Wrap a CUDA runtime call / GL call so that a failure aborts the current
+// TensorFlow op with an Internal error carrying the failing call's text.
+#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal("Cuda error: ", cudaGetErrorName(err), "[", #CUDA_CALL, ";]")); } while (0)
+#define OP_CHECK_GL_ERROR(CTX, GL_CALL) do { GL_CALL; GLenum err = glGetError(); OP_REQUIRES(CTX, err == GL_NO_ERROR, errors::Internal("OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]")); } while (0)
+
+// Cuda kernels and CPP all together. What an absolute compilation unit.
+
+// NOTE(review): this file deliberately #includes .cpp/.cu sources to form a
+// single translation unit, presumably so the nvcc-based plugin builder can
+// compile the whole op library in one pass — confirm against the build helper.
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#include "../common/framework.h"
+#include "../common/glutil.cpp"
+
+#include "../common/common.h"
+#include "../common/common.cpp"
+
+#include "../common/rasterize.h"
+#include "../common/rasterize_gl.cpp"
+#include "../common/rasterize.cu"
+#include "tf_rasterize.cu"
+
+#include "../common/interpolate.cu"
+#include "tf_interpolate.cu"
+
+#include "../common/texture.cpp"
+#include "../common/texture.cu"
+#include "tf_texture.cu"
+
+#include "../common/antialias.cu"
+#include "tf_antialias.cu"
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_antialias.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_antialias.cu
new file mode 100644
index 00000000..9b14962a
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_antialias.cu
@@ -0,0 +1,278 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// Forward TensorFlow op.
+
+// NOTE(review): angle-bracket template arguments appear to have been
+// stripped from this diff (e.g. ctx->eigen_device().stream() and
+// tensor.flat().data() are missing their type parameters). This file will
+// not compile as-is; restore the template arguments from upstream.
+
+// Antialiasing forward pass: copies `color` to the output, finds silhouette
+// discontinuities against `raster_out`, and blends across them.
+struct AntialiasFwdOp : public OpKernel
+{
+    AntialiasKernelParams m_attribs;
+
+    AntialiasFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
+    {
+        memset(&m_attribs, 0, sizeof(m_attribs));
+        // tri_const=1 lets us build the topology hash once and reuse it.
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_attribs.tri_const));
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        AntialiasKernelParams& p = m_attribs;
+        cudaStream_t stream = ctx->eigen_device().stream();
+
+        // Get input.
+        const Tensor& color = ctx->input(0);
+        const Tensor& rasterOut = ctx->input(1);
+        const Tensor& pos = ctx->input(2);
+        const Tensor& tri = ctx->input(3);
+
+        // Instance rendering mode?
+        p.instance_mode = pos.dims() > 2;
+
+        // Extract input dimensions.
+        if (p.instance_mode)
+            p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0;
+        else
+            p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0;
+        p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
+        p.n = (color.dims() > 0) ? color.dim_size(0) : 0;
+        p.height = (color.dims() > 1) ? color.dim_size(1) : 0;
+        p.width = (color.dims() > 2) ? color.dim_size(2) : 0;
+        p.channels = (color.dims() > 3) ? color.dim_size(3) : 0;
+
+        // Sanity checks.
+        OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]"));
+        OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]"));
+        OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
+        OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions"));
+        if (p.instance_mode)
+        {
+            OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
+            OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos"));
+        }
+        else
+        {
+            OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
+            OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out"));
+        }
+
+        // Get input pointers.
+        p.color = color.flat().data();
+        p.rasterOut = rasterOut.flat().data();
+        p.tri = tri.flat().data();
+        p.pos = pos.flat().data();
+
+        // Misc parameters.
+        // Half-resolution constants used by the kernels for NDC<->pixel math.
+        p.xh = .5f * (float)p.width;
+        p.yh = .5f * (float)p.height;
+
+        // Allocate output tensor.
+        Tensor* outputTensor = NULL;
+        TensorShape outputShape;
+        outputShape.AddDim(p.n);
+        outputShape.AddDim(p.height);
+        outputShape.AddDim(p.width);
+        outputShape.AddDim(p.channels);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, outputShape, &outputTensor));
+        p.output = outputTensor->flat().data();
+
+        // Allocate work buffer. One extra int4 for storing counters.
+        Tensor* workTensor = NULL;
+        TensorShape workShape;
+        workShape.AddDim(p.n * p.width * p.height * 8 + 4); // 8 int for a maximum of two work items per pixel.
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(1, workShape, &workTensor));
+        p.workBuffer = (int4*)(workTensor->flat().data());
+
+        // Clear the work counters.
+        OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream));
+
+        // Verify that buffers are aligned to allow float2/float4 operations.
+        OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
+        OP_REQUIRES(ctx, !((uintptr_t)p.rasterOut & 7), errors::Internal("raster_out input tensor not aligned to float2"));
+        OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4"));
+
+        // Kernel parameters.
+        void* args[] = {&p};
+
+        // (Re-)calculate opposite vertex hash.
+        // Rebuilt every call unless tri_const promised a constant topology
+        // and the hash already exists.
+        if (!p.evHash || !p.tri_const)
+        {
+            if (p.allocTriangles < p.numTriangles)
+            {
+                p.allocTriangles = max(p.allocTriangles, 64);
+                while (p.allocTriangles < p.numTriangles)
+                    p.allocTriangles <<= 1; // Must be power of two.
+
+                // (Re-)allocate memory for the hash.
+                // NOTE(review): cudaMalloc'd p.evHash persists across calls and
+                // is only freed on reallocation — lifetime tied to the op object.
+                OP_CHECK_CUDA_ERROR(ctx, cudaFree(p.evHash));
+                OP_CHECK_CUDA_ERROR(ctx, cudaMalloc(&p.evHash, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles) * sizeof(uint4)));
+                LOG(INFO) << "Increasing topology hash size to accommodate " << p.allocTriangles << " triangles";
+            }
+
+            // Clear the hash and launch the mesh kernel to populate it.
+            OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.evHash, 0, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles) * sizeof(uint4), stream));
+            OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
+        }
+
+        // Copy input to output as a baseline.
+        OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.output, p.color, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+
+        // Choose launch parameters for the discontinuity finder kernel and launch.
+        dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1);
+        dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n);
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream));
+
+        // Determine optimum block size for the persistent analysis kernel.
+        // One CTA per available SM slot, sized via the occupancy API.
+        int device = 0;
+        int numCTA = 0;
+        int numSM = 0;
+        OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device));
+        OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0));
+        OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
+
+        // Launch analysis kernel.
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
+    }
+};
+
+REGISTER_OP("AntialiasFwd")
+    .Input ("color: float")
+    .Input ("raster_out: float")
+    .Input ("pos: float")
+    .Input ("tri: int32")
+    .Output ("output: float")
+    .Output ("work_buffer: int32")
+    .Attr ("tri_const: int");
+
+REGISTER_KERNEL_BUILDER(Name("AntialiasFwd").Device(DEVICE_GPU), AntialiasFwdOp);
+
+//------------------------------------------------------------------------
+// Gradient TensorFlow op.
+
+// Antialiasing backward pass: consumes the forward op's work_buffer and
+// produces gradients w.r.t. color and pos.
+// NOTE(review): template arguments stripped by extraction (eigen_device(),
+// flat()) — restore before compiling.
+struct AntialiasGradOp : public OpKernel
+{
+    AntialiasKernelParams m_attribs;
+
+    AntialiasGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
+    {
+        memset(&m_attribs, 0, sizeof(m_attribs));
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        AntialiasKernelParams& p = m_attribs;
+        cudaStream_t stream = ctx->eigen_device().stream();
+
+        // Get input.
+        const Tensor& color = ctx->input(0);
+        const Tensor& rasterOut = ctx->input(1);
+        const Tensor& pos = ctx->input(2);
+        const Tensor& tri = ctx->input(3);
+        const Tensor& dy = ctx->input(4);
+        const Tensor& workBuffer = ctx->input(5);
+
+        // Instance rendering mode?
+        p.instance_mode = pos.dims() > 2;
+
+        // Extract input dimensions.
+        if (p.instance_mode)
+            p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0;
+        else
+            p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0;
+        p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
+        p.n = (color.dims() > 0) ? color.dim_size(0) : 0;
+        p.height = (color.dims() > 1) ? color.dim_size(1) : 0;
+        p.width = (color.dims() > 2) ? color.dim_size(2) : 0;
+        p.channels = (color.dims() > 3) ? color.dim_size(3) : 0;
+
+        // Sanity checks.
+        OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) > 0 && dy.dim_size(2) > 0 && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape[>0, >0, >0, >0]"));
+        OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]"));
+        OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]"));
+        OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
+        OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions"));
+        OP_REQUIRES(ctx, color.dim_size(1) == dy.dim_size(1) && color.dim_size(2) == dy.dim_size(2) && color.dim_size(3) == dy.dim_size(3), errors::InvalidArgument("color and dy inputs must have same dimensions"));
+        if (p.instance_mode)
+        {
+            OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
+            OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos"));
+            OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out, pos"));
+        }
+        else
+        {
+            OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]"));
+            OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out"));
+            OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out"));
+        }
+
+        // Get input pointers.
+        p.dy = dy.flat().data();
+        p.color = color.flat().data();
+        p.rasterOut = rasterOut.flat().data();
+        p.tri = tri.flat().data();
+        p.pos = pos.flat().data();
+        p.workBuffer = (int4*)(workBuffer.flat().data());
+
+        // Misc parameters.
+        p.xh = .5f * (float)p.width;
+        p.yh = .5f * (float)p.height;
+
+        // Allocate color gradient output tensor.
+        Tensor* gradColor = NULL;
+        TensorShape gradColorShape;
+        gradColorShape.AddDim(p.n);
+        gradColorShape.AddDim(p.height);
+        gradColorShape.AddDim(p.width);
+        gradColorShape.AddDim(p.channels);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, gradColorShape, &gradColor));
+        p.gradColor = gradColor->flat().data();
+
+        // Allocate position gradient output tensor.
+        Tensor* gradPos = NULL;
+        TensorShape gradPosShape;
+        // Only the leading batch dim is conditional — the if intentionally
+        // governs a single statement.
+        if (p.instance_mode)
+            gradPosShape.AddDim(p.n);
+        gradPosShape.AddDim(p.numVertices);
+        gradPosShape.AddDim(4);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(1, gradPosShape, &gradPos));
+        p.gradPos = gradPos->flat().data();
+
+        // Initialize all the stuff.
+        OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); // Gradient kernel work counter.
+        // dL/dcolor starts as a copy of dy; the kernel then redistributes
+        // gradients across the blended silhouette pixels.
+        OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.gradColor, p.dy, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream));
+        OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradPos, 0, (p.instance_mode ? p.n : 1) * p.numVertices * 4 * sizeof(float), stream));
+
+        // Verify that buffers are aligned to allow float2/float4 operations.
+        OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4"));
+        OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4"));
+
+        // Launch the gradient kernel.
+        void* args[] = {&p};
+
+        int device = 0;
+        int numCTA = 0;
+        int numSM = 0;
+        OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device));
+        OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0));
+        OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device));
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream));
+    }
+};
+
+REGISTER_OP("AntialiasGrad")
+    .Input ("color: float")
+    .Input ("raster_out: float")
+    .Input ("pos: float")
+    .Input ("tri: int32")
+    .Input ("dy: float")
+    .Input ("work_buffer: int32")
+    .Output ("grad_color: float")
+    .Output ("grad_pos: float");
+
+REGISTER_KERNEL_BUILDER(Name("AntialiasGrad").Device(DEVICE_GPU), AntialiasGradOp);
+
+//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_interpolate.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_interpolate.cu
new file mode 100644
index 00000000..612ce1af
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_interpolate.cu
@@ -0,0 +1,301 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.
Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// Common op attribute parser.
+
+// Parse diff_attrs_all / diff_attrs attributes shared by the Fwd/Grad ops.
+// NOTE(review): std::vector here is missing its element type — template
+// arguments (likely <int>) were stripped from this diff; restore upstream.
+static __host__ void interpolateParseOpAttributes(OpKernelConstruction* ctx, InterpolateKernelParams& p, bool enableDA)
+{
+    if (enableDA)
+    {
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs_all", &p.diff_attrs_all));
+        if (!p.diff_attrs_all)
+        {
+            std::vector diff_attrs_vec;
+            OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs", &diff_attrs_vec));
+            OP_REQUIRES(ctx, diff_attrs_vec.size() > 0, errors::InvalidArgument("differentiation enabled with empty diff_attrs list"));
+            OP_REQUIRES(ctx, diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, errors::InvalidArgument("too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)"));
+            p.numDiffAttr = diff_attrs_vec.size();
+            memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int));
+        }
+    }
+}
+
+//------------------------------------------------------------------------
+// Forward TensorFlow op.
+
+// NOTE(review): the template parameter list (presumably <bool ENABLE_DA>)
+// was stripped from this diff — the bare `template` below is not valid C++.
+template
+struct InterpolateFwdOp : public OpKernel
+{
+    InterpolateKernelParams m_attribs;
+
+    InterpolateFwdOp(OpKernelConstruction* ctx): OpKernel(ctx)
+    {
+        memset(&m_attribs, 0, sizeof(m_attribs));
+        interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA);
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        InterpolateKernelParams& p = m_attribs;
+        cudaStream_t stream = ctx->eigen_device().stream();
+
+        // Get input.
+        // When ENABLE_DA is false there is no rast_db input; input(2) is
+        // reused as a harmless placeholder binding.
+        const Tensor& attr = ctx->input(0);
+        const Tensor& rast = ctx->input(1);
+        const Tensor& tri = ctx->input(2);
+        const Tensor& rast_db = ctx->input(ENABLE_DA ? 3 : 2);
+
+        // Instance rendering mode?
+        p.instance_mode = attr.dims() > 2;
+
+        // Extract input dimensions.
+        if (p.instance_mode)
+        {
+            p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0;
+            p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0;
+        }
+        else
+        {
+            p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0;
+            p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0;
+        }
+        p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0;
+        p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0;
+        p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0;
+        p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0;
+
+        // Sanity checks.
+        OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]"));
+        OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
+        OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]"));
+        if (p.instance_mode)
+            OP_REQUIRES(ctx, attr.dim_size(0) == p.depth || attr.dim_size(0) == 1, errors::InvalidArgument("minibatch size mismatch between inputs rast, attr"));
+        if (ENABLE_DA)
+        {
+            OP_REQUIRES(ctx, rast_db.dims() == 4 && rast_db.dim_size(0) > 0 && rast_db.dim_size(1) > 0 && rast_db.dim_size(2) > 0 && rast_db.dim_size(3) == 4, errors::InvalidArgument("rast_db must have shape[>0, >0, >0, 4]"));
+            OP_REQUIRES(ctx, rast_db.dim_size(1) == rast.dim_size(1) && rast_db.dim_size(2) == rast.dim_size(2), errors::InvalidArgument("spatial size mismatch between inputs rast and rast_db"));
+            OP_REQUIRES(ctx, rast_db.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, rast_db"));
+        }
+
+        // All diff attrs mode.
+        if (p.diff_attrs_all)
+            p.numDiffAttr = p.numAttr;
+
+        // Get input pointers.
+        p.attr = attr.flat().data();
+        p.rast = rast.flat().data();
+        p.tri = tri.flat().data();
+        // attrBC=1 broadcasts a single attribute batch across all instances.
+        p.attrBC = (p.instance_mode && attr.dim_size(0) == 1) ? 1 : 0;
+        p.rastDB = ENABLE_DA ? rast_db.flat().data() : 0;
+
+        // Allocate main output tensor.
+        Tensor* out_tensor = NULL;
+        TensorShape out_shape;
+        out_shape.AddDim(p.depth);
+        out_shape.AddDim(p.height);
+        out_shape.AddDim(p.width);
+        out_shape.AddDim(p.numAttr);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor));
+        p.out = out_tensor->flat().data();
+
+        // Allocate pixel differential output tensor.
+        // Reuses out_shape with the channel dim swapped to 2 per diff attr
+        // (d/dx and d/dy).
+        Tensor* out_da_tensor = NULL;
+        out_shape.set_dim(3, p.numDiffAttr * 2);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(1, out_shape, &out_da_tensor));
+        p.outDA = ENABLE_DA ? out_da_tensor->flat().data() : 0;
+
+        // Verify that buffers are aligned to allow float2/float4 operations.
+        OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4"));
+        OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4"));
+        if (ENABLE_DA)
+            OP_REQUIRES(ctx, !((uintptr_t)p.outDA & 7), errors::Internal("out_da output tensor not aligned to float2"));
+
+        // Choose launch parameters.
+        dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
+        dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
+
+        // Launch CUDA kernel.
+        void* args[] = {&p};
+        void* func = ENABLE_DA ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel;
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
+    }
+};
+
+REGISTER_OP("InterpolateFwd")
+    .Input ("attr: float")
+    .Input ("rast: float")
+    .Input ("tri: int32")
+    .Output ("out: float")
+    .Output ("out_da: float");
+
+REGISTER_OP("InterpolateFwdDa")
+    .Input ("attr: float")
+    .Input ("rast: float")
+    .Input ("tri: int32")
+    .Input ("rast_db: float")
+    .Output ("out: float")
+    .Output ("out_da: float")
+    .Attr ("diff_attrs_all: int")
+    .Attr ("diff_attrs: list(int)");
+
+// NOTE(review): the <false>/<true> instantiation arguments appear stripped
+// here as well.
+REGISTER_KERNEL_BUILDER(Name("InterpolateFwd") .Device(DEVICE_GPU), InterpolateFwdOp);
+REGISTER_KERNEL_BUILDER(Name("InterpolateFwdDa").Device(DEVICE_GPU), InterpolateFwdOp);
+
+//------------------------------------------------------------------------
+// Gradient TensorFlow op.
+
+template
+struct InterpolateGradOp : public OpKernel
+{
+    InterpolateKernelParams m_attribs;
+
+    InterpolateGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
+    {
+        memset(&m_attribs, 0, sizeof(m_attribs));
+        interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA);
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        InterpolateKernelParams& p = m_attribs;
+        cudaStream_t stream = ctx->eigen_device().stream();
+
+        // Get input.
+        const Tensor& attr = ctx->input(0);
+        const Tensor& rast = ctx->input(1);
+        const Tensor& tri = ctx->input(2);
+        const Tensor& dy = ctx->input(3);
+        const Tensor& rast_db = ctx->input(ENABLE_DA ? 4 : 3);
+        const Tensor& dda = ctx->input(ENABLE_DA ? 5 : 3);
+
+        // Instance rendering mode?
+        p.instance_mode = attr.dims() > 2;
+
+        // Extract input dimensions.
+        if (p.instance_mode)
+        {
+            p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0;
+            p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0;
+        }
+        else
+        {
+            p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0;
+            p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0;
+        }
+        p.numTriangles = (tri.dims() > 0) ?
tri.dim_size(0) : 0;
+        p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0;
+        p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0;
+        p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0;
+        // attr_depth is the attribute batch dimension (1 when broadcast).
+        int attr_depth = p.instance_mode ? (attr.dims() > 1 ? attr.dim_size(0) : 0) : 1;
+
+        // Sanity checks.
+        OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]"));
+        OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
+        OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]"));
+        OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape [>0, height, width, >0]"));
+        OP_REQUIRES(ctx, dy.dim_size(3) == p.numAttr, errors::InvalidArgument("argument count mismatch between inputs dy, attr"));
+        OP_REQUIRES(ctx, (attr_depth == p.depth || attr_depth == 1) && dy.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, dy, attr"));
+        if (ENABLE_DA)
+        {
+            OP_REQUIRES(ctx, dda.dims() == 4 && dda.dim_size(0) > 0 && dda.dim_size(1) == p.height && dda.dim_size(2) == p.width, errors::InvalidArgument("dda must have shape [>0, height, width, ?]"));
+            OP_REQUIRES(ctx, dda.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between rast, dda"));
+        }
+
+        // All diff attrs mode.
+        if (p.diff_attrs_all)
+            p.numDiffAttr = p.numAttr;
+
+        // Get input pointers.
+        p.attr = attr.flat().data();
+        p.rast = rast.flat().data();
+        p.tri = tri.flat().data();
+        p.dy = dy.flat().data();
+        p.rastDB = ENABLE_DA ? rast_db.flat().data() : 0;
+        p.dda = ENABLE_DA ? dda.flat().data() : 0;
+        p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0;
+
+        // Allocate attribute gradient output tensor.
+        Tensor* grad_attr_tensor = NULL;
+        TensorShape grad_attr_shape;
+        if (p.instance_mode)
+            grad_attr_shape.AddDim(attr_depth);
+        grad_attr_shape.AddDim(p.numVertices);
+        grad_attr_shape.AddDim(p.numAttr);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_attr_shape, &grad_attr_tensor));
+        p.gradAttr = grad_attr_tensor->flat().data();
+
+        // Allocate bary gradient output tensor.
+        Tensor* grad_rast_tensor = NULL;
+        TensorShape grad_rast_shape;
+        grad_rast_shape.AddDim(p.depth);
+        grad_rast_shape.AddDim(p.height);
+        grad_rast_shape.AddDim(p.width);
+        grad_rast_shape.AddDim(4);
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_rast_shape, &grad_rast_tensor));
+        p.gradRaster = grad_rast_tensor->flat().data();
+
+        // Allocate bary pixel diff gradient output tensor.
+        if (ENABLE_DA)
+        {
+            Tensor* grad_rast_db_tensor = NULL;
+            OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_rast_shape, &grad_rast_db_tensor));
+            p.gradRasterDB = grad_rast_db_tensor->flat().data();
+        }
+
+        // Clear attribute gradients.
+        // NOTE(review): unlike every other async call here, this memset is not
+        // wrapped in OP_CHECK_CUDA_ERROR — inconsistent error handling; verify
+        // against upstream before "fixing".
+        cudaMemsetAsync(p.gradAttr, 0, attr_depth * p.numVertices * p.numAttr * sizeof(float), stream);
+
+        // Verify that buffers are aligned to allow float2/float4 operations.
+        OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4"));
+        OP_REQUIRES(ctx, !((uintptr_t)p.gradRaster & 15), errors::Internal("grad_rast output tensor not aligned to float4"));
+        if (ENABLE_DA)
+        {
+            OP_REQUIRES(ctx, !((uintptr_t)p.dda & 7), errors::Internal("dda input tensor not aligned to float2"));
+            OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4"));
+            OP_REQUIRES(ctx, !((uintptr_t)p.gradRasterDB & 15), errors::Internal("grad_rast_db output tensor not aligned to float4"));
+        }
+
+        // Choose launch parameters.
+        dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height);
+        dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth);
+
+        // Launch CUDA kernel.
+        void* args[] = {&p};
+        void* func = ENABLE_DA ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel;
+        OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream));
+    }
+};
+
+REGISTER_OP("InterpolateGrad")
+    .Input ("attr: float")
+    .Input ("rast: float")
+    .Input ("tri: int32")
+    .Input ("dy: float")
+    .Output ("grad_attr: float")
+    .Output ("grad_rast: float")
+    ;
+
+REGISTER_OP("InterpolateGradDa")
+    .Input ("attr: float")
+    .Input ("rast: float")
+    .Input ("tri: int32")
+    .Input ("dy: float")
+    .Input ("rast_db: float")
+    .Input ("dda: float")
+    .Output ("grad_attr: float")
+    .Output ("grad_rast: float")
+    .Output ("grad_rast_db: float")
+    .Attr ("diff_attrs_all: int")
+    .Attr ("diff_attrs: list(int)");
+    // NOTE(review): stray extra ';' below — harmless at namespace scope in
+    // C++11+, but likely a typo.
+    ;
+
+REGISTER_KERNEL_BUILDER(Name("InterpolateGrad") .Device(DEVICE_GPU), InterpolateGradOp);
+REGISTER_KERNEL_BUILDER(Name("InterpolateGradDa").Device(DEVICE_GPU), InterpolateGradOp);
+
+//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_rasterize.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_rasterize.cu
new file mode 100644
index 00000000..4d0a2616
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_rasterize.cu
@@ -0,0 +1,242 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.
Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// Forward TensorFlow op.
+
+// Rasterization forward pass: renders triangles through an OpenGL context
+// held in m_glState and copies the results into CUDA output tensors.
+// NOTE(review): template arguments stripped by extraction (eigen_device(),
+// flat()) — restore before compiling.
+struct RasterizeFwdOp : public OpKernel
+{
+    RasterizeGLState m_glState; // OpenGL-related persistent state.
+    int m_tri_const; // 1 if triangle array is known to be constant.
+
+    RasterizeFwdOp(OpKernelConstruction* ctx):
+    OpKernel(ctx)
+    {
+        memset(&m_glState, 0, sizeof(RasterizeGLState));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB));
+        OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const));
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        cudaStream_t stream = ctx->eigen_device().stream();
+
+        // Check that input shapes are correct.
+        const Tensor& pos = ctx->input(0);
+        const Tensor& tri = ctx->input(1);
+        const Tensor& resolution = ctx->input(2);
+        const Tensor& ranges = ctx->input(3);
+
+        // Determine number of outputs
+        int num_outputs = m_glState.enableDB ? 2 : 1;
+
+        // Determine instance mode and check input dimensions.
+        bool instance_mode = pos.dims() > 2;
+        if (instance_mode)
+        {
+            OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]"));
+            OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
+            OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
+        }
+        else
+        {
+            OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]"));
+            OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]"));
+            OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]"));
+            OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]"));
+        }
+
+        // Get output shape.
+        // resolution/ranges are HostMemory inputs (see the kernel builder
+        // below), so reading them directly from the host is safe.
+        const int32_t* res_in = resolution.flat().data(); // This is in CPU memory.
+        int height = res_in[0];
+        int width = res_in[1];
+        int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0);
+        OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]"));
+
+        // Get position and triangle buffer sizes in int32/float32.
+        int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1);
+        int triCount = 3 * tri.dim_size(0);
+
+        // Init context and GL?
+        // The GL context is created lazily on first call and kept in m_glState.
+        bool initCtx = !m_glState.glFBO;
+        if (initCtx)
+        {
+            const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info();
+            int cudaDeviceIdx = g ? g->gpu_id : -1;
+            rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp
+        }
+        else
+            setGLContext(m_glState.glctx); // (Re-)Activate GL context.
+
+        // Resize all buffers.
+        bool changes = false;
+        rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp
+        if (changes)
+        {
+#ifdef _WIN32
+            // Workaround for occasional blank first frame on Windows.
+            releaseGLContext();
+            setGLContext(m_glState.glctx);
+#endif
+        }
+
+        // Copy input data to GL and render.
+        const float* posPtr = pos.flat().data();
+        const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat().data(); // This is in CPU memory.
+        const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat().data() : NULL; // Copy triangles only if needed.
+        int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0;
+        rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, -1);
+
+        // Allocate output tensors.
+        TensorShape output_shape;
+        output_shape.AddDim(depth);
+        output_shape.AddDim(height);
+        output_shape.AddDim(width);
+        output_shape.AddDim(4);
+        float* outputPtr[2];
+        for (int i=0; i < 2; i++)
+        {
+            if (i >= num_outputs)
+                output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor.
+            Tensor* output_tensor = NULL;
+            OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor));
+            if (i < num_outputs)
+                outputPtr[i] = output_tensor->flat().data();
+        }
+
+        // Copy rasterized results into CUDA buffers.
+        rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth);
+
+        // Done. Release GL context.
+        releaseGLContext();
+    }
+};
+
+REGISTER_OP("RasterizeFwd")
+    .Input ("pos: float")
+    .Input ("tri: int32")
+    .Input ("resolution: int32")
+    .Input ("ranges: int32")
+    .Output ("out: float")
+    .Output ("out_db: float")
+    .Attr ("enable_db: int")
+    .Attr ("tri_const: int");
+
+REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp);
+
+//------------------------------------------------------------------------
+// Gradient TensorFlow op.
+
+// NOTE(review): template parameter list (presumably <bool ENABLE_DB>) was
+// stripped from this diff. The definition below is truncated at the end of
+// this excerpt.
+template
+struct RasterizeGradOp : public OpKernel
+{
+    RasterizeGradParams m_attribs;
+
+    RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx)
+    {
+        memset(&m_attribs, 0, sizeof(m_attribs));
+    }
+
+    void Compute(OpKernelContext* ctx)
+    {
+        RasterizeGradParams& p = m_attribs;
+        cudaStream_t stream = ctx->eigen_device().stream();
+
+        // Input tensors.
+        const Tensor& pos = ctx->input(0);
+        const Tensor& tri = ctx->input(1);
+        const Tensor& out = ctx->input(2);
+        const Tensor& dy = ctx->input(3);
+        const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3);
+
+        // Determine instance mode.
+        p.instance_mode = (pos.dims() > 2) ? 1 : 0;
+
+        // Shape is taken from the rasterizer output tensor.
+        OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4"));
+        p.depth = out.dim_size(0);
+        p.height = out.dim_size(1);
+        p.width = out.dim_size(2);
+        OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]"));
+
+        // Check other shapes.
+ if (p.instance_mode) + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]")); + else + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]")); + if (ENABLE_DB) + OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]")); + + // Populate parameters. + p.numTriangles = tri.dim_size(0); + p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0); + p.pos = pos.flat().data(); + p.tri = tri.flat().data(); + p.out = out.flat().data(); + p.dy = dy.flat().data(); + p.ddb = ENABLE_DB ? ddb.flat().data() : 0; + + // Set up pixel position to clip space x, y transform. + p.xs = 2.f / (float)p.width; + p.xo = 1.f / (float)p.width - 1.f; + p.ys = 2.f / (float)p.height; + p.yo = 1.f / (float)p.height - 1.f; + + // Allocate output tensor for position gradients. 
+ Tensor* grad_tensor = NULL; + TensorShape grad_shape; + if (p.instance_mode) + grad_shape.AddDim(p.depth); + grad_shape.AddDim(p.numVertices); + grad_shape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor)); + p.grad = grad_tensor->flat().data(); + + // Clear the output buffers. + size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float); + cudaMemsetAsync(p.grad, 0, gradBytes, stream); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2")); + if (ENABLE_DB) + OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4")); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DB ? 
(void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("RasterizeGrad") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("out: float") + .Input ("dy: float") + .Output ("grad: float"); + +REGISTER_OP("RasterizeGradDb") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("out: float") + .Input ("dy: float") + .Input ("ddb: float") + .Output ("grad: float"); + +REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp); +REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp); + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_texture.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_texture.cu new file mode 100644 index 00000000..c5382fed --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/tensorflow/tf_texture.cu @@ -0,0 +1,525 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common op attribute parser. + +static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p) +{ + // Mip and filter modes. 
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode)); + OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported")); + p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); + + // Mip level clamp. + if (p.enableMip) + { + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit)); + OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level")); + ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op. + } + + // Boundary mode. + OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode)); + OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported")); +} + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct TextureFwdOp : public OpKernel +{ + TextureKernelParams m_attribs; + PersistentTensor m_persistentMipTensor; // Used if texture is constant and mips are enabled. + bool m_persistentMipTensorInitialized; + + TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + m_persistentMipTensorInitialized = false; + parseOpAttributes(ctx, m_attribs); + } + + void Compute(OpKernelContext* ctx) + { + TextureKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); + + // Get input. + const Tensor& tex = ctx->input(0); + const Tensor& uv = ctx->input(1); + const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1); + + // Extract input dimensions. + p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; + p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; + p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; + p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; + if (!cube_mode) + { + p.texHeight = (tex.dims() > 1) ? 
tex.dim_size(1) : 0; + p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; + } + else + { + p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; + p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; + } + + // Sanity checks. + if (!cube_mode) + { + OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); + } + else + { + OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); + OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); + } + OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); + OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large")); + if (p.enableMip) + { + if (!cube_mode) + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); + else + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && 
uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); + } + + // Get input pointers. + p.tex[0] = tex.flat().data(); + p.uv = uv.flat().data(); + p.uvDA = p.enableMip ? uv_da.flat().data() : 0; + + // Allocate output tensor. + Tensor* out_tensor = NULL; + TensorShape out_shape; + out_shape.AddDim(p.n); + out_shape.AddDim(p.imgHeight); + out_shape.AddDim(p.imgWidth); + out_shape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor)); + p.out = out_tensor->flat().data(); + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + float* pmip = 0; + if (p.enableMip) + { + // Generate mip offsets. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(ctx, p, mipOffsets); + + // Mip output tensor. + Tensor* mip_tensor = NULL; + TensorShape mip_shape; + mip_shape.AddDim(mipTotal); + + // If texture is constant, calculate mip stack only once. + bool computeMip = true; + if (p.texConst) + { + // First execution? + if (!m_persistentMipTensorInitialized) + { + // Allocate a persistent mip tensor. + OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor)); + m_persistentMipTensorInitialized = true; + } + else + { + // Reuse the persistent tensor, do not recompute mip levels. + mip_tensor = m_persistentMipTensor.AccessTensor(ctx); + computeMip = false; + } + + // Set as output tensor as well. + ctx->set_output(1, *mip_tensor); + } + else + { + // Allocate an output tensor as usual. 
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor)); + } + + pmip = mip_tensor->flat().data(); // Pointer to data. + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + + // Build mip levels if needed. + if (computeMip) + { + for (int i=1; i <= p.mipLevelMax; i++) + { + int2 ms = mipLevelSize(p, i); + int3 sz = make_int3(ms.x, ms.y, p.texDepth); + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); + dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); + p.mipLevelOut = i; + + void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); + } + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. 
+ if (!cube_mode) + OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); + if ((p.channels & 3) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip output tensor not aligned to float4")); + } + if ((p.channels & 1) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip output tensor not aligned to float2")); + } + if (!cube_mode) + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); + else + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); + + // Choose launch parameters for texture lookup kernel. + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + // Choose kernel based on filter mode, cube mode, and datatype. 
+ void* func_tbl[TEX_MODE_COUNT * 3 * 2] = { + (void*)TextureFwdKernelNearest1, + (void*)TextureFwdKernelNearest2, + (void*)TextureFwdKernelNearest4, + (void*)TextureFwdKernelLinear1, + (void*)TextureFwdKernelLinear2, + (void*)TextureFwdKernelLinear4, + (void*)TextureFwdKernelLinearMipmapNearest1, + (void*)TextureFwdKernelLinearMipmapNearest2, + (void*)TextureFwdKernelLinearMipmapNearest4, + (void*)TextureFwdKernelLinearMipmapLinear1, + (void*)TextureFwdKernelLinearMipmapLinear2, + (void*)TextureFwdKernelLinearMipmapLinear4, + (void*)TextureFwdKernelCubeNearest1, + (void*)TextureFwdKernelCubeNearest2, + (void*)TextureFwdKernelCubeNearest4, + (void*)TextureFwdKernelCubeLinear1, + (void*)TextureFwdKernelCubeLinear2, + (void*)TextureFwdKernelCubeLinear4, + (void*)TextureFwdKernelCubeLinearMipmapNearest1, + (void*)TextureFwdKernelCubeLinearMipmapNearest2, + (void*)TextureFwdKernelCubeLinearMipmapNearest4, + (void*)TextureFwdKernelCubeLinearMipmapLinear1, + (void*)TextureFwdKernelCubeLinearMipmapLinear2, + (void*)TextureFwdKernelCubeLinearMipmapLinear4, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; + func_idx = func_idx * 3 + channel_div_idx; + + // Launch kernel. 
+ OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("TextureFwd") + .Input ("tex: float") + .Input ("uv: float") + .Output ("out: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureFwdMip") + .Input ("tex: float") + .Input ("uv: float") + .Input ("uv_da: float") + .Output ("out: float") + .Output ("mip: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("tex_const: int") + .Attr ("max_mip_level: int"); + +REGISTER_KERNEL_BUILDER(Name("TextureFwd") .Device(DEVICE_GPU), TextureFwdOp); +REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +struct TextureGradOp : public OpKernel +{ + TextureKernelParams m_attribs; + + TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + parseOpAttributes(ctx, m_attribs); + } + + void Compute(OpKernelContext* ctx) + { + TextureKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); + + // Get input. + const Tensor& tex = ctx->input(0); + const Tensor& uv = ctx->input(1); + const Tensor& dy = ctx->input(2); + const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2); + const Tensor& mip = ctx->input(p.enableMip ? 4 : 2); + + // Extract input dimensions. + p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; + p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; + p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; + p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; + if (!cube_mode) + { + p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0; + p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; + } + else + { + p.texHeight = (tex.dims() > 2) ? 
tex.dim_size(2) : 0; + p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; + p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; + } + + // Sanity checks. + if (!cube_mode) + { + OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); + } + else + { + OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); + OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); + } + OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]")); + if (p.enableMip) + { + if (!cube_mode) + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); + else + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 
6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); + } + + // Get input pointers. + p.tex[0] = tex.flat().data(); + p.uv = uv.flat().data(); + p.dy = dy.flat().data(); + p.uvDA = p.enableMip ? uv_da.flat().data() : 0; + float* pmip = p.enableMip ? (float*)mip.flat().data() : 0; + + // Allocate output tensor for tex gradient. + Tensor* grad_tex_tensor = NULL; + TensorShape grad_tex_shape; + grad_tex_shape.AddDim(p.texDepth); + if (cube_mode) + grad_tex_shape.AddDim(6); + grad_tex_shape.AddDim(p.texHeight); + grad_tex_shape.AddDim(p.texWidth); + grad_tex_shape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor)); + p.gradTex[0] = grad_tex_tensor->flat().data(); + + // Allocate output tensor for uv gradient. + if (p.filterMode != TEX_MODE_NEAREST) + { + TensorShape grad_uv_shape; + Tensor* grad_uv_tensor = NULL; + grad_uv_shape.AddDim(p.n); + grad_uv_shape.AddDim(p.imgHeight); + grad_uv_shape.AddDim(p.imgWidth); + grad_uv_shape.AddDim(uv.dim_size(3)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor)); + p.gradUV = grad_uv_tensor->flat().data(); + + // Allocate output tensor for uv_da gradient. + if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + Tensor* grad_uv_da_tensor = NULL; + grad_uv_shape.set_dim(3, uv_da.dim_size(3)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor)); + p.gradUVDA = grad_uv_da_tensor->flat().data(); + } + } + + // Choose kernel variants based on channel count. + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + Tensor grad_mip_tensor; + float* pgradMip = 0; + if (p.enableMip) + { + // Generate mip offsets. 
+ int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(ctx, p, mipOffsets); + + // Get space for temporary mip gradients. + TensorShape grad_mip_shape; + grad_mip_shape.AddDim(mipTotal); + ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor); + pgradMip = grad_mip_tensor.flat().data(); + for (int i=1; i <= p.mipLevelMax; i++) + { + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients. + } + + // Clear mip gradients. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(pgradMip, 0, mipTotal * sizeof(float), stream)); + } + + // Initialize texture gradients to zero. + int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float); + if (cube_mode) + texBytes *= 6; + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex[0], 0, texBytes, stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + { + OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4")); + } + else + { + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2")); + } + if ((p.channels & 3) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 15), errors::Internal("grad_tex output tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), 
errors::Internal("dy input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 15), errors::Internal("internal mip gradient tensor not aligned to float4")); + } + if ((p.channels & 1) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 7), errors::Internal("grad_tex output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 7), errors::Internal("internal mip gradient tensor not aligned to float2")); + } + + // Choose launch parameters for main gradient kernel. + void* args[] = {&p}; + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + void* func_tbl[TEX_MODE_COUNT * 2] = { + (void*)TextureGradKernelNearest, + (void*)TextureGradKernelLinear, + (void*)TextureGradKernelLinearMipmapNearest, + (void*)TextureGradKernelLinearMipmapLinear, + (void*)TextureGradKernelCubeNearest, + (void*)TextureGradKernelCubeLinear, + (void*)TextureGradKernelCubeLinearMipmapNearest, + (void*)TextureGradKernelCubeLinearMipmapLinear, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; + + // Launch main gradient kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Launch kernel to pull gradients from mip levels. 
+ if (p.enableMip) + { + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1)); + int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float); + + void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 }; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream)); + } + } +}; + +REGISTER_OP("TextureGradNearest") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Output ("grad_tex: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureGradLinear") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureGradLinearMipmapNearest") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Input ("uv_da: float") + .Input ("mip: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("max_mip_level: int"); + +REGISTER_OP("TextureGradLinearMipmapLinear") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Input ("uv_da: float") + .Input ("mip: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Output ("grad_uv_da: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("max_mip_level: int"); + +REGISTER_KERNEL_BUILDER(Name("TextureGradNearest") .Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinear") .Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp); 
+REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp); + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/__init__.py new file mode 100644 index 00000000..d28f95e7 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +from .ops import RasterizeCudaContext, RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash +__all__ = ["RasterizeCudaContext", "RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"] diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/ops.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/ops.py new file mode 100644 index 00000000..f366c022 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/ops.py @@ -0,0 +1,729 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import importlib +import logging +import numpy as np +import os +import torch +import torch.utils.cpp_extension + +#---------------------------------------------------------------------------- +# C++/Cuda plugin compiler/loader. + +_cached_plugin = {} +def _get_plugin(gl=False): + assert isinstance(gl, bool) + + # Return cached plugin if already loaded. + if _cached_plugin.get(gl, None) is not None: + return _cached_plugin[gl] + + # Make sure we can find the necessary compiler and libary binaries. + if os.name == 'nt': + lib_dir = os.path.dirname(__file__) + r"\..\lib" + def find_cl_path(): + import glob + def get_sort_key(x): + # Primary criterion is VS version, secondary is edition, third is internal MSVC version. + x = x.split('\\')[3:] + x[1] = {'BuildTools': '~0', 'Community': '~1', 'Pro': '~2', 'Professional': '~3', 'Enterprise': '~4'}.get(x[1], x[1]) + return x + vs_relative_path = r"\Microsoft Visual Studio\*\*\VC\Tools\MSVC\*\bin\Hostx64\x64" + paths = glob.glob(r"C:\Program Files" + vs_relative_path) + paths += glob.glob(r"C:\Program Files (x86)" + vs_relative_path) + if paths: + return sorted(paths, key=get_sort_key)[-1] + + # If cl.exe is not on path, try to find it. + if os.system("where cl.exe >nul 2>nul") != 0: + cl_path = find_cl_path() + if cl_path is None: + raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation") + os.environ['PATH'] += ';' + cl_path + + # Compiler options. + common_opts = ['-DNVDR_TORCH'] + cc_opts = [] + if os.name == 'nt': + cc_opts += ['/wd4067', '/wd4624'] # Disable warnings in torch headers. + + # Linker options for the GL-interfacing plugin. 
+ ldflags = [] + if gl: + if os.name == 'posix': + ldflags = ['-lGL', '-lEGL'] + elif os.name == 'nt': + libs = ['gdi32', 'opengl32', 'user32', 'setgpu'] + ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs] + + # List of source files. + if gl: + source_files = [ + '../common/common.cpp', + '../common/glutil.cpp', + '../common/rasterize_gl.cpp', + 'torch_bindings_gl.cpp', + 'torch_rasterize_gl.cpp', + ] + else: + source_files = [ + '../common/cudaraster/impl/Buffer.cpp', + '../common/cudaraster/impl/CudaRaster.cpp', + '../common/cudaraster/impl/RasterImpl.cu', + '../common/cudaraster/impl/RasterImpl.cpp', + '../common/common.cpp', + '../common/rasterize.cu', + '../common/interpolate.cu', + '../common/texture.cu', + '../common/texture.cpp', + '../common/antialias.cu', + 'torch_bindings.cpp', + 'torch_rasterize.cpp', + 'torch_interpolate.cpp', + 'torch_texture.cpp', + 'torch_antialias.cpp', + ] + + # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine. + os.environ['TORCH_CUDA_ARCH_LIST'] = '' + + # On Linux, show a warning if GLEW is being forcibly loaded when compiling the GL plugin. + if gl and (os.name == 'posix') and ('libGLEW' in os.environ.get('LD_PRELOAD', '')): + logging.getLogger('nvdiffrast').warning("Warning: libGLEW is being loaded via LD_PRELOAD, and will probably conflict with the OpenGL plugin") + + # Try to detect if a stray lock file is left in cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment. + plugin_name = 'nvdiffrast_plugin' + ('_gl' if gl else '') + try: + lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock') + if os.path.exists(lock_fn): + logging.getLogger('nvdiffrast').warning("Lock file exists in build directory: '%s'" % lock_fn) + except: + pass + + # Speed up compilation on Windows. 
+ if os.name == 'nt': + # Skip telemetry sending step in vcvarsall.bat + os.environ['VSCMD_SKIP_SENDTELEMETRY'] = '1' + + # Opportunistically patch distutils to cache MSVC environments. + try: + import distutils._msvccompiler + import functools + if not hasattr(distutils._msvccompiler._get_vc_env, '__wrapped__'): + distutils._msvccompiler._get_vc_env = functools.lru_cache()(distutils._msvccompiler._get_vc_env) + except: + pass + + # Compile and load. + source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files] + torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=common_opts+cc_opts, extra_cuda_cflags=common_opts+['-lineinfo'], extra_ldflags=ldflags, with_cuda=True, verbose=False) + + # Import, cache, and return the compiled module. + _cached_plugin[gl] = importlib.import_module(plugin_name) + return _cached_plugin[gl] + +#---------------------------------------------------------------------------- +# Log level. +#---------------------------------------------------------------------------- + +def get_log_level(): + '''Get current log level. + + Returns: + Current log level in nvdiffrast. See `set_log_level()` for possible values. + ''' + return _get_plugin().get_log_level() + +def set_log_level(level): + '''Set log level. + + Log levels follow the convention on the C++ side of Torch: + 0 = Info, + 1 = Warning, + 2 = Error, + 3 = Fatal. + The default log level is 1. + + Args: + level: New log level as integer. Internal nvdiffrast messages of this + severity or higher will be printed, while messages of lower + severity will be silent. + ''' + _get_plugin().set_log_level(level) + +#---------------------------------------------------------------------------- +# CudaRaster state wrapper. +#---------------------------------------------------------------------------- + +class RasterizeCudaContext: + def __init__(self, device=None): + '''Create a new Cuda rasterizer context. 
+ + The context is deleted and internal storage is released when the object is + destroyed. + + Args: + device (Optional): Cuda device on which the context is created. Type can be + `torch.device`, string (e.g., `'cuda:1'`), or int. If not + specified, context will be created on currently active Cuda + device. + Returns: + The newly created Cuda rasterizer context. + ''' + if device is None: + cuda_device_idx = torch.cuda.current_device() + else: + with torch.cuda.device(device): + cuda_device_idx = torch.cuda.current_device() + self.cpp_wrapper = _get_plugin().RasterizeCRStateWrapper(cuda_device_idx) + self.output_db = True + self.active_depth_peeler = None + +#---------------------------------------------------------------------------- +# GL state wrapper. +#---------------------------------------------------------------------------- + +class RasterizeGLContext: + def __init__(self, output_db=True, mode='automatic', device=None): + '''Create a new OpenGL rasterizer context. + + Creating an OpenGL context is a slow operation so you should usually reuse the same + context in all calls to `rasterize()` on the same CPU thread. The OpenGL context + is deleted when the object is destroyed. + + Side note: When using the OpenGL context in a rasterization operation, the + context's internal framebuffer object is automatically enlarged to accommodate the + rasterization operation's output shape, but it is never shrunk in size until the + context is destroyed. Thus, if you need to rasterize, say, deep low-resolution + tensors and also shallow high-resolution tensors, you can conserve GPU memory by + creating two separate OpenGL contexts for these tasks. In this scenario, using the + same OpenGL context for both tasks would end up reserving GPU memory for a deep, + high-resolution output tensor. + + Args: + output_db (bool): Compute and output image-space derivates of barycentrics. + mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'. 
+ device (Optional): Cuda device on which the context is created. Type can be + `torch.device`, string (e.g., `'cuda:1'`), or int. If not + specified, context will be created on currently active Cuda + device. + Returns: + The newly created OpenGL rasterizer context. + ''' + assert output_db is True or output_db is False + assert mode in ['automatic', 'manual'] + self.output_db = output_db + self.mode = mode + if device is None: + cuda_device_idx = torch.cuda.current_device() + else: + with torch.cuda.device(device): + cuda_device_idx = torch.cuda.current_device() + self.cpp_wrapper = _get_plugin(gl=True).RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx) + self.active_depth_peeler = None # For error checking only. + + def set_context(self): + '''Set (activate) OpenGL context in the current CPU thread. + Only available if context was created in manual mode. + ''' + assert self.mode == 'manual' + self.cpp_wrapper.set_context() + + def release_context(self): + '''Release (deactivate) currently active OpenGL context. + Only available if context was created in manual mode. + ''' + assert self.mode == 'manual' + self.cpp_wrapper.release_context() + +#---------------------------------------------------------------------------- +# Rasterize. 
+#---------------------------------------------------------------------------- + +class _rasterize_func(torch.autograd.Function): + @staticmethod + def forward(ctx, raster_ctx, pos, tri, resolution, ranges, grad_db, peeling_idx): + if isinstance(raster_ctx, RasterizeGLContext): + out, out_db = _get_plugin(gl=True).rasterize_fwd_gl(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx) + else: + out, out_db = _get_plugin().rasterize_fwd_cuda(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx) + ctx.save_for_backward(pos, tri, out) + ctx.saved_grad_db = grad_db + return out, out_db + + @staticmethod + def backward(ctx, dy, ddb): + pos, tri, out = ctx.saved_tensors + if ctx.saved_grad_db: + g_pos = _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb) + else: + g_pos = _get_plugin().rasterize_grad(pos, tri, out, dy) + return None, g_pos, None, None, None, None, None + +# Op wrapper. +def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True): + '''Rasterize triangles. + + All input tensors must be contiguous and reside in GPU memory except for + the `ranges` tensor that, if specified, has to reside in CPU memory. The + output tensors will be contiguous and reside in GPU memory. + + Args: + glctx: Rasterizer context of type `RasterizeGLContext` or `RasterizeCudaContext`. + pos: Vertex position tensor with dtype `torch.float32`. To enable range + mode, this tensor should have a 2D shape [num_vertices, 4]. To enable + instanced mode, use a 3D shape [minibatch_size, num_vertices, 4]. + tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`. + resolution: Output resolution as integer tuple (height, width). + ranges: In range mode, tensor with shape [minibatch_size, 2] and dtype + `torch.int32`, specifying start indices and counts into `tri`. + Ignored in instanced mode. + grad_db: Propagate gradients of image-space derivatives of barycentrics + into `pos` in backward pass. 
Ignored if using an OpenGL context that + was not configured to output image-space derivatives. + + Returns: + A tuple of two tensors. The first output tensor has shape [minibatch_size, + height, width, 4] and contains the main rasterizer output in order (u, v, z/w, + triangle_id). If the OpenGL context was configured to output image-space + derivatives of barycentrics, the second output tensor will also have shape + [minibatch_size, height, width, 4] and contain said derivatives in order + (du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape + [minibatch_size, height, width, 0]. + ''' + assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext)) + assert grad_db is True or grad_db is False + grad_db = grad_db and glctx.output_db + + # Sanitize inputs. + assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor) + resolution = tuple(resolution) + if ranges is None: + ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu') + else: + assert isinstance(ranges, torch.Tensor) + + # Check that context is not currently reserved for depth peeling. + if glctx.active_depth_peeler is not None: + return RuntimeError("Cannot call rasterize() during depth peeling operation, use rasterize_next_layer() instead") + + # Instantiate the function. + return _rasterize_func.apply(glctx, pos, tri, resolution, ranges, grad_db, -1) + +#---------------------------------------------------------------------------- +# Depth peeler context manager for rasterizing multiple depth layers. +#---------------------------------------------------------------------------- + +class DepthPeeler: + def __init__(self, glctx, pos, tri, resolution, ranges=None, grad_db=True): + '''Create a depth peeler object for rasterizing multiple depth layers. + + Arguments are the same as in `rasterize()`. + + Returns: + The newly created depth peeler. 
+ ''' + assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext)) + assert grad_db is True or grad_db is False + grad_db = grad_db and glctx.output_db + + # Sanitize inputs as usual. + assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor) + resolution = tuple(resolution) + if ranges is None: + ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu') + else: + assert isinstance(ranges, torch.Tensor) + + # Store all the parameters. + self.raster_ctx = glctx + self.pos = pos + self.tri = tri + self.resolution = resolution + self.ranges = ranges + self.grad_db = grad_db + self.peeling_idx = None + + def __enter__(self): + if self.raster_ctx is None: + raise RuntimeError("Cannot re-enter a terminated depth peeling operation") + if self.raster_ctx.active_depth_peeler is not None: + raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a rasterization context") + self.raster_ctx.active_depth_peeler = self + self.peeling_idx = 0 + return self + + def __exit__(self, *args): + assert self.raster_ctx.active_depth_peeler is self + self.raster_ctx.active_depth_peeler = None + self.raster_ctx = None # Remove all references to input tensor so they're not left dangling. + self.pos = None + self.tri = None + self.resolution = None + self.ranges = None + self.grad_db = None + self.peeling_idx = None + return None + + def rasterize_next_layer(self): + '''Rasterize next depth layer. + + Operation is equivalent to `rasterize()` except that previously reported + surface points are culled away. + + Returns: + A tuple of two tensors as in `rasterize()`. + ''' + assert self.raster_ctx.active_depth_peeler is self + assert self.peeling_idx >= 0 + result = _rasterize_func.apply(self.raster_ctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx) + self.peeling_idx += 1 + return result + +#---------------------------------------------------------------------------- +# Interpolate. 
+#---------------------------------------------------------------------------- + +# Output pixel differentials for at least some attributes. +class _interpolate_func_da(torch.autograd.Function): + @staticmethod + def forward(ctx, attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list): + out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + ctx.save_for_backward(attr, rast, tri, rast_db) + ctx.saved_misc = diff_attrs_all, diff_attrs_list + return out, out_da + + @staticmethod + def backward(ctx, dy, dda): + attr, rast, tri, rast_db = ctx.saved_tensors + diff_attrs_all, diff_attrs_list = ctx.saved_misc + g_attr, g_rast, g_rast_db = _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list) + return g_attr, g_rast, None, g_rast_db, None, None + +# No pixel differential for any attribute. +class _interpolate_func(torch.autograd.Function): + @staticmethod + def forward(ctx, attr, rast, tri): + out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri) + ctx.save_for_backward(attr, rast, tri) + return out, out_da + + @staticmethod + def backward(ctx, dy, _): + attr, rast, tri = ctx.saved_tensors + g_attr, g_rast = _get_plugin().interpolate_grad(attr, rast, tri, dy) + return g_attr, g_rast, None + +# Op wrapper. +def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None): + """Interpolate vertex attributes. + + All input tensors must be contiguous and reside in GPU memory. The output tensors + will be contiguous and reside in GPU memory. + + Args: + attr: Attribute tensor with dtype `torch.float32`. + Shape is [num_vertices, num_attributes] in range mode, or + [minibatch_size, num_vertices, num_attributes] in instanced mode. + Broadcasting is supported along the minibatch axis. + rast: Main output tensor from `rasterize()`. + tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`. 
+ rast_db: (Optional) Tensor containing image-space derivatives of barycentrics, + i.e., the second output tensor from `rasterize()`. Enables computing + image-space derivatives of attributes. + diff_attrs: (Optional) List of attribute indices for which image-space + derivatives are to be computed. Special value 'all' is equivalent + to list [0, 1, ..., num_attributes - 1]. + + Returns: + A tuple of two tensors. The first output tensor contains interpolated + attributes and has shape [minibatch_size, height, width, num_attributes]. + If `rast_db` and `diff_attrs` were specified, the second output tensor contains + the image-space derivatives of the selected attributes and has shape + [minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the + first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc. + Otherwise, the second output tensor will be an empty tensor with shape + [minibatch_size, height, width, 0]. + """ + # Sanitize the list of pixel differential attributes. + if diff_attrs is None: + diff_attrs = [] + elif diff_attrs != 'all': + diff_attrs = np.asarray(diff_attrs, np.int32) + assert len(diff_attrs.shape) == 1 + diff_attrs = diff_attrs.tolist() + + diff_attrs_all = int(diff_attrs == 'all') + diff_attrs_list = [] if diff_attrs_all else diff_attrs + + # Check inputs. + assert all(isinstance(x, torch.Tensor) for x in (attr, rast, tri)) + if diff_attrs: + assert isinstance(rast_db, torch.Tensor) + + # Choose stub. + if diff_attrs: + return _interpolate_func_da.apply(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + else: + return _interpolate_func.apply(attr, rast, tri) + +#---------------------------------------------------------------------------- +# Texture +#---------------------------------------------------------------------------- + +# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled. 
+class _texture_func_mip(torch.autograd.Function): + @staticmethod + def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip_wrapper, filter_mode_enum, boundary_mode_enum, *mip_stack): + empty = torch.tensor([]) + if uv_da is None: + uv_da = empty + if mip_level_bias is None: + mip_level_bias = empty + if mip_wrapper is None: + mip_wrapper = _get_plugin().TextureMipWrapper() + out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + ctx.save_for_backward(tex, uv, uv_da, mip_level_bias, *mip_stack) + ctx.saved_misc = filter_mode, mip_wrapper, filter_mode_enum, boundary_mode_enum + return out + + @staticmethod + def backward(ctx, dy): + tex, uv, uv_da, mip_level_bias, *mip_stack = ctx.saved_tensors + filter_mode, mip_wrapper, filter_mode_enum, boundary_mode_enum = ctx.saved_misc + if filter_mode == 'linear-mipmap-linear': + g_tex, g_uv, g_uv_da, g_mip_level_bias, g_mip_stack = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + return (None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None) + tuple(g_mip_stack) + else: # linear-mipmap-nearest + g_tex, g_uv, g_mip_stack = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + return (None, g_tex, g_uv, None, None, None, None, None) + tuple(g_mip_stack) + +# Linear and nearest: Mipmaps disabled. 
+class _texture_func(torch.autograd.Function): + @staticmethod + def forward(ctx, filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + ctx.save_for_backward(tex, uv) + ctx.saved_misc = filter_mode, filter_mode_enum, boundary_mode_enum + return out + + @staticmethod + def backward(ctx, dy): + tex, uv = ctx.saved_tensors + filter_mode, filter_mode_enum, boundary_mode_enum = ctx.saved_misc + if filter_mode == 'linear': + g_tex, g_uv = _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return None, g_tex, g_uv, None, None + else: # nearest + g_tex = _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return None, g_tex, None, None, None + +# Op wrapper. +def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None): + """Perform texture sampling. + + All input tensors must be contiguous and reside in GPU memory. The output tensor + will be contiguous and reside in GPU memory. + + Args: + tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape + [minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures, + must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where + tex_width and tex_height are equal. Note that `boundary_mode` must also be set + to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis. + uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture, + must have shape [minibatch_size, height, width, 2]. When sampling a cube map + texture, must have shape [minibatch_size, height, width, 3]. + uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates. + Must have same shape as `uv` except for the last dimension that is to be twice + as long. 
+ mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted, + determines mip level directly. Must have shape [minibatch_size, height, width]. + mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call, or a list + of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack, + the tensors in the list must follow the same format as `tex` except for width and + height that must follow the usual rules for mipmap sizes. The base level texture + is still supplied in `tex` and must not be included in the list. Gradients of a + custom mipmap stack are not automatically propagated to base texture but the mipmap + tensors will receive gradients of their own. If a mipmap stack is not specified + but the chosen filter mode requires it, the mipmap stack is constructed internally + and discarded afterwards. + filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest', + 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto' + selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and + 'linear-mipmap-linear' when at least one of them is specified, these being + the highest-quality modes possible depending on the availability of the + image-space derivatives of the texture coordinates or direct mip level information. + boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a + cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional + part of texture coordinates. Mode 'clamp' clamps texture coordinates to the + centers of the boundary texels. Mode 'zero' virtually extends the texture with + all-zero values in all directions. + max_mip_level: If specified, limits the number of mipmaps constructed and used in mipmap-based + filter modes. + + Returns: + A tensor containing the results of the texture sampling with shape + [minibatch_size, height, width, tex_channels]. 
Cube map fetches with invalid uv coordinates + (e.g., zero vectors) output all zeros and do not propagate gradients. + """ + + # Default filter mode. + if filter_mode == 'auto': + filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear' + + # Sanitize inputs. + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + + # Check inputs. + assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor) + if 'mipmap' in filter_mode: + assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor) + + # If mipping disabled via max level=0, we may as well use simpler filtering internally. + if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']: + filter_mode = 'linear' + + # Convert filter mode to internal enumeration. + filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3} + filter_mode_enum = filter_mode_dict[filter_mode] + + # Convert boundary mode to internal enumeration. + boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3} + boundary_mode_enum = boundary_mode_dict[boundary_mode] + + # Construct a mipmap if necessary. + if 'mipmap' in filter_mode: + mip_wrapper, mip_stack = None, [] + if mip is not None: + assert isinstance(mip, (_get_plugin().TextureMipWrapper, list)) + if isinstance(mip, list): + assert all(isinstance(x, torch.Tensor) for x in mip) + mip_stack = mip + else: + mip_wrapper = mip + else: + mip_wrapper = _get_plugin().texture_construct_mip(tex, max_mip_level, boundary_mode == 'cube') + + # Choose stub. 
+ if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest': + return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip_wrapper, filter_mode_enum, boundary_mode_enum, *mip_stack) + else: + return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum) + +# Mipmap precalculation for cases where the texture stays constant. +def texture_construct_mip(tex, max_mip_level=None, cube_mode=False): + """Construct a mipmap stack for a texture. + + This function can be used for constructing a mipmap stack for a texture that is known to remain + constant. This avoids reconstructing it every time `texture()` is called. + + Args: + tex: Texture tensor with the same constraints as in `texture()`. + max_mip_level: If specified, limits the number of mipmaps constructed. + cube_mode: Must be set to True if `tex` specifies a cube map texture. + + Returns: + An opaque object containing the mipmap stack. This can be supplied in a call to `texture()` + in the `mip` argument. + """ + + assert isinstance(tex, torch.Tensor) + assert cube_mode is True or cube_mode is False + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + return _get_plugin().texture_construct_mip(tex, max_mip_level, cube_mode) + +#---------------------------------------------------------------------------- +# Antialias. 
+#---------------------------------------------------------------------------- + +class _antialias_func(torch.autograd.Function): + @staticmethod + def forward(ctx, color, rast, pos, tri, topology_hash, pos_gradient_boost): + out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, topology_hash) + ctx.save_for_backward(color, rast, pos, tri) + ctx.saved_misc = pos_gradient_boost, work_buffer + return out + + @staticmethod + def backward(ctx, dy): + color, rast, pos, tri = ctx.saved_tensors + pos_gradient_boost, work_buffer = ctx.saved_misc + g_color, g_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer) + if pos_gradient_boost != 1.0: + g_pos = g_pos * pos_gradient_boost + return g_color, None, g_pos, None, None, None + +# Op wrapper. +def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0): + """Perform antialiasing. + + All input tensors must be contiguous and reside in GPU memory. The output tensor + will be contiguous and reside in GPU memory. + + Note that silhouette edge determination is based on vertex indices in the triangle + tensor. For it to work properly, a vertex belonging to multiple triangles must be + referred to using the same vertex index in each triangle. Otherwise, nvdiffrast will always + classify the adjacent edges as silhouette edges, which leads to bad performance and + potentially incorrect gradients. If you are unsure whether your data is good, check + which pixels are modified by the antialias operation and compare to the example in the + documentation. + + Args: + color: Input image to antialias with shape [minibatch_size, height, width, num_channels]. + rast: Main output tensor from `rasterize()`. + pos: Vertex position tensor used in the rasterization operation. + tri: Triangle tensor used in the rasterization operation. + topology_hash: (Optional) Preconstructed topology hash for the triangle tensor. 
If not + specified, the topology hash is constructed internally and discarded afterwards. + pos_gradient_boost: (Optional) Multiplier for gradients propagated to `pos`. + + Returns: + A tensor containing the antialiased image with the same shape as `color` input tensor. + """ + + # Check inputs. + assert all(isinstance(x, torch.Tensor) for x in (color, rast, pos, tri)) + + # Construct topology hash unless provided by user. + if topology_hash is not None: + assert isinstance(topology_hash, _get_plugin().TopologyHashWrapper) + else: + topology_hash = _get_plugin().antialias_construct_topology_hash(tri) + + # Instantiate the function. + return _antialias_func.apply(color, rast, pos, tri, topology_hash, pos_gradient_boost) + +# Topology hash precalculation for cases where the triangle array stays constant. +def antialias_construct_topology_hash(tri): + """Construct a topology hash for a triangle tensor. + + This function can be used for constructing a topology hash for a triangle tensor that is + known to remain constant. This avoids reconstructing it every time `antialias()` is called. + + Args: + tri: Triangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in + GPU memory. + + Returns: + An opaque object containing the topology hash. This can be supplied in a call to + `antialias()` in the `topology_hash` argument. + """ + assert isinstance(tri, torch.Tensor) + return _get_plugin().antialias_construct_topology_hash(tri) + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_antialias.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_antialias.cpp new file mode 100644 index 00000000..730a200e --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_antialias.cpp @@ -0,0 +1,243 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/antialias.h" + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void AntialiasFwdMeshKernel (const AntialiasKernelParams p); +void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p); +void AntialiasFwdAnalysisKernel (const AntialiasKernelParams p); +void AntialiasGradKernel (const AntialiasKernelParams p); + +//------------------------------------------------------------------------ +// Topology hash construction. + +TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tri)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + + // Check inputs. + NVDR_CHECK_DEVICE(tri); + NVDR_CHECK_CONTIGUOUS(tri); + NVDR_CHECK_I32(tri); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Fill in kernel parameters. + p.numTriangles = tri.size(0); + p.numVertices = 0x7fffffff; // Let's not require vertex positions just to enable an error check. + p.tri = tri.data_ptr(); + + // Kernel parameters. + p.allocTriangles = 64; + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // Construct the hash tensor and get pointer. 
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA); + torch::Tensor ev_hash = torch::zeros({(uint64_t)p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles) * 4}, opts); + p.evHash = (uint4*)(ev_hash.data_ptr()); + + // Check alignment. + NVDR_CHECK(!((uintptr_t)p.evHash & 15), "ev_hash internal tensor not aligned to int4"); + + // Populate the hash. + void* args[] = {&p}; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return. + TopologyHashWrapper hash_wrap; + hash_wrap.ev_hash = ev_hash; + return hash_wrap; +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(color)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + torch::Tensor& topology_hash = topology_hash_wrap.ev_hash; // Unwrap. + + // Check inputs. + NVDR_CHECK_DEVICE(color, rast, pos, tri, topology_hash); + NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, topology_hash); + NVDR_CHECK_F32(color, rast, pos); + NVDR_CHECK_I32(tri, topology_hash); + + // Sanity checks. 
+ NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and rast inputs must have same spatial dimensions"); + if (p.instance_mode) + { + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast, pos"); + } + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast"); + } + + // Extract input dimensions. + p.numVertices = pos.size(p.instance_mode ? 1 : 0); + p.numTriangles = tri.size(0); + p.n = color.size(0); + p.height = color.size(1); + p.width = color.size(2); + p.channels = color.size(3); + + // Get input pointers. + p.color = color.data_ptr(); + p.rasterOut = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.pos = pos.data_ptr(); + p.evHash = (uint4*)(topology_hash.data_ptr()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Determine hash allocation size. + p.allocTriangles = 64; + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // Allocate output tensors. + torch::Tensor out = color.detach().clone(); // Use color as base. 
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 int for a maximum of two work items per pixel. + p.output = out.data_ptr(); + p.workBuffer = (int4*)(work_buffer.data_ptr()); + + // Clear the work counters. + NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rasterOut & 7), "raster_out input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4"); + NVDR_CHECK(!((uintptr_t)p.evHash & 15), "topology_hash internal tensor not aligned to int4"); + + // Choose launch parameters for the discontinuity finder kernel and launch. + void* args[] = {&p}; + dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream)); + + // Determine optimum block size for the persistent analysis kernel and launch. + int device = 0; + int numCTA = 0; + int numSM = 0; + NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device)); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0)); + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return results. 
+ return std::tuple(out, work_buffer); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(color)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + NVDR_CHECK_DEVICE(color, rast, pos, tri, dy, work_buffer); + NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, work_buffer); + NVDR_CHECK_F32(color, rast, pos, dy, work_buffer); + NVDR_CHECK_I32(tri); + + // Sanity checks. + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) > 0 && dy.size(2) > 0 && dy.size(3) > 0, "dy must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "raster_out must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and raster_out inputs must have same spatial dimensions"); + NVDR_CHECK(color.size(1) == dy.size(1) && color.size(2) == dy.size(2) && color.size(3) == dy.size(3), "color and dy inputs must have same dimensions"); + if (p.instance_mode) + { + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out, pos"); + 
NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0) && pos.size(0) ==color.size(0), "minibatch size mismatch between inputs dy, color, raster_out, pos"); + } + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out"); + NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0), "minibatch size mismatch between inputs dy, color, raster_out"); + } + + // Extract input dimensions. + p.numVertices = pos.size(p.instance_mode ? 1 : 0); + p.numTriangles = tri.size(0); + p.n = color.size(0); + p.height = color.size(1); + p.width = color.size(2); + p.channels = color.size(3); + + // Ensure dy is contiguous. + torch::Tensor dy_ = dy.contiguous(); + + // Get input pointers. + p.color = color.data_ptr(); + p.rasterOut = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.pos = pos.data_ptr(); + p.dy = dy_.data_ptr(); + p.workBuffer = (int4*)(work_buffer.data_ptr()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate output tensors. + torch::Tensor grad_color = dy_.detach().clone(); // Use dy as base. + torch::Tensor grad_pos = torch::zeros_like(pos); + p.gradColor = grad_color.data_ptr(); + p.gradPos = grad_pos.data_ptr(); + + // Clear gradient kernel work counter. + NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4"); + + // Determine optimum block size for the gradient kernel and launch. 
+ void* args[] = {&p}; + int device = 0; + int numCTA = 0; + int numSM = 0; + NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device)); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0)); + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return results. + return std::tuple(grad_color, grad_pos); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings.cpp new file mode 100644 index 00000000..898e17e3 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings.cpp @@ -0,0 +1,73 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include + +//------------------------------------------------------------------------ +// Op prototypes. Return type macros for readability. 
+ +#define OP_RETURN_T torch::Tensor +#define OP_RETURN_TT std::tuple +#define OP_RETURN_TTT std::tuple +#define OP_RETURN_TTTT std::tuple +#define OP_RETURN_TTV std::tuple > +#define OP_RETURN_TTTTV std::tuple > + +OP_RETURN_TT rasterize_fwd_cuda (RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx); +OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy); +OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb); +OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri); +OP_RETURN_TT interpolate_fwd_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector& diff_attrs_vec); +OP_RETURN_TT interpolate_grad (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy); +OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector& diff_attrs_vec); +TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode); +OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode); +OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode); +OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode); +OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode); +OP_RETURN_TTV texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor 
mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode); +OP_RETURN_TTTTV texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode); +TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri); +OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash); +OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer); + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + // State classes. + pybind11::class_(m, "RasterizeCRStateWrapper").def(pybind11::init()); + pybind11::class_(m, "TextureMipWrapper").def(pybind11::init<>()); + pybind11::class_(m, "TopologyHashWrapper"); + + // Plumbing to torch/c10 logging system. + m.def("get_log_level", [](void) { return FLAGS_caffe2_log_level; }, "get log level"); + m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level"); + + // Ops. 
+ m.def("rasterize_fwd_cuda", &rasterize_fwd_cuda, "rasterize forward op (cuda)"); + m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients"); + m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients"); + m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op with attribute derivatives"); + m.def("interpolate_fwd_da", &interpolate_fwd_da, "interpolate forward op without attribute derivatives"); + m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op with attribute derivatives"); + m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op without attribute derivatives"); + m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction"); + m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping"); + m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping"); + m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode"); + m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode"); + m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode"); + m.def("texture_grad_linear_mipmap_linear", &texture_grad_linear_mipmap_linear, "texture gradient op in linear-mipmap-linear mode"); + m.def("antialias_construct_topology_hash", &antialias_construct_topology_hash, "antialias topology hash construction"); + m.def("antialias_fwd", &antialias_fwd, "antialias forward op"); + m.def("antialias_grad", &antialias_grad, "antialias gradient op"); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings_gl.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings_gl.cpp new file mode 100644 index 00000000..5363e802 --- /dev/null 
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_bindings_gl.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include + +//------------------------------------------------------------------------ +// Op prototypes. + +std::tuple rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx); + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + // State classes. + pybind11::class_(m, "RasterizeGLStateWrapper").def(pybind11::init()) + .def("set_context", &RasterizeGLStateWrapper::setContext) + .def("release_context", &RasterizeGLStateWrapper::releaseContext); + + // Ops. + m.def("rasterize_fwd_gl", &rasterize_fwd_gl, "rasterize forward op (opengl)"); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_common.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_common.inl new file mode 100644 index 00000000..74dea415 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_common.inl @@ -0,0 +1,29 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "../common/framework.h" + +//------------------------------------------------------------------------ +// Input check helpers. +//------------------------------------------------------------------------ + +#ifdef _MSC_VER +#define __func__ __FUNCTION__ +#endif + +#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device") } while(0) +#define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0) +#define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0) +#define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0) +#define NVDR_CHECK_I32(...) 
do { nvdr_check_i32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be int32 tensors"); } while(0) +inline void nvdr_check_cpu(at::ArrayRef ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.device().type() == c10::DeviceType::CPU, func, err_msg); } +inline void nvdr_check_contiguous(at::ArrayRef ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.is_contiguous(), func, err_msg); } +inline void nvdr_check_f32(at::ArrayRef ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kFloat32, func, err_msg); } +inline void nvdr_check_i32(at::ArrayRef ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kInt32, func, err_msg); } +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_interpolate.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_interpolate.cpp new file mode 100644 index 00000000..b2c99fcc --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_interpolate.cpp @@ -0,0 +1,250 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "../common/common.h" +#include "../common/interpolate.h" + +//------------------------------------------------------------------------ +// Kernel prototypes. 
+ +void InterpolateFwdKernel (const InterpolateKernelParams p); +void InterpolateFwdKernelDa (const InterpolateKernelParams p); +void InterpolateGradKernel (const InterpolateKernelParams p); +void InterpolateGradKernelDa(const InterpolateKernelParams p); + +//------------------------------------------------------------------------ +// Helper + +static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + if (diff_attrs_all) + { + p.numDiffAttr = p.numAttr; + p.diff_attrs_all = 1; + } + else + { + NVDR_CHECK(diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, "too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)"); + p.numDiffAttr = diff_attrs_vec.size(); + memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int)); + } +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(attr)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + InterpolateKernelParams p = {}; // Initialize all fields to zero. + bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty()); + p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + if (enable_da) + { + NVDR_CHECK_DEVICE(attr, rast, tri, rast_db); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db); + NVDR_CHECK_F32(attr, rast, rast_db); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(attr, rast, tri); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri); + NVDR_CHECK_F32(attr, rast); + NVDR_CHECK_I32(tri); + } + + // Sanity checks. 
+ NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK( tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]"); + if (p.instance_mode) + NVDR_CHECK(attr.size(0) == rast.size(0) || attr.size(0) == 1, "minibatch size mismatch between inputs rast, attr"); + if (enable_da) + { + NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, "rast_db must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db"); + NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db"); + } + + // Extract input dimensions. + p.numVertices = attr.size(p.instance_mode ? 1 : 0); + p.numAttr = attr.size(p.instance_mode ? 2 : 1); + p.numTriangles = tri.size(0); + p.height = rast.size(1); + p.width = rast.size(2); + p.depth = rast.size(0); + + // Set attribute pixel differential info if enabled, otherwise leave as zero. + if (enable_da) + set_diff_attrs(p, diff_attrs_all, diff_attrs_vec); + else + p.numDiffAttr = 0; + + // Get input pointers. + p.attr = attr.data_ptr(); + p.rast = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.rastDB = enable_da ? rast_db.data_ptr() : NULL; + p.attrBC = (p.instance_mode && attr.size(0) == 1) ? 1 : 0; + + // Allocate output tensors. 
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({p.depth, p.height, p.width, p.numAttr}, opts); + torch::Tensor out_da = torch::empty({p.depth, p.height, p.width, p.numDiffAttr * 2}, opts); + + p.out = out.data_ptr(); + p.outDA = enable_da ? out_da.data_ptr() : NULL; + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.outDA & 7), "out_da output tensor not aligned to float2"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_da ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return results. + return std::tuple(out, out_da); +} + +// Version without derivatives. +std::tuple interpolate_fwd(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri) +{ + std::vector empty_vec; + torch::Tensor empty_tensor; + return interpolate_fwd_da(attr, rast, tri, empty_tensor, false, empty_vec); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(attr)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + InterpolateKernelParams p = {}; // Initialize all fields to zero. 
+ bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty()); + p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + if (enable_da) + { + NVDR_CHECK_DEVICE(attr, rast, tri, dy, rast_db, dda); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db); + NVDR_CHECK_F32(attr, rast, dy, rast_db, dda); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(attr, rast, tri, dy); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri); + NVDR_CHECK_F32(attr, rast, dy); + NVDR_CHECK_I32(tri); + } + + // Depth of attributes. + int attr_depth = p.instance_mode ? (attr.sizes().size() > 1 ? attr.size(0) : 0) : 1; + + // Sanity checks. + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]"); + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) == rast.size(1) && dy.size(2) == rast.size(2) && dy.size(3) > 0, "dy must have shape [>0, height, width, >0]"); + NVDR_CHECK(dy.size(3) == attr.size(attr.sizes().size() - 1), "argument count mismatch between inputs dy, attr"); + NVDR_CHECK((attr_depth == rast.size(0) || attr_depth == 1) && dy.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, dy, attr"); + if (enable_da) + { + NVDR_CHECK(dda.sizes().size() == 4 && dda.size(0) > 0 && dda.size(1) == rast.size(1) && dda.size(2) == rast.size(2), "dda must have shape [>0, height, width, ?]"); + NVDR_CHECK(dda.size(0) == rast.size(0), "minibatch size mismatch between rast, dda"); + NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, 
"rast_db must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db"); + NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db"); + } + + // Extract input dimensions. + p.numVertices = attr.size(p.instance_mode ? 1 : 0); + p.numAttr = attr.size(p.instance_mode ? 2 : 1); + p.numTriangles = tri.size(0); + p.height = rast.size(1); + p.width = rast.size(2); + p.depth = rast.size(0); + + // Ensure gradients are contiguous. + torch::Tensor dy_ = dy.contiguous(); + torch::Tensor dda_; + if (enable_da) + dda_ = dda.contiguous(); + + // Set attribute pixel differential info if enabled, otherwise leave as zero. + if (enable_da) + set_diff_attrs(p, diff_attrs_all, diff_attrs_vec); + else + p.numDiffAttr = 0; + + // Get input pointers. + p.attr = attr.data_ptr(); + p.rast = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.dy = dy_.data_ptr(); + p.rastDB = enable_da ? rast_db.data_ptr() : NULL; + p.dda = enable_da ? dda_.data_ptr() : NULL; + p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0; + + // Allocate output tensors. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor gradAttr = torch::zeros_like(attr); + torch::Tensor gradRaster = torch::empty_like(rast); + torch::Tensor gradRasterDB; + if (enable_da) + gradRasterDB = torch::empty_like(rast_db); + + p.gradAttr = gradAttr.data_ptr(); + p.gradRaster = gradRaster.data_ptr(); + p.gradRasterDB = enable_da ? gradRasterDB.data_ptr() : NULL; + + // Verify that buffers are aligned to allow float2/float4 operations. 
+ NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.dda & 7), "dda input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradRaster & 15), "grad_rast output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradRasterDB & 15), "grad_rast_db output tensor not aligned to float4"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_da ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return results. + return std::tuple(gradAttr, gradRaster, gradRasterDB); +} + +// Version without derivatives. +std::tuple interpolate_grad(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy) +{ + std::vector empty_vec; + torch::Tensor empty_tensor; + std::tuple result = interpolate_grad_da(attr, rast, tri, dy, empty_tensor, empty_tensor, false, empty_vec); + return std::tuple(std::get<0>(result), std::get<1>(result)); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize.cpp new file mode 100644 index 00000000..589e227a --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize.cpp @@ -0,0 +1,265 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/rasterize.h" +#include "../common/cudaraster/CudaRaster.hpp" +#include "../common/cudaraster/impl/Constants.hpp" +#include + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p); +void RasterizeGradKernel(const RasterizeGradParams p); +void RasterizeGradKernelDb(const RasterizeGradParams p); + +//------------------------------------------------------------------------ +// Python CudaRaster state wrapper methods. + +RasterizeCRStateWrapper::RasterizeCRStateWrapper(int cudaDeviceIdx_) +{ + const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx_); + cudaDeviceIdx = cudaDeviceIdx_; + cr = new CR::CudaRaster(); +} + +RasterizeCRStateWrapper::~RasterizeCRStateWrapper(void) +{ + const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx); + delete cr; +} + +//------------------------------------------------------------------------ +// Forward op (Cuda). + +std::tuple rasterize_fwd_cuda(RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + CR::CudaRaster* cr = stateWrapper.cr; + + // Check inputs. 
+ NVDR_CHECK_DEVICE(pos, tri); + NVDR_CHECK_CPU(ranges); + NVDR_CHECK_CONTIGUOUS(pos, tri, ranges); + NVDR_CHECK_F32(pos); + NVDR_CHECK_I32(tri, ranges); + + // Check that CudaRaster context was created for the correct GPU. + NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "CudaRaster context must must reside on the same device as input tensors"); + + // Determine instance mode and check input dimensions. + bool instance_mode = pos.sizes().size() > 2; + if (instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]"); + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]"); + NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]"); + } + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Get output shape. + int height_out = std::get<0>(resolution); + int width_out = std::get<1>(resolution); + int depth = instance_mode ? pos.size(0) : ranges.size(0); // Depth of tensor, not related to depth buffering. + NVDR_CHECK(height_out > 0 && width_out > 0, "resolution must be [>0, >0]"); + + // Round internal resolution up to tile size. + int height = (height_out + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + int width = (width_out + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + + // Get position and triangle buffer sizes in vertices / triangles. + int posCount = instance_mode ? pos.size(1) : pos.size(0); + int triCount = tri.size(0); + + // Set up CudaRaster buffers. + const float* posPtr = pos.data_ptr(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr(); // This is in CPU memory. 
+ const int32_t* triPtr = tri.data_ptr(); + cr->setVertexBuffer((void*)posPtr, posCount); + cr->setIndexBuffer((void*)triPtr, triCount); + cr->setBufferSize(width_out, height_out, depth); + + // Enable depth peeling? + bool enablePeel = (peeling_idx > 0); + cr->setRenderModeFlags(enablePeel ? CR::CudaRaster::RenderModeFlag_EnableDepthPeeling : 0); // No backface culling. + if (enablePeel) + cr->swapDepthAndPeel(); // Use previous depth buffer as peeling depth input. + + // Determine viewport tiling. + int tileCountX = (width + CR_MAXVIEWPORT_SIZE - 1) / CR_MAXVIEWPORT_SIZE; + int tileCountY = (height + CR_MAXVIEWPORT_SIZE - 1) / CR_MAXVIEWPORT_SIZE; + int tileSizeX = ((width + tileCountX - 1) / tileCountX + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + int tileSizeY = ((height + tileCountY - 1) / tileCountY + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + TORCH_CHECK(tileCountX > 0 && tileCountY > 0 && tileSizeX > 0 && tileSizeY > 0, "internal error in tile size calculation: count or size is zero"); + TORCH_CHECK(tileSizeX <= CR_MAXVIEWPORT_SIZE && tileSizeY <= CR_MAXVIEWPORT_SIZE, "internal error in tile size calculation: tile larger than allowed"); + TORCH_CHECK((tileSizeX & (CR_TILE_SIZE - 1)) == 0 && (tileSizeY & (CR_TILE_SIZE - 1)) == 0, "internal error in tile size calculation: tile not divisible by ", CR_TILE_SIZE); + TORCH_CHECK(tileCountX * tileSizeX >= width && tileCountY * tileSizeY >= height, "internal error in tile size calculation: tiles do not cover viewport"); + + // Rasterize in tiles. + for (int tileY = 0; tileY < tileCountY; tileY++) + for (int tileX = 0; tileX < tileCountX; tileX++) + { + // Set CudaRaster viewport according to tile. + int offsetX = tileX * tileSizeX; + int offsetY = tileY * tileSizeY; + int sizeX = (width_out - offsetX) < tileSizeX ? (width_out - offsetX) : tileSizeX; + int sizeY = (height_out - offsetY) < tileSizeY ? (height_out - offsetY) : tileSizeY; + cr->setViewport(sizeX, sizeY, offsetX, offsetY); + + // Run all triangles in one batch. 
In case of error, the workload could be split into smaller batches - maybe do that in the future. + // Only enable peeling-specific optimizations to skip first stages when image fits in one tile. Those are not valid otherwise. + cr->deferredClear(0u); + bool success = cr->drawTriangles(rangesPtr, enablePeel && (tileCountX == 1 && tileCountY == 1), stream); + NVDR_CHECK(success, "subtriangle count overflow"); + } + + // Allocate output tensors. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({depth, height_out, width_out, 4}, opts); + torch::Tensor out_db = torch::empty({depth, height_out, width_out, 4}, opts); + + // Populate pixel shader kernel parameters. + RasterizeCudaFwdShaderParams p; + p.pos = posPtr; + p.tri = triPtr; + p.in_idx = (const int*)cr->getColorBuffer(); + p.out = out.data_ptr(); + p.out_db = out_db.data_ptr(); + p.numTriangles = triCount; + p.numVertices = posCount; + p.width_in = width; + p.height_in = height; + p.width_out = width_out; + p.height_out = height_out; + p.depth = depth; + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + p.xs = 2.f / (float)width_out; + p.xo = 1.f / (float)width_out - 1.f; + p.ys = 2.f / (float)height_out; + p.yo = 1.f / (float)height_out - 1.f; + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.out_db & 15), "out_db output tensor not aligned to float4"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH, RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT, p.width_out, p.height_out); + dim3 gridSize = getLaunchGridSize(blockSize, p.width_out, p.height_out, p.depth); + + // Launch CUDA kernel. 
+ void* args[] = {&p}; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)RasterizeCudaFwdShaderKernel, gridSize, blockSize, args, 0, stream)); + + // Return. + return std::tuple(out, out_db); +} + +//------------------------------------------------------------------------ +// Gradient op. + +torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + RasterizeGradParams p; + bool enable_db = ddb.defined(); + + // Check inputs. + if (enable_db) + { + NVDR_CHECK_DEVICE(pos, tri, out, dy, ddb); + NVDR_CHECK_CONTIGUOUS(pos, tri, out); + NVDR_CHECK_F32(pos, out, dy, ddb); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(pos, tri, out, dy); + NVDR_CHECK_CONTIGUOUS(pos, tri, out); + NVDR_CHECK_F32(pos, out, dy); + NVDR_CHECK_I32(tri); + } + + // Determine instance mode. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + + // Shape is taken from the rasterizer output tensor. + NVDR_CHECK(out.sizes().size() == 4, "tensor out must be rank-4"); + p.depth = out.size(0); + p.height = out.size(1); + p.width = out.size(2); + NVDR_CHECK(p.depth > 0 && p.height > 0 && p.width > 0, "resolution must be [>0, >0, >0]"); + + // Check other shapes. 
+ if (p.instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) == p.depth && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [depth, >0, 4]"); + else + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(out.sizes().size() == 4 && out.size(0) == p.depth && out.size(1) == p.height && out.size(2) == p.width && out.size(3) == 4, "out must have shape [depth, height, width, 4]"); + NVDR_CHECK( dy.sizes().size() == 4 && dy.size(0) == p.depth && dy.size(1) == p.height && dy.size(2) == p.width && dy.size(3) == 4, "dy must have shape [depth, height, width, 4]"); + if (enable_db) + NVDR_CHECK(ddb.sizes().size() == 4 && ddb.size(0) == p.depth && ddb.size(1) == p.height && ddb.size(2) == p.width && ddb.size(3) == 4, "ddb must have shape [depth, height, width, 4]"); + + // Ensure gradients are contiguous. + torch::Tensor dy_ = dy.contiguous(); + torch::Tensor ddb_; + if (enable_db) + ddb_ = ddb.contiguous(); + + // Populate parameters. + p.numTriangles = tri.size(0); + p.numVertices = p.instance_mode ? pos.size(1) : pos.size(0); + p.pos = pos.data_ptr(); + p.tri = tri.data_ptr(); + p.out = out.data_ptr(); + p.dy = dy_.data_ptr(); + p.ddb = enable_db ? ddb_.data_ptr() : NULL; + + // Set up pixel position to clip space x, y transform. + p.xs = 2.f / (float)p.width; + p.xo = 1.f / (float)p.width - 1.f; + p.ys = 2.f / (float)p.height; + p.yo = 1.f / (float)p.height - 1.f; + + // Allocate output tensor for position gradients. + torch::Tensor grad = torch::zeros_like(pos); + p.grad = grad.data_ptr(); + + // Verify that buffers are aligned to allow float2/float4 operations. 
+ NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.ddb & 15), "ddb input tensor not aligned to float4"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_db ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return the gradients. + return grad; +} + +// Version without derivatives. +torch::Tensor rasterize_grad(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy) +{ + torch::Tensor empty_tensor; + return rasterize_grad_db(pos, tri, out, dy, empty_tensor); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize_gl.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize_gl.cpp new file mode 100644 index 00000000..3776134a --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_rasterize_gl.cpp @@ -0,0 +1,132 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/rasterize_gl.h" +#include + +//------------------------------------------------------------------------ +// Python GL state wrapper methods. + +RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_) +{ + pState = new RasterizeGLState(); + automatic = automatic_; + cudaDeviceIdx = cudaDeviceIdx_; + memset(pState, 0, sizeof(RasterizeGLState)); + pState->enableDB = enableDB ? 1 : 0; + rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_); + releaseGLContext(); +} + +RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void) +{ + setGLContext(pState->glctx); + rasterizeReleaseBuffers(NVDR_CTX_PARAMS, *pState); + releaseGLContext(); + destroyGLContext(pState->glctx); + delete pState; +} + +void RasterizeGLStateWrapper::setContext(void) +{ + setGLContext(pState->glctx); +} + +void RasterizeGLStateWrapper::releaseContext(void) +{ + releaseGLContext(); +} + +//------------------------------------------------------------------------ +// Forward op (OpenGL). + +std::tuple rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + RasterizeGLState& s = *stateWrapper.pState; + + // Check inputs. + NVDR_CHECK_DEVICE(pos, tri); + NVDR_CHECK_CPU(ranges); + NVDR_CHECK_CONTIGUOUS(pos, tri, ranges); + NVDR_CHECK_F32(pos); + NVDR_CHECK_I32(tri, ranges); + + // Check that GL context was created for the correct GPU. + NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors"); + + // Determine number of outputs + int num_outputs = s.enableDB ? 2 : 1; + + // Determine instance mode and check input dimensions. 
+ bool instance_mode = pos.sizes().size() > 2; + if (instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]"); + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]"); + NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]"); + } + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Get output shape. + int height = std::get<0>(resolution); + int width = std::get<1>(resolution); + int depth = instance_mode ? pos.size(0) : ranges.size(0); + NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]"); + + // Get position and triangle buffer sizes in int32/float32. + int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1); + int triCount = 3 * tri.size(0); + + // Set the GL context unless manual context. + if (stateWrapper.automatic) + setGLContext(s.glctx); + + // Resize all buffers. + bool changes = false; + rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, changes, posCount, triCount, width, height, depth); + if (changes) + { +#ifdef _WIN32 + // Workaround for occasional blank first frame on Windows. + releaseGLContext(); + setGLContext(s.glctx); +#endif + } + + // Copy input data to GL and render. + const float* posPtr = pos.data_ptr(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr(); // This is in CPU memory. + const int32_t* triPtr = tri.data_ptr(); + int vtxPerInstance = instance_mode ? pos.size(1) : 0; + rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx); + + // Allocate output tensors. 
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({depth, height, width, 4}, opts); + torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts); + float* outputPtr[2]; + outputPtr[0] = out.data_ptr(); + outputPtr[1] = s.enableDB ? out_db.data_ptr() : NULL; + + // Copy rasterized results into CUDA buffers. + rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth); + + // Done. Release GL context and return. + if (stateWrapper.automatic) + releaseGLContext(); + + return std::tuple(out, out_db); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_texture.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_texture.cpp new file mode 100644 index 00000000..2257f566 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_texture.cpp @@ -0,0 +1,718 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/texture.h" +#include + +//------------------------------------------------------------------------ +// Kernel prototypes. 
+ +void MipBuildKernel1 (const TextureKernelParams p); +void MipBuildKernel2 (const TextureKernelParams p); +void MipBuildKernel4 (const TextureKernelParams p); +void TextureFwdKernelNearest1 (const TextureKernelParams p); +void TextureFwdKernelNearest2 (const TextureKernelParams p); +void TextureFwdKernelNearest4 (const TextureKernelParams p); +void TextureFwdKernelLinear1 (const TextureKernelParams p); +void TextureFwdKernelLinear2 (const TextureKernelParams p); +void TextureFwdKernelLinear4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest1 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest2 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p); +void 
TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p); +void MipGradKernel1 (const TextureKernelParams p); +void MipGradKernel2 (const TextureKernelParams p); +void MipGradKernel4 (const TextureKernelParams p); +void TextureGradKernelNearest (const TextureKernelParams p); +void TextureGradKernelLinear (const TextureKernelParams p); +void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p); +void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p); +void TextureGradKernelCubeNearest (const TextureKernelParams p); +void TextureGradKernelCubeLinear (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p); +void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p); +void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p); + +//------------------------------------------------------------------------ +// Modeselektor. 
+ +static void set_modes(TextureKernelParams& p, int filter_mode, int boundary_mode, int max_mip_level) +{ + // Mip and filter modes. + p.filterMode = filter_mode; + NVDR_CHECK(p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, "filter_mode unsupported"); + p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); + + // Mip level clamp. + if (p.enableMip) + { + p.mipLevelLimit = max_mip_level; + NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level"); + } + + // Boundary mode. + p.boundaryMode = boundary_mode; + NVDR_CHECK(p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, "boundary_mode unsupported"); +} + +//------------------------------------------------------------------------ +// Mipmap construction. + +TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bool cube_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + p.mipLevelLimit = max_mip_level; + p.boundaryMode = cube_mode ? TEX_BOUNDARY_MODE_CUBE : TEX_BOUNDARY_MODE_WRAP; + NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level"); + + // Check inputs. + NVDR_CHECK_DEVICE(tex); + NVDR_CHECK_CONTIGUOUS(tex); + NVDR_CHECK_F32(tex); + + // Populate parameters and sanity check tex shape. + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + } + p.texDepth = tex.size(0); + p.texHeight = tex.size(cube_mode ? 2 : 1); + p.texWidth = tex.size(cube_mode ? 
3 : 2); + p.channels = tex.size(cube_mode ? 4 : 3); + + // Set texture pointer. + p.tex[0] = tex.data_ptr(); + + // Generate mip offsets and calculate total size. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + + // Allocate and set mip tensor. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor mip = torch::empty({mipTotal}, opts); + float* pmip = mip.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Build mip levels. + for (int i=1; i <= p.mipLevelMax; i++) + { + int2 ms = mipLevelSize(p, i); + int3 sz = make_int3(ms.x, ms.y, p.texDepth); + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); + dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); + p.mipLevelOut = i; + + void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); + } + + // Return the mip tensor in a wrapper. + TextureMipWrapper mip_wrapper; + mip_wrapper.mip = mip; + mip_wrapper.max_mip_level = max_mip_level; + mip_wrapper.texture_size = tex.sizes().vec(); + mip_wrapper.cube_mode = cube_mode; + return mip_wrapper; +} + +//------------------------------------------------------------------------ +// Forward op. 
+ +torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + bool has_mip_stack = (mip_stack.size() > 0); + torch::Tensor& mip_w = mip_wrapper.mip; // Unwrap. + int max_mip_level = has_mip_stack ? mip_stack.size() : mip_wrapper.max_mip_level; + set_modes(p, filter_mode, boundary_mode, max_mip_level); + + // See if we have these tensors or not. + bool has_uv_da = uv_da.defined() && uv_da.nbytes(); + bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes(); + + if (p.enableMip) + { + NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input"); + NVDR_CHECK(has_mip_stack || mip_w.defined(), "mipmapping filter mode requires mip wrapper or mip stack input"); + } + + // Check inputs. + NVDR_CHECK_DEVICE(tex, uv); + NVDR_CHECK_CONTIGUOUS(tex, uv); + NVDR_CHECK_F32(tex, uv); + if (p.enableMip) + { + if (has_mip_stack) + { + TORCH_CHECK(at::cuda::check_device(mip_stack), __func__, "(): Mip stack inputs must reside on the correct GPU device"); + nvdr_check_contiguous(mip_stack, __func__, "(): Mip stack inputs must be contiguous tensors"); + nvdr_check_f32(mip_stack, __func__, "(): Mip stack inputs must be float32 tensors"); + } + else + { + NVDR_CHECK_DEVICE(mip_w); + NVDR_CHECK_CONTIGUOUS(mip_w); + NVDR_CHECK_F32(mip_w); + } + if (has_uv_da) + { + NVDR_CHECK_DEVICE(uv_da); + NVDR_CHECK_CONTIGUOUS(uv_da); + NVDR_CHECK_F32(uv_da); + } + if (has_mip_level_bias) + { + NVDR_CHECK_DEVICE(mip_level_bias); + NVDR_CHECK_CONTIGUOUS(mip_level_bias); + NVDR_CHECK_F32(mip_level_bias); + } + } + + // Sanity checks and state setters. 
+ bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE); + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]"); + p.texHeight = tex.size(1); + p.texWidth = tex.size(2); + p.channels = tex.size(3); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + p.texHeight = tex.size(2); + p.texWidth = tex.size(3); + p.channels = tex.size(4); + } + NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv"); + NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large"); + p.n = uv.size(0); + p.imgHeight = uv.size(1); + p.imgWidth = uv.size(2); + p.texDepth = tex.size(0); + if (p.enableMip) + { + if (has_uv_da) + { + if (!cube_mode) + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]"); + else + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"); + } + if (has_mip_level_bias) + NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && 
mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]"); + } + + // Get input pointers. + p.tex[0] = tex.data_ptr(); + p.uv = uv.data_ptr(); + p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr() : NULL; + p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr() : NULL; + + // Allocate output tensor. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({p.n, p.imgHeight, p.imgWidth, p.channels}, opts); + p.out = out.data_ptr(); + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + float* pmip = 0; + if (p.enableMip) + { + if (has_mip_stack) + { + // Custom mip stack supplied. Check that sizes match and assign. + p.mipLevelMax = max_mip_level; + for (int i=1; i <= p.mipLevelMax; i++) + { + torch::Tensor& t = mip_stack[i-1]; + int2 sz = mipLevelSize(p, i); + if (!cube_mode) + NVDR_CHECK(t.sizes().size() == 4 && t.size(0) == tex.size(0) && t.size(1) == sz.y && t.size(2) == sz.x && t.size(3) == p.channels, "mip level size mismatch in custom mip stack"); + else + NVDR_CHECK(t.sizes().size() == 5 && t.size(0) == tex.size(0) && t.size(1) == 6 && t.size(2) == sz.y && t.size(3) == sz.x && t.size(4) == p.channels, "mip level size mismatch in mip stack"); + if (sz.x == 1 && sz.y == 1) + NVDR_CHECK(i == p.mipLevelMax, "mip level size mismatch in mip stack"); + p.tex[i] = t.data_ptr(); + } + } + else + { + // Generate mip offsets, check mipmap size, and set mip data pointer. 
+ int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + NVDR_CHECK(tex.sizes() == mip_wrapper.texture_size && cube_mode == mip_wrapper.cube_mode, "mip does not match texture size"); + NVDR_CHECK(mip_w.sizes().size() == 1 && mip_w.size(0) == mipTotal, "wrapped mip tensor size mismatch"); + pmip = mip_w.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2"); + if ((p.channels & 3) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4"); + } + if ((p.channels & 1) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.out & 7), "out output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2"); + } + if (!cube_mode) + NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4"); + else + NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2"); + + // Choose launch parameters for texture lookup kernel. + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + // Choose kernel based on filter mode, cube mode, bias-only mode, and datatype. 
+ void* func_tbl[TEX_MODE_COUNT * 2 * 2 * 3] = { + (void*)TextureFwdKernelNearest1, + (void*)TextureFwdKernelNearest2, + (void*)TextureFwdKernelNearest4, + (void*)TextureFwdKernelLinear1, + (void*)TextureFwdKernelLinear2, + (void*)TextureFwdKernelLinear4, + (void*)TextureFwdKernelLinearMipmapNearest1, + (void*)TextureFwdKernelLinearMipmapNearest2, + (void*)TextureFwdKernelLinearMipmapNearest4, + (void*)TextureFwdKernelLinearMipmapLinear1, + (void*)TextureFwdKernelLinearMipmapLinear2, + (void*)TextureFwdKernelLinearMipmapLinear4, + (void*)TextureFwdKernelCubeNearest1, + (void*)TextureFwdKernelCubeNearest2, + (void*)TextureFwdKernelCubeNearest4, + (void*)TextureFwdKernelCubeLinear1, + (void*)TextureFwdKernelCubeLinear2, + (void*)TextureFwdKernelCubeLinear4, + (void*)TextureFwdKernelCubeLinearMipmapNearest1, + (void*)TextureFwdKernelCubeLinearMipmapNearest2, + (void*)TextureFwdKernelCubeLinearMipmapNearest4, + (void*)TextureFwdKernelCubeLinearMipmapLinear1, + (void*)TextureFwdKernelCubeLinearMipmapLinear2, + (void*)TextureFwdKernelCubeLinearMipmapLinear4, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + (void*)TextureFwdKernelLinearMipmapNearestBO1, + (void*)TextureFwdKernelLinearMipmapNearestBO2, + (void*)TextureFwdKernelLinearMipmapNearestBO4, + (void*)TextureFwdKernelLinearMipmapLinearBO1, + (void*)TextureFwdKernelLinearMipmapLinearBO2, + (void*)TextureFwdKernelLinearMipmapLinearBO4, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO1, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO2, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO4, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO1, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO2, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO4, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; // Cube variant. + if (p.enableMip && !has_uv_da) + func_idx += TEX_MODE_COUNT * 2; // Bias-only variant. 
+ func_idx = func_idx * 3 + channel_div_idx; // Choose vector size. + + // Launch kernel. + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Return output tensor. + return out; +} + +// Version without mipmaps. +torch::Tensor texture_fwd(torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + return texture_fwd_mip(tex, uv, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple > texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + bool has_mip_stack = (mip_stack.size() > 0); + torch::Tensor& mip_w = mip_wrapper.mip; // Unwrap. + int max_mip_level = has_mip_stack ? mip_stack.size() : mip_wrapper.max_mip_level; + set_modes(p, filter_mode, boundary_mode, max_mip_level); + + // See if we have these tensors or not. + bool has_uv_da = uv_da.defined() && uv_da.nbytes(); + bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes(); + + if (p.enableMip) + { + NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input"); + NVDR_CHECK(has_mip_stack || mip_w.defined(), "mipmapping filter mode requires mip wrapper or mip stack input"); + } + + // Check inputs. 
+ NVDR_CHECK_DEVICE(tex, uv); + NVDR_CHECK_CONTIGUOUS(tex, uv); + NVDR_CHECK_F32(tex, uv); + if (p.enableMip) + { + if (has_mip_stack) + { + TORCH_CHECK(at::cuda::check_device(mip_stack), __func__, "(): Mip stack inputs must reside on the correct GPU device"); + nvdr_check_contiguous(mip_stack, __func__, "(): Mip stack inputs must be contiguous tensors"); + nvdr_check_f32(mip_stack, __func__, "(): Mip stack inputs must be float32 tensors"); + } + else + { + NVDR_CHECK_DEVICE(mip_w); + NVDR_CHECK_CONTIGUOUS(mip_w); + NVDR_CHECK_F32(mip_w); + } + if (has_uv_da) + { + NVDR_CHECK_DEVICE(uv_da); + NVDR_CHECK_CONTIGUOUS(uv_da); + NVDR_CHECK_F32(uv_da); + } + if (has_mip_level_bias) + { + NVDR_CHECK_DEVICE(mip_level_bias); + NVDR_CHECK_CONTIGUOUS(mip_level_bias); + NVDR_CHECK_F32(mip_level_bias); + } + } + + // Sanity checks and state setters. + bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE); + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]"); + p.texHeight = tex.size(1); + p.texWidth = tex.size(2); + p.channels = tex.size(3); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + p.texHeight = tex.size(2); + p.texWidth = tex.size(3); + p.channels = tex.size(4); + } + NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv"); + 
NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large"); + p.n = uv.size(0); + p.imgHeight = uv.size(1); + p.imgWidth = uv.size(2); + p.texDepth = tex.size(0); + if (p.enableMip) + { + if (has_uv_da) + { + if (!cube_mode) + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]"); + else + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"); + } + if (has_mip_level_bias) + NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]"); + } + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) == p.n && dy.size(1) == p.imgHeight && dy.size(2) == p.imgWidth && dy.size(3) == p.channels, "dy must have shape [minibatch_size, height, width, channels]"); + + // Get contiguous version of dy. + torch::Tensor dy_ = dy.contiguous(); + + // Get input pointers. + p.tex[0] = tex.data_ptr(); + p.uv = uv.data_ptr(); + p.dy = dy_.data_ptr(); + p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr() : NULL; + p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr() : NULL; + + // Allocate output tensor for tex gradient. + torch::Tensor grad_tex = torch::zeros_like(tex); + p.gradTex[0] = grad_tex.data_ptr(); + + // Allocate output tensor for uv gradient. + torch::Tensor grad_uv; + torch::Tensor grad_uv_da; + torch::Tensor grad_mip_level_bias; + if (p.filterMode != TEX_MODE_NEAREST) + { + grad_uv = torch::empty_like(uv); + p.gradUV = grad_uv.data_ptr(); + + // Gradients for things affecting mip level. 
+ if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + // Allocate output tensor for uv_da gradient. + if (has_uv_da) + { + grad_uv_da = torch::empty_like(uv_da); + p.gradUVDA = grad_uv_da.data_ptr(); + } + + // Allocate output tensor for mip_level_bias gradient. + if (has_mip_level_bias) + { + grad_mip_level_bias = torch::empty_like(mip_level_bias); + p.gradMipLevelBias = grad_mip_level_bias.data_ptr(); + } + } + } + + // Choose kernel variants based on channel count. + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + torch::Tensor grad_mip; + std::vector grad_mip_stack; + float* pmip = 0; + float* pgradMip = 0; + if (p.enableMip) + { + if (has_mip_stack) + { + // Custom mip stack supplied. Check that sizes match, assign, construct gradient tensors. + p.mipLevelMax = max_mip_level; + for (int i=1; i <= p.mipLevelMax; i++) + { + torch::Tensor& t = mip_stack[i-1]; + int2 sz = mipLevelSize(p, i); + if (!cube_mode) + NVDR_CHECK(t.sizes().size() == 4 && t.size(0) == tex.size(0) && t.size(1) == sz.y && t.size(2) == sz.x && t.size(3) == p.channels, "mip level size mismatch in mip stack"); + else + NVDR_CHECK(t.sizes().size() == 5 && t.size(0) == tex.size(0) && t.size(1) == 6 && t.size(2) == sz.y && t.size(3) == sz.x && t.size(4) == p.channels, "mip level size mismatch in mip stack"); + if (sz.x == 1 && sz.y == 1) + NVDR_CHECK(i == p.mipLevelMax, "mip level size mismatch in mip stack"); + + torch::Tensor g = torch::zeros_like(t); + grad_mip_stack.push_back(g); + + p.tex[i] = t.data_ptr(); + p.gradTex[i] = g.data_ptr(); + } + } + else + { + // Generate mip offsets and get space for temporary mip gradients. 
+ int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + NVDR_CHECK(tex.sizes() == mip_wrapper.texture_size && cube_mode == mip_wrapper.cube_mode, "mip does not match texture size"); + NVDR_CHECK(mip_w.sizes().size() == 1 && mip_w.size(0) == mipTotal, "mip tensor size mismatch"); + grad_mip = torch::zeros_like(mip_w); + pmip = (float*)mip_w.data_ptr(); + pgradMip = grad_mip.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + { + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients. + } + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + { + NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradUV & 7), "grad_uv output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradUVDA & 15), "grad_uv_da output tensor not aligned to float4"); + } + else + { + NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradUVDA & 7), "grad_uv_da output tensor not aligned to float2"); + } + if ((p.channels & 3) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + { + NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradTex[i] & 15), "grad_tex output tensor not aligned to float4"); + } + NVDR_CHECK(!((uintptr_t)p.dy & 15), "dy input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pgradMip & 15), "internal mip gradient tensor not aligned to float4"); + } + if ((p.channels & 1) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + { + NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not 
aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradTex[i] & 7), "grad_tex output tensor not aligned to float2"); + } + NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pgradMip & 7), "internal mip gradient tensor not aligned to float2"); + } + + // Choose launch parameters for main gradient kernel. + void* args[] = {&p}; + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + void* func_tbl[TEX_MODE_COUNT * 2 * 2] = { + (void*)TextureGradKernelNearest, + (void*)TextureGradKernelLinear, + (void*)TextureGradKernelLinearMipmapNearest, + (void*)TextureGradKernelLinearMipmapLinear, + (void*)TextureGradKernelCubeNearest, + (void*)TextureGradKernelCubeLinear, + (void*)TextureGradKernelCubeLinearMipmapNearest, + (void*)TextureGradKernelCubeLinearMipmapLinear, + NULL, + NULL, + (void*)TextureGradKernelLinearMipmapNearestBO, + (void*)TextureGradKernelLinearMipmapLinearBO, + NULL, + NULL, + (void*)TextureGradKernelCubeLinearMipmapNearestBO, + (void*)TextureGradKernelCubeLinearMipmapLinearBO, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; // Cube variant. + if (p.enableMip && !has_uv_da) + func_idx += TEX_MODE_COUNT * 2; // Bias-only variant. + + // Launch main gradient kernel. + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Launch kernel to pull gradients from mip levels. Don't do this if mip stack was supplied - individual level gradients are already there. 
+ if (p.enableMip && !has_mip_stack) + { + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1)); + int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float); + + void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 }; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream)); + } + + // Return output tensors. + return std::tuple >(grad_tex, grad_uv, grad_uv_da, grad_mip_level_bias, grad_mip_stack); +} + +// Version for nearest filter mode. +torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); + return std::get<0>(result); +} + +// Version for linear filter mode. +std::tuple texture_grad_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); + return std::tuple(std::get<0>(result), std::get<1>(result)); +} + +// Version for linear-mipmap-nearest mode. 
+std::tuple > texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode, boundary_mode); + return std::tuple >(std::get<0>(result), std::get<1>(result), std::get<4>(result)); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_types.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_types.h new file mode 100644 index 00000000..8e389582 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/build/lib/nvdiffrast/torch/torch_types.h @@ -0,0 +1,65 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" + +//------------------------------------------------------------------------ +// Python GL state wrapper. + +class RasterizeGLState; +class RasterizeGLStateWrapper +{ +public: + RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx); + ~RasterizeGLStateWrapper (void); + + void setContext (void); + void releaseContext (void); + + RasterizeGLState* pState; + bool automatic; + int cudaDeviceIdx; +}; + +//------------------------------------------------------------------------ +// Python CudaRaster state wrapper. 
+ +namespace CR { class CudaRaster; } +class RasterizeCRStateWrapper +{ +public: + RasterizeCRStateWrapper (int cudaDeviceIdx); + ~RasterizeCRStateWrapper (void); + + CR::CudaRaster* cr; + int cudaDeviceIdx; +}; + +//------------------------------------------------------------------------ +// Mipmap wrapper to prevent intrusion from Python side. + +class TextureMipWrapper +{ +public: + torch::Tensor mip; + int max_mip_level; + std::vector texture_size; // For error checking. + bool cube_mode; // For error checking. +}; + + +//------------------------------------------------------------------------ +// Antialias topology hash wrapper to prevent intrusion from Python side. + +class TopologyHashWrapper +{ +public: + torch::Tensor ev_hash; +}; + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docker/10_nvidia.json b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docker/10_nvidia.json new file mode 100644 index 00000000..2bfcca05 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docker/10_nvidia.json @@ -0,0 +1,6 @@ +{ + "file_format_version" : "1.0.0", + "ICD" : { + "library_path" : "libEGL_nvidia.so.0" + } +} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docker/Dockerfile b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docker/Dockerfile new file mode 100644 index 00000000..f32d27ea --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docker/Dockerfile @@ -0,0 +1,51 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +# Note: Should also work with NVIDIA's Docker image builds such as +# +# nvcr.io/nvidia/pytorch:20.09-py3 +# +# This file defaults to pytorch/pytorch as it works on slightly older +# driver versions. +FROM nvcr.io/nvidia/pytorch:23.03-py3 + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + pkg-config \ + libglvnd0 \ + libgl1 \ + libglx0 \ + libegl1 \ + libgles2 \ + libglvnd-dev \ + libgl1-mesa-dev \ + libegl1-mesa-dev \ + libgles2-mesa-dev \ + cmake \ + curl + +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 + +# for GLEW +ENV LD_LIBRARY_PATH /usr/lib64:$LD_LIBRARY_PATH + +# nvidia-container-runtime +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,graphics + +# Default pyopengl to EGL for good headless rendering support +ENV PYOPENGL_PLATFORM egl + +COPY docker/10_nvidia.json /usr/share/glvnd/egl_vendor.d/10_nvidia.json + +RUN pip install --upgrade pip +RUN pip install ninja imageio imageio-ffmpeg + +COPY nvdiffrast /tmp/pip/nvdiffrast/ +COPY README.md setup.py /tmp/pip/ +RUN cd /tmp/pip && pip install . 
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/cube.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/cube.png new file mode 100644 index 00000000..92b63e61 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/cube.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/earth.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/earth.png new file mode 100644 index 00000000..d30989a6 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/earth.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/envphong.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/envphong.png new file mode 100644 index 00000000..2c6f3902 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/envphong.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/logo.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/logo.png new file mode 100644 index 00000000..827d907f Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/logo.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_cube.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_cube.png new file mode 100644 index 00000000..6410c720 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_cube.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_earth.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_earth.png new file mode 100644 index 00000000..c46ab68e Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_earth.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_envphong.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_envphong.png new file mode 100644 index 00000000..524c5c4e Binary files /dev/null and 
b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pipe_envphong.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pose.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pose.png new file mode 100644 index 00000000..908c0978 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/pose.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_aa.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_aa.png new file mode 100644 index 00000000..c957e3ba Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_aa.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_crop1.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_crop1.png new file mode 100644 index 00000000..c43c6998 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_crop1.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_crop2.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_crop2.png new file mode 100644 index 00000000..e2c5a046 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_crop2.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_diff1.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_diff1.png new file mode 100644 index 00000000..ebc65a27 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_diff1.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_diff2.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_diff2.png new file mode 100644 index 00000000..14a7b6dd Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_diff2.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_peel1.png 
b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_peel1.png new file mode 100644 index 00000000..80970c5b Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_peel1.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_peel2.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_peel2.png new file mode 100644 index 00000000..269fa4b0 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_peel2.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_st.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_st.png new file mode 100644 index 00000000..669470ff Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_st.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_tex.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_tex.png new file mode 100644 index 00000000..83088987 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_tex.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_texture.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_texture.png new file mode 100644 index 00000000..63094487 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_texture.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_texw.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_texw.png new file mode 100644 index 00000000..6191c79b Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_texw.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_tri.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_tri.png new file mode 100644 index 00000000..81422791 Binary files /dev/null and 
b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_tri.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_uv.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_uv.png new file mode 100644 index 00000000..da2f7447 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/spot_uv.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser.png new file mode 100644 index 00000000..cca878e3 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser1.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser1.png new file mode 100644 index 00000000..defdaf88 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser1.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser2.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser2.png new file mode 100644 index 00000000..a950a663 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser2.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser3.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser3.png new file mode 100644 index 00000000..13450160 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser3.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser4.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser4.png new file mode 100644 index 00000000..a0dceb8f Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser4.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser5.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser5.png new file 
mode 100644 index 00000000..439de8a4 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/teaser5.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/thumb.jpg b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/thumb.jpg new file mode 100644 index 00000000..aab9d25a Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/thumb.jpg differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/tri.png b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/tri.png new file mode 100644 index 00000000..45b17356 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/img/tri.png differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/index.html b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/index.html new file mode 100644 index 00000000..7c04f4fa --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/docs/index.html @@ -0,0 +1,1060 @@ + + + + + nvdiffrast + + + + + + + + + + +
+
+ +

nvdiffrast

+
+
Modular Primitives for High-Performance Differentiable Rendering
+ +
+ +

Table of contents

+ + +

Overview

+

Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering. It is a lower-level library compared to previous ones such as redner, SoftRas, or PyTorch3D — nvdiffrast has no built-in camera models, lighting/material models, etc. Instead, the provided operations encapsulate only the most graphics-centric steps in the modern hardware graphics pipeline: rasterization, interpolation, texturing, and antialiasing. All of these operations (and their gradients) are GPU-accelerated, either via CUDA or via the hardware graphics pipeline.

+This documentation is intended to serve as a user's guide to nvdiffrast. For detailed discussion on the design principles, implementation details, and benchmarks, please see our paper: +
+Modular Primitives for High-Performance Differentiable Rendering
Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila
ACM Transactions on Graphics 39(6) (proc. SIGGRAPH Asia 2020) +
+

Paper: http://arxiv.org/abs/2011.03277
GitHub: https://github.com/NVlabs/nvdiffrast

+
+
+
+ +
+
+Examples of things we've done with nvdiffrast +
+
+
+

Installation

+

Minimum requirements:

+
    +
  • Linux or Windows operating system.
  • +
  • 64-bit Python 3.6.
  • +
  • PyTorch (recommended) 1.6 or TensorFlow 1.14. TensorFlow 2.x is currently not supported.
  • +
  • A high-end NVIDIA GPU, NVIDIA drivers, CUDA 10.2 toolkit.
  • +
+

To download nvdiffrast, either download the repository at https://github.com/NVlabs/nvdiffrast as a .zip file, or clone the repository using git:

+
git clone https://github.com/NVlabs/nvdiffrast
+

Linux

+

We recommend running nvdiffrast on Docker. To build a Docker image with nvdiffrast and PyTorch 1.6 installed, run:

+
./run_sample.sh --build-container
+

We recommend using Ubuntu, as some Linux distributions might not have all the required packages available. Installation on CentOS is reportedly problematic, but success has been claimed here.

+

To try out some of the provided code examples, run:

+
./run_sample.sh ./samples/torch/cube.py --resolution 32
+

Alternatively, if you have all the dependencies taken care of (consult the included Dockerfile for reference), you can install nvdiffrast in your local Python site-packages by running

+
pip install .
+

at the root of the repository. You can also just add the repository root directory to your PYTHONPATH.

+

Windows

+

On Windows, nvdiffrast requires an external compiler for compiling the CUDA kernels. The development was done using Microsoft Visual Studio 2017 Professional Edition, and this version works with both PyTorch and TensorFlow versions of nvdiffrast. VS 2019 Professional Edition has also been confirmed to work with the PyTorch version of nvdiffrast. Other VS editions besides Professional Edition, including the Community Edition, should work but have not been tested.

+

If the compiler binary (cl.exe) cannot be found in PATH, nvdiffrast will search for it heuristically. If this fails you may need to add it manually via

+
"C:\Program Files (x86)\Microsoft Visual Studio\...\...\VC\Auxiliary\Build\vcvars64.bat"
+

where the exact path depends on the version and edition of VS you have installed.

+

To install nvdiffrast in your local site-packages, run:

+
# Ninja is required run-time to build PyTorch extensions
+pip install ninja
+
+# Run at the root of the repository to install nvdiffrast
+pip install .
+

Instead of pip install . you can also just add the repository root directory to your PYTHONPATH.

+

Primitive operations

+

Nvdiffrast offers four differentiable rendering primitives: rasterization, interpolation, texturing, and antialiasing. The operation of the primitives is described here in a platform-agnostic way. Platform-specific documentation can be found in the API reference section.

+

In this section we ignore the minibatch axis for clarity and assume a minibatch size of one. However, all operations support minibatches as detailed later.

+

Rasterization

+

The rasterization operation takes as inputs a tensor of vertex positions and a tensor of vertex index triplets that specify the triangles. Vertex positions are specified in clip space, i.e., after modelview and projection transformations. Performing these transformations is left as the user's responsibility. In clip space, the view frustum is a cube in homogeneous coordinates where x/w, y/w, z/w are all between -1 and +1.

+

The output of the rasterization operation is a 4-channel float32 image with tuple (u, v, z/w, triangle_id) in each pixel. Values u and v are the barycentric coordinates within a triangle: the first vertex in the vertex index triplet obtains (u, v) = (1, 0), the second vertex (u, v) = (0, 1) and the third vertex (u, v) = (0, 0). Normalized depth value z/w is used later by the antialiasing operation to infer occlusion relations between triangles, and it does not propagate gradients to the vertex position input. Field triangle_id is the triangle index, offset by one. Pixels where no triangle was rasterized will receive a zero in all channels.

+

Rasterization is point-sampled, i.e., the geometry is not smoothed, blurred, or made partially transparent in any way, in contrast to some previous differentiable rasterizers. The contents of a pixel always represent a single surface point that is on the closest surface visible along the ray through the pixel center.

+

Point-sampled coverage does not produce vertex position gradients related to occlusion and visibility effects. This is because the motion of vertices does not change the coverage in a continuous way — a triangle is either rasterized into a pixel or not. In nvdiffrast, the occlusion/visibility related gradients are generated in the antialiasing operation that typically occurs towards the end of the rendering pipeline.

+
+
+
+ +
+[..., 0:2] = barycentrics (u, v) +
+
+
+ +
+[..., 3] = triangle_id +
+
+
+
+

The images above illustrate the output of the rasterizer. The left image shows the contents of channels 0 and 1, i.e., the barycentric coordinates, rendered as red and green, respectively. The right image shows channel 3, i.e., the triangle ID, using a random color per triangle. Spot model was created and released into public domain by Keenan Crane.

+

Interpolation

+

Depending on the shading and lighting models, a mesh typically specifies a number of attributes at its vertices. These can include, e.g., texture coordinates, vertex normals, reflection vectors, and material parameters. The purpose of the interpolation operation is to transfer these attributes specified at vertices to image space. In the hardware graphics pipeline, this happens automatically between vertex and pixel shaders. The interpolation operation in nvdiffrast supports an arbitrary number of attributes.

+

Concretely, the interpolation operation takes as inputs the buffer produced by the rasterizer and a buffer specifying the vertex attributes. The output is an image-size buffer with as many channels as there are attributes. Pixels where no triangle was rendered will contain all zeros in the output.

+
+
+
+ +
+Texture coordinates (s, t) +
+
+
+
+

Above is an example of interpolated texture coordinates visualized in red and green channels. This image was created using the output of the rasterizer from the previous step, and an attribute buffer containing the texture coordinates.

+

Texturing

+

Texture sampling is a fundamental operation in hardware graphics pipelines, and the same is true in nvdiffrast. The basic principle is simple: given a per-pixel texture coordinate vector, fetch a value from a texture and place it in the output. In nvdiffrast, the textures may have an arbitrary number of channels, which is useful in case you want to learn, say, an abstract field that acts as an input to a neural network further down the pipeline.

+

When sampling a texture, it is typically desirable to use some form of filtering. Most previous differentiable rasterizers support at most bilinear filtering, where sampling at a texture coordinate between texel centers will interpolate the value linearly from the four nearest texels. While this works fine when viewing the texture up close, it yields badly aliased results when the texture is viewed from a distance. To avoid this, the texture needs to be prefiltered prior to sampling it, removing the frequencies that are too high compared to how densely it is being sampled.

+

Nvdiffrast supports prefiltered texture sampling based on mipmapping. The required mipmap levels can be generated internally in the texturing operation, so that the user only needs to specify the highest-resolution (base level) texture. Currently the highest-quality filtering mode is isotropic trilinear filtering. The lack of anisotropic filtering means that a texture viewed at a steep angle will not alias in any direction, but it may appear blurry across the non-squished direction.

+

In addition to standard 2D textures, the texture sampling operation also supports cube maps. Cube maps are addressed using 3D texture coordinates, and the transitions between cube map faces are properly filtered so there will be no visible seams. Cube maps support trilinear filtering similar to 2D textures. There is no explicit support for 1D textures but they can be simulated efficiently with 1×n textures. All the filtering, mipmapping etc. work with such textures just as they would with true 1D textures. For now there is no support for 3D volume textures.

+
+
+
+ +
+Texture of Spot +
+
+
+ +
+Output of the texture sampling operation +
+
+
+ +
+Background replaced with white +
+
+
+
+

The middle image above shows the result of texture sampling using the interpolated texture coordinates from the previous step. Why is the background pink? The texture coordinates (s, t) read as zero at those pixels, but that is a perfectly valid point to sample the texture. It happens that Spot's texture (left) has pink color at its (0, 0) corner, and therefore all pixels in the background obtain that color as a result of the texture sampling operation. On the right, we have replaced the color of the empty pixels with a white color. Here's one way to do this in PyTorch:

+
img_right = torch.where(rast_out[..., 3:] > 0, img_left, torch.tensor(1.0).cuda())
+

where rast_out is the output of the rasterization operation. We simply test if the triangle_id field, i.e., channel 3 of the rasterizer output, is greater than zero, indicating that a triangle was rendered in that pixel. If so, we take the color from the textured image, and otherwise we take constant 1.0.

+

Antialiasing

+

The last of the four primitive operations in nvdiffrast is antialiasing. Based on the geometry input (vertex positions and triangles), it will smooth out discontinuities at silhouette edges in a given image. The smoothing is based on a local approximation of coverage — an approximate integral over a pixel is calculated based on the exact location of relevant edges and the point-sampled colors at pixel centers.

+

In this context, a silhouette is any edge that connects to just one triangle, or connects two triangles so that one folds behind the other. Specifically, this includes both silhouettes against the background and silhouettes against another surface, unlike some previous methods (DIB-R) that only support the former kind.

+

It is worth discussing why we might want to go through this trouble to improve the image a tiny bit. If we're attempting to, say, match a real-world photograph, a slightly smoother edge probably won't match the captured image much better than a jagged one. However, that is not the point of the antialiasing operation — the real goal is to obtain gradients w.r.t. vertex positions related to occlusion, visibility, and coverage.

+

Remember that everything up to this point in the rendering pipeline is point-sampled. In particular, the coverage, i.e., which triangle is rasterized to which pixel, changes discontinuously in the rasterization operation.

+

This is the reason why previous differentiable rasterizers apply a nonstandard image synthesis model with blur and transparency: Something has to make coverage continuous w.r.t. vertex positions if we wish to optimize vertex positions, camera position, etc., based on an image-space loss. In nvdiffrast, we do everything point-sampled so that we know that every pixel corresponds to a single, well-defined surface point. This lets us perform arbitrary shading computations without worrying about things like accidentally blurring texture coordinates across silhouettes, or having attributes mysteriously tend towards background color when getting close to the edge of the object. Only towards the end of the pipeline, the antialiasing operation ensures that the motion of vertex positions results in continuous change on silhouettes.

+

The antialiasing operation supports any number of channels in the image to be antialiased. Thus, if your rendering pipeline produces an abstract representation that is fed to a neural network for further processing, that is not a problem.

+
+
+
+ +
+Antialiased image +
+
+
+ +
+Closeup, before AA +
+
+
+ +
+Closeup, after AA +
+
+
+
+

The left image above shows the result image from the last step, after performing antialiasing. The effect is quite small — some boundary pixels become less jagged, as shown in the closeups.

+

Notably, not all boundary pixels are antialiased as revealed by the left-side image below. This is because the accuracy of the antialiasing operation in nvdiffrast depends on the rendered size of triangles: Because we store knowledge of just one surface point per pixel, antialiasing is possible only when the triangle that contains the actual geometric silhouette edge is visible in the image. The example image is rendered in very low resolution and the triangles are tiny compared to pixels. Thus, triangles get easily lost between the pixels.

+

This results in incomplete-looking antialiasing, and the gradients provided by antialiasing become noisier when edge triangles are missed. Therefore it is advisable to render images in resolutions where the triangles are large enough to show up in the image at least most of the time.

+
+
+
+ +
+Pixels touched by antialiasing, original resolution +
+
+
+ +
+Rendered in 4×4 higher resolution and downsampled +
+
+
+
+

The left image above shows which pixels were modified by the antialiasing operation in this example. On the right, we performed the rendering in 4×4 higher resolution and downsampled the final images back to the original size. This yields more accurate position gradients related to the silhouettes, so if you suspect your position gradients are too noisy, you may want to try simply increasing the resolution in which rasterization and antialiasing is done.

+

For purposes of shape optimization, the sparse-looking situation on the left would probably be perfectly fine. The gradients are still going to point in the right direction even if they are somewhat sparse, and you will need to use some sort of shape regularization anyway, which will greatly increase tolerance to noisy shape gradients.

+

Beyond the basics

+

Rendering images is easy with nvdiffrast, but there are a few practical things that you will need to take into account. The topics in this section explain the operation and usage of nvdiffrast in more detail, and hopefully help you avoid any potential misunderstandings and pitfalls.

+

Coordinate systems

+

Nvdiffrast follows OpenGL's coordinate systems and other conventions. This is partially because we support OpenGL to accelerate the rasterization operation, but mostly so that there is a single standard to follow.

+
    +
  • +In OpenGL convention, the perspective projection matrix (as implemented in, e.g., utils.projection() in our samples and glFrustum() in OpenGL) treats the view-space z as increasing towards the viewer. However, after multiplication by perspective projection matrix, the homogeneous clip-space coordinate z/w increases away from the viewer. Hence, a larger depth value in the rasterizer output tensor also corresponds to a surface further away from the viewer. +
  • +
  • +The memory order of image data in OpenGL, and consequently in nvdiffrast, is bottom-up. This means that row 0 of a tensor containing an image is the bottom row of the texture/image, which is the opposite of the more common scanline order. If you want to keep your image data in the conventional top-down order in your code, but have it logically the right way up inside nvdiffrast, you will need to flip the images vertically when crossing the boundary. +
  • +
  • +For 2D textures, the coordinate origin (s, t) = (0, 0) is at the bottom left corner with s increasing to the right and t increasing to the top. When specifying the faces of a cube map texture, the orientation varies between the faces, but nvdiffrast follows the OpenGL convention here as well. +
  • +
+

As a word of advice, it is best to stay on top of coordinate systems and orientations used in your program. When something appears to be the wrong way around, it is much better to identify and fix the root cause than to randomly flip coordinates, images, buffers, and matrices until the immediate problem goes away.

+

Geometry and minibatches: Range mode vs Instanced mode

+

As mentioned earlier, all operations in nvdiffrast support the minibatch axis efficiently. Related to this, we support two ways for representing the geometry: range mode and instanced mode. If you want to render a different mesh in each minibatch index, you need to use the range mode. However, if you are rendering the same mesh, but with potentially different viewpoints, vertex positions, attributes, textures, etc., in each minibatch index, the instanced mode will be much more convenient.

+

In range mode, you specify triangle index triplets as a 2D tensor of shape [num_triangles, 3], and vertex positions as a 2D tensor of shape [num_vertices, 4]. In addition to these, the rasterization operation requires an additional 2D range tensor of shape [minibatch_size, 2] where each row specifies a start index and count into the triangle tensor. As a result, the rasterizer will render the triangles in the specified ranges into each minibatch index of the output tensor. If you have multiple meshes, you should place all of them into the vertex and triangle tensors, and then choose which mesh to rasterize into each minibatch index via the contents of the range tensor. The attribute tensor in interpolation operation is handled in the same way as positions, and it has to be of shape [num_vertices, num_attributes] in range mode.

+

In instanced mode, the topology of the mesh will be shared for each minibatch index. The triangle tensor is still a 2D tensor with shape [num_triangles, 3], but the vertex positions are specified using a 3D tensor of shape [minibatch_size, num_vertices, 4]. With a 3D vertex position tensor, the rasterizer will not require the range tensor input, but will take the minibatch size from the first dimension of the vertex position tensor. The same triangles are rendered to each minibatch index, but with vertex positions taken from the corresponding slice of the vertex position tensor. In this mode, the attribute tensor in interpolation has to be a 3D tensor similar to position tensor, i.e., of shape [minibatch_size, num_vertices, num_attributes]. However, you can provide an attribute tensor with minibatch size of 1, and it will be broadcast across the minibatch.

+

Image-space derivatives

+

We skirted around a pretty fundamental question in the description of the texturing operation above. In order to determine the proper amount of prefiltering for sampling a texture, we need to know how densely it is being sampled. But how can we know the sampling density when each pixel knows of just a single surface point?

+

The solution is to track the image-space derivatives of all things leading up to the texture sampling operation. These are not the same thing as the gradients used in the backward pass, even though they both involve differentiation! Consider the barycentrics (u, v) produced by the rasterization operation. They change by some amount when moving horizontally or vertically in the image plane. If we denote the image-space coordinates as (X, Y), the image-space derivatives of the barycentrics would be ∂u/∂X, ∂u/∂Y, ∂v/∂X, and ∂v/∂Y. We can organize these into a 2×2 Jacobian matrix that describes the local relationship between (u, v) and (X, Y). This matrix is generally different at every pixel. For the purpose of image-space derivatives, the units of X and Y are pixels. Hence, ∂u/∂X is the local approximation of how much u changes when moving a distance of one pixel in the horizontal direction, and so on.

+

Once we know how the barycentrics change w.r.t. pixel position, the interpolation operation can use this to determine how the attributes change w.r.t. pixel position. When attributes are used as texture coordinates, we can therefore tell how the texture sampling position (in texture space) changes when moving around within the pixel (up to a local, linear approximation, that is). This texture footprint tells us the scale on which the texture should be prefiltered. In more practical terms, it tells us which mipmap level(s) to use when sampling the texture.

+

In nvdiffrast, the rasterization operation outputs the image-space derivatives of the barycentrics in an auxiliary 4-channel output tensor, ordered (∂u/∂X, ∂u/∂Y, ∂v/∂X, ∂v/∂Y) from channel 0 to 3. The interpolation operation can take this auxiliary tensor as input and compute image-space derivatives of any set of attributes being interpolated. Finally, the texture sampling operation can use the image-space derivatives of the texture coordinates to determine the amount of prefiltering.

+

There is nothing magic about these image-space derivatives. They are tensors like, e.g., the texture coordinates themselves, they propagate gradients backwards, and so on. For example, if you want to artificially blur or sharpen the texture when sampling it, you can simply multiply the tensor carrying the image-space derivatives of the texture coordinates ∂{s, t}/∂{X, Y} by a scalar value before feeding it into the texture sampling operation. This scales the texture footprints and thus adjusts the amount of prefiltering. If your loss function prefers a different level of sharpness, this multiplier will receive a nonzero gradient. Update: Since version 0.2.1, the texture sampling operation also supports a separate mip level bias input that would be better suited for this particular task, but the gist is the same nonetheless.

+

One might wonder if it would have been easier to determine the texture footprints simply from the texture coordinates in adjacent pixels, and skip all this derivative rubbish? In easy cases the answer is yes, but silhouettes, occlusions, and discontinuous texture parameterizations would make this approach rather unreliable in practice. Computing the image-space derivatives analytically keeps everything point-like, local, and well-behaved.

+

It should be noted that computing gradients related to image-space derivatives is somewhat involved and requires additional computation. At the same time, they are often not crucial for the convergence of the training/optimization. Because of this, the primitive operations in nvdiffrast offer options to disable the calculation of these gradients. We're talking about things like ∂Loss/∂(∂{u, v}/∂{X, Y}) that may look second-order-ish, but they're not.

+

Mipmaps and texture dimensions

+

Prefiltered texture sampling modes require mipmaps, i.e., downsampled versions, of the texture. The texture sampling operation can construct these internally, or you can provide your own mipmap stack, but there are limits to texture dimensions that need to be considered.

+

When mipmaps are constructed internally, each mipmap level is constructed by averaging 2×2 pixel patches of the preceding level (or of the texture itself for the first mipmap level). The size of the buffer to be averaged therefore has to be divisible by 2 in both directions. There is one exception: side length of 1 is valid, and it will remain as 1 in the downsampling operation.

+

For example, a 32×32 texture will produce the following mipmap stack:

+
+ + + + + + + + + + + + + + + + + + + + + + +
+32×32 + +→ + +16×16 + +→ + +8×8 + +→ + +4×4 + +→ + +2×2 + +→ + +1×1 +
+Base texture + +Mip level 1 + +Mip level 2 + +Mip level 3 + +Mip level 4 + +Mip level 5 +
+
+

And a 32×8 texture, with both sides powers of two but not equal, will result in:

+
+ + + + + + + + + + + + + + + + + + + + + + +
+32×8 + +→ + +16×4 + +→ + +8×2 + +→ + +4×1 + +→ + +2×1 + +→ + +1×1 +
+Base texture + +Mip level 1 + +Mip level 2 + +Mip level 3 + +Mip level 4 + +Mip level 5 +
+
+

For texture sizes like this, everything will work automatically and mipmaps are constructed down to 1×1 pixel size. Therefore, if you wish to use prefiltered texture sampling, you should scale your textures to power-of-two dimensions that do not, however, need to be equal.

+

How about texture atlases? You may have an object whose texture is composed of multiple individual patches, or a collection of textured meshes with a unique texture for each. Say we have a texture atlas composed of five 32×32 sub-images, i.e., a total size of 160×32 pixels. Now we cannot compute mipmap levels all the way down to 1×1 size, because there is a 5×1 mipmap in the way that cannot be downsampled (because 5 is not even):

+
+ + + + + + + + + + + + + + + + + + + + + + + + +
+160×32 + +→ + +80×16 + +→ + +40×8 + +→ + +20×4 + +→ + +10×2 + +→ + +5×1 + +→ + +Error! +
+Base texture + +Mip level 1 + +Mip level 2 + +Mip level 3 + +Mip level 4 + +Mip level 5 +
+
+

Scaling the atlas to, say, 256×32 pixels would feel silly because the dimensions of the sub-images are perfectly fine, and downsampling the different sub-images together — which would happen after the 5×1 resolution — would not make sense anyway. For this reason, the texture sampling operation allows the user to specify the maximum number of mipmap levels to be constructed and used. In this case, setting max_mip_level=5 would stop at the 5×1 mipmap and prevent the error.

+

It is a deliberate design choice that nvdiffrast doesn't just stop automatically at a mipmap size it cannot downsample, but requires the user to specify a limit when the texture dimensions are not powers of two. The goal is to avoid bugs where prefiltered texture sampling mysteriously doesn't work due to an oddly sized texture. It would be confusing if a 256×256 texture gave beautifully prefiltered texture samples, a 255×255 texture suddenly had no prefiltering at all, and a 254×254 texture did just a bit of prefiltering (one level) but not more.

+

If you compute your own mipmaps, their sizes must follow the scheme described above. There is no need to specify mipmaps all the way to 1×1 resolution, but the stack can end at any point and it will work equivalently to an internally constructed mipmap stack with a max_mip_level limit. Importantly, the gradients of user-provided mipmaps are not propagated automatically to the base texture — naturally so, because nvdiffrast knows nothing about the relation between them. Instead, the tensors that specify the mip levels in a user-provided mipmap stack will receive gradients of their own.

+

Rasterizing with CUDA vs OpenGL

+

Since version 0.3.0, nvdiffrast on PyTorch supports executing the rasterization operation using either CUDA or OpenGL. Earlier versions and the Tensorflow bindings support OpenGL only.

+

When rasterization is executed on OpenGL, we use the GPU's graphics pipeline to determine which triangles land on which pixels. GPUs have amazingly efficient hardware for doing this — it is their original raison d'être — and thus it makes sense to exploit it. Unfortunately, some computing environments haven't been designed with this in mind, and it can be difficult to get OpenGL to work correctly and interoperate with CUDA cleanly. On Windows, compatibility is generally good because the GPU drivers required to run CUDA also include OpenGL support. Linux is more complicated, as various drivers can be installed separately and there isn't a standardized way to acquire access to the hardware graphics pipeline.

+

Rasterizing in CUDA pretty much reverses these considerations. Compatibility is obviously not an issue on any CUDA-enabled platform. On the other hand, implementing the rasterization process correctly and efficiently on a massively data-parallel programming model is non-trivial. The CUDA rasterizer in nvdiffrast follows the approach described in research paper High-Performance Software Rasterization on GPUs by Laine and Karras, HPG 2011. Our code is based on the paper's publicly released CUDA kernels, with considerable modifications to support current hardware architectures and to match nvdiffrast's needs.

+

The subpixel precision of the CUDA rasterizer is limited to 4 bits, and depth peeling is less accurate than with OpenGL. Memory consumption depends on many factors. Note: Restrictions related to output resolution have been removed in version 0.3.3. Although the internal resolution of the CUDA rasterizer remains capped at 2048×2048, nvdiffrast now invokes it automatically multiple times to support higher resolutions.

+

It is difficult to predict which rasterizer offers better performance. For complex meshes and high resolutions OpenGL will most likely outperform the CUDA rasterizer, although it has certain overheads that the CUDA rasterizer does not have. For simple meshes and low resolutions the CUDA rasterizer may be faster, but it has its own overheads, too. Measuring the performance on actual data, on the target platform, and in the context of the entire program is the only way to know for sure.

+

To run rasterization in CUDA, create a RasterizeCudaContext and supply it to the rasterize() operation. For OpenGL, use a RasterizeGLContext instead. Easy!

+

Running on multiple GPUs

+

Nvdiffrast supports computation on multiple GPUs in both PyTorch and TensorFlow. As is the convention in PyTorch, the operations are always executed on the device on which the input tensors reside. All GPU input tensors must reside on the same device, and the output tensors will unsurprisingly end up on that same device. In addition, the rasterization operation requires that its context was created for the correct device. In TensorFlow, the rasterizer context is automatically created on the device of the rasterization operation when it is executed for the first time.

+

The remainder of this section applies only to OpenGL rasterizer contexts. CUDA rasterizer contexts require no special considerations besides making sure they're on the correct device.

+

On Windows, nvdiffrast implements OpenGL device selection in a way that can be done only once per process — after one context is created, all future ones will end up on the same GPU. Hence you cannot expect to run the rasterization operation on multiple GPUs within the same process using an OpenGL context. Trying to do so will either cause a crash or incur a significant performance penalty. However, with PyTorch it is common to distribute computation across GPUs by launching a separate process for each GPU, so this is not a huge concern. Note that any OpenGL context created within the same process, even for something like a GUI window, will prevent changing the device later. Therefore, if you want to run the rasterization operation on other than the default GPU, be sure to create its OpenGL context before initializing any other OpenGL-powered libraries.

+

On Linux everything just works, and you can create OpenGL rasterizer contexts on multiple devices within the same process.

+

Note on torch.nn.DataParallel

+

PyTorch offers torch.nn.DataParallel wrapper class for splitting the execution of a minibatch across multiple threads. Unfortunately, this class is fundamentally incompatible with OpenGL-dependent operations, as it spawns a new set of threads at each call (as of PyTorch 1.9.0, at least). Using previously created OpenGL contexts in these new threads, even if taking care to not use the same context in multiple threads, causes them to be migrated around and this has resulted in ever-growing GPU memory usage and abysmal GPU utilization. Therefore, we advise against using torch.nn.DataParallel for rasterization operations that depend on the OpenGL contexts.

+

Notably, torch.nn.DistributedDataParallel spawns subprocesses that are much more persistent. The subprocesses must create their own OpenGL contexts as part of initialization, and as such they do not suffer from this problem.

+

GitHub issue #23, especially this comment, contains further analysis and suggestions for workarounds.

+

Rendering multiple depth layers

+

Sometimes there is a need to render scenes with partially transparent surfaces. In this case, it is not sufficient to find only the surfaces that are closest to the camera, as you may also need to know what lies behind them. For this purpose, nvdiffrast supports depth peeling that lets you extract multiple closest surfaces for each pixel.

+

With depth peeling, we start by rasterizing the closest surfaces as usual. We then perform a second rasterization pass with the same geometry, but this time we cull all previously rendered surface points at each pixel, effectively extracting the second-closest depth layer. This can be repeated as many times as desired, so that we can extract as many depth layers as we like. See the images below for example results of depth peeling with each depth layer shaded and antialiased.

+
+
+
+ +
+First depth layer +
+
+
+ +
+Second depth layer +
+
+
+ +
+Third depth layer +
+
+
+
+

The API for depth peeling is based on DepthPeeler object that acts as a context manager, and its rasterize_next_layer method. The first call to rasterize_next_layer is equivalent to calling the traditional rasterize function, and subsequent calls report further depth layers. The arguments for rasterization are specified when instantiating the DepthPeeler object. Concretely, your code might look something like this:

+
with nvdiffrast.torch.DepthPeeler(glctx, pos, tri, resolution) as peeler:
+  for i in range(num_layers):
+    rast, rast_db = peeler.rasterize_next_layer()
+    (process or store the results)
+

There is no performance penalty compared to the basic rasterization op if you end up extracting only the first depth layer. In other words, the code above with num_layers=1 runs exactly as fast as calling rasterize once.

+

Depth peeling is only supported in the PyTorch version of nvdiffrast. For implementation reasons, depth peeling reserves the rasterizer context so that other rasterization operations cannot be performed while the peeling is ongoing, i.e., inside the with block. Hence you cannot start a nested depth peeling operation or call rasterize inside the with block unless you use a different context.

+

For the sake of completeness, let us note the following small caveat: Depth peeling relies on depth values to distinguish surface points from each other. Therefore, culling "previously rendered surface points" actually means culling all surface points at the same or closer depth as those rendered into the pixel in previous passes. This matters only if you have multiple layers of geometry at matching depths — if your geometry consists of, say, nothing but two exactly overlapping triangles, you will see one of them in the first pass but never see the other one in subsequent passes, as it's at the exact depth that is already considered done.

+

Differences between PyTorch and TensorFlow

+

Nvdiffrast can be used from PyTorch and from TensorFlow 1.x; the latter may change to TensorFlow 2.x if there is demand. These frameworks operate somewhat differently and that is reflected in the respective APIs. Simplifying a bit, in TensorFlow 1.x you construct a persistent graph out of persistent nodes, and run many batches of data through it. In PyTorch, there is no persistent graph or nodes, but a new, ephemeral graph is constructed for each batch of data and destroyed immediately afterwards. Therefore, there is also no persistent state for the operations. There is the torch.nn.Module abstraction for festooning operations with persistent state, but we do not use it.

+

As a consequence, things that would be part of persistent state of an nvdiffrast operation in TensorFlow must be stored by the user in PyTorch, and supplied to the operations as needed. In practice, this is a very small difference and amounts to just a couple of lines of code in most cases.

+

As an example, consider the OpenGL context used by the rasterization operation. In order to use hardware-accelerated rendering, an OpenGL context must be created and switched into before issuing OpenGL commands internally. Creating the context is an expensive operation, so we don't want to create and destroy one at every call of the rasterization operation. In TensorFlow, the rasterization operation creates a context when it is executed for the first time, and stashes it away in its persistent state to be reused later. In PyTorch, the user has to create the context using a separate function call, and supply it as a parameter to the rasterization operation.

+

Similarly, if you have a constant texture and want to use prefiltered texture sampling modes, the mipmap stack only needs to be computed once. In TensorFlow, you can specify that the texture is constant, in which case the texture sampling operation only computes the mipmap stack on the first execution and stores it internally. In PyTorch, you can compute the mipmap stack once using a separate function call, and supply it to the texture sampling operation every time. If you don't do that, the operation will compute the mipmap stack internally and discard it afterwards. This is exactly what you want if your texture changes at every iteration, and it's not wrong even if the texture is constant, just a bit inefficient.

+

Finally, the same holds for a thing called the topology hash that the antialiasing operation uses for identifying potential silhouette edges. Its contents depend only on the triangle tensor, not the vertex positions, so if the topology is constant, this auxiliary structure needs to be constructed only once. As before, in TensorFlow this is handled internally, whereas in PyTorch a separate function is provided for off-line construction.

+

Manual OpenGL contexts in PyTorch

+

First, please note that handling OpenGL contexts manually is a very small optimization. It almost certainly won't be relevant unless you've already profiled and optimized your code with gusto, and you're on a mission to extract every last bit of performance possible.

+

In TensorFlow, the only option is to let nvdiffrast handle the OpenGL context management internally. This is because TensorFlow utilizes multiple CPU threads under the hood, and the active OpenGL context is a thread-local resource.

+

PyTorch isn't as unpredictable, and stays in the same CPU thread by default (although things like torch.utils.data.DataLoader do invoke additional CPU threads). As such, nvdiffrast lets the user choose between handling OpenGL context switching in automatic or manual mode. The default is automatic mode where the rasterization operation always sets/releases the context at the beginning/end of each execution, like we do in TensorFlow. This ensures that the rasterizer will always use the context that you supply, and the context won't remain active so nobody else can mess with it.

+

In manual mode, the user assumes the responsibility of setting and releasing the OpenGL context. Most of the time, if you don't have any other libraries that would be using OpenGL, you can just set the context once after having created it and keep it set until the program exits. However, keep in mind that the active OpenGL context is a thread-local resource, so it needs to be set in the same CPU thread as it will be used, and it cannot be set simultaneously in multiple CPU threads.

+

Samples

+

Nvdiffrast comes with a set of samples that were crafted to support the research paper. Each sample is available in both PyTorch and TensorFlow versions. Details such as command-line parameters, logging format, etc., may not be identical between the versions, and generally the PyTorch versions should be considered definitive. The command-line examples below are for the PyTorch versions.

+

All PyTorch samples support selecting between CUDA and OpenGL rasterizer contexts. The default is to do rasterization in CUDA, and switching to OpenGL is done by specifying command-line option --opengl.

+

Enabling interactive display using the --display-interval parameter is likely to fail on Linux when using OpenGL rasterization. This is because the interactive display window is shown using OpenGL, and on Linux this conflicts with the internal OpenGL rasterization in nvdiffrast. Using a CUDA context should work, assuming that OpenGL is correctly installed in the system (for displaying the window). Our Dockerfile is set up to support headless rendering only, and thus cannot show an interactive result window.

+

triangle.py

+

This is a minimal sample that renders a triangle and saves the resulting image into a file (tri.png) in the current directory. Running this should be the first step to verify that you have everything set up correctly. Rendering is done using the rasterization and interpolation operations, so getting the correct output image means that both OpenGL (if specified on command line) and CUDA are working as intended under the hood.

+

This is the only sample where you must specify either --cuda or --opengl on command line. Other samples default to CUDA rasterization and provide only the --opengl option.

+

Example command lines:

+
python triangle.py --cuda
+python triangle.py --opengl
+
+
+
+ +
+The expected output image +
+
+
+
+

cube.py

+

In this sample, we optimize the vertex positions and colors of a cube mesh, starting from a semi-randomly initialized state. The optimization is based on image-space loss in extremely low resolutions such as 4×4, 8×8, or 16×16 pixels. The goal of this sample is to examine the rate of geometrical convergence when the triangles are only a few pixels in size. It serves to illustrate that the antialiasing operation, despite being approximative, yields good enough position gradients even in 4×4 resolution to guide the optimization to the goal.

+

Example command line:

+
python cube.py --resolution 16 --display-interval 10
+
+
+
+ +
+Interactive view of cube.py +
+
+
+ +
+Rendering pipeline +
+
+
+
+

The image above shows a live view of the sample. Top row shows the low-resolution rendered image and reference image that the image-space loss is calculated from. Bottom row shows the current mesh (and colors) and reference mesh in high resolution so that convergence can be seen more easily visually.

+

In the pipeline diagram, green boxes indicate nvdiffrast operations, whereas blue boxes are other computation. Red boxes are the learned tensors and gray are non-learned tensors or other data.

+

earth.py

+

The goal of this sample is to compare texture convergence with and without prefiltered texture sampling. The texture is learned based on image-space loss against high-quality reference renderings in random orientations and at random distances. When prefiltering is disabled, the texture is not learned properly because of spotty gradient updates caused by aliasing. This shows as a much worse PSNR for the texture, compared to learning with prefiltering enabled. See the paper for further discussion.

+

Example command lines:

+ + + + + + + + + +
+python earth.py --display-interval 10 + +No prefiltering, bilinear interpolation. +
+python earth.py --display-interval 10 --mip + +Prefiltering enabled, trilinear interpolation. +
+
+
+
+ +
+Interactive view of earth.py, prefiltering disabled +
+
+
+ +
+Rendering pipeline +
+
+
+
+

The interactive view shows the current texture mapped onto the mesh, with or without prefiltered texture sampling as specified via the command-line parameter. In this sample, no antialiasing is performed because we are not learning vertex positions and hence need no gradients related to them.

+

envphong.py

+

In this sample, a more complex shading model is used compared to the vertex colors or plain texture in the previous ones. Here, we learn a reflected environment map and parameters of a Phong BRDF model given a known mesh. The optimization is based on image-space loss against reference renderings in random orientations. The shading model of mirror reflection plus a Phong BRDF is not physically sensible, but it works as a reasonably simple strawman that would not be possible to implement with previous differentiable rasterizers that bundle rasterization, shading, lighting, and texturing together. The sample also illustrates the use of cube mapping for representing a learned texture in a spherical domain.

+

Example command line:

+
python envphong.py --display-interval 10
+
+
+
+ +
+Interactive view of envphong.py +
+
+
+ +
+Rendering pipeline +
+
+
+
+

In the interactive view, we see the rendering with the current environment map and Phong BRDF parameters, both gradually improving during the optimization.

+

pose.py

+

Pose fitting based on an image-space loss is a classical task in differentiable rendering. In this sample, we solve a pose optimization problem with a simple cube with differently colored sides. We detail the optimization method in the paper, but in brief, it combines gradient-free greedy optimization in an initialization phase and gradient-based optimization in a fine-tuning phase.

+

Example command line:

+
python pose.py --display-interval 10
+
+
+
+ +
+Interactive view of pose.py +
+
+
+
+

The interactive view shows, from left to right: target pose, best found pose, and current pose. When viewed live, the two stages of optimization are clearly visible. In the first phase, the best pose updates intermittently when a better initialization is found. In the second phase, the solution converges smoothly to the target via gradient-based optimization.

+

PyTorch API reference

+
+

nvdiffrast.torch.RasterizeCudaContext(device=None) Class

+

Create a new Cuda rasterizer context.

The context is deleted and internal storage is released when the object is +destroyed.

Arguments:
device — Cuda device on which the context is created. Type can be torch.device, string (e.g., 'cuda:1'), or int. If not specified, the context will be created on the currently active Cuda device.
Returns:
The newly created Cuda rasterizer context.
+

nvdiffrast.torch.RasterizeGLContext(output_db=True, mode='automatic', device=None) Class

+

Create a new OpenGL rasterizer context.

Creating an OpenGL context is a slow operation so you should usually reuse the same +context in all calls to rasterize() on the same CPU thread. The OpenGL context +is deleted when the object is destroyed.

Side note: When using the OpenGL context in a rasterization operation, the +context's internal framebuffer object is automatically enlarged to accommodate the +rasterization operation's output shape, but it is never shrunk in size until the +context is destroyed. Thus, if you need to rasterize, say, deep low-resolution +tensors and also shallow high-resolution tensors, you can conserve GPU memory by +creating two separate OpenGL contexts for these tasks. In this scenario, using the +same OpenGL context for both tasks would end up reserving GPU memory for a deep, +high-resolution output tensor.

Arguments:
output_db — Compute and output image-space derivatives of barycentrics.
mode — OpenGL context handling mode. Valid values are 'manual' and 'automatic'.
device — Cuda device on which the context is created. Type can be torch.device, string (e.g., 'cuda:1'), or int. If not specified, the context will be created on the currently active Cuda device.
Methods, only available if context was created in manual mode:
set_context() — Set (activate) the OpenGL context in the current CPU thread.
release_context() — Release (deactivate) the currently active OpenGL context.
Returns:
The newly created OpenGL rasterizer context.
+

nvdiffrast.torch.rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True) Function

+

Rasterize triangles.

All input tensors must be contiguous and reside in GPU memory except for +the ranges tensor that, if specified, has to reside in CPU memory. The +output tensors will be contiguous and reside in GPU memory.

Arguments:
glctx — Rasterizer context of type RasterizeGLContext or RasterizeCudaContext.
posVertex position tensor with dtype torch.float32. To enable range +mode, this tensor should have a 2D shape [num_vertices, 4]. To enable +instanced mode, use a 3D shape [minibatch_size, num_vertices, 4].
tri — Triangle tensor with shape [num_triangles, 3] and dtype torch.int32.
resolution — Output resolution as integer tuple (height, width).
rangesIn range mode, tensor with shape [minibatch_size, 2] and dtype +torch.int32, specifying start indices and counts into tri. +Ignored in instanced mode.
grad_dbPropagate gradients of image-space derivatives of barycentrics +into pos in backward pass. Ignored if using an OpenGL context that +was not configured to output image-space derivatives.
Returns:
A tuple of two tensors. The first output tensor has shape [minibatch_size, +height, width, 4] and contains the main rasterizer output in order (u, v, z/w, +triangle_id). If the OpenGL context was configured to output image-space +derivatives of barycentrics, the second output tensor will also have shape +[minibatch_size, height, width, 4] and contain said derivatives in order +(du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape +[minibatch_size, height, width, 0].
+

nvdiffrast.torch.DepthPeeler(...) Class

+

Create a depth peeler object for rasterizing multiple depth layers.

Arguments are the same as in rasterize().

Returns:
The newly created depth peeler.
+

nvdiffrast.torch.DepthPeeler.rasterize_next_layer() Method

+

Rasterize next depth layer.

Operation is equivalent to rasterize() except that previously reported +surface points are culled away.

Returns:
A tuple of two tensors as in rasterize().
+

nvdiffrast.torch.interpolate(attr, rast, tri, rast_db=None, diff_attrs=None) Function

+

Interpolate vertex attributes.

All input tensors must be contiguous and reside in GPU memory. The output tensors +will be contiguous and reside in GPU memory.

Arguments:
attrAttribute tensor with dtype torch.float32. +Shape is [num_vertices, num_attributes] in range mode, or +[minibatch_size, num_vertices, num_attributes] in instanced mode. +Broadcasting is supported along the minibatch axis.
rastMain output tensor from rasterize().
triTriangle tensor with shape [num_triangles, 3] and dtype torch.int32.
rast_db(Optional) Tensor containing image-space derivatives of barycentrics, +i.e., the second output tensor from rasterize(). Enables computing +image-space derivatives of attributes.
diff_attrs(Optional) List of attribute indices for which image-space +derivatives are to be computed. Special value 'all' is equivalent +to list [0, 1, ..., num_attributes - 1].
Returns:
A tuple of two tensors. The first output tensor contains interpolated +attributes and has shape [minibatch_size, height, width, num_attributes]. +If rast_db and diff_attrs were specified, the second output tensor contains +the image-space derivatives of the selected attributes and has shape +[minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the +first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc. +Otherwise, the second output tensor will be an empty tensor with shape +[minibatch_size, height, width, 0].
+

nvdiffrast.torch.texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None) Function

+

Perform texture sampling.

All input tensors must be contiguous and reside in GPU memory. The output tensor +will be contiguous and reside in GPU memory.

Arguments:
texTexture tensor with dtype torch.float32. For 2D textures, must have shape +[minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures, +must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where +tex_width and tex_height are equal. Note that boundary_mode must also be set +to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis.
uvTensor containing per-pixel texture coordinates. When sampling a 2D texture, +must have shape [minibatch_size, height, width, 2]. When sampling a cube map +texture, must have shape [minibatch_size, height, width, 3].
uv_da(Optional) Tensor containing image-space derivatives of texture coordinates. +Must have same shape as uv except for the last dimension that is to be twice +as long.
mip_level_bias(Optional) Per-pixel bias for mip level selection. If uv_da is omitted, +determines mip level directly. Must have shape [minibatch_size, height, width].
mip(Optional) Preconstructed mipmap stack from a texture_construct_mip() call, or a list +of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack, +the tensors in the list must follow the same format as tex except for width and +height that must follow the usual rules for mipmap sizes. The base level texture +is still supplied in tex and must not be included in the list. Gradients of a +custom mipmap stack are not automatically propagated to base texture but the mipmap +tensors will receive gradients of their own. If a mipmap stack is not specified +but the chosen filter mode requires it, the mipmap stack is constructed internally +and discarded afterwards.
filter_mode — Texture filtering mode to be used. Valid values are 'auto', 'nearest', 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto' selects 'linear' if neither uv_da nor mip_level_bias is specified, and 'linear-mipmap-linear' when at least one of them is specified, these being the highest-quality modes possible depending on the availability of the image-space derivatives of the texture coordinates or direct mip level information.
boundary_modeValid values are 'wrap', 'clamp', 'zero', and 'cube'. If tex defines a +cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional +part of texture coordinates. Mode 'clamp' clamps texture coordinates to the +centers of the boundary texels. Mode 'zero' virtually extends the texture with +all-zero values in all directions.
max_mip_levelIf specified, limits the number of mipmaps constructed and used in mipmap-based +filter modes.
Returns:
A tensor containing the results of the texture sampling with shape +[minibatch_size, height, width, tex_channels]. Cube map fetches with invalid uv coordinates +(e.g., zero vectors) output all zeros and do not propagate gradients.
+

nvdiffrast.torch.texture_construct_mip(tex, max_mip_level=None, cube_mode=False) Function

+

Construct a mipmap stack for a texture.

This function can be used for constructing a mipmap stack for a texture that is known to remain +constant. This avoids reconstructing it every time texture() is called.

Arguments:
texTexture tensor with the same constraints as in texture().
max_mip_levelIf specified, limits the number of mipmaps constructed.
cube_modeMust be set to True if tex specifies a cube map texture.
Returns:
An opaque object containing the mipmap stack. This can be supplied in a call to texture() +in the mip argument.
+

nvdiffrast.torch.antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0) Function

+

Perform antialiasing.

All input tensors must be contiguous and reside in GPU memory. The output tensor +will be contiguous and reside in GPU memory.

Note that silhouette edge determination is based on vertex indices in the triangle +tensor. For it to work properly, a vertex belonging to multiple triangles must be +referred to using the same vertex index in each triangle. Otherwise, nvdiffrast will always +classify the adjacent edges as silhouette edges, which leads to bad performance and +potentially incorrect gradients. If you are unsure whether your data is good, check +which pixels are modified by the antialias operation and compare to the example in the +documentation.

Arguments:
colorInput image to antialias with shape [minibatch_size, height, width, num_channels].
rastMain output tensor from rasterize().
posVertex position tensor used in the rasterization operation.
triTriangle tensor used in the rasterization operation.
topology_hash(Optional) Preconstructed topology hash for the triangle tensor. If not +specified, the topology hash is constructed internally and discarded afterwards.
pos_gradient_boost(Optional) Multiplier for gradients propagated to pos.
Returns:
A tensor containing the antialiased image with the same shape as color input tensor.
+

nvdiffrast.torch.antialias_construct_topology_hash(tri) Function

+

Construct a topology hash for a triangle tensor.

This function can be used for constructing a topology hash for a triangle tensor that is +known to remain constant. This avoids reconstructing it every time antialias() is called.

Arguments:
triTriangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in +GPU memory.
Returns:
An opaque object containing the topology hash. This can be supplied in a call to +antialias() in the topology_hash argument.
+

nvdiffrast.torch.get_log_level() Function

+

Get current log level.

Returns:
Current log level in nvdiffrast. See set_log_level() for possible values.
+

nvdiffrast.torch.set_log_level(level) Function

+

Set log level.

Log levels follow the convention on the C++ side of Torch: + 0 = Info, + 1 = Warning, + 2 = Error, + 3 = Fatal. +The default log level is 1.

Arguments:
levelNew log level as integer. Internal nvdiffrast messages of this +severity or higher will be printed, while messages of lower +severity will be silent.
+ +
+

Licenses

+

Copyright © 2020–2024, NVIDIA Corporation. All rights reserved.

+

This work is made available under the Nvidia Source Code License.

+

For business inquiries, please visit our website and submit the form: NVIDIA Research Licensing

+

We do not currently accept outside contributions in the form of pull requests.

+

Environment map stored as part of samples/data/envphong.npz is derived from a Wave Engine sample material originally shared under MIT License. Mesh and texture stored as part of samples/data/earth.npz are derived from 3D Earth Photorealistic 2K model originally made available under TurboSquid 3D Model License.

+

Citation

+
@article{Laine2020diffrast,
+  title   = {Modular Primitives for High-Performance Differentiable Rendering},
+  author  = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila},
+  journal = {ACM Transactions on Graphics},
+  year    = {2020},
+  volume  = {39},
+  number  = {6}
+}
+

Acknowledgements

+

We thank David Luebke, Simon Yuen, Jaewoo Seo, Tero Kuosmanen, Sanja Fidler, Wenzheng Chen, Jacob Munkberg, Jon Hasselgren, and Onni Kosomaa for discussions, test data, support with compute infrastructure, testing, reviewing, and suggestions for features and improvements.

+
+  +
+ + + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/PKG-INFO b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/PKG-INFO new file mode 100644 index 00000000..59be9910 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/PKG-INFO @@ -0,0 +1,56 @@ +Metadata-Version: 2.1 +Name: nvdiffrast +Version: 0.3.3 +Summary: nvdiffrast - modular primitives for high-performance differentiable rendering +Home-page: https://github.com/NVlabs/nvdiffrast +Author: Samuli Laine +Author-email: slaine@nvidia.com +Classifier: Programming Language :: Python :: 3 +Classifier: Operating System :: OS Independent +Requires-Python: >=3.6 +Description-Content-Type: text/markdown +License-File: LICENSE.txt +Requires-Dist: numpy + +## Nvdiffrast – Modular Primitives for High-Performance Differentiable Rendering + +![Teaser image](./docs/img/teaser.png) + +**Modular Primitives for High-Performance Differentiable Rendering**
+Samuli Laine, Janne Hellsten, Tero Karras, Yeongho Seol, Jaakko Lehtinen, Timo Aila
+[http://arxiv.org/abs/2011.03277](http://arxiv.org/abs/2011.03277) + +Nvdiffrast is a PyTorch/TensorFlow library that provides high-performance primitive operations for rasterization-based differentiable rendering. +Please refer to ☞☞ [nvdiffrast documentation](https://nvlabs.github.io/nvdiffrast) ☜☜ for more information. + +## Licenses + +Copyright © 2020–2024, NVIDIA Corporation. All rights reserved. + +This work is made available under the [Nvidia Source Code License](https://github.com/NVlabs/nvdiffrast/blob/main/LICENSE.txt). + +For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/) + +We do not currently accept outside code contributions in the form of pull requests. + +Environment map stored as part of `samples/data/envphong.npz` is derived from a Wave Engine +[sample material](https://github.com/WaveEngine/Samples-2.5/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap) +originally shared under +[MIT License](https://github.com/WaveEngine/Samples-2.5/blob/master/LICENSE.md). +Mesh and texture stored as part of `samples/data/earth.npz` are derived from +[3D Earth Photorealistic 2K](https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125) +model originally made available under +[TurboSquid 3D Model License](https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license). 
+ +## Citation + +``` +@article{Laine2020diffrast, + title = {Modular Primitives for High-Performance Differentiable Rendering}, + author = {Samuli Laine and Janne Hellsten and Tero Karras and Yeongho Seol and Jaakko Lehtinen and Timo Aila}, + journal = {ACM Transactions on Graphics}, + year = {2020}, + volume = {39}, + number = {6} +} +``` diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/SOURCES.txt b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/SOURCES.txt new file mode 100644 index 00000000..60e72532 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/SOURCES.txt @@ -0,0 +1,60 @@ +LICENSE.txt +README.md +setup.py +nvdiffrast/__init__.py +nvdiffrast.egg-info/PKG-INFO +nvdiffrast.egg-info/SOURCES.txt +nvdiffrast.egg-info/dependency_links.txt +nvdiffrast.egg-info/requires.txt +nvdiffrast.egg-info/top_level.txt +nvdiffrast/common/antialias.cu +nvdiffrast/common/antialias.h +nvdiffrast/common/common.cpp +nvdiffrast/common/common.h +nvdiffrast/common/framework.h +nvdiffrast/common/glutil.cpp +nvdiffrast/common/glutil.h +nvdiffrast/common/glutil_extlist.h +nvdiffrast/common/interpolate.cu +nvdiffrast/common/interpolate.h +nvdiffrast/common/rasterize.cu +nvdiffrast/common/rasterize.h +nvdiffrast/common/rasterize_gl.cpp +nvdiffrast/common/rasterize_gl.h +nvdiffrast/common/texture.cpp +nvdiffrast/common/texture.cu +nvdiffrast/common/texture.h +nvdiffrast/common/cudaraster/CudaRaster.hpp +nvdiffrast/common/cudaraster/impl/BinRaster.inl +nvdiffrast/common/cudaraster/impl/Buffer.cpp +nvdiffrast/common/cudaraster/impl/Buffer.hpp +nvdiffrast/common/cudaraster/impl/CoarseRaster.inl +nvdiffrast/common/cudaraster/impl/Constants.hpp +nvdiffrast/common/cudaraster/impl/CudaRaster.cpp +nvdiffrast/common/cudaraster/impl/Defs.hpp +nvdiffrast/common/cudaraster/impl/FineRaster.inl +nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp +nvdiffrast/common/cudaraster/impl/RasterImpl.cpp 
+nvdiffrast/common/cudaraster/impl/RasterImpl.cu +nvdiffrast/common/cudaraster/impl/RasterImpl.hpp +nvdiffrast/common/cudaraster/impl/TriangleSetup.inl +nvdiffrast/common/cudaraster/impl/Util.inl +nvdiffrast/tensorflow/__init__.py +nvdiffrast/tensorflow/ops.py +nvdiffrast/tensorflow/plugin_loader.py +nvdiffrast/tensorflow/tf_all.cu +nvdiffrast/tensorflow/tf_antialias.cu +nvdiffrast/tensorflow/tf_interpolate.cu +nvdiffrast/tensorflow/tf_rasterize.cu +nvdiffrast/tensorflow/tf_texture.cu +nvdiffrast/torch/__init__.py +nvdiffrast/torch/ops.py +nvdiffrast/torch/torch_antialias.cpp +nvdiffrast/torch/torch_bindings.cpp +nvdiffrast/torch/torch_bindings_gl.cpp +nvdiffrast/torch/torch_common.inl +nvdiffrast/torch/torch_interpolate.cpp +nvdiffrast/torch/torch_rasterize.cpp +nvdiffrast/torch/torch_rasterize_gl.cpp +nvdiffrast/torch/torch_texture.cpp +nvdiffrast/torch/torch_types.h \ No newline at end of file diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/dependency_links.txt b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/dependency_links.txt new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/requires.txt b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/requires.txt new file mode 100644 index 00000000..24ce15ab --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/requires.txt @@ -0,0 +1 @@ +numpy diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/top_level.txt b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/top_level.txt new file mode 100644 index 00000000..1f7ac63e --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast.egg-info/top_level.txt @@ -0,0 +1 @@ +nvdiffrast diff --git 
a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/__init__.py new file mode 100644 index 00000000..fd28a087 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +__version__ = '0.3.3' diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/antialias.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/antialias.cu new file mode 100644 index 00000000..95cc3bab --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/antialias.cu @@ -0,0 +1,558 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "antialias.h" + +//------------------------------------------------------------------------ +// Helpers. 
+ +#define F32_MAX (3.402823466e+38f) +static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; } +static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); } +static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2) +{ + bool g10 = rational_gt(n1, n0, d1, d0); + bool g20 = rational_gt(n2, n0, d2, d0); + bool g21 = rational_gt(n2, n1, d2, d1); + if (g20 && g21) return 2; + if (g10) return 1; + return 0; +} + +//------------------------------------------------------------------------ +// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4. + +struct AAWorkItem +{ + enum + { + EDGE_MASK = 3, // Edge index in lowest bits. + FLAG_DOWN_BIT = 2, // Down instead of right. + FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle. + }; + + int px, py; // Pixel x, y. + unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags. + float alpha; // Antialiasing alpha value. Zero if no AA. +}; + +//------------------------------------------------------------------------ +// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html + +#define JENKINS_MAGIC (0x9e3779b9u) +static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c) +{ + a -= b; a -= c; a ^= (c>>13); + b -= c; b -= a; b ^= (a<<8); + c -= a; c -= b; c ^= (b>>13); + a -= b; a -= c; a ^= (c>>12); + b -= c; b -= a; b ^= (a<<16); + c -= a; c -= b; c ^= (b>>5); + a -= b; a -= c; a ^= (c>>3); + b -= c; b -= a; b ^= (a<<10); + c -= a; c -= b; c ^= (b>>15); +} + +// Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip. 
+class HashIndex +{ +public: + __device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key) + { + m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824. + m_idx = (uint32_t)(key & 0xffffffffu); + m_skip = (uint32_t)(key >> 32); + uint32_t dummy = JENKINS_MAGIC; + jenkins_mix(m_idx, m_skip, dummy); + m_idx &= m_mask; + m_skip &= m_mask; + m_skip |= 1; + } + __device__ __forceinline__ int get(void) const { return m_idx; } + __device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; } +private: + uint32_t m_idx, m_skip, m_mask; +}; + +static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v) +{ + HashIndex idx(p, key); + while(1) + { + uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key); + if (prev == 0 || prev == key) + break; + idx.next(); + } + int* q = (int*)&p.evHash[idx.get()]; + int a = atomicCAS(q+2, 0, v); + if (a != 0 && a != v) + atomicCAS(q+3, 0, v); +} + +static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key) +{ + HashIndex idx(p, key); + while(1) + { + uint4 entry = p.evHash[idx.get()]; + uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32); + if (k == key || k == 0) + return make_int2((int)entry.z, (int)entry.w); + idx.next(); + } +} + +static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn) +{ + if (va == vb) + return; + + uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order + uint64_t v1 = (uint32_t)max(va, vb) + 1; + uint64_t vk = v0 | (v1 << 32); // hash key + hash_insert(p, vk, vn + 1); +} + +static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr) +{ + if (va == vb) + return -1; + + uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex 
order + uint64_t v1 = (uint32_t)max(va, vb) + 1; + uint64_t vk = v0 | (v1 << 32); // hash key + int2 vn = hash_find(p, vk) - 1; + if (vn.x == vr) return vn.y; + if (vn.y == vr) return vn.x; + return -1; +} + +//------------------------------------------------------------------------ +// Mesh analysis kernel. + +__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p) +{ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= p.numTriangles) + return; + + int v0 = p.tri[idx * 3 + 0]; + int v1 = p.tri[idx * 3 + 1]; + int v2 = p.tri[idx * 3 + 2]; + + if (v0 < 0 || v0 >= p.numVertices || + v1 < 0 || v1 >= p.numVertices || + v2 < 0 || v2 >= p.numVertices) + return; + + if (v0 == v1 || v1 == v2 || v2 == v0) + return; + + evhash_insert_vertex(p, v1, v2, v0); + evhash_insert_vertex(p, v2, v0, v1); + evhash_insert_vertex(p, v0, v1, v2); +} + +//------------------------------------------------------------------------ +// Discontinuity finder kernel. + +__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x; + int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.n) + return; + + // Pointer to our TriIdx and fetch. + int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3; + float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other. + + // Look right, clamp at edge. + int pidx1 = pidx0; + if (px < p.width - 1) + pidx1 += 4; + float tri1 = p.rasterOut[pidx1]; + + // Look down, clamp at edge. + int pidx2 = pidx0; + if (py < p.height - 1) + pidx2 += p.width << 2; + float tri2 = p.rasterOut[pidx2]; + + // Determine amount of work. + int count = 0; + if (tri1 != tri0) count = 1; + if (tri2 != tri0) count += 1; + if (!count) + return; // Exit warp. 
+ + // Coalesce work counter update to once per CTA. + __shared__ int s_temp; + s_temp = 0; + __syncthreads(); + int idx = atomicAdd(&s_temp, count); + __syncthreads(); + if (idx == 0) + { + int base = atomicAdd(&p.workBuffer[0].x, s_temp); + s_temp = base + 1; // don't clobber the counters in first slot. + } + __syncthreads(); + idx += s_temp; + + // Write to memory. + if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0); + if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0); +} + +//------------------------------------------------------------------------ +// Forward analysis kernel. + +__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p) +{ + __shared__ int s_base; + int workCount = p.workBuffer[0].x; + for(;;) + { + // Persistent threads work fetcher. + __syncthreads(); + if (threadIdx.x == 0) + s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK); + __syncthreads(); + int thread_idx = s_base + threadIdx.x; + if (thread_idx >= workCount) + return; + + int4* pItem = p.workBuffer + thread_idx + 1; + int4 item = *pItem; + int px = item.x; + int py = item.y; + int pz = (int)(((unsigned int)item.z) >> 16); + int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; + + int pixel0 = px + p.width * (py + p.height * pz); + int pixel1 = pixel0 + (d ? p.width : 1); + float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1]; + float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1]; + int tri0 = float_to_triidx(zt0.y) - 1; + int tri1 = float_to_triidx(zt1.y) - 1; + + // Select triangle based on background / depth. + int tri = (tri0 >= 0) ? tri0 : tri1; + if (tri0 >= 0 && tri1 >= 0) + tri = (zt0.x < zt1.x) ? tri0 : tri1; + if (tri == tri1) + { + // Calculate with respect to neighbor pixel if chose that triangle. + px += 1 - d; + py += d; + } + + // Bail out if triangle index is corrupt. + if (tri < 0 || tri >= p.numTriangles) + continue; + + // Fetch vertex indices. 
+ int vi0 = p.tri[tri * 3 + 0]; + int vi1 = p.tri[tri * 3 + 1]; + int vi2 = p.tri[tri * 3 + 2]; + + // Bail out if vertex indices are corrupt. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + continue; + + // Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists. + int op0 = evhash_find_vertex(p, vi2, vi1, vi0); + int op1 = evhash_find_vertex(p, vi0, vi2, vi1); + int op2 = evhash_find_vertex(p, vi1, vi0, vi2); + + // Instance mode: Adjust vertex indices based on minibatch index. + if (p.instance_mode) + { + int vbase = pz * p.numVertices; + vi0 += vbase; + vi1 += vbase; + vi2 += vbase; + if (op0 >= 0) op0 += vbase; + if (op1 >= 0) op1 += vbase; + if (op2 >= 0) op2 += vbase; + } + + // Fetch vertex positions. + float4 p0 = ((float4*)p.pos)[vi0]; + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0]; + float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1]; + float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2]; + + // Project vertices to pixel space. + float w0 = 1.f / p0.w; + float w1 = 1.f / p1.w; + float w2 = 1.f / p2.w; + float ow0 = 1.f / o0.w; + float ow1 = 1.f / o1.w; + float ow2 = 1.f / o2.w; + float fx = (float)px + .5f - p.xh; + float fy = (float)py + .5f - p.yh; + float x0 = p0.x * w0 * p.xh - fx; + float y0 = p0.y * w0 * p.yh - fy; + float x1 = p1.x * w1 * p.xh - fx; + float y1 = p1.y * w1 * p.yh - fy; + float x2 = p2.x * w2 * p.xh - fx; + float y2 = p2.y * w2 * p.yh - fy; + float ox0 = o0.x * ow0 * p.xh - fx; + float oy0 = o0.y * ow0 * p.yh - fy; + float ox1 = o1.x * ow1 * p.xh - fx; + float oy1 = o1.y * ow1 * p.yh - fy; + float ox2 = o2.x * ow2 * p.xh - fx; + float oy2 = o2.y * ow2 * p.yh - fy; + + // Signs to kill non-silhouette edges. + float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself. + float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings. 
+ float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1); + float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2); + + // If no matching signs anywhere, skip the rest. + if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb)) + { + // XY flip for horizontal edges. + if (d) + { + swap(x0, y0); + swap(x1, y1); + swap(x2, y2); + } + + float dx0 = x2 - x1; + float dx1 = x0 - x2; + float dx2 = x1 - x0; + float dy0 = y2 - y1; + float dy1 = y0 - y2; + float dy2 = y1 - y0; + + // Check if an edge crosses between us and the neighbor pixel. + float dc = -F32_MAX; + float ds = (tri == tri0) ? 1.f : -1.f; + float d0 = ds * (x1*dy0 - y1*dx0); + float d1 = ds * (x2*dy1 - y2*dx1); + float d2 = ds * (x0*dy2 - y0*dx2); + + if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f; + if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f; + if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f; + + int di = max_idx3(d0, d1, d2, dy0, dy1, dy2); + if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0; + if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1; + if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2; + float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy. + + // Adjust output image if a suitable edge was found. + if (dc > -eps && dc < 1.f + eps) + { + dc = fminf(fmaxf(dc, 0.f), 1.f); + float alpha = ds * (.5f - dc); + const float* pColor0 = p.color + pixel0 * p.channels; + const float* pColor1 = p.color + pixel1 * p.channels; + float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels; + for (int i=0; i < p.channels; i++) + atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i])); + + // Rewrite the work item's flags and alpha. Keep original px, py. 
+ unsigned int flags = pz << 16; + flags |= di; + flags |= d << AAWorkItem::FLAG_DOWN_BIT; + flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT; + ((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha)); + } + } + } +} + +//------------------------------------------------------------------------ +// Gradient kernel. + +__global__ void AntialiasGradKernel(const AntialiasKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK); + __shared__ int s_base; // Work counter communication across entire CTA. + + int workCount = p.workBuffer[0].x; + + for(;;) + { + // Persistent threads work fetcher. + __syncthreads(); + if (threadIdx.x == 0) + s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK); + __syncthreads(); + int thread_idx = s_base + threadIdx.x; + if (thread_idx >= workCount) + return; + + // Read work item filled out by forward kernel. + int4 item = p.workBuffer[thread_idx + 1]; + unsigned int amask = __ballot_sync(0xffffffffu, item.w); + if (item.w == 0) + continue; // No effect. + + // Unpack work item and replicate setup from forward analysis kernel. + int px = item.x; + int py = item.y; + int pz = (int)(((unsigned int)item.z) >> 16); + int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1; + float alpha = __int_as_float(item.w); + int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1; + int di = item.z & AAWorkItem::EDGE_MASK; + float ds = __int_as_float(__float_as_int(1.0) | (tri1 << 31)); + int pixel0 = px + p.width * (py + p.height * pz); + int pixel1 = pixel0 + (d ? p.width : 1); + int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1; + if (tri1) + { + px += 1 - d; + py += d; + } + + // Bail out if triangle index is corrupt. + bool triFail = (tri < 0 || tri >= p.numTriangles); + amask = __ballot_sync(amask, !triFail); + if (triFail) + continue; + + // Outgoing color gradients. 
+ float* pGrad0 = p.gradColor + pixel0 * p.channels; + float* pGrad1 = p.gradColor + pixel1 * p.channels; + + // Incoming color gradients. + const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels; + + // Position gradient weight based on colors and incoming gradients. + float dd = 0.f; + const float* pColor0 = p.color + pixel0 * p.channels; + const float* pColor1 = p.color + pixel1 * p.channels; + + // Loop over channels and accumulate. + for (int i=0; i < p.channels; i++) + { + float dy = pDy[i]; + if (dy != 0.f) + { + // Update position gradient weight. + dd += dy * (pColor1[i] - pColor0[i]); + + // Update color gradients. No coalescing because all have different targets. + float v = alpha * dy; + atomicAdd(&pGrad0[i], -v); + atomicAdd(&pGrad1[i], v); + } + } + + // If position weight is zero, skip the rest. + bool noGrad = (dd == 0.f); + amask = __ballot_sync(amask, !noGrad); + if (noGrad) + continue; + + // Fetch vertex indices of the active edge and their positions. + int i1 = (di < 2) ? (di + 1) : 0; + int i2 = (i1 < 2) ? (i1 + 1) : 0; + int vi1 = p.tri[3 * tri + i1]; + int vi2 = p.tri[3 * tri + i2]; + + // Bail out if vertex indices are corrupt. + bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices); + amask = __ballot_sync(amask, !vtxFail); + if (vtxFail) + continue; + + // Instance mode: Adjust vertex indices based on minibatch index. + if (p.instance_mode) + { + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Fetch vertex positions. + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + + // Project vertices to pixel space. + float pxh = p.xh; + float pyh = p.yh; + float fx = (float)px + .5f - pxh; + float fy = (float)py + .5f - pyh; + + // XY flip for horizontal edges. + if (d) + { + swap(p1.x, p1.y); + swap(p2.x, p2.y); + swap(pxh, pyh); + swap(fx, fy); + } + + // Gradient calculation setup. 
+ float w1 = 1.f / p1.w; + float w2 = 1.f / p2.w; + float x1 = p1.x * w1 * pxh - fx; + float y1 = p1.y * w1 * pyh - fy; + float x2 = p2.x * w2 * pxh - fx; + float y2 = p2.y * w2 * pyh - fy; + float dx = x2 - x1; + float dy = y2 - y1; + float db = x1*dy - y1*dx; + + // Compute inverse delta-y with epsilon. + float ep = copysignf(1e-3f, dy); // ~1/1000 pixel. + float iy = 1.f / (dy + ep); + + // Compute position gradients. + float dby = db * iy; + float iw1 = -w1 * iy * dd; + float iw2 = w2 * iy * dd; + float gp1x = iw1 * pxh * y2; + float gp2x = iw2 * pxh * y1; + float gp1y = iw1 * pyh * (dby - x2); + float gp2y = iw2 * pyh * (dby - x1); + float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1; + float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2; + + // XY flip the gradients. + if (d) + { + swap(gp1x, gp1y); + swap(gp2x, gp2y); + } + + // Kill position gradients if alpha was saturated. + if (fabsf(alpha) >= 0.5f) + { + gp1x = gp1y = gp1w = 0.f; + gp2x = gp2y = gp2w = 0.f; + } + + // Initialize coalesced atomics. Match both triangle ID and edge index. + // Also note that some threads may be inactive. + CA_SET_GROUP_MASK(tri ^ (di << 30), amask); + + // Accumulate gradients. + caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w); + caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w); + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/antialias.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/antialias.h new file mode 100644 index 00000000..a324f2f2 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/antialias.h @@ -0,0 +1,50 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "common.h" + +//------------------------------------------------------------------------ +// Constants and helpers. + +#define AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH 32 +#define AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT 8 +#define AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK 256 +#define AA_MESH_KERNEL_THREADS_PER_BLOCK 256 +#define AA_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 4 : 8) // With more than 16777216 triangles (alloc >= 33554432) use smallest possible value of 4 to conserve memory, otherwise use 8 for fewer collisions. +#define AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(alloc) ((alloc) >= (2 << 25) ? 2 : 3) +#define AA_GRAD_KERNEL_THREADS_PER_BLOCK 256 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct AntialiasKernelParams +{ + const float* color; // Incoming color buffer. + const float* rasterOut; // Incoming rasterizer output buffer. + const int* tri; // Incoming triangle buffer. + const float* pos; // Incoming position buffer. + float* output; // Output buffer of forward kernel. + const float* dy; // Incoming gradients. + float* gradColor; // Output buffer, color gradient. + float* gradPos; // Output buffer, position gradient. + int4* workBuffer; // Buffer for storing intermediate work items. First item reserved for counters. + uint4* evHash; // Edge-vertex hash. + int allocTriangles; // Number of triangles accommodated by evHash. Always power of two. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int width; // Input width. + int height; // Input height. + int n; // Minibatch size. + int channels; // Channel count in color input. + float xh, yh; // Transfer to pixel space. + int instance_mode; // 0=normal, 1=instance mode. 
+ int tri_const; // 1 if triangle array is known to be constant. +}; + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/common.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/common.cpp new file mode 100644 index 00000000..e566c035 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/common.cpp @@ -0,0 +1,60 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include + +//------------------------------------------------------------------------ +// Block and grid size calculators for kernel launches. + +dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height) +{ + int maxThreads = maxWidth * maxHeight; + if (maxThreads <= 1 || (width * height) <= 1) + return dim3(1, 1, 1); // Degenerate. + + // Start from max size. + int bw = maxWidth; + int bh = maxHeight; + + // Optimizations for weirdly sized buffers. + if (width < bw) + { + // Decrease block width to smallest power of two that covers the buffer width. + while ((bw >> 1) >= width) + bw >>= 1; + + // Maximize height. + bh = maxThreads / bw; + if (bh > height) + bh = height; + } + else if (height < bh) + { + // Halve height and double width until fits completely inside buffer vertically. + while (bh > height) + { + bh >>= 1; + if (bw < width) + bw <<= 1; + } + } + + // Done. 
+ return dim3(bw, bh, 1); +} + +dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth) +{ + dim3 gridSize; + gridSize.x = (width - 1) / blockSize.x + 1; + gridSize.y = (height - 1) / blockSize.y + 1; + gridSize.z = (depth - 1) / blockSize.z + 1; + return gridSize; +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/common.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/common.h new file mode 100644 index 00000000..01ecf9fc --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/common.h @@ -0,0 +1,263 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include +#include + +//------------------------------------------------------------------------ +// C++ helper function prototypes. + +dim3 getLaunchBlockSize(int maxWidth, int maxHeight, int width, int height); +dim3 getLaunchGridSize(dim3 blockSize, int width, int height, int depth); + +//------------------------------------------------------------------------ +// The rest is CUDA device code specific stuff. + +#ifdef __CUDACC__ + +//------------------------------------------------------------------------ +// Helpers for CUDA vector types. 
+ +static __device__ __forceinline__ float2& operator*= (float2& a, const float2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ float2& operator+= (float2& a, const float2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ float2& operator-= (float2& a, const float2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ float2& operator*= (float2& a, float b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ float2& operator+= (float2& a, float b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ float2& operator-= (float2& a, float b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ float2 operator* (const float2& a, const float2& b) { return make_float2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ float2 operator+ (const float2& a, const float2& b) { return make_float2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ float2 operator- (const float2& a, const float2& b) { return make_float2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ float2 operator* (const float2& a, float b) { return make_float2(a.x * b, a.y * b); } +static __device__ __forceinline__ float2 operator+ (const float2& a, float b) { return make_float2(a.x + b, a.y + b); } +static __device__ __forceinline__ float2 operator- (const float2& a, float b) { return make_float2(a.x - b, a.y - b); } +static __device__ __forceinline__ float2 operator* (float a, const float2& b) { return make_float2(a * b.x, a * b.y); } +static __device__ __forceinline__ float2 operator+ (float a, const float2& b) { return make_float2(a + b.x, a + b.y); } +static __device__ __forceinline__ float2 operator- (float a, const float2& b) { return make_float2(a - b.x, a - b.y); } +static __device__ __forceinline__ float2 operator- (const float2& a) { return make_float2(-a.x, -a.y); } +static __device__ __forceinline__ float3& operator*= (float3& a, const 
float3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ float3& operator+= (float3& a, const float3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ float3& operator-= (float3& a, const float3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ float3& operator*= (float3& a, float b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ float3& operator+= (float3& a, float b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ float3& operator-= (float3& a, float b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ float3 operator* (const float3& a, const float3& b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ float3 operator+ (const float3& a, const float3& b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ float3 operator- (const float3& a, const float3& b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ float3 operator* (const float3& a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ float3 operator+ (const float3& a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ float3 operator- (const float3& a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ float3 operator* (float a, const float3& b) { return make_float3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ float3 operator+ (float a, const float3& b) { return make_float3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ float3 operator- (float a, const float3& b) { return make_float3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ float3 operator- (const float3& a) { return make_float3(-a.x, -a.y, 
-a.z); } +static __device__ __forceinline__ float4& operator*= (float4& a, const float4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ float4& operator+= (float4& a, const float4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ float4& operator-= (float4& a, const float4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ float4& operator*= (float4& a, float b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ float4& operator+= (float4& a, float b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ float4& operator-= (float4& a, float b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ float4 operator* (const float4& a, const float4& b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ float4 operator+ (const float4& a, const float4& b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ float4 operator- (const float4& a, const float4& b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ float4 operator* (const float4& a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ float4 operator+ (const float4& a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ float4 operator- (const float4& a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ float4 operator* (float a, const float4& b) { return make_float4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ float4 operator+ (float a, const float4& b) { return make_float4(a + b.x, a + b.y, a + b.z, a + b.w); } +static 
__device__ __forceinline__ float4 operator- (float a, const float4& b) { return make_float4(a - b.x, a - b.y, a - b.z, a - b.w); } +static __device__ __forceinline__ float4 operator- (const float4& a) { return make_float4(-a.x, -a.y, -a.z, -a.w); } +static __device__ __forceinline__ int2& operator*= (int2& a, const int2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ int2& operator+= (int2& a, const int2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ int2& operator-= (int2& a, const int2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ int2& operator*= (int2& a, int b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ int2& operator+= (int2& a, int b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ int2& operator-= (int2& a, int b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ int2 operator* (const int2& a, const int2& b) { return make_int2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ int2 operator+ (const int2& a, const int2& b) { return make_int2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ int2 operator- (const int2& a, const int2& b) { return make_int2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ int2 operator* (const int2& a, int b) { return make_int2(a.x * b, a.y * b); } +static __device__ __forceinline__ int2 operator+ (const int2& a, int b) { return make_int2(a.x + b, a.y + b); } +static __device__ __forceinline__ int2 operator- (const int2& a, int b) { return make_int2(a.x - b, a.y - b); } +static __device__ __forceinline__ int2 operator* (int a, const int2& b) { return make_int2(a * b.x, a * b.y); } +static __device__ __forceinline__ int2 operator+ (int a, const int2& b) { return make_int2(a + b.x, a + b.y); } +static __device__ __forceinline__ int2 operator- (int a, const int2& b) { return make_int2(a - b.x, a - b.y); } +static __device__ __forceinline__ int2 
operator- (const int2& a) { return make_int2(-a.x, -a.y); } +static __device__ __forceinline__ int3& operator*= (int3& a, const int3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ int3& operator+= (int3& a, const int3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ int3& operator-= (int3& a, const int3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ int3& operator*= (int3& a, int b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ int3& operator+= (int3& a, int b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ int3& operator-= (int3& a, int b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ int3 operator* (const int3& a, const int3& b) { return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ int3 operator+ (const int3& a, const int3& b) { return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ int3 operator- (const int3& a, const int3& b) { return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ int3 operator* (const int3& a, int b) { return make_int3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ int3 operator+ (const int3& a, int b) { return make_int3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ int3 operator- (const int3& a, int b) { return make_int3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ int3 operator* (int a, const int3& b) { return make_int3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ int3 operator+ (int a, const int3& b) { return make_int3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ int3 operator- (int a, const int3& b) { return make_int3(a - b.x, a - b.y, a - b.z); } +static __device__ __forceinline__ int3 operator- (const int3& a) { return 
make_int3(-a.x, -a.y, -a.z); } +static __device__ __forceinline__ int4& operator*= (int4& a, const int4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ int4& operator+= (int4& a, const int4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ int4& operator-= (int4& a, const int4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ int4& operator*= (int4& a, int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ int4& operator+= (int4& a, int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ int4& operator-= (int4& a, int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ int4 operator* (const int4& a, const int4& b) { return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ int4 operator+ (const int4& a, const int4& b) { return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ int4 operator- (const int4& a, const int4& b) { return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ int4 operator* (const int4& a, int b) { return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ int4 operator+ (const int4& a, int b) { return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ int4 operator- (const int4& a, int b) { return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ int4 operator* (int a, const int4& b) { return make_int4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ int4 operator+ (int a, const int4& b) { return make_int4(a + b.x, a + b.y, a + b.z, a + b.w); } +static __device__ __forceinline__ int4 operator- (int a, const int4& b) { return 
make_int4(a - b.x, a - b.y, a - b.z, a - b.w); } +static __device__ __forceinline__ int4 operator- (const int4& a) { return make_int4(-a.x, -a.y, -a.z, -a.w); } +static __device__ __forceinline__ uint2& operator*= (uint2& a, const uint2& b) { a.x *= b.x; a.y *= b.y; return a; } +static __device__ __forceinline__ uint2& operator+= (uint2& a, const uint2& b) { a.x += b.x; a.y += b.y; return a; } +static __device__ __forceinline__ uint2& operator-= (uint2& a, const uint2& b) { a.x -= b.x; a.y -= b.y; return a; } +static __device__ __forceinline__ uint2& operator*= (uint2& a, unsigned int b) { a.x *= b; a.y *= b; return a; } +static __device__ __forceinline__ uint2& operator+= (uint2& a, unsigned int b) { a.x += b; a.y += b; return a; } +static __device__ __forceinline__ uint2& operator-= (uint2& a, unsigned int b) { a.x -= b; a.y -= b; return a; } +static __device__ __forceinline__ uint2 operator* (const uint2& a, const uint2& b) { return make_uint2(a.x * b.x, a.y * b.y); } +static __device__ __forceinline__ uint2 operator+ (const uint2& a, const uint2& b) { return make_uint2(a.x + b.x, a.y + b.y); } +static __device__ __forceinline__ uint2 operator- (const uint2& a, const uint2& b) { return make_uint2(a.x - b.x, a.y - b.y); } +static __device__ __forceinline__ uint2 operator* (const uint2& a, unsigned int b) { return make_uint2(a.x * b, a.y * b); } +static __device__ __forceinline__ uint2 operator+ (const uint2& a, unsigned int b) { return make_uint2(a.x + b, a.y + b); } +static __device__ __forceinline__ uint2 operator- (const uint2& a, unsigned int b) { return make_uint2(a.x - b, a.y - b); } +static __device__ __forceinline__ uint2 operator* (unsigned int a, const uint2& b) { return make_uint2(a * b.x, a * b.y); } +static __device__ __forceinline__ uint2 operator+ (unsigned int a, const uint2& b) { return make_uint2(a + b.x, a + b.y); } +static __device__ __forceinline__ uint2 operator- (unsigned int a, const uint2& b) { return make_uint2(a - b.x, a - b.y); } 
+static __device__ __forceinline__ uint3& operator*= (uint3& a, const uint3& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; return a; } +static __device__ __forceinline__ uint3& operator+= (uint3& a, const uint3& b) { a.x += b.x; a.y += b.y; a.z += b.z; return a; } +static __device__ __forceinline__ uint3& operator-= (uint3& a, const uint3& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; return a; } +static __device__ __forceinline__ uint3& operator*= (uint3& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; return a; } +static __device__ __forceinline__ uint3& operator+= (uint3& a, unsigned int b) { a.x += b; a.y += b; a.z += b; return a; } +static __device__ __forceinline__ uint3& operator-= (uint3& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; return a; } +static __device__ __forceinline__ uint3 operator* (const uint3& a, const uint3& b) { return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); } +static __device__ __forceinline__ uint3 operator+ (const uint3& a, const uint3& b) { return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); } +static __device__ __forceinline__ uint3 operator- (const uint3& a, const uint3& b) { return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); } +static __device__ __forceinline__ uint3 operator* (const uint3& a, unsigned int b) { return make_uint3(a.x * b, a.y * b, a.z * b); } +static __device__ __forceinline__ uint3 operator+ (const uint3& a, unsigned int b) { return make_uint3(a.x + b, a.y + b, a.z + b); } +static __device__ __forceinline__ uint3 operator- (const uint3& a, unsigned int b) { return make_uint3(a.x - b, a.y - b, a.z - b); } +static __device__ __forceinline__ uint3 operator* (unsigned int a, const uint3& b) { return make_uint3(a * b.x, a * b.y, a * b.z); } +static __device__ __forceinline__ uint3 operator+ (unsigned int a, const uint3& b) { return make_uint3(a + b.x, a + b.y, a + b.z); } +static __device__ __forceinline__ uint3 operator- (unsigned int a, const uint3& b) { return make_uint3(a - b.x, a - b.y, a - b.z); } +static 
__device__ __forceinline__ uint4& operator*= (uint4& a, const uint4& b) { a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w; return a; } +static __device__ __forceinline__ uint4& operator+= (uint4& a, const uint4& b) { a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w; return a; } +static __device__ __forceinline__ uint4& operator-= (uint4& a, const uint4& b) { a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w; return a; } +static __device__ __forceinline__ uint4& operator*= (uint4& a, unsigned int b) { a.x *= b; a.y *= b; a.z *= b; a.w *= b; return a; } +static __device__ __forceinline__ uint4& operator+= (uint4& a, unsigned int b) { a.x += b; a.y += b; a.z += b; a.w += b; return a; } +static __device__ __forceinline__ uint4& operator-= (uint4& a, unsigned int b) { a.x -= b; a.y -= b; a.z -= b; a.w -= b; return a; } +static __device__ __forceinline__ uint4 operator* (const uint4& a, const uint4& b) { return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } +static __device__ __forceinline__ uint4 operator+ (const uint4& a, const uint4& b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __device__ __forceinline__ uint4 operator- (const uint4& a, const uint4& b) { return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } +static __device__ __forceinline__ uint4 operator* (const uint4& a, unsigned int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __device__ __forceinline__ uint4 operator+ (const uint4& a, unsigned int b) { return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); } +static __device__ __forceinline__ uint4 operator- (const uint4& a, unsigned int b) { return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); } +static __device__ __forceinline__ uint4 operator* (unsigned int a, const uint4& b) { return make_uint4(a * b.x, a * b.y, a * b.z, a * b.w); } +static __device__ __forceinline__ uint4 operator+ (unsigned int a, const uint4& b) { return make_uint4(a + b.x, a + b.y, a + b.z, a + b.w); } +static 
__device__ __forceinline__ uint4 operator- (unsigned int a, const uint4& b) { return make_uint4(a - b.x, a - b.y, a - b.z, a - b.w); } + +template static __device__ __forceinline__ T zero_value(void); +template<> __device__ __forceinline__ float zero_value (void) { return 0.f; } +template<> __device__ __forceinline__ float2 zero_value(void) { return make_float2(0.f, 0.f); } +template<> __device__ __forceinline__ float4 zero_value(void) { return make_float4(0.f, 0.f, 0.f, 0.f); } +static __device__ __forceinline__ float3 make_float3(const float2& a, float b) { return make_float3(a.x, a.y, b); } +static __device__ __forceinline__ float4 make_float4(const float3& a, float b) { return make_float4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ float4 make_float4(const float2& a, const float2& b) { return make_float4(a.x, a.y, b.x, b.y); } +static __device__ __forceinline__ int3 make_int3(const int2& a, int b) { return make_int3(a.x, a.y, b); } +static __device__ __forceinline__ int4 make_int4(const int3& a, int b) { return make_int4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ int4 make_int4(const int2& a, const int2& b) { return make_int4(a.x, a.y, b.x, b.y); } +static __device__ __forceinline__ uint3 make_uint3(const uint2& a, unsigned int b) { return make_uint3(a.x, a.y, b); } +static __device__ __forceinline__ uint4 make_uint4(const uint3& a, unsigned int b) { return make_uint4(a.x, a.y, a.z, b); } +static __device__ __forceinline__ uint4 make_uint4(const uint2& a, const uint2& b) { return make_uint4(a.x, a.y, b.x, b.y); } + +template static __device__ __forceinline__ void swap(T& a, T& b) { T temp = a; a = b; b = temp; } + +//------------------------------------------------------------------------ +// Triangle ID <-> float32 conversion functions to support very large triangle IDs. +// +// Values up to and including 16777216 (also, negative values) are converted trivially and retain +// compatibility with previous versions. 
Larger values are mapped to unique float32 that are not equal to +// the ID. The largest value that converts to float32 and back without generating inf or nan is 889192447. + +static __device__ __forceinline__ int float_to_triidx(float x) { if (x <= 16777216.f) return (int)x; return __float_as_int(x) - 0x4a800000; } +static __device__ __forceinline__ float triidx_to_float(int x) { if (x <= 0x01000000) return (float)x; return __int_as_float(0x4a800000 + x); } + +//------------------------------------------------------------------------ +// Coalesced atomics. These are all done via macros. + +#if __CUDA_ARCH__ >= 700 // Warp match instruction __match_any_sync() is only available on compute capability 7.x and higher + +#define CA_TEMP _ca_temp +#define CA_TEMP_PARAM float* CA_TEMP +#define CA_DECLARE_TEMP(threads_per_block) \ + __shared__ float CA_TEMP[(threads_per_block)] + +#define CA_SET_GROUP_MASK(group, thread_mask) \ + bool _ca_leader; \ + float* _ca_ptr; \ + do { \ + int tidx = threadIdx.x + blockDim.x * threadIdx.y; \ + int lane = tidx & 31; \ + int warp = tidx >> 5; \ + int tmask = __match_any_sync((thread_mask), (group)); \ + int leader = __ffs(tmask) - 1; \ + _ca_leader = (leader == lane); \ + _ca_ptr = &_ca_temp[((warp << 5) + leader)]; \ + } while(0) + +#define CA_SET_GROUP(group) \ + CA_SET_GROUP_MASK((group), 0xffffffffu) + +#define caAtomicAdd(ptr, value) \ + do { \ + if (_ca_leader) \ + *_ca_ptr = 0.f; \ + atomicAdd(_ca_ptr, (value)); \ + if (_ca_leader) \ + atomicAdd((ptr), *_ca_ptr); \ + } while(0) + +#define caAtomicAdd3_xyw(ptr, x, y, w) \ + do { \ + caAtomicAdd((ptr), (x)); \ + caAtomicAdd((ptr)+1, (y)); \ + caAtomicAdd((ptr)+3, (w)); \ + } while(0) + +#define caAtomicAddTexture(ptr, level, idx, value) \ + do { \ + CA_SET_GROUP((idx) ^ ((level) << 27)); \ + caAtomicAdd((ptr)+(idx), (value)); \ + } while(0) + +//------------------------------------------------------------------------ +// Disable atomic coalescing for compute capability lower than 
7.x + +#else // __CUDA_ARCH__ >= 700 +#define CA_TEMP _ca_temp +#define CA_TEMP_PARAM float CA_TEMP +#define CA_DECLARE_TEMP(threads_per_block) CA_TEMP_PARAM +#define CA_SET_GROUP_MASK(group, thread_mask) +#define CA_SET_GROUP(group) +#define caAtomicAdd(ptr, value) atomicAdd((ptr), (value)) +#define caAtomicAdd3_xyw(ptr, x, y, w) \ + do { \ + atomicAdd((ptr), (x)); \ + atomicAdd((ptr)+1, (y)); \ + atomicAdd((ptr)+3, (w)); \ + } while(0) +#define caAtomicAddTexture(ptr, level, idx, value) atomicAdd((ptr)+(idx), (value)) +#endif // __CUDA_ARCH__ >= 700 + +//------------------------------------------------------------------------ +#endif // __CUDACC__ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp new file mode 100644 index 00000000..3c1c3a7f --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/CudaRaster.hpp @@ -0,0 +1,63 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// This is a slimmed-down and modernized version of the original +// CudaRaster codebase that accompanied the HPG 2011 paper +// "High-Performance Software Rasterization on GPUs" by Laine and Karras. +// Modifications have been made to accommodate post-Volta execution model +// with warp divergence. Support for shading, blending, quad rendering, +// and supersampling have been removed as unnecessary for nvdiffrast. 
+//------------------------------------------------------------------------ + +namespace CR +{ + +class RasterImpl; + +//------------------------------------------------------------------------ +// Interface class to isolate user from implementation details. +//------------------------------------------------------------------------ + +class CudaRaster +{ +public: + enum + { + RenderModeFlag_EnableBackfaceCulling = 1 << 0, // Enable backface culling. + RenderModeFlag_EnableDepthPeeling = 1 << 1, // Enable depth peeling. Must have a peel buffer set. + }; + +public: + CudaRaster (void); + ~CudaRaster (void); + + void setBufferSize (int width, int height, int numImages); // Width and height are internally rounded up to multiples of tile size (8x8) for buffer sizes. + void setViewport (int width, int height, int offsetX, int offsetY); // Tiled rendering viewport setup. + void setRenderModeFlags (unsigned int renderModeFlags); // Affects all subsequent calls to drawTriangles(). Defaults to zero. + void deferredClear (unsigned int clearColor); // Clears color and depth buffers during next call to drawTriangles(). + void setVertexBuffer (void* vertices, int numVertices); // GPU pointer managed by caller. Vertex positions in clip space as float4 (x, y, z, w). + void setIndexBuffer (void* indices, int numTriangles); // GPU pointer managed by caller. Triangle index+color quadruplets as uint4 (idx0, idx1, idx2, color). + bool drawTriangles (const int* ranges, bool peel, cudaStream_t stream); // Ranges (offsets and counts) as #triangles entries, not as bytes. If NULL, draw all triangles. Returns false in case of internal overflow. + void* getColorBuffer (void); // GPU pointer managed by CudaRaster. + void* getDepthBuffer (void); // GPU pointer managed by CudaRaster. + void swapDepthAndPeel (void); // Swap depth and peeling buffers. 
+ +private: + CudaRaster (const CudaRaster&); // forbidden + CudaRaster& operator= (const CudaRaster&); // forbidden + +private: + RasterImpl* m_impl; // Opaque pointer to implementation. +}; + +//------------------------------------------------------------------------ +} // namespace CR + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl new file mode 100644 index 00000000..deae9d2c --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/BinRaster.inl @@ -0,0 +1,423 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +//------------------------------------------------------------------------ + +__device__ __inline__ void binRasterImpl(const CRParams p) +{ + __shared__ volatile U32 s_broadcast [CR_BIN_WARPS + 16]; + __shared__ volatile S32 s_outOfs [CR_MAXBINS_SQR]; + __shared__ volatile S32 s_outTotal [CR_MAXBINS_SQR]; + __shared__ volatile S32 s_overIndex [CR_MAXBINS_SQR]; + __shared__ volatile S32 s_outMask [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions + __shared__ volatile S32 s_outCount [CR_BIN_WARPS][CR_MAXBINS_SQR + 1]; // +1 to avoid bank collisions + __shared__ volatile S32 s_triBuf [CR_BIN_WARPS*32*4]; // triangle ring buffer + __shared__ volatile U32 s_batchPos; + __shared__ volatile U32 s_bufCount; + __shared__ volatile U32 s_overTotal; + __shared__ volatile U32 s_allocBase; + + const CRImageParams& ip = getImageParams(p, blockIdx.z); + CRAtomics& atomics = p.atomics[blockIdx.z]; + const U8* triSubtris = (const U8*)p.triSubtris + p.maxSubtris * blockIdx.z; + const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z; + + S32* binFirstSeg = (S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z; + S32* binTotal = (S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z; + S32* binSegData = (S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z; + S32* binSegNext = (S32*)p.binSegNext + p.maxBinSegs * blockIdx.z; + S32* binSegCount = (S32*)p.binSegCount + p.maxBinSegs * blockIdx.z; + + if (atomics.numSubtris > p.maxSubtris) + return; + + // per-thread state + int thrInBlock = threadIdx.x + threadIdx.y * 32; + int batchPos = 0; + + // first 16 elements of s_broadcast are always zero + if (thrInBlock < 16) + s_broadcast[thrInBlock] = 0; + + // initialize output linked lists and offsets + if (thrInBlock < p.numBins) + { + binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = -1; + s_outOfs[thrInBlock] = -CR_BIN_SEG_SIZE; + s_outTotal[thrInBlock] = 0; + } + + // 
repeat until done + for(;;) + { + // get batch + if (thrInBlock == 0) + s_batchPos = atomicAdd(&atomics.binCounter, ip.binBatchSize); + __syncthreads(); + batchPos = s_batchPos; + + // all batches done? + if (batchPos >= ip.triCount) + break; + + // per-thread state + int bufIndex = 0; + int bufCount = 0; + int batchEnd = min(batchPos + ip.binBatchSize, ip.triCount); + + // loop over batch as long as we have triangles in it + do + { + // read more triangles + while (bufCount < CR_BIN_WARPS*32 && batchPos < batchEnd) + { + // get subtriangle count + + int triIdx = batchPos + thrInBlock; + int num = 0; + if (triIdx < batchEnd) + num = triSubtris[triIdx]; + + // cumulative sum of subtriangles within each warp + U32 myIdx = __popc(__ballot_sync(~0u, num & 1) & getLaneMaskLt()); + if (__any_sync(~0u, num > 1)) + { + myIdx += __popc(__ballot_sync(~0u, num & 2) & getLaneMaskLt()) * 2; + myIdx += __popc(__ballot_sync(~0u, num & 4) & getLaneMaskLt()) * 4; + } + if (threadIdx.x == 31) // Do not assume that last thread in warp wins the write. + s_broadcast[threadIdx.y + 16] = myIdx + num; + __syncthreads(); + + // cumulative sum of per-warp subtriangle counts + // Note: cannot have more than 32 warps or this needs to sync between each step. 
+ bool act = (thrInBlock < CR_BIN_WARPS); + U32 actMask = __ballot_sync(~0u, act); + if (threadIdx.y == 0 && act) + { + volatile U32* ptr = &s_broadcast[thrInBlock + 16]; + U32 val = *ptr; + #if (CR_BIN_WARPS > 1) + val += ptr[-1]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + #if (CR_BIN_WARPS > 2) + val += ptr[-2]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + #if (CR_BIN_WARPS > 4) + val += ptr[-4]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + #if (CR_BIN_WARPS > 8) + val += ptr[-8]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + #if (CR_BIN_WARPS > 16) + val += ptr[-16]; __syncwarp(actMask); + *ptr = val; __syncwarp(actMask); + #endif + + // initially assume that we consume everything + // only last active thread does the writes + if (threadIdx.x == CR_BIN_WARPS - 1) + { + s_batchPos = batchPos + CR_BIN_WARPS * 32; + s_bufCount = bufCount + val; + } + } + __syncthreads(); + + // skip if no subtriangles + if (num) + { + // calculate write position for first subtriangle + U32 pos = bufCount + myIdx + s_broadcast[threadIdx.y + 16 - 1]; + + // only write if entire triangle fits + if (pos + num <= CR_ARRAY_SIZE(s_triBuf)) + { + pos += bufIndex; // adjust for current start position + pos &= CR_ARRAY_SIZE(s_triBuf)-1; + if (num == 1) + s_triBuf[pos] = triIdx * 8 + 7; // single triangle + else + { + for (int i=0; i < num; i++) + { + s_triBuf[pos] = triIdx * 8 + i; + pos++; + pos &= CR_ARRAY_SIZE(s_triBuf)-1; + } + } + } else if (pos <= CR_ARRAY_SIZE(s_triBuf)) + { + // this triangle is the first that failed, overwrite total count and triangle count + s_batchPos = batchPos + thrInBlock; + s_bufCount = pos; + } + } + + // update triangle counts + __syncthreads(); + batchPos = s_batchPos; + bufCount = s_bufCount; + } + + // make every warp clear its output buffers + for (int i=threadIdx.x; i < p.numBins; i += 32) + s_outMask[threadIdx.y][i] = 0; + __syncwarp(); + + // choose our triangle + 
uint4 triData = make_uint4(0, 0, 0, 0); + if (thrInBlock < bufCount) + { + U32 triPos = bufIndex + thrInBlock; + triPos &= CR_ARRAY_SIZE(s_triBuf)-1; + + // find triangle + int triIdx = s_triBuf[triPos]; + int dataIdx = triIdx >> 3; + int subtriIdx = triIdx & 7; + if (subtriIdx != 7) + dataIdx = triHeader[dataIdx].misc + subtriIdx; + + // read triangle + + triData = *(((const uint4*)triHeader) + dataIdx); + } + + // setup bounding box and edge functions, and rasterize + S32 lox, loy, hix, hiy; + bool hasTri = (thrInBlock < bufCount); + U32 hasTriMask = __ballot_sync(~0u, hasTri); + if (hasTri) + { + S32 v0x = add_s16lo_s16lo(triData.x, p.widthPixelsVp * (CR_SUBPIXEL_SIZE >> 1)); + S32 v0y = add_s16hi_s16lo(triData.x, p.heightPixelsVp * (CR_SUBPIXEL_SIZE >> 1)); + S32 d01x = sub_s16lo_s16lo(triData.y, triData.x); + S32 d01y = sub_s16hi_s16hi(triData.y, triData.x); + S32 d02x = sub_s16lo_s16lo(triData.z, triData.x); + S32 d02y = sub_s16hi_s16hi(triData.z, triData.x); + int binLog = CR_BIN_LOG2 + CR_TILE_LOG2 + CR_SUBPIXEL_LOG2; + lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1); + loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1); + hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> binLog, 0, p.widthBins - 1); + hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> binLog, 0, p.heightBins - 1); + + U32 bit = 1 << threadIdx.x; +#if __CUDA_ARCH__ >= 700 + bool multi = (hix != lox || hiy != loy); + if (!__any_sync(hasTriMask, multi)) + { + int binIdx = lox + p.widthBins * loy; + U32 mask = __match_any_sync(hasTriMask, binIdx); + s_outMask[threadIdx.y][binIdx] = mask; + __syncwarp(hasTriMask); + } else +#endif + { + bool complex = (hix > lox+1 || hiy > loy+1); + if (!__any_sync(hasTriMask, complex)) + { + int binIdx = lox + p.widthBins * loy; + atomicOr((U32*)&s_outMask[threadIdx.y][binIdx], bit); + if (hix > lox) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + 1], bit); + if (hiy > loy) 
atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins], bit); + if (hix > lox && hiy > loy) atomicOr((U32*)&s_outMask[threadIdx.y][binIdx + p.widthBins + 1], bit); + } else + { + S32 d12x = d02x - d01x, d12y = d02y - d01y; + v0x -= lox << binLog, v0y -= loy << binLog; + + S32 t01 = v0x * d01y - v0y * d01x; + S32 t02 = v0y * d02x - v0x * d02y; + S32 t12 = d01x * d12y - d01y * d12x - t01 - t02; + S32 b01 = add_sub(t01 >> binLog, max(d01x, 0), min(d01y, 0)); + S32 b02 = add_sub(t02 >> binLog, max(d02y, 0), min(d02x, 0)); + S32 b12 = add_sub(t12 >> binLog, max(d12x, 0), min(d12y, 0)); + + int width = hix - lox + 1; + d01x += width * d01y; + d02x += width * d02y; + d12x += width * d12y; + + U8* currPtr = (U8*)&s_outMask[threadIdx.y][lox + loy * p.widthBins]; + U8* skipPtr = (U8*)&s_outMask[threadIdx.y][(hix + 1) + loy * p.widthBins]; + U8* endPtr = (U8*)&s_outMask[threadIdx.y][lox + (hiy + 1) * p.widthBins]; + int stride = p.widthBins * 4; + int ptrYInc = stride - width * 4; + + do + { + if (b01 >= 0 && b02 >= 0 && b12 >= 0) + atomicOr((U32*)currPtr, bit); + currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y; + if (currPtr == skipPtr) + currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += stride; + } + while (currPtr != endPtr); + } + } + } + + // count per-bin contributions + if (thrInBlock == 0) + s_overTotal = 0; // overflow counter + + // ensure that out masks are done + __syncthreads(); + + int overIndex = -1; + bool act = (thrInBlock < p.numBins); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + U8* srcPtr = (U8*)&s_outMask[0][thrInBlock]; + U8* dstPtr = (U8*)&s_outCount[0][thrInBlock]; + int total = 0; + for (int i = 0; i < CR_BIN_WARPS; i++) + { + total += __popc(*(U32*)srcPtr); + *(U32*)dstPtr = total; + srcPtr += (CR_MAXBINS_SQR + 1) * 4; + dstPtr += (CR_MAXBINS_SQR + 1) * 4; + } + + // overflow => request a new segment + int ofs = s_outOfs[thrInBlock]; + bool ovr = (((ofs - 1) >> CR_BIN_SEG_LOG2) != (((ofs - 1) + total) >> 
CR_BIN_SEG_LOG2)); + U32 ovrMask = __ballot_sync(actMask, ovr); + if (ovr) + { + overIndex = __popc(ovrMask & getLaneMaskLt()); + if (overIndex == 0) + s_broadcast[threadIdx.y + 16] = atomicAdd((U32*)&s_overTotal, __popc(ovrMask)); + __syncwarp(ovrMask); + overIndex += s_broadcast[threadIdx.y + 16]; + s_overIndex[thrInBlock] = overIndex; + } + } + + // sync after overTotal is ready + __syncthreads(); + + // at least one segment overflowed => allocate segments + U32 overTotal = s_overTotal; + U32 allocBase = 0; + if (overTotal > 0) + { + // allocate memory + if (thrInBlock == 0) + { + U32 allocBase = atomicAdd(&atomics.numBinSegs, overTotal); + s_allocBase = (allocBase + overTotal <= p.maxBinSegs) ? allocBase : 0; + } + __syncthreads(); + allocBase = s_allocBase; + + // did my bin overflow? + if (overIndex != -1) + { + // calculate new segment index + int segIdx = allocBase + overIndex; + + // add to linked list + if (s_outOfs[thrInBlock] < 0) + binFirstSeg[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = segIdx; + else + binSegNext[(s_outOfs[thrInBlock] - 1) >> CR_BIN_SEG_LOG2] = segIdx; + + // defaults + binSegNext [segIdx] = -1; + binSegCount[segIdx] = CR_BIN_SEG_SIZE; + } + } + + // concurrent emission -- each warp handles its own triangle + if (thrInBlock < bufCount) + { + int triPos = (bufIndex + thrInBlock) & (CR_ARRAY_SIZE(s_triBuf) - 1); + int currBin = lox + loy * p.widthBins; + int skipBin = (hix + 1) + loy * p.widthBins; + int endBin = lox + (hiy + 1) * p.widthBins; + int binYInc = p.widthBins - (hix - lox + 1); + + // loop over triangle's bins + do + { + U32 outMask = s_outMask[threadIdx.y][currBin]; + if (outMask & (1< 0) + idx += s_outCount[threadIdx.y-1][currBin]; + + int base = s_outOfs[currBin]; + int free = (-base) & (CR_BIN_SEG_SIZE - 1); + if (idx >= free) + idx += ((allocBase + s_overIndex[currBin]) << CR_BIN_SEG_LOG2) - free; + else + idx += base; + + binSegData[idx] = s_triBuf[triPos]; + } + + currBin++; + if (currBin == skipBin) + currBin 
+= binYInc, skipBin += p.widthBins; + } + while (currBin != endBin); + } + + // wait all triangles to finish, then replace overflown segment offsets + __syncthreads(); + if (thrInBlock < p.numBins) + { + U32 total = s_outCount[CR_BIN_WARPS - 1][thrInBlock]; + U32 oldOfs = s_outOfs[thrInBlock]; + if (overIndex == -1) + s_outOfs[thrInBlock] = oldOfs + total; + else + { + int addr = oldOfs + total; + addr = ((addr - 1) & (CR_BIN_SEG_SIZE - 1)) + 1; + addr += (allocBase + overIndex) << CR_BIN_SEG_LOG2; + s_outOfs[thrInBlock] = addr; + } + s_outTotal[thrInBlock] += total; + } + + // these triangles are now done + int count = ::min(bufCount, CR_BIN_WARPS * 32); + bufCount -= count; + bufIndex += count; + bufIndex &= CR_ARRAY_SIZE(s_triBuf)-1; + } + while (bufCount > 0 || batchPos < batchEnd); + + // flush all bins + if (thrInBlock < p.numBins) + { + int ofs = s_outOfs[thrInBlock]; + if (ofs & (CR_BIN_SEG_SIZE-1)) + { + int seg = ofs >> CR_BIN_SEG_LOG2; + binSegCount[seg] = ofs & (CR_BIN_SEG_SIZE-1); + s_outOfs[thrInBlock] = (ofs + CR_BIN_SEG_SIZE - 1) & -CR_BIN_SEG_SIZE; + } + } + } + + // output totals + if (thrInBlock < p.numBins) + binTotal[(thrInBlock << CR_BIN_STREAMS_LOG2) + blockIdx.x] = s_outTotal[thrInBlock]; +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp new file mode 100644 index 00000000..b2cd7b92 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.cpp @@ -0,0 +1,94 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "../../framework.h" +#include "Buffer.hpp" + +using namespace CR; + +//------------------------------------------------------------------------ +// GPU buffer. +//------------------------------------------------------------------------ + +Buffer::Buffer(void) +: m_gpuPtr(NULL), + m_bytes (0) +{ + // empty +} + +Buffer::~Buffer(void) +{ + if (m_gpuPtr) + cudaFree(m_gpuPtr); // Don't throw an exception. +} + +void Buffer::reset(size_t bytes) +{ + if (bytes == m_bytes) + return; + + if (m_gpuPtr) + { + NVDR_CHECK_CUDA_ERROR(cudaFree(m_gpuPtr)); + m_gpuPtr = NULL; + } + + if (bytes > 0) + NVDR_CHECK_CUDA_ERROR(cudaMalloc(&m_gpuPtr, bytes)); + + m_bytes = bytes; +} + +void Buffer::grow(size_t bytes) +{ + if (bytes > m_bytes) + reset(bytes); +} + +//------------------------------------------------------------------------ +// Host buffer with page-locked memory. +//------------------------------------------------------------------------ + +HostBuffer::HostBuffer(void) +: m_hostPtr(NULL), + m_bytes (0) +{ + // empty +} + +HostBuffer::~HostBuffer(void) +{ + if (m_hostPtr) + cudaFreeHost(m_hostPtr); // Don't throw an exception. 
+} + +void HostBuffer::reset(size_t bytes) +{ + if (bytes == m_bytes) + return; + + if (m_hostPtr) + { + NVDR_CHECK_CUDA_ERROR(cudaFreeHost(m_hostPtr)); + m_hostPtr = NULL; + } + + if (bytes > 0) + NVDR_CHECK_CUDA_ERROR(cudaMallocHost(&m_hostPtr, bytes)); + + m_bytes = bytes; +} + +void HostBuffer::grow(size_t bytes) +{ + if (bytes > m_bytes) + reset(bytes); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp new file mode 100644 index 00000000..8a4b38fd --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Buffer.hpp @@ -0,0 +1,55 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#pragma once +#include "Defs.hpp" + +namespace CR +{ +//------------------------------------------------------------------------ + +class Buffer +{ +public: + Buffer (void); + ~Buffer (void); + + void reset (size_t bytes); + void grow (size_t bytes); + void* getPtr (size_t offset = 0) { return (void*)(((uintptr_t)m_gpuPtr) + offset); } + size_t getSize (void) const { return m_bytes; } + + void setPtr (void* ptr) { m_gpuPtr = ptr; } + +private: + void* m_gpuPtr; + size_t m_bytes; +}; + +//------------------------------------------------------------------------ + +class HostBuffer +{ +public: + HostBuffer (void); + ~HostBuffer (void); + + void reset (size_t bytes); + void grow (size_t bytes); + void* getPtr (void) { return m_hostPtr; } + size_t getSize (void) const { return m_bytes; } + + void setPtr (void* ptr) { m_hostPtr = ptr; } + +private: + void* m_hostPtr; + size_t m_bytes; +}; + +//------------------------------------------------------------------------ +} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl new file mode 100644 index 00000000..a7081c7e --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/CoarseRaster.inl @@ -0,0 +1,730 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +//------------------------------------------------------------------------ + +__device__ __inline__ int globalTileIdx(int tileInBin, int widthTiles) +{ + int tileX = tileInBin & (CR_BIN_SIZE - 1); + int tileY = tileInBin >> CR_BIN_LOG2; + return tileX + tileY * widthTiles; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void coarseRasterImpl(const CRParams p) +{ + // Common. + + __shared__ volatile U32 s_workCounter; + __shared__ volatile U32 s_scanTemp [CR_COARSE_WARPS][48]; // 3KB + + // Input. + + __shared__ volatile U32 s_binOrder [CR_MAXBINS_SQR]; // 1KB + __shared__ volatile S32 s_binStreamCurrSeg [CR_BIN_STREAMS_SIZE]; // 0KB + __shared__ volatile S32 s_binStreamFirstTri [CR_BIN_STREAMS_SIZE]; // 0KB + __shared__ volatile S32 s_triQueue [CR_COARSE_QUEUE_SIZE]; // 4KB + __shared__ volatile S32 s_triQueueWritePos; + __shared__ volatile U32 s_binStreamSelectedOfs; + __shared__ volatile U32 s_binStreamSelectedSize; + + // Output. + + __shared__ volatile U32 s_warpEmitMask [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions + __shared__ volatile U32 s_warpEmitPrefixSum [CR_COARSE_WARPS][CR_BIN_SQR + 1]; // 16KB, +1 to avoid bank collisions + __shared__ volatile U32 s_tileEmitPrefixSum [CR_BIN_SQR + 1]; // 1KB, zero at the beginning + __shared__ volatile U32 s_tileAllocPrefixSum[CR_BIN_SQR + 1]; // 1KB, zero at the beginning + __shared__ volatile S32 s_tileStreamCurrOfs [CR_BIN_SQR]; // 1KB + __shared__ volatile U32 s_firstAllocSeg; + __shared__ volatile U32 s_firstActiveIdx; + + // Pointers and constants. 
+ + CRAtomics& atomics = p.atomics[blockIdx.z]; + const CRTriangleHeader* triHeader = (const CRTriangleHeader*)p.triHeader + p.maxSubtris * blockIdx.z; + const S32* binFirstSeg = (const S32*)p.binFirstSeg + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z; + const S32* binTotal = (const S32*)p.binTotal + CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * blockIdx.z; + const S32* binSegData = (const S32*)p.binSegData + p.maxBinSegs * CR_BIN_SEG_SIZE * blockIdx.z; + const S32* binSegNext = (const S32*)p.binSegNext + p.maxBinSegs * blockIdx.z; + const S32* binSegCount = (const S32*)p.binSegCount + p.maxBinSegs * blockIdx.z; + S32* activeTiles = (S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z; + S32* tileFirstSeg = (S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z; + S32* tileSegData = (S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z; + S32* tileSegNext = (S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z; + S32* tileSegCount = (S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z; + + int tileLog = CR_TILE_LOG2 + CR_SUBPIXEL_LOG2; + int thrInBlock = threadIdx.x + threadIdx.y * 32; + int emitShift = CR_BIN_LOG2 * 2 + 5; // We scan ((numEmits << emitShift) | numAllocs) over tiles. + + if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs) + return; + + // Initialize sharedmem arrays. + + if (thrInBlock == 0) + { + s_tileEmitPrefixSum[0] = 0; + s_tileAllocPrefixSum[0] = 0; + } + s_scanTemp[threadIdx.y][threadIdx.x] = 0; + + // Sort bins in descending order of triangle count. + + for (int binIdx = thrInBlock; binIdx < p.numBins; binIdx += CR_COARSE_WARPS * 32) + { + int count = 0; + for (int i = 0; i < CR_BIN_STREAMS_SIZE; i++) + count += binTotal[(binIdx << CR_BIN_STREAMS_LOG2) + i]; + s_binOrder[binIdx] = (~count << (CR_MAXBINS_LOG2 * 2)) | binIdx; + } + + __syncthreads(); + sortShared(s_binOrder, p.numBins); + + // Process each bin by one block. + + for (;;) + { + // Pick a bin for the block. 
+ + if (thrInBlock == 0) + s_workCounter = atomicAdd(&atomics.coarseCounter, 1); + __syncthreads(); + + int workCounter = s_workCounter; + if (workCounter >= p.numBins) + break; + + U32 binOrder = s_binOrder[workCounter]; + bool binEmpty = ((~binOrder >> (CR_MAXBINS_LOG2 * 2)) == 0); + if (binEmpty && !p.deferredClear) + break; + + int binIdx = binOrder & (CR_MAXBINS_SQR - 1); + + // Initialize input/output streams. + + int triQueueWritePos = 0; + int triQueueReadPos = 0; + + if (thrInBlock < CR_BIN_STREAMS_SIZE) + { + int segIdx = binFirstSeg[(binIdx << CR_BIN_STREAMS_LOG2) + thrInBlock]; + s_binStreamCurrSeg[thrInBlock] = segIdx; + s_binStreamFirstTri[thrInBlock] = (segIdx == -1) ? ~0u : binSegData[segIdx << CR_BIN_SEG_LOG2]; + } + + for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32) + s_tileStreamCurrOfs[tileInBin] = -CR_TILE_SEG_SIZE; + + // Initialize per-bin state. + + int binY = idiv_fast(binIdx, p.widthBins); + int binX = binIdx - binY * p.widthBins; + int originX = (binX << (CR_BIN_LOG2 + tileLog)) - (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)); + int originY = (binY << (CR_BIN_LOG2 + tileLog)) - (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)); + int maxTileXInBin = ::min(p.widthTiles - (binX << CR_BIN_LOG2), CR_BIN_SIZE) - 1; + int maxTileYInBin = ::min(p.heightTiles - (binY << CR_BIN_LOG2), CR_BIN_SIZE) - 1; + int binTileIdx = (binX + binY * p.widthTiles) << CR_BIN_LOG2; + + // Entire block: Merge input streams and process triangles. + + if (!binEmpty) + do + { + //------------------------------------------------------------------------ + // Merge. + //------------------------------------------------------------------------ + + // Entire block: Not enough triangles => merge and queue segments. + // NOTE: The bin exit criterion assumes that we queue more triangles than we actually need. 
+ + while (triQueueWritePos - triQueueReadPos <= CR_COARSE_WARPS * 32) + { + // First warp: Choose the segment with the lowest initial triangle index. + + bool hasStream = (thrInBlock < CR_BIN_STREAMS_SIZE); + U32 hasStreamMask = __ballot_sync(~0u, hasStream); + if (hasStream) + { + // Find the stream with the lowest triangle index. + + U32 firstTri = s_binStreamFirstTri[thrInBlock]; + U32 t = firstTri; + volatile U32* v = &s_scanTemp[0][thrInBlock + 16]; + + #if (CR_BIN_STREAMS_SIZE > 1) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-1]); __syncwarp(hasStreamMask); + #endif + #if (CR_BIN_STREAMS_SIZE > 2) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-2]); __syncwarp(hasStreamMask); + #endif + #if (CR_BIN_STREAMS_SIZE > 4) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-4]); __syncwarp(hasStreamMask); + #endif + #if (CR_BIN_STREAMS_SIZE > 8) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-8]); __syncwarp(hasStreamMask); + #endif + #if (CR_BIN_STREAMS_SIZE > 16) + v[0] = t; __syncwarp(hasStreamMask); t = ::min(t, v[-16]); __syncwarp(hasStreamMask); + #endif + v[0] = t; __syncwarp(hasStreamMask); + + // Consume and broadcast. + + bool first = (s_scanTemp[0][CR_BIN_STREAMS_SIZE - 1 + 16] == firstTri); + U32 firstMask = __ballot_sync(hasStreamMask, first); + if (first && (firstMask >> threadIdx.x) == 1u) + { + int segIdx = s_binStreamCurrSeg[thrInBlock]; + s_binStreamSelectedOfs = segIdx << CR_BIN_SEG_LOG2; + if (segIdx != -1) + { + int segSize = binSegCount[segIdx]; + int segNext = binSegNext[segIdx]; + s_binStreamSelectedSize = segSize; + s_triQueueWritePos = triQueueWritePos + segSize; + s_binStreamCurrSeg[thrInBlock] = segNext; + s_binStreamFirstTri[thrInBlock] = (segNext == -1) ? ~0u : binSegData[segNext << CR_BIN_SEG_LOG2]; + } + } + } + + // No more segments => break. 
+ + __syncthreads(); + triQueueWritePos = s_triQueueWritePos; + int segOfs = s_binStreamSelectedOfs; + if (segOfs < 0) + break; + + int segSize = s_binStreamSelectedSize; + __syncthreads(); + + // Fetch triangles into the queue. + + for (int idxInSeg = CR_COARSE_WARPS * 32 - 1 - thrInBlock; idxInSeg < segSize; idxInSeg += CR_COARSE_WARPS * 32) + { + S32 triIdx = binSegData[segOfs + idxInSeg]; + s_triQueue[(triQueueWritePos - segSize + idxInSeg) & (CR_COARSE_QUEUE_SIZE - 1)] = triIdx; + } + } + + // All threads: Clear emit masks. + + for (int maskIdx = thrInBlock; maskIdx < CR_COARSE_WARPS * CR_BIN_SQR; maskIdx += CR_COARSE_WARPS * 32) + s_warpEmitMask[maskIdx >> (CR_BIN_LOG2 * 2)][maskIdx & (CR_BIN_SQR - 1)] = 0; + + __syncthreads(); + + //------------------------------------------------------------------------ + // Raster. + //------------------------------------------------------------------------ + + // Triangle per thread: Read from the queue. + + int triIdx = -1; + if (triQueueReadPos + thrInBlock < triQueueWritePos) + triIdx = s_triQueue[(triQueueReadPos + thrInBlock) & (CR_COARSE_QUEUE_SIZE - 1)]; + + uint4 triData = make_uint4(0, 0, 0, 0); + if (triIdx != -1) + { + int dataIdx = triIdx >> 3; + int subtriIdx = triIdx & 7; + if (subtriIdx != 7) + dataIdx = triHeader[dataIdx].misc + subtriIdx; + triData = *((uint4*)triHeader + dataIdx); + } + + // 32 triangles per warp: Record emits (= tile intersections). + + if (__any_sync(~0u, triIdx != -1)) + { + S32 v0x = sub_s16lo_s16lo(triData.x, originX); + S32 v0y = sub_s16hi_s16lo(triData.x, originY); + S32 d01x = sub_s16lo_s16lo(triData.y, triData.x); + S32 d01y = sub_s16hi_s16hi(triData.y, triData.x); + S32 d02x = sub_s16lo_s16lo(triData.z, triData.x); + S32 d02y = sub_s16hi_s16hi(triData.z, triData.x); + + // Compute tile-based AABB. 
+ + int lox = add_clamp_0_x((v0x + min_min(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin); + int loy = add_clamp_0_x((v0y + min_min(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin); + int hix = add_clamp_0_x((v0x + max_max(d01x, 0, d02x)) >> tileLog, 0, maxTileXInBin); + int hiy = add_clamp_0_x((v0y + max_max(d01y, 0, d02y)) >> tileLog, 0, maxTileYInBin); + int sizex = add_sub(hix, 1, lox); + int sizey = add_sub(hiy, 1, loy); + int area = sizex * sizey; + + // Miscellaneous init. + + U8* currPtr = (U8*)&s_warpEmitMask[threadIdx.y][lox + (loy << CR_BIN_LOG2)]; + int ptrYInc = CR_BIN_SIZE * 4 - (sizex << 2); + U32 maskBit = 1 << threadIdx.x; + + // Case A: All AABBs are small => record the full AABB using atomics. + + if (__all_sync(~0u, sizex <= 2 && sizey <= 2)) + { + if (triIdx != -1) + { + atomicOr((U32*)currPtr, maskBit); + if (sizex == 2) atomicOr((U32*)(currPtr + 4), maskBit); + if (sizey == 2) atomicOr((U32*)(currPtr + CR_BIN_SIZE * 4), maskBit); + if (sizex == 2 && sizey == 2) atomicOr((U32*)(currPtr + 4 + CR_BIN_SIZE * 4), maskBit); + } + } + else + { + // Compute warp-AABB (scan-32). 
+ + U32 aabbMask = add_sub(2 << hix, 0x20000 << hiy, 1 << lox) - (0x10000 << loy); + if (triIdx == -1) + aabbMask = 0; + + volatile U32* v = &s_scanTemp[threadIdx.y][threadIdx.x + 16]; + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-1]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-2]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-4]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-8]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask |= v[-16]; __syncwarp(); + v[0] = aabbMask; __syncwarp(); aabbMask = s_scanTemp[threadIdx.y][47]; + + U32 maskX = aabbMask & 0xFFFF; + U32 maskY = aabbMask >> 16; + int wlox = findLeadingOne(maskX ^ (maskX - 1)); + int wloy = findLeadingOne(maskY ^ (maskY - 1)); + int whix = findLeadingOne(maskX); + int whiy = findLeadingOne(maskY); + int warea = (add_sub(whix, 1, wlox)) * (add_sub(whiy, 1, wloy)); + + // Initialize edge functions. + + S32 d12x = d02x - d01x; + S32 d12y = d02y - d01y; + v0x -= lox << tileLog; + v0y -= loy << tileLog; + + S32 t01 = v0x * d01y - v0y * d01x; + S32 t02 = v0y * d02x - v0x * d02y; + S32 t12 = d01x * d12y - d01y * d12x - t01 - t02; + S32 b01 = add_sub(t01 >> tileLog, ::max(d01x, 0), ::min(d01y, 0)); + S32 b02 = add_sub(t02 >> tileLog, ::max(d02y, 0), ::min(d02x, 0)); + S32 b12 = add_sub(t12 >> tileLog, ::max(d12x, 0), ::min(d12y, 0)); + + d01x += sizex * d01y; + d02x += sizex * d02y; + d12x += sizex * d12y; + + // Case B: Warp-AABB is not much larger than largest AABB => Check tiles in warp-AABB, record using ballots. + if (__any_sync(~0u, warea * 4 <= area * 8)) + { + // Not sure if this is any faster than Case C after all the post-Volta ballot mask tracking. 
+ bool act = (triIdx != -1); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + for (int y = wloy; y <= whiy; y++) + { + bool yIn = (y >= loy && y <= hiy); + U32 yMask = __ballot_sync(actMask, yIn); + if (yIn) + { + for (int x = wlox; x <= whix; x++) + { + bool xyIn = (x >= lox && x <= hix); + U32 xyMask = __ballot_sync(yMask, xyIn); + if (xyIn) + { + U32 res = __ballot_sync(xyMask, b01 >= 0 && b02 >= 0 && b12 >= 0); + if (threadIdx.x == 31 - __clz(xyMask)) + *(U32*)currPtr = res; + currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y; + } + } + currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x; + } + } + } + } + + // Case C: General case => Check tiles in AABB, record using atomics. + + else + { + if (triIdx != -1) + { + U8* skipPtr = currPtr + (sizex << 2); + U8* endPtr = currPtr + (sizey << (CR_BIN_LOG2 + 2)); + do + { + if (b01 >= 0 && b02 >= 0 && b12 >= 0) + atomicOr((U32*)currPtr, maskBit); + currPtr += 4, b01 -= d01y, b02 += d02y, b12 -= d12y; + if (currPtr == skipPtr) + currPtr += ptrYInc, b01 += d01x, b02 -= d02x, b12 += d12x, skipPtr += CR_BIN_SIZE * 4; + } + while (currPtr != endPtr); + } + } + } + } + + __syncthreads(); + + //------------------------------------------------------------------------ + // Count. + //------------------------------------------------------------------------ + + // Tile per thread: Initialize prefix sums. + + for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32) + { + int tileInBin = tileInBin_base + thrInBlock; + bool act = (tileInBin < CR_BIN_SQR); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + // Compute prefix sum of emits over warps. 
+ + U8* srcPtr = (U8*)&s_warpEmitMask[0][tileInBin]; + U8* dstPtr = (U8*)&s_warpEmitPrefixSum[0][tileInBin]; + int tileEmits = 0; + for (int i = 0; i < CR_COARSE_WARPS; i++) + { + tileEmits += __popc(*(U32*)srcPtr); + *(U32*)dstPtr = tileEmits; + srcPtr += (CR_BIN_SQR + 1) * 4; + dstPtr += (CR_BIN_SQR + 1) * 4; + } + + // Determine the number of segments to allocate. + + int spaceLeft = -s_tileStreamCurrOfs[tileInBin] & (CR_TILE_SEG_SIZE - 1); + int tileAllocs = (tileEmits - spaceLeft + CR_TILE_SEG_SIZE - 1) >> CR_TILE_SEG_LOG2; + volatile U32* v = &s_tileEmitPrefixSum[tileInBin + 1]; + + // All counters within the warp are small => compute prefix sum using ballot. + + if (!__any_sync(actMask, tileEmits >= 2)) + { + U32 m = getLaneMaskLe(); + *v = (__popc(__ballot_sync(actMask, tileEmits & 1) & m) << emitShift) | __popc(__ballot_sync(actMask, tileAllocs & 1) & m); + } + + // Otherwise => scan-32 within the warp. + + else + { + U32 sum = (tileEmits << emitShift) | tileAllocs; + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 1) sum += v[-1]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 2) sum += v[-2]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 4) sum += v[-4]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 8) sum += v[-8]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); if (threadIdx.x >= 16) sum += v[-16]; __syncwarp(actMask); + *v = sum; __syncwarp(actMask); + } + } + } + + // First warp: Scan-8. 
+ + __syncthreads(); + + bool scan8 = (thrInBlock < CR_BIN_SQR / 32); + U32 scan8Mask = __ballot_sync(~0u, scan8); + if (scan8) + { + int sum = s_tileEmitPrefixSum[(thrInBlock << 5) + 32]; + volatile U32* v = &s_scanTemp[0][thrInBlock + 16]; + v[0] = sum; __syncwarp(scan8Mask); + #if (CR_BIN_SQR > 1 * 32) + sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + #if (CR_BIN_SQR > 2 * 32) + sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + #if (CR_BIN_SQR > 4 * 32) + sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + } + + __syncthreads(); + + // Tile per thread: Finalize prefix sums. + // Single thread: Allocate segments. + + for (int tileInBin = thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32) + { + int sum = s_tileEmitPrefixSum[tileInBin + 1] + s_scanTemp[0][(tileInBin >> 5) + 15]; + int numEmits = sum >> emitShift; + int numAllocs = sum & ((1 << emitShift) - 1); + s_tileEmitPrefixSum[tileInBin + 1] = numEmits; + s_tileAllocPrefixSum[tileInBin + 1] = numAllocs; + + if (tileInBin == CR_BIN_SQR - 1 && numAllocs != 0) + { + int t = atomicAdd(&atomics.numTileSegs, numAllocs); + s_firstAllocSeg = (t + numAllocs <= p.maxTileSegs) ? t : 0; + } + } + + __syncthreads(); + int firstAllocSeg = s_firstAllocSeg; + int totalEmits = s_tileEmitPrefixSum[CR_BIN_SQR]; + int totalAllocs = s_tileAllocPrefixSum[CR_BIN_SQR]; + + //------------------------------------------------------------------------ + // Emit. + //------------------------------------------------------------------------ + + // Emit per thread: Write triangle index to globalmem. + + for (int emitInBin = thrInBlock; emitInBin < totalEmits; emitInBin += CR_COARSE_WARPS * 32) + { + // Find tile in bin. 
+ + U8* tileBase = (U8*)&s_tileEmitPrefixSum[0]; + U8* tilePtr = tileBase; + U8* ptr; + + #if (CR_BIN_SQR > 128) + ptr = tilePtr + 0x80 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 64) + ptr = tilePtr + 0x40 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 32) + ptr = tilePtr + 0x20 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 16) + ptr = tilePtr + 0x10 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 8) + ptr = tilePtr + 0x08 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 4) + ptr = tilePtr + 0x04 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 2) + ptr = tilePtr + 0x02 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + #if (CR_BIN_SQR > 1) + ptr = tilePtr + 0x01 * 4; if (emitInBin >= *(U32*)ptr) tilePtr = ptr; + #endif + + int tileInBin = (tilePtr - tileBase) >> 2; + int emitInTile = emitInBin - *(U32*)tilePtr; + + // Find warp in tile. + + int warpStep = (CR_BIN_SQR + 1) * 4; + U8* warpBase = (U8*)&s_warpEmitPrefixSum[0][tileInBin] - warpStep; + U8* warpPtr = warpBase; + + #if (CR_COARSE_WARPS > 8) + ptr = warpPtr + 0x08 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr; + #endif + #if (CR_COARSE_WARPS > 4) + ptr = warpPtr + 0x04 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr; + #endif + #if (CR_COARSE_WARPS > 2) + ptr = warpPtr + 0x02 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr; + #endif + #if (CR_COARSE_WARPS > 1) + ptr = warpPtr + 0x01 * warpStep; if (emitInTile >= *(U32*)ptr) warpPtr = ptr; + #endif + + int warpInTile = (warpPtr - warpBase) >> (CR_BIN_LOG2 * 2 + 2); + U32 emitMask = *(U32*)(warpPtr + warpStep + ((U8*)s_warpEmitMask - (U8*)s_warpEmitPrefixSum)); + int emitInWarp = emitInTile - *(U32*)(warpPtr + warpStep) + __popc(emitMask); + + // Find thread in warp. 
+ + int threadInWarp = 0; + int pop = __popc(emitMask & 0xFFFF); + bool pred = (emitInWarp >= pop); + if (pred) emitInWarp -= pop; + if (pred) emitMask >>= 0x10; + if (pred) threadInWarp += 0x10; + + pop = __popc(emitMask & 0xFF); + pred = (emitInWarp >= pop); + if (pred) emitInWarp -= pop; + if (pred) emitMask >>= 0x08; + if (pred) threadInWarp += 0x08; + + pop = __popc(emitMask & 0xF); + pred = (emitInWarp >= pop); + if (pred) emitInWarp -= pop; + if (pred) emitMask >>= 0x04; + if (pred) threadInWarp += 0x04; + + pop = __popc(emitMask & 0x3); + pred = (emitInWarp >= pop); + if (pred) emitInWarp -= pop; + if (pred) emitMask >>= 0x02; + if (pred) threadInWarp += 0x02; + + if (emitInWarp >= (emitMask & 1)) + threadInWarp++; + + // Figure out where to write. + + int currOfs = s_tileStreamCurrOfs[tileInBin]; + int spaceLeft = -currOfs & (CR_TILE_SEG_SIZE - 1); + int outOfs = emitInTile; + + if (outOfs < spaceLeft) + outOfs += currOfs; + else + { + int allocLo = firstAllocSeg + s_tileAllocPrefixSum[tileInBin]; + outOfs += (allocLo << CR_TILE_SEG_LOG2) - spaceLeft; + } + + // Write. + + int queueIdx = warpInTile * 32 + threadInWarp; + int triIdx = s_triQueue[(triQueueReadPos + queueIdx) & (CR_COARSE_QUEUE_SIZE - 1)]; + + tileSegData[outOfs] = triIdx; + } + + //------------------------------------------------------------------------ + // Patch. + //------------------------------------------------------------------------ + + // Allocated segment per thread: Initialize next-pointer and count. + + for (int i = CR_COARSE_WARPS * 32 - 1 - thrInBlock; i < totalAllocs; i += CR_COARSE_WARPS * 32) + { + int segIdx = firstAllocSeg + i; + tileSegNext[segIdx] = segIdx + 1; + tileSegCount[segIdx] = CR_TILE_SEG_SIZE; + } + + // Tile per thread: Fix previous segment's next-pointer and update s_tileStreamCurrOfs. 
+ + __syncthreads(); + for (int tileInBin = CR_COARSE_WARPS * 32 - 1 - thrInBlock; tileInBin < CR_BIN_SQR; tileInBin += CR_COARSE_WARPS * 32) + { + int oldOfs = s_tileStreamCurrOfs[tileInBin]; + int newOfs = oldOfs + s_warpEmitPrefixSum[CR_COARSE_WARPS - 1][tileInBin]; + int allocLo = s_tileAllocPrefixSum[tileInBin]; + int allocHi = s_tileAllocPrefixSum[tileInBin + 1]; + + if (allocLo != allocHi) + { + S32* nextPtr = &tileSegNext[(oldOfs - 1) >> CR_TILE_SEG_LOG2]; + if (oldOfs < 0) + nextPtr = &tileFirstSeg[binTileIdx + globalTileIdx(tileInBin, p.widthTiles)]; + *nextPtr = firstAllocSeg + allocLo; + + newOfs--; + newOfs &= CR_TILE_SEG_SIZE - 1; + newOfs |= (firstAllocSeg + allocHi - 1) << CR_TILE_SEG_LOG2; + newOfs++; + } + s_tileStreamCurrOfs[tileInBin] = newOfs; + } + + // Advance queue read pointer. + // Queue became empty => bin done. + + triQueueReadPos += CR_COARSE_WARPS * 32; + } + while (triQueueReadPos < triQueueWritePos); + + // Tile per thread: Fix next-pointer and count of the last segment. + // 32 tiles per warp: Count active tiles. 
+ + __syncthreads(); + + for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32) + { + int tileInBin = tileInBin_base + thrInBlock; + bool act = (tileInBin < CR_BIN_SQR); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + int tileX = tileInBin & (CR_BIN_SIZE - 1); + int tileY = tileInBin >> CR_BIN_LOG2; + bool force = (p.deferredClear & tileX <= maxTileXInBin & tileY <= maxTileYInBin); + + int ofs = s_tileStreamCurrOfs[tileInBin]; + int segIdx = (ofs - 1) >> CR_TILE_SEG_LOG2; + int segCount = ofs & (CR_TILE_SEG_SIZE - 1); + + if (ofs >= 0) + tileSegNext[segIdx] = -1; + else if (force) + { + s_tileStreamCurrOfs[tileInBin] = 0; + tileFirstSeg[binTileIdx + tileX + tileY * p.widthTiles] = -1; + } + + if (segCount != 0) + tileSegCount[segIdx] = segCount; + + U32 res = __ballot_sync(actMask, ofs >= 0 | force); + if (threadIdx.x == 0) + s_scanTemp[0][(tileInBin >> 5) + 16] = __popc(res); + } + } + + // First warp: Scan-8. + // One thread: Allocate space for active tiles. + + __syncthreads(); + + bool scan8 = (thrInBlock < CR_BIN_SQR / 32); + U32 scan8Mask = __ballot_sync(~0u, scan8); + if (scan8) + { + volatile U32* v = &s_scanTemp[0][thrInBlock + 16]; + U32 sum = v[0]; + #if (CR_BIN_SQR > 1 * 32) + sum += v[-1]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + #if (CR_BIN_SQR > 2 * 32) + sum += v[-2]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + #if (CR_BIN_SQR > 4 * 32) + sum += v[-4]; __syncwarp(scan8Mask); v[0] = sum; __syncwarp(scan8Mask); + #endif + + if (thrInBlock == CR_BIN_SQR / 32 - 1) + s_firstActiveIdx = atomicAdd(&atomics.numActiveTiles, sum); + } + + // Tile per thread: Output active tiles. 
+ + __syncthreads(); + + for (int tileInBin_base = 0; tileInBin_base < CR_BIN_SQR; tileInBin_base += CR_COARSE_WARPS * 32) + { + int tileInBin = tileInBin_base + thrInBlock; + bool act = (tileInBin < CR_BIN_SQR) && (s_tileStreamCurrOfs[tileInBin] >= 0); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + int activeIdx = s_firstActiveIdx; + activeIdx += s_scanTemp[0][(tileInBin >> 5) + 15]; + activeIdx += __popc(actMask & getLaneMaskLt()); + activeTiles[activeIdx] = binTileIdx + globalTileIdx(tileInBin, p.widthTiles); + } + } + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp new file mode 100644 index 00000000..916315cd --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Constants.hpp @@ -0,0 +1,73 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ + +#define CR_MAXVIEWPORT_LOG2 11 // ViewportSize / PixelSize. +#define CR_SUBPIXEL_LOG2 4 // PixelSize / SubpixelSize. + +#define CR_MAXBINS_LOG2 4 // ViewportSize / BinSize. +#define CR_BIN_LOG2 4 // BinSize / TileSize. +#define CR_TILE_LOG2 3 // TileSize / PixelSize. + +#define CR_COVER8X8_LUT_SIZE 768 // 64-bit entries. 
+#define CR_FLIPBIT_FLIP_Y 2 +#define CR_FLIPBIT_FLIP_X 3 +#define CR_FLIPBIT_SWAP_XY 4 +#define CR_FLIPBIT_COMPL 5 + +#define CR_BIN_STREAMS_LOG2 4 +#define CR_BIN_SEG_LOG2 9 // 32-bit entries. +#define CR_TILE_SEG_LOG2 5 // 32-bit entries. + +#define CR_MAXSUBTRIS_LOG2 24 // Triangle structs. Dictated by CoarseRaster. +#define CR_COARSE_QUEUE_LOG2 10 // Triangles. + +#define CR_SETUP_WARPS 2 +#define CR_SETUP_OPT_BLOCKS 8 +#define CR_BIN_WARPS 16 +#define CR_COARSE_WARPS 16 // Must be a power of two. +#define CR_FINE_MAX_WARPS 20 + +#define CR_EMBED_IMAGE_PARAMS 32 // Number of per-image parameter structs embedded in kernel launch parameter block. + +//------------------------------------------------------------------------ + +#define CR_MAXVIEWPORT_SIZE (1 << CR_MAXVIEWPORT_LOG2) +#define CR_SUBPIXEL_SIZE (1 << CR_SUBPIXEL_LOG2) +#define CR_SUBPIXEL_SQR (1 << (CR_SUBPIXEL_LOG2 * 2)) + +#define CR_MAXBINS_SIZE (1 << CR_MAXBINS_LOG2) +#define CR_MAXBINS_SQR (1 << (CR_MAXBINS_LOG2 * 2)) +#define CR_BIN_SIZE (1 << CR_BIN_LOG2) +#define CR_BIN_SQR (1 << (CR_BIN_LOG2 * 2)) + +#define CR_MAXTILES_LOG2 (CR_MAXBINS_LOG2 + CR_BIN_LOG2) +#define CR_MAXTILES_SIZE (1 << CR_MAXTILES_LOG2) +#define CR_MAXTILES_SQR (1 << (CR_MAXTILES_LOG2 * 2)) +#define CR_TILE_SIZE (1 << CR_TILE_LOG2) +#define CR_TILE_SQR (1 << (CR_TILE_LOG2 * 2)) + +#define CR_BIN_STREAMS_SIZE (1 << CR_BIN_STREAMS_LOG2) +#define CR_BIN_SEG_SIZE (1 << CR_BIN_SEG_LOG2) +#define CR_TILE_SEG_SIZE (1 << CR_TILE_SEG_LOG2) + +#define CR_MAXSUBTRIS_SIZE (1 << CR_MAXSUBTRIS_LOG2) +#define CR_COARSE_QUEUE_SIZE (1 << CR_COARSE_QUEUE_LOG2) + +//------------------------------------------------------------------------ +// When evaluating interpolated Z pixel centers, we may introduce an error +// of (+-CR_LERP_ERROR) ULPs. 
+ +#define CR_LERP_ERROR(SAMPLES_LOG2) (2200u << (SAMPLES_LOG2)) +#define CR_DEPTH_MIN CR_LERP_ERROR(3) +#define CR_DEPTH_MAX (CR_U32_MAX - CR_LERP_ERROR(3)) + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp new file mode 100644 index 00000000..db8bf314 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/CudaRaster.cpp @@ -0,0 +1,79 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "Defs.hpp" +#include "../CudaRaster.hpp" +#include "RasterImpl.hpp" + +using namespace CR; + +//------------------------------------------------------------------------ +// Stub interface implementation. 
+//------------------------------------------------------------------------ + +CudaRaster::CudaRaster() +{ + m_impl = new RasterImpl(); +} + +CudaRaster::~CudaRaster() +{ + delete m_impl; +} + +void CudaRaster::setBufferSize(int width, int height, int numImages) +{ + m_impl->setBufferSize(Vec3i(width, height, numImages)); +} + +void CudaRaster::setViewport(int width, int height, int offsetX, int offsetY) +{ + m_impl->setViewport(Vec2i(width, height), Vec2i(offsetX, offsetY)); +} + +void CudaRaster::setRenderModeFlags(U32 flags) +{ + m_impl->setRenderModeFlags(flags); +} + +void CudaRaster::deferredClear(U32 clearColor) +{ + m_impl->deferredClear(clearColor); +} + +void CudaRaster::setVertexBuffer(void* vertices, int numVertices) +{ + m_impl->setVertexBuffer(vertices, numVertices); +} + +void CudaRaster::setIndexBuffer(void* indices, int numTriangles) +{ + m_impl->setIndexBuffer(indices, numTriangles); +} + +bool CudaRaster::drawTriangles(const int* ranges, bool peel, cudaStream_t stream) +{ + return m_impl->drawTriangles((const Vec2i*)ranges, peel, stream); +} + +void* CudaRaster::getColorBuffer(void) +{ + return m_impl->getColorBuffer(); +} + +void* CudaRaster::getDepthBuffer(void) +{ + return m_impl->getDepthBuffer(); +} + +void CudaRaster::swapDepthAndPeel(void) +{ + m_impl->swapDepthAndPeel(); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp new file mode 100644 index 00000000..7aa7774c --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Defs.hpp @@ -0,0 +1,90 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include +#include + +namespace CR +{ +//------------------------------------------------------------------------ + +#ifndef NULL +# define NULL 0 +#endif + +#ifdef __CUDACC__ +# define CR_CUDA 1 +#else +# define CR_CUDA 0 +#endif + +#if CR_CUDA +# define CR_CUDA_FUNC __device__ __inline__ +# define CR_CUDA_CONST __constant__ +#else +# define CR_CUDA_FUNC inline +# define CR_CUDA_CONST static const +#endif + +#define CR_UNREF(X) ((void)(X)) +#define CR_ARRAY_SIZE(X) ((int)(sizeof(X) / sizeof((X)[0]))) + +//------------------------------------------------------------------------ + +typedef uint8_t U8; +typedef uint16_t U16; +typedef uint32_t U32; +typedef uint64_t U64; +typedef int8_t S8; +typedef int16_t S16; +typedef int32_t S32; +typedef int64_t S64; +typedef float F32; +typedef double F64; +typedef void (*FuncPtr)(void); + +//------------------------------------------------------------------------ + +#define CR_U32_MAX (0xFFFFFFFFu) +#define CR_S32_MIN (~0x7FFFFFFF) +#define CR_S32_MAX (0x7FFFFFFF) +#define CR_U64_MAX ((U64)(S64)-1) +#define CR_S64_MIN ((S64)-1 << 63) +#define CR_S64_MAX (~((S64)-1 << 63)) +#define CR_F32_MIN (1.175494351e-38f) +#define CR_F32_MAX (3.402823466e+38f) +#define CR_F64_MIN (2.2250738585072014e-308) +#define CR_F64_MAX (1.7976931348623158e+308) + +//------------------------------------------------------------------------ +// Misc types. 
+ +class Vec2i +{ +public: + Vec2i(int x_, int y_) : x(x_), y(y_) {} + int x, y; +}; + +class Vec3i +{ +public: + Vec3i(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {} + int x, y, z; +}; + +//------------------------------------------------------------------------ +// CUDA utilities. + +#if CR_CUDA +# define globalThreadIdx (threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y))) +#endif + +//------------------------------------------------------------------------ +} // namespace CR diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl new file mode 100644 index 00000000..720e9997 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/FineRaster.inl @@ -0,0 +1,385 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Utility funcs. +//------------------------------------------------------------------------ + +__device__ __inline__ void initTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth) +{ + tileZMax = CR_DEPTH_MAX; + tileZUpd = (::min(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]) < tileZMax); +} + +__device__ __inline__ void updateTileZMax(U32& tileZMax, bool& tileZUpd, volatile U32* tileDepth, volatile U32* temp) +{ + // Entry is warp-coherent. 
+ if (__any_sync(~0u, tileZUpd)) + { + U32 z = ::max(tileDepth[threadIdx.x], tileDepth[threadIdx.x + 32]); __syncwarp(); + temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 1]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 2]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 4]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 8]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + z = ::max(z, temp[threadIdx.x + 16 - 16]); __syncwarp(); temp[threadIdx.x + 16] = z; __syncwarp(); + tileZMax = temp[47]; + tileZUpd = false; + } +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void getTriangle(const CRParams& p, S32& triIdx, S32& dataIdx, uint4& triHeader, S32& segment) +{ + const CRTriangleHeader* triHeaderPtr = (const CRTriangleHeader*)p.triHeader + blockIdx.z * p.maxSubtris;; + const S32* tileSegData = (const S32*)p.tileSegData + p.maxTileSegs * CR_TILE_SEG_SIZE * blockIdx.z; + const S32* tileSegNext = (const S32*)p.tileSegNext + p.maxTileSegs * blockIdx.z; + const S32* tileSegCount = (const S32*)p.tileSegCount + p.maxTileSegs * blockIdx.z; + + if (threadIdx.x >= tileSegCount[segment]) + { + triIdx = -1; + dataIdx = -1; + } + else + { + int subtriIdx = tileSegData[segment * CR_TILE_SEG_SIZE + threadIdx.x]; + triIdx = subtriIdx >> 3; + dataIdx = triIdx; + subtriIdx &= 7; + if (subtriIdx != 7) + dataIdx = triHeaderPtr[triIdx].misc + subtriIdx; + triHeader = *((uint4*)triHeaderPtr + dataIdx); + } + + // advance to next segment + segment = tileSegNext[segment]; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ bool earlyZCull(uint4 triHeader, U32 tileZMax) +{ + U32 zmin = triHeader.w & 0xFFFFF000; + return (zmin > tileZMax); +} + 
+//------------------------------------------------------------------------ + +__device__ __inline__ U64 trianglePixelCoverage(const CRParams& p, const uint4& triHeader, int tileX, int tileY, volatile U64* s_cover8x8_lut) +{ + int baseX = (tileX << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.widthPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1)); + int baseY = (tileY << (CR_TILE_LOG2 + CR_SUBPIXEL_LOG2)) - ((p.heightPixelsVp - 1) << (CR_SUBPIXEL_LOG2 - 1)); + + // extract S16 vertex positions while subtracting tile coordinates + S32 v0x = sub_s16lo_s16lo(triHeader.x, baseX); + S32 v0y = sub_s16hi_s16lo(triHeader.x, baseY); + S32 v01x = sub_s16lo_s16lo(triHeader.y, triHeader.x); + S32 v01y = sub_s16hi_s16hi(triHeader.y, triHeader.x); + S32 v20x = sub_s16lo_s16lo(triHeader.x, triHeader.z); + S32 v20y = sub_s16hi_s16hi(triHeader.x, triHeader.z); + + // extract flipbits + U32 f01 = (triHeader.w >> 6) & 0x3C; + U32 f12 = (triHeader.w >> 2) & 0x3C; + U32 f20 = (triHeader.w << 2) & 0x3C; + + // compute per-edge coverage masks + U64 c01, c12, c20; + c01 = cover8x8_exact_fast(v0x, v0y, v01x, v01y, f01, s_cover8x8_lut); + c12 = cover8x8_exact_fast(v0x + v01x, v0y + v01y, -v01x - v20x, -v01y - v20y, f12, s_cover8x8_lut); + c20 = cover8x8_exact_fast(v0x, v0y, v20x, v20y, f20, s_cover8x8_lut); + + // combine masks + return c01 & c12 & c20; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U32 scan32_value(U32 value, volatile U32* temp) +{ + __syncwarp(); + temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 1]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 2]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 4]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 8]; __syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + value += temp[threadIdx.x + 16 - 16]; 
__syncwarp(); temp[threadIdx.x + 16] = value; __syncwarp(); + return value; +} + +__device__ __inline__ volatile const U32& scan32_total(volatile U32* temp) +{ + return temp[47]; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ S32 findBit(U64 mask, int idx) +{ + U32 x = getLo(mask); + int pop = __popc(x); + bool p = (pop <= idx); + if (p) x = getHi(mask); + if (p) idx -= pop; + int bit = p ? 32 : 0; + + pop = __popc(x & 0x0000ffffu); + p = (pop <= idx); + if (p) x >>= 16; + if (p) bit += 16; + if (p) idx -= pop; + + U32 tmp = x & 0x000000ffu; + pop = __popc(tmp); + p = (pop <= idx); + if (p) tmp = x & 0x0000ff00u; + if (p) idx -= pop; + + return findLeadingOne(tmp) + bit - idx; +} + +//------------------------------------------------------------------------ +// Single-sample implementation. +//------------------------------------------------------------------------ + +__device__ __inline__ void executeROP(U32 color, U32 depth, volatile U32* pColor, volatile U32* pDepth, U32 ropMask) +{ + atomicMin((U32*)pDepth, depth); + __syncwarp(ropMask); + bool act = (depth == *pDepth); + __syncwarp(ropMask); + U32 actMask = __ballot_sync(ropMask, act); + if (act) + { + *pDepth = 0; + __syncwarp(actMask); + atomicMax((U32*)pDepth, threadIdx.x); + __syncwarp(actMask); + if (*pDepth == threadIdx.x) + { + *pDepth = depth; + *pColor = color; + } + __syncwarp(actMask); + } +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void fineRasterImpl(const CRParams p) +{ + // for 20 warps: + __shared__ volatile U64 s_cover8x8_lut[CR_COVER8X8_LUT_SIZE]; // 6KB + __shared__ volatile U32 s_tileColor [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB + __shared__ volatile U32 s_tileDepth [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB + __shared__ volatile U32 s_tilePeel [CR_FINE_MAX_WARPS][CR_TILE_SQR]; // 5KB + __shared__ volatile U32 s_triDataIdx [CR_FINE_MAX_WARPS][64]; // 5KB CRTriangleData 
index + __shared__ volatile U64 s_triangleCov [CR_FINE_MAX_WARPS][64]; // 10KB coverage mask + __shared__ volatile U32 s_triangleFrag[CR_FINE_MAX_WARPS][64]; // 5KB fragment index + __shared__ volatile U32 s_temp [CR_FINE_MAX_WARPS][80]; // 6.25KB + // = 47.25KB total + + CRAtomics& atomics = p.atomics[blockIdx.z]; + const CRTriangleData* triData = (const CRTriangleData*)p.triData + blockIdx.z * p.maxSubtris; + + const S32* activeTiles = (const S32*)p.activeTiles + CR_MAXTILES_SQR * blockIdx.z; + const S32* tileFirstSeg = (const S32*)p.tileFirstSeg + CR_MAXTILES_SQR * blockIdx.z; + + volatile U32* tileColor = s_tileColor[threadIdx.y]; + volatile U32* tileDepth = s_tileDepth[threadIdx.y]; + volatile U32* tilePeel = s_tilePeel[threadIdx.y]; + volatile U32* triDataIdx = s_triDataIdx[threadIdx.y]; + volatile U64* triangleCov = s_triangleCov[threadIdx.y]; + volatile U32* triangleFrag = s_triangleFrag[threadIdx.y]; + volatile U32* temp = s_temp[threadIdx.y]; + + if (atomics.numSubtris > p.maxSubtris || atomics.numBinSegs > p.maxBinSegs || atomics.numTileSegs > p.maxTileSegs) + return; + + temp[threadIdx.x] = 0; // first 16 elements of temp are always zero + cover8x8_setupLUT(s_cover8x8_lut); + __syncthreads(); + + // loop over tiles + for (;;) + { + // pick a tile + if (threadIdx.x == 0) + temp[16] = atomicAdd(&atomics.fineCounter, 1); + __syncwarp(); + int activeIdx = temp[16]; + if (activeIdx >= atomics.numActiveTiles) + break; + + int tileIdx = activeTiles[activeIdx]; + S32 segment = tileFirstSeg[tileIdx]; + int tileY = tileIdx / p.widthTiles; + int tileX = tileIdx - tileY * p.widthTiles; + int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1)); + int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2); + + // initialize per-tile state + int triRead = 0, triWrite = 0; + int fragRead = 0, fragWrite = 0; + if (threadIdx.x == 0) + triangleFrag[63] = 0; // "previous triangle" + + // deferred clear => clear tile + if (p.deferredClear) + { + 
tileColor[threadIdx.x] = p.clearColor; + tileDepth[threadIdx.x] = p.clearDepth; + tileColor[threadIdx.x + 32] = p.clearColor; + tileDepth[threadIdx.x + 32] = p.clearDepth; + } + else // otherwise => read tile from framebuffer + { + U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z; + U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z; + tileColor[threadIdx.x] = pColor[px + p.strideX * py]; + tileDepth[threadIdx.x] = pDepth[px + p.strideX * py]; + tileColor[threadIdx.x + 32] = pColor[px + p.strideX * (py + 4)]; + tileDepth[threadIdx.x + 32] = pDepth[px + p.strideX * (py + 4)]; + } + + // read peeling inputs if enabled + if (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) + { + U32* pPeel = (U32*)p.peelBuffer + p.strideX * p.strideY * blockIdx.z; + tilePeel[threadIdx.x] = pPeel[px + p.strideX * py]; + tilePeel[threadIdx.x + 32] = pPeel[px + p.strideX * (py + 4)]; + } + + U32 tileZMax; + bool tileZUpd; + initTileZMax(tileZMax, tileZUpd, tileDepth); + + // process fragments + for(;;) + { + // need to queue more fragments? + if (fragWrite - fragRead < 32 && segment >= 0) + { + // update tile z - coherent over warp + updateTileZMax(tileZMax, tileZUpd, tileDepth, temp); + + // read triangles + do + { + // read triangle index and data, advance to next segment + S32 triIdx, dataIdx; + uint4 triHeader; + getTriangle(p, triIdx, dataIdx, triHeader, segment); + + // early z cull + if (triIdx >= 0 && earlyZCull(triHeader, tileZMax)) + triIdx = -1; + + // determine coverage + U64 coverage = trianglePixelCoverage(p, triHeader, tileX, tileY, s_cover8x8_lut); + S32 pop = (triIdx == -1) ? 
0 : __popcll(coverage); + + // fragment count scan + U32 frag = scan32_value(pop, temp); + frag += fragWrite; // frag now holds cumulative fragment count + fragWrite += scan32_total(temp); + + // queue non-empty triangles + U32 goodMask = __ballot_sync(~0u, pop != 0); + if (pop != 0) + { + int idx = (triWrite + __popc(goodMask & getLaneMaskLt())) & 63; + triDataIdx [idx] = dataIdx; + triangleFrag[idx] = frag; + triangleCov [idx] = coverage; + } + triWrite += __popc(goodMask); + } + while (fragWrite - fragRead < 32 && segment >= 0); + } + __syncwarp(); + + // end of segment? + if (fragRead == fragWrite) + break; + + // clear triangle boundaries + temp[threadIdx.x + 16] = 0; + __syncwarp(); + + // tag triangle boundaries + if (triRead + threadIdx.x < triWrite) + { + int idx = triangleFrag[(triRead + threadIdx.x) & 63] - fragRead; + if (idx <= 32) + temp[idx + 16 - 1] = 1; + } + __syncwarp(); + + int ropLaneIdx = threadIdx.x; + U32 boundaryMask = __ballot_sync(~0u, temp[ropLaneIdx + 16]); + + // distribute fragments + bool hasFragment = (ropLaneIdx < fragWrite - fragRead); + U32 fragmentMask = __ballot_sync(~0u, hasFragment); + if (hasFragment) + { + int triBufIdx = (triRead + __popc(boundaryMask & getLaneMaskLt())) & 63; + int fragIdx = add_sub(fragRead, ropLaneIdx, triangleFrag[(triBufIdx - 1) & 63]); + U64 coverage = triangleCov[triBufIdx]; + int pixelInTile = findBit(coverage, fragIdx); + int dataIdx = triDataIdx[triBufIdx]; + + // determine pixel position + U32 pixelX = (tileX << CR_TILE_LOG2) + (pixelInTile & 7); + U32 pixelY = (tileY << CR_TILE_LOG2) + (pixelInTile >> 3); + + // depth test + U32 depth = 0; + uint4 td = *((uint4*)triData + dataIdx * (sizeof(CRTriangleData) >> 4)); + + depth = td.x * pixelX + td.y * pixelY + td.z; + bool zkill = (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) && (depth <= tilePeel[pixelInTile]); + if (!zkill) + { + U32 oldDepth = tileDepth[pixelInTile]; + if (depth > oldDepth) + zkill = true; + else if 
(oldDepth == tileZMax) + tileZUpd = true; // we are replacing previous zmax => need to update + } + + U32 ropMask = __ballot_sync(fragmentMask, !zkill); + if (!zkill) + executeROP(td.w, depth, &tileColor[pixelInTile], &tileDepth[pixelInTile], ropMask); + } + // no need to sync, as next up is updateTileZMax that does internal warp sync + + // update counters + fragRead = ::min(fragRead + 32, fragWrite); + triRead += __popc(boundaryMask); + } + + // Write tile back to the framebuffer. + if (true) + { + int px = (tileX << CR_TILE_LOG2) + (threadIdx.x & (CR_TILE_SIZE - 1)); + int py = (tileY << CR_TILE_LOG2) + (threadIdx.x >> CR_TILE_LOG2); + U32* pColor = (U32*)p.colorBuffer + p.strideX * p.strideY * blockIdx.z; + U32* pDepth = (U32*)p.depthBuffer + p.strideX * p.strideY * blockIdx.z; + pColor[px + p.strideX * py] = tileColor[threadIdx.x]; + pDepth[px + p.strideX * py] = tileDepth[threadIdx.x]; + pColor[px + p.strideX * (py + 4)] = tileColor[threadIdx.x + 32]; + pDepth[px + p.strideX * (py + 4)] = tileDepth[threadIdx.x + 32]; + } + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp new file mode 100644 index 00000000..26133c97 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/PrivateDefs.hpp @@ -0,0 +1,153 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#pragma once +#include "Defs.hpp" +#include "Constants.hpp" + +namespace CR +{ +//------------------------------------------------------------------------ +// Projected triangle. +//------------------------------------------------------------------------ + +struct CRTriangleHeader +{ + S16 v0x; // Subpixels relative to viewport center. Valid if triSubtris = 1. + S16 v0y; + S16 v1x; + S16 v1y; + S16 v2x; + S16 v2y; + + U32 misc; // triSubtris=1: (zmin:20, f01:4, f12:4, f20:4), triSubtris>=2: (subtriBase) +}; + +//------------------------------------------------------------------------ + +struct CRTriangleData +{ + U32 zx; // zx * sampleX + zy * sampleY + zb = lerp(CR_DEPTH_MIN, CR_DEPTH_MAX, (clipZ / clipW + 1) / 2) + U32 zy; + U32 zb; + U32 id; // Triangle id. +}; + +//------------------------------------------------------------------------ +// Device-side structures. +//------------------------------------------------------------------------ + +struct CRAtomics +{ + // Setup. + S32 numSubtris; // = numTris + + // Bin. + S32 binCounter; // = 0 + S32 numBinSegs; // = 0 + + // Coarse. + S32 coarseCounter; // = 0 + S32 numTileSegs; // = 0 + S32 numActiveTiles; // = 0 + + // Fine. + S32 fineCounter; // = 0 +}; + +//------------------------------------------------------------------------ + +struct CRImageParams +{ + S32 triOffset; // First triangle index to draw. + S32 triCount; // Number of triangles to draw. + S32 binBatchSize; // Number of triangles per batch. +}; + +//------------------------------------------------------------------------ + +struct CRParams +{ + // Common. + + CRAtomics* atomics; // Work counters. Per-image. + S32 numImages; // Batch size. + S32 totalCount; // In range mode, total number of triangles to render. + S32 instanceMode; // 0 = range mode, 1 = instance mode. + + S32 numVertices; // Number of vertices in input buffer, not counting multiples in instance mode. + S32 numTriangles; // Number of triangles in input buffer. 
+ void* vertexBuffer; // numVertices * float4(x, y, z, w) + void* indexBuffer; // numTriangles * int3(vi0, vi1, vi2) + + S32 widthPixels; // Render buffer size in pixels. Must be multiple of tile size (8x8). + S32 heightPixels; + S32 widthPixelsVp; // Viewport size in pixels. + S32 heightPixelsVp; + S32 widthBins; // widthPixels / CR_BIN_SIZE + S32 heightBins; // heightPixels / CR_BIN_SIZE + S32 numBins; // widthBins * heightBins + + F32 xs; // Vertex position adjustments for tiled rendering. + F32 ys; + F32 xo; + F32 yo; + + S32 widthTiles; // widthPixels / CR_TILE_SIZE + S32 heightTiles; // heightPixels / CR_TILE_SIZE + S32 numTiles; // widthTiles * heightTiles + + U32 renderModeFlags; + S32 deferredClear; // 1 = Clear framebuffer before rendering triangles. + U32 clearColor; + U32 clearDepth; + + // These are uniform across batch. + + S32 maxSubtris; + S32 maxBinSegs; + S32 maxTileSegs; + + // Setup output / bin input. + + void* triSubtris; // maxSubtris * U8 + void* triHeader; // maxSubtris * CRTriangleHeader + void* triData; // maxSubtris * CRTriangleData + + // Bin output / coarse input. + + void* binSegData; // maxBinSegs * CR_BIN_SEG_SIZE * S32 + void* binSegNext; // maxBinSegs * S32 + void* binSegCount; // maxBinSegs * S32 + void* binFirstSeg; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 segIdx), -1 = none + void* binTotal; // CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * (S32 numTris) + + // Coarse output / fine input. + + void* tileSegData; // maxTileSegs * CR_TILE_SEG_SIZE * S32 + void* tileSegNext; // maxTileSegs * S32 + void* tileSegCount; // maxTileSegs * S32 + void* activeTiles; // CR_MAXTILES_SQR * (S32 tileIdx) + void* tileFirstSeg; // CR_MAXTILES_SQR * (S32 segIdx), -1 = none + + // Surface buffers. Outer tile offset is baked into pointers. 
+ + void* colorBuffer; // sizePixels.x * sizePixels.y * numImages * U32 + void* depthBuffer; // sizePixels.x * sizePixels.y * numImages * U32 + void* peelBuffer; // sizePixels.x * sizePixels.y * numImages * U32, only if peeling enabled. + S32 strideX; // horizontal size in pixels + S32 strideY; // vertical stride in pixels + + // Per-image parameters for first images are embedded here to avoid extra memcpy for small batches. + + CRImageParams imageParamsFirst[CR_EMBED_IMAGE_PARAMS]; + const CRImageParams* imageParamsExtra; // After CR_EMBED_IMAGE_PARAMS. +}; + +//------------------------------------------------------------------------ +} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp new file mode 100644 index 00000000..f7f05d57 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cpp @@ -0,0 +1,370 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "../../framework.h" +#include "PrivateDefs.hpp" +#include "Constants.hpp" +#include "RasterImpl.hpp" +#include + +using namespace CR; +using std::min; +using std::max; + +//------------------------------------------------------------------------ +// Kernel prototypes and variables. 
+ +void triangleSetupKernel (const CRParams p); +void binRasterKernel (const CRParams p); +void coarseRasterKernel (const CRParams p); +void fineRasterKernel (const CRParams p); + +//------------------------------------------------------------------------ + +RasterImpl::RasterImpl(void) +: m_renderModeFlags (0), + m_deferredClear (false), + m_clearColor (0), + m_vertexPtr (NULL), + m_indexPtr (NULL), + m_numVertices (0), + m_numTriangles (0), + m_bufferSizesReported (0), + + m_numImages (0), + m_bufferSizePixels (0, 0), + m_bufferSizeVp (0, 0), + m_sizePixels (0, 0), + m_sizeVp (0, 0), + m_offsetPixels (0, 0), + m_sizeBins (0, 0), + m_numBins (0), + m_sizeTiles (0, 0), + m_numTiles (0), + + m_numSMs (1), + m_numCoarseBlocksPerSM (1), + m_numFineBlocksPerSM (1), + m_numFineWarpsPerBlock (1), + + m_maxSubtris (1), + m_maxBinSegs (1), + m_maxTileSegs (1) +{ + // Query relevant device attributes. + + int currentDevice = 0; + NVDR_CHECK_CUDA_ERROR(cudaGetDevice(¤tDevice)); + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&m_numSMs, cudaDevAttrMultiProcessorCount, currentDevice)); + cudaFuncAttributes attr; + NVDR_CHECK_CUDA_ERROR(cudaFuncGetAttributes(&attr, (void*)fineRasterKernel)); + m_numFineWarpsPerBlock = min(attr.maxThreadsPerBlock / 32, CR_FINE_MAX_WARPS); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numCoarseBlocksPerSM, (void*)coarseRasterKernel, 32 * CR_COARSE_WARPS, 0)); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_numFineBlocksPerSM, (void*)fineRasterKernel, 32 * m_numFineWarpsPerBlock, 0)); + + // Setup functions. 
+ + NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)triangleSetupKernel, cudaFuncCachePreferShared)); + NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)binRasterKernel, cudaFuncCachePreferShared)); + NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)coarseRasterKernel, cudaFuncCachePreferShared)); + NVDR_CHECK_CUDA_ERROR(cudaFuncSetCacheConfig((void*)fineRasterKernel, cudaFuncCachePreferShared)); +} + +//------------------------------------------------------------------------ + +RasterImpl::~RasterImpl(void) +{ + // Empty. +} + +//------------------------------------------------------------------------ + +void RasterImpl::setBufferSize(Vec3i size) +{ + // Internal buffer width and height must be divisible by tile size. + int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + + m_bufferSizePixels = Vec2i(w, h); + m_bufferSizeVp = Vec2i(size.x, size.y); + m_numImages = size.z; + + m_colorBuffer.reset(w * h * size.z * sizeof(U32)); + m_depthBuffer.reset(w * h * size.z * sizeof(U32)); +} + +//------------------------------------------------------------------------ + +void RasterImpl::setViewport(Vec2i size, Vec2i offset) +{ + // Offset must be divisible by tile size. + NVDR_CHECK((offset.x & (CR_TILE_SIZE - 1)) == 0 && (offset.y & (CR_TILE_SIZE - 1)) == 0, "invalid viewport offset"); + + // Round internal viewport size to multiples of tile size. 
+ int w = (size.x + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + int h = (size.y + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + + m_sizePixels = Vec2i(w, h); + m_offsetPixels = offset; + m_sizeVp = Vec2i(size.x, size.y); + m_sizeTiles.x = m_sizePixels.x >> CR_TILE_LOG2; + m_sizeTiles.y = m_sizePixels.y >> CR_TILE_LOG2; + m_numTiles = m_sizeTiles.x * m_sizeTiles.y; + m_sizeBins.x = (m_sizeTiles.x + CR_BIN_SIZE - 1) >> CR_BIN_LOG2; + m_sizeBins.y = (m_sizeTiles.y + CR_BIN_SIZE - 1) >> CR_BIN_LOG2; + m_numBins = m_sizeBins.x * m_sizeBins.y; +} + +void RasterImpl::swapDepthAndPeel(void) +{ + m_peelBuffer.reset(m_depthBuffer.getSize()); // Ensure equal size and valid pointer. + + void* tmp = m_depthBuffer.getPtr(); + m_depthBuffer.setPtr(m_peelBuffer.getPtr()); + m_peelBuffer.setPtr(tmp); +} + +//------------------------------------------------------------------------ + +bool RasterImpl::drawTriangles(const Vec2i* ranges, bool peel, cudaStream_t stream) +{ + bool instanceMode = (!ranges); + + int maxSubtrisSlack = 4096; // x 81B = 324KB + int maxBinSegsSlack = 256; // x 2137B = 534KB + int maxTileSegsSlack = 4096; // x 136B = 544KB + + // Resize atomics as needed. + m_crAtomics .grow(m_numImages * sizeof(CRAtomics)); + m_crAtomicsHost.grow(m_numImages * sizeof(CRAtomics)); + + // Size of these buffers doesn't depend on input. + m_binFirstSeg .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32)); + m_binTotal .grow(m_numImages * CR_MAXBINS_SQR * CR_BIN_STREAMS_SIZE * sizeof(S32)); + m_activeTiles .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32)); + m_tileFirstSeg .grow(m_numImages * CR_MAXTILES_SQR * sizeof(S32)); + + // Construct per-image parameters and determine worst-case buffer sizes. 
+ m_crImageParamsHost.grow(m_numImages * sizeof(CRImageParams)); + CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr(); + for (int i=0; i < m_numImages; i++) + { + CRImageParams& ip = imageParams[i]; + + int roundSize = CR_BIN_WARPS * 32; + int minBatches = CR_BIN_STREAMS_SIZE * 2; + int maxRounds = 32; + + ip.triOffset = instanceMode ? 0 : ranges[i].x; + ip.triCount = instanceMode ? m_numTriangles : ranges[i].y; + ip.binBatchSize = min(max(ip.triCount / (roundSize * minBatches), 1), maxRounds) * roundSize; + + m_maxSubtris = max(m_maxSubtris, min(ip.triCount + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE)); + m_maxBinSegs = max(m_maxBinSegs, max(m_numBins * CR_BIN_STREAMS_SIZE, (ip.triCount - 1) / CR_BIN_SEG_SIZE + 1) + maxBinSegsSlack); + m_maxTileSegs = max(m_maxTileSegs, max(m_numTiles, (ip.triCount - 1) / CR_TILE_SEG_SIZE + 1) + maxTileSegsSlack); + } + + // Retry until successful. + + for (;;) + { + // Allocate buffers. + m_triSubtris.reset(m_numImages * m_maxSubtris * sizeof(U8)); + m_triHeader .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleHeader)); + m_triData .reset(m_numImages * m_maxSubtris * sizeof(CRTriangleData)); + + m_binSegData .reset(m_numImages * m_maxBinSegs * CR_BIN_SEG_SIZE * sizeof(S32)); + m_binSegNext .reset(m_numImages * m_maxBinSegs * sizeof(S32)); + m_binSegCount.reset(m_numImages * m_maxBinSegs * sizeof(S32)); + + m_tileSegData .reset(m_numImages * m_maxTileSegs * CR_TILE_SEG_SIZE * sizeof(S32)); + m_tileSegNext .reset(m_numImages * m_maxTileSegs * sizeof(S32)); + m_tileSegCount.reset(m_numImages * m_maxTileSegs * sizeof(S32)); + + // Report if buffers grow from last time. + size_t sizesTotal = getTotalBufferSizes(); + if (sizesTotal > m_bufferSizesReported) + { + size_t sizesMB = ((sizesTotal - 1) >> 20) + 1; // Round up. + sizesMB = ((sizesMB + 9) / 10) * 10; // 10MB granularity enough in this day and age. 
+ LOG(INFO) << "Internal buffers grown to " << sizesMB << " MB"; + m_bufferSizesReported = sizesMB << 20; + } + + // Launch stages. Blocks until everything is done. + launchStages(instanceMode, peel, stream); + + // Peeling iteration cannot fail, so no point checking things further. + if (peel) + break; + + // Atomics after coarse stage are now available. + CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr(); + + // Success? + bool failed = false; + for (int i=0; i < m_numImages; i++) + { + const CRAtomics& a = atomics[i]; + failed = failed || (a.numSubtris > m_maxSubtris) || (a.numBinSegs > m_maxBinSegs) || (a.numTileSegs > m_maxTileSegs); + } + if (!failed) + break; // Success! + + // If we were already at maximum capacity, no can do. + if (m_maxSubtris == CR_MAXSUBTRIS_SIZE) + return false; + + // Enlarge buffers and try again. + for (int i=0; i < m_numImages; i++) + { + const CRAtomics& a = atomics[i]; + m_maxSubtris = max(m_maxSubtris, min(a.numSubtris + maxSubtrisSlack, CR_MAXSUBTRIS_SIZE)); + m_maxBinSegs = max(m_maxBinSegs, a.numBinSegs + maxBinSegsSlack); + m_maxTileSegs = max(m_maxTileSegs, a.numTileSegs + maxTileSegsSlack); + } + } + + m_deferredClear = false; + return true; // Success. +} + +//------------------------------------------------------------------------ + +size_t RasterImpl::getTotalBufferSizes(void) const +{ + return + m_colorBuffer.getSize() + m_depthBuffer.getSize() + // Don't include atomics and image params. 
+ m_triSubtris.getSize() + m_triHeader.getSize() + m_triData.getSize() + + m_binFirstSeg.getSize() + m_binTotal.getSize() + m_binSegData.getSize() + m_binSegNext.getSize() + m_binSegCount.getSize() + + m_activeTiles.getSize() + m_tileFirstSeg.getSize() + m_tileSegData.getSize() + m_tileSegNext.getSize() + m_tileSegCount.getSize(); +} + +//------------------------------------------------------------------------ + +void RasterImpl::launchStages(bool instanceMode, bool peel, cudaStream_t stream) +{ + CRImageParams* imageParams = (CRImageParams*)m_crImageParamsHost.getPtr(); + + // Unless peeling, initialize atomics to mostly zero. + CRAtomics* atomics = (CRAtomics*)m_crAtomicsHost.getPtr(); + if (!peel) + { + memset(atomics, 0, m_numImages * sizeof(CRAtomics)); + for (int i=0; i < m_numImages; i++) + atomics[i].numSubtris = imageParams[i].triCount; + } + + // Copy to device. If peeling, this is the state after coarse raster launch on first iteration. + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomics.getPtr(), atomics, m_numImages * sizeof(CRAtomics), cudaMemcpyHostToDevice, stream)); + + // Copy per-image parameters if there are more than fits in launch parameter block and we haven't done it already. + if (!peel && m_numImages > CR_EMBED_IMAGE_PARAMS) + { + int numImageParamsExtra = m_numImages - CR_EMBED_IMAGE_PARAMS; + m_crImageParamsExtra.grow(numImageParamsExtra * sizeof(CRImageParams)); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crImageParamsExtra.getPtr(), imageParams + CR_EMBED_IMAGE_PARAMS, numImageParamsExtra * sizeof(CRImageParams), cudaMemcpyHostToDevice, stream)); + } + + // Set global parameters. + CRParams p; + { + p.atomics = (CRAtomics*)m_crAtomics.getPtr(); + p.numImages = m_numImages; + p.totalCount = 0; // Only relevant in range mode. + p.instanceMode = instanceMode ? 
1 : 0; + + p.numVertices = m_numVertices; + p.numTriangles = m_numTriangles; + p.vertexBuffer = m_vertexPtr; + p.indexBuffer = m_indexPtr; + + p.widthPixels = m_sizePixels.x; + p.heightPixels = m_sizePixels.y; + p.widthPixelsVp = m_sizeVp.x; + p.heightPixelsVp = m_sizeVp.y; + p.widthBins = m_sizeBins.x; + p.heightBins = m_sizeBins.y; + p.numBins = m_numBins; + + p.xs = (float)m_bufferSizeVp.x / (float)m_sizeVp.x; + p.ys = (float)m_bufferSizeVp.y / (float)m_sizeVp.y; + p.xo = (float)(m_bufferSizeVp.x - m_sizeVp.x - 2 * m_offsetPixels.x) / (float)m_sizeVp.x; + p.yo = (float)(m_bufferSizeVp.y - m_sizeVp.y - 2 * m_offsetPixels.y) / (float)m_sizeVp.y; + + p.widthTiles = m_sizeTiles.x; + p.heightTiles = m_sizeTiles.y; + p.numTiles = m_numTiles; + + p.renderModeFlags = m_renderModeFlags; + p.deferredClear = m_deferredClear ? 1 : 0; + p.clearColor = m_clearColor; + p.clearDepth = CR_DEPTH_MAX; + + p.maxSubtris = m_maxSubtris; + p.maxBinSegs = m_maxBinSegs; + p.maxTileSegs = m_maxTileSegs; + + p.triSubtris = m_triSubtris.getPtr(); + p.triHeader = m_triHeader.getPtr(); + p.triData = m_triData.getPtr(); + p.binSegData = m_binSegData.getPtr(); + p.binSegNext = m_binSegNext.getPtr(); + p.binSegCount = m_binSegCount.getPtr(); + p.binFirstSeg = m_binFirstSeg.getPtr(); + p.binTotal = m_binTotal.getPtr(); + p.tileSegData = m_tileSegData.getPtr(); + p.tileSegNext = m_tileSegNext.getPtr(); + p.tileSegCount = m_tileSegCount.getPtr(); + p.activeTiles = m_activeTiles.getPtr(); + p.tileFirstSeg = m_tileFirstSeg.getPtr(); + + size_t byteOffset = ((size_t)m_offsetPixels.x + (size_t)m_offsetPixels.y * (size_t)p.strideX) * sizeof(U32); + p.colorBuffer = m_colorBuffer.getPtr(byteOffset); + p.depthBuffer = m_depthBuffer.getPtr(byteOffset); + p.peelBuffer = (m_renderModeFlags & CudaRaster::RenderModeFlag_EnableDepthPeeling) ? 
m_peelBuffer.getPtr(byteOffset) : 0; + p.strideX = m_bufferSizePixels.x; + p.strideY = m_bufferSizePixels.y; + + memcpy(&p.imageParamsFirst, imageParams, min(m_numImages, CR_EMBED_IMAGE_PARAMS) * sizeof(CRImageParams)); + p.imageParamsExtra = (CRImageParams*)m_crImageParamsExtra.getPtr(); + } + + // Setup block sizes. + + dim3 brBlock(32, CR_BIN_WARPS); + dim3 crBlock(32, CR_COARSE_WARPS); + dim3 frBlock(32, m_numFineWarpsPerBlock); + void* args[] = {&p}; + + // Launch stages from setup to coarse and copy atomics to host only if this is not a single-tile peeling iteration. + if (!peel) + { + if (instanceMode) + { + int setupBlocks = (m_numTriangles - 1) / (32 * CR_SETUP_WARPS) + 1; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, m_numImages), dim3(32, CR_SETUP_WARPS), args, 0, stream)); + } + else + { + for (int i=0; i < m_numImages; i++) + p.totalCount += imageParams[i].triCount; + int setupBlocks = (p.totalCount - 1) / (32 * CR_SETUP_WARPS) + 1; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)triangleSetupKernel, dim3(setupBlocks, 1, 1), dim3(32, CR_SETUP_WARPS), args, 0, stream)); + } + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)binRasterKernel, dim3(CR_BIN_STREAMS_SIZE, 1, m_numImages), brBlock, args, 0, stream)); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)coarseRasterKernel, dim3(m_numSMs * m_numCoarseBlocksPerSM, 1, m_numImages), crBlock, args, 0, stream)); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(m_crAtomicsHost.getPtr(), m_crAtomics.getPtr(), sizeof(CRAtomics) * m_numImages, cudaMemcpyDeviceToHost, stream)); + } + + // Fine rasterizer is launched always. 
+ NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)fineRasterKernel, dim3(m_numSMs * m_numFineBlocksPerSM, 1, m_numImages), frBlock, args, 0, stream)); + NVDR_CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cu new file mode 100644 index 00000000..43b1edf0 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.cu @@ -0,0 +1,37 @@ +// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "../CudaRaster.hpp" +#include "PrivateDefs.hpp" +#include "Constants.hpp" +#include "Util.inl" + +namespace CR +{ + +//------------------------------------------------------------------------ +// Stage implementations. +//------------------------------------------------------------------------ + +#include "TriangleSetup.inl" +#include "BinRaster.inl" +#include "CoarseRaster.inl" +#include "FineRaster.inl" + +} + +//------------------------------------------------------------------------ +// Stage entry points. 
+//------------------------------------------------------------------------
+
+// Thin __global__ trampolines into the stage implementations compiled above
+// (TriangleSetup.inl, BinRaster.inl, CoarseRaster.inl, FineRaster.inl).
+// __launch_bounds__ pins threads-per-block (warps * 32) so the compiler can
+// budget registers; only the setup kernel also requests a minimum number of
+// resident blocks (CR_SETUP_OPT_BLOCKS). CRParams is passed by value so each
+// launch gets an immutable snapshot of the parameter struct.
+__global__ void __launch_bounds__(CR_SETUP_WARPS * 32, CR_SETUP_OPT_BLOCKS) triangleSetupKernel (const CR::CRParams p) { CR::triangleSetupImpl(p); }
+__global__ void __launch_bounds__(CR_BIN_WARPS * 32, 1) binRasterKernel (const CR::CRParams p) { CR::binRasterImpl(p); }
+__global__ void __launch_bounds__(CR_COARSE_WARPS * 32, 1) coarseRasterKernel (const CR::CRParams p) { CR::coarseRasterImpl(p); }
+__global__ void __launch_bounds__(CR_FINE_MAX_WARPS * 32, 1) fineRasterKernel (const CR::CRParams p) { CR::fineRasterImpl(p); }
+
+//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp
new file mode 100644
index 00000000..d594acdf
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/RasterImpl.hpp
@@ -0,0 +1,102 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#pragma once
+#include "PrivateDefs.hpp"
+#include "Buffer.hpp"
+#include "../CudaRaster.hpp"
+
+namespace CR
+{
+//------------------------------------------------------------------------
+
+// Internal implementation of the CUDA software rasterizer. Owns all GPU-side
+// buffers (color/depth/peel surfaces plus the per-stage intermediate queues)
+// and drives the four-stage pipeline (setup -> bin -> coarse -> fine) via
+// launchStages(). Vertex/index buffers are caller-owned GPU pointers.
+class RasterImpl
+{
+public:
+    RasterImpl (void);
+    ~RasterImpl (void);
+
+    void setBufferSize (Vec3i size);
+    void setViewport (Vec2i size, Vec2i offset);
+    void setRenderModeFlags (U32 flags) { m_renderModeFlags = flags; }
+    // Clear is deferred: recorded here, performed by the next draw.
+    void deferredClear (U32 color) { m_deferredClear = true; m_clearColor = color; }
+    void setVertexBuffer (void* ptr, int numVertices) { m_vertexPtr = ptr; m_numVertices = numVertices; } // GPU pointer.
+    void setIndexBuffer (void* ptr, int numTriangles) { m_indexPtr = ptr; m_numTriangles = numTriangles; } // GPU pointer.
+    bool drawTriangles (const Vec2i* ranges, bool peel, cudaStream_t stream);
+    void* getColorBuffer (void) { return m_colorBuffer.getPtr(); } // GPU pointer.
+    void* getDepthBuffer (void) { return m_depthBuffer.getPtr(); } // GPU pointer.
+    void swapDepthAndPeel (void);
+    size_t getTotalBufferSizes (void) const;
+
+private:
+    void launchStages (bool instanceMode, bool peel, cudaStream_t stream);
+
+    // State.
+
+    unsigned int m_renderModeFlags;
+    bool m_deferredClear;
+    unsigned int m_clearColor;
+    void* m_vertexPtr;
+    void* m_indexPtr;
+    int m_numVertices; // Input buffer size.
+    int m_numTriangles; // Input buffer size.
+    size_t m_bufferSizesReported; // Previously reported buffer sizes.
+
+    // Surfaces.
+
+    Buffer m_colorBuffer;
+    Buffer m_depthBuffer;
+    Buffer m_peelBuffer;
+    int m_numImages;
+    Vec2i m_bufferSizePixels; // Internal buffer size.
+    Vec2i m_bufferSizeVp; // Total viewport size.
+    Vec2i m_sizePixels; // Internal size at which all computation is done, buffers reserved, etc.
+    Vec2i m_sizeVp; // Size to which output will be cropped outside, determines viewport size.
+    Vec2i m_offsetPixels; // Viewport offset for tiled rendering.
+    Vec2i m_sizeBins;
+    S32 m_numBins;
+    Vec2i m_sizeTiles;
+    S32 m_numTiles;
+
+    // Launch sizes etc.
+
+    S32 m_numSMs;
+    S32 m_numCoarseBlocksPerSM;
+    S32 m_numFineBlocksPerSM;
+    S32 m_numFineWarpsPerBlock;
+
+    // Global intermediate buffers. Individual images have offsets to these.
+
+    Buffer m_crAtomics;
+    HostBuffer m_crAtomicsHost;
+    HostBuffer m_crImageParamsHost;
+    Buffer m_crImageParamsExtra;
+    Buffer m_triSubtris;
+    Buffer m_triHeader;
+    Buffer m_triData;
+    Buffer m_binFirstSeg;
+    Buffer m_binTotal;
+    Buffer m_binSegData;
+    Buffer m_binSegNext;
+    Buffer m_binSegCount;
+    Buffer m_activeTiles;
+    Buffer m_tileFirstSeg;
+    Buffer m_tileSegData;
+    Buffer m_tileSegNext;
+    Buffer m_tileSegCount;
+
+    // Actual buffer sizes.
+
+    S32 m_maxSubtris;
+    S32 m_maxBinSegs;
+    S32 m_maxTileSegs;
+};
+
+//------------------------------------------------------------------------
+} // namespace CR
+
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl
new file mode 100644
index 00000000..276f0a40
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/TriangleSetup.inl
@@ -0,0 +1,402 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+
+// Project clip-space vertices to fixed-point subpixel coordinates.
+// Outputs snapped positions p0..p2, per-vertex 1/w, and the integer AABB.
+// Uses saturating float->int conversion, so out-of-range coords clamp
+// rather than wrap.
+__device__ __inline__ void snapTriangle(
+    const CRParams& p,
+    float4 v0, float4 v1, float4 v2,
+    int2& p0, int2& p1, int2& p2, float3& rcpW, int2& lo, int2& hi)
+{
+    F32 viewScaleX = (F32)(p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+    F32 viewScaleY = (F32)(p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+    rcpW = make_float3(1.0f / v0.w, 1.0f / v1.w, 1.0f / v2.w);
+    p0 = make_int2(f32_to_s32_sat(v0.x * rcpW.x * viewScaleX), f32_to_s32_sat(v0.y * rcpW.x * viewScaleY));
+    p1 = make_int2(f32_to_s32_sat(v1.x * rcpW.y * viewScaleX), f32_to_s32_sat(v1.y * rcpW.y * viewScaleY));
+    p2 = make_int2(f32_to_s32_sat(v2.x * rcpW.z * viewScaleX), f32_to_s32_sat(v2.y * rcpW.z * viewScaleY));
+    lo = make_int2(min_min(p0.x, p1.x, p2.x), min_min(p0.y, p1.y, p2.y));
+    hi = make_int2(max_max(p0.x, p1.x, p2.x), max_max(p0.y, p1.y, p2.y));
+}
+
+//------------------------------------------------------------------------
+
+// Choose the flip/swap/complement bits that map an edge direction (dx, dy)
+// into the canonical octant used by the 8x8 coverage lookup.
+__device__ __inline__ U32 cover8x8_selectFlips(S32 dx, S32 dy) // 10 instr
+{
+    U32 flips = 0;
+    if (dy > 0 || (dy == 0 && dx <= 0))
+        flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y) ^ (1 << CR_FLIPBIT_COMPL);
+    if (dx > 0)
+        flips ^= (1 << CR_FLIPBIT_FLIP_X) ^ (1 << CR_FLIPBIT_FLIP_Y);
+    if (::abs(dx) < ::abs(dy))
+        flips ^= (1 << CR_FLIPBIT_SWAP_XY) ^ (1 << CR_FLIPBIT_FLIP_Y);
+    return flips;
+}
+
+//------------------------------------------------------------------------
+
+// Decide whether a snapped triangle can produce any covered samples.
+// Returns false for degenerate, (optionally) backfacing, and
+// between-sample triangles; on success outputs edge vectors d1, d2 and the
+// signed fixed-point area.
+__device__ __inline__ bool prepareTriangle(
+    const CRParams& p,
+    int2 p0, int2 p1, int2 p2, int2 lo, int2 hi,
+    int2& d1, int2& d2, S32& area)
+{
+    // Backfacing or degenerate => cull.
+
+    d1 = make_int2(p1.x - p0.x, p1.y - p0.y);
+    d2 = make_int2(p2.x - p0.x, p2.y - p0.y);
+    area = d1.x * d2.y - d1.y * d2.x;
+
+    if (area == 0)
+        return false; // Degenerate.
+
+    if (area < 0 && (p.renderModeFlags & CudaRaster::RenderModeFlag_EnableBackfaceCulling) != 0)
+        return false; // Backfacing.
+
+    // AABB falls between samples => cull.
+
+    int sampleSize = 1 << CR_SUBPIXEL_LOG2;
+    int biasX = (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
+    int biasY = (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1)) - (sampleSize >> 1);
+    int lox = (int)add_add(lo.x, sampleSize - 1, biasX) & -sampleSize;
+    int loy = (int)add_add(lo.y, sampleSize - 1, biasY) & -sampleSize;
+    int hix = (hi.x + biasX) & -sampleSize;
+    int hiy = (hi.y + biasY) & -sampleSize;
+
+    if (lox > hix || loy > hiy)
+        return false; // Between pixels.
+
+    // AABB covers 1 or 2 samples => cull if they are not covered.
+
+    int diff = add_sub(hix, hiy, lox) - loy;
+    if (diff <= sampleSize)
+    {
+        // Test the lo corner sample against all three edge functions.
+        int2 t0 = make_int2(add_sub(p0.x, biasX, lox), add_sub(p0.y, biasY, loy));
+        int2 t1 = make_int2(add_sub(p1.x, biasX, lox), add_sub(p1.y, biasY, loy));
+        int2 t2 = make_int2(add_sub(p2.x, biasX, lox), add_sub(p2.y, biasY, loy));
+        S32 e0 = t0.x * t1.y - t0.y * t1.x;
+        S32 e1 = t1.x * t2.y - t1.y * t2.x;
+        S32 e2 = t2.x * t0.y - t2.y * t0.x;
+        if (area < 0)
+        {
+            e0 = -e0;
+            e1 = -e1;
+            e2 = -e2;
+        }
+
+        if (e0 < 0 || e1 < 0 || e2 < 0)
+        {
+            if (diff == 0)
+                return false; // Between pixels.
+
+            // Two-sample AABB: also test the hi corner sample.
+            t0 = make_int2(add_sub(p0.x, biasX, hix), add_sub(p0.y, biasY, hiy));
+            t1 = make_int2(add_sub(p1.x, biasX, hix), add_sub(p1.y, biasY, hiy));
+            t2 = make_int2(add_sub(p2.x, biasX, hix), add_sub(p2.y, biasY, hiy));
+            e0 = t0.x * t1.y - t0.y * t1.x;
+            e1 = t1.x * t2.y - t1.y * t2.x;
+            e2 = t2.x * t0.y - t2.y * t0.x;
+            if (area < 0)
+            {
+                e0 = -e0;
+                e1 = -e1;
+                e2 = -e2;
+            }
+
+            if (e0 < 0 || e1 < 0 || e2 < 0)
+                return false; // Between pixels.
+        }
+    }
+
+    // Otherwise => proceed to output the triangle.
+
+    return true; // Visible.
+}
+
+//------------------------------------------------------------------------
+
+// Emit the packed CRTriangleHeader / CRTriangleData records for one visible
+// (sub)triangle: orients it front-facing, builds the fixed-point depth plane
+// equation, and packs positions + flipbits + zmin.
+__device__ __inline__ void setupTriangle(
+    const CRParams& p,
+    CRTriangleHeader* th, CRTriangleData* td, int triId,
+    float v0z, float v1z, float v2z,
+    int2 p0, int2 p1, int2 p2, float3 rcpW,
+    int2 d1, int2 d2, S32 area)
+{
+    // Swap vertices 1 and 2 if area is negative. Only executed if backface culling is
+    // disabled (if it is enabled, we never come here with area < 0).
+
+    if (area < 0)
+    {
+        swap(d1, d2);
+        swap(p1, p2);
+        swap(v1z, v2z);
+        swap(rcpW.y, rcpW.z);
+        area = -area;
+    }
+
+    int2 wv0;
+    wv0.x = p0.x + (p.widthPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+    wv0.y = p0.y + (p.heightPixelsVp << (CR_SUBPIXEL_LOG2 - 1));
+
+    // Setup depth plane equation.
+
+    F32 zcoef = (F32)(CR_DEPTH_MAX - CR_DEPTH_MIN) * 0.5f;
+    F32 zbias = (F32)(CR_DEPTH_MAX + CR_DEPTH_MIN) * 0.5f;
+    float3 zvert = make_float3(
+        (v0z * zcoef) * rcpW.x + zbias,
+        (v1z * zcoef) * rcpW.y + zbias,
+        (v2z * zcoef) * rcpW.z + zbias
+    );
+    int2 zv0 = make_int2(
+        wv0.x - (1 << (CR_SUBPIXEL_LOG2 - 1)),
+        wv0.y - (1 << (CR_SUBPIXEL_LOG2 - 1))
+    );
+    uint3 zpleq = setupPleq(zvert, zv0, d1, d2, 1.0f / (F32)area);
+
+    // Conservative minimum depth, padded by the interpolation error bound.
+    U32 zmin = f32_to_u32_sat(fminf(fminf(zvert.x, zvert.y), zvert.z) - (F32)CR_LERP_ERROR(0));
+
+    // Write CRTriangleData.
+
+    *(uint4*)td = make_uint4(zpleq.x, zpleq.y, zpleq.z, triId);
+
+    // Determine flipbits.
+
+    U32 f01 = cover8x8_selectFlips(d1.x, d1.y);
+    U32 f12 = cover8x8_selectFlips(d2.x - d1.x, d2.y - d1.y);
+    U32 f20 = cover8x8_selectFlips(-d2.x, -d2.y);
+
+    // Write CRTriangleHeader.
+
+    *(uint4*)th = make_uint4(
+        prmt(p0.x, p0.y, 0x5410),
+        prmt(p1.x, p1.y, 0x5410),
+        prmt(p2.x, p2.y, 0x5410),
+        (zmin & 0xfffff000u) | (f01 << 6) | (f12 << 2) | (f20 >> 2));
+}
+
+//------------------------------------------------------------------------
+
+// Triangle setup stage: one thread per input triangle. Culls, snaps, and
+// (when needed) clips against the view frustum, writing 0..N subtriangle
+// records plus the per-triangle subtriangle count consumed by bin raster.
+__device__ __inline__ void triangleSetupImpl(const CRParams p)
+{
+    // Per-thread scratch for clipper barycentrics (up to 9 vertices x 2 coords).
+    __shared__ F32 s_bary[CR_SETUP_WARPS * 32][18];
+    F32* bary = s_bary[threadIdx.x + threadIdx.y * 32];
+
+    // Compute task and image indices.
+
+    int taskIdx = threadIdx.x + 32 * (threadIdx.y + CR_SETUP_WARPS * blockIdx.x);
+    int imageIdx = 0;
+    if (p.instanceMode)
+    {
+        imageIdx = blockIdx.z;
+        if (taskIdx >= p.numTriangles)
+            return;
+    }
+    else
+    {
+        // Range mode: walk the per-image triangle counts to locate our image.
+        while (imageIdx < p.numImages)
+        {
+            int count = getImageParams(p, imageIdx).triCount;
+            if (taskIdx < count)
+                break;
+            taskIdx -= count;
+            imageIdx += 1;
+        }
+        if (imageIdx == p.numImages)
+            return;
+    }
+
+    // Per-image data structures.
+
+    const CRImageParams& ip = getImageParams(p, imageIdx);
+    CRAtomics& atomics = p.atomics[imageIdx];
+
+    const int* indexBuffer = (const int*)p.indexBuffer;
+    U8* triSubtris = (U8*)p.triSubtris + imageIdx * p.maxSubtris;
+    CRTriangleHeader* triHeader = (CRTriangleHeader*)p.triHeader + imageIdx * p.maxSubtris;
+    CRTriangleData* triData = (CRTriangleData*)p.triData + imageIdx * p.maxSubtris;
+
+    // Determine triangle index.
+
+    int triIdx = taskIdx;
+    if (!p.instanceMode)
+        triIdx += ip.triOffset;
+
+    // Read vertex indices.
+
+    if ((U32)triIdx >= (U32)p.numTriangles)
+    {
+        // Bad triangle index.
+        triSubtris[taskIdx] = 0;
+        return;
+    }
+
+    uint4 vidx;
+    vidx.x = indexBuffer[triIdx * 3 + 0];
+    vidx.y = indexBuffer[triIdx * 3 + 1];
+    vidx.z = indexBuffer[triIdx * 3 + 2];
+    vidx.w = triIdx + 1; // Triangle index.
+
+    if (vidx.x >= (U32)p.numVertices ||
+        vidx.y >= (U32)p.numVertices ||
+        vidx.z >= (U32)p.numVertices)
+    {
+        // Bad vertex index.
+        triSubtris[taskIdx] = 0;
+        return;
+    }
+
+    // Read vertex positions.
+
+    const float4* vertexBuffer = (const float4*)p.vertexBuffer;
+    if (p.instanceMode)
+        vertexBuffer += p.numVertices * imageIdx; // Instance offset.
+
+    float4 v0 = vertexBuffer[vidx.x];
+    float4 v1 = vertexBuffer[vidx.y];
+    float4 v2 = vertexBuffer[vidx.z];
+
+    // Adjust vertex positions according to current viewport size and offset.
+
+    v0.x = v0.x * p.xs + v0.w * p.xo;
+    v0.y = v0.y * p.ys + v0.w * p.yo;
+    v1.x = v1.x * p.xs + v1.w * p.xo;
+    v1.y = v1.y * p.ys + v1.w * p.yo;
+    v2.x = v2.x * p.xs + v2.w * p.xo;
+    v2.y = v2.y * p.ys + v2.w * p.yo;
+
+    // Outside view frustum => cull.
+    // NOTE: bitwise | on comparison results is intentional — evaluates all
+    // operands without branching.
+
+    if (v0.w < fabsf(v0.x) | v0.w < fabsf(v0.y) | v0.w < fabsf(v0.z))
+    {
+        if ((v0.w < +v0.x & v1.w < +v1.x & v2.w < +v2.x) |
+            (v0.w < -v0.x & v1.w < -v1.x & v2.w < -v2.x) |
+            (v0.w < +v0.y & v1.w < +v1.y & v2.w < +v2.y) |
+            (v0.w < -v0.y & v1.w < -v1.y & v2.w < -v2.y) |
+            (v0.w < +v0.z & v1.w < +v1.z & v2.w < +v2.z) |
+            (v0.w < -v0.z & v1.w < -v1.z & v2.w < -v2.z))
+        {
+            triSubtris[taskIdx] = 0;
+            return;
+        }
+    }
+
+    // Inside depth range => try to snap vertices.
+
+    if (v0.w >= fabsf(v0.z) & v1.w >= fabsf(v1.z) & v2.w >= fabsf(v2.z))
+    {
+        // Inside S16 range and small enough => fast path.
+        // Note: aabbLimit comes from the fact that cover8x8
+        // does not support guardband with maximal viewport.
+
+        int2 p0, p1, p2, lo, hi;
+        float3 rcpW;
+
+        snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
+        S32 loxy = ::min(lo.x, lo.y);
+        S32 hixy = ::max(hi.x, hi.y);
+        S32 aabbLimit = (1 << (CR_MAXVIEWPORT_LOG2 + CR_SUBPIXEL_LOG2)) - 1;
+
+        if (loxy >= -32768 && hixy <= 32767 && hixy - loxy <= aabbLimit)
+        {
+            int2 d1, d2;
+            S32 area;
+            bool res = prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area);
+            triSubtris[taskIdx] = res ? 1 : 0;
+
+            if (res)
+                setupTriangle(
+                    p,
+                    &triHeader[taskIdx], &triData[taskIdx], vidx.w,
+                    v0.z, v1.z, v2.z,
+                    p0, p1, p2, rcpW,
+                    d1, d2, area);
+
+            return;
+        }
+    }
+
+    // Clip to view frustum.
+
+    float4 ov0 = v0;
+    float4 od1 = make_float4(v1.x - v0.x, v1.y - v0.y, v1.z - v0.z, v1.w - v0.w);
+    float4 od2 = make_float4(v2.x - v0.x, v2.y - v0.y, v2.z - v0.z, v2.w - v0.w);
+    int numVerts = clipTriangleWithFrustum(bary, &ov0.x, &v1.x, &v2.x, &od1.x, &od2.x);
+
+    // Count non-culled subtriangles.
+
+    v0.x = ov0.x + od1.x * bary[0] + od2.x * bary[1];
+    v0.y = ov0.y + od1.y * bary[0] + od2.y * bary[1];
+    v0.z = ov0.z + od1.z * bary[0] + od2.z * bary[1];
+    v0.w = ov0.w + od1.w * bary[0] + od2.w * bary[1];
+    v1.x = ov0.x + od1.x * bary[2] + od2.x * bary[3];
+    v1.y = ov0.y + od1.y * bary[2] + od2.y * bary[3];
+    v1.z = ov0.z + od1.z * bary[2] + od2.z * bary[3];
+    v1.w = ov0.w + od1.w * bary[2] + od2.w * bary[3];
+    float4 tv1 = v1;
+
+    int numSubtris = 0;
+    for (int i = 2; i < numVerts; i++)
+    {
+        v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
+        v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
+        v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
+        v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
+
+        int2 p0, p1, p2, lo, hi, d1, d2;
+        float3 rcpW;
+        S32 area;
+
+        snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
+        if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
+            numSubtris++;
+
+        v1 = v2;
+    }
+
+    triSubtris[taskIdx] = numSubtris;
+
+    // Multiple subtriangles => allocate.
+
+    int subtriBase = taskIdx;
+    if (numSubtris > 1)
+    {
+        subtriBase = atomicAdd(&atomics.numSubtris, numSubtris);
+        triHeader[taskIdx].misc = subtriBase;
+        if (subtriBase + numSubtris > p.maxSubtris)
+            numVerts = 0; // Out of space: skip the emit loop below.
+    }
+
+    // Setup subtriangles.
+
+    v1 = tv1;
+    for (int i = 2; i < numVerts; i++)
+    {
+        v2.x = ov0.x + od1.x * bary[i * 2 + 0] + od2.x * bary[i * 2 + 1];
+        v2.y = ov0.y + od1.y * bary[i * 2 + 0] + od2.y * bary[i * 2 + 1];
+        v2.z = ov0.z + od1.z * bary[i * 2 + 0] + od2.z * bary[i * 2 + 1];
+        v2.w = ov0.w + od1.w * bary[i * 2 + 0] + od2.w * bary[i * 2 + 1];
+
+        int2 p0, p1, p2, lo, hi, d1, d2;
+        float3 rcpW;
+        S32 area;
+
+        snapTriangle(p, v0, v1, v2, p0, p1, p2, rcpW, lo, hi);
+        if (prepareTriangle(p, p0, p1, p2, lo, hi, d1, d2, area))
+        {
+            setupTriangle(
+                p,
+                &triHeader[subtriBase], &triData[subtriBase], vidx.w,
+                v0.z, v1.z, v2.z,
+                p0, p1, p2, rcpW,
+                d1, d2, area);
+
+            subtriBase++;
+        }
+
+        v1 = v2;
+    }
+}
+
+//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl
new file mode 100644
index 00000000..f8faeba7
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/cudaraster/impl/Util.inl
@@ -0,0 +1,452 @@
+// Copyright (c) 2009-2022, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+#include "PrivateDefs.hpp"
+
+namespace CR
+{
+//------------------------------------------------------------------------
+// Low-level device helpers: 64-bit lo/hi access, lane masks, and thin
+// wrappers around PTX instructions (saturating conversions, SIMD-in-word
+// vadd/vsub/vmad/vmin/vmax, prmt byte permute, slct select).
+// FIX(review): the "template <class T>" parameter lists on swap() and
+// sortShared() had been stripped (leaving T undeclared); restored below.
+
+template <class T> __device__ __inline__ void swap(T& a, T& b) { T t = a; a = b; b = t; }
+
+// Lo/hi 32-bit halves of a 64-bit value, routed through the double regs.
+__device__ __inline__ U32 getLo (U64 a) { return __double2loint(__longlong_as_double(a)); }
+__device__ __inline__ S32 getLo (S64 a) { return __double2loint(__longlong_as_double(a)); }
+__device__ __inline__ U32 getHi (U64 a) { return __double2hiint(__longlong_as_double(a)); }
+__device__ __inline__ S32 getHi (S64 a) { return __double2hiint(__longlong_as_double(a)); }
+__device__ __inline__ U64 combineLoHi (U32 lo, U32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); }
+__device__ __inline__ S64 combineLoHi (S32 lo, S32 hi) { return __double_as_longlong(__hiloint2double(hi, lo)); }
+__device__ __inline__ U32 getLaneMaskLt (void) { U32 r; asm("mov.u32 %0, %lanemask_lt;" : "=r"(r)); return r; }
+__device__ __inline__ U32 getLaneMaskLe (void) { U32 r; asm("mov.u32 %0, %lanemask_le;" : "=r"(r)); return r; }
+__device__ __inline__ U32 getLaneMaskGt (void) { U32 r; asm("mov.u32 %0, %lanemask_gt;" : "=r"(r)); return r; }
+__device__ __inline__ U32 getLaneMaskGe (void) { U32 r; asm("mov.u32 %0, %lanemask_ge;" : "=r"(r)); return r; }
+__device__ __inline__ int findLeadingOne (U32 v) { U32 r; asm("bfind.u32 %0, %1;" : "=r"(r) : "r"(v)); return r; }
+// True on exactly one active lane of the warp (the lowest one).
+__device__ __inline__ bool singleLane (void) { return ((::__ballot_sync(~0u, true) & getLaneMaskLt()) == 0); }
+
+// 64-bit add expressed on two 32-bit halves (carry propagates via U64).
+__device__ __inline__ void add_add_carry (U32& rlo, U32 alo, U32 blo, U32& rhi, U32 ahi, U32 bhi) { U64 r = combineLoHi(alo, ahi) + combineLoHi(blo, bhi); rlo = getLo(r); rhi = getHi(r); }
+__device__ __inline__ S32 f32_to_s32_sat (F32 a) { S32 v; asm("cvt.rni.sat.s32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
+__device__ __inline__ U32 f32_to_u32_sat (F32 a) { U32 v; asm("cvt.rni.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
+__device__ __inline__ U32 f32_to_u32_sat_rmi (F32 a) { U32 v; asm("cvt.rmi.sat.u32.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
+__device__ __inline__ U32 f32_to_u8_sat (F32 a) { U32 v; asm("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(v) : "f"(a)); return v; }
+__device__ __inline__ S64 f32_to_s64 (F32 a) { S64 v; asm("cvt.rni.s64.f32 %0, %1;" : "=l"(v) : "f"(a)); return v; }
+// Half-word (.h0/.h1) and byte (.b0..b3) sub-register arithmetic.
+__device__ __inline__ S32 add_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 add_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 add_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 add_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vadd.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_s16lo_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_s16hi_s16lo (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_s16lo_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_s16hi_s16hi (S32 a, S32 b) { S32 v; asm("vsub.s32.s32.s32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_u16lo_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_u16hi_u16lo (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h0;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_u16lo_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h0, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ S32 sub_u16hi_u16hi (U32 a, U32 b) { S32 v; asm("vsub.s32.u32.u32 %0, %1.h1, %2.h1;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 add_b0 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b0, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 add_b1 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 add_b2 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b2, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 add_b3 (U32 a, U32 b) { U32 v; asm("vadd.u32.u32.u32 %0, %1.b3, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ U32 vmad_b0 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b1 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b2 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b0_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b0, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b1_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b1, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b2_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b2, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 vmad_b3_b3 (U32 a, U32 b, U32 c) { U32 v; asm("vmad.u32.u32.u32 %0, %1.b3, %2.b3, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 add_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vadd.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; }
+__device__ __inline__ U32 sub_mask8 (U32 a, U32 b) { U32 v; U32 z=0; asm("vsub.u32.u32.u32 %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(z)); return v; }
+// Fused three-operand forms: e.g. max_max(a,b,c) = max(max(a,b),c),
+// min_add(a,b,c) = min(a,b)+c, add_sub(a,b,c) = a+b-c (note operand order).
+__device__ __inline__ S32 max_max (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.max %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 min_min (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 max_add (S32 a, S32 b, S32 c) { S32 v; asm("vmax.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 min_add (S32 a, S32 b, S32 c) { S32 v; asm("vmin.s32.s32.s32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 add_add (U32 a, U32 b, U32 c) { U32 v; asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 sub_add (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 add_sub (U32 a, U32 b, U32 c) { U32 v; asm("vsub.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(c), "r"(b)); return v; }
+__device__ __inline__ S32 add_clamp_0_x (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat.min %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 add_clamp_b0 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 add_clamp_b2 (S32 a, S32 b, S32 c) { S32 v; asm("vadd.u32.s32.s32.sat %0.b2, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ U32 prmt (U32 a, U32 b, U32 c) { U32 v; asm("prmt.b32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 u32lo_sext (U32 a) { U32 v; asm("cvt.s16.u32 %0, %1;" : "=r"(v) : "r"(a)); return v; }
+__device__ __inline__ U32 slct (U32 a, U32 b, S32 c) { U32 v; asm("slct.u32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ S32 slct (S32 a, S32 b, S32 c) { S32 v; asm("slct.s32.s32 %0, %1, %2, %3;" : "=r"(v) : "r"(a), "r"(b), "r"(c)); return v; }
+__device__ __inline__ F32 slct (F32 a, F32 b, S32 c) { F32 v; asm("slct.f32.s32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "r"(c)); return v; }
+__device__ __inline__ U32 isetge (S32 a, S32 b) { U32 v; asm("set.ge.u32.s32 %0, %1, %2;" : "=r"(v) : "r"(a), "r"(b)); return v; }
+__device__ __inline__ F64 rcp_approx (F64 a) { F64 v; asm("rcp.approx.ftz.f64 %0, %1;" : "=d"(v) : "d"(a)); return v; }
+__device__ __inline__ F32 fma_rm (F32 a, F32 b, F32 c) { F32 v; asm("fma.rm.f32 %0, %1, %2, %3;" : "=f"(v) : "f"(a), "f"(b), "f"(c)); return v; }
+__device__ __inline__ U32 idiv_fast (U32 a, U32 b);
+
+__device__ __inline__ uint3 setupPleq (float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp);
+
+__device__ __inline__ void cover8x8_setupLUT (volatile U64* lut);
+__device__ __inline__ U64 cover8x8_exact_fast (S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut); // Assumes viewport <= 2^11, subpixels <= 2^4, no guardband.
+__device__ __inline__ U64 cover8x8_lookupMask (S64 yinit, U32 yinc, U32 flips, volatile const U64* lut);
+
+__device__ __inline__ U64 cover8x8_exact_noLUT (S32 ox, S32 oy, S32 dx, S32 dy); // optimized reference implementation, does not require look-up table
+__device__ __inline__ U64 cover8x8_conservative_noLUT (S32 ox, S32 oy, S32 dx, S32 dy);
+__device__ __inline__ U64 cover8x8_generateMask_noLUT (S32 curr, S32 dx, S32 dy);
+
+template <class T> __device__ __inline__ void sortShared(T* ptr, int numItems); // Assumes that numItems <= threadsInBlock. Must sync before & after the call.
+ +__device__ __inline__ const CRImageParams& getImageParams(const CRParams& p, int idx) +{ + return (idx < CR_EMBED_IMAGE_PARAMS) ? p.imageParamsFirst[idx] : p.imageParamsExtra[idx - CR_EMBED_IMAGE_PARAMS]; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ int clipPolygonWithPlane(F32* baryOut, const F32* baryIn, int numIn, F32 v0, F32 v1, F32 v2) +{ + int numOut = 0; + if (numIn >= 3) + { + int ai = (numIn - 1) * 2; + F32 av = v0 + v1 * baryIn[ai + 0] + v2 * baryIn[ai + 1]; + for (int bi = 0; bi < numIn * 2; bi += 2) + { + F32 bv = v0 + v1 * baryIn[bi + 0] + v2 * baryIn[bi + 1]; + if (av * bv < 0.0f) + { + F32 bc = av / (av - bv); + F32 ac = 1.0f - bc; + baryOut[numOut + 0] = baryIn[ai + 0] * ac + baryIn[bi + 0] * bc; + baryOut[numOut + 1] = baryIn[ai + 1] * ac + baryIn[bi + 1] * bc; + numOut += 2; + } + if (bv >= 0.0f) + { + baryOut[numOut + 0] = baryIn[bi + 0]; + baryOut[numOut + 1] = baryIn[bi + 1]; + numOut += 2; + } + ai = bi; + av = bv; + } + } + return (numOut >> 1); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ int clipTriangleWithFrustum(F32* bary, const F32* v0, const F32* v1, const F32* v2, const F32* d1, const F32* d2) +{ + int num = 3; + bary[0] = 0.0f, bary[1] = 0.0f; + bary[2] = 1.0f, bary[3] = 0.0f; + bary[4] = 0.0f, bary[5] = 1.0f; + + if ((v0[3] < fabsf(v0[0])) | (v1[3] < fabsf(v1[0])) | (v2[3] < fabsf(v2[0]))) + { + F32 temp[18]; + num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[0], d1[3] + d1[0], d2[3] + d2[0]); + num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[0], d1[3] - d1[0], d2[3] - d2[0]); + } + if ((v0[3] < fabsf(v0[1])) | (v1[3] < fabsf(v1[1])) | (v2[3] < fabsf(v2[1]))) + { + F32 temp[18]; + num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[1], d1[3] + d1[1], d2[3] + d2[1]); + num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[1], d1[3] - d1[1], d2[3] - d2[1]); + } + if ((v0[3] < fabsf(v0[2])) | 
(v1[3] < fabsf(v1[2])) | (v2[3] < fabsf(v2[2]))) + { + F32 temp[18]; + num = clipPolygonWithPlane(temp, bary, num, v0[3] + v0[2], d1[3] + d1[2], d2[3] + d2[2]); + num = clipPolygonWithPlane(bary, temp, num, v0[3] - v0[2], d1[3] - d1[2], d2[3] - d2[2]); + } + return num; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U32 idiv_fast(U32 a, U32 b) +{ + return f32_to_u32_sat_rmi(((F32)a + 0.5f) / (F32)b); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U32 toABGR(float4 color) +{ + // 11 instructions: 4*FFMA, 4*F2I, 3*PRMT + U32 x = f32_to_u32_sat_rmi(fma_rm(color.x, (1 << 24) * 255.0f, (1 << 24) * 0.5f)); + U32 y = f32_to_u32_sat_rmi(fma_rm(color.y, (1 << 24) * 255.0f, (1 << 24) * 0.5f)); + U32 z = f32_to_u32_sat_rmi(fma_rm(color.z, (1 << 24) * 255.0f, (1 << 24) * 0.5f)); + U32 w = f32_to_u32_sat_rmi(fma_rm(color.w, (1 << 24) * 255.0f, (1 << 24) * 0.5f)); + return prmt(prmt(x, y, 0x0073), prmt(z, w, 0x0073), 0x5410); +} + +//------------------------------------------------------------------------ +// v0 = subpixels relative to the bottom-left sampling point + +__device__ __inline__ uint3 setupPleq(float3 values, int2 v0, int2 d1, int2 d2, F32 areaRcp) +{ + F32 mx = fmaxf(fmaxf(values.x, values.y), values.z); + int sh = ::min(::max((__float_as_int(mx) >> 23) - (127 + 22), 0), 8); + S32 t0 = (U32)values.x >> sh; + S32 t1 = ((U32)values.y >> sh) - t0; + S32 t2 = ((U32)values.z >> sh) - t0; + + U32 rcpMant = (__float_as_int(areaRcp) & 0x007FFFFF) | 0x00800000; + int rcpShift = (23 + 127) - (__float_as_int(areaRcp) >> 23); + + uint3 pleq; + S64 xc = ((S64)t1 * d2.y - (S64)t2 * d1.y) * rcpMant; + S64 yc = ((S64)t2 * d1.x - (S64)t1 * d2.x) * rcpMant; + pleq.x = (U32)(xc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2))); + pleq.y = (U32)(yc >> (rcpShift - (sh + CR_SUBPIXEL_LOG2))); + + S32 centerX = (v0.x * 2 + min_min(d1.x, d2.x, 0) + max_max(d1.x, d2.x, 0)) >> 
(CR_SUBPIXEL_LOG2 + 1); + S32 centerY = (v0.y * 2 + min_min(d1.y, d2.y, 0) + max_max(d1.y, d2.y, 0)) >> (CR_SUBPIXEL_LOG2 + 1); + S32 vcx = v0.x - (centerX << CR_SUBPIXEL_LOG2); + S32 vcy = v0.y - (centerY << CR_SUBPIXEL_LOG2); + + pleq.z = t0 << sh; + pleq.z -= (U32)(((xc >> 13) * vcx + (yc >> 13) * vcy) >> (rcpShift - (sh + 13))); + pleq.z -= pleq.x * centerX + pleq.y * centerY; + return pleq; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ void cover8x8_setupLUT(volatile U64* lut) +{ + for (S32 lutIdx = threadIdx.x + blockDim.x * threadIdx.y; lutIdx < CR_COVER8X8_LUT_SIZE; lutIdx += blockDim.x * blockDim.y) + { + int half = (lutIdx < (12 << 5)) ? 0 : 1; + int yint = (lutIdx >> 5) - half * 12 - 3; + U32 shape = ((lutIdx >> 2) & 7) << (31 - 2); + S32 slctSwapXY = lutIdx << (31 - 1); + S32 slctNegX = lutIdx << (31 - 0); + S32 slctCompl = slctSwapXY ^ slctNegX; + + U64 mask = 0; + int xlo = half * 4; + int xhi = xlo + 4; + for (int x = xlo; x < xhi; x++) + { + int ylo = slct(0, ::max(yint, 0), slctCompl); + int yhi = slct(::min(yint, 8), 8, slctCompl); + for (int y = ylo; y < yhi; y++) + { + int xx = slct(x, y, slctSwapXY); + int yy = slct(y, x, slctSwapXY); + xx = slct(xx, 7 - xx, slctNegX); + mask |= (U64)1 << (xx + yy * 8); + } + yint += shape >> 31; + shape <<= 1; + } + lut[lutIdx] = mask; + } +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_exact_fast(S32 ox, S32 oy, S32 dx, S32 dy, U32 flips, volatile const U64* lut) // 52 instr +{ + F32 yinitBias = (F32)(1 << (31 - CR_MAXVIEWPORT_LOG2 - CR_SUBPIXEL_LOG2 * 2)); + F32 yinitScale = (F32)(1 << (32 - CR_SUBPIXEL_LOG2)); + F32 yincScale = 65536.0f * 65536.0f; + + S32 slctFlipY = flips << (31 - CR_FLIPBIT_FLIP_Y); + S32 slctFlipX = flips << (31 - CR_FLIPBIT_FLIP_X); + S32 slctSwapXY = flips << (31 - CR_FLIPBIT_SWAP_XY); + + // Evaluate cross product. 
+ + S32 t = ox * dy - oy * dx; + F32 det = (F32)slct(t, t - dy * (7 << CR_SUBPIXEL_LOG2), slctFlipX); + if (flips >= (1 << CR_FLIPBIT_COMPL)) + det = -det; + + // Represent Y as a function of X. + + F32 xrcp = 1.0f / (F32)::abs(slct(dx, dy, slctSwapXY)); + F32 yzero = det * yinitScale * xrcp + yinitBias; + S64 yinit = f32_to_s64(slct(yzero, -yzero, slctFlipY)); + U32 yinc = f32_to_u32_sat((F32)::abs(slct(dy, dx, slctSwapXY)) * xrcp * yincScale); + + // Lookup. + + return cover8x8_lookupMask(yinit, yinc, flips, lut); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_lookupMask(S64 yinit, U32 yinc, U32 flips, volatile const U64* lut) +{ + // First half. + + U32 yfrac = getLo(yinit); + U32 shape = add_clamp_0_x(getHi(yinit) + 4, 0, 11); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + int oct = flips & ((1 << CR_FLIPBIT_FLIP_X) | (1 << CR_FLIPBIT_SWAP_XY)); + U64 mask = *(U64*)((U8*)lut + oct + (shape << 5)); + + // Second half. + + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + shape = add_clamp_0_x(getHi(yinit) + 4, __popc(shape & 15), 11); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + add_add_carry(yfrac, yfrac, yinc, shape, shape, shape); + mask |= *(U64*)((U8*)lut + oct + (shape << 5) + (12 << 8)); + return (flips >= (1 << CR_FLIPBIT_COMPL)) ? 
~mask : mask; +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_exact_noLUT(S32 ox, S32 oy, S32 dx, S32 dy) +{ + S32 curr = ox * dy - oy * dx; + if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive + return cover8x8_generateMask_noLUT(curr, dx, dy); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_conservative_noLUT(S32 ox, S32 oy, S32 dx, S32 dy) +{ + S32 curr = ox * dy - oy * dx; + if (dy > 0 || (dy == 0 && dx <= 0)) curr--; // exclusive + curr += (::abs(dx) + ::abs(dy)) << (CR_SUBPIXEL_LOG2 - 1); + return cover8x8_generateMask_noLUT(curr, dx, dy); +} + +//------------------------------------------------------------------------ + +__device__ __inline__ U64 cover8x8_generateMask_noLUT(S32 curr, S32 dx, S32 dy) +{ + curr += (dx - dy) * (7 << CR_SUBPIXEL_LOG2); + S32 stepX = dy << (CR_SUBPIXEL_LOG2 + 1); + S32 stepYorig = -dx - dy * 7; + S32 stepY = stepYorig << (CR_SUBPIXEL_LOG2 + 1); + + U32 hi = isetge(curr, 0); + U32 frac = curr + curr; + for (int i = 62; i >= 32; i--) + add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, hi, hi, hi); + + U32 lo = 0; + for (int i = 31; i >= 0; i--) + add_add_carry(frac, frac, ((i & 7) == 7) ? stepY : stepX, lo, lo, lo); + + lo ^= lo >> 1, hi ^= hi >> 1; + lo ^= lo >> 2, hi ^= hi >> 2; + lo ^= lo >> 4, hi ^= hi >> 4; + lo ^= lo >> 8, hi ^= hi >> 8; + lo ^= lo >> 16, hi ^= hi >> 16; + + if (dy < 0) + { + lo ^= 0x55AA55AA; + hi ^= 0x55AA55AA; + } + if (stepYorig < 0) + { + lo ^= 0xFF00FF00; + hi ^= 0x00FF00FF; + } + if ((hi & 1) != 0) + lo = ~lo; + + return combineLoHi(lo, hi); +} + +//------------------------------------------------------------------------ + +template __device__ __inline__ void sortShared(T* ptr, int numItems) +{ + int thrInBlock = threadIdx.x + threadIdx.y * blockDim.x; + int range = 16; + + // Use transposition sort within each 16-wide subrange. 
+ + int base = thrInBlock * 2; + bool act = (base < numItems - 1); + U32 actMask = __ballot_sync(~0u, act); + if (act) + { + bool tryOdd = (base < numItems - 2 && (~base & (range - 2)) != 0); + T mid = ptr[base + 1]; + + for (int iter = 0; iter < range; iter += 2) + { + // Evens. + + T tmp = ptr[base + 0]; + if (tmp > mid) + { + ptr[base + 0] = mid; + mid = tmp; + } + __syncwarp(actMask); + + // Odds. + + if (tryOdd) + { + tmp = ptr[base + 2]; + if (mid > tmp) + { + ptr[base + 2] = mid; + mid = tmp; + } + } + __syncwarp(actMask); + } + ptr[base + 1] = mid; + } + + // Multiple subranges => Merge hierarchically. + + for (; range < numItems; range <<= 1) + { + // Assuming that we would insert the current item into the other + // subrange, use binary search to find the appropriate slot. + + __syncthreads(); + + T item; + int slot; + if (thrInBlock < numItems) + { + item = ptr[thrInBlock]; + slot = (thrInBlock & -range) ^ range; + if (slot < numItems) + { + T tmp = ptr[slot]; + bool inclusive = ((thrInBlock & range) != 0); + if (tmp < item || (inclusive && tmp == item)) + { + for (int step = (range >> 1); step != 0; step >>= 1) + { + int probe = slot + step; + if (probe < numItems) + { + tmp = ptr[probe]; + if (tmp < item || (inclusive && tmp == item)) + slot = probe; + } + } + slot++; + } + } + } + + // Store the item at an appropriate place. + + __syncthreads(); + + if (thrInBlock < numItems) + ptr[slot + (thrInBlock & (range * 2 - 1)) - range] = item; + } +} + +//------------------------------------------------------------------------ +} diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/framework.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/framework.h new file mode 100644 index 00000000..12d803ca --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/framework.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +// Framework-specific macros to enable code sharing. + +//------------------------------------------------------------------------ +// Tensorflow. + +#ifdef NVDR_TENSORFLOW +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/platform/default/logging.h" +using namespace tensorflow; +using namespace tensorflow::shape_inference; +#define NVDR_CTX_ARGS OpKernelContext* _nvdr_ctx +#define NVDR_CTX_PARAMS _nvdr_ctx +#define NVDR_CHECK(COND, ERR) OP_REQUIRES(_nvdr_ctx, COND, errors::Internal(ERR)) +#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) OP_CHECK_CUDA_ERROR(_nvdr_ctx, CUDA_CALL) +#define NVDR_CHECK_GL_ERROR(GL_CALL) OP_CHECK_GL_ERROR(_nvdr_ctx, GL_CALL) +#endif + +//------------------------------------------------------------------------ +// PyTorch. 
+
+#ifdef NVDR_TORCH
+#ifndef __CUDACC__
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDAUtils.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <pybind11/pybind11.h>
+#endif
+#define NVDR_CTX_ARGS int _nvdr_ctx_dummy
+#define NVDR_CTX_PARAMS 0
+#define NVDR_CHECK(COND, ERR) do { TORCH_CHECK(COND, ERR) } while(0)
+#define NVDR_CHECK_CUDA_ERROR(CUDA_CALL) do { cudaError_t err = CUDA_CALL; TORCH_CHECK(!err, "Cuda error: ", cudaGetLastError(), "[", #CUDA_CALL, ";]"); } while(0)
+#define NVDR_CHECK_GL_ERROR(GL_CALL) do { GL_CALL; GLenum err = glGetError(); TORCH_CHECK(err == GL_NO_ERROR, "OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]"); } while(0)
+#endif
+
+//------------------------------------------------------------------------
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil.cpp
new file mode 100644
index 00000000..2af3e931
--- /dev/null
+++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil.cpp
@@ -0,0 +1,403 @@
+// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+//------------------------------------------------------------------------
+// Common.
+//------------------------------------------------------------------------
+
+#include "framework.h"
+#include "glutil.h"
+#include <iostream>
+#include <iomanip>
+
+// Create the function pointers.
+#define GLUTIL_EXT(return_type, name, ...) return_type (GLAPIENTRY* name)(__VA_ARGS__) = 0;
+#include "glutil_extlist.h"
+#undef GLUTIL_EXT
+
+// Track initialization status.
+static volatile bool s_glExtInitialized = false;
+
+// Error strings.
+const char* getGLErrorString(GLenum err) +{ + switch(err) + { + case GL_NO_ERROR: return "GL_NO_ERROR"; + case GL_INVALID_ENUM: return "GL_INVALID_ENUM"; + case GL_INVALID_VALUE: return "GL_INVALID_VALUE"; + case GL_INVALID_OPERATION: return "GL_INVALID_OPERATION"; + case GL_STACK_OVERFLOW: return "GL_STACK_OVERFLOW"; + case GL_STACK_UNDERFLOW: return "GL_STACK_UNDERFLOW"; + case GL_OUT_OF_MEMORY: return "GL_OUT_OF_MEMORY"; + case GL_INVALID_FRAMEBUFFER_OPERATION: return "GL_INVALID_FRAMEBUFFER_OPERATION"; + case GL_TABLE_TOO_LARGE: return "GL_TABLE_TOO_LARGE"; + case GL_CONTEXT_LOST: return "GL_CONTEXT_LOST"; + } + return "Unknown error"; +} + +//------------------------------------------------------------------------ +// Windows. +//------------------------------------------------------------------------ + +#ifdef _WIN32 + +static CRITICAL_SECTION getInitializedCriticalSection(void) +{ + CRITICAL_SECTION cs; + InitializeCriticalSection(&cs); + return cs; +} + +static CRITICAL_SECTION s_getProcAddressMutex = getInitializedCriticalSection(); + +static void safeGetProcAddress(const char* name, PROC* pfn) +{ + PROC result = wglGetProcAddress(name); + if (!result) + { + LeaveCriticalSection(&s_getProcAddressMutex); // Prepare for thread exit. + LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'"; + exit(1); // Should never get here but make sure we exit. + } + *pfn = result; +} + +static void initializeGLExtensions(void) +{ + // Use critical section for thread safety. + EnterCriticalSection(&s_getProcAddressMutex); + + // Only dig function pointers if not done already. + if (!s_glExtInitialized) + { + // Generate code to populate the function pointers. +#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROC*)&name); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + + // Mark as initialized. + s_glExtInitialized = true; + } + + // Done. 
+ LeaveCriticalSection(&s_getProcAddressMutex); + return; +} + +void setGLContext(GLContext& glctx) +{ + if (!glctx.hglrc) + LOG(FATAL) << "setGLContext() called with null gltcx"; + if (!wglMakeCurrent(glctx.hdc, glctx.hglrc)) + LOG(FATAL) << "wglMakeCurrent() failed when setting GL context"; + + if (glctx.extInitialized) + return; + initializeGLExtensions(); + glctx.extInitialized = 1; +} + +void releaseGLContext(void) +{ + if (!wglMakeCurrent(NULL, NULL)) + LOG(FATAL) << "wglMakeCurrent() failed when releasing GL context"; +} + +extern "C" int set_gpu(const char*); // In setgpu.lib +GLContext createGLContext(int cudaDeviceIdx) +{ + if (cudaDeviceIdx >= 0) + { + char pciBusId[256] = ""; + LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx; + if (cudaDeviceGetPCIBusId(pciBusId, 255, cudaDeviceIdx)) + { + LOG(INFO) << "PCI bus id query failed"; + } + else + { + int res = set_gpu(pciBusId); + LOG(INFO) << "Selecting device with PCI bus id " << pciBusId << " - " << (res ? 
"failed, expect crash or major slowdown" : "success"); + } + } + + HINSTANCE hInstance = GetModuleHandle(NULL); + WNDCLASS wc = {}; + wc.style = CS_OWNDC; + wc.lpfnWndProc = DefWindowProc; + wc.hInstance = hInstance; + wc.lpszClassName = "__DummyGLClassCPP"; + int res = RegisterClass(&wc); + + HWND hwnd = CreateWindow( + "__DummyGLClassCPP", // lpClassName + "__DummyGLWindowCPP", // lpWindowName + WS_OVERLAPPEDWINDOW, // dwStyle + CW_USEDEFAULT, // x + CW_USEDEFAULT, // y + 0, 0, // nWidth, nHeight + NULL, NULL, // hWndParent, hMenu + hInstance, // hInstance + NULL // lpParam + ); + + PIXELFORMATDESCRIPTOR pfd = {}; + pfd.dwFlags = PFD_SUPPORT_OPENGL; + pfd.iPixelType = PFD_TYPE_RGBA; + pfd.iLayerType = PFD_MAIN_PLANE; + pfd.cColorBits = 32; + pfd.cDepthBits = 24; + pfd.cStencilBits = 8; + + HDC hdc = GetDC(hwnd); + int pixelformat = ChoosePixelFormat(hdc, &pfd); + SetPixelFormat(hdc, pixelformat, &pfd); + + HGLRC hglrc = wglCreateContext(hdc); + LOG(INFO) << std::hex << std::setfill('0') + << "WGL OpenGL context created (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hdc + << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)hglrc << ")"; + + GLContext glctx = {hdc, hglrc, 0}; + return glctx; +} + +void destroyGLContext(GLContext& glctx) +{ + if (!glctx.hglrc) + LOG(FATAL) << "destroyGLContext() called with null gltcx"; + + // If this is the current context, release it. 
+ if (wglGetCurrentContext() == glctx.hglrc) + releaseGLContext(); + + HWND hwnd = WindowFromDC(glctx.hdc); + if (!hwnd) + LOG(FATAL) << "WindowFromDC() failed"; + if (!ReleaseDC(hwnd, glctx.hdc)) + LOG(FATAL) << "ReleaseDC() failed"; + if (!wglDeleteContext(glctx.hglrc)) + LOG(FATAL) << "wglDeleteContext() failed"; + if (!DestroyWindow(hwnd)) + LOG(FATAL) << "DestroyWindow() failed"; + + LOG(INFO) << std::hex << std::setfill('0') + << "WGL OpenGL context destroyed (hdc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hdc + << ", hglrc: 0x" << std::setw(8) << (uint32_t)(uintptr_t)glctx.hglrc << ")"; + + memset(&glctx, 0, sizeof(GLContext)); +} + +#endif // _WIN32 + +//------------------------------------------------------------------------ +// Linux. +//------------------------------------------------------------------------ + +#ifdef __linux__ + +static pthread_mutex_t s_getProcAddressMutex; + +typedef void (*PROCFN)(); + +static void safeGetProcAddress(const char* name, PROCFN* pfn) +{ + PROCFN result = eglGetProcAddress(name); + if (!result) + { + pthread_mutex_unlock(&s_getProcAddressMutex); // Prepare for thread exit. + LOG(FATAL) << "wglGetProcAddress() failed for '" << name << "'"; + exit(1); // Should never get here but make sure we exit. + } + *pfn = result; +} + +static void initializeGLExtensions(void) +{ + pthread_mutex_lock(&s_getProcAddressMutex); + + // Only dig function pointers if not done already. + if (!s_glExtInitialized) + { + // Generate code to populate the function pointers. +#define GLUTIL_EXT(return_type, name, ...) safeGetProcAddress(#name, (PROCFN*)&name); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + + // Mark as initialized. 
+ s_glExtInitialized = true; + } + + pthread_mutex_unlock(&s_getProcAddressMutex); + return; +} + +void setGLContext(GLContext& glctx) +{ + if (!glctx.context) + LOG(FATAL) << "setGLContext() called with null gltcx"; + + if (!eglMakeCurrent(glctx.display, EGL_NO_SURFACE, EGL_NO_SURFACE, glctx.context)) + LOG(ERROR) << "eglMakeCurrent() failed when setting GL context"; + + if (glctx.extInitialized) + return; + initializeGLExtensions(); + glctx.extInitialized = 1; +} + +void releaseGLContext(void) +{ + EGLDisplay display = eglGetCurrentDisplay(); + if (display == EGL_NO_DISPLAY) + LOG(WARNING) << "releaseGLContext() called with no active display"; + if (!eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT)) + LOG(FATAL) << "eglMakeCurrent() failed when releasing GL context"; +} + +static EGLDisplay getCudaDisplay(int cudaDeviceIdx) +{ + typedef EGLBoolean (*eglQueryDevicesEXT_t)(EGLint, EGLDeviceEXT, EGLint*); + typedef EGLBoolean (*eglQueryDeviceAttribEXT_t)(EGLDeviceEXT, EGLint, EGLAttrib*); + typedef EGLDisplay (*eglGetPlatformDisplayEXT_t)(EGLenum, void*, const EGLint*); + + eglQueryDevicesEXT_t eglQueryDevicesEXT = (eglQueryDevicesEXT_t)eglGetProcAddress("eglQueryDevicesEXT"); + if (!eglQueryDevicesEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglQueryDevicesEXT\") failed"; + return 0; + } + + eglQueryDeviceAttribEXT_t eglQueryDeviceAttribEXT = (eglQueryDeviceAttribEXT_t)eglGetProcAddress("eglQueryDeviceAttribEXT"); + if (!eglQueryDeviceAttribEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglQueryDeviceAttribEXT\") failed"; + return 0; + } + + eglGetPlatformDisplayEXT_t eglGetPlatformDisplayEXT = (eglGetPlatformDisplayEXT_t)eglGetProcAddress("eglGetPlatformDisplayEXT"); + if (!eglGetPlatformDisplayEXT) + { + LOG(INFO) << "eglGetProcAddress(\"eglGetPlatformDisplayEXT\") failed"; + return 0; + } + + int num_devices = 0; + eglQueryDevicesEXT(0, 0, &num_devices); + if (!num_devices) + return 0; + + EGLDisplay display = 0; + EGLDeviceEXT* devices = 
(EGLDeviceEXT*)malloc(num_devices * sizeof(void*)); + eglQueryDevicesEXT(num_devices, devices, &num_devices); + for (int i=0; i < num_devices; i++) + { + EGLDeviceEXT device = devices[i]; + intptr_t value = -1; + if (eglQueryDeviceAttribEXT(device, EGL_CUDA_DEVICE_NV, &value) && value == cudaDeviceIdx) + { + display = eglGetPlatformDisplayEXT(EGL_PLATFORM_DEVICE_EXT, device, 0); + break; + } + } + + free(devices); + return display; +} + +GLContext createGLContext(int cudaDeviceIdx) +{ + EGLDisplay display = 0; + + if (cudaDeviceIdx >= 0) + { + char pciBusId[256] = ""; + LOG(INFO) << "Creating GL context for Cuda device " << cudaDeviceIdx; + display = getCudaDisplay(cudaDeviceIdx); + if (!display) + LOG(INFO) << "Failed, falling back to default display"; + } + + if (!display) + { + display = eglGetDisplay(EGL_DEFAULT_DISPLAY); + if (display == EGL_NO_DISPLAY) + LOG(FATAL) << "eglGetDisplay() failed"; + } + + EGLint major; + EGLint minor; + if (!eglInitialize(display, &major, &minor)) + LOG(FATAL) << "eglInitialize() failed"; + + // Choose configuration. + + const EGLint context_attribs[] = { + EGL_RED_SIZE, 8, + EGL_GREEN_SIZE, 8, + EGL_BLUE_SIZE, 8, + EGL_ALPHA_SIZE, 8, + EGL_DEPTH_SIZE, 24, + EGL_STENCIL_SIZE, 8, + EGL_RENDERABLE_TYPE, EGL_OPENGL_BIT, + EGL_SURFACE_TYPE, EGL_PBUFFER_BIT, + EGL_NONE + }; + + EGLConfig config; + EGLint num_config; + if (!eglChooseConfig(display, context_attribs, &config, 1, &num_config)) + LOG(FATAL) << "eglChooseConfig() failed"; + + // Create GL context. + + if (!eglBindAPI(EGL_OPENGL_API)) + LOG(FATAL) << "eglBindAPI() failed"; + + EGLContext context = eglCreateContext(display, config, EGL_NO_CONTEXT, NULL); + if (context == EGL_NO_CONTEXT) + LOG(FATAL) << "eglCreateContext() failed"; + + // Done. + + LOG(INFO) << "EGL " << (int)minor << "." 
<< (int)major << " OpenGL context created (disp: 0x" + << std::hex << std::setfill('0') + << std::setw(16) << (uintptr_t)display + << ", ctx: 0x" << std::setw(16) << (uintptr_t)context << ")"; + + GLContext glctx = {display, context, 0}; + return glctx; +} + +void destroyGLContext(GLContext& glctx) +{ + if (!glctx.context) + LOG(FATAL) << "destroyGLContext() called with null gltcx"; + + // If this is the current context, release it. + if (eglGetCurrentContext() == glctx.context) + releaseGLContext(); + + if (!eglDestroyContext(glctx.display, glctx.context)) + LOG(ERROR) << "eglDestroyContext() failed"; + + LOG(INFO) << "EGL OpenGL context destroyed (disp: 0x" + << std::hex << std::setfill('0') + << std::setw(16) << (uintptr_t)glctx.display + << ", ctx: 0x" << std::setw(16) << (uintptr_t)glctx.context << ")"; + + memset(&glctx, 0, sizeof(GLContext)); +} + +//------------------------------------------------------------------------ + +#endif // __linux__ + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil.h new file mode 100644 index 00000000..e9a3a7d9 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil.h @@ -0,0 +1,113 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Windows-specific headers and types. 
+//------------------------------------------------------------------------
+
+#ifdef _WIN32
+#define NOMINMAX
+#include <windows.h> // Required by gl.h in Windows.
+#define GLAPIENTRY APIENTRY
+
+struct GLContext
+{
+    HDC hdc;
+    HGLRC hglrc;
+    int extInitialized;
+};
+
+#endif // _WIN32
+
+//------------------------------------------------------------------------
+// Linux-specific headers and types.
+//------------------------------------------------------------------------
+
+#ifdef __linux__
+#define EGL_NO_X11 // X11/Xlib.h has "#define Status int" which breaks Tensorflow. Avoid it.
+#define MESA_EGL_NO_X11_HEADERS
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+#define GLAPIENTRY
+
+struct GLContext
+{
+    EGLDisplay display;
+    EGLContext context;
+    int extInitialized;
+};
+
+#endif // __linux__
+
+//------------------------------------------------------------------------
+// OpenGL, CUDA interop, GL extensions.
+//------------------------------------------------------------------------
+#define GL_GLEXT_LEGACY
+#include <GL/gl.h>
+#include <cuda_gl_interop.h>
+
+// Constants.
+#ifndef GL_VERSION_1_2 +#define GL_CLAMP_TO_EDGE 0x812F +#define GL_TEXTURE_3D 0x806F +#endif +#ifndef GL_VERSION_1_5 +#define GL_ARRAY_BUFFER 0x8892 +#define GL_DYNAMIC_DRAW 0x88E8 +#define GL_ELEMENT_ARRAY_BUFFER 0x8893 +#endif +#ifndef GL_VERSION_2_0 +#define GL_FRAGMENT_SHADER 0x8B30 +#define GL_INFO_LOG_LENGTH 0x8B84 +#define GL_LINK_STATUS 0x8B82 +#define GL_VERTEX_SHADER 0x8B31 +#endif +#ifndef GL_VERSION_3_0 +#define GL_MAJOR_VERSION 0x821B +#define GL_MINOR_VERSION 0x821C +#define GL_RGBA32F 0x8814 +#define GL_TEXTURE_2D_ARRAY 0x8C1A +#endif +#ifndef GL_VERSION_3_2 +#define GL_GEOMETRY_SHADER 0x8DD9 +#endif +#ifndef GL_ARB_framebuffer_object +#define GL_COLOR_ATTACHMENT0 0x8CE0 +#define GL_COLOR_ATTACHMENT1 0x8CE1 +#define GL_DEPTH_STENCIL 0x84F9 +#define GL_DEPTH_STENCIL_ATTACHMENT 0x821A +#define GL_DEPTH24_STENCIL8 0x88F0 +#define GL_FRAMEBUFFER 0x8D40 +#define GL_INVALID_FRAMEBUFFER_OPERATION 0x0506 +#define GL_UNSIGNED_INT_24_8 0x84FA +#endif +#ifndef GL_ARB_imaging +#define GL_TABLE_TOO_LARGE 0x8031 +#endif +#ifndef GL_KHR_robustness +#define GL_CONTEXT_LOST 0x0507 +#endif + +// Declare function pointers to OpenGL extension functions. +#define GLUTIL_EXT(return_type, name, ...) extern return_type (GLAPIENTRY* name)(__VA_ARGS__); +#include "glutil_extlist.h" +#undef GLUTIL_EXT + +//------------------------------------------------------------------------ +// Common functions. 
+//------------------------------------------------------------------------ + +void setGLContext (GLContext& glctx); +void releaseGLContext (void); +GLContext createGLContext (int cudaDeviceIdx); +void destroyGLContext (GLContext& glctx); +const char* getGLErrorString (GLenum err); + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil_extlist.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil_extlist.h new file mode 100644 index 00000000..afa08f39 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/glutil_extlist.h @@ -0,0 +1,48 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#ifndef GL_VERSION_1_2 +GLUTIL_EXT(void, glTexImage3D, GLenum target, GLint level, GLint internalFormat, GLsizei width, GLsizei height, GLsizei depth, GLint border, GLenum format, GLenum type, const void *pixels); +#endif +#ifndef GL_VERSION_1_5 +GLUTIL_EXT(void, glBindBuffer, GLenum target, GLuint buffer); +GLUTIL_EXT(void, glBufferData, GLenum target, ptrdiff_t size, const void* data, GLenum usage); +GLUTIL_EXT(void, glGenBuffers, GLsizei n, GLuint* buffers); +#endif +#ifndef GL_VERSION_2_0 +GLUTIL_EXT(void, glAttachShader, GLuint program, GLuint shader); +GLUTIL_EXT(void, glCompileShader, GLuint shader); +GLUTIL_EXT(GLuint, glCreateProgram, void); +GLUTIL_EXT(GLuint, glCreateShader, GLenum type); +GLUTIL_EXT(void, glDrawBuffers, GLsizei n, const GLenum* bufs); +GLUTIL_EXT(void, glEnableVertexAttribArray, GLuint index); +GLUTIL_EXT(void, glGetProgramInfoLog, GLuint program, GLsizei bufSize, GLsizei* length, char* infoLog); +GLUTIL_EXT(void, glGetProgramiv, GLuint program, GLenum pname, GLint* param); +GLUTIL_EXT(void, glLinkProgram, GLuint program); +GLUTIL_EXT(void, glShaderSource, GLuint shader, GLsizei count, const char *const* string, const GLint* length); +GLUTIL_EXT(void, glUniform1f, GLint location, GLfloat v0); +GLUTIL_EXT(void, glUniform2f, GLint location, GLfloat v0, GLfloat v1); +GLUTIL_EXT(void, glUseProgram, GLuint program); +GLUTIL_EXT(void, glVertexAttribPointer, GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, const void* pointer); +#endif +#ifndef GL_VERSION_3_2 +GLUTIL_EXT(void, glFramebufferTexture, GLenum target, GLenum attachment, GLuint texture, GLint level); +#endif +#ifndef GL_ARB_framebuffer_object +GLUTIL_EXT(void, glBindFramebuffer, GLenum target, GLuint framebuffer); +GLUTIL_EXT(void, glGenFramebuffers, GLsizei n, GLuint* framebuffers); +#endif +#ifndef GL_ARB_vertex_array_object +GLUTIL_EXT(void, glBindVertexArray, GLuint array); +GLUTIL_EXT(void, glGenVertexArrays, GLsizei n, GLuint* arrays); +#endif 
+#ifndef GL_ARB_multi_draw_indirect +GLUTIL_EXT(void, glMultiDrawElementsIndirect, GLenum mode, GLenum type, const void *indirect, GLsizei primcount, GLsizei stride); +#endif + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/interpolate.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/interpolate.cu new file mode 100644 index 00000000..3bd2a7a7 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/interpolate.cu @@ -0,0 +1,276 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "interpolate.h" + +//------------------------------------------------------------------------ +// Forward kernel. + +template +static __forceinline__ __device__ void InterpolateFwdKernelTemplate(const InterpolateKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. + int pidx = px + p.width * (py + p.height * pz); + + // Output ptrs. + float* out = p.out + pidx * p.numAttr; + float2* outDA = ENABLE_DA ? (((float2*)p.outDA) + pidx * p.numDiffAttr) : 0; + + // Fetch rasterizer output. + float4 r = ((float4*)p.rast)[pidx]; + int triIdx = float_to_triidx(r.w) - 1; + bool triValid = (triIdx >= 0 && triIdx < p.numTriangles); + + // If no geometry in entire warp, zero the output and exit. 
+ // Otherwise force barys to zero and output with live threads. + if (__all_sync(0xffffffffu, !triValid)) + { + for (int i=0; i < p.numAttr; i++) + out[i] = 0.f; + if (ENABLE_DA) + for (int i=0; i < p.numDiffAttr; i++) + outDA[i] = make_float2(0.f, 0.f); + return; + } + + // Fetch vertex indices. + int vi0 = triValid ? p.tri[triIdx * 3 + 0] : 0; + int vi1 = triValid ? p.tri[triIdx * 3 + 1] : 0; + int vi2 = triValid ? p.tri[triIdx * 3 + 2] : 0; + + // Bail out if corrupt indices. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index unless broadcasting. + if (p.instance_mode && !p.attrBC) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Pointers to attributes. + const float* a0 = p.attr + vi0 * p.numAttr; + const float* a1 = p.attr + vi1 * p.numAttr; + const float* a2 = p.attr + vi2 * p.numAttr; + + // Barys. If no triangle, force all to zero -> output is zero. + float b0 = triValid ? r.x : 0.f; + float b1 = triValid ? r.y : 0.f; + float b2 = triValid ? (1.f - r.x - r.y) : 0.f; + + // Interpolate and write attributes. + for (int i=0; i < p.numAttr; i++) + out[i] = b0*a0[i] + b1*a1[i] + b2*a2[i]; + + // No diff attrs? Exit. + if (!ENABLE_DA) + return; + + // Read bary pixel differentials if we have a triangle. + float4 db = make_float4(0.f, 0.f, 0.f, 0.f); + if (triValid) + db = ((float4*)p.rastDB)[pidx]; + + // Unpack a bit. + float dudx = db.x; + float dudy = db.y; + float dvdx = db.z; + float dvdy = db.w; + + // Calculate the pixel differentials of chosen attributes. + for (int i=0; i < p.numDiffAttr; i++) + { + // Input attribute index. + int j = p.diff_attrs_all ? i : p.diffAttrs[i]; + if (j < 0) + j += p.numAttr; // Python-style negative indices. + + // Zero output if invalid index. 
+ float dsdx = 0.f; + float dsdy = 0.f; + if (j >= 0 && j < p.numAttr) + { + float s0 = a0[j]; + float s1 = a1[j]; + float s2 = a2[j]; + float dsdu = s0 - s2; + float dsdv = s1 - s2; + dsdx = dudx*dsdu + dvdx*dsdv; + dsdy = dudy*dsdu + dvdy*dsdv; + } + + // Write. + outDA[i] = make_float2(dsdx, dsdy); + } +} + +// Template specializations. +__global__ void InterpolateFwdKernel (const InterpolateKernelParams p) { InterpolateFwdKernelTemplate(p); } +__global__ void InterpolateFwdKernelDa(const InterpolateKernelParams p) { InterpolateFwdKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient kernel. + +template +static __forceinline__ __device__ void InterpolateGradKernelTemplate(const InterpolateKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH * IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. + int pidx = px + p.width * (py + p.height * pz); + + // Fetch triangle ID. If none, output zero bary/db gradients and exit. + float4 r = ((float4*)p.rast)[pidx]; + int triIdx = float_to_triidx(r.w) - 1; + if (triIdx < 0 || triIdx >= p.numTriangles) + { + ((float4*)p.gradRaster)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + if (ENABLE_DA) + ((float4*)p.gradRasterDB)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + return; + } + + // Fetch vertex indices. + int vi0 = p.tri[triIdx * 3 + 0]; + int vi1 = p.tri[triIdx * 3 + 1]; + int vi2 = p.tri[triIdx * 3 + 2]; + + // Bail out if corrupt indices. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index unless broadcasting. 
+ if (p.instance_mode && !p.attrBC) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Initialize coalesced atomics. + CA_SET_GROUP(triIdx); + + // Pointers to inputs. + const float* a0 = p.attr + vi0 * p.numAttr; + const float* a1 = p.attr + vi1 * p.numAttr; + const float* a2 = p.attr + vi2 * p.numAttr; + const float* pdy = p.dy + pidx * p.numAttr; + + // Pointers to outputs. + float* ga0 = p.gradAttr + vi0 * p.numAttr; + float* ga1 = p.gradAttr + vi1 * p.numAttr; + float* ga2 = p.gradAttr + vi2 * p.numAttr; + + // Barys and bary gradient accumulators. + float b0 = r.x; + float b1 = r.y; + float b2 = 1.f - r.x - r.y; + float gb0 = 0.f; + float gb1 = 0.f; + + // Loop over attributes and accumulate attribute gradients. + for (int i=0; i < p.numAttr; i++) + { + float y = pdy[i]; + float s0 = a0[i]; + float s1 = a1[i]; + float s2 = a2[i]; + gb0 += y * (s0 - s2); + gb1 += y * (s1 - s2); + caAtomicAdd(ga0 + i, b0 * y); + caAtomicAdd(ga1 + i, b1 * y); + caAtomicAdd(ga2 + i, b2 * y); + } + + // Write the bary gradients. + ((float4*)p.gradRaster)[pidx] = make_float4(gb0, gb1, 0.f, 0.f); + + // If pixel differentials disabled, we're done. + if (!ENABLE_DA) + return; + + // Calculate gradients based on attribute pixel differentials. + const float2* dda = ((float2*)p.dda) + pidx * p.numDiffAttr; + float gdudx = 0.f; + float gdudy = 0.f; + float gdvdx = 0.f; + float gdvdy = 0.f; + + // Read bary pixel differentials. + float4 db = ((float4*)p.rastDB)[pidx]; + float dudx = db.x; + float dudy = db.y; + float dvdx = db.z; + float dvdy = db.w; + + for (int i=0; i < p.numDiffAttr; i++) + { + // Input attribute index. + int j = p.diff_attrs_all ? i : p.diffAttrs[i]; + if (j < 0) + j += p.numAttr; // Python-style negative indices. + + // Check that index is valid. 
+ if (j >= 0 && j < p.numAttr) + { + float2 dsdxy = dda[i]; + float dsdx = dsdxy.x; + float dsdy = dsdxy.y; + + float s0 = a0[j]; + float s1 = a1[j]; + float s2 = a2[j]; + + // Gradients of db. + float dsdu = s0 - s2; + float dsdv = s1 - s2; + gdudx += dsdu * dsdx; + gdudy += dsdu * dsdy; + gdvdx += dsdv * dsdx; + gdvdy += dsdv * dsdy; + + // Gradients of attributes. + float du = dsdx*dudx + dsdy*dudy; + float dv = dsdx*dvdx + dsdy*dvdy; + caAtomicAdd(ga0 + j, du); + caAtomicAdd(ga1 + j, dv); + caAtomicAdd(ga2 + j, -du - dv); + } + } + + // Write. + ((float4*)p.gradRasterDB)[pidx] = make_float4(gdudx, gdudy, gdvdx, gdvdy); +} + +// Template specializations. +__global__ void InterpolateGradKernel (const InterpolateKernelParams p) { InterpolateGradKernelTemplate(p); } +__global__ void InterpolateGradKernelDa(const InterpolateKernelParams p) { InterpolateGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/interpolate.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/interpolate.h new file mode 100644 index 00000000..d35d8388 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/interpolate.h @@ -0,0 +1,49 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Constants and helpers. 
+ +#define IP_FWD_MAX_KERNEL_BLOCK_WIDTH 8 +#define IP_FWD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define IP_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define IP_MAX_DIFF_ATTRS 32 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct InterpolateKernelParams +{ + const int* tri; // Incoming triangle buffer. + const float* attr; // Incoming attribute buffer. + const float* rast; // Incoming rasterizer output buffer. + const float* rastDB; // Incoming rasterizer output buffer for bary derivatives. + const float* dy; // Incoming attribute gradients. + const float* dda; // Incoming attr diff gradients. + float* out; // Outgoing interpolated attributes. + float* outDA; // Outgoing texcoord major axis lengths. + float* gradAttr; // Outgoing attribute gradients. + float* gradRaster; // Outgoing rasterizer gradients. + float* gradRasterDB; // Outgoing rasterizer bary diff gradients. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int numAttr; // Number of total vertex attributes. + int numDiffAttr; // Number of attributes to differentiate. + int width; // Image width. + int height; // Image height. + int depth; // Minibatch size. + int attrBC; // 0=normal, 1=attr is broadcast. + int instance_mode; // 0=normal, 1=instance mode. + int diff_attrs_all; // 0=normal, 1=produce pixel differentials for all attributes. + int diffAttrs[IP_MAX_DIFF_ATTRS]; // List of attributes to differentiate. +}; + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize.cu new file mode 100644 index 00000000..455aca3e --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize.cu @@ -0,0 +1,276 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.

#include "common.h"
#include "rasterize.h"

//------------------------------------------------------------------------
// Cuda forward rasterizer pixel shader kernel.

// Per-pixel "shader" pass of the CUDA rasterizer: given the triangle id
// buffer produced by the coarse rasterizer (p.in_idx), recomputes the
// perspective-correct barycentrics, depth, and bary pixel differentials
// for each covered pixel and writes them to p.out / p.out_db.
__global__ void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p)
{
    // Calculate pixel position.
    int px = blockIdx.x * blockDim.x + threadIdx.x;
    int py = blockIdx.y * blockDim.y + threadIdx.y;
    int pz = blockIdx.z;
    if (px >= p.width_out || py >= p.height_out || pz >= p.depth)
        return;

    // Pixel indices.
    int pidx_in  = px + p.width_in  * (py + p.height_in  * pz);
    int pidx_out = px + p.width_out * (py + p.height_out * pz);

    // Fetch triangle idx.
    int triIdx = p.in_idx[pidx_in] - 1;
    if (triIdx < 0 || triIdx >= p.numTriangles)
    {
        // No or corrupt triangle.
        ((float4*)p.out)[pidx_out]    = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out.
        ((float4*)p.out_db)[pidx_out] = make_float4(0.0, 0.0, 0.0, 0.0); // Clear out_db.
        return;
    }

    // Fetch vertex indices.
    int vi0 = p.tri[triIdx * 3 + 0];
    int vi1 = p.tri[triIdx * 3 + 1];
    int vi2 = p.tri[triIdx * 3 + 2];

    // Bail out if vertex indices are corrupt.
    if (vi0 < 0 || vi0 >= p.numVertices ||
        vi1 < 0 || vi1 >= p.numVertices ||
        vi2 < 0 || vi2 >= p.numVertices)
        return;

    // In instance mode, adjust vertex indices by minibatch index.
    if (p.instance_mode)
    {
        vi0 += pz * p.numVertices;
        vi1 += pz * p.numVertices;
        vi2 += pz * p.numVertices;
    }

    // Fetch vertex positions.
    float4 p0 = ((float4*)p.pos)[vi0];
    float4 p1 = ((float4*)p.pos)[vi1];
    float4 p2 = ((float4*)p.pos)[vi2];

    // Evaluate edge functions.
    float fx = p.xs * (float)px + p.xo;
    float fy = p.ys * (float)py + p.yo;
    float p0x = p0.x - fx * p0.w;
    float p0y = p0.y - fy * p0.w;
    float p1x = p1.x - fx * p1.w;
    float p1y = p1.y - fy * p1.w;
    float p2x = p2.x - fx * p2.w;
    float p2y = p2.y - fy * p2.w;
    float a0 = p1x*p2y - p1y*p2x;
    float a1 = p2x*p0y - p2y*p0x;
    float a2 = p0x*p1y - p0y*p1x;

    // Perspective correct, normalized barycentrics.
    float iw = 1.f / (a0 + a1 + a2);
    float b0 = a0 * iw;
    float b1 = a1 * iw;

    // Compute z/w for depth buffer.
    float z = p0.z * a0 + p1.z * a1 + p2.z * a2;
    float w = p0.w * a0 + p1.w * a1 + p2.w * a2;
    float zw = z / w;

    // Clamps to avoid NaNs.
    b0 = __saturatef(b0); // Clamp to [+0.0, 1.0].
    b1 = __saturatef(b1); // Clamp to [+0.0, 1.0].
    zw = fmaxf(fminf(zw, 1.f), -1.f);

    // Emit output.
    ((float4*)p.out)[pidx_out] = make_float4(b0, b1, zw, triidx_to_float(triIdx + 1));

    // Calculate bary pixel differentials.
    float dfxdx = p.xs * iw;
    float dfydy = p.ys * iw;
    float da0dx = p2.y*p1.w - p1.y*p2.w;
    float da0dy = p1.x*p2.w - p2.x*p1.w;
    float da1dx = p0.y*p2.w - p2.y*p0.w;
    float da1dy = p2.x*p0.w - p0.x*p2.w;
    float da2dx = p1.y*p0.w - p0.y*p1.w;
    float da2dy = p0.x*p1.w - p1.x*p0.w;
    float datdx = da0dx + da1dx + da2dx;
    float datdy = da0dy + da1dy + da2dy;
    float dudx = dfxdx * (b0 * datdx - da0dx);
    float dudy = dfydy * (b0 * datdy - da0dy);
    float dvdx = dfxdx * (b1 * datdx - da1dx);
    float dvdy = dfydy * (b1 * datdy - da1dy);

    // Emit bary pixel differentials.
    ((float4*)p.out_db)[pidx_out] = make_float4(dudx, dudy, dvdx, dvdy);
}

//------------------------------------------------------------------------
// Gradient Cuda kernel.

// Backward pass of rasterization: accumulates clip-space position gradients
// from the incoming bary gradients (p.dy) and, when ENABLE_DB is set, the
// bary pixel-differential gradients (p.ddb).
// NOTE(review): the template parameter list was stripped by the paste
// mangling; restored from the Db/non-Db specialization naming.
template <bool ENABLE_DB>
static __forceinline__ __device__ void RasterizeGradKernelTemplate(const RasterizeGradParams p)
{
    // Temporary space for coalesced atomics.
    CA_DECLARE_TEMP(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH * RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT);

    // Calculate pixel position.
+ int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.width || py >= p.height || pz >= p.depth) + return; + + // Pixel index. + int pidx = px + p.width * (py + p.height * pz); + + // Read triangle idx and dy. + float2 dy = ((float2*)p.dy)[pidx * 2]; + float4 ddb = ENABLE_DB ? ((float4*)p.ddb)[pidx] : make_float4(0.f, 0.f, 0.f, 0.f); + int triIdx = float_to_triidx(((float*)p.out)[pidx * 4 + 3]) - 1; + + // Exit if nothing to do. + if (triIdx < 0 || triIdx >= p.numTriangles) + return; // No or corrupt triangle. + int grad_all_dy = __float_as_int(dy.x) | __float_as_int(dy.y); // Bitwise OR of all incoming gradients. + int grad_all_ddb = 0; + if (ENABLE_DB) + grad_all_ddb = __float_as_int(ddb.x) | __float_as_int(ddb.y) | __float_as_int(ddb.z) | __float_as_int(ddb.w); + if (((grad_all_dy | grad_all_ddb) << 1) == 0) + return; // All incoming gradients are +0/-0. + + // Fetch vertex indices. + int vi0 = p.tri[triIdx * 3 + 0]; + int vi1 = p.tri[triIdx * 3 + 1]; + int vi2 = p.tri[triIdx * 3 + 2]; + + // Bail out if vertex indices are corrupt. + if (vi0 < 0 || vi0 >= p.numVertices || + vi1 < 0 || vi1 >= p.numVertices || + vi2 < 0 || vi2 >= p.numVertices) + return; + + // In instance mode, adjust vertex indices by minibatch index. + if (p.instance_mode) + { + vi0 += pz * p.numVertices; + vi1 += pz * p.numVertices; + vi2 += pz * p.numVertices; + } + + // Initialize coalesced atomics. + CA_SET_GROUP(triIdx); + + // Fetch vertex positions. + float4 p0 = ((float4*)p.pos)[vi0]; + float4 p1 = ((float4*)p.pos)[vi1]; + float4 p2 = ((float4*)p.pos)[vi2]; + + // Evaluate edge functions. 
+ float fx = p.xs * (float)px + p.xo; + float fy = p.ys * (float)py + p.yo; + float p0x = p0.x - fx * p0.w; + float p0y = p0.y - fy * p0.w; + float p1x = p1.x - fx * p1.w; + float p1y = p1.y - fy * p1.w; + float p2x = p2.x - fx * p2.w; + float p2y = p2.y - fy * p2.w; + float a0 = p1x*p2y - p1y*p2x; + float a1 = p2x*p0y - p2y*p0x; + float a2 = p0x*p1y - p0y*p1x; + + // Compute inverse area with epsilon. + float at = a0 + a1 + a2; + float ep = copysignf(1e-6f, at); // ~1 pixel in 1k x 1k image. + float iw = 1.f / (at + ep); + + // Perspective correct, normalized barycentrics. + float b0 = a0 * iw; + float b1 = a1 * iw; + + // Position gradients. + float gb0 = dy.x * iw; + float gb1 = dy.y * iw; + float gbb = gb0 * b0 + gb1 * b1; + float gp0x = gbb * (p2y - p1y) - gb1 * p2y; + float gp1x = gbb * (p0y - p2y) + gb0 * p2y; + float gp2x = gbb * (p1y - p0y) - gb0 * p1y + gb1 * p0y; + float gp0y = gbb * (p1x - p2x) + gb1 * p2x; + float gp1y = gbb * (p2x - p0x) - gb0 * p2x; + float gp2y = gbb * (p0x - p1x) + gb0 * p1x - gb1 * p0x; + float gp0w = -fx * gp0x - fy * gp0y; + float gp1w = -fx * gp1x - fy * gp1y; + float gp2w = -fx * gp2x - fy * gp2y; + + // Bary differential gradients. 
+ if (ENABLE_DB && ((grad_all_ddb) << 1) != 0) + { + float dfxdX = p.xs * iw; + float dfydY = p.ys * iw; + ddb.x *= dfxdX; + ddb.y *= dfydY; + ddb.z *= dfxdX; + ddb.w *= dfydY; + + float da0dX = p1.y * p2.w - p2.y * p1.w; + float da1dX = p2.y * p0.w - p0.y * p2.w; + float da2dX = p0.y * p1.w - p1.y * p0.w; + float da0dY = p2.x * p1.w - p1.x * p2.w; + float da1dY = p0.x * p2.w - p2.x * p0.w; + float da2dY = p1.x * p0.w - p0.x * p1.w; + float datdX = da0dX + da1dX + da2dX; + float datdY = da0dY + da1dY + da2dY; + + float x01 = p0.x - p1.x; + float x12 = p1.x - p2.x; + float x20 = p2.x - p0.x; + float y01 = p0.y - p1.y; + float y12 = p1.y - p2.y; + float y20 = p2.y - p0.y; + float w01 = p0.w - p1.w; + float w12 = p1.w - p2.w; + float w20 = p2.w - p0.w; + + float a0p1 = fy * p2.x - fx * p2.y; + float a0p2 = fx * p1.y - fy * p1.x; + float a1p0 = fx * p2.y - fy * p2.x; + float a1p2 = fy * p0.x - fx * p0.y; + + float wdudX = 2.f * b0 * datdX - da0dX; + float wdudY = 2.f * b0 * datdY - da0dY; + float wdvdX = 2.f * b1 * datdX - da1dX; + float wdvdY = 2.f * b1 * datdY - da1dY; + + float c0 = iw * (ddb.x * wdudX + ddb.y * wdudY + ddb.z * wdvdX + ddb.w * wdvdY); + float cx = c0 * fx - ddb.x * b0 - ddb.z * b1; + float cy = c0 * fy - ddb.y * b0 - ddb.w * b1; + float cxy = iw * (ddb.x * datdX + ddb.y * datdY); + float czw = iw * (ddb.z * datdX + ddb.w * datdY); + + gp0x += c0 * y12 - cy * w12 + czw * p2y + ddb.w * p2.w; + gp1x += c0 * y20 - cy * w20 - cxy * p2y - ddb.y * p2.w; + gp2x += c0 * y01 - cy * w01 + cxy * p1y - czw * p0y + ddb.y * p1.w - ddb.w * p0.w; + gp0y += cx * w12 - c0 * x12 - czw * p2x - ddb.z * p2.w; + gp1y += cx * w20 - c0 * x20 + cxy * p2x + ddb.x * p2.w; + gp2y += cx * w01 - c0 * x01 - cxy * p1x + czw * p0x - ddb.x * p1.w + ddb.z * p0.w; + gp0w += cy * x12 - cx * y12 - czw * a1p0 + ddb.z * p2.y - ddb.w * p2.x; + gp1w += cy * x20 - cx * y20 - cxy * a0p1 - ddb.x * p2.y + ddb.y * p2.x; + gp2w += cy * x01 - cx * y01 - cxy * a0p2 - czw * a1p2 + ddb.x * p1.y - ddb.y 
* p1.x - ddb.z * p0.y + ddb.w * p0.x; + } + + // Accumulate using coalesced atomics. + caAtomicAdd3_xyw(p.grad + 4 * vi0, gp0x, gp0y, gp0w); + caAtomicAdd3_xyw(p.grad + 4 * vi1, gp1x, gp1y, gp1w); + caAtomicAdd3_xyw(p.grad + 4 * vi2, gp2x, gp2y, gp2w); +} + +// Template specializations. +__global__ void RasterizeGradKernel (const RasterizeGradParams p) { RasterizeGradKernelTemplate(p); } +__global__ void RasterizeGradKernelDb(const RasterizeGradParams p) { RasterizeGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize.h new file mode 100644 index 00000000..cb3104fa --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize.h @@ -0,0 +1,60 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Constants and helpers. + +#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH 8 +#define RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT 8 +#define RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 + +//------------------------------------------------------------------------ +// CUDA forward rasterizer shader kernel params. + +struct RasterizeCudaFwdShaderParams +{ + const float* pos; // Vertex positions. + const int* tri; // Triangle indices. + const int* in_idx; // Triangle idx buffer from rasterizer. + float* out; // Main output buffer. 
+ float* out_db; // Bary pixel gradient output buffer. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int width_in; // Input image width. + int height_in; // Input image height. + int width_out; // Output image width. + int height_out; // Output image height. + int depth; // Size of minibatch. + int instance_mode; // 1 if in instance rendering mode. + float xs, xo, ys, yo; // Pixel position to clip-space x, y transform. +}; + +//------------------------------------------------------------------------ +// Gradient CUDA kernel params. + +struct RasterizeGradParams +{ + const float* pos; // Incoming position buffer. + const int* tri; // Incoming triangle buffer. + const float* out; // Rasterizer output buffer. + const float* dy; // Incoming gradients of rasterizer output buffer. + const float* ddb; // Incoming gradients of bary diff output buffer. + float* grad; // Outgoing position gradients. + int numTriangles; // Number of triangles. + int numVertices; // Number of vertices. + int width; // Image width. + int height; // Image height. + int depth; // Size of minibatch. + int instance_mode; // 1 if in instance rendering mode. + float xs, xo, ys, yo; // Pixel position to clip-space x, y transform. +}; + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize_gl.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize_gl.cpp new file mode 100644 index 00000000..ac71ccd8 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize_gl.cpp @@ -0,0 +1,644 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "rasterize_gl.h" +#include "glutil.h" +#include +#define STRINGIFY_SHADER_SOURCE(x) #x + +//------------------------------------------------------------------------ +// Helpers. + +#define ROUND_UP(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +static int ROUND_UP_BITS(uint32_t x, uint32_t y) +{ + // Round x up so that it has at most y bits of mantissa. + if (x < (1u << y)) + return x; + uint32_t m = 0; + while (x & ~m) + m = (m << 1) | 1u; + m >>= y; + if (!(x & m)) + return x; + return (x | m) + 1u; +} + +//------------------------------------------------------------------------ +// Draw command struct used by rasterizer. + +struct GLDrawCmd +{ + uint32_t count; + uint32_t instanceCount; + uint32_t firstIndex; + uint32_t baseVertex; + uint32_t baseInstance; +}; + +//------------------------------------------------------------------------ +// GL helpers. + +static void compileGLShader(NVDR_CTX_ARGS, const RasterizeGLState& s, GLuint* pShader, GLenum shaderType, const char* src_buf) +{ + std::string src(src_buf); + + // Set preprocessor directives. + int n = src.find('\n') + 1; // After first line containing #version directive. 
+ if (s.enableZModify) + src.insert(n, "#define IF_ZMODIFY(x) x\n"); + else + src.insert(n, "#define IF_ZMODIFY(x)\n"); + + const char *cstr = src.c_str(); + *pShader = 0; + NVDR_CHECK_GL_ERROR(*pShader = glCreateShader(shaderType)); + NVDR_CHECK_GL_ERROR(glShaderSource(*pShader, 1, &cstr, 0)); + NVDR_CHECK_GL_ERROR(glCompileShader(*pShader)); +} + +static void constructGLProgram(NVDR_CTX_ARGS, GLuint* pProgram, GLuint glVertexShader, GLuint glGeometryShader, GLuint glFragmentShader) +{ + *pProgram = 0; + + GLuint glProgram = 0; + NVDR_CHECK_GL_ERROR(glProgram = glCreateProgram()); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glVertexShader)); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glGeometryShader)); + NVDR_CHECK_GL_ERROR(glAttachShader(glProgram, glFragmentShader)); + NVDR_CHECK_GL_ERROR(glLinkProgram(glProgram)); + + GLint linkStatus = 0; + NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_LINK_STATUS, &linkStatus)); + if (!linkStatus) + { + GLint infoLen = 0; + NVDR_CHECK_GL_ERROR(glGetProgramiv(glProgram, GL_INFO_LOG_LENGTH, &infoLen)); + if (infoLen) + { + const char* hdr = "glLinkProgram() failed:\n"; + std::vector info(strlen(hdr) + infoLen); + strcpy(&info[0], hdr); + NVDR_CHECK_GL_ERROR(glGetProgramInfoLog(glProgram, infoLen, &infoLen, &info[strlen(hdr)])); + NVDR_CHECK(0, &info[0]); + } + NVDR_CHECK(0, "glLinkProgram() failed"); + } + + *pProgram = glProgram; +} + +//------------------------------------------------------------------------ +// Shared C++ functions. + +void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx) +{ + // Create GL context and set it current. + s.glctx = createGLContext(cudaDeviceIdx); + setGLContext(s.glctx); + + // Version check. + GLint vMajor = 0; + GLint vMinor = 0; + glGetIntegerv(GL_MAJOR_VERSION, &vMajor); + glGetIntegerv(GL_MINOR_VERSION, &vMinor); + glGetError(); // Clear possible GL_INVALID_ENUM error in version query. + LOG(INFO) << "OpenGL version reported as " << vMajor << "." 
<< vMinor; + NVDR_CHECK((vMajor == 4 && vMinor >= 4) || vMajor > 4, "OpenGL 4.4 or later is required"); + + // Enable depth modification workaround on A100 and later. + int capMajor = 0; + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&capMajor, cudaDevAttrComputeCapabilityMajor, cudaDeviceIdx)); + s.enableZModify = (capMajor >= 8); + + // Number of output buffers. + int num_outputs = s.enableDB ? 2 : 1; + + // Set up vertex shader. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glVertexShader, GL_VERTEX_SHADER, + "#version 330\n" + "#extension GL_ARB_shader_draw_parameters : enable\n" + STRINGIFY_SHADER_SOURCE( + layout(location = 0) in vec4 in_pos; + out int v_layer; + out int v_offset; + void main() + { + int layer = gl_DrawIDARB; + gl_Position = in_pos; + v_layer = layer; + v_offset = gl_BaseInstanceARB; // Sneak in TriID offset here. + } + ) + ); + + // Geometry and fragment shaders depend on if bary differential output is enabled or not. + if (s.enableDB) + { + // Set up geometry shader. Calculation of per-pixel bary differentials is based on: + // u = (u/w) / (1/w) + // --> du/dX = d((u/w) / (1/w))/dX + // --> du/dX = [d(u/w)/dX - u*d(1/w)/dX] * w + // and we know both d(u/w)/dX and d(1/w)/dX are constant over triangle. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + layout(triangles) in; + layout(triangle_strip, max_vertices=3) out; + layout(location = 0) uniform vec2 vp_scale; + in int v_layer[]; + in int v_offset[]; + out vec4 var_uvzw; + out vec4 var_db; + void main() + { + // Plane equations for bary differentials. 
+ float w0 = gl_in[0].gl_Position.w; + float w1 = gl_in[1].gl_Position.w; + float w2 = gl_in[2].gl_Position.w; + vec2 p0 = gl_in[0].gl_Position.xy; + vec2 p1 = gl_in[1].gl_Position.xy; + vec2 p2 = gl_in[2].gl_Position.xy; + vec2 e0 = p0*w2 - p2*w0; + vec2 e1 = p1*w2 - p2*w1; + float a = e0.x*e1.y - e0.y*e1.x; + + // Clamp area to an epsilon to avoid arbitrarily high bary differentials. + float eps = 1e-6f; // ~1 pixel in 1k x 1k image. + float ca = (abs(a) >= eps) ? a : (a < 0.f) ? -eps : eps; // Clamp with sign. + float ia = 1.f / ca; // Inverse area. + + vec2 ascl = ia * vp_scale; + float dudx = e1.y * ascl.x; + float dudy = -e1.x * ascl.y; + float dvdx = -e0.y * ascl.x; + float dvdy = e0.x * ascl.y; + + float duwdx = w2 * dudx; + float dvwdx = w2 * dvdx; + float duvdx = w0 * dudx + w1 * dvdx; + float duwdy = w2 * dudy; + float dvwdy = w2 * dvdy; + float duvdy = w0 * dudy + w1 * dvdy; + + vec4 db0 = vec4(duvdx - dvwdx, duvdy - dvwdy, dvwdx, dvwdy); + vec4 db1 = vec4(duwdx, duwdy, duvdx - duwdx, duvdy - duwdy); + vec4 db2 = vec4(duwdx, duwdy, dvwdx, dvwdy); + + int layer_id = v_layer[0]; + int prim_id = gl_PrimitiveIDIn + v_offset[0]; + + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_db = db0; EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_db = db1; EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_db = db2; EmitVertex(); + } + ) + ); + + // Set up fragment 
shader. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in vec4 var_db; + layout(location = 0) out vec4 out_raster; + layout(location = 1) out vec4 out_db; + IF_ZMODIFY( + layout(location = 1) uniform float in_dummy; + ) + void main() + { + int id_int = gl_PrimitiveID + 1; + float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int); + + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float); + out_db = var_db * var_uvzw.w; + IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;) + } + ) + ); + + // Set up fragment shader for depth peeling. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + in vec4 var_db; + layout(binding = 0) uniform sampler2DArray out_prev; + layout(location = 0) out vec4 out_raster; + layout(location = 1) out vec4 out_db; + IF_ZMODIFY( + layout(location = 1) uniform float in_dummy; + ) + void main() + { + int id_int = gl_PrimitiveID + 1; + float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int); + + vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0); + float depth_new = var_uvzw.z / var_uvzw.w; + if (prev.w == 0 || depth_new <= prev.z) + discard; + out_raster = vec4(var_uvzw.x, var_uvzw.y, depth_new, id_float); + out_db = var_db * var_uvzw.w; + IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;) + } + ) + ); + } + else + { + // Geometry shader without bary differential output. 
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glGeometryShader, GL_GEOMETRY_SHADER, + "#version 330\n" + STRINGIFY_SHADER_SOURCE( + layout(triangles) in; + layout(triangle_strip, max_vertices=3) out; + in int v_layer[]; + in int v_offset[]; + out vec4 var_uvzw; + void main() + { + int layer_id = v_layer[0]; + int prim_id = gl_PrimitiveIDIn + v_offset[0]; + + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[0].gl_Position.x, gl_in[0].gl_Position.y, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); var_uvzw = vec4(1.f, 0.f, gl_in[0].gl_Position.z, gl_in[0].gl_Position.w); EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[1].gl_Position.x, gl_in[1].gl_Position.y, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); var_uvzw = vec4(0.f, 1.f, gl_in[1].gl_Position.z, gl_in[1].gl_Position.w); EmitVertex(); + gl_Layer = layer_id; gl_PrimitiveID = prim_id; gl_Position = vec4(gl_in[2].gl_Position.x, gl_in[2].gl_Position.y, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); var_uvzw = vec4(0.f, 0.f, gl_in[2].gl_Position.z, gl_in[2].gl_Position.w); EmitVertex(); + } + ) + ); + + // Fragment shader without bary differential output. + compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShader, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + layout(location = 0) out vec4 out_raster; + IF_ZMODIFY( + layout(location = 1) uniform float in_dummy; + ) + void main() + { + int id_int = gl_PrimitiveID + 1; + float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int); + + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float); + IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;) + } + ) + ); + + // Depth peeling variant of fragment shader. 
+ compileGLShader(NVDR_CTX_PARAMS, s, &s.glFragmentShaderDP, GL_FRAGMENT_SHADER, + "#version 430\n" + STRINGIFY_SHADER_SOURCE( + in vec4 var_uvzw; + layout(binding = 0) uniform sampler2DArray out_prev; + layout(location = 0) out vec4 out_raster; + IF_ZMODIFY( + layout(location = 1) uniform float in_dummy; + ) + void main() + { + int id_int = gl_PrimitiveID + 1; + float id_float = (id_int <= 0x01000000) ? float(id_int) : intBitsToFloat(0x4a800000 + id_int); + + vec4 prev = texelFetch(out_prev, ivec3(gl_FragCoord.x, gl_FragCoord.y, gl_Layer), 0); + float depth_new = var_uvzw.z / var_uvzw.w; + if (prev.w == 0 || depth_new <= prev.z) + discard; + out_raster = vec4(var_uvzw.x, var_uvzw.y, var_uvzw.z / var_uvzw.w, id_float); + IF_ZMODIFY(gl_FragDepth = gl_FragCoord.z + in_dummy;) + } + ) + ); + } + + // Finalize programs. + constructGLProgram(NVDR_CTX_PARAMS, &s.glProgram, s.glVertexShader, s.glGeometryShader, s.glFragmentShader); + constructGLProgram(NVDR_CTX_PARAMS, &s.glProgramDP, s.glVertexShader, s.glGeometryShader, s.glFragmentShaderDP); + + // Construct main fbo and bind permanently. + NVDR_CHECK_GL_ERROR(glGenFramebuffers(1, &s.glFBO)); + NVDR_CHECK_GL_ERROR(glBindFramebuffer(GL_FRAMEBUFFER, s.glFBO)); + + // Enable two color attachments. + GLenum draw_buffers[2] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 }; + NVDR_CHECK_GL_ERROR(glDrawBuffers(num_outputs, draw_buffers)); + + // Construct vertex array object. + NVDR_CHECK_GL_ERROR(glGenVertexArrays(1, &s.glVAO)); + NVDR_CHECK_GL_ERROR(glBindVertexArray(s.glVAO)); + + // Construct position buffer, bind permanently, enable, set ptr. + NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glPosBuffer)); + NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ARRAY_BUFFER, s.glPosBuffer)); + NVDR_CHECK_GL_ERROR(glEnableVertexAttribArray(0)); + NVDR_CHECK_GL_ERROR(glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 0, 0)); + + // Construct index buffer and bind permanently. 
+ NVDR_CHECK_GL_ERROR(glGenBuffers(1, &s.glTriBuffer)); + NVDR_CHECK_GL_ERROR(glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, s.glTriBuffer)); + + // Set up depth test. + NVDR_CHECK_GL_ERROR(glEnable(GL_DEPTH_TEST)); + NVDR_CHECK_GL_ERROR(glDepthFunc(GL_LESS)); + NVDR_CHECK_GL_ERROR(glClearDepth(1.0)); + + // Create and bind output buffers. Storage is allocated later. + NVDR_CHECK_GL_ERROR(glGenTextures(num_outputs, s.glColorBuffer)); + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i])); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + i, s.glColorBuffer[i], 0)); + } + + // Create and bind depth/stencil buffer. Storage is allocated later. + NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, s.glDepthStencilBuffer, 0)); + + // Create texture name for previous output buffer (depth peeling). + NVDR_CHECK_GL_ERROR(glGenTextures(1, &s.glPrevOutBuffer)); +} + +void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth) +{ + changes = false; + + // Resize vertex buffer? + if (posCount > s.posCount) + { + if (s.cudaPosBuffer) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer)); + s.posCount = (posCount > 64) ? ROUND_UP_BITS(posCount, 2) : 64; + LOG(INFO) << "Increasing position buffer size to " << s.posCount << " float32"; + NVDR_CHECK_GL_ERROR(glBufferData(GL_ARRAY_BUFFER, s.posCount * sizeof(float), NULL, GL_DYNAMIC_DRAW)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaPosBuffer, s.glPosBuffer, cudaGraphicsRegisterFlagsWriteDiscard)); + changes = true; + } + + // Resize triangle buffer? 
+ if (triCount > s.triCount) + { + if (s.cudaTriBuffer) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer)); + s.triCount = (triCount > 64) ? ROUND_UP_BITS(triCount, 2) : 64; + LOG(INFO) << "Increasing triangle buffer size to " << s.triCount << " int32"; + NVDR_CHECK_GL_ERROR(glBufferData(GL_ELEMENT_ARRAY_BUFFER, s.triCount * sizeof(int32_t), NULL, GL_DYNAMIC_DRAW)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterBuffer(&s.cudaTriBuffer, s.glTriBuffer, cudaGraphicsRegisterFlagsWriteDiscard)); + changes = true; + } + + // Resize framebuffer? + if (width > s.width || height > s.height || depth > s.depth) + { + int num_outputs = s.enableDB ? 2 : 1; + if (s.cudaColorBuffer[0]) + for (int i=0; i < num_outputs; i++) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i])); + + if (s.cudaPrevOutBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer)); + s.cudaPrevOutBuffer = 0; + } + + // New framebuffer size. + s.width = (width > s.width) ? width : s.width; + s.height = (height > s.height) ? height : s.height; + s.depth = (depth > s.depth) ? depth : s.depth; + s.width = ROUND_UP(s.width, 32); + s.height = ROUND_UP(s.height, 32); + LOG(INFO) << "Increasing frame buffer size to (width, height, depth) = (" << s.width << ", " << s.height << ", " << s.depth << ")"; + + // Allocate color buffers. 
+ for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[i])); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + } + + // Allocate depth/stencil buffer. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glDepthStencilBuffer)); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_DEPTH24_STENCIL8, s.width, s.height, s.depth, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 0)); + + // (Re-)register all GL buffers into Cuda. + for (int i=0; i < num_outputs; i++) + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaColorBuffer[i], s.glColorBuffer[i], GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly)); + + changes = true; + } +} + +void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx) +{ + // Only copy inputs if we are on first iteration of depth peeling or not doing it at all. + if (peeling_idx < 1) + { + if (triPtr) + { + // Copy both position and triangle buffers. 
+ void* glPosPtr = NULL; + void* glTriPtr = NULL; + size_t posBytes = 0; + size_t triBytes = 0; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(2, &s.cudaPosBuffer, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glTriPtr, &triBytes, s.cudaTriBuffer)); + NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch"); + NVDR_CHECK(triBytes >= triCount * sizeof(int32_t), "mapped GL triangle buffer size mismatch"); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glTriPtr, triPtr, triCount * sizeof(int32_t), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(2, &s.cudaPosBuffer, stream)); + } + else + { + // Copy position buffer only. Triangles are already copied and known to be constant. + void* glPosPtr = NULL; + size_t posBytes = 0; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(1, &s.cudaPosBuffer, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsResourceGetMappedPointer(&glPosPtr, &posBytes, s.cudaPosBuffer)); + NVDR_CHECK(posBytes >= posCount * sizeof(float), "mapped GL position buffer size mismatch"); + NVDR_CHECK_CUDA_ERROR(cudaMemcpyAsync(glPosPtr, posPtr, posCount * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(1, &s.cudaPosBuffer, stream)); + } + } + + // Select program based on whether we have a depth peeling input or not. + if (peeling_idx < 1) + { + // Normal case: No peeling, or peeling disabled. + NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgram)); + } + else + { + // If we don't have a third buffer yet, create one. 
+ if (!s.cudaPrevOutBuffer) + { + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer)); + NVDR_CHECK_GL_ERROR(glTexImage3D(GL_TEXTURE_2D_ARRAY, 0, GL_RGBA32F, s.width, s.height, s.depth, 0, GL_RGBA, GL_UNSIGNED_BYTE, 0)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MAG_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_MIN_FILTER, GL_NEAREST)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_GL_ERROR(glTexParameteri(GL_TEXTURE_2D_ARRAY, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE)); + NVDR_CHECK_CUDA_ERROR(cudaGraphicsGLRegisterImage(&s.cudaPrevOutBuffer, s.glPrevOutBuffer, GL_TEXTURE_3D, cudaGraphicsRegisterFlagsReadOnly)); + } + + // Swap the GL buffers. + GLuint glTempBuffer = s.glPrevOutBuffer; + s.glPrevOutBuffer = s.glColorBuffer[0]; + s.glColorBuffer[0] = glTempBuffer; + + // Swap the Cuda buffers. + cudaGraphicsResource_t cudaTempBuffer = s.cudaPrevOutBuffer; + s.cudaPrevOutBuffer = s.cudaColorBuffer[0]; + s.cudaColorBuffer[0] = cudaTempBuffer; + + // Bind the new output buffer. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glColorBuffer[0])); + NVDR_CHECK_GL_ERROR(glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, s.glColorBuffer[0], 0)); + + // Bind old buffer as the input texture. + NVDR_CHECK_GL_ERROR(glBindTexture(GL_TEXTURE_2D_ARRAY, s.glPrevOutBuffer)); + + // Activate the correct program. + NVDR_CHECK_GL_ERROR(glUseProgram(s.glProgramDP)); + } + + // Set viewport, clear color buffer(s) and depth/stencil buffer. 
+ NVDR_CHECK_GL_ERROR(glViewport(0, 0, width, height)); + NVDR_CHECK_GL_ERROR(glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT)); + + // If outputting bary differentials, set resolution uniform + if (s.enableDB) + NVDR_CHECK_GL_ERROR(glUniform2f(0, 2.f / (float)width, 2.f / (float)height)); + + // Set the dummy uniform if depth modification workaround is active. + if (s.enableZModify) + NVDR_CHECK_GL_ERROR(glUniform1f(1, 0.f)); + + // Render the meshes. + if (depth == 1 && !rangesPtr) + { + // Trivial case. + NVDR_CHECK_GL_ERROR(glDrawElements(GL_TRIANGLES, triCount, GL_UNSIGNED_INT, 0)); + } + else + { + // Populate a buffer for draw commands and execute it. + std::vector drawCmdBuffer(depth); + + if (!rangesPtr) + { + // Fill in range array to instantiate the same triangles for each output layer. + // Triangle IDs starts at zero (i.e., one) for each layer, so they correspond to + // the first dimension in addressing the triangle array. + for (int i=0; i < depth; i++) + { + GLDrawCmd& cmd = drawCmdBuffer[i]; + cmd.firstIndex = 0; + cmd.count = triCount; + cmd.baseVertex = vtxPerInstance * i; + cmd.baseInstance = 0; + cmd.instanceCount = 1; + } + } + else + { + // Fill in the range array according to user-given ranges. Triangle IDs point + // to the input triangle array, NOT index within range, so they correspond to + // the first dimension in addressing the triangle array. + for (int i=0, j=0; i < depth; i++) + { + GLDrawCmd& cmd = drawCmdBuffer[i]; + int first = rangesPtr[j++]; + int count = rangesPtr[j++]; + NVDR_CHECK(first >= 0 && count >= 0, "range contains negative values"); + NVDR_CHECK((first + count) * 3 <= triCount, "range extends beyond end of triangle buffer"); + cmd.firstIndex = first * 3; + cmd.count = count * 3; + cmd.baseVertex = 0; + cmd.baseInstance = first; + cmd.instanceCount = 1; + } + } + + // Draw! 
+ NVDR_CHECK_GL_ERROR(glMultiDrawElementsIndirect(GL_TRIANGLES, GL_UNSIGNED_INT, &drawCmdBuffer[0], depth, sizeof(GLDrawCmd))); + } +} + +void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth) +{ + // Copy color buffers to output tensors. + cudaArray_t array = 0; + cudaChannelFormatDesc arrayDesc = {}; // For error checking. + cudaExtent arrayExt = {}; // For error checking. + int num_outputs = s.enableDB ? 2 : 1; + NVDR_CHECK_CUDA_ERROR(cudaGraphicsMapResources(num_outputs, s.cudaColorBuffer, stream)); + for (int i=0; i < num_outputs; i++) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsSubResourceGetMappedArray(&array, s.cudaColorBuffer[i], 0, 0)); + NVDR_CHECK_CUDA_ERROR(cudaArrayGetInfo(&arrayDesc, &arrayExt, NULL, array)); + NVDR_CHECK(arrayDesc.f == cudaChannelFormatKindFloat, "CUDA mapped array data kind mismatch"); + NVDR_CHECK(arrayDesc.x == 32 && arrayDesc.y == 32 && arrayDesc.z == 32 && arrayDesc.w == 32, "CUDA mapped array data width mismatch"); + NVDR_CHECK(arrayExt.width >= width && arrayExt.height >= height && arrayExt.depth >= depth, "CUDA mapped array extent mismatch"); + cudaMemcpy3DParms p = {0}; + p.srcArray = array; + p.dstPtr.ptr = outputPtr[i]; + p.dstPtr.pitch = width * 4 * sizeof(float); + p.dstPtr.xsize = width; + p.dstPtr.ysize = height; + p.extent.width = width; + p.extent.height = height; + p.extent.depth = depth; + p.kind = cudaMemcpyDeviceToDevice; + NVDR_CHECK_CUDA_ERROR(cudaMemcpy3DAsync(&p, stream)); + } + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnmapResources(num_outputs, s.cudaColorBuffer, stream)); +} + +void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s) +{ + int num_outputs = s.enableDB ? 
2 : 1; + + if (s.cudaPosBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPosBuffer)); + s.cudaPosBuffer = 0; + } + + if (s.cudaTriBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaTriBuffer)); + s.cudaTriBuffer = 0; + } + + for (int i=0; i < num_outputs; i++) + { + if (s.cudaColorBuffer[i]) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaColorBuffer[i])); + s.cudaColorBuffer[i] = 0; + } + } + + if (s.cudaPrevOutBuffer) + { + NVDR_CHECK_CUDA_ERROR(cudaGraphicsUnregisterResource(s.cudaPrevOutBuffer)); + s.cudaPrevOutBuffer = 0; + } +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize_gl.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize_gl.h new file mode 100644 index 00000000..27537c56 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/rasterize_gl.h @@ -0,0 +1,60 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once + +//------------------------------------------------------------------------ +// Do not try to include OpenGL stuff when compiling CUDA kernels for torch. + +#if !(defined(NVDR_TORCH) && defined(__CUDACC__)) +#include "framework.h" +#include "glutil.h" + +//------------------------------------------------------------------------ +// OpenGL-related persistent state for forward op. + +struct RasterizeGLState // Must be initializable by memset to zero. +{ + int width; // Allocated frame buffer width. 
+ int height; // Allocated frame buffer height. + int depth; // Allocated frame buffer depth. + int posCount; // Allocated position buffer in floats. + int triCount; // Allocated triangle buffer in ints. + GLContext glctx; + GLuint glFBO; + GLuint glColorBuffer[2]; + GLuint glPrevOutBuffer; + GLuint glDepthStencilBuffer; + GLuint glVAO; + GLuint glTriBuffer; + GLuint glPosBuffer; + GLuint glProgram; + GLuint glProgramDP; + GLuint glVertexShader; + GLuint glGeometryShader; + GLuint glFragmentShader; + GLuint glFragmentShaderDP; + cudaGraphicsResource_t cudaColorBuffer[2]; + cudaGraphicsResource_t cudaPrevOutBuffer; + cudaGraphicsResource_t cudaPosBuffer; + cudaGraphicsResource_t cudaTriBuffer; + int enableDB; + int enableZModify; // Modify depth in shader, workaround for a rasterization issue on A100. +}; + +//------------------------------------------------------------------------ +// Shared C++ code prototypes. + +void rasterizeInitGLContext(NVDR_CTX_ARGS, RasterizeGLState& s, int cudaDeviceIdx); +void rasterizeResizeBuffers(NVDR_CTX_ARGS, RasterizeGLState& s, bool& changes, int posCount, int triCount, int width, int height, int depth); +void rasterizeRender(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, const float* posPtr, int posCount, int vtxPerInstance, const int32_t* triPtr, int triCount, const int32_t* rangesPtr, int width, int height, int depth, int peeling_idx); +void rasterizeCopyResults(NVDR_CTX_ARGS, RasterizeGLState& s, cudaStream_t stream, float** outputPtr, int width, int height, int depth); +void rasterizeReleaseBuffers(NVDR_CTX_ARGS, RasterizeGLState& s); + +//------------------------------------------------------------------------ +#endif // !(defined(NVDR_TORCH) && defined(__CUDACC__)) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.cpp new file mode 100644 index 00000000..51633e10 --- /dev/null +++ 
b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.cpp @@ -0,0 +1,104 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "framework.h" +#include "texture.h" + +//------------------------------------------------------------------------ +// Mip stack construction and access helpers. + +void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p) +{ + char buf[1024]; + int bufsz = 1024; + + std::string msg = "Mip-map size error - cannot downsample an odd extent greater than 1. Resize the texture so that both spatial extents are powers of two, or limit the number of mip maps using max_mip_level argument.\n"; + + int w = p.texWidth; + int h = p.texHeight; + bool ew = false; + bool eh = false; + + msg += "Attempted mip stack construction:\n"; + msg += "level width height\n"; + msg += "----- ----- ------\n"; + snprintf(buf, bufsz, "base %5d %5d\n", w, h); + msg += buf; + + int mipTotal = 0; + int level = 0; + while ((w|h) > 1 && !(ew || eh)) // Stop at first impossible size. + { + // Current level. + level += 1; + + // Determine if downsampling fails. + ew = ew || (w > 1 && (w & 1)); + eh = eh || (h > 1 && (h & 1)); + + // Downsample. + if (w > 1) w >>= 1; + if (h > 1) h >>= 1; + + // Append level size to error message. 
+ snprintf(buf, bufsz, "mip %-2d ", level); + msg += buf; + if (ew) snprintf(buf, bufsz, " err "); + else snprintf(buf, bufsz, "%5d ", w); + msg += buf; + if (eh) snprintf(buf, bufsz, " err\n"); + else snprintf(buf, bufsz, "%5d\n", h); + msg += buf; + } + + NVDR_CHECK(0, msg); +} + +int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets) +{ + // No levels at all? + if (p.mipLevelLimit == 0) + { + p.mipLevelMax = 0; + return 0; + } + + // Current level size. + int w = p.texWidth; + int h = p.texHeight; + + int mipTotal = 0; + int level = 0; + int c = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE) ? (p.channels * 6) : p.channels; + mipOffsets[0] = 0; + while ((w|h) > 1) + { + // Current level. + level += 1; + + // Quit if cannot downsample. + if ((w > 1 && (w & 1)) || (h > 1 && (h & 1))) + raiseMipSizeError(NVDR_CTX_PARAMS, p); + + // Downsample. + if (w > 1) w >>= 1; + if (h > 1) h >>= 1; + + mipOffsets[level] = mipTotal; // Store the mip offset (#floats). + mipTotal += w * h * p.texDepth * c; + + // Hit the level limit? + if (p.mipLevelLimit >= 0 && level == p.mipLevelLimit) + break; + } + + p.mipLevelMax = level; + return mipTotal; +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.cu new file mode 100644 index 00000000..490b8d68 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.cu @@ -0,0 +1,1156 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "common.h" +#include "texture.h" + +//------------------------------------------------------------------------ +// Memory access and math helpers. + +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float b, float c) { a[0] += b * c; } +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float2 b, float c) { a[0] += b.x * c; a[s] += b.y * c; } +static __device__ __forceinline__ void accum_from_mem(float* a, int s, float4 b, float c) { a[0] += b.x * c; a[s] += b.y * c; a[2*s] += b.z * c; a[3*s] += b.w * c; } +static __device__ __forceinline__ void accum_to_mem(float& a, float* b, int s) { a += b[0]; } +static __device__ __forceinline__ void accum_to_mem(float2& a, float* b, int s) { float2 v = a; v.x += b[0]; v.y += b[s]; a = v; } +static __device__ __forceinline__ void accum_to_mem(float4& a, float* b, int s) { float4 v = a; v.x += b[0]; v.y += b[s]; v.z += b[2*s]; v.w += b[3*s]; a = v; } +static __device__ __forceinline__ bool isfinite_vec3(const float3& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z); } +static __device__ __forceinline__ bool isfinite_vec4(const float4& a) { return isfinite(a.x) && isfinite(a.y) && isfinite(a.z) && isfinite(a.w); } +template static __device__ __forceinline__ T lerp (const T& a, const T& b, float c) { return a + c * (b - a); } +template static __device__ __forceinline__ T bilerp(const T& a, const T& b, const T& c, const T& d, const float2& e) { return lerp(lerp(a, b, e.x), lerp(c, d, e.x), e.y); } + +//------------------------------------------------------------------------ +// Cube map wrapping for smooth filtering across edges and corners. At corners, +// one of the texture coordinates will be negative. 
For correct interpolation, +// the missing texel must take the average color of the other three. + +static __constant__ uint32_t c_cubeWrapMask1[48] = +{ + 0x1530a440, 0x1133a550, 0x6103a110, 0x1515aa44, 0x6161aa11, 0x40154a04, 0x44115a05, 0x04611a01, + 0x2630a440, 0x2233a550, 0x5203a110, 0x2626aa44, 0x5252aa11, 0x40264a04, 0x44225a05, 0x04521a01, + 0x32608064, 0x3366a055, 0x13062091, 0x32328866, 0x13132299, 0x50320846, 0x55330a55, 0x05130219, + 0x42508064, 0x4455a055, 0x14052091, 0x42428866, 0x14142299, 0x60420846, 0x66440a55, 0x06140219, + 0x5230a044, 0x5533a055, 0x1503a011, 0x5252aa44, 0x1515aa11, 0x40520a44, 0x44550a55, 0x04150a11, + 0x6130a044, 0x6633a055, 0x2603a011, 0x6161aa44, 0x2626aa11, 0x40610a44, 0x44660a55, 0x04260a11, +}; + +static __constant__ uint8_t c_cubeWrapMask2[48] = +{ + 0x26, 0x33, 0x11, 0x05, 0x00, 0x09, 0x0c, 0x04, 0x04, 0x00, 0x00, 0x05, 0x00, 0x81, 0xc0, 0x40, + 0x02, 0x03, 0x09, 0x00, 0x0a, 0x00, 0x00, 0x02, 0x64, 0x30, 0x90, 0x55, 0xa0, 0x99, 0xcc, 0x64, + 0x24, 0x30, 0x10, 0x05, 0x00, 0x01, 0x00, 0x00, 0x06, 0x03, 0x01, 0x05, 0x00, 0x89, 0xcc, 0x44, +}; + +static __device__ __forceinline__ int4 wrapCubeMap(int face, int ix0, int ix1, int iy0, int iy1, int w) +{ + // Calculate case number. + int cx = (ix0 < 0) ? 0 : (ix1 >= w) ? 2 : 1; + int cy = (iy0 < 0) ? 0 : (iy1 >= w) ? 6 : 3; + int c = cx + cy; + if (c >= 5) + c--; + c = (face << 3) + c; + + // Compute coordinates and faces. + unsigned int m = c_cubeWrapMask1[c]; + int x0 = (m >> 0) & 3; x0 = (x0 == 0) ? 0 : (x0 == 1) ? ix0 : iy0; + int x1 = (m >> 2) & 3; x1 = (x1 == 0) ? 0 : (x1 == 1) ? ix1 : iy0; + int x2 = (m >> 4) & 3; x2 = (x2 == 0) ? 0 : (x2 == 1) ? ix0 : iy1; + int x3 = (m >> 6) & 3; x3 = (x3 == 0) ? 0 : (x3 == 1) ? ix1 : iy1; + int y0 = (m >> 8) & 3; y0 = (y0 == 0) ? 0 : (y0 == 1) ? ix0 : iy0; + int y1 = (m >> 10) & 3; y1 = (y1 == 0) ? 0 : (y1 == 1) ? ix1 : iy0; + int y2 = (m >> 12) & 3; y2 = (y2 == 0) ? 0 : (y2 == 1) ? ix0 : iy1; + int y3 = (m >> 14) & 3; y3 = (y3 == 0) ? 
0 : (y3 == 1) ? ix1 : iy1; + int f0 = ((m >> 16) & 15) - 1; + int f1 = ((m >> 20) & 15) - 1; + int f2 = ((m >> 24) & 15) - 1; + int f3 = ((m >> 28) ) - 1; + + // Flips. + unsigned int f = c_cubeWrapMask2[c]; + int w1 = w - 1; + if (f & 0x01) x0 = w1 - x0; + if (f & 0x02) x1 = w1 - x1; + if (f & 0x04) x2 = w1 - x2; + if (f & 0x08) x3 = w1 - x3; + if (f & 0x10) y0 = w1 - y0; + if (f & 0x20) y1 = w1 - y1; + if (f & 0x40) y2 = w1 - y2; + if (f & 0x80) y3 = w1 - y3; + + // Done. + int4 tcOut; + tcOut.x = x0 + (y0 + f0 * w) * w; + tcOut.y = x1 + (y1 + f1 * w) * w; + tcOut.z = x2 + (y2 + f2 * w) * w; + tcOut.w = x3 + (y3 + f3 * w) * w; + return tcOut; +} + +//------------------------------------------------------------------------ +// Cube map indexing and gradient functions. + +// Map a 3D lookup vector into an (s,t) face coordinates (returned in first . +// two parameters) and face index. +static __device__ __forceinline__ int indexCubeMap(float& x, float& y, float z) +{ + float ax = fabsf(x); + float ay = fabsf(y); + float az = fabsf(z); + int idx; + float c; + if (az > fmaxf(ax, ay)) { idx = 4; c = z; } + else if (ay > ax) { idx = 2; c = y; y = z; } + else { idx = 0; c = x; x = z; } + if (c < 0.f) idx += 1; + float m = __frcp_rz(fabsf(c)) * .5; + float m0 = __uint_as_float(__float_as_uint(m) ^ ((0x21u >> idx) << 31)); + float m1 = (idx != 2) ? -m : m; + x = x * m0 + .5; + y = y * m1 + .5; + if (!isfinite(x) || !isfinite(y)) + return -1; // Invalid uv. + x = fminf(fmaxf(x, 0.f), 1.f); + y = fminf(fmaxf(y, 0.f), 1.f); + return idx; +} + +// Based on dA/d{s,t}, compute dA/d{x,y,z} at a given 3D lookup vector. 
+static __device__ __forceinline__ float3 indexCubeMapGrad(float3 uv, float gu, float gv) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c; + float c0 = gu; + float c1 = gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 *= uv.x; c1 *= uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; c0 *= uv.x; c1 *= uv.z; } + else { idx = 0x01; c = uv.x; c0 *= uv.z; c1 *= uv.y; } + if (c < 0.f) idx += idx; + float m = __frcp_rz(fabsf(c)); + c0 = (idx & 0x34) ? -c0 : c0; + c1 = (idx & 0x2e) ? -c1 : c1; + float gl = (c0 + c1) * m; + float gx = (idx & 0x03) ? gl : (idx & 0x20) ? -gu : gu; + float gy = (idx & 0x0c) ? gl : -gv; + float gz = (idx & 0x30) ? gl : (idx & 0x03) ? gu : gv; + gz = (idx & 0x09) ? -gz : gz; + float3 res = make_float3(gx, gy, gz) * (m * .5f); + if (!isfinite_vec3(res)) + return make_float3(0.f, 0.f, 0.f); // Invalid uv. + return res; +} + +// Based on dL/d(d{s,t}/s{X,Y}), compute dL/d(d{x,y,z}/d{X,Y}). This is just two +// indexCubeMapGrad() functions rolled together. +static __device__ __forceinline__ void indexCubeMapGrad4(float3 uv, float4 dw, float3& g0, float3& g1) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, c0, c1; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; c0 = uv.x; c1 = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; c0 = uv.x; c1 = uv.z; } + else { idx = 0x01; c = uv.x; c0 = uv.z; c1 = uv.y; } + if (c < 0.f) idx += idx; + float m = __frcp_rz(fabsf(c)); + c0 = (idx & 0x34) ? -c0 : c0; + c1 = (idx & 0x2e) ? -c1 : c1; + float gl0 = (dw.x * c0 + dw.z * c1) * m; + float gl1 = (dw.y * c0 + dw.w * c1) * m; + float gx0 = (idx & 0x03) ? gl0 : (idx & 0x20) ? -dw.x : dw.x; + float gx1 = (idx & 0x03) ? gl1 : (idx & 0x20) ? -dw.y : dw.y; + float gy0 = (idx & 0x0c) ? gl0 : -dw.z; + float gy1 = (idx & 0x0c) ? gl1 : -dw.w; + float gz0 = (idx & 0x30) ? gl0 : (idx & 0x03) ? dw.x : dw.z; + float gz1 = (idx & 0x30) ? gl1 : (idx & 0x03) ? 
dw.y : dw.w; + if (idx & 0x09) + { + gz0 = -gz0; + gz1 = -gz1; + } + g0 = make_float3(gx0, gy0, gz0) * (m * .5f); + g1 = make_float3(gx1, gy1, gz1) * (m * .5f); + if (!isfinite_vec3(g0) || !isfinite_vec3(g1)) + { + g0 = make_float3(0.f, 0.f, 0.f); // Invalid uv. + g1 = make_float3(0.f, 0.f, 0.f); + } +} + +// Compute d{s,t}/d{X,Y} based on d{x,y,z}/d{X,Y} at a given 3D lookup vector. +// Result is (ds/dX, ds/dY, dt/dX, dt/dY). +static __device__ __forceinline__ float4 indexCubeMapGradST(float3 uv, float3 dvdX, float3 dvdY) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, gu, gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } + else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } + if (c < 0.f) idx += idx; + if (idx & 0x09) + { + dvdX.z = -dvdX.z; + dvdY.z = -dvdY.z; + } + float m = __frcp_rz(fabsf(c)); + float dm = m * .5f; + float mm = m * dm; + gu *= (idx & 0x34) ? -mm : mm; + gv *= (idx & 0x2e) ? -mm : mm; + + float4 res; + if (idx & 0x03) + { + res = make_float4(gu * dvdX.x + dm * dvdX.z, + gu * dvdY.x + dm * dvdY.z, + gv * dvdX.x - dm * dvdX.y, + gv * dvdY.x - dm * dvdY.y); + } + else if (idx & 0x0c) + { + res = make_float4(gu * dvdX.y + dm * dvdX.x, + gu * dvdY.y + dm * dvdY.x, + gv * dvdX.y + dm * dvdX.z, + gv * dvdY.y + dm * dvdY.z); + } + else // (idx & 0x30) + { + res = make_float4(gu * dvdX.z + copysignf(dm, c) * dvdX.x, + gu * dvdY.z + copysignf(dm, c) * dvdY.x, + gv * dvdX.z - dm * dvdX.y, + gv * dvdY.z - dm * dvdY.y); + } + + if (!isfinite_vec4(res)) + return make_float4(0.f, 0.f, 0.f, 0.f); + + return res; +} + +// Compute d(d{s,t}/d{X,Y})/d{x,y,z}, i.e., how the pixel derivatives of 2D face +// coordinates change w.r.t. 
3D texture coordinate vector, returned as follows: +// | d(ds/dX)/dx d(ds/dY)/dx d(dt/dX)/dx d(dt/dY)/dx | +// | d(ds/dX)/dy d(ds/dY)/dy d(dt/dX)/dy d(dt/dY)/dy | +// | d(ds/dX)/dz d(ds/dY)/dz d(dt/dX)/dz d(dt/dY)/dz | +static __device__ __forceinline__ void indexCubeMapGrad2(float3 uv, float3 dvdX, float3 dvdY, float4& dx, float4& dy, float4& dz) +{ + float ax = fabsf(uv.x); + float ay = fabsf(uv.y); + float az = fabsf(uv.z); + int idx; + float c, gu, gv; + if (az > fmaxf(ax, ay)) { idx = 0x10; c = uv.z; gu = uv.x; gv = uv.y; } + else if (ay > ax) { idx = 0x04; c = uv.y; gu = uv.x; gv = uv.z; } + else { idx = 0x01; c = uv.x; gu = uv.z; gv = uv.y; } + if (c < 0.f) idx += idx; + + if (idx & 0x09) + { + dvdX.z = -dvdX.z; + dvdY.z = -dvdY.z; + } + + float m = __frcp_rz(c); + float dm = -m * fabsf(m) * .5; + float mm = m * m * .5; + float mu = (idx & 0x34) ? -mm : mm; + float mv = (idx & 0x2e) ? -mm : mm; + gu *= -2.0 * m * mu; + gv *= -2.0 * m * mv; + + if (idx & 0x03) + { + dx.x = gu * dvdX.x + dm * dvdX.z; + dx.y = gu * dvdY.x + dm * dvdY.z; + dx.z = gv * dvdX.x - dm * dvdX.y; + dx.w = gv * dvdY.x - dm * dvdY.y; + dy.x = 0.f; + dy.y = 0.f; + dy.z = mv * dvdX.x; + dy.w = mv * dvdY.x; + dz.x = mu * dvdX.x; + dz.y = mu * dvdY.x; + dz.z = 0.f; + dz.w = 0.f; + } + else if (idx & 0x0c) + { + dx.x = mu * dvdX.y; + dx.y = mu * dvdY.y; + dx.z = 0.f; + dx.w = 0.f; + dy.x = gu * dvdX.y + dm * dvdX.x; + dy.y = gu * dvdY.y + dm * dvdY.x; + dy.z = gv * dvdX.y + dm * dvdX.z; + dy.w = gv * dvdY.y + dm * dvdY.z; + dz.x = 0.f; + dz.y = 0.f; + dz.z = mv * dvdX.y; + dz.w = mv * dvdY.y; + } + else // (idx & 0x30) + { + dx.x = mu * dvdX.z; + dx.y = mu * dvdY.z; + dx.z = 0.f; + dx.w = 0.f; + dy.x = 0.f; + dy.y = 0.f; + dy.z = mv * dvdX.z; + dy.w = mv * dvdY.z; + dz.x = gu * dvdX.z - fabsf(dm) * dvdX.x; + dz.y = gu * dvdY.z - fabsf(dm) * dvdY.x; + dz.z = gv * dvdX.z - dm * dvdX.y; + dz.w = gv * dvdY.z - dm * dvdY.y; + } +} + 
+//------------------------------------------------------------------------ +// General texture indexing. + +template +static __device__ __forceinline__ int indexTextureNearest(const TextureKernelParams& p, float3 uv, int tz) +{ + int w = p.texWidth; + int h = p.texHeight; + float u = uv.x; + float v = uv.y; + + // Cube map indexing. + if (CUBE_MODE) + { + // No wrap. Fold face index into tz right away. + int idx = indexCubeMap(u, v, uv.z); // Rewrites u, v. + if (idx < 0) + return -1; // Invalid uv. + tz = 6 * tz + idx; + } + else + { + // Handle boundary. + if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + u = u - (float)__float2int_rd(u); + v = v - (float)__float2int_rd(v); + } + } + + u = u * (float)w; + v = v * (float)h; + + int iu = __float2int_rd(u); + int iv = __float2int_rd(v); + + // In zero boundary mode, return texture address -1. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) + { + if (iu < 0 || iu >= w || iv < 0 || iv >= h) + return -1; + } + + // Otherwise clamp and calculate the coordinate properly. + iu = min(max(iu, 0), w-1); + iv = min(max(iv, 0), h-1); + return iu + w * (iv + tz * h); +} + +template +static __device__ __forceinline__ float2 indexTextureLinear(const TextureKernelParams& p, float3 uv, int tz, int4& tcOut, int level) +{ + // Mip level size. + int2 sz = mipLevelSize(p, level); + int w = sz.x; + int h = sz.y; + + // Compute texture-space u, v. + float u = uv.x; + float v = uv.y; + bool clampU = false; + bool clampV = false; + + // Cube map indexing. + int face = 0; + if (CUBE_MODE) + { + // Neither clamp or wrap. + face = indexCubeMap(u, v, uv.z); // Rewrites u, v. + if (face < 0) + { + tcOut.x = tcOut.y = tcOut.z = tcOut.w = -1; // Invalid uv. + return make_float2(0.f, 0.f); + } + u = u * (float)w - 0.5f; + v = v * (float)h - 0.5f; + } + else + { + if (p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + // Wrap. + u = u - (float)__float2int_rd(u); + v = v - (float)__float2int_rd(v); + } + + // Move to texel space. 
+ u = u * (float)w - 0.5f; + v = v * (float)h - 0.5f; + + if (p.boundaryMode == TEX_BOUNDARY_MODE_CLAMP) + { + // Clamp to center of edge texels. + u = fminf(fmaxf(u, 0.f), w - 1.f); + v = fminf(fmaxf(v, 0.f), h - 1.f); + clampU = (u == 0.f || u == w - 1.f); + clampV = (v == 0.f || v == h - 1.f); + } + } + + // Compute texel coordinates and weights. + int iu0 = __float2int_rd(u); + int iv0 = __float2int_rd(v); + int iu1 = iu0 + (clampU ? 0 : 1); // Ensure zero u/v gradients with clamped. + int iv1 = iv0 + (clampV ? 0 : 1); + u -= (float)iu0; + v -= (float)iv0; + + // Cube map wrapping. + bool cubeWrap = CUBE_MODE && (iu0 < 0 || iv0 < 0 || iu1 >= w || iv1 >= h); + if (cubeWrap) + { + tcOut = wrapCubeMap(face, iu0, iu1, iv0, iv1, w); + tcOut += 6 * tz * w * h; // Bring in tz. + return make_float2(u, v); // Done. + } + + // Fold cube map face into tz. + if (CUBE_MODE) + tz = 6 * tz + face; + + // Wrap overflowing texel indices. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_WRAP) + { + if (iu0 < 0) iu0 += w; + if (iv0 < 0) iv0 += h; + if (iu1 >= w) iu1 -= w; + if (iv1 >= h) iv1 -= h; + } + + // Coordinates with tz folded in. + int iu0z = iu0 + tz * w * h; + int iu1z = iu1 + tz * w * h; + tcOut.x = iu0z + w * iv0; + tcOut.y = iu1z + w * iv0; + tcOut.z = iu0z + w * iv1; + tcOut.w = iu1z + w * iv1; + + // Invalidate texture addresses outside unit square if we are in zero mode. + if (!CUBE_MODE && p.boundaryMode == TEX_BOUNDARY_MODE_ZERO) + { + bool iu0_out = (iu0 < 0 || iu0 >= w); + bool iu1_out = (iu1 < 0 || iu1 >= w); + bool iv0_out = (iv0 < 0 || iv0 >= h); + bool iv1_out = (iv1 < 0 || iv1 >= h); + if (iu0_out || iv0_out) tcOut.x = -1; + if (iu1_out || iv0_out) tcOut.y = -1; + if (iu0_out || iv1_out) tcOut.z = -1; + if (iu1_out || iv1_out) tcOut.w = -1; + } + + // All done. + return make_float2(u, v); +} + +//------------------------------------------------------------------------ +// Mip level calculation. 
+ +template +static __device__ __forceinline__ void calculateMipLevel(int& level0, int& level1, float& flevel, const TextureKernelParams& p, int pidx, float3 uv, float4* pdw, float3* pdfdv) +{ + // Do nothing if mips not in use. + if (FILTER_MODE == TEX_MODE_NEAREST || FILTER_MODE == TEX_MODE_LINEAR) + return; + + // Determine mip level based on UV pixel derivatives. If no derivatives are given (mip level bias only), leave as zero. + if (!BIAS_ONLY) + { + // Get pixel derivatives of texture coordinates. + float4 uvDA; + float3 dvdX, dvdY; // Gradients use these later. + if (CUBE_MODE) + { + // Fetch. + float2 d0 = ((const float2*)p.uvDA)[3 * pidx + 0]; + float2 d1 = ((const float2*)p.uvDA)[3 * pidx + 1]; + float2 d2 = ((const float2*)p.uvDA)[3 * pidx + 2]; + + // Map d{x,y,z}/d{X,Y} into d{s,t}/d{X,Y}. + dvdX = make_float3(d0.x, d1.x, d2.x); // d{x,y,z}/dX + dvdY = make_float3(d0.y, d1.y, d2.y); // d{x,y,z}/dY + uvDA = indexCubeMapGradST(uv, dvdX, dvdY); // d{s,t}/d{X,Y} + } + else + { + // Fetch. + uvDA = ((const float4*)p.uvDA)[pidx]; + } + + // Scaling factors. + float uscl = p.texWidth; + float vscl = p.texHeight; + + // d[s,t]/d[X,Y]. + float dsdx = uvDA.x * uscl; + float dsdy = uvDA.y * uscl; + float dtdx = uvDA.z * vscl; + float dtdy = uvDA.w * vscl; + + // Calculate footprint axis lengths. + float A = dsdx*dsdx + dtdx*dtdx; + float B = dsdy*dsdy + dtdy*dtdy; + float C = dsdx*dsdy + dtdx*dtdy; + float l2b = 0.5 * (A + B); + float l2n = 0.25 * (A-B)*(A-B) + C*C; + float l2a = sqrt(l2n); + float lenMinorSqr = fmaxf(0.0, l2b - l2a); + float lenMajorSqr = l2b + l2a; + + // Footprint vs. mip level gradient. + if (pdw && FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + float dw = 0.72134752f / (l2n + l2a * l2b); // Constant is 0.5/ln(2). 
+ float AB = dw * .5f * (A - B); + float Cw = dw * C; + float l2aw = dw * l2a; + float d_f_ddsdX = uscl * (dsdx * (l2aw + AB) + dsdy * Cw); + float d_f_ddsdY = uscl * (dsdy * (l2aw - AB) + dsdx * Cw); + float d_f_ddtdX = vscl * (dtdx * (l2aw + AB) + dtdy * Cw); + float d_f_ddtdY = vscl * (dtdy * (l2aw - AB) + dtdx * Cw); + + float4 d_f_dw = make_float4(d_f_ddsdX, d_f_ddsdY, d_f_ddtdX, d_f_ddtdY); + if (!CUBE_MODE) + *pdw = isfinite_vec4(d_f_dw) ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f); + + // In cube maps, there is also a texture coordinate vs. mip level gradient. + // Only output nonzero vectors if both are free of inf/Nan garbage. + if (CUBE_MODE) + { + float4 dx, dy, dz; + indexCubeMapGrad2(uv, dvdX, dvdY, dx, dy, dz); + float3 d_dsdX_dv = make_float3(dx.x, dy.x, dz.x); + float3 d_dsdY_dv = make_float3(dx.y, dy.y, dz.y); + float3 d_dtdX_dv = make_float3(dx.z, dy.z, dz.z); + float3 d_dtdY_dv = make_float3(dx.w, dy.w, dz.w); + + float3 d_f_dv = make_float3(0.f, 0.f, 0.f); + d_f_dv += d_dsdX_dv * d_f_ddsdX; + d_f_dv += d_dsdY_dv * d_f_ddsdY; + d_f_dv += d_dtdX_dv * d_f_ddtdX; + d_f_dv += d_dtdY_dv * d_f_ddtdY; + + bool finite = isfinite_vec4(d_f_dw) && isfinite_vec3(d_f_dv); + *pdw = finite ? d_f_dw : make_float4(0.f, 0.f, 0.f, 0.f); + *pdfdv = finite ? d_f_dv : make_float3(0.f, 0.f, 0.f); + } + } + + // Finally, calculate mip level. + flevel = .5f * __log2f(lenMajorSqr); // May be inf/NaN, but clamp fixes it. + } + + // Bias the mip level and clamp. + if (p.mipLevelBias) + flevel += p.mipLevelBias[pidx]; + flevel = fminf(fmaxf(flevel, 0.f), (float)p.mipLevelMax); + + // Calculate levels depending on filter mode. + level0 = __float2int_rd(flevel); + + // Leave everything else at zero if flevel == 0 (magnification) or when in linear-mipmap-nearest mode. + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR && flevel > 0.f) + { + level1 = min(level0 + 1, p.mipLevelMax); + flevel -= level0; // Fractional part. Zero if clamped on last level. 
+ } +} + +//------------------------------------------------------------------------ +// Texel fetch and accumulator helpers that understand cube map corners. + +template +static __device__ __forceinline__ void fetchQuad(T& a00, T& a10, T& a01, T& a11, const float* pIn, int4 tc, bool corner) +{ + // For invalid cube map uv, tc will be all negative, and all texel values will be zero. + if (corner) + { + T avg = zero_value(); + if (tc.x >= 0) avg += (a00 = *((const T*)&pIn[tc.x])); + if (tc.y >= 0) avg += (a10 = *((const T*)&pIn[tc.y])); + if (tc.z >= 0) avg += (a01 = *((const T*)&pIn[tc.z])); + if (tc.w >= 0) avg += (a11 = *((const T*)&pIn[tc.w])); + avg *= 0.33333333f; + if (tc.x < 0) a00 = avg; + if (tc.y < 0) a10 = avg; + if (tc.z < 0) a01 = avg; + if (tc.w < 0) a11 = avg; + } + else + { + a00 = (tc.x >= 0) ? *((const T*)&pIn[tc.x]) : zero_value(); + a10 = (tc.y >= 0) ? *((const T*)&pIn[tc.y]) : zero_value(); + a01 = (tc.z >= 0) ? *((const T*)&pIn[tc.z]) : zero_value(); + a11 = (tc.w >= 0) ? *((const T*)&pIn[tc.w]) : zero_value(); + } +} + +static __device__ __forceinline__ void accumQuad(float4 c, float* pOut, int level, int4 tc, bool corner, CA_TEMP_PARAM) +{ + // For invalid cube map uv, tc will be all negative, and no accumulation will take place. 
+ if (corner) + { + float cb; + if (tc.x < 0) cb = c.x; + if (tc.y < 0) cb = c.y; + if (tc.z < 0) cb = c.z; + if (tc.w < 0) cb = c.w; + cb *= 0.33333333f; + if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x + cb); + if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y + cb); + if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z + cb); + if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w + cb); + } + else + { + if (tc.x >= 0) caAtomicAddTexture(pOut, level, tc.x, c.x); + if (tc.y >= 0) caAtomicAddTexture(pOut, level, tc.y, c.y); + if (tc.z >= 0) caAtomicAddTexture(pOut, level, tc.z, c.z); + if (tc.w >= 0) caAtomicAddTexture(pOut, level, tc.w, c.w); + } +} + +//------------------------------------------------------------------------ +// Mip builder kernel. + +template +static __forceinline__ __device__ void MipBuildKernelTemplate(const TextureKernelParams p) +{ + // Sizes. + int2 sz_in = mipLevelSize(p, p.mipLevelOut - 1); + int2 sz_out = mipLevelSize(p, p.mipLevelOut); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= sz_out.x || py >= sz_out.y) + return; + + // Pixel indices. + int pidx_in0 = p.channels * (((px + sz_in.x * py) << 1) + (pz * sz_in.x * sz_in.y)); + int pidx_in1 = pidx_in0 + p.channels * sz_in.x; // Next pixel down. + int pidx_out = p.channels * (px + sz_out.x * (py + sz_out.y * pz)); + + // Input and output pointers. + const float* pin = p.tex[p.mipLevelOut - 1]; + float* pout = (float*)p.tex[p.mipLevelOut]; + + // Special case: Input texture height or width is 1. + if (sz_in.x == 1 || sz_in.y == 1) + { + if (sz_in.y == 1) + pidx_in1 = pidx_in0 + p.channels; // Next pixel on the right. 
+ + for (int i=0; i < p.channels; i += C) + { + T v0 = *((const T*)&pin[pidx_in0 + i]); + T v1 = *((const T*)&pin[pidx_in1 + i]); + T avg = .5f * (v0 + v1); +#if TEX_DEBUG_MIP_RETAIN_VARIANCE + avg = (avg - .5f) * 1.41421356f + .5f; +#endif + *((T*)&pout[pidx_out + i]) = avg; + } + + return; + } + + for (int i=0; i < p.channels; i += C) + { + T v0 = *((const T*)&pin[pidx_in0 + i]); + T v1 = *((const T*)&pin[pidx_in0 + i + p.channels]); + T v2 = *((const T*)&pin[pidx_in1 + i]); + T v3 = *((const T*)&pin[pidx_in1 + i + p.channels]); + T avg = .25f * (v0 + v1 + v2 + v3); +#if TEX_DEBUG_MIP_RETAIN_VARIANCE + avg = (avg - .5f) * 2.f + .5f; +#endif + *((T*)&pout[pidx_out + i]) = avg; + } +} + +// Template specializations. +__global__ void MipBuildKernel1(const TextureKernelParams p) { MipBuildKernelTemplate(p); } +__global__ void MipBuildKernel2(const TextureKernelParams p) { MipBuildKernelTemplate(p); } +__global__ void MipBuildKernel4(const TextureKernelParams p) { MipBuildKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Forward kernel. + +template +static __forceinline__ __device__ void TextureFwdKernelTemplate(const TextureKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + int tz = (p.texDepth == 1) ? 0 : pz; + if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) + return; + + // Pixel index. + int pidx = px + p.imgWidth * (py + p.imgHeight * pz); + + // Output ptr. + float* pOut = p.out + pidx * p.channels; + + // Get UV. + float3 uv; + if (CUBE_MODE) + uv = ((const float3*)p.uv)[pidx]; + else + uv = make_float3(((const float2*)p.uv)[pidx], 0.f); + + // Nearest mode. + if (FILTER_MODE == TEX_MODE_NEAREST) + { + int tc = indexTextureNearest(p, uv, tz); + tc *= p.channels; + const float* pIn = p.tex[0]; + + // Copy if valid tc, otherwise output zero. 
+ for (int i=0; i < p.channels; i += C) + *((T*)&pOut[i]) = (tc >= 0) ? *((const T*)&pIn[tc + i]) : zero_value(); + + return; // Exit. + } + + // Calculate mip level. In 'linear' mode these will all stay zero. + float flevel = 0.f; // Fractional level. + int level0 = 0; // Discrete level 0. + int level1 = 0; // Discrete level 1. + calculateMipLevel(level0, level1, flevel, p, pidx, uv, 0, 0); + + // Get texel indices and pointer for level 0. + int4 tc0 = make_int4(0, 0, 0, 0); + float2 uv0 = indexTextureLinear(p, uv, tz, tc0, level0); + const float* pIn0 = p.tex[level0]; + bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); + tc0 *= p.channels; + + // Bilinear fetch. + if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) + { + // Interpolate. + for (int i=0; i < p.channels; i += C, tc0 += C) + { + T a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + *((T*)&pOut[i]) = bilerp(a00, a10, a01, a11, uv0); + } + return; // Exit. + } + + // Get texel indices and pointer for level 1. + int4 tc1 = make_int4(0, 0, 0, 0); + float2 uv1 = indexTextureLinear(p, uv, tz, tc1, level1); + const float* pIn1 = p.tex[level1]; + bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); + tc1 *= p.channels; + + // Trilinear fetch. + for (int i=0; i < p.channels; i += C, tc0 += C, tc1 += C) + { + // First level. + T a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + T a = bilerp(a00, a10, a01, a11, uv0); + + // Second level unless in magnification mode. + if (flevel > 0.f) + { + T b00, b10, b01, b11; + fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1); + T b = bilerp(b00, b10, b01, b11, uv1); + a = lerp(a, b, flevel); // Interpolate between levels. + } + + // Write. + *((T*)&pOut[i]) = a; + } +} + +// Template specializations. 
+__global__ void TextureFwdKernelNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void 
TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p) { 
TextureFwdKernelTemplate(p); } +__global__ void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p) { TextureFwdKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient mip puller kernel. + +template +static __forceinline__ __device__ void MipGradKernelTemplate(const TextureKernelParams p) +{ + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + if (px >= p.texWidth || py >= p.texHeight) + return; + + // Number of wide elements. + int c = p.channels; + if (C == 2) c >>= 1; + if (C == 4) c >>= 2; + + // Dynamically allocated shared memory for holding a texel. + extern __shared__ float s_texelAccum[]; + int sharedOfs = threadIdx.x + threadIdx.y * blockDim.x; + int sharedStride = blockDim.x * blockDim.y; +# define TEXEL_ACCUM(_i) (s_texelAccum + (sharedOfs + (_i) * sharedStride)) + + // Clear the texel. + for (int i=0; i < p.channels; i++) + *TEXEL_ACCUM(i) = 0.f; + + // Track texel position and accumulation weight over the mip stack. + int x = px; + int y = py; + float w = 1.f; + + // Pull gradients from all levels. + int2 sz = mipLevelSize(p, 0); // Previous level size. + for (int level=1; level <= p.mipLevelMax; level++) + { + // Weight decay depends on previous level size. + if (sz.x > 1) w *= .5f; + if (sz.y > 1) w *= .5f; + + // Current level size and coordinates. + sz = mipLevelSize(p, level); + x >>= 1; + y >>= 1; + + T* pIn = (T*)(p.gradTex[level] + (x + sz.x * (y + sz.y * pz)) * p.channels); + for (int i=0; i < c; i++) + accum_from_mem(TEXEL_ACCUM(i * C), sharedStride, pIn[i], w); + } + + // Add to main texture gradients. + T* pOut = (T*)(p.gradTex[0] + (px + p.texWidth * (py + p.texHeight * pz)) * p.channels); + for (int i=0; i < c; i++) + accum_to_mem(pOut[i], TEXEL_ACCUM(i * C), sharedStride); +} + +// Template specializations. 
+__global__ void MipGradKernel1(const TextureKernelParams p) { MipGradKernelTemplate(p); } +__global__ void MipGradKernel2(const TextureKernelParams p) { MipGradKernelTemplate(p); } +__global__ void MipGradKernel4(const TextureKernelParams p) { MipGradKernelTemplate(p); } + +//------------------------------------------------------------------------ +// Gradient kernel. + +template +static __forceinline__ __device__ void TextureGradKernelTemplate(const TextureKernelParams p) +{ + // Temporary space for coalesced atomics. + CA_DECLARE_TEMP(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH * TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT); + + // Calculate pixel position. + int px = blockIdx.x * blockDim.x + threadIdx.x; + int py = blockIdx.y * blockDim.y + threadIdx.y; + int pz = blockIdx.z; + int tz = (p.texDepth == 1) ? 0 : pz; + if (px >= p.imgWidth || py >= p.imgHeight || pz >= p.n) + return; + + // Pixel index. + int pidx = px + p.imgWidth * (py + p.imgHeight * pz); + + // Early exit if output gradients are zero. + const float* pDy = p.dy + pidx * p.channels; + unsigned int dmax = 0u; + if ((p.channels & 3) == 0) + { + for (int i=0; i < p.channels; i += 4) + { + uint4 dy = *((const uint4*)&pDy[i]); + dmax |= (dy.x | dy.y | dy.z | dy.w); + } + } + else + { + for (int i=0; i < p.channels; i++) + dmax |= __float_as_uint(pDy[i]); + } + + // Store zeros and exit. 
+ if (__uint_as_float(dmax) == 0.f) + { + if (CUBE_MODE) + { + if (FILTER_MODE != TEX_MODE_NEAREST) + ((float3*)p.gradUV)[pidx] = make_float3(0.f, 0.f, 0.f); + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + if (p.gradUVDA) + { + ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(0.f, 0.f); + ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(0.f, 0.f); + ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(0.f, 0.f); + } + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = 0.f; + } + } + else + { + if (FILTER_MODE != TEX_MODE_NEAREST) + ((float2*)p.gradUV)[pidx] = make_float2(0.f, 0.f); + if (FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + if (p.gradUVDA) + ((float4*)p.gradUVDA)[pidx] = make_float4(0.f, 0.f, 0.f, 0.f); + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = 0.f; + } + } + return; + } + + // Get UV. + float3 uv; + if (CUBE_MODE) + uv = ((const float3*)p.uv)[pidx]; + else + uv = make_float3(((const float2*)p.uv)[pidx], 0.f); + + // Nearest mode - texture gradients only. + if (FILTER_MODE == TEX_MODE_NEAREST) + { + int tc = indexTextureNearest(p, uv, tz); + if (tc < 0) + return; // Outside texture. + + tc *= p.channels; + float* pOut = p.gradTex[0]; + + // Accumulate texture gradients. + for (int i=0; i < p.channels; i++) + caAtomicAddTexture(pOut, 0, tc + i, pDy[i]); + + return; // Exit. + } + + // Calculate mip level. In 'linear' mode these will all stay zero. + float4 dw = make_float4(0.f, 0.f, 0.f, 0.f); + float3 dfdv = make_float3(0.f, 0.f, 0.f); + float flevel = 0.f; // Fractional level. + int level0 = 0; // Discrete level 0. + int level1 = 0; // Discrete level 1. + calculateMipLevel(level0, level1, flevel, p, pidx, uv, &dw, &dfdv); + + // UV gradient accumulators. + float gu = 0.f; + float gv = 0.f; + + // Get texel indices and pointers for level 0. 
+ int4 tc0 = make_int4(0, 0, 0, 0); + float2 uv0 = indexTextureLinear(p, uv, tz, tc0, level0); + const float* pIn0 = p.tex[level0]; + float* pOut0 = p.gradTex[level0]; + bool corner0 = CUBE_MODE && ((tc0.x | tc0.y | tc0.z | tc0.w) < 0); + tc0 *= p.channels; + + // Texel weights. + float uv011 = uv0.x * uv0.y; + float uv010 = uv0.x - uv011; + float uv001 = uv0.y - uv011; + float uv000 = 1.f - uv0.x - uv001; + float4 tw0 = make_float4(uv000, uv010, uv001, uv011); + + // Attribute weights. + int2 sz0 = mipLevelSize(p, level0); + float sclu0 = (float)sz0.x; + float sclv0 = (float)sz0.y; + + // Bilinear mode - texture and uv gradients. + if (FILTER_MODE == TEX_MODE_LINEAR || FILTER_MODE == TEX_MODE_LINEAR_MIPMAP_NEAREST) + { + for (int i=0; i < p.channels; i++, tc0 += 1) + { + float dy = pDy[i]; + accumQuad(tw0 * dy, pOut0, level0, tc0, corner0, CA_TEMP); + + float a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + float ad = (a11 + a00 - a10 - a01); + gu += dy * ((a10 - a00) + uv0.y * ad) * sclu0; + gv += dy * ((a01 - a00) + uv0.x * ad) * sclv0; + } + + // Store UV gradients and exit. + if (CUBE_MODE) + ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv); + else + ((float2*)p.gradUV)[pidx] = make_float2(gu, gv); + + return; + } + + // Accumulate fractional mip level gradient. + float df = 0; // dL/df. + + // Get texel indices and pointers for level 1. + int4 tc1 = make_int4(0, 0, 0, 0); + float2 uv1 = indexTextureLinear(p, uv, tz, tc1, level1); + const float* pIn1 = p.tex[level1]; + float* pOut1 = p.gradTex[level1]; + bool corner1 = CUBE_MODE && ((tc1.x | tc1.y | tc1.z | tc1.w) < 0); + tc1 *= p.channels; + + // Texel weights. + float uv111 = uv1.x * uv1.y; + float uv110 = uv1.x - uv111; + float uv101 = uv1.y - uv111; + float uv100 = 1.f - uv1.x - uv101; + float4 tw1 = make_float4(uv100, uv110, uv101, uv111); + + // Attribute weights. 
+ int2 sz1 = mipLevelSize(p, level1); + float sclu1 = (float)sz1.x; + float sclv1 = (float)sz1.y; + + // Trilinear mode. + for (int i=0; i < p.channels; i++, tc0 += 1, tc1 += 1) + { + float dy = pDy[i]; + float dy0 = (1.f - flevel) * dy; + accumQuad(tw0 * dy0, pOut0, level0, tc0, corner0, CA_TEMP); + + // UV gradients for first level. + float a00, a10, a01, a11; + fetchQuad(a00, a10, a01, a11, pIn0, tc0, corner0); + float ad = (a11 + a00 - a10 - a01); + gu += dy0 * ((a10 - a00) + uv0.y * ad) * sclu0; + gv += dy0 * ((a01 - a00) + uv0.x * ad) * sclv0; + + // Second level unless in magnification mode. + if (flevel > 0.f) + { + // Texture gradients for second level. + float dy1 = flevel * dy; + accumQuad(tw1 * dy1, pOut1, level1, tc1, corner1, CA_TEMP); + + // UV gradients for second level. + float b00, b10, b01, b11; + fetchQuad(b00, b10, b01, b11, pIn1, tc1, corner1); + float bd = (b11 + b00 - b10 - b01); + gu += dy1 * ((b10 - b00) + uv1.y * bd) * sclu1; + gv += dy1 * ((b01 - b00) + uv1.x * bd) * sclv1; + + // Mip level gradient. + float a = bilerp(a00, a10, a01, a11, uv0); + float b = bilerp(b00, b10, b01, b11, uv1); + df += (b-a) * dy; + } + } + + // Store UV gradients. + if (CUBE_MODE) + ((float3*)p.gradUV)[pidx] = indexCubeMapGrad(uv, gu, gv) + (dfdv * df); + else + ((float2*)p.gradUV)[pidx] = make_float2(gu, gv); + + // Store mip level bias gradient. + if (p.gradMipLevelBias) + p.gradMipLevelBias[pidx] = df; + + // Store UV pixel differential gradients. + if (!BIAS_ONLY) + { + // Final gradients. + dw *= df; // dL/(d{s,y}/d{X,Y}) = df/(d{s,y}/d{X,Y}) * dL/df. + + // Store them. + if (CUBE_MODE) + { + // Remap from dL/(d{s,t}/s{X,Y}) to dL/(d{x,y,z}/d{X,Y}). 
+ float3 g0, g1; + indexCubeMapGrad4(uv, dw, g0, g1); + ((float2*)p.gradUVDA)[3 * pidx + 0] = make_float2(g0.x, g1.x); + ((float2*)p.gradUVDA)[3 * pidx + 1] = make_float2(g0.y, g1.y); + ((float2*)p.gradUVDA)[3 * pidx + 2] = make_float2(g0.z, g1.z); + } + else + ((float4*)p.gradUVDA)[pidx] = dw; + } +} + +// Template specializations. +__global__ void TextureGradKernelNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } +__global__ void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p) { TextureGradKernelTemplate(p); } + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.h new file mode 100644 index 
00000000..f79b600f --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/common/texture.h @@ -0,0 +1,78 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "framework.h" + +//------------------------------------------------------------------------ +// Constants. + +#define TEX_DEBUG_MIP_RETAIN_VARIANCE 0 // For debugging +#define TEX_FWD_MAX_KERNEL_BLOCK_WIDTH 8 +#define TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH 8 +#define TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8 +#define TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH 8 +#define TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT 8 +#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH 8 +#define TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT 8 +#define TEX_MAX_MIP_LEVEL 16 // Currently a texture cannot be larger than 2 GB because we use 32-bit indices everywhere. +#define TEX_MODE_NEAREST 0 // Nearest on base level. +#define TEX_MODE_LINEAR 1 // Bilinear on base level. +#define TEX_MODE_LINEAR_MIPMAP_NEAREST 2 // Bilinear on nearest mip level. +#define TEX_MODE_LINEAR_MIPMAP_LINEAR 3 // Trilinear. +#define TEX_MODE_COUNT 4 +#define TEX_BOUNDARY_MODE_CUBE 0 // Cube map mode. +#define TEX_BOUNDARY_MODE_WRAP 1 // Wrap (u, v). +#define TEX_BOUNDARY_MODE_CLAMP 2 // Clamp (u, v). +#define TEX_BOUNDARY_MODE_ZERO 3 // Pad with zeros. +#define TEX_BOUNDARY_MODE_COUNT 4 + +//------------------------------------------------------------------------ +// CUDA kernel params. + +struct TextureKernelParams +{ + const float* tex[TEX_MAX_MIP_LEVEL]; // Incoming texture buffer with mip levels. 
+ const float* uv; // Incoming texcoord buffer. + const float* uvDA; // Incoming uv pixel diffs or NULL. + const float* mipLevelBias; // Incoming mip level bias or NULL. + const float* dy; // Incoming output gradient. + float* out; // Outgoing texture data. + float* gradTex[TEX_MAX_MIP_LEVEL]; // Outgoing texture gradients with mip levels. + float* gradUV; // Outgoing texcoord gradient. + float* gradUVDA; // Outgoing texcoord pixel differential gradient. + float* gradMipLevelBias; // Outgoing mip level bias gradient. + int enableMip; // If true, we have uv_da and/or mip_level_bias input(s), and a mip tensor. + int filterMode; // One of the TEX_MODE_ constants. + int boundaryMode; // One of the TEX_BOUNDARY_MODE_ contants. + int texConst; // If true, texture is known to be constant. + int mipLevelLimit; // Mip level limit coming from the op. + int channels; // Number of texture channels. + int imgWidth; // Image width. + int imgHeight; // Image height. + int texWidth; // Texture width. + int texHeight; // Texture height. + int texDepth; // Texture depth. + int n; // Minibatch size. + int mipLevelMax; // Maximum mip level index. Zero if mips disabled. + int mipLevelOut; // Mip level being calculated in builder kernel. +}; + +//------------------------------------------------------------------------ +// C++ helper function prototypes. + +void raiseMipSizeError(NVDR_CTX_ARGS, const TextureKernelParams& p); +int calculateMipInfo(NVDR_CTX_ARGS, TextureKernelParams& p, int* mipOffsets); + +//------------------------------------------------------------------------ +// Macros. + +#define mipLevelSize(p, i) make_int2(((p).texWidth >> (i)) > 1 ? ((p).texWidth >> (i)) : 1, ((p).texHeight >> (i)) > 1 ? 
((p).texHeight >> (i)) : 1) + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/lib/setgpu.lib b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/lib/setgpu.lib new file mode 100644 index 00000000..add9a0c4 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/lib/setgpu.lib differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/__init__.py new file mode 100644 index 00000000..cf62df87 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +from .ops import rasterize, interpolate, texture, antialias +from .plugin_loader import set_cache_dir + +__all__ = ["rasterize", "interpolate", "texture", "antialias", "set_cache_dir"] diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/ops.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/ops.py new file mode 100644 index 00000000..be51deef --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/ops.py @@ -0,0 +1,303 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import tensorflow as tf +import numpy as np +import os +from . import plugin_loader + +#---------------------------------------------------------------------------- +# Helpers. +#---------------------------------------------------------------------------- + +# OpenGL-related linker options depending on platform. +def _get_gl_opts(): + libs = { + 'posix': ['GL', 'EGL'], + 'nt': ['gdi32', 'opengl32', 'user32', 'setgpu'], + } + return ['-l' + x for x in libs[os.name]] + +# Load the cpp plugin. +def _get_plugin(): + fn = os.path.join(os.path.dirname(__file__), 'tf_all.cu') + return plugin_loader.get_plugin(fn, extra_nvcc_options=_get_gl_opts() + ['-DNVDR_TENSORFLOW']) + +# Convert parameter to a numpy array if possible. +def _get_constant(x, dtype): + try: + return np.asarray(x, dtype=dtype) + except (TypeError, ValueError): + return None + +# Tests for a construction-time constantness instead of tf.constant node because +# the latter can be overridden in Session.run() feed_dict at evaluation time. +def _is_constant(x, dtype): + if isinstance(x, np.ndarray): + return np.can_cast(x.dtype, dtype, 'unsafe') + else: + return _get_constant(x, dtype) is not None + +#---------------------------------------------------------------------------- +# Rasterize. +#---------------------------------------------------------------------------- + +def rasterize(pos, tri, resolution, ranges=None, tri_const=False, output_db=True, grad_db=True): + assert tri_const is True or tri_const is False + assert output_db is True or output_db is False + + # Known constant resolution? + resolution_c = _get_constant(resolution, np.int32) + + # Known constant triangles? + tri_const = tri_const or _is_constant(tri, np.int32) + + # Convert all inputs to tensors / base types. 
+ tri_const = 1 if tri_const else 0 + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + pos = tf.convert_to_tensor(pos, dtype=tf.float32) + resolution = tf.convert_to_tensor(resolution, dtype=tf.int32) + if ranges is None: + ranges = tf.convert_to_tensor(np.zeros(shape=[0, 2], dtype=np.int32)) # Empty tensor. + else: + ranges = tf.convert_to_tensor(ranges, dtype=tf.int32) # Convert input to tensor. + + # Infer as much about the output shape as possible. + out_shape = [None, None, None, 4] + if pos.shape.rank == 3: # Instanced mode. + out_shape[0] = pos.shape[0].value + elif pos.shape.rank == 2: # Range mode. + if ranges.shape.rank not in [None, 0]: + out_shape[0] = ranges.shape[0].value + if resolution_c is not None: + assert resolution_c.shape == (2,) + out_shape[1], out_shape[2] = resolution_c + + # Output pixel differentials. + @tf.custom_gradient + def func_db(pos): + out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 1, tri_const) + out.set_shape(out_shape) + out_db.set_shape(out_shape) + def grad(dy, ddb): + if grad_db: + return _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb) + else: + return _get_plugin().rasterize_grad(pos, tri, out, dy) + return (out, out_db), grad + + # Do not output pixel differentials. + @tf.custom_gradient + def func(pos): + out, out_db = _get_plugin().rasterize_fwd(pos, tri, resolution, ranges, 0, tri_const) + out.set_shape(out_shape) + out_db.set_shape(out_shape[:-1] + [0]) # Zero channels in out_db. + def grad(dy, _): + return _get_plugin().rasterize_grad(pos, tri, out, dy) + return (out, out_db), grad + + # Choose stub. + if output_db: + return func_db(pos) + else: + return func(pos) + +#---------------------------------------------------------------------------- +# Interpolate. +#---------------------------------------------------------------------------- + +def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None): + # Sanitize the list of pixel differential attributes. 
+ if diff_attrs is None: + diff_attrs = [] + elif diff_attrs != 'all': + diff_attrs = _get_constant(diff_attrs, np.int32) + assert (diff_attrs is not None) and len(diff_attrs.shape) == 1 + diff_attrs = diff_attrs.tolist() + + # Convert all inputs to tensors. + attr = tf.convert_to_tensor(attr, dtype=tf.float32) + rast = tf.convert_to_tensor(rast, dtype=tf.float32) + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + if diff_attrs: + rast_db = tf.convert_to_tensor(rast_db, dtype=tf.float32) + + # Infer output shape. + out_shape = [None, None, None, None] + if rast.shape.rank is not None: + out_shape = [rast.shape[0].value, rast.shape[1].value, rast.shape[2].value, None] + if attr.shape.rank in [2, 3]: + out_shape[3] = attr.shape[-1].value + + # Output pixel differentials for at least some attributes. + @tf.custom_gradient + def func_da(attr, rast, rast_db): + diff_attrs_all = int(diff_attrs == 'all') + diff_attrs_list = [] if diff_attrs_all else diff_attrs + out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + + # Infer number of channels in out_da. + if not diff_attrs_all: + da_channels = 2 * len(diff_attrs) + if (attr.shape.rank in [2, 3]) and (attr.shape[-1].value is not None): + da_channels = 2 * attr.shape[-1].value + else: + da_channels = None + + # Set output shapes. + out.set_shape(out_shape) + out_da.set_shape([out_shape[0], out_shape[1], out_shape[2], da_channels]) + + def grad(dy, dda): + return _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list) + return (out, out_da), grad + + # No pixel differentials for any attribute. + @tf.custom_gradient + def func(attr, rast): + out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri) + out.set_shape(out_shape) + out_da.set_shape(out_shape[:-1] + [0]) # Zero channels in out_da. + def grad(dy, _): + return _get_plugin().interpolate_grad(attr, rast, tri, dy) + return (out, out_da), grad + + # Choose stub. 
+ if diff_attrs: + return func_da(attr, rast, rast_db) + else: + return func(attr, rast) + +#---------------------------------------------------------------------------- +# Texture. +#---------------------------------------------------------------------------- + +def texture(tex, uv, uv_da=None, filter_mode='auto', boundary_mode='wrap', tex_const=False, max_mip_level=None): + assert tex_const is True or tex_const is False + + # Default filter mode. + if filter_mode == 'auto': + filter_mode = 'linear-mipmap-linear' if (uv_da is not None) else 'linear' + + # Known constant texture? + tex_const = tex_const or _is_constant(tex, np.float32) + + # Sanitize inputs. + tex_const = 1 if tex_const else 0 + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + + # Convert inputs to tensors. + tex = tf.convert_to_tensor(tex, dtype=tf.float32) + uv = tf.convert_to_tensor(uv, dtype=tf.float32) + if 'mipmap' in filter_mode: + uv_da = tf.convert_to_tensor(uv_da, dtype=tf.float32) + + # Infer output shape. + out_shape = [None, None, None, None] + if uv.shape.rank is not None: + assert uv.shape.rank == 4 + out_shape = [uv.shape[0].value, uv.shape[1].value, uv.shape[2].value, None] + if tex.shape.rank is not None: + assert tex.shape.rank == (5 if boundary_mode == 'cube' else 4) + out_shape[-1] = tex.shape[-1].value + + # If mipping disabled via max level=0, we may as well use simpler filtering internally. + if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']: + filter_mode = 'linear' + + # Convert filter mode to internal enumeration. + filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3} + filter_mode_enum = filter_mode_dict[filter_mode] + + # Convert boundary mode to internal enumeration. 
+ boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3} + boundary_mode_enum = boundary_mode_dict[boundary_mode] + + # Linear-mipmap-linear: Mipmaps enabled, all gradients active. + @tf.custom_gradient + def func_linear_mipmap_linear(tex, uv, uv_da): + out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level) + return out, grad + + # Linear-mipmap-nearest: Mipmaps enabled, no gradients to uv_da. + @tf.custom_gradient + def func_linear_mipmap_nearest(tex, uv): + out, mip = _get_plugin().texture_fwd_mip(tex, uv, uv_da, filter_mode_enum, boundary_mode_enum, tex_const, max_mip_level) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip, filter_mode_enum, boundary_mode_enum, max_mip_level) + return out, grad + + # Linear: Mipmaps disabled, no uv_da, no gradients to uv_da. + @tf.custom_gradient + def func_linear(tex, uv): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return out, grad + + # Nearest: Mipmaps disabled, no uv_da, no gradients to uv_da or uv. + @tf.custom_gradient + def func_nearest(tex): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + out.set_shape(out_shape) + def grad(dy): + return _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return out, grad + + # Choose stub. 
+ if filter_mode == 'linear-mipmap-linear': + return func_linear_mipmap_linear(tex, uv, uv_da) + elif filter_mode == 'linear-mipmap-nearest': + return func_linear_mipmap_nearest(tex, uv) + elif filter_mode == 'linear': + return func_linear(tex, uv) + elif filter_mode == 'nearest': + return func_nearest(tex) + +#---------------------------------------------------------------------------- +# Antialias. +#---------------------------------------------------------------------------- + +def antialias(color, rast, pos, tri, tri_const=False, pos_gradient_boost=1.0): + assert tri_const is True or tri_const is False + + # Known constant triangles? + tri_const = tri_const or _is_constant(tri, np.int32) + + # Convert inputs to tensors. + color = tf.convert_to_tensor(color, dtype=tf.float32) + rast = tf.convert_to_tensor(rast, dtype=tf.float32) + pos = tf.convert_to_tensor(pos, dtype=tf.float32) + tri = tf.convert_to_tensor(tri, dtype=tf.int32) + + # Sanitize inputs. + tri_const = 1 if tri_const else 0 + + @tf.custom_gradient + def func(color, pos): + color_out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, tri_const) + color_out.set_shape(color.shape) + def grad(dy): + grad_color, grad_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer) + if pos_gradient_boost != 1.0: + grad_pos = grad_pos * pos_gradient_boost + return grad_color, grad_pos + return color_out, grad + + return func(color, pos) + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py new file mode 100644 index 00000000..3918aecd --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/plugin_loader.py @@ -0,0 +1,219 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import glob +import os +import re +import uuid +import hashlib +import tempfile +import shutil +import tensorflow as tf +from tensorflow.python.client import device_lib # pylint: disable=no-name-in-module + +#---------------------------------------------------------------------------- +# Global options. + +_nvdiffrast_cache_dir = None + +def set_cache_dir(path: str) -> None: + '''Set CUDA kernel compilation temp dir. + + If `set_cache_dir` is not called, the cache directory will default to + one of the below: + + - Value of NVDIFFRAST_CACHE_DIR env var, if set + - $HOME/.cache/nvdiffrast if HOME env var is set + - $USERPROFILE/.cache/nvdiffrast if USERPROFILE is set. + + Args: + path: Where to save CUDA kernel build temporaries + ''' + global _nvdiffrast_cache_dir + _nvdiffrast_cache_dir = path + +def make_cache_dir_path(*paths: str) -> str: + if _nvdiffrast_cache_dir is not None: + return os.path.join(_nvdiffrast_cache_dir, *paths) + if 'NVDIFFRAST_CACHE_DIR' in os.environ: + return os.path.join(os.environ['NVDIFFRAST_CACHE_DIR'], *paths) + if 'HOME' in os.environ: + return os.path.join(os.environ['HOME'], '.cache', 'nvdiffrast', *paths) + if 'USERPROFILE' in os.environ: + return os.path.join(os.environ['USERPROFILE'], '.cache', 'nvdiffrast', *paths) + return os.path.join(tempfile.gettempdir(), '.cache', 'nvdiffrast', *paths) + +cuda_cache_version_tag = 'v1' +do_not_hash_included_headers = False # Speed up compilation by assuming that headers included by the CUDA code never change. Unsafe! +verbose = True # Print status messages to stdout. 
+ +#---------------------------------------------------------------------------- +# Internal helper funcs. + +def _find_compiler_bindir(): + hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Enterprise/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Professional/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/BuildTools/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + hostx64_paths = sorted(glob.glob('C:/Program Files (x86)/Microsoft Visual Studio/*/Community/VC/Tools/MSVC/*/bin/Hostx64/x64'), reverse=True) + if hostx64_paths != []: + return hostx64_paths[0] + vc_bin_dir = 'C:/Program Files (x86)/Microsoft Visual Studio 14.0/vc/bin' + if os.path.isdir(vc_bin_dir): + return vc_bin_dir + return None + +def _get_compute_cap(device): + caps_str = device.physical_device_desc + m = re.search('compute capability: (\\d+).(\\d+)', caps_str) + major = m.group(1) + 
minor = m.group(2) + return (major, minor) + +def _get_cuda_gpu_arch_string(): + gpus = [x for x in device_lib.list_local_devices() if x.device_type == 'GPU'] + if len(gpus) == 0: + raise RuntimeError('No GPU devices found') + (major, minor) = _get_compute_cap(gpus[0]) + return 'sm_%s%s' % (major, minor) + +def _run_cmd(cmd): + with os.popen(cmd) as pipe: + output = pipe.read() + status = pipe.close() + if status is not None: + raise RuntimeError('NVCC returned an error. See below for full command line and output log:\n\n%s\n\n%s' % (cmd, output)) + +def _prepare_nvcc_cli(opts): + cmd = 'nvcc ' + opts.strip() + cmd += ' --disable-warnings' + cmd += ' --include-path "%s"' % tf.sysconfig.get_include() + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'protobuf_archive', 'src') + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'com_google_absl') + cmd += ' --include-path "%s"' % os.path.join(tf.sysconfig.get_include(), 'external', 'eigen_archive') + + compiler_bindir = _find_compiler_bindir() + if compiler_bindir is None: + # Require that _find_compiler_bindir succeeds on Windows. Allow + # nvcc to use whatever is the default on Linux. + if os.name == 'nt': + raise RuntimeError('Could not find MSVC/GCC/CLANG installation on this computer. Check compiler_bindir_search_path list in "%s".' % __file__) + else: + cmd += ' --compiler-bindir "%s"' % compiler_bindir + cmd += ' 2>&1' + return cmd + +#---------------------------------------------------------------------------- +# Main entry point. + +_plugin_cache = dict() + +def get_plugin(cuda_file, extra_nvcc_options=[]): + cuda_file_base = os.path.basename(cuda_file) + cuda_file_name, cuda_file_ext = os.path.splitext(cuda_file_base) + + # Already in cache? + if cuda_file in _plugin_cache: + return _plugin_cache[cuda_file] + + # Setup plugin. 
+ if verbose: + print('Setting up TensorFlow plugin "%s": ' % cuda_file_base, end='', flush=True) + try: + # Hash CUDA source. + md5 = hashlib.md5() + with open(cuda_file, 'rb') as f: + md5.update(f.read()) + md5.update(b'\n') + + # Hash headers included by the CUDA code by running it through the preprocessor. + if not do_not_hash_included_headers: + if verbose: + print('Preprocessing... ', end='', flush=True) + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + cuda_file_ext) + _run_cmd(_prepare_nvcc_cli('"%s" --preprocess -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir))) + with open(tmp_file, 'rb') as f: + bad_file_str = ('"' + cuda_file.replace('\\', '/') + '"').encode('utf-8') # __FILE__ in error check macros + good_file_str = ('"' + cuda_file_base + '"').encode('utf-8') + for ln in f: + if not ln.startswith(b'# ') and not ln.startswith(b'#line '): # ignore line number pragmas + ln = ln.replace(bad_file_str, good_file_str) + md5.update(ln) + md5.update(b'\n') + + # Select compiler options. + compile_opts = '' + if os.name == 'nt': + compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.lib') + compile_opts += ' --library-path="%s"' % (os.path.dirname(__file__) + r"\..\lib") # Find libraries during compilation. + elif os.name == 'posix': + compile_opts += '"%s"' % os.path.join(tf.sysconfig.get_lib(), 'python', '_pywrap_tensorflow_internal.so') + compile_opts += ' --compiler-options \'-fPIC -D_GLIBCXX_USE_CXX11_ABI=0\'' + else: + assert False # not Windows or Linux, w00t? + compile_opts += ' --gpu-architecture=%s' % _get_cuda_gpu_arch_string() + compile_opts += ' --use_fast_math' + for opt in extra_nvcc_options: + compile_opts += ' ' + opt + nvcc_cmd = _prepare_nvcc_cli(compile_opts) + + # Hash build configuration. 
+ md5.update(('nvcc_cmd: ' + nvcc_cmd).encode('utf-8') + b'\n') + md5.update(('tf.VERSION: ' + tf.VERSION).encode('utf-8') + b'\n') + md5.update(('cuda_cache_version_tag: ' + cuda_cache_version_tag).encode('utf-8') + b'\n') + + # Compile if not already compiled. + bin_file_ext = '.dll' if os.name == 'nt' else '.so' + cuda_cache_path = make_cache_dir_path() + bin_file = os.path.join(make_cache_dir_path(), cuda_file_name + '_' + md5.hexdigest() + bin_file_ext) + if not os.path.isfile(bin_file): + if verbose: + print('Compiling... ', end='', flush=True) + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_file = os.path.join(tmp_dir, cuda_file_name + '_tmp' + bin_file_ext) + _run_cmd(nvcc_cmd + ' "%s" --shared -o "%s" --keep --keep-dir "%s"' % (cuda_file, tmp_file, tmp_dir)) + os.makedirs(cuda_cache_path, exist_ok=True) + intermediate_file = os.path.join(cuda_cache_path, cuda_file_name + '_' + uuid.uuid4().hex + '_tmp' + bin_file_ext) + shutil.copyfile(tmp_file, intermediate_file) + os.rename(intermediate_file, bin_file) # atomic + + # Load. + if verbose: + print('Loading... ', end='', flush=True) + plugin = tf.load_op_library(bin_file) + + # Add to cache. + _plugin_cache[cuda_file] = plugin + if verbose: + print('Done.', flush=True) + return plugin + + except: + if verbose: + print('Failed!', flush=True) + raise + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu new file mode 100644 index 00000000..8eefcfbd --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_all.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +// TF-specific helpers. + +#define OP_CHECK_CUDA_ERROR(CTX, CUDA_CALL) do { cudaError_t err = CUDA_CALL; OP_REQUIRES(CTX, err == cudaSuccess, errors::Internal("Cuda error: ", cudaGetErrorName(err), "[", #CUDA_CALL, ";]")); } while (0) +#define OP_CHECK_GL_ERROR(CTX, GL_CALL) do { GL_CALL; GLenum err = glGetError(); OP_REQUIRES(CTX, err == GL_NO_ERROR, errors::Internal("OpenGL error: ", getGLErrorString(err), "[", #GL_CALL, ";]")); } while (0) + +// Cuda kernels and CPP all together. What an absolute compilation unit. + +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#include "../common/framework.h" +#include "../common/glutil.cpp" + +#include "../common/common.h" +#include "../common/common.cpp" + +#include "../common/rasterize.h" +#include "../common/rasterize_gl.cpp" +#include "../common/rasterize.cu" +#include "tf_rasterize.cu" + +#include "../common/interpolate.cu" +#include "tf_interpolate.cu" + +#include "../common/texture.cpp" +#include "../common/texture.cu" +#include "tf_texture.cu" + +#include "../common/antialias.cu" +#include "tf_antialias.cu" diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu new file mode 100644 index 00000000..9b14962a --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_antialias.cu @@ -0,0 +1,278 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct AntialiasFwdOp : public OpKernel +{ + AntialiasKernelParams m_attribs; + + AntialiasFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_attribs.tri_const)); + } + + void Compute(OpKernelContext* ctx) + { + AntialiasKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& color = ctx->input(0); + const Tensor& rasterOut = ctx->input(1); + const Tensor& pos = ctx->input(2); + const Tensor& tri = ctx->input(3); + + // Instance rendering mode? + p.instance_mode = pos.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0; + else + p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0; + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.n = (color.dims() > 0) ? color.dim_size(0) : 0; + p.height = (color.dims() > 1) ? color.dim_size(1) : 0; + p.width = (color.dims() > 2) ? color.dim_size(2) : 0; + p.channels = (color.dims() > 3) ? color.dim_size(3) : 0; + + // Sanity checks. 
+ OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions")); + if (p.instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out")); + } + + // Get input pointers. + p.color = color.flat().data(); + p.rasterOut = rasterOut.flat().data(); + p.tri = tri.flat().data(); + p.pos = pos.flat().data(); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate output tensor. 
+ Tensor* outputTensor = NULL; + TensorShape outputShape; + outputShape.AddDim(p.n); + outputShape.AddDim(p.height); + outputShape.AddDim(p.width); + outputShape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, outputShape, &outputTensor)); + p.output = outputTensor->flat().data(); + + // Allocate work buffer. One extra int4 for storing counters. + Tensor* workTensor = NULL; + TensorShape workShape; + workShape.AddDim(p.n * p.width * p.height * 8 + 4); // 8 int for a maximum of two work items per pixel. + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, workShape, &workTensor)); + p.workBuffer = (int4*)(workTensor->flat().data()); + + // Clear the work counters. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.rasterOut & 7), errors::Internal("raster_out input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4")); + + // Kernel parameters. + void* args[] = {&p}; + + // (Re-)calculate opposite vertex hash. + if (!p.evHash || !p.tri_const) + { + if (p.allocTriangles < p.numTriangles) + { + p.allocTriangles = max(p.allocTriangles, 64); + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // (Re-)allocate memory for the hash. + OP_CHECK_CUDA_ERROR(ctx, cudaFree(p.evHash)); + OP_CHECK_CUDA_ERROR(ctx, cudaMalloc(&p.evHash, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles) * sizeof(uint4))); + LOG(INFO) << "Increasing topology hash size to accommodate " << p.allocTriangles << " triangles"; + } + + // Clear the hash and launch the mesh kernel to populate it. 
+ OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.evHash, 0, p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles) * sizeof(uint4), stream)); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } + + // Copy input to output as a baseline. + OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.output, p.color, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + + // Choose launch parameters for the discontinuity finder kernel and launch. + dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream)); + + // Determine optimum block size for the persistent analysis kernel. + int device = 0; + int numCTA = 0; + int numSM = 0; + OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device)); + OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0)); + OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + + // Launch analysis kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } +}; + +REGISTER_OP("AntialiasFwd") + .Input ("color: float") + .Input ("raster_out: float") + .Input ("pos: float") + .Input ("tri: int32") + .Output ("output: float") + .Output ("work_buffer: int32") + .Attr ("tri_const: int"); + +REGISTER_KERNEL_BUILDER(Name("AntialiasFwd").Device(DEVICE_GPU), AntialiasFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. 
+ +struct AntialiasGradOp : public OpKernel +{ + AntialiasKernelParams m_attribs; + + AntialiasGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + } + + void Compute(OpKernelContext* ctx) + { + AntialiasKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& color = ctx->input(0); + const Tensor& rasterOut = ctx->input(1); + const Tensor& pos = ctx->input(2); + const Tensor& tri = ctx->input(3); + const Tensor& dy = ctx->input(4); + const Tensor& workBuffer = ctx->input(5); + + // Instance rendering mode? + p.instance_mode = pos.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + p.numVertices = (pos.dims() > 1) ? pos.dim_size(1) : 0; + else + p.numVertices = (pos.dims() > 0) ? pos.dim_size(0) : 0; + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.n = (color.dims() > 0) ? color.dim_size(0) : 0; + p.height = (color.dims() > 1) ? color.dim_size(1) : 0; + p.width = (color.dims() > 2) ? color.dim_size(2) : 0; + p.channels = (color.dims() > 3) ? color.dim_size(3) : 0; + + // Sanity checks. 
+ OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) > 0 && dy.dim_size(2) > 0 && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, color.dims() == 4 && color.dim_size(0) > 0 && color.dim_size(1) > 0 && color.dim_size(2) > 0 && color.dim_size(3) > 0, errors::InvalidArgument("color must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, rasterOut.dims() == 4 && rasterOut.dim_size(0) > 0 && rasterOut.dim_size(1) > 0 && rasterOut.dim_size(2) > 0 && rasterOut.dim_size(3) == 4, errors::InvalidArgument("raster_out must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, color.dim_size(1) == rasterOut.dim_size(1) && color.dim_size(2) == rasterOut.dim_size(2), errors::InvalidArgument("color and raster_out inputs must have same spatial dimensions")); + OP_REQUIRES(ctx, color.dim_size(1) == dy.dim_size(1) && color.dim_size(2) == dy.dim_size(2) && color.dim_size(3) == dy.dim_size(3), errors::InvalidArgument("color and dy inputs must have same dimensions")); + if (p.instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs color, raster_out, pos")); + OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n && pos.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out, pos")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, >0, 4] or [>0, 4]")); + OP_REQUIRES(ctx, rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size 
mismatch between inputs color, raster_out")); + OP_REQUIRES(ctx, dy.dim_size(0) == p.n && rasterOut.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs dy, color, raster_out")); + } + + // Get input pointers. + p.dy = dy.flat().data(); + p.color = color.flat().data(); + p.rasterOut = rasterOut.flat().data(); + p.tri = tri.flat().data(); + p.pos = pos.flat().data(); + p.workBuffer = (int4*)(workBuffer.flat().data()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate color gradient output tensor. + Tensor* gradColor = NULL; + TensorShape gradColorShape; + gradColorShape.AddDim(p.n); + gradColorShape.AddDim(p.height); + gradColorShape.AddDim(p.width); + gradColorShape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, gradColorShape, &gradColor)); + p.gradColor = gradColor->flat().data(); + + // Allocate position gradient output tensor. + Tensor* gradPos = NULL; + TensorShape gradPosShape; + if (p.instance_mode) + gradPosShape.AddDim(p.n); + gradPosShape.AddDim(p.numVertices); + gradPosShape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, gradPosShape, &gradPos)); + p.gradPos = gradPos->flat().data(); + + // Initialize all the stuff. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); // Gradient kernel work counter. + OP_CHECK_CUDA_ERROR(ctx, cudaMemcpyAsync(p.gradColor, p.dy, p.n * p.height * p.width * p.channels * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradPos, 0, (p.instance_mode ? p.n : 1) * p.numVertices * 4 * sizeof(float), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. 
+ OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.workBuffer & 15), errors::Internal("work_buffer internal tensor not aligned to int4")); + + // Launch the gradient kernel. + void* args[] = {&p}; + + int device = 0; + int numCTA = 0; + int numSM = 0; + OP_CHECK_CUDA_ERROR(ctx, cudaGetDevice(&device)); + OP_CHECK_CUDA_ERROR(ctx, cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0)); + OP_CHECK_CUDA_ERROR(ctx, cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + } +}; + +REGISTER_OP("AntialiasGrad") + .Input ("color: float") + .Input ("raster_out: float") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("dy: float") + .Input ("work_buffer: int32") + .Output ("grad_color: float") + .Output ("grad_pos: float"); + +REGISTER_KERNEL_BUILDER(Name("AntialiasGrad").Device(DEVICE_GPU), AntialiasGradOp); + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_interpolate.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_interpolate.cu new file mode 100644 index 00000000..612ce1af --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_interpolate.cu @@ -0,0 +1,301 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common op attribute parser. + +static __host__ void interpolateParseOpAttributes(OpKernelConstruction* ctx, InterpolateKernelParams& p, bool enableDA) +{ + if (enableDA) + { + OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs_all", &p.diff_attrs_all)); + if (!p.diff_attrs_all) + { + std::vector diff_attrs_vec; + OP_REQUIRES_OK(ctx, ctx->GetAttr("diff_attrs", &diff_attrs_vec)); + OP_REQUIRES(ctx, diff_attrs_vec.size() > 0, errors::InvalidArgument("differentiation enabled with empty diff_attrs list")); + OP_REQUIRES(ctx, diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, errors::InvalidArgument("too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)")); + p.numDiffAttr = diff_attrs_vec.size(); + memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int)); + } + } +} + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +template +struct InterpolateFwdOp : public OpKernel +{ + InterpolateKernelParams m_attribs; + + InterpolateFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA); + } + + void Compute(OpKernelContext* ctx) + { + InterpolateKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& attr = ctx->input(0); + const Tensor& rast = ctx->input(1); + const Tensor& tri = ctx->input(2); + const Tensor& rast_db = ctx->input(ENABLE_DA ? 3 : 2); + + // Instance rendering mode? + p.instance_mode = attr.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + { + p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0; + p.numAttr = (attr.dims() > 2) ? 
attr.dim_size(2) : 0; + } + else + { + p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0; + p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0; + } + p.numTriangles = (tri.dims() > 0) ? tri.dim_size(0) : 0; + p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0; + p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0; + p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0; + + // Sanity checks. + OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]")); + if (p.instance_mode) + OP_REQUIRES(ctx, attr.dim_size(0) == p.depth || attr.dim_size(0) == 1, errors::InvalidArgument("minibatch size mismatch between inputs rast, attr")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, rast_db.dims() == 4 && rast_db.dim_size(0) > 0 && rast_db.dim_size(1) > 0 && rast_db.dim_size(2) > 0 && rast_db.dim_size(3) == 4, errors::InvalidArgument("rast_db must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, rast_db.dim_size(1) == rast.dim_size(1) && rast_db.dim_size(2) == rast.dim_size(2), errors::InvalidArgument("spatial size mismatch between inputs rast and rast_db")); + OP_REQUIRES(ctx, rast_db.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, rast_db")); + } + + // All diff attrs mode. + if (p.diff_attrs_all) + p.numDiffAttr = p.numAttr; + + // Get input pointers. + p.attr = attr.flat().data(); + p.rast = rast.flat().data(); + p.tri = tri.flat().data(); + p.attrBC = (p.instance_mode && attr.dim_size(0) == 1) ? 1 : 0; + p.rastDB = ENABLE_DA ? 
rast_db.flat().data() : 0; + + // Allocate main output tensor. + Tensor* out_tensor = NULL; + TensorShape out_shape; + out_shape.AddDim(p.depth); + out_shape.AddDim(p.height); + out_shape.AddDim(p.width); + out_shape.AddDim(p.numAttr); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor)); + p.out = out_tensor->flat().data(); + + // Allocate pixel differential output tensor. + Tensor* out_da_tensor = NULL; + out_shape.set_dim(3, p.numDiffAttr * 2); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, out_shape, &out_da_tensor)); + p.outDA = ENABLE_DA ? out_da_tensor->flat().data() : 0; + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4")); + if (ENABLE_DA) + OP_REQUIRES(ctx, !((uintptr_t)p.outDA & 7), errors::Internal("out_da output tensor not aligned to float2")); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DA ? 
(void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("InterpolateFwd") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Output ("out: float") + .Output ("out_da: float"); + +REGISTER_OP("InterpolateFwdDa") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("rast_db: float") + .Output ("out: float") + .Output ("out_da: float") + .Attr ("diff_attrs_all: int") + .Attr ("diff_attrs: list(int)"); + +REGISTER_KERNEL_BUILDER(Name("InterpolateFwd") .Device(DEVICE_GPU), InterpolateFwdOp); +REGISTER_KERNEL_BUILDER(Name("InterpolateFwdDa").Device(DEVICE_GPU), InterpolateFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +template +struct InterpolateGradOp : public OpKernel +{ + InterpolateKernelParams m_attribs; + + InterpolateGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + interpolateParseOpAttributes(ctx, m_attribs, ENABLE_DA); + } + + void Compute(OpKernelContext* ctx) + { + InterpolateKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Get input. + const Tensor& attr = ctx->input(0); + const Tensor& rast = ctx->input(1); + const Tensor& tri = ctx->input(2); + const Tensor& dy = ctx->input(3); + const Tensor& rast_db = ctx->input(ENABLE_DA ? 4 : 3); + const Tensor& dda = ctx->input(ENABLE_DA ? 5 : 3); + + // Instance rendering mode? + p.instance_mode = attr.dims() > 2; + + // Extract input dimensions. + if (p.instance_mode) + { + p.numVertices = (attr.dims() > 1) ? attr.dim_size(1) : 0; + p.numAttr = (attr.dims() > 2) ? attr.dim_size(2) : 0; + } + else + { + p.numVertices = (attr.dims() > 0) ? attr.dim_size(0) : 0; + p.numAttr = (attr.dims() > 1) ? attr.dim_size(1) : 0; + } + p.numTriangles = (tri.dims() > 0) ? 
tri.dim_size(0) : 0; + p.depth = (rast.dims() > 0) ? rast.dim_size(0) : 0; + p.height = (rast.dims() > 1) ? rast.dim_size(1) : 0; + p.width = (rast.dims() > 2) ? rast.dim_size(2) : 0; + int attr_depth = p.instance_mode ? (attr.dims() > 1 ? attr.dim_size(0) : 0) : 1; + + // Sanity checks. + OP_REQUIRES(ctx, rast.dims() == 4 && rast.dim_size(0) > 0 && rast.dim_size(1) > 0 && rast.dim_size(2) > 0 && rast.dim_size(3) == 4, errors::InvalidArgument("rast must have shape[>0, >0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, (attr.dims() == 2 || attr.dims() == 3) && attr.dim_size(0) > 0 && attr.dim_size(1) > 0 && (attr.dims() == 2 || attr.dim_size(2) > 0), errors::InvalidArgument("attr must have shape [>0, >0, >0] or [>0, >0]")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) > 0 && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) > 0, errors::InvalidArgument("dy must have shape [>0, height, width, >0]")); + OP_REQUIRES(ctx, dy.dim_size(3) == p.numAttr, errors::InvalidArgument("argument count mismatch between inputs dy, attr")); + OP_REQUIRES(ctx, (attr_depth == p.depth || attr_depth == 1) && dy.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between inputs rast, dy, attr")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, dda.dims() == 4 && dda.dim_size(0) > 0 && dda.dim_size(1) == p.height && dda.dim_size(2) == p.width, errors::InvalidArgument("dda must have shape [>0, height, width, ?]")); + OP_REQUIRES(ctx, dda.dim_size(0) == p.depth, errors::InvalidArgument("minibatch size mismatch between rast, dda")); + } + + // All diff attrs mode. + if (p.diff_attrs_all) + p.numDiffAttr = p.numAttr; + + // Get input pointers. + p.attr = attr.flat().data(); + p.rast = rast.flat().data(); + p.tri = tri.flat().data(); + p.dy = dy.flat().data(); + p.rastDB = ENABLE_DA ? 
rast_db.flat().data() : 0; + p.dda = ENABLE_DA ? dda.flat().data() : 0; + p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0; + + // Allocate attribute gradient output tensor. + Tensor* grad_attr_tensor = NULL; + TensorShape grad_attr_shape; + if (p.instance_mode) + grad_attr_shape.AddDim(attr_depth); + grad_attr_shape.AddDim(p.numVertices); + grad_attr_shape.AddDim(p.numAttr); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_attr_shape, &grad_attr_tensor)); + p.gradAttr = grad_attr_tensor->flat().data(); + + // Allocate bary gradient output tensor. + Tensor* grad_rast_tensor = NULL; + TensorShape grad_rast_shape; + grad_rast_shape.AddDim(p.depth); + grad_rast_shape.AddDim(p.height); + grad_rast_shape.AddDim(p.width); + grad_rast_shape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_rast_shape, &grad_rast_tensor)); + p.gradRaster = grad_rast_tensor->flat().data(); + + // Allocate bary pixel diff gradient output tensor. + if (ENABLE_DA) + { + Tensor* grad_rast_db_tensor = NULL; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_rast_shape, &grad_rast_db_tensor)); + p.gradRasterDB = grad_rast_db_tensor->flat().data(); + } + + // Clear attribute gradients. + cudaMemsetAsync(p.gradAttr, 0, attr_depth * p.numVertices * p.numAttr * sizeof(float), stream); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.rast & 15), errors::Internal("rast input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradRaster & 15), errors::Internal("grad_rast output tensor not aligned to float4")); + if (ENABLE_DA) + { + OP_REQUIRES(ctx, !((uintptr_t)p.dda & 7), errors::Internal("dda input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.rastDB & 15), errors::Internal("rast_db input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradRasterDB & 15), errors::Internal("grad_rast_db output tensor not aligned to float4")); + } + + // Choose launch parameters. 
+ dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DA ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("InterpolateGrad") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("dy: float") + .Output ("grad_attr: float") + .Output ("grad_rast: float") + ; + +REGISTER_OP("InterpolateGradDa") + .Input ("attr: float") + .Input ("rast: float") + .Input ("tri: int32") + .Input ("dy: float") + .Input ("rast_db: float") + .Input ("dda: float") + .Output ("grad_attr: float") + .Output ("grad_rast: float") + .Output ("grad_rast_db: float") + .Attr ("diff_attrs_all: int") + .Attr ("diff_attrs: list(int)"); + ; + +REGISTER_KERNEL_BUILDER(Name("InterpolateGrad") .Device(DEVICE_GPU), InterpolateGradOp); +REGISTER_KERNEL_BUILDER(Name("InterpolateGradDa").Device(DEVICE_GPU), InterpolateGradOp); + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_rasterize.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_rasterize.cu new file mode 100644 index 00000000..4d0a2616 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_rasterize.cu @@ -0,0 +1,242 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct RasterizeFwdOp : public OpKernel +{ + RasterizeGLState m_glState; // OpenGL-related persistent state. + int m_tri_const; // 1 if triangle array is known to be constant. + + RasterizeFwdOp(OpKernelConstruction* ctx): + OpKernel(ctx) + { + memset(&m_glState, 0, sizeof(RasterizeGLState)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("enable_db", &m_glState.enableDB)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("tri_const", &m_tri_const)); + } + + void Compute(OpKernelContext* ctx) + { + cudaStream_t stream = ctx->eigen_device().stream(); + + // Check that input shapes are correct. + const Tensor& pos = ctx->input(0); + const Tensor& tri = ctx->input(1); + const Tensor& resolution = ctx->input(2); + const Tensor& ranges = ctx->input(3); + + // Determine number of outputs + int num_outputs = m_glState.enableDB ? 2 : 1; + + // Determine instance mode and check input dimensions. 
+ bool instance_mode = pos.dims() > 2; + if (instance_mode) + { + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) > 0 && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("instance mode - pos must have shape [>0, >0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]")); + } + else + { + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("range mode - pos must have shape [>0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, resolution.dims() == 1 && resolution.dim_size(0) == 2, errors::InvalidArgument("resolution must have shape [2]")); + OP_REQUIRES(ctx, ranges.dims() == 2 && ranges.dim_size(0) > 0 && ranges.dim_size(1) == 2, errors::InvalidArgument("range mode - ranges must have shape [>0, 2]")); + } + + // Get output shape. + const int32_t* res_in = resolution.flat().data(); // This is in CPU memory. + int height = res_in[0]; + int width = res_in[1]; + int depth = instance_mode ? pos.dim_size(0) : ranges.dim_size(0); + OP_REQUIRES(ctx, height > 0 && width > 0, errors::InvalidArgument("resolution must be [>0, >0]")); + + // Get position and triangle buffer sizes in int32/float32. + int posCount = 4 * pos.dim_size(0) * (instance_mode ? pos.dim_size(1) : 1); + int triCount = 3 * tri.dim_size(0); + + // Init context and GL? + bool initCtx = !m_glState.glFBO; + if (initCtx) + { + const DeviceBase::GpuDeviceInfo* g = ctx->device()->tensorflow_gpu_device_info(); + int cudaDeviceIdx = g ? g->gpu_id : -1; + rasterizeInitGLContext(ctx, m_glState, cudaDeviceIdx); // In common/rasterize.cpp + } + else + setGLContext(m_glState.glctx); // (Re-)Activate GL context. 
+ + // Resize all buffers. + bool changes = false; + rasterizeResizeBuffers(ctx, m_glState, changes, posCount, triCount, width, height, depth); // In common/rasterize_gl.cpp + if (changes) + { +#ifdef _WIN32 + // Workaround for occasional blank first frame on Windows. + releaseGLContext(); + setGLContext(m_glState.glctx); +#endif + } + + // Copy input data to GL and render. + const float* posPtr = pos.flat().data(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.flat().data(); // This is in CPU memory. + const int32_t* triPtr = (initCtx || !m_tri_const) ? tri.flat().data() : NULL; // Copy triangles only if needed. + int vtxPerInstance = instance_mode ? pos.dim_size(1) : 0; + rasterizeRender(ctx, m_glState, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, -1); + + // Allocate output tensors. + TensorShape output_shape; + output_shape.AddDim(depth); + output_shape.AddDim(height); + output_shape.AddDim(width); + output_shape.AddDim(4); + float* outputPtr[2]; + for (int i=0; i < 2; i++) + { + if (i >= num_outputs) + output_shape.set_dim(3, 0); // Zero channels for unwanted out_db tensor. + Tensor* output_tensor = NULL; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, output_shape, &output_tensor)); + if (i < num_outputs) + outputPtr[i] = output_tensor->flat().data(); + } + + // Copy rasterized results into CUDA buffers. + rasterizeCopyResults(ctx, m_glState, stream, outputPtr, width, height, depth); + + // Done. Release GL context. 
+ releaseGLContext(); + } +}; + +REGISTER_OP("RasterizeFwd") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("resolution: int32") + .Input ("ranges: int32") + .Output ("out: float") + .Output ("out_db: float") + .Attr ("enable_db: int") + .Attr ("tri_const: int"); + +REGISTER_KERNEL_BUILDER(Name("RasterizeFwd").Device(DEVICE_GPU).HostMemory("resolution").HostMemory("ranges"), RasterizeFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +template +struct RasterizeGradOp : public OpKernel +{ + RasterizeGradParams m_attribs; + + RasterizeGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + } + + void Compute(OpKernelContext* ctx) + { + RasterizeGradParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + + // Input tensors. + const Tensor& pos = ctx->input(0); + const Tensor& tri = ctx->input(1); + const Tensor& out = ctx->input(2); + const Tensor& dy = ctx->input(3); + const Tensor& ddb = ctx->input(ENABLE_DB ? 4 : 3); + + // Determine instance mode. + p.instance_mode = (pos.dims() > 2) ? 1 : 0; + + // Shape is taken from the rasterizer output tensor. + OP_REQUIRES(ctx, out.dims() == 4, errors::InvalidArgument("out must be rank-4")); + p.depth = out.dim_size(0); + p.height = out.dim_size(1); + p.width = out.dim_size(2); + OP_REQUIRES(ctx, p.depth > 0 && p.height > 0 && p.width > 0, errors::InvalidArgument("resolution must be [>0, >0, >0]")); + + // Check other shapes. 
+ if (p.instance_mode) + OP_REQUIRES(ctx, pos.dims() == 3 && pos.dim_size(0) == p.depth && pos.dim_size(1) > 0 && pos.dim_size(2) == 4, errors::InvalidArgument("pos must have shape [depth, >0, 4]")); + else + OP_REQUIRES(ctx, pos.dims() == 2 && pos.dim_size(0) > 0 && pos.dim_size(1) == 4, errors::InvalidArgument("pos must have shape [>0, 4]")); + OP_REQUIRES(ctx, tri.dims() == 2 && tri.dim_size(0) > 0 && tri.dim_size(1) == 3, errors::InvalidArgument("tri must have shape [>0, 3]")); + OP_REQUIRES(ctx, out.dims() == 4 && out.dim_size(0) == p.depth && out.dim_size(1) == p.height && out.dim_size(2) == p.width && out.dim_size(3) == 4, errors::InvalidArgument("out must have shape [depth, height, width, 4]")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.depth && dy.dim_size(1) == p.height && dy.dim_size(2) == p.width && dy.dim_size(3) == 4, errors::InvalidArgument("dy must have shape [depth, height, width, 4]")); + if (ENABLE_DB) + OP_REQUIRES(ctx, ddb.dims() == 4 && ddb.dim_size(0) == p.depth && ddb.dim_size(1) == p.height && ddb.dim_size(2) == p.width && ddb.dim_size(3) == 4, errors::InvalidArgument("ddb must have shape [depth, height, width, 4]")); + + // Populate parameters. + p.numTriangles = tri.dim_size(0); + p.numVertices = p.instance_mode ? pos.dim_size(1) : pos.dim_size(0); + p.pos = pos.flat().data(); + p.tri = tri.flat().data(); + p.out = out.flat().data(); + p.dy = dy.flat().data(); + p.ddb = ENABLE_DB ? ddb.flat().data() : 0; + + // Set up pixel position to clip space x, y transform. + p.xs = 2.f / (float)p.width; + p.xo = 1.f / (float)p.width - 1.f; + p.ys = 2.f / (float)p.height; + p.yo = 1.f / (float)p.height - 1.f; + + // Allocate output tensor for position gradients. 
+ Tensor* grad_tensor = NULL; + TensorShape grad_shape; + if (p.instance_mode) + grad_shape.AddDim(p.depth); + grad_shape.AddDim(p.numVertices); + grad_shape.AddDim(4); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_shape, &grad_tensor)); + p.grad = grad_tensor->flat().data(); + + // Clear the output buffers. + size_t gradBytes = (p.instance_mode ? p.depth : 1) * p.numVertices * 4 * sizeof(float); + cudaMemsetAsync(p.grad, 0, gradBytes, stream); + + // Verify that buffers are aligned to allow float2/float4 operations. + OP_REQUIRES(ctx, !((uintptr_t)p.pos & 15), errors::Internal("pos input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy input tensor not aligned to float2")); + if (ENABLE_DB) + OP_REQUIRES(ctx, !((uintptr_t)p.ddb & 15), errors::Internal("ddb input tensor not aligned to float4")); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = ENABLE_DB ? 
(void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("RasterizeGrad") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("out: float") + .Input ("dy: float") + .Output ("grad: float"); + +REGISTER_OP("RasterizeGradDb") + .Input ("pos: float") + .Input ("tri: int32") + .Input ("out: float") + .Input ("dy: float") + .Input ("ddb: float") + .Output ("grad: float"); + +REGISTER_KERNEL_BUILDER(Name("RasterizeGrad") .Device(DEVICE_GPU), RasterizeGradOp); +REGISTER_KERNEL_BUILDER(Name("RasterizeGradDb").Device(DEVICE_GPU), RasterizeGradOp); + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_texture.cu b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_texture.cu new file mode 100644 index 00000000..c5382fed --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/tensorflow/tf_texture.cu @@ -0,0 +1,525 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +//------------------------------------------------------------------------ +// Common op attribute parser. + +static __host__ void parseOpAttributes(OpKernelConstruction* ctx, TextureKernelParams& p) +{ + // Mip and filter modes. 
+ OP_REQUIRES_OK(ctx, ctx->GetAttr("filter_mode", &p.filterMode)); + OP_REQUIRES(ctx, p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, errors::InvalidArgument("filter_mode unsupported")); + p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); + + // Mip level clamp. + if (p.enableMip) + { + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_mip_level", &p.mipLevelLimit)); + OP_REQUIRES(ctx, p.mipLevelLimit >= -1, errors::InvalidArgument("invalid max_mip_level")); + ctx->GetAttr("tex_const", &p.texConst); // Only available in forward op. + } + + // Boundary mode. + OP_REQUIRES_OK(ctx, ctx->GetAttr("boundary_mode", &p.boundaryMode)); + OP_REQUIRES(ctx, p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, errors::InvalidArgument("boundary_mode unsupported")); +} + +//------------------------------------------------------------------------ +// Forward TensorFlow op. + +struct TextureFwdOp : public OpKernel +{ + TextureKernelParams m_attribs; + PersistentTensor m_persistentMipTensor; // Used if texture is constant and mips are enabled. + bool m_persistentMipTensorInitialized; + + TextureFwdOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + m_persistentMipTensorInitialized = false; + parseOpAttributes(ctx, m_attribs); + } + + void Compute(OpKernelContext* ctx) + { + TextureKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); + + // Get input. + const Tensor& tex = ctx->input(0); + const Tensor& uv = ctx->input(1); + const Tensor& uv_da = ctx->input(p.enableMip ? 2 : 1); + + // Extract input dimensions. + p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; + p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; + p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; + p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; + if (!cube_mode) + { + p.texHeight = (tex.dims() > 1) ? 
tex.dim_size(1) : 0; + p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; + } + else + { + p.texHeight = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; + p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; + } + + // Sanity checks. + if (!cube_mode) + { + OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); + } + else + { + OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); + OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); + } + OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); + OP_REQUIRES(ctx, p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), errors::InvalidArgument("texture size too large")); + if (p.enableMip) + { + if (!cube_mode) + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); + else + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && 
uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); + } + + // Get input pointers. + p.tex[0] = tex.flat().data(); + p.uv = uv.flat().data(); + p.uvDA = p.enableMip ? uv_da.flat().data() : 0; + + // Allocate output tensor. + Tensor* out_tensor = NULL; + TensorShape out_shape; + out_shape.AddDim(p.n); + out_shape.AddDim(p.imgHeight); + out_shape.AddDim(p.imgWidth); + out_shape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out_tensor)); + p.out = out_tensor->flat().data(); + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + float* pmip = 0; + if (p.enableMip) + { + // Generate mip offsets. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(ctx, p, mipOffsets); + + // Mip output tensor. + Tensor* mip_tensor = NULL; + TensorShape mip_shape; + mip_shape.AddDim(mipTotal); + + // If texture is constant, calculate mip stack only once. + bool computeMip = true; + if (p.texConst) + { + // First execution? + if (!m_persistentMipTensorInitialized) + { + // Allocate a persistent mip tensor. + OP_REQUIRES_OK(ctx, ctx->allocate_persistent(DT_FLOAT, mip_shape, &m_persistentMipTensor, &mip_tensor)); + m_persistentMipTensorInitialized = true; + } + else + { + // Reuse the persistent tensor, do not recompute mip levels. + mip_tensor = m_persistentMipTensor.AccessTensor(ctx); + computeMip = false; + } + + // Set as output tensor as well. + ctx->set_output(1, *mip_tensor); + } + else + { + // Allocate an output tensor as usual. 
+ OP_REQUIRES_OK(ctx, ctx->allocate_output(1, mip_shape, &mip_tensor)); + } + + pmip = mip_tensor->flat().data(); // Pointer to data. + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + + // Build mip levels if needed. + if (computeMip) + { + for (int i=1; i <= p.mipLevelMax; i++) + { + int2 ms = mipLevelSize(p, i); + int3 sz = make_int3(ms.x, ms.y, p.texDepth); + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); + dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); + p.mipLevelOut = i; + + void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); + } + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. 
+ if (!cube_mode) + OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); + if ((p.channels & 3) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.out & 15), errors::Internal("out output tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip output tensor not aligned to float4")); + } + if ((p.channels & 1) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.out & 7), errors::Internal("out output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip output tensor not aligned to float2")); + } + if (!cube_mode) + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); + else + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); + + // Choose launch parameters for texture lookup kernel. + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + // Choose kernel based on filter mode, cube mode, and datatype. 
+ void* func_tbl[TEX_MODE_COUNT * 3 * 2] = { + (void*)TextureFwdKernelNearest1, + (void*)TextureFwdKernelNearest2, + (void*)TextureFwdKernelNearest4, + (void*)TextureFwdKernelLinear1, + (void*)TextureFwdKernelLinear2, + (void*)TextureFwdKernelLinear4, + (void*)TextureFwdKernelLinearMipmapNearest1, + (void*)TextureFwdKernelLinearMipmapNearest2, + (void*)TextureFwdKernelLinearMipmapNearest4, + (void*)TextureFwdKernelLinearMipmapLinear1, + (void*)TextureFwdKernelLinearMipmapLinear2, + (void*)TextureFwdKernelLinearMipmapLinear4, + (void*)TextureFwdKernelCubeNearest1, + (void*)TextureFwdKernelCubeNearest2, + (void*)TextureFwdKernelCubeNearest4, + (void*)TextureFwdKernelCubeLinear1, + (void*)TextureFwdKernelCubeLinear2, + (void*)TextureFwdKernelCubeLinear4, + (void*)TextureFwdKernelCubeLinearMipmapNearest1, + (void*)TextureFwdKernelCubeLinearMipmapNearest2, + (void*)TextureFwdKernelCubeLinearMipmapNearest4, + (void*)TextureFwdKernelCubeLinearMipmapLinear1, + (void*)TextureFwdKernelCubeLinearMipmapLinear2, + (void*)TextureFwdKernelCubeLinearMipmapLinear4, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; + func_idx = func_idx * 3 + channel_div_idx; + + // Launch kernel. 
+ OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + } +}; + +REGISTER_OP("TextureFwd") + .Input ("tex: float") + .Input ("uv: float") + .Output ("out: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureFwdMip") + .Input ("tex: float") + .Input ("uv: float") + .Input ("uv_da: float") + .Output ("out: float") + .Output ("mip: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("tex_const: int") + .Attr ("max_mip_level: int"); + +REGISTER_KERNEL_BUILDER(Name("TextureFwd") .Device(DEVICE_GPU), TextureFwdOp); +REGISTER_KERNEL_BUILDER(Name("TextureFwdMip").Device(DEVICE_GPU), TextureFwdOp); + +//------------------------------------------------------------------------ +// Gradient TensorFlow op. + +struct TextureGradOp : public OpKernel +{ + TextureKernelParams m_attribs; + + TextureGradOp(OpKernelConstruction* ctx): OpKernel(ctx) + { + memset(&m_attribs, 0, sizeof(m_attribs)); + parseOpAttributes(ctx, m_attribs); + } + + void Compute(OpKernelContext* ctx) + { + TextureKernelParams& p = m_attribs; + cudaStream_t stream = ctx->eigen_device().stream(); + bool cube_mode = (p.boundaryMode == TEX_BOUNDARY_MODE_CUBE); + + // Get input. + const Tensor& tex = ctx->input(0); + const Tensor& uv = ctx->input(1); + const Tensor& dy = ctx->input(2); + const Tensor& uv_da = ctx->input(p.enableMip ? 3 : 2); + const Tensor& mip = ctx->input(p.enableMip ? 4 : 2); + + // Extract input dimensions. + p.n = (uv.dims() > 0) ? uv.dim_size(0) : 0; + p.imgHeight = (uv.dims() > 1) ? uv.dim_size(1) : 0; + p.imgWidth = (uv.dims() > 2) ? uv.dim_size(2) : 0; + p.texDepth = (tex.dims() > 0) ? tex.dim_size(0) : 0; + if (!cube_mode) + { + p.texHeight = (tex.dims() > 1) ? tex.dim_size(1) : 0; + p.texWidth = (tex.dims() > 2) ? tex.dim_size(2) : 0; + p.channels = (tex.dims() > 3) ? tex.dim_size(3) : 0; + } + else + { + p.texHeight = (tex.dims() > 2) ? 
tex.dim_size(2) : 0; + p.texWidth = (tex.dims() > 3) ? tex.dim_size(3) : 0; + p.channels = (tex.dims() > 4) ? tex.dim_size(4) : 0; + } + + // Sanity checks. + if (!cube_mode) + { + OP_REQUIRES(ctx, tex.dims() == 4 && tex.dim_size(0) > 0 && tex.dim_size(1) > 0 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0, errors::InvalidArgument("tex must have shape[>0, >0, >0, >0]")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 2, errors::InvalidArgument("uv must have shape [>0, >0, >0, 2]")); + } + else + { + OP_REQUIRES(ctx, tex.dims() == 5 && tex.dim_size(0) > 0 && tex.dim_size(1) == 6 && tex.dim_size(2) > 0 && tex.dim_size(3) > 0 && tex.dim_size(4) > 0, errors::InvalidArgument("tex must have shape[>0, 6, >0, >0, >0] in cube map mode")); + OP_REQUIRES(ctx, uv.dims() == 4 && uv.dim_size(0) > 0 && uv.dim_size(1) > 0 && uv.dim_size(2) > 0 && uv.dim_size(3) == 3, errors::InvalidArgument("uv must have shape [>0, >0, >0, 3] in cube map mode")); + OP_REQUIRES(ctx, tex.dim_size(2) == tex.dim_size(3), errors::InvalidArgument("texture shape must be square in cube map mode")); + } + OP_REQUIRES(ctx, tex.dim_size(0) == 1 || tex.dim_size(0) == p.n, errors::InvalidArgument("minibatch size mismatch between inputs tex, uv")); + OP_REQUIRES(ctx, dy.dims() == 4 && dy.dim_size(0) == p.n && dy.dim_size(1) == p.imgHeight && dy.dim_size(2) == p.imgWidth && dy.dim_size(3) == p.channels, errors::InvalidArgument("dy must have shape [minibatch_size, height, width, channels]")); + if (p.enableMip) + { + if (!cube_mode) + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 4, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 4]")); + else + OP_REQUIRES(ctx, uv_da.dims() == 4 && uv_da.dim_size(0) == p.n && uv_da.dim_size(1) == p.imgHeight && uv_da.dim_size(2) == p.imgWidth && uv_da.dim_size(3) == 
6, errors::InvalidArgument("uv_da must have shape [minibatch_size, height, width, 6] in cube map mode")); + } + + // Get input pointers. + p.tex[0] = tex.flat().data(); + p.uv = uv.flat().data(); + p.dy = dy.flat().data(); + p.uvDA = p.enableMip ? uv_da.flat().data() : 0; + float* pmip = p.enableMip ? (float*)mip.flat().data() : 0; + + // Allocate output tensor for tex gradient. + Tensor* grad_tex_tensor = NULL; + TensorShape grad_tex_shape; + grad_tex_shape.AddDim(p.texDepth); + if (cube_mode) + grad_tex_shape.AddDim(6); + grad_tex_shape.AddDim(p.texHeight); + grad_tex_shape.AddDim(p.texWidth); + grad_tex_shape.AddDim(p.channels); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, grad_tex_shape, &grad_tex_tensor)); + p.gradTex[0] = grad_tex_tensor->flat().data(); + + // Allocate output tensor for uv gradient. + if (p.filterMode != TEX_MODE_NEAREST) + { + TensorShape grad_uv_shape; + Tensor* grad_uv_tensor = NULL; + grad_uv_shape.AddDim(p.n); + grad_uv_shape.AddDim(p.imgHeight); + grad_uv_shape.AddDim(p.imgWidth); + grad_uv_shape.AddDim(uv.dim_size(3)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, grad_uv_shape, &grad_uv_tensor)); + p.gradUV = grad_uv_tensor->flat().data(); + + // Allocate output tensor for uv_da gradient. + if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + Tensor* grad_uv_da_tensor = NULL; + grad_uv_shape.set_dim(3, uv_da.dim_size(3)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, grad_uv_shape, &grad_uv_da_tensor)); + p.gradUVDA = grad_uv_da_tensor->flat().data(); + } + } + + // Choose kernel variants based on channel count. + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + Tensor grad_mip_tensor; + float* pgradMip = 0; + if (p.enableMip) + { + // Generate mip offsets. 
+ int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(ctx, p, mipOffsets); + + // Get space for temporary mip gradients. + TensorShape grad_mip_shape; + grad_mip_shape.AddDim(mipTotal); + ctx->allocate_temp(DT_FLOAT, grad_mip_shape, &grad_mip_tensor); + pgradMip = grad_mip_tensor.flat().data(); + for (int i=1; i <= p.mipLevelMax; i++) + { + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients. + } + + // Clear mip gradients. + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(pgradMip, 0, mipTotal * sizeof(float), stream)); + } + + // Initialize texture gradients to zero. + int texBytes = p.texHeight * p.texWidth * p.texDepth * p.channels * sizeof(float); + if (cube_mode) + texBytes *= 6; + OP_CHECK_CUDA_ERROR(ctx, cudaMemsetAsync(p.gradTex[0], 0, texBytes, stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + { + OP_REQUIRES(ctx, !((uintptr_t)p.uv & 7), errors::Internal("uv input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUV & 7), errors::Internal("grad_uv output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 15), errors::Internal("uv_da input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 15), errors::Internal("grad_uv_da output tensor not aligned to float4")); + } + else + { + OP_REQUIRES(ctx, !((uintptr_t)p.uvDA & 7), errors::Internal("uv_da input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradUVDA & 7), errors::Internal("grad_uv_da output tensor not aligned to float2")); + } + if ((p.channels & 3) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 15), errors::Internal("tex input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 15), errors::Internal("grad_tex output tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 15), 
errors::Internal("dy input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 15), errors::Internal("mip input tensor not aligned to float4")); + OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 15), errors::Internal("internal mip gradient tensor not aligned to float4")); + } + if ((p.channels & 1) == 0) + { + OP_REQUIRES(ctx, !((uintptr_t)p.tex[0] & 7), errors::Internal("tex input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.gradTex[0] & 7), errors::Internal("grad_tex output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)p.dy & 7), errors::Internal("dy output tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pmip & 7), errors::Internal("mip input tensor not aligned to float2")); + OP_REQUIRES(ctx, !((uintptr_t)pgradMip & 7), errors::Internal("internal mip gradient tensor not aligned to float2")); + } + + // Choose launch parameters for main gradient kernel. + void* args[] = {&p}; + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + void* func_tbl[TEX_MODE_COUNT * 2] = { + (void*)TextureGradKernelNearest, + (void*)TextureGradKernelLinear, + (void*)TextureGradKernelLinearMipmapNearest, + (void*)TextureGradKernelLinearMipmapLinear, + (void*)TextureGradKernelCubeNearest, + (void*)TextureGradKernelCubeLinear, + (void*)TextureGradKernelCubeLinearMipmapNearest, + (void*)TextureGradKernelCubeLinearMipmapLinear, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; + + // Launch main gradient kernel. + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Launch kernel to pull gradients from mip levels. 
+ if (p.enableMip) + { + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1)); + int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float); + + void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 }; + OP_CHECK_CUDA_ERROR(ctx, cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream)); + } + } +}; + +REGISTER_OP("TextureGradNearest") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Output ("grad_tex: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureGradLinear") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int"); + +REGISTER_OP("TextureGradLinearMipmapNearest") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Input ("uv_da: float") + .Input ("mip: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("max_mip_level: int"); + +REGISTER_OP("TextureGradLinearMipmapLinear") + .Input ("tex: float") + .Input ("uv: float") + .Input ("dy: float") + .Input ("uv_da: float") + .Input ("mip: float") + .Output ("grad_tex: float") + .Output ("grad_uv: float") + .Output ("grad_uv_da: float") + .Attr ("filter_mode: int") + .Attr ("boundary_mode: int") + .Attr ("max_mip_level: int"); + +REGISTER_KERNEL_BUILDER(Name("TextureGradNearest") .Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinear") .Device(DEVICE_GPU), TextureGradOp); +REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapNearest").Device(DEVICE_GPU), TextureGradOp); 
+REGISTER_KERNEL_BUILDER(Name("TextureGradLinearMipmapLinear") .Device(DEVICE_GPU), TextureGradOp); + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/__init__.py new file mode 100644 index 00000000..d28f95e7 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +from .ops import RasterizeCudaContext, RasterizeGLContext, get_log_level, set_log_level, rasterize, DepthPeeler, interpolate, texture, texture_construct_mip, antialias, antialias_construct_topology_hash +__all__ = ["RasterizeCudaContext", "RasterizeGLContext", "get_log_level", "set_log_level", "rasterize", "DepthPeeler", "interpolate", "texture", "texture_construct_mip", "antialias", "antialias_construct_topology_hash"] diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/ops.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/ops.py new file mode 100644 index 00000000..f366c022 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/ops.py @@ -0,0 +1,729 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import importlib +import logging +import numpy as np +import os +import torch +import torch.utils.cpp_extension + +#---------------------------------------------------------------------------- +# C++/Cuda plugin compiler/loader. + +_cached_plugin = {} +def _get_plugin(gl=False): + assert isinstance(gl, bool) + + # Return cached plugin if already loaded. + if _cached_plugin.get(gl, None) is not None: + return _cached_plugin[gl] + + # Make sure we can find the necessary compiler and libary binaries. + if os.name == 'nt': + lib_dir = os.path.dirname(__file__) + r"\..\lib" + def find_cl_path(): + import glob + def get_sort_key(x): + # Primary criterion is VS version, secondary is edition, third is internal MSVC version. + x = x.split('\\')[3:] + x[1] = {'BuildTools': '~0', 'Community': '~1', 'Pro': '~2', 'Professional': '~3', 'Enterprise': '~4'}.get(x[1], x[1]) + return x + vs_relative_path = r"\Microsoft Visual Studio\*\*\VC\Tools\MSVC\*\bin\Hostx64\x64" + paths = glob.glob(r"C:\Program Files" + vs_relative_path) + paths += glob.glob(r"C:\Program Files (x86)" + vs_relative_path) + if paths: + return sorted(paths, key=get_sort_key)[-1] + + # If cl.exe is not on path, try to find it. + if os.system("where cl.exe >nul 2>nul") != 0: + cl_path = find_cl_path() + if cl_path is None: + raise RuntimeError("Could not locate a supported Microsoft Visual C++ installation") + os.environ['PATH'] += ';' + cl_path + + # Compiler options. + common_opts = ['-DNVDR_TORCH'] + cc_opts = [] + if os.name == 'nt': + cc_opts += ['/wd4067', '/wd4624'] # Disable warnings in torch headers. + + # Linker options for the GL-interfacing plugin. 
+ ldflags = [] + if gl: + if os.name == 'posix': + ldflags = ['-lGL', '-lEGL'] + elif os.name == 'nt': + libs = ['gdi32', 'opengl32', 'user32', 'setgpu'] + ldflags = ['/LIBPATH:' + lib_dir] + ['/DEFAULTLIB:' + x for x in libs] + + # List of source files. + if gl: + source_files = [ + '../common/common.cpp', + '../common/glutil.cpp', + '../common/rasterize_gl.cpp', + 'torch_bindings_gl.cpp', + 'torch_rasterize_gl.cpp', + ] + else: + source_files = [ + '../common/cudaraster/impl/Buffer.cpp', + '../common/cudaraster/impl/CudaRaster.cpp', + '../common/cudaraster/impl/RasterImpl.cu', + '../common/cudaraster/impl/RasterImpl.cpp', + '../common/common.cpp', + '../common/rasterize.cu', + '../common/interpolate.cu', + '../common/texture.cu', + '../common/texture.cpp', + '../common/antialias.cu', + 'torch_bindings.cpp', + 'torch_rasterize.cpp', + 'torch_interpolate.cpp', + 'torch_texture.cpp', + 'torch_antialias.cpp', + ] + + # Some containers set this to contain old architectures that won't compile. We only need the one installed in the machine. + os.environ['TORCH_CUDA_ARCH_LIST'] = '' + + # On Linux, show a warning if GLEW is being forcibly loaded when compiling the GL plugin. + if gl and (os.name == 'posix') and ('libGLEW' in os.environ.get('LD_PRELOAD', '')): + logging.getLogger('nvdiffrast').warning("Warning: libGLEW is being loaded via LD_PRELOAD, and will probably conflict with the OpenGL plugin") + + # Try to detect if a stray lock file is left in cache directory and show a warning. This sometimes happens on Windows if the build is interrupted at just the right moment. + plugin_name = 'nvdiffrast_plugin' + ('_gl' if gl else '') + try: + lock_fn = os.path.join(torch.utils.cpp_extension._get_build_directory(plugin_name, False), 'lock') + if os.path.exists(lock_fn): + logging.getLogger('nvdiffrast').warning("Lock file exists in build directory: '%s'" % lock_fn) + except: + pass + + # Speed up compilation on Windows. 
+ if os.name == 'nt': + # Skip telemetry sending step in vcvarsall.bat + os.environ['VSCMD_SKIP_SENDTELEMETRY'] = '1' + + # Opportunistically patch distutils to cache MSVC environments. + try: + import distutils._msvccompiler + import functools + if not hasattr(distutils._msvccompiler._get_vc_env, '__wrapped__'): + distutils._msvccompiler._get_vc_env = functools.lru_cache()(distutils._msvccompiler._get_vc_env) + except: + pass + + # Compile and load. + source_paths = [os.path.join(os.path.dirname(__file__), fn) for fn in source_files] + torch.utils.cpp_extension.load(name=plugin_name, sources=source_paths, extra_cflags=common_opts+cc_opts, extra_cuda_cflags=common_opts+['-lineinfo'], extra_ldflags=ldflags, with_cuda=True, verbose=False) + + # Import, cache, and return the compiled module. + _cached_plugin[gl] = importlib.import_module(plugin_name) + return _cached_plugin[gl] + +#---------------------------------------------------------------------------- +# Log level. +#---------------------------------------------------------------------------- + +def get_log_level(): + '''Get current log level. + + Returns: + Current log level in nvdiffrast. See `set_log_level()` for possible values. + ''' + return _get_plugin().get_log_level() + +def set_log_level(level): + '''Set log level. + + Log levels follow the convention on the C++ side of Torch: + 0 = Info, + 1 = Warning, + 2 = Error, + 3 = Fatal. + The default log level is 1. + + Args: + level: New log level as integer. Internal nvdiffrast messages of this + severity or higher will be printed, while messages of lower + severity will be silent. + ''' + _get_plugin().set_log_level(level) + +#---------------------------------------------------------------------------- +# CudaRaster state wrapper. +#---------------------------------------------------------------------------- + +class RasterizeCudaContext: + def __init__(self, device=None): + '''Create a new Cuda rasterizer context. 
+ + The context is deleted and internal storage is released when the object is + destroyed. + + Args: + device (Optional): Cuda device on which the context is created. Type can be + `torch.device`, string (e.g., `'cuda:1'`), or int. If not + specified, context will be created on currently active Cuda + device. + Returns: + The newly created Cuda rasterizer context. + ''' + if device is None: + cuda_device_idx = torch.cuda.current_device() + else: + with torch.cuda.device(device): + cuda_device_idx = torch.cuda.current_device() + self.cpp_wrapper = _get_plugin().RasterizeCRStateWrapper(cuda_device_idx) + self.output_db = True + self.active_depth_peeler = None + +#---------------------------------------------------------------------------- +# GL state wrapper. +#---------------------------------------------------------------------------- + +class RasterizeGLContext: + def __init__(self, output_db=True, mode='automatic', device=None): + '''Create a new OpenGL rasterizer context. + + Creating an OpenGL context is a slow operation so you should usually reuse the same + context in all calls to `rasterize()` on the same CPU thread. The OpenGL context + is deleted when the object is destroyed. + + Side note: When using the OpenGL context in a rasterization operation, the + context's internal framebuffer object is automatically enlarged to accommodate the + rasterization operation's output shape, but it is never shrunk in size until the + context is destroyed. Thus, if you need to rasterize, say, deep low-resolution + tensors and also shallow high-resolution tensors, you can conserve GPU memory by + creating two separate OpenGL contexts for these tasks. In this scenario, using the + same OpenGL context for both tasks would end up reserving GPU memory for a deep, + high-resolution output tensor. + + Args: + output_db (bool): Compute and output image-space derivates of barycentrics. + mode: OpenGL context handling mode. Valid values are 'manual' and 'automatic'. 
+ device (Optional): Cuda device on which the context is created. Type can be + `torch.device`, string (e.g., `'cuda:1'`), or int. If not + specified, context will be created on currently active Cuda + device. + Returns: + The newly created OpenGL rasterizer context. + ''' + assert output_db is True or output_db is False + assert mode in ['automatic', 'manual'] + self.output_db = output_db + self.mode = mode + if device is None: + cuda_device_idx = torch.cuda.current_device() + else: + with torch.cuda.device(device): + cuda_device_idx = torch.cuda.current_device() + self.cpp_wrapper = _get_plugin(gl=True).RasterizeGLStateWrapper(output_db, mode == 'automatic', cuda_device_idx) + self.active_depth_peeler = None # For error checking only. + + def set_context(self): + '''Set (activate) OpenGL context in the current CPU thread. + Only available if context was created in manual mode. + ''' + assert self.mode == 'manual' + self.cpp_wrapper.set_context() + + def release_context(self): + '''Release (deactivate) currently active OpenGL context. + Only available if context was created in manual mode. + ''' + assert self.mode == 'manual' + self.cpp_wrapper.release_context() + +#---------------------------------------------------------------------------- +# Rasterize. 
+#---------------------------------------------------------------------------- + +class _rasterize_func(torch.autograd.Function): + @staticmethod + def forward(ctx, raster_ctx, pos, tri, resolution, ranges, grad_db, peeling_idx): + if isinstance(raster_ctx, RasterizeGLContext): + out, out_db = _get_plugin(gl=True).rasterize_fwd_gl(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx) + else: + out, out_db = _get_plugin().rasterize_fwd_cuda(raster_ctx.cpp_wrapper, pos, tri, resolution, ranges, peeling_idx) + ctx.save_for_backward(pos, tri, out) + ctx.saved_grad_db = grad_db + return out, out_db + + @staticmethod + def backward(ctx, dy, ddb): + pos, tri, out = ctx.saved_tensors + if ctx.saved_grad_db: + g_pos = _get_plugin().rasterize_grad_db(pos, tri, out, dy, ddb) + else: + g_pos = _get_plugin().rasterize_grad(pos, tri, out, dy) + return None, g_pos, None, None, None, None, None + +# Op wrapper. +def rasterize(glctx, pos, tri, resolution, ranges=None, grad_db=True): + '''Rasterize triangles. + + All input tensors must be contiguous and reside in GPU memory except for + the `ranges` tensor that, if specified, has to reside in CPU memory. The + output tensors will be contiguous and reside in GPU memory. + + Args: + glctx: Rasterizer context of type `RasterizeGLContext` or `RasterizeCudaContext`. + pos: Vertex position tensor with dtype `torch.float32`. To enable range + mode, this tensor should have a 2D shape [num_vertices, 4]. To enable + instanced mode, use a 3D shape [minibatch_size, num_vertices, 4]. + tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`. + resolution: Output resolution as integer tuple (height, width). + ranges: In range mode, tensor with shape [minibatch_size, 2] and dtype + `torch.int32`, specifying start indices and counts into `tri`. + Ignored in instanced mode. + grad_db: Propagate gradients of image-space derivatives of barycentrics + into `pos` in backward pass. 
Ignored if using an OpenGL context that + was not configured to output image-space derivatives. + + Returns: + A tuple of two tensors. The first output tensor has shape [minibatch_size, + height, width, 4] and contains the main rasterizer output in order (u, v, z/w, + triangle_id). If the OpenGL context was configured to output image-space + derivatives of barycentrics, the second output tensor will also have shape + [minibatch_size, height, width, 4] and contain said derivatives in order + (du/dX, du/dY, dv/dX, dv/dY). Otherwise it will be an empty tensor with shape + [minibatch_size, height, width, 0]. + ''' + assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext)) + assert grad_db is True or grad_db is False + grad_db = grad_db and glctx.output_db + + # Sanitize inputs. + assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor) + resolution = tuple(resolution) + if ranges is None: + ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu') + else: + assert isinstance(ranges, torch.Tensor) + + # Check that context is not currently reserved for depth peeling. + if glctx.active_depth_peeler is not None: + return RuntimeError("Cannot call rasterize() during depth peeling operation, use rasterize_next_layer() instead") + + # Instantiate the function. + return _rasterize_func.apply(glctx, pos, tri, resolution, ranges, grad_db, -1) + +#---------------------------------------------------------------------------- +# Depth peeler context manager for rasterizing multiple depth layers. +#---------------------------------------------------------------------------- + +class DepthPeeler: + def __init__(self, glctx, pos, tri, resolution, ranges=None, grad_db=True): + '''Create a depth peeler object for rasterizing multiple depth layers. + + Arguments are the same as in `rasterize()`. + + Returns: + The newly created depth peeler. 
+ ''' + assert isinstance(glctx, (RasterizeGLContext, RasterizeCudaContext)) + assert grad_db is True or grad_db is False + grad_db = grad_db and glctx.output_db + + # Sanitize inputs as usual. + assert isinstance(pos, torch.Tensor) and isinstance(tri, torch.Tensor) + resolution = tuple(resolution) + if ranges is None: + ranges = torch.empty(size=(0, 2), dtype=torch.int32, device='cpu') + else: + assert isinstance(ranges, torch.Tensor) + + # Store all the parameters. + self.raster_ctx = glctx + self.pos = pos + self.tri = tri + self.resolution = resolution + self.ranges = ranges + self.grad_db = grad_db + self.peeling_idx = None + + def __enter__(self): + if self.raster_ctx is None: + raise RuntimeError("Cannot re-enter a terminated depth peeling operation") + if self.raster_ctx.active_depth_peeler is not None: + raise RuntimeError("Cannot have multiple depth peelers active simultaneously in a rasterization context") + self.raster_ctx.active_depth_peeler = self + self.peeling_idx = 0 + return self + + def __exit__(self, *args): + assert self.raster_ctx.active_depth_peeler is self + self.raster_ctx.active_depth_peeler = None + self.raster_ctx = None # Remove all references to input tensor so they're not left dangling. + self.pos = None + self.tri = None + self.resolution = None + self.ranges = None + self.grad_db = None + self.peeling_idx = None + return None + + def rasterize_next_layer(self): + '''Rasterize next depth layer. + + Operation is equivalent to `rasterize()` except that previously reported + surface points are culled away. + + Returns: + A tuple of two tensors as in `rasterize()`. + ''' + assert self.raster_ctx.active_depth_peeler is self + assert self.peeling_idx >= 0 + result = _rasterize_func.apply(self.raster_ctx, self.pos, self.tri, self.resolution, self.ranges, self.grad_db, self.peeling_idx) + self.peeling_idx += 1 + return result + +#---------------------------------------------------------------------------- +# Interpolate. 
+#---------------------------------------------------------------------------- + +# Output pixel differentials for at least some attributes. +class _interpolate_func_da(torch.autograd.Function): + @staticmethod + def forward(ctx, attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list): + out, out_da = _get_plugin().interpolate_fwd_da(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + ctx.save_for_backward(attr, rast, tri, rast_db) + ctx.saved_misc = diff_attrs_all, diff_attrs_list + return out, out_da + + @staticmethod + def backward(ctx, dy, dda): + attr, rast, tri, rast_db = ctx.saved_tensors + diff_attrs_all, diff_attrs_list = ctx.saved_misc + g_attr, g_rast, g_rast_db = _get_plugin().interpolate_grad_da(attr, rast, tri, dy, rast_db, dda, diff_attrs_all, diff_attrs_list) + return g_attr, g_rast, None, g_rast_db, None, None + +# No pixel differential for any attribute. +class _interpolate_func(torch.autograd.Function): + @staticmethod + def forward(ctx, attr, rast, tri): + out, out_da = _get_plugin().interpolate_fwd(attr, rast, tri) + ctx.save_for_backward(attr, rast, tri) + return out, out_da + + @staticmethod + def backward(ctx, dy, _): + attr, rast, tri = ctx.saved_tensors + g_attr, g_rast = _get_plugin().interpolate_grad(attr, rast, tri, dy) + return g_attr, g_rast, None + +# Op wrapper. +def interpolate(attr, rast, tri, rast_db=None, diff_attrs=None): + """Interpolate vertex attributes. + + All input tensors must be contiguous and reside in GPU memory. The output tensors + will be contiguous and reside in GPU memory. + + Args: + attr: Attribute tensor with dtype `torch.float32`. + Shape is [num_vertices, num_attributes] in range mode, or + [minibatch_size, num_vertices, num_attributes] in instanced mode. + Broadcasting is supported along the minibatch axis. + rast: Main output tensor from `rasterize()`. + tri: Triangle tensor with shape [num_triangles, 3] and dtype `torch.int32`. 
+ rast_db: (Optional) Tensor containing image-space derivatives of barycentrics, + i.e., the second output tensor from `rasterize()`. Enables computing + image-space derivatives of attributes. + diff_attrs: (Optional) List of attribute indices for which image-space + derivatives are to be computed. Special value 'all' is equivalent + to list [0, 1, ..., num_attributes - 1]. + + Returns: + A tuple of two tensors. The first output tensor contains interpolated + attributes and has shape [minibatch_size, height, width, num_attributes]. + If `rast_db` and `diff_attrs` were specified, the second output tensor contains + the image-space derivatives of the selected attributes and has shape + [minibatch_size, height, width, 2 * len(diff_attrs)]. The derivatives of the + first selected attribute A will be on channels 0 and 1 as (dA/dX, dA/dY), etc. + Otherwise, the second output tensor will be an empty tensor with shape + [minibatch_size, height, width, 0]. + """ + # Sanitize the list of pixel differential attributes. + if diff_attrs is None: + diff_attrs = [] + elif diff_attrs != 'all': + diff_attrs = np.asarray(diff_attrs, np.int32) + assert len(diff_attrs.shape) == 1 + diff_attrs = diff_attrs.tolist() + + diff_attrs_all = int(diff_attrs == 'all') + diff_attrs_list = [] if diff_attrs_all else diff_attrs + + # Check inputs. + assert all(isinstance(x, torch.Tensor) for x in (attr, rast, tri)) + if diff_attrs: + assert isinstance(rast_db, torch.Tensor) + + # Choose stub. + if diff_attrs: + return _interpolate_func_da.apply(attr, rast, tri, rast_db, diff_attrs_all, diff_attrs_list) + else: + return _interpolate_func.apply(attr, rast, tri) + +#---------------------------------------------------------------------------- +# Texture +#---------------------------------------------------------------------------- + +# Linear-mipmap-linear and linear-mipmap-nearest: Mipmaps enabled. 
+class _texture_func_mip(torch.autograd.Function): + @staticmethod + def forward(ctx, filter_mode, tex, uv, uv_da, mip_level_bias, mip_wrapper, filter_mode_enum, boundary_mode_enum, *mip_stack): + empty = torch.tensor([]) + if uv_da is None: + uv_da = empty + if mip_level_bias is None: + mip_level_bias = empty + if mip_wrapper is None: + mip_wrapper = _get_plugin().TextureMipWrapper() + out = _get_plugin().texture_fwd_mip(tex, uv, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + ctx.save_for_backward(tex, uv, uv_da, mip_level_bias, *mip_stack) + ctx.saved_misc = filter_mode, mip_wrapper, filter_mode_enum, boundary_mode_enum + return out + + @staticmethod + def backward(ctx, dy): + tex, uv, uv_da, mip_level_bias, *mip_stack = ctx.saved_tensors + filter_mode, mip_wrapper, filter_mode_enum, boundary_mode_enum = ctx.saved_misc + if filter_mode == 'linear-mipmap-linear': + g_tex, g_uv, g_uv_da, g_mip_level_bias, g_mip_stack = _get_plugin().texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + return (None, g_tex, g_uv, g_uv_da, g_mip_level_bias, None, None, None) + tuple(g_mip_stack) + else: # linear-mipmap-nearest + g_tex, g_uv, g_mip_stack = _get_plugin().texture_grad_linear_mipmap_nearest(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode_enum, boundary_mode_enum) + return (None, g_tex, g_uv, None, None, None, None, None) + tuple(g_mip_stack) + +# Linear and nearest: Mipmaps disabled. 
+class _texture_func(torch.autograd.Function): + @staticmethod + def forward(ctx, filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum): + out = _get_plugin().texture_fwd(tex, uv, filter_mode_enum, boundary_mode_enum) + ctx.save_for_backward(tex, uv) + ctx.saved_misc = filter_mode, filter_mode_enum, boundary_mode_enum + return out + + @staticmethod + def backward(ctx, dy): + tex, uv = ctx.saved_tensors + filter_mode, filter_mode_enum, boundary_mode_enum = ctx.saved_misc + if filter_mode == 'linear': + g_tex, g_uv = _get_plugin().texture_grad_linear(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return None, g_tex, g_uv, None, None + else: # nearest + g_tex = _get_plugin().texture_grad_nearest(tex, uv, dy, filter_mode_enum, boundary_mode_enum) + return None, g_tex, None, None, None + +# Op wrapper. +def texture(tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto', boundary_mode='wrap', max_mip_level=None): + """Perform texture sampling. + + All input tensors must be contiguous and reside in GPU memory. The output tensor + will be contiguous and reside in GPU memory. + + Args: + tex: Texture tensor with dtype `torch.float32`. For 2D textures, must have shape + [minibatch_size, tex_height, tex_width, tex_channels]. For cube map textures, + must have shape [minibatch_size, 6, tex_height, tex_width, tex_channels] where + tex_width and tex_height are equal. Note that `boundary_mode` must also be set + to 'cube' to enable cube map mode. Broadcasting is supported along the minibatch axis. + uv: Tensor containing per-pixel texture coordinates. When sampling a 2D texture, + must have shape [minibatch_size, height, width, 2]. When sampling a cube map + texture, must have shape [minibatch_size, height, width, 3]. + uv_da: (Optional) Tensor containing image-space derivatives of texture coordinates. + Must have same shape as `uv` except for the last dimension that is to be twice + as long. 
+ mip_level_bias: (Optional) Per-pixel bias for mip level selection. If `uv_da` is omitted, + determines mip level directly. Must have shape [minibatch_size, height, width]. + mip: (Optional) Preconstructed mipmap stack from a `texture_construct_mip()` call, or a list + of tensors specifying a custom mipmap stack. When specifying a custom mipmap stack, + the tensors in the list must follow the same format as `tex` except for width and + height that must follow the usual rules for mipmap sizes. The base level texture + is still supplied in `tex` and must not be included in the list. Gradients of a + custom mipmap stack are not automatically propagated to base texture but the mipmap + tensors will receive gradients of their own. If a mipmap stack is not specified + but the chosen filter mode requires it, the mipmap stack is constructed internally + and discarded afterwards. + filter_mode: Texture filtering mode to be used. Valid values are 'auto', 'nearest', + 'linear', 'linear-mipmap-nearest', and 'linear-mipmap-linear'. Mode 'auto' + selects 'linear' if neither `uv_da` or `mip_level_bias` is specified, and + 'linear-mipmap-linear' when at least one of them is specified, these being + the highest-quality modes possible depending on the availability of the + image-space derivatives of the texture coordinates or direct mip level information. + boundary_mode: Valid values are 'wrap', 'clamp', 'zero', and 'cube'. If `tex` defines a + cube map, this must be set to 'cube'. The default mode 'wrap' takes fractional + part of texture coordinates. Mode 'clamp' clamps texture coordinates to the + centers of the boundary texels. Mode 'zero' virtually extends the texture with + all-zero values in all directions. + max_mip_level: If specified, limits the number of mipmaps constructed and used in mipmap-based + filter modes. + + Returns: + A tensor containing the results of the texture sampling with shape + [minibatch_size, height, width, tex_channels]. 
Cube map fetches with invalid uv coordinates + (e.g., zero vectors) output all zeros and do not propagate gradients. + """ + + # Default filter mode. + if filter_mode == 'auto': + filter_mode = 'linear-mipmap-linear' if (uv_da is not None or mip_level_bias is not None) else 'linear' + + # Sanitize inputs. + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + + # Check inputs. + assert isinstance(tex, torch.Tensor) and isinstance(uv, torch.Tensor) + if 'mipmap' in filter_mode: + assert isinstance(uv_da, torch.Tensor) or isinstance(mip_level_bias, torch.Tensor) + + # If mipping disabled via max level=0, we may as well use simpler filtering internally. + if max_mip_level == 0 and filter_mode in ['linear-mipmap-nearest', 'linear-mipmap-linear']: + filter_mode = 'linear' + + # Convert filter mode to internal enumeration. + filter_mode_dict = {'nearest': 0, 'linear': 1, 'linear-mipmap-nearest': 2, 'linear-mipmap-linear': 3} + filter_mode_enum = filter_mode_dict[filter_mode] + + # Convert boundary mode to internal enumeration. + boundary_mode_dict = {'cube': 0, 'wrap': 1, 'clamp': 2, 'zero': 3} + boundary_mode_enum = boundary_mode_dict[boundary_mode] + + # Construct a mipmap if necessary. + if 'mipmap' in filter_mode: + mip_wrapper, mip_stack = None, [] + if mip is not None: + assert isinstance(mip, (_get_plugin().TextureMipWrapper, list)) + if isinstance(mip, list): + assert all(isinstance(x, torch.Tensor) for x in mip) + mip_stack = mip + else: + mip_wrapper = mip + else: + mip_wrapper = _get_plugin().texture_construct_mip(tex, max_mip_level, boundary_mode == 'cube') + + # Choose stub. 
+ if filter_mode == 'linear-mipmap-linear' or filter_mode == 'linear-mipmap-nearest': + return _texture_func_mip.apply(filter_mode, tex, uv, uv_da, mip_level_bias, mip_wrapper, filter_mode_enum, boundary_mode_enum, *mip_stack) + else: + return _texture_func.apply(filter_mode, tex, uv, filter_mode_enum, boundary_mode_enum) + +# Mipmap precalculation for cases where the texture stays constant. +def texture_construct_mip(tex, max_mip_level=None, cube_mode=False): + """Construct a mipmap stack for a texture. + + This function can be used for constructing a mipmap stack for a texture that is known to remain + constant. This avoids reconstructing it every time `texture()` is called. + + Args: + tex: Texture tensor with the same constraints as in `texture()`. + max_mip_level: If specified, limits the number of mipmaps constructed. + cube_mode: Must be set to True if `tex` specifies a cube map texture. + + Returns: + An opaque object containing the mipmap stack. This can be supplied in a call to `texture()` + in the `mip` argument. + """ + + assert isinstance(tex, torch.Tensor) + assert cube_mode is True or cube_mode is False + if max_mip_level is None: + max_mip_level = -1 + else: + max_mip_level = int(max_mip_level) + assert max_mip_level >= 0 + return _get_plugin().texture_construct_mip(tex, max_mip_level, cube_mode) + +#---------------------------------------------------------------------------- +# Antialias. 
+#---------------------------------------------------------------------------- + +class _antialias_func(torch.autograd.Function): + @staticmethod + def forward(ctx, color, rast, pos, tri, topology_hash, pos_gradient_boost): + out, work_buffer = _get_plugin().antialias_fwd(color, rast, pos, tri, topology_hash) + ctx.save_for_backward(color, rast, pos, tri) + ctx.saved_misc = pos_gradient_boost, work_buffer + return out + + @staticmethod + def backward(ctx, dy): + color, rast, pos, tri = ctx.saved_tensors + pos_gradient_boost, work_buffer = ctx.saved_misc + g_color, g_pos = _get_plugin().antialias_grad(color, rast, pos, tri, dy, work_buffer) + if pos_gradient_boost != 1.0: + g_pos = g_pos * pos_gradient_boost + return g_color, None, g_pos, None, None, None + +# Op wrapper. +def antialias(color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0): + """Perform antialiasing. + + All input tensors must be contiguous and reside in GPU memory. The output tensor + will be contiguous and reside in GPU memory. + + Note that silhouette edge determination is based on vertex indices in the triangle + tensor. For it to work properly, a vertex belonging to multiple triangles must be + referred to using the same vertex index in each triangle. Otherwise, nvdiffrast will always + classify the adjacent edges as silhouette edges, which leads to bad performance and + potentially incorrect gradients. If you are unsure whether your data is good, check + which pixels are modified by the antialias operation and compare to the example in the + documentation. + + Args: + color: Input image to antialias with shape [minibatch_size, height, width, num_channels]. + rast: Main output tensor from `rasterize()`. + pos: Vertex position tensor used in the rasterization operation. + tri: Triangle tensor used in the rasterization operation. + topology_hash: (Optional) Preconstructed topology hash for the triangle tensor. 
If not + specified, the topology hash is constructed internally and discarded afterwards. + pos_gradient_boost: (Optional) Multiplier for gradients propagated to `pos`. + + Returns: + A tensor containing the antialiased image with the same shape as `color` input tensor. + """ + + # Check inputs. + assert all(isinstance(x, torch.Tensor) for x in (color, rast, pos, tri)) + + # Construct topology hash unless provided by user. + if topology_hash is not None: + assert isinstance(topology_hash, _get_plugin().TopologyHashWrapper) + else: + topology_hash = _get_plugin().antialias_construct_topology_hash(tri) + + # Instantiate the function. + return _antialias_func.apply(color, rast, pos, tri, topology_hash, pos_gradient_boost) + +# Topology hash precalculation for cases where the triangle array stays constant. +def antialias_construct_topology_hash(tri): + """Construct a topology hash for a triangle tensor. + + This function can be used for constructing a topology hash for a triangle tensor that is + known to remain constant. This avoids reconstructing it every time `antialias()` is called. + + Args: + tri: Triangle tensor with shape [num_triangles, 3]. Must be contiguous and reside in + GPU memory. + + Returns: + An opaque object containing the topology hash. This can be supplied in a call to + `antialias()` in the `topology_hash` argument. + """ + assert isinstance(tri, torch.Tensor) + return _get_plugin().antialias_construct_topology_hash(tri) + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_antialias.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_antialias.cpp new file mode 100644 index 00000000..730a200e --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_antialias.cpp @@ -0,0 +1,243 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/antialias.h" + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void AntialiasFwdMeshKernel (const AntialiasKernelParams p); +void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p); +void AntialiasFwdAnalysisKernel (const AntialiasKernelParams p); +void AntialiasGradKernel (const AntialiasKernelParams p); + +//------------------------------------------------------------------------ +// Topology hash construction. + +TopologyHashWrapper antialias_construct_topology_hash(torch::Tensor tri) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tri)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + + // Check inputs. + NVDR_CHECK_DEVICE(tri); + NVDR_CHECK_CONTIGUOUS(tri); + NVDR_CHECK_I32(tri); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Fill in kernel parameters. + p.numTriangles = tri.size(0); + p.numVertices = 0x7fffffff; // Let's not require vertex positions just to enable an error check. + p.tri = tri.data_ptr(); + + // Kernel parameters. + p.allocTriangles = 64; + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // Construct the hash tensor and get pointer. 
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA); + torch::Tensor ev_hash = torch::zeros({(uint64_t)p.allocTriangles * AA_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles) * 4}, opts); + p.evHash = (uint4*)(ev_hash.data_ptr()); + + // Check alignment. + NVDR_CHECK(!((uintptr_t)p.evHash & 15), "ev_hash internal tensor not aligned to int4"); + + // Populate the hash. + void* args[] = {&p}; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdMeshKernel, (p.numTriangles - 1) / AA_MESH_KERNEL_THREADS_PER_BLOCK + 1, AA_MESH_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return. + TopologyHashWrapper hash_wrap; + hash_wrap.ev_hash = ev_hash; + return hash_wrap; +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple antialias_fwd(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash_wrap) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(color)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + torch::Tensor& topology_hash = topology_hash_wrap.ev_hash; // Unwrap. + + // Check inputs. + NVDR_CHECK_DEVICE(color, rast, pos, tri, topology_hash); + NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, topology_hash); + NVDR_CHECK_F32(color, rast, pos); + NVDR_CHECK_I32(tri, topology_hash); + + // Sanity checks. 
+ NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and rast inputs must have same spatial dimensions"); + if (p.instance_mode) + { + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast, pos"); + } + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, rast"); + } + + // Extract input dimensions. + p.numVertices = pos.size(p.instance_mode ? 1 : 0); + p.numTriangles = tri.size(0); + p.n = color.size(0); + p.height = color.size(1); + p.width = color.size(2); + p.channels = color.size(3); + + // Get input pointers. + p.color = color.data_ptr(); + p.rasterOut = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.pos = pos.data_ptr(); + p.evHash = (uint4*)(topology_hash.data_ptr()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Determine hash allocation size. + p.allocTriangles = 64; + while (p.allocTriangles < p.numTriangles) + p.allocTriangles <<= 1; // Must be power of two. + + // Allocate output tensors. + torch::Tensor out = color.detach().clone(); // Use color as base. 
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor work_buffer = torch::empty({p.n * p.width * p.height * 8 + 4}, opts); // 8 int for a maximum of two work items per pixel. + p.output = out.data_ptr(); + p.workBuffer = (int4*)(work_buffer.data_ptr()); + + // Clear the work counters. + NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(p.workBuffer, 0, sizeof(int4), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rasterOut & 7), "raster_out input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4"); + NVDR_CHECK(!((uintptr_t)p.evHash & 15), "topology_hash internal tensor not aligned to int4"); + + // Choose launch parameters for the discontinuity finder kernel and launch. + void* args[] = {&p}; + dim3 blockSize(AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH, AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT, 1); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.n); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdDiscontinuityKernel, gridSize, blockSize, args, 0, stream)); + + // Determine optimum block size for the persistent analysis kernel and launch. + int device = 0; + int numCTA = 0; + int numSM = 0; + NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device)); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasFwdAnalysisKernel, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, 0)); + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasFwdAnalysisKernel, numCTA * numSM, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return results. 
+ return std::tuple(out, work_buffer); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple antialias_grad(torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(color)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AntialiasKernelParams p = {}; // Initialize all fields to zero. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + NVDR_CHECK_DEVICE(color, rast, pos, tri, dy, work_buffer); + NVDR_CHECK_CONTIGUOUS(color, rast, pos, tri, work_buffer); + NVDR_CHECK_F32(color, rast, pos, dy, work_buffer); + NVDR_CHECK_I32(tri); + + // Sanity checks. + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) > 0 && dy.size(2) > 0 && dy.size(3) > 0, "dy must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(color.sizes().size() == 4 && color.size(0) > 0 && color.size(1) > 0 && color.size(2) > 0 && color.size(3) > 0, "color must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "raster_out must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(color.size(1) == rast.size(1) && color.size(2) == rast.size(2), "color and raster_out inputs must have same spatial dimensions"); + NVDR_CHECK(color.size(1) == dy.size(1) && color.size(2) == dy.size(2) && color.size(3) == dy.size(3), "color and dy inputs must have same dimensions"); + if (p.instance_mode) + { + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0) && pos.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out, pos"); + 
NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0) && pos.size(0) ==color.size(0), "minibatch size mismatch between inputs dy, color, raster_out, pos"); + } + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, >0, 4] or [>0, 4]"); + NVDR_CHECK(rast.size(0) == color.size(0), "minibatch size mismatch between inputs color, raster_out"); + NVDR_CHECK(dy.size(0) == color.size(0) && rast.size(0) == color.size(0), "minibatch size mismatch between inputs dy, color, raster_out"); + } + + // Extract input dimensions. + p.numVertices = pos.size(p.instance_mode ? 1 : 0); + p.numTriangles = tri.size(0); + p.n = color.size(0); + p.height = color.size(1); + p.width = color.size(2); + p.channels = color.size(3); + + // Ensure dy is contiguous. + torch::Tensor dy_ = dy.contiguous(); + + // Get input pointers. + p.color = color.data_ptr(); + p.rasterOut = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.pos = pos.data_ptr(); + p.dy = dy_.data_ptr(); + p.workBuffer = (int4*)(work_buffer.data_ptr()); + + // Misc parameters. + p.xh = .5f * (float)p.width; + p.yh = .5f * (float)p.height; + + // Allocate output tensors. + torch::Tensor grad_color = dy_.detach().clone(); // Use dy as base. + torch::Tensor grad_pos = torch::zeros_like(pos); + p.gradColor = grad_color.data_ptr(); + p.gradPos = grad_pos.data_ptr(); + + // Clear gradient kernel work counter. + NVDR_CHECK_CUDA_ERROR(cudaMemsetAsync(&p.workBuffer[0].y, 0, sizeof(int), stream)); + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.workBuffer & 15), "work_buffer internal tensor not aligned to int4"); + + // Determine optimum block size for the gradient kernel and launch. 
+ void* args[] = {&p}; + int device = 0; + int numCTA = 0; + int numSM = 0; + NVDR_CHECK_CUDA_ERROR(cudaGetDevice(&device)); + NVDR_CHECK_CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numCTA, (void*)AntialiasGradKernel, AA_GRAD_KERNEL_THREADS_PER_BLOCK, 0)); + NVDR_CHECK_CUDA_ERROR(cudaDeviceGetAttribute(&numSM, cudaDevAttrMultiProcessorCount, device)); + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)AntialiasGradKernel, numCTA * numSM, AA_GRAD_KERNEL_THREADS_PER_BLOCK, args, 0, stream)); + + // Return results. + return std::tuple(grad_color, grad_pos); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_bindings.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_bindings.cpp new file mode 100644 index 00000000..898e17e3 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_bindings.cpp @@ -0,0 +1,73 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include + +//------------------------------------------------------------------------ +// Op prototypes. Return type macros for readability. 
+ +#define OP_RETURN_T torch::Tensor +#define OP_RETURN_TT std::tuple +#define OP_RETURN_TTT std::tuple +#define OP_RETURN_TTTT std::tuple +#define OP_RETURN_TTV std::tuple > +#define OP_RETURN_TTTTV std::tuple > + +OP_RETURN_TT rasterize_fwd_cuda (RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx); +OP_RETURN_T rasterize_grad (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy); +OP_RETURN_T rasterize_grad_db (torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb); +OP_RETURN_TT interpolate_fwd (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri); +OP_RETURN_TT interpolate_fwd_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector& diff_attrs_vec); +OP_RETURN_TT interpolate_grad (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy); +OP_RETURN_TTT interpolate_grad_da (torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector& diff_attrs_vec); +TextureMipWrapper texture_construct_mip (torch::Tensor tex, int max_mip_level, bool cube_mode); +OP_RETURN_T texture_fwd (torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode); +OP_RETURN_T texture_fwd_mip (torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode); +OP_RETURN_T texture_grad_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode); +OP_RETURN_TT texture_grad_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode); +OP_RETURN_TTV texture_grad_linear_mipmap_nearest (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor 
mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode); +OP_RETURN_TTTTV texture_grad_linear_mipmap_linear (torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode); +TopologyHashWrapper antialias_construct_topology_hash (torch::Tensor tri); +OP_RETURN_TT antialias_fwd (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, TopologyHashWrapper topology_hash); +OP_RETURN_TT antialias_grad (torch::Tensor color, torch::Tensor rast, torch::Tensor pos, torch::Tensor tri, torch::Tensor dy, torch::Tensor work_buffer); + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + // State classes. + pybind11::class_(m, "RasterizeCRStateWrapper").def(pybind11::init()); + pybind11::class_(m, "TextureMipWrapper").def(pybind11::init<>()); + pybind11::class_(m, "TopologyHashWrapper"); + + // Plumbing to torch/c10 logging system. + m.def("get_log_level", [](void) { return FLAGS_caffe2_log_level; }, "get log level"); + m.def("set_log_level", [](int level){ FLAGS_caffe2_log_level = level; }, "set log level"); + + // Ops. 
+ m.def("rasterize_fwd_cuda", &rasterize_fwd_cuda, "rasterize forward op (cuda)"); + m.def("rasterize_grad", &rasterize_grad, "rasterize gradient op ignoring db gradients"); + m.def("rasterize_grad_db", &rasterize_grad_db, "rasterize gradient op with db gradients"); + m.def("interpolate_fwd", &interpolate_fwd, "interpolate forward op with attribute derivatives"); + m.def("interpolate_fwd_da", &interpolate_fwd_da, "interpolate forward op without attribute derivatives"); + m.def("interpolate_grad", &interpolate_grad, "interpolate gradient op with attribute derivatives"); + m.def("interpolate_grad_da", &interpolate_grad_da, "interpolate gradient op without attribute derivatives"); + m.def("texture_construct_mip", &texture_construct_mip, "texture mipmap construction"); + m.def("texture_fwd", &texture_fwd, "texture forward op without mipmapping"); + m.def("texture_fwd_mip", &texture_fwd_mip, "texture forward op with mipmapping"); + m.def("texture_grad_nearest", &texture_grad_nearest, "texture gradient op in nearest mode"); + m.def("texture_grad_linear", &texture_grad_linear, "texture gradient op in linear mode"); + m.def("texture_grad_linear_mipmap_nearest", &texture_grad_linear_mipmap_nearest, "texture gradient op in linear-mipmap-nearest mode"); + m.def("texture_grad_linear_mipmap_linear", &texture_grad_linear_mipmap_linear, "texture gradient op in linear-mipmap-linear mode"); + m.def("antialias_construct_topology_hash", &antialias_construct_topology_hash, "antialias topology hash construction"); + m.def("antialias_fwd", &antialias_fwd, "antialias forward op"); + m.def("antialias_grad", &antialias_grad, "antialias gradient op"); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_bindings_gl.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_bindings_gl.cpp new file mode 100644 index 00000000..5363e802 --- /dev/null +++ 
b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_bindings_gl.cpp @@ -0,0 +1,30 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include + +//------------------------------------------------------------------------ +// Op prototypes. + +std::tuple rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx); + +//------------------------------------------------------------------------ + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + // State classes. + pybind11::class_(m, "RasterizeGLStateWrapper").def(pybind11::init()) + .def("set_context", &RasterizeGLStateWrapper::setContext) + .def("release_context", &RasterizeGLStateWrapper::releaseContext); + + // Ops. + m.def("rasterize_fwd_gl", &rasterize_fwd_gl, "rasterize forward op (opengl)"); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_common.inl b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_common.inl new file mode 100644 index 00000000..74dea415 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_common.inl @@ -0,0 +1,29 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. 
Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#pragma once +#include "../common/framework.h" + +//------------------------------------------------------------------------ +// Input check helpers. +//------------------------------------------------------------------------ + +#ifdef _MSC_VER +#define __func__ __FUNCTION__ +#endif + +#define NVDR_CHECK_DEVICE(...) do { TORCH_CHECK(at::cuda::check_device({__VA_ARGS__}), __func__, "(): Inputs " #__VA_ARGS__ " must reside on the same GPU device") } while(0) +#define NVDR_CHECK_CPU(...) do { nvdr_check_cpu({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must reside on CPU"); } while(0) +#define NVDR_CHECK_CONTIGUOUS(...) do { nvdr_check_contiguous({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be contiguous tensors"); } while(0) +#define NVDR_CHECK_F32(...) do { nvdr_check_f32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be float32 tensors"); } while(0) +#define NVDR_CHECK_I32(...) 
do { nvdr_check_i32({__VA_ARGS__}, __func__, "(): Inputs " #__VA_ARGS__ " must be int32 tensors"); } while(0) +inline void nvdr_check_cpu(at::ArrayRef ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.device().type() == c10::DeviceType::CPU, func, err_msg); } +inline void nvdr_check_contiguous(at::ArrayRef ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.is_contiguous(), func, err_msg); } +inline void nvdr_check_f32(at::ArrayRef ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kFloat32, func, err_msg); } +inline void nvdr_check_i32(at::ArrayRef ts, const char* func, const char* err_msg) { for (const at::Tensor& t : ts) TORCH_CHECK(t.dtype() == torch::kInt32, func, err_msg); } +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_interpolate.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_interpolate.cpp new file mode 100644 index 00000000..b2c99fcc --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_interpolate.cpp @@ -0,0 +1,250 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "../common/common.h" +#include "../common/interpolate.h" + +//------------------------------------------------------------------------ +// Kernel prototypes. 
+ +void InterpolateFwdKernel (const InterpolateKernelParams p); +void InterpolateFwdKernelDa (const InterpolateKernelParams p); +void InterpolateGradKernel (const InterpolateKernelParams p); +void InterpolateGradKernelDa(const InterpolateKernelParams p); + +//------------------------------------------------------------------------ +// Helper + +static void set_diff_attrs(InterpolateKernelParams& p, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + if (diff_attrs_all) + { + p.numDiffAttr = p.numAttr; + p.diff_attrs_all = 1; + } + else + { + NVDR_CHECK(diff_attrs_vec.size() <= IP_MAX_DIFF_ATTRS, "too many entries in diff_attrs list (increase IP_MAX_DIFF_ATTRS)"); + p.numDiffAttr = diff_attrs_vec.size(); + memcpy(p.diffAttrs, &diff_attrs_vec[0], diff_attrs_vec.size()*sizeof(int)); + } +} + +//------------------------------------------------------------------------ +// Forward op. + +std::tuple interpolate_fwd_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor rast_db, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(attr)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + InterpolateKernelParams p = {}; // Initialize all fields to zero. + bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty()); + p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + if (enable_da) + { + NVDR_CHECK_DEVICE(attr, rast, tri, rast_db); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db); + NVDR_CHECK_F32(attr, rast, rast_db); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(attr, rast, tri); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri); + NVDR_CHECK_F32(attr, rast); + NVDR_CHECK_I32(tri); + } + + // Sanity checks. 
+ NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK( tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]"); + if (p.instance_mode) + NVDR_CHECK(attr.size(0) == rast.size(0) || attr.size(0) == 1, "minibatch size mismatch between inputs rast, attr"); + if (enable_da) + { + NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, "rast_db must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db"); + NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db"); + } + + // Extract input dimensions. + p.numVertices = attr.size(p.instance_mode ? 1 : 0); + p.numAttr = attr.size(p.instance_mode ? 2 : 1); + p.numTriangles = tri.size(0); + p.height = rast.size(1); + p.width = rast.size(2); + p.depth = rast.size(0); + + // Set attribute pixel differential info if enabled, otherwise leave as zero. + if (enable_da) + set_diff_attrs(p, diff_attrs_all, diff_attrs_vec); + else + p.numDiffAttr = 0; + + // Get input pointers. + p.attr = attr.data_ptr(); + p.rast = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.rastDB = enable_da ? rast_db.data_ptr() : NULL; + p.attrBC = (p.instance_mode && attr.size(0) == 1) ? 1 : 0; + + // Allocate output tensors. 
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({p.depth, p.height, p.width, p.numAttr}, opts); + torch::Tensor out_da = torch::empty({p.depth, p.height, p.width, p.numDiffAttr * 2}, opts); + + p.out = out.data_ptr(); + p.outDA = enable_da ? out_da.data_ptr() : NULL; + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.outDA & 7), "out_da output tensor not aligned to float2"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_FWD_MAX_KERNEL_BLOCK_WIDTH, IP_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_da ? (void*)InterpolateFwdKernelDa : (void*)InterpolateFwdKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return results. + return std::tuple(out, out_da); +} + +// Version without derivatives. +std::tuple interpolate_fwd(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri) +{ + std::vector empty_vec; + torch::Tensor empty_tensor; + return interpolate_fwd_da(attr, rast, tri, empty_tensor, false, empty_vec); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple interpolate_grad_da(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy, torch::Tensor rast_db, torch::Tensor dda, bool diff_attrs_all, std::vector& diff_attrs_vec) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(attr)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + InterpolateKernelParams p = {}; // Initialize all fields to zero. 
+ bool enable_da = (rast_db.defined()) && (diff_attrs_all || !diff_attrs_vec.empty()); + p.instance_mode = (attr.sizes().size() > 2) ? 1 : 0; + + // Check inputs. + if (enable_da) + { + NVDR_CHECK_DEVICE(attr, rast, tri, dy, rast_db, dda); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri, rast_db); + NVDR_CHECK_F32(attr, rast, dy, rast_db, dda); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(attr, rast, tri, dy); + NVDR_CHECK_CONTIGUOUS(attr, rast, tri); + NVDR_CHECK_F32(attr, rast, dy); + NVDR_CHECK_I32(tri); + } + + // Depth of attributes. + int attr_depth = p.instance_mode ? (attr.sizes().size() > 1 ? attr.size(0) : 0) : 1; + + // Sanity checks. + NVDR_CHECK(rast.sizes().size() == 4 && rast.size(0) > 0 && rast.size(1) > 0 && rast.size(2) > 0 && rast.size(3) == 4, "rast must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK((attr.sizes().size() == 2 || attr.sizes().size() == 3) && attr.size(0) > 0 && attr.size(1) > 0 && (attr.sizes().size() == 2 || attr.size(2) > 0), "attr must have shape [>0, >0, >0] or [>0, >0]"); + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) > 0 && dy.size(1) == rast.size(1) && dy.size(2) == rast.size(2) && dy.size(3) > 0, "dy must have shape [>0, height, width, >0]"); + NVDR_CHECK(dy.size(3) == attr.size(attr.sizes().size() - 1), "argument count mismatch between inputs dy, attr"); + NVDR_CHECK((attr_depth == rast.size(0) || attr_depth == 1) && dy.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, dy, attr"); + if (enable_da) + { + NVDR_CHECK(dda.sizes().size() == 4 && dda.size(0) > 0 && dda.size(1) == rast.size(1) && dda.size(2) == rast.size(2), "dda must have shape [>0, height, width, ?]"); + NVDR_CHECK(dda.size(0) == rast.size(0), "minibatch size mismatch between rast, dda"); + NVDR_CHECK(rast_db.sizes().size() == 4 && rast_db.size(0) > 0 && rast_db.size(1) > 0 && rast_db.size(2) > 0 && rast_db.size(3) == 4, 
"rast_db must have shape[>0, >0, >0, 4]"); + NVDR_CHECK(rast_db.size(1) == rast.size(1) && rast_db.size(2) == rast.size(2), "spatial size mismatch between inputs rast and rast_db"); + NVDR_CHECK(rast_db.size(0) == rast.size(0), "minibatch size mismatch between inputs rast, rast_db"); + } + + // Extract input dimensions. + p.numVertices = attr.size(p.instance_mode ? 1 : 0); + p.numAttr = attr.size(p.instance_mode ? 2 : 1); + p.numTriangles = tri.size(0); + p.height = rast.size(1); + p.width = rast.size(2); + p.depth = rast.size(0); + + // Ensure gradients are contiguous. + torch::Tensor dy_ = dy.contiguous(); + torch::Tensor dda_; + if (enable_da) + dda_ = dda.contiguous(); + + // Set attribute pixel differential info if enabled, otherwise leave as zero. + if (enable_da) + set_diff_attrs(p, diff_attrs_all, diff_attrs_vec); + else + p.numDiffAttr = 0; + + // Get input pointers. + p.attr = attr.data_ptr(); + p.rast = rast.data_ptr(); + p.tri = tri.data_ptr(); + p.dy = dy_.data_ptr(); + p.rastDB = enable_da ? rast_db.data_ptr() : NULL; + p.dda = enable_da ? dda_.data_ptr() : NULL; + p.attrBC = (p.instance_mode && attr_depth < p.depth) ? 1 : 0; + + // Allocate output tensors. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor gradAttr = torch::zeros_like(attr); + torch::Tensor gradRaster = torch::empty_like(rast); + torch::Tensor gradRasterDB; + if (enable_da) + gradRasterDB = torch::empty_like(rast_db); + + p.gradAttr = gradAttr.data_ptr(); + p.gradRaster = gradRaster.data_ptr(); + p.gradRasterDB = enable_da ? gradRasterDB.data_ptr() : NULL; + + // Verify that buffers are aligned to allow float2/float4 operations. 
+ NVDR_CHECK(!((uintptr_t)p.rast & 15), "rast input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.rastDB & 15), "rast_db input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.dda & 7), "dda input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradRaster & 15), "grad_rast output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradRasterDB & 15), "grad_rast_db output tensor not aligned to float4"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(IP_GRAD_MAX_KERNEL_BLOCK_WIDTH, IP_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_da ? (void*)InterpolateGradKernelDa : (void*)InterpolateGradKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return results. + return std::tuple(gradAttr, gradRaster, gradRasterDB); +} + +// Version without derivatives. +std::tuple interpolate_grad(torch::Tensor attr, torch::Tensor rast, torch::Tensor tri, torch::Tensor dy) +{ + std::vector empty_vec; + torch::Tensor empty_tensor; + std::tuple result = interpolate_grad_da(attr, rast, tri, dy, empty_tensor, empty_tensor, false, empty_vec); + return std::tuple(std::get<0>(result), std::get<1>(result)); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_rasterize.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_rasterize.cpp new file mode 100644 index 00000000..589e227a --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_rasterize.cpp @@ -0,0 +1,265 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/rasterize.h" +#include "../common/cudaraster/CudaRaster.hpp" +#include "../common/cudaraster/impl/Constants.hpp" +#include + +//------------------------------------------------------------------------ +// Kernel prototypes. + +void RasterizeCudaFwdShaderKernel(const RasterizeCudaFwdShaderParams p); +void RasterizeGradKernel(const RasterizeGradParams p); +void RasterizeGradKernelDb(const RasterizeGradParams p); + +//------------------------------------------------------------------------ +// Python CudaRaster state wrapper methods. + +RasterizeCRStateWrapper::RasterizeCRStateWrapper(int cudaDeviceIdx_) +{ + const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx_); + cudaDeviceIdx = cudaDeviceIdx_; + cr = new CR::CudaRaster(); +} + +RasterizeCRStateWrapper::~RasterizeCRStateWrapper(void) +{ + const at::cuda::OptionalCUDAGuard device_guard(cudaDeviceIdx); + delete cr; +} + +//------------------------------------------------------------------------ +// Forward op (Cuda). + +std::tuple rasterize_fwd_cuda(RasterizeCRStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + CR::CudaRaster* cr = stateWrapper.cr; + + // Check inputs. 
+ NVDR_CHECK_DEVICE(pos, tri); + NVDR_CHECK_CPU(ranges); + NVDR_CHECK_CONTIGUOUS(pos, tri, ranges); + NVDR_CHECK_F32(pos); + NVDR_CHECK_I32(tri, ranges); + + // Check that CudaRaster context was created for the correct GPU. + NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "CudaRaster context must must reside on the same device as input tensors"); + + // Determine instance mode and check input dimensions. + bool instance_mode = pos.sizes().size() > 2; + if (instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]"); + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]"); + NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]"); + } + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Get output shape. + int height_out = std::get<0>(resolution); + int width_out = std::get<1>(resolution); + int depth = instance_mode ? pos.size(0) : ranges.size(0); // Depth of tensor, not related to depth buffering. + NVDR_CHECK(height_out > 0 && width_out > 0, "resolution must be [>0, >0]"); + + // Round internal resolution up to tile size. + int height = (height_out + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + int width = (width_out + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + + // Get position and triangle buffer sizes in vertices / triangles. + int posCount = instance_mode ? pos.size(1) : pos.size(0); + int triCount = tri.size(0); + + // Set up CudaRaster buffers. + const float* posPtr = pos.data_ptr(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr(); // This is in CPU memory. 
+ const int32_t* triPtr = tri.data_ptr(); + cr->setVertexBuffer((void*)posPtr, posCount); + cr->setIndexBuffer((void*)triPtr, triCount); + cr->setBufferSize(width_out, height_out, depth); + + // Enable depth peeling? + bool enablePeel = (peeling_idx > 0); + cr->setRenderModeFlags(enablePeel ? CR::CudaRaster::RenderModeFlag_EnableDepthPeeling : 0); // No backface culling. + if (enablePeel) + cr->swapDepthAndPeel(); // Use previous depth buffer as peeling depth input. + + // Determine viewport tiling. + int tileCountX = (width + CR_MAXVIEWPORT_SIZE - 1) / CR_MAXVIEWPORT_SIZE; + int tileCountY = (height + CR_MAXVIEWPORT_SIZE - 1) / CR_MAXVIEWPORT_SIZE; + int tileSizeX = ((width + tileCountX - 1) / tileCountX + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + int tileSizeY = ((height + tileCountY - 1) / tileCountY + CR_TILE_SIZE - 1) & (-CR_TILE_SIZE); + TORCH_CHECK(tileCountX > 0 && tileCountY > 0 && tileSizeX > 0 && tileSizeY > 0, "internal error in tile size calculation: count or size is zero"); + TORCH_CHECK(tileSizeX <= CR_MAXVIEWPORT_SIZE && tileSizeY <= CR_MAXVIEWPORT_SIZE, "internal error in tile size calculation: tile larger than allowed"); + TORCH_CHECK((tileSizeX & (CR_TILE_SIZE - 1)) == 0 && (tileSizeY & (CR_TILE_SIZE - 1)) == 0, "internal error in tile size calculation: tile not divisible by ", CR_TILE_SIZE); + TORCH_CHECK(tileCountX * tileSizeX >= width && tileCountY * tileSizeY >= height, "internal error in tile size calculation: tiles do not cover viewport"); + + // Rasterize in tiles. + for (int tileY = 0; tileY < tileCountY; tileY++) + for (int tileX = 0; tileX < tileCountX; tileX++) + { + // Set CudaRaster viewport according to tile. + int offsetX = tileX * tileSizeX; + int offsetY = tileY * tileSizeY; + int sizeX = (width_out - offsetX) < tileSizeX ? (width_out - offsetX) : tileSizeX; + int sizeY = (height_out - offsetY) < tileSizeY ? (height_out - offsetY) : tileSizeY; + cr->setViewport(sizeX, sizeY, offsetX, offsetY); + + // Run all triangles in one batch. 
In case of error, the workload could be split into smaller batches - maybe do that in the future. + // Only enable peeling-specific optimizations to skip first stages when image fits in one tile. Those are not valid otherwise. + cr->deferredClear(0u); + bool success = cr->drawTriangles(rangesPtr, enablePeel && (tileCountX == 1 && tileCountY == 1), stream); + NVDR_CHECK(success, "subtriangle count overflow"); + } + + // Allocate output tensors. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({depth, height_out, width_out, 4}, opts); + torch::Tensor out_db = torch::empty({depth, height_out, width_out, 4}, opts); + + // Populate pixel shader kernel parameters. + RasterizeCudaFwdShaderParams p; + p.pos = posPtr; + p.tri = triPtr; + p.in_idx = (const int*)cr->getColorBuffer(); + p.out = out.data_ptr(); + p.out_db = out_db.data_ptr(); + p.numTriangles = triCount; + p.numVertices = posCount; + p.width_in = width; + p.height_in = height; + p.width_out = width_out; + p.height_out = height_out; + p.depth = depth; + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + p.xs = 2.f / (float)width_out; + p.xo = 1.f / (float)width_out - 1.f; + p.ys = 2.f / (float)height_out; + p.yo = 1.f / (float)height_out - 1.f; + + // Verify that buffers are aligned to allow float2/float4 operations. + NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.out_db & 15), "out_db output tensor not aligned to float4"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_WIDTH, RAST_CUDA_FWD_SHADER_KERNEL_BLOCK_HEIGHT, p.width_out, p.height_out); + dim3 gridSize = getLaunchGridSize(blockSize, p.width_out, p.height_out, p.depth); + + // Launch CUDA kernel. 
+ void* args[] = {&p}; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel((void*)RasterizeCudaFwdShaderKernel, gridSize, blockSize, args, 0, stream)); + + // Return. + return std::tuple(out, out_db); +} + +//------------------------------------------------------------------------ +// Gradient op. + +torch::Tensor rasterize_grad_db(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy, torch::Tensor ddb) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + RasterizeGradParams p; + bool enable_db = ddb.defined(); + + // Check inputs. + if (enable_db) + { + NVDR_CHECK_DEVICE(pos, tri, out, dy, ddb); + NVDR_CHECK_CONTIGUOUS(pos, tri, out); + NVDR_CHECK_F32(pos, out, dy, ddb); + NVDR_CHECK_I32(tri); + } + else + { + NVDR_CHECK_DEVICE(pos, tri, out, dy); + NVDR_CHECK_CONTIGUOUS(pos, tri, out); + NVDR_CHECK_F32(pos, out, dy); + NVDR_CHECK_I32(tri); + } + + // Determine instance mode. + p.instance_mode = (pos.sizes().size() > 2) ? 1 : 0; + + // Shape is taken from the rasterizer output tensor. + NVDR_CHECK(out.sizes().size() == 4, "tensor out must be rank-4"); + p.depth = out.size(0); + p.height = out.size(1); + p.width = out.size(2); + NVDR_CHECK(p.depth > 0 && p.height > 0 && p.width > 0, "resolution must be [>0, >0, >0]"); + + // Check other shapes. 
+ if (p.instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) == p.depth && pos.size(1) > 0 && pos.size(2) == 4, "pos must have shape [depth, >0, 4]"); + else + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "pos must have shape [>0, 4]"); + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + NVDR_CHECK(out.sizes().size() == 4 && out.size(0) == p.depth && out.size(1) == p.height && out.size(2) == p.width && out.size(3) == 4, "out must have shape [depth, height, width, 4]"); + NVDR_CHECK( dy.sizes().size() == 4 && dy.size(0) == p.depth && dy.size(1) == p.height && dy.size(2) == p.width && dy.size(3) == 4, "dy must have shape [depth, height, width, 4]"); + if (enable_db) + NVDR_CHECK(ddb.sizes().size() == 4 && ddb.size(0) == p.depth && ddb.size(1) == p.height && ddb.size(2) == p.width && ddb.size(3) == 4, "ddb must have shape [depth, height, width, 4]"); + + // Ensure gradients are contiguous. + torch::Tensor dy_ = dy.contiguous(); + torch::Tensor ddb_; + if (enable_db) + ddb_ = ddb.contiguous(); + + // Populate parameters. + p.numTriangles = tri.size(0); + p.numVertices = p.instance_mode ? pos.size(1) : pos.size(0); + p.pos = pos.data_ptr(); + p.tri = tri.data_ptr(); + p.out = out.data_ptr(); + p.dy = dy_.data_ptr(); + p.ddb = enable_db ? ddb_.data_ptr() : NULL; + + // Set up pixel position to clip space x, y transform. + p.xs = 2.f / (float)p.width; + p.xo = 1.f / (float)p.width - 1.f; + p.ys = 2.f / (float)p.height; + p.yo = 1.f / (float)p.height - 1.f; + + // Allocate output tensor for position gradients. + torch::Tensor grad = torch::zeros_like(pos); + p.grad = grad.data_ptr(); + + // Verify that buffers are aligned to allow float2/float4 operations. 
+ NVDR_CHECK(!((uintptr_t)p.pos & 15), "pos input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.ddb & 15), "ddb input tensor not aligned to float4"); + + // Choose launch parameters. + dim3 blockSize = getLaunchBlockSize(RAST_GRAD_MAX_KERNEL_BLOCK_WIDTH, RAST_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.width, p.height); + dim3 gridSize = getLaunchGridSize(blockSize, p.width, p.height, p.depth); + + // Launch CUDA kernel. + void* args[] = {&p}; + void* func = enable_db ? (void*)RasterizeGradKernelDb : (void*)RasterizeGradKernel; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func, gridSize, blockSize, args, 0, stream)); + + // Return the gradients. + return grad; +} + +// Version without derivatives. +torch::Tensor rasterize_grad(torch::Tensor pos, torch::Tensor tri, torch::Tensor out, torch::Tensor dy) +{ + torch::Tensor empty_tensor; + return rasterize_grad_db(pos, tri, out, dy, empty_tensor); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_rasterize_gl.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_rasterize_gl.cpp new file mode 100644 index 00000000..3776134a --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_rasterize_gl.cpp @@ -0,0 +1,132 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/rasterize_gl.h" +#include + +//------------------------------------------------------------------------ +// Python GL state wrapper methods. + +RasterizeGLStateWrapper::RasterizeGLStateWrapper(bool enableDB, bool automatic_, int cudaDeviceIdx_) +{ + pState = new RasterizeGLState(); + automatic = automatic_; + cudaDeviceIdx = cudaDeviceIdx_; + memset(pState, 0, sizeof(RasterizeGLState)); + pState->enableDB = enableDB ? 1 : 0; + rasterizeInitGLContext(NVDR_CTX_PARAMS, *pState, cudaDeviceIdx_); + releaseGLContext(); +} + +RasterizeGLStateWrapper::~RasterizeGLStateWrapper(void) +{ + setGLContext(pState->glctx); + rasterizeReleaseBuffers(NVDR_CTX_PARAMS, *pState); + releaseGLContext(); + destroyGLContext(pState->glctx); + delete pState; +} + +void RasterizeGLStateWrapper::setContext(void) +{ + setGLContext(pState->glctx); +} + +void RasterizeGLStateWrapper::releaseContext(void) +{ + releaseGLContext(); +} + +//------------------------------------------------------------------------ +// Forward op (OpenGL). + +std::tuple rasterize_fwd_gl(RasterizeGLStateWrapper& stateWrapper, torch::Tensor pos, torch::Tensor tri, std::tuple resolution, torch::Tensor ranges, int peeling_idx) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(pos)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + RasterizeGLState& s = *stateWrapper.pState; + + // Check inputs. + NVDR_CHECK_DEVICE(pos, tri); + NVDR_CHECK_CPU(ranges); + NVDR_CHECK_CONTIGUOUS(pos, tri, ranges); + NVDR_CHECK_F32(pos); + NVDR_CHECK_I32(tri, ranges); + + // Check that GL context was created for the correct GPU. + NVDR_CHECK(pos.get_device() == stateWrapper.cudaDeviceIdx, "GL context must must reside on the same device as input tensors"); + + // Determine number of outputs + int num_outputs = s.enableDB ? 2 : 1; + + // Determine instance mode and check input dimensions. 
+ bool instance_mode = pos.sizes().size() > 2; + if (instance_mode) + NVDR_CHECK(pos.sizes().size() == 3 && pos.size(0) > 0 && pos.size(1) > 0 && pos.size(2) == 4, "instance mode - pos must have shape [>0, >0, 4]"); + else + { + NVDR_CHECK(pos.sizes().size() == 2 && pos.size(0) > 0 && pos.size(1) == 4, "range mode - pos must have shape [>0, 4]"); + NVDR_CHECK(ranges.sizes().size() == 2 && ranges.size(0) > 0 && ranges.size(1) == 2, "range mode - ranges must have shape [>0, 2]"); + } + NVDR_CHECK(tri.sizes().size() == 2 && tri.size(0) > 0 && tri.size(1) == 3, "tri must have shape [>0, 3]"); + + // Get output shape. + int height = std::get<0>(resolution); + int width = std::get<1>(resolution); + int depth = instance_mode ? pos.size(0) : ranges.size(0); + NVDR_CHECK(height > 0 && width > 0, "resolution must be [>0, >0]"); + + // Get position and triangle buffer sizes in int32/float32. + int posCount = 4 * pos.size(0) * (instance_mode ? pos.size(1) : 1); + int triCount = 3 * tri.size(0); + + // Set the GL context unless manual context. + if (stateWrapper.automatic) + setGLContext(s.glctx); + + // Resize all buffers. + bool changes = false; + rasterizeResizeBuffers(NVDR_CTX_PARAMS, s, changes, posCount, triCount, width, height, depth); + if (changes) + { +#ifdef _WIN32 + // Workaround for occasional blank first frame on Windows. + releaseGLContext(); + setGLContext(s.glctx); +#endif + } + + // Copy input data to GL and render. + const float* posPtr = pos.data_ptr(); + const int32_t* rangesPtr = instance_mode ? 0 : ranges.data_ptr(); // This is in CPU memory. + const int32_t* triPtr = tri.data_ptr(); + int vtxPerInstance = instance_mode ? pos.size(1) : 0; + rasterizeRender(NVDR_CTX_PARAMS, s, stream, posPtr, posCount, vtxPerInstance, triPtr, triCount, rangesPtr, width, height, depth, peeling_idx); + + // Allocate output tensors. 
+ torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({depth, height, width, 4}, opts); + torch::Tensor out_db = torch::empty({depth, height, width, s.enableDB ? 4 : 0}, opts); + float* outputPtr[2]; + outputPtr[0] = out.data_ptr(); + outputPtr[1] = s.enableDB ? out_db.data_ptr() : NULL; + + // Copy rasterized results into CUDA buffers. + rasterizeCopyResults(NVDR_CTX_PARAMS, s, stream, outputPtr, width, height, depth); + + // Done. Release GL context and return. + if (stateWrapper.automatic) + releaseGLContext(); + + return std::tuple(out, out_db); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_texture.cpp b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_texture.cpp new file mode 100644 index 00000000..2257f566 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_texture.cpp @@ -0,0 +1,718 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" +#include "torch_types.h" +#include "../common/common.h" +#include "../common/texture.h" +#include + +//------------------------------------------------------------------------ +// Kernel prototypes. 
+ +void MipBuildKernel1 (const TextureKernelParams p); +void MipBuildKernel2 (const TextureKernelParams p); +void MipBuildKernel4 (const TextureKernelParams p); +void TextureFwdKernelNearest1 (const TextureKernelParams p); +void TextureFwdKernelNearest2 (const TextureKernelParams p); +void TextureFwdKernelNearest4 (const TextureKernelParams p); +void TextureFwdKernelLinear1 (const TextureKernelParams p); +void TextureFwdKernelLinear2 (const TextureKernelParams p); +void TextureFwdKernelLinear4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearest4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinear4 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest1 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest2 (const TextureKernelParams p); +void TextureFwdKernelCubeNearest4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinear4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearest4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinear4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapNearestBO2 (const TextureKernelParams p); +void 
TextureFwdKernelLinearMipmapNearestBO4 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO1 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO2 (const TextureKernelParams p); +void TextureFwdKernelLinearMipmapLinearBO4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapNearestBO4 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO1 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO2 (const TextureKernelParams p); +void TextureFwdKernelCubeLinearMipmapLinearBO4 (const TextureKernelParams p); +void MipGradKernel1 (const TextureKernelParams p); +void MipGradKernel2 (const TextureKernelParams p); +void MipGradKernel4 (const TextureKernelParams p); +void TextureGradKernelNearest (const TextureKernelParams p); +void TextureGradKernelLinear (const TextureKernelParams p); +void TextureGradKernelLinearMipmapNearest (const TextureKernelParams p); +void TextureGradKernelLinearMipmapLinear (const TextureKernelParams p); +void TextureGradKernelCubeNearest (const TextureKernelParams p); +void TextureGradKernelCubeLinear (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapNearest (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapLinear (const TextureKernelParams p); +void TextureGradKernelLinearMipmapNearestBO (const TextureKernelParams p); +void TextureGradKernelLinearMipmapLinearBO (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapNearestBO (const TextureKernelParams p); +void TextureGradKernelCubeLinearMipmapLinearBO (const TextureKernelParams p); + +//------------------------------------------------------------------------ +// Modeselektor. 
+ +static void set_modes(TextureKernelParams& p, int filter_mode, int boundary_mode, int max_mip_level) +{ + // Mip and filter modes. + p.filterMode = filter_mode; + NVDR_CHECK(p.filterMode >= 0 && p.filterMode < TEX_MODE_COUNT, "filter_mode unsupported"); + p.enableMip = (p.filterMode == TEX_MODE_LINEAR_MIPMAP_NEAREST || p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR); + + // Mip level clamp. + if (p.enableMip) + { + p.mipLevelLimit = max_mip_level; + NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level"); + } + + // Boundary mode. + p.boundaryMode = boundary_mode; + NVDR_CHECK(p.boundaryMode >= 0 && p.boundaryMode < TEX_BOUNDARY_MODE_COUNT, "boundary_mode unsupported"); +} + +//------------------------------------------------------------------------ +// Mipmap construction. + +TextureMipWrapper texture_construct_mip(torch::Tensor tex, int max_mip_level, bool cube_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + p.mipLevelLimit = max_mip_level; + p.boundaryMode = cube_mode ? TEX_BOUNDARY_MODE_CUBE : TEX_BOUNDARY_MODE_WRAP; + NVDR_CHECK(p.mipLevelLimit >= -1, "invalid max_mip_level"); + + // Check inputs. + NVDR_CHECK_DEVICE(tex); + NVDR_CHECK_CONTIGUOUS(tex); + NVDR_CHECK_F32(tex); + + // Populate parameters and sanity check tex shape. + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + } + p.texDepth = tex.size(0); + p.texHeight = tex.size(cube_mode ? 2 : 1); + p.texWidth = tex.size(cube_mode ? 
3 : 2); + p.channels = tex.size(cube_mode ? 4 : 3); + + // Set texture pointer. + p.tex[0] = tex.data_ptr(); + + // Generate mip offsets and calculate total size. + int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + + // Allocate and set mip tensor. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor mip = torch::empty({mipTotal}, opts); + float* pmip = mip.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Build mip levels. + for (int i=1; i <= p.mipLevelMax; i++) + { + int2 ms = mipLevelSize(p, i); + int3 sz = make_int3(ms.x, ms.y, p.texDepth); + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_MIP_KERNEL_BLOCK_HEIGHT, sz.x, sz.y); + dim3 gridSize = getLaunchGridSize(blockSize, sz.x, sz.y, sz.z * (cube_mode ? 6 : 1)); + p.mipLevelOut = i; + + void* build_func_tbl[3] = { (void*)MipBuildKernel1, (void*)MipBuildKernel2, (void*)MipBuildKernel4 }; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(build_func_tbl[channel_div_idx], gridSize, blockSize, args, 0, stream)); + } + + // Return the mip tensor in a wrapper. + TextureMipWrapper mip_wrapper; + mip_wrapper.mip = mip; + mip_wrapper.max_mip_level = max_mip_level; + mip_wrapper.texture_size = tex.sizes().vec(); + mip_wrapper.cube_mode = cube_mode; + return mip_wrapper; +} + +//------------------------------------------------------------------------ +// Forward op. 
+ +torch::Tensor texture_fwd_mip(torch::Tensor tex, torch::Tensor uv, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + bool has_mip_stack = (mip_stack.size() > 0); + torch::Tensor& mip_w = mip_wrapper.mip; // Unwrap. + int max_mip_level = has_mip_stack ? mip_stack.size() : mip_wrapper.max_mip_level; + set_modes(p, filter_mode, boundary_mode, max_mip_level); + + // See if we have these tensors or not. + bool has_uv_da = uv_da.defined() && uv_da.nbytes(); + bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes(); + + if (p.enableMip) + { + NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input"); + NVDR_CHECK(has_mip_stack || mip_w.defined(), "mipmapping filter mode requires mip wrapper or mip stack input"); + } + + // Check inputs. + NVDR_CHECK_DEVICE(tex, uv); + NVDR_CHECK_CONTIGUOUS(tex, uv); + NVDR_CHECK_F32(tex, uv); + if (p.enableMip) + { + if (has_mip_stack) + { + TORCH_CHECK(at::cuda::check_device(mip_stack), __func__, "(): Mip stack inputs must reside on the correct GPU device"); + nvdr_check_contiguous(mip_stack, __func__, "(): Mip stack inputs must be contiguous tensors"); + nvdr_check_f32(mip_stack, __func__, "(): Mip stack inputs must be float32 tensors"); + } + else + { + NVDR_CHECK_DEVICE(mip_w); + NVDR_CHECK_CONTIGUOUS(mip_w); + NVDR_CHECK_F32(mip_w); + } + if (has_uv_da) + { + NVDR_CHECK_DEVICE(uv_da); + NVDR_CHECK_CONTIGUOUS(uv_da); + NVDR_CHECK_F32(uv_da); + } + if (has_mip_level_bias) + { + NVDR_CHECK_DEVICE(mip_level_bias); + NVDR_CHECK_CONTIGUOUS(mip_level_bias); + NVDR_CHECK_F32(mip_level_bias); + } + } + + // Sanity checks and state setters. 
+ bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE); + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]"); + p.texHeight = tex.size(1); + p.texWidth = tex.size(2); + p.channels = tex.size(3); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + p.texHeight = tex.size(2); + p.texWidth = tex.size(3); + p.channels = tex.size(4); + } + NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv"); + NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large"); + p.n = uv.size(0); + p.imgHeight = uv.size(1); + p.imgWidth = uv.size(2); + p.texDepth = tex.size(0); + if (p.enableMip) + { + if (has_uv_da) + { + if (!cube_mode) + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]"); + else + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"); + } + if (has_mip_level_bias) + NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && 
mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]"); + } + + // Get input pointers. + p.tex[0] = tex.data_ptr(); + p.uv = uv.data_ptr(); + p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr() : NULL; + p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr() : NULL; + + // Allocate output tensor. + torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA); + torch::Tensor out = torch::empty({p.n, p.imgHeight, p.imgWidth, p.channels}, opts); + p.out = out.data_ptr(); + + // Choose kernel variants based on channel count. + void* args[] = {&p}; + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + float* pmip = 0; + if (p.enableMip) + { + if (has_mip_stack) + { + // Custom mip stack supplied. Check that sizes match and assign. + p.mipLevelMax = max_mip_level; + for (int i=1; i <= p.mipLevelMax; i++) + { + torch::Tensor& t = mip_stack[i-1]; + int2 sz = mipLevelSize(p, i); + if (!cube_mode) + NVDR_CHECK(t.sizes().size() == 4 && t.size(0) == tex.size(0) && t.size(1) == sz.y && t.size(2) == sz.x && t.size(3) == p.channels, "mip level size mismatch in custom mip stack"); + else + NVDR_CHECK(t.sizes().size() == 5 && t.size(0) == tex.size(0) && t.size(1) == 6 && t.size(2) == sz.y && t.size(3) == sz.x && t.size(4) == p.channels, "mip level size mismatch in mip stack"); + if (sz.x == 1 && sz.y == 1) + NVDR_CHECK(i == p.mipLevelMax, "mip level size mismatch in mip stack"); + p.tex[i] = t.data_ptr(); + } + } + else + { + // Generate mip offsets, check mipmap size, and set mip data pointer. 
+ int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + NVDR_CHECK(tex.sizes() == mip_wrapper.texture_size && cube_mode == mip_wrapper.cube_mode, "mip does not match texture size"); + NVDR_CHECK(mip_w.sizes().size() == 1 && mip_w.size(0) == mipTotal, "wrapped mip tensor size mismatch"); + pmip = mip_w.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2"); + if ((p.channels & 3) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.out & 15), "out output tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4"); + } + if ((p.channels & 1) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.out & 7), "out output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2"); + } + if (!cube_mode) + NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4"); + else + NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2"); + + // Choose launch parameters for texture lookup kernel. + dim3 blockSize = getLaunchBlockSize(TEX_FWD_MAX_KERNEL_BLOCK_WIDTH, TEX_FWD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + // Choose kernel based on filter mode, cube mode, bias-only mode, and datatype. 
+ void* func_tbl[TEX_MODE_COUNT * 2 * 2 * 3] = { + (void*)TextureFwdKernelNearest1, + (void*)TextureFwdKernelNearest2, + (void*)TextureFwdKernelNearest4, + (void*)TextureFwdKernelLinear1, + (void*)TextureFwdKernelLinear2, + (void*)TextureFwdKernelLinear4, + (void*)TextureFwdKernelLinearMipmapNearest1, + (void*)TextureFwdKernelLinearMipmapNearest2, + (void*)TextureFwdKernelLinearMipmapNearest4, + (void*)TextureFwdKernelLinearMipmapLinear1, + (void*)TextureFwdKernelLinearMipmapLinear2, + (void*)TextureFwdKernelLinearMipmapLinear4, + (void*)TextureFwdKernelCubeNearest1, + (void*)TextureFwdKernelCubeNearest2, + (void*)TextureFwdKernelCubeNearest4, + (void*)TextureFwdKernelCubeLinear1, + (void*)TextureFwdKernelCubeLinear2, + (void*)TextureFwdKernelCubeLinear4, + (void*)TextureFwdKernelCubeLinearMipmapNearest1, + (void*)TextureFwdKernelCubeLinearMipmapNearest2, + (void*)TextureFwdKernelCubeLinearMipmapNearest4, + (void*)TextureFwdKernelCubeLinearMipmapLinear1, + (void*)TextureFwdKernelCubeLinearMipmapLinear2, + (void*)TextureFwdKernelCubeLinearMipmapLinear4, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + (void*)TextureFwdKernelLinearMipmapNearestBO1, + (void*)TextureFwdKernelLinearMipmapNearestBO2, + (void*)TextureFwdKernelLinearMipmapNearestBO4, + (void*)TextureFwdKernelLinearMipmapLinearBO1, + (void*)TextureFwdKernelLinearMipmapLinearBO2, + (void*)TextureFwdKernelLinearMipmapLinearBO4, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO1, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO2, + (void*)TextureFwdKernelCubeLinearMipmapNearestBO4, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO1, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO2, + (void*)TextureFwdKernelCubeLinearMipmapLinearBO4, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; // Cube variant. + if (p.enableMip && !has_uv_da) + func_idx += TEX_MODE_COUNT * 2; // Bias-only variant. 
+ func_idx = func_idx * 3 + channel_div_idx; // Choose vector size. + + // Launch kernel. + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Return output tensor. + return out; +} + +// Version without mipmaps. +torch::Tensor texture_fwd(torch::Tensor tex, torch::Tensor uv, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + return texture_fwd_mip(tex, uv, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); +} + +//------------------------------------------------------------------------ +// Gradient op. + +std::tuple > texture_grad_linear_mipmap_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + const at::cuda::OptionalCUDAGuard device_guard(device_of(tex)); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TextureKernelParams p = {}; // Initialize all fields to zero. + bool has_mip_stack = (mip_stack.size() > 0); + torch::Tensor& mip_w = mip_wrapper.mip; // Unwrap. + int max_mip_level = has_mip_stack ? mip_stack.size() : mip_wrapper.max_mip_level; + set_modes(p, filter_mode, boundary_mode, max_mip_level); + + // See if we have these tensors or not. + bool has_uv_da = uv_da.defined() && uv_da.nbytes(); + bool has_mip_level_bias = mip_level_bias.defined() && mip_level_bias.nbytes(); + + if (p.enableMip) + { + NVDR_CHECK(has_uv_da || has_mip_level_bias, "mipmapping filter mode requires uv_da and/or mip_level_bias input"); + NVDR_CHECK(has_mip_stack || mip_w.defined(), "mipmapping filter mode requires mip wrapper or mip stack input"); + } + + // Check inputs. 
+ NVDR_CHECK_DEVICE(tex, uv); + NVDR_CHECK_CONTIGUOUS(tex, uv); + NVDR_CHECK_F32(tex, uv); + if (p.enableMip) + { + if (has_mip_stack) + { + TORCH_CHECK(at::cuda::check_device(mip_stack), __func__, "(): Mip stack inputs must reside on the correct GPU device"); + nvdr_check_contiguous(mip_stack, __func__, "(): Mip stack inputs must be contiguous tensors"); + nvdr_check_f32(mip_stack, __func__, "(): Mip stack inputs must be float32 tensors"); + } + else + { + NVDR_CHECK_DEVICE(mip_w); + NVDR_CHECK_CONTIGUOUS(mip_w); + NVDR_CHECK_F32(mip_w); + } + if (has_uv_da) + { + NVDR_CHECK_DEVICE(uv_da); + NVDR_CHECK_CONTIGUOUS(uv_da); + NVDR_CHECK_F32(uv_da); + } + if (has_mip_level_bias) + { + NVDR_CHECK_DEVICE(mip_level_bias); + NVDR_CHECK_CONTIGUOUS(mip_level_bias); + NVDR_CHECK_F32(mip_level_bias); + } + } + + // Sanity checks and state setters. + bool cube_mode = (boundary_mode == TEX_BOUNDARY_MODE_CUBE); + if (!cube_mode) + { + NVDR_CHECK(tex.sizes().size() == 4 && tex.size(0) > 0 && tex.size(1) > 0 && tex.size(2) > 0 && tex.size(3) > 0, "tex must have shape[>0, >0, >0, >0]"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 2, "uv must have shape [>0, >0, >0, 2]"); + p.texHeight = tex.size(1); + p.texWidth = tex.size(2); + p.channels = tex.size(3); + } + else + { + NVDR_CHECK(tex.sizes().size() == 5 && tex.size(0) > 0 && tex.size(1) == 6 && tex.size(2) > 0 && tex.size(3) > 0 && tex.size(4) > 0, "tex must have shape[>0, 6, >0, >0, >0] in cube map mode"); + NVDR_CHECK(uv.sizes().size() == 4 && uv.size(0) > 0 && uv.size(1) > 0 && uv.size(2) > 0 && uv.size(3) == 3, "uv must have shape [>0, >0, >0, 3] in cube map mode"); + NVDR_CHECK(tex.size(2) == tex.size(3), "texture shape must be square in cube map mode"); + p.texHeight = tex.size(2); + p.texWidth = tex.size(3); + p.channels = tex.size(4); + } + NVDR_CHECK(tex.size(0) == 1 || tex.size(0) == uv.size(0), "minibatch size mismatch between inputs tex, uv"); + 
NVDR_CHECK(p.texWidth <= (1 << TEX_MAX_MIP_LEVEL) && p.texHeight <= (1 << TEX_MAX_MIP_LEVEL), "texture size too large"); + p.n = uv.size(0); + p.imgHeight = uv.size(1); + p.imgWidth = uv.size(2); + p.texDepth = tex.size(0); + if (p.enableMip) + { + if (has_uv_da) + { + if (!cube_mode) + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 4, "uv_da must have shape [minibatch_size, height, width, 4]"); + else + NVDR_CHECK(uv_da.sizes().size() == 4 && uv_da.size(0) == p.n && uv_da.size(1) == p.imgHeight && uv_da.size(2) == p.imgWidth && uv_da.size(3) == 6, "uv_da must have shape [minibatch_size, height, width, 6] in cube map mode"); + } + if (has_mip_level_bias) + NVDR_CHECK(mip_level_bias.sizes().size() == 3 && mip_level_bias.size(0) == p.n && mip_level_bias.size(1) == p.imgHeight && mip_level_bias.size(2) == p.imgWidth, "mip_level_bias must have shape [minibatch_size, height, width]"); + } + NVDR_CHECK(dy.sizes().size() == 4 && dy.size(0) == p.n && dy.size(1) == p.imgHeight && dy.size(2) == p.imgWidth && dy.size(3) == p.channels, "dy must have shape [minibatch_size, height, width, channels]"); + + // Get contiguous version of dy. + torch::Tensor dy_ = dy.contiguous(); + + // Get input pointers. + p.tex[0] = tex.data_ptr(); + p.uv = uv.data_ptr(); + p.dy = dy_.data_ptr(); + p.uvDA = (p.enableMip && has_uv_da) ? uv_da.data_ptr() : NULL; + p.mipLevelBias = (p.enableMip && has_mip_level_bias) ? mip_level_bias.data_ptr() : NULL; + + // Allocate output tensor for tex gradient. + torch::Tensor grad_tex = torch::zeros_like(tex); + p.gradTex[0] = grad_tex.data_ptr(); + + // Allocate output tensor for uv gradient. + torch::Tensor grad_uv; + torch::Tensor grad_uv_da; + torch::Tensor grad_mip_level_bias; + if (p.filterMode != TEX_MODE_NEAREST) + { + grad_uv = torch::empty_like(uv); + p.gradUV = grad_uv.data_ptr(); + + // Gradients for things affecting mip level. 
+ if (p.filterMode == TEX_MODE_LINEAR_MIPMAP_LINEAR) + { + // Allocate output tensor for uv_da gradient. + if (has_uv_da) + { + grad_uv_da = torch::empty_like(uv_da); + p.gradUVDA = grad_uv_da.data_ptr(); + } + + // Allocate output tensor for mip_level_bias gradient. + if (has_mip_level_bias) + { + grad_mip_level_bias = torch::empty_like(mip_level_bias); + p.gradMipLevelBias = grad_mip_level_bias.data_ptr(); + } + } + } + + // Choose kernel variants based on channel count. + int channel_div_idx = 0; + if (!(p.channels & 3)) + channel_div_idx = 2; // Channel count divisible by 4. + else if (!(p.channels & 1)) + channel_div_idx = 1; // Channel count divisible by 2. + + // Mip-related setup. + torch::Tensor grad_mip; + std::vector grad_mip_stack; + float* pmip = 0; + float* pgradMip = 0; + if (p.enableMip) + { + if (has_mip_stack) + { + // Custom mip stack supplied. Check that sizes match, assign, construct gradient tensors. + p.mipLevelMax = max_mip_level; + for (int i=1; i <= p.mipLevelMax; i++) + { + torch::Tensor& t = mip_stack[i-1]; + int2 sz = mipLevelSize(p, i); + if (!cube_mode) + NVDR_CHECK(t.sizes().size() == 4 && t.size(0) == tex.size(0) && t.size(1) == sz.y && t.size(2) == sz.x && t.size(3) == p.channels, "mip level size mismatch in mip stack"); + else + NVDR_CHECK(t.sizes().size() == 5 && t.size(0) == tex.size(0) && t.size(1) == 6 && t.size(2) == sz.y && t.size(3) == sz.x && t.size(4) == p.channels, "mip level size mismatch in mip stack"); + if (sz.x == 1 && sz.y == 1) + NVDR_CHECK(i == p.mipLevelMax, "mip level size mismatch in mip stack"); + + torch::Tensor g = torch::zeros_like(t); + grad_mip_stack.push_back(g); + + p.tex[i] = t.data_ptr(); + p.gradTex[i] = g.data_ptr(); + } + } + else + { + // Generate mip offsets and get space for temporary mip gradients. 
+ int mipOffsets[TEX_MAX_MIP_LEVEL]; + int mipTotal = calculateMipInfo(NVDR_CTX_PARAMS, p, mipOffsets); + NVDR_CHECK(tex.sizes() == mip_wrapper.texture_size && cube_mode == mip_wrapper.cube_mode, "mip does not match texture size"); + NVDR_CHECK(mip_w.sizes().size() == 1 && mip_w.size(0) == mipTotal, "mip tensor size mismatch"); + grad_mip = torch::zeros_like(mip_w); + pmip = (float*)mip_w.data_ptr(); + pgradMip = grad_mip.data_ptr(); + for (int i=1; i <= p.mipLevelMax; i++) + { + p.tex[i] = pmip + mipOffsets[i]; // Pointers to mip levels. + p.gradTex[i] = pgradMip + mipOffsets[i]; // Pointers to mip gradients. + } + } + } + + // Verify that buffers are aligned to allow float2/float4 operations. Unused pointers are zero so always aligned. + if (!cube_mode) + { + NVDR_CHECK(!((uintptr_t)p.uv & 7), "uv input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradUV & 7), "grad_uv output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.uvDA & 15), "uv_da input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradUVDA & 15), "grad_uv_da output tensor not aligned to float4"); + } + else + { + NVDR_CHECK(!((uintptr_t)p.uvDA & 7), "uv_da input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradUVDA & 7), "grad_uv_da output tensor not aligned to float2"); + } + if ((p.channels & 3) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + { + NVDR_CHECK(!((uintptr_t)p.tex[i] & 15), "tex or mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)p.gradTex[i] & 15), "grad_tex output tensor not aligned to float4"); + } + NVDR_CHECK(!((uintptr_t)p.dy & 15), "dy input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pmip & 15), "mip input tensor not aligned to float4"); + NVDR_CHECK(!((uintptr_t)pgradMip & 15), "internal mip gradient tensor not aligned to float4"); + } + if ((p.channels & 1) == 0) + { + for (int i=0; i <= p.mipLevelMax; i++) + { + NVDR_CHECK(!((uintptr_t)p.tex[i] & 7), "tex or mip input tensor not 
aligned to float2"); + NVDR_CHECK(!((uintptr_t)p.gradTex[i] & 7), "grad_tex output tensor not aligned to float2"); + } + NVDR_CHECK(!((uintptr_t)p.dy & 7), "dy output tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pmip & 7), "mip input tensor not aligned to float2"); + NVDR_CHECK(!((uintptr_t)pgradMip & 7), "internal mip gradient tensor not aligned to float2"); + } + + // Choose launch parameters for main gradient kernel. + void* args[] = {&p}; + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_KERNEL_BLOCK_HEIGHT, p.imgWidth, p.imgHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.imgWidth, p.imgHeight, p.n); + + void* func_tbl[TEX_MODE_COUNT * 2 * 2] = { + (void*)TextureGradKernelNearest, + (void*)TextureGradKernelLinear, + (void*)TextureGradKernelLinearMipmapNearest, + (void*)TextureGradKernelLinearMipmapLinear, + (void*)TextureGradKernelCubeNearest, + (void*)TextureGradKernelCubeLinear, + (void*)TextureGradKernelCubeLinearMipmapNearest, + (void*)TextureGradKernelCubeLinearMipmapLinear, + NULL, + NULL, + (void*)TextureGradKernelLinearMipmapNearestBO, + (void*)TextureGradKernelLinearMipmapLinearBO, + NULL, + NULL, + (void*)TextureGradKernelCubeLinearMipmapNearestBO, + (void*)TextureGradKernelCubeLinearMipmapLinearBO, + }; + + // Function index. + int func_idx = p.filterMode; + if (cube_mode) + func_idx += TEX_MODE_COUNT; // Cube variant. + if (p.enableMip && !has_uv_da) + func_idx += TEX_MODE_COUNT * 2; // Bias-only variant. + + // Launch main gradient kernel. + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(func_tbl[func_idx], gridSize, blockSize, args, 0, stream)); + + // Launch kernel to pull gradients from mip levels. Don't do this if mip stack was supplied - individual level gradients are already there. 
+ if (p.enableMip && !has_mip_stack) + { + dim3 blockSize = getLaunchBlockSize(TEX_GRAD_MAX_MIP_KERNEL_BLOCK_WIDTH, TEX_GRAD_MAX_MIP_KERNEL_BLOCK_HEIGHT, p.texWidth, p.texHeight); + dim3 gridSize = getLaunchGridSize(blockSize, p.texWidth, p.texHeight, p.texDepth * (cube_mode ? 6 : 1)); + int sharedBytes = blockSize.x * blockSize.y * p.channels * sizeof(float); + + void* mip_grad_func_tbl[3] = { (void*)MipGradKernel1, (void*)MipGradKernel2, (void*)MipGradKernel4 }; + NVDR_CHECK_CUDA_ERROR(cudaLaunchKernel(mip_grad_func_tbl[channel_div_idx], gridSize, blockSize, args, sharedBytes, stream)); + } + + // Return output tensors. + return std::tuple >(grad_tex, grad_uv, grad_uv_da, grad_mip_level_bias, grad_mip_stack); +} + +// Version for nearest filter mode. +torch::Tensor texture_grad_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); + return std::get<0>(result); +} + +// Version for linear filter mode. +std::tuple texture_grad_linear(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, int filter_mode, int boundary_mode) +{ + torch::Tensor empty_tensor; + std::vector empty_vector; + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, empty_tensor, empty_tensor, TextureMipWrapper(), empty_vector, filter_mode, boundary_mode); + return std::tuple(std::get<0>(result), std::get<1>(result)); +} + +// Version for linear-mipmap-nearest mode. 
+std::tuple > texture_grad_linear_mipmap_nearest(torch::Tensor tex, torch::Tensor uv, torch::Tensor dy, torch::Tensor uv_da, torch::Tensor mip_level_bias, TextureMipWrapper mip_wrapper, std::vector mip_stack, int filter_mode, int boundary_mode) +{ + std::tuple > result = texture_grad_linear_mipmap_linear(tex, uv, dy, uv_da, mip_level_bias, mip_wrapper, mip_stack, filter_mode, boundary_mode); + return std::tuple >(std::get<0>(result), std::get<1>(result), std::get<4>(result)); +} + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_types.h b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_types.h new file mode 100644 index 00000000..8e389582 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/nvdiffrast/torch/torch_types.h @@ -0,0 +1,65 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// +// NVIDIA CORPORATION and its licensors retain all intellectual property +// and proprietary rights in and to this software, related documentation +// and any modifications thereto. Any use, reproduction, disclosure or +// distribution of this software and related documentation without an express +// license agreement from NVIDIA CORPORATION is strictly prohibited. + +#include "torch_common.inl" + +//------------------------------------------------------------------------ +// Python GL state wrapper. + +class RasterizeGLState; +class RasterizeGLStateWrapper +{ +public: + RasterizeGLStateWrapper (bool enableDB, bool automatic, int cudaDeviceIdx); + ~RasterizeGLStateWrapper (void); + + void setContext (void); + void releaseContext (void); + + RasterizeGLState* pState; + bool automatic; + int cudaDeviceIdx; +}; + +//------------------------------------------------------------------------ +// Python CudaRaster state wrapper. 
+ +namespace CR { class CudaRaster; } +class RasterizeCRStateWrapper +{ +public: + RasterizeCRStateWrapper (int cudaDeviceIdx); + ~RasterizeCRStateWrapper (void); + + CR::CudaRaster* cr; + int cudaDeviceIdx; +}; + +//------------------------------------------------------------------------ +// Mipmap wrapper to prevent intrusion from Python side. + +class TextureMipWrapper +{ +public: + torch::Tensor mip; + int max_mip_level; + std::vector texture_size; // For error checking. + bool cube_mode; // For error checking. +}; + + +//------------------------------------------------------------------------ +// Antialias topology hash wrapper to prevent intrusion from Python side. + +class TopologyHashWrapper +{ +public: + torch::Tensor ev_hash; +}; + +//------------------------------------------------------------------------ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/run_sample.sh b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/run_sample.sh new file mode 100644 index 00000000..3758865c --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/run_sample.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +function print_help { + echo "Usage: `basename $0` [--build-container] " + echo "" + echo "Option --build-container will build the Docker container based on" + echo "docker/Dockerfile and tag the image with gltorch:latest." 
+ echo "" + echo "Example: `basename $0` samples/torch/envphong.py" +} + +build_container=0 +sample="" +while [[ "$#" -gt 0 ]]; do + case $1 in + --build-container) build_container=1;; + -h|--help) print_help; exit 0 ;; + --*) echo "Unknown parameter passed: $1"; exit 1 ;; + *) sample="$1"; shift; break; + esac + shift +done + +rest=$@ + +# Build the docker container +if [ "$build_container" = "1" ]; then + docker build --tag gltorch:latest -f docker/Dockerfile . +fi + +if [ ! -f "$sample" ]; then + echo + echo "No python sample given or file '$sample' not found. Exiting." + exit 1 +fi + +image="gltorch:latest" + +echo "Using container image: $image" +echo "Running command: $sample $rest" + +# Run a sample with docker +docker run --rm -it --gpus all --user $(id -u):$(id -g) \ + -v `pwd`:/app --workdir /app -e TORCH_EXTENSIONS_DIR=/app/tmp $image python3 $sample $rest diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/NOTICE.txt b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/NOTICE.txt new file mode 100644 index 00000000..1c4fe0a6 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/NOTICE.txt @@ -0,0 +1,225 @@ + +Environment map stored as part of samples/data/envphong.npz is derived from a Wave Engine sample material originally shared under MIT License that is reproduced below. +Original material: https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap +Original license: https://github.com/WaveEngine/Samples/blob/master/LICENSE.md + +Mesh and texture stored as part of samples/data/earth.npz are derived from "3D Earth Photorealistic 2K" model originally made available under TurboSquid 3D Model License that is reproduced below. 
+Original material: https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125 +Original license: https://blog.turbosquid.com/turbosquid-3d-model-license/#3d-model-license + + + +MIT License + +Copyright (c) 2016 Wave Coorporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + + +TurboSquid 3D Model License + +This is a legally binding agreement between licensee ("you"), and TurboSquid regarding your rights to use 3D Models from the Site under this license. "You" refers to the purchasing entity, whether that is a natural person who must be at least 18 years of age, or a corporate entity. The rights granted in this agreement are granted to the purchasing entity, its parent company, and its majority owned affiliates on a "royalty free" basis, which means that after a Purchase, there are no future royalties or payments that are required. This agreement incorporates by reference the Terms of Use as well as the Site's policies and procedures as such. +I. 
Introduction & Definitions + +Definitions + +This agreement is intended to be easy to understand, and to provide clarity for using 3D Models in the work you create ("Creations"). Over the years, TurboSquid has been asked many questions about how 3D Models may be used in Creations, and we have attempted to answer those questions in this agreement. + +Some words in this agreement are given specific meanings. Words that appear initially in quotations, such as "you" and "Creations", are defined in the text preceding the word. Other capitalized words are defined below: + +"3D Model" is the collection of one or more digital files, packaged in the form of a product on the Site that can be identified by a 3D Model ID, and that is made available to you for Purchase on the Site. A 3D Model may include 3D Model files, geometry, texture maps, materials, motion captures, renderings and other constituent files related to the 3D Model data and its representation. + +"Site" refers to the TurboSquid websites, API's, software applications or any approved means or utility either currently in existence or in the future; the software and source code used by TurboSquid to provide such services; user interface layouts, designs, images, text, knowledgebase articles, program offers; site information provided in reports (such as popular keyword searches); and all other intellectual property protected under copyright, trademark, patent, publicity, or any other proprietary right. + +"Purchase" is the acquisition of a 3D Model by you from the Site under this agreement, whether as a purchase of 3D Model made available at a price of greater than $0, or a download of 3D Model made available at no charge. + +"TurboSquid" includes TurboSquid, Inc. and all licensed affiliates and partners that distribute 3D Models on behalf of TurboSquid, Inc. + +"Product Page" is the product page or interface that displays 3D Models available for Purchase on the Site. 
+ +"Computer Game" is a type of Creation that includes digital games, computer-based games, handheld electronic games, mobile games, online games, web-games, social games, game mods, and console-based games. + +"Imagery" is a Creation made of any single image or sequence of images. + +"Depicted Intellectual Property" means any intellectual property depicted in the 3D Model, including any copyright, trademark, trade dress, right of publicity, or any other proprietary right throughout the world that may apply. For purposes of clarity, this does not refer to the copyrights owned by the creator of the 3D Model that are licensed in this agreement. + +To make reading this agreement easier and less repetitive, the following constructions are used: + +"Include," including," and "such as" are considered to be followed with "but not limited to." Examples are used in this agreement to illustrate, rather than limit, the scope of the terms. + +"The following restrictions", "the foregoing restrictions", and "subject to the restrictions" are considered to be followed with "in addition to all other restrictions applicable within this agreement." +II. License Rights + +1. Ownership. TurboSquid does not grant title or ownership in 3D Models. All rights in 3D Models not expressly granted in this agreement are reserved by TurboSquid for itself and its licensors. + +2. Rights Granted. For 3D Models, TurboSquid grants to you a non-exclusive, perpetual, worldwide right and license to copy, distribute, reproduce, adapt, publicly display, publicly perform, digitally perform, transmit, broadcast, telecast, advertise, create derivative works, and market 3D Models within Creations in the uses authorized in this agreement. You may request authorization for a use not covered by this agreement ("New Use") by writing use@turbosquid.com. 
TurboSquid is authorized to approve a New Use if TurboSquid finds in its sole judgment that the New Use is substantially similar to another established use in this agreement and authorizes the New Use in writing. + +3. Rights Granted When Sharing 3D Models. If you Purchase as an employee of a corporate entity, sharing Purchased 3D Models with other employees of your corporate entity is allowed. Examples of allowed sharing include storing files on a networked hard drive, and aggregating 3D Models for later use in future Creations. You are responsible for any downstream distribution, use, or misuse by a recipient of a shared 3D Models. In all cases, sharing 3D Models with external people or entities is only allowed in the following situations, and with the following restrictions: + +a. In the production of a Creation owned by you, if you are working in collaboration with external parties, and there is a need to share 3D Models for the development and production of your Creation, sharing 3D Models with those external parties is allowed. Any external party that receives 3D Models may only use 3D Models on your Creations and must take reasonable care to secure and limit access to 3D Models to that purpose. + +b. In the production of a Creation owned by another entity ("your Client"), if you are working as a contractor and need to share 3D Models with your Client, or any external parties working with your Client, sharing 3D Models is allowed, subject to the restriction that all parties may use 3D Models only for your Client's particular Creation, and for successive versions of your Client's Creation, such as sequel Computer Games or movies that utilize the same 3D Models. All parties must take reasonable care to secure and limit access to 3D Models to the parties working on your Client's Creation. For all other use by any party, 3D Models must be Purchased again to create a new license agreement governing that use + +4. Editorial Use Restriction for Some 3D Models. 
The following restrictions apply to any 3D Model with an "Editorial Uses Only" label on its Product Page. Permitted use of Depicted Intellectual Property in such 3D Models is limited to news reporting in Creations of some cultural, editorial, journalistic, or otherwise newsworthy value, including news reporting on television and the internet. A second permitted use is use within an academic setting, limited to teaching, scholarship, and research. This restriction does not apply if you have the needed authorization to use the Depicted Intellectual Property for your Creation, such as if you are owner of the Depicted Intellectual Property, or the advertising team, hired party, or licensee of the Depicted Intellectual Property owner. + +5. Depicted Intellectual Property. TurboSquid does not own or license any Depicted Intellectual Property. TurboSquid does not in any way make any representations or warranties about Depicted Intellectual Property associated with 3D Models. You are solely responsible for determining the need for and, if appropriate, obtaining any needed clearance, consent, or release to use any Depicted Intellectual Property in your Creations. + +6. Creations of Imagery. + +Permitted Uses of Creations of Imagery. 
Subject to the following restrictions, you may use Creations of Imagery within news, film, movies, television programs, video projects, multi-media projects, theatrical display, software user interfaces; architectural renderings, Computer Games, virtual worlds, simulation and training environments; corporate communications, marketing collateral, tradeshow promotional items, booth decorations and presentations; pre-visualizations, product prototyping and research; mobile, web, print, television, and billboard advertising; online and electronic publications of blogs, literature, social media, and email campaigns; website designs and layouts, desktop and mobile wallpapers, screensavers, toolbar skins; books, magazines, posters, greeting cards; apparel items, brochures, framed or printed artwork, household items, office items, lenticular prints, product packaging and manufactured products. + +Restrictions on Permitted Uses of Creations of Imagery. + +a. Stock Media Clearinghouse. You may NOT publish or distribute Creations of Imagery through another stock media clearinghouse, for example as part of an online marketplace for photography, clip art, or design templates. + +b. Promotional Images. Images displayed for the promotion a 3D Model on its Product Page ("Promotional Images") may be used in Creations of Imagery, provided that the 3D Model itself has been Purchased and subject to the following restrictions: + +i. You may NOT use a Promotional Image that has any added element which is not included as part of the 3D Model. An example of this type of restricted use is if the 3D Model contains an airplane, and there is a Promotional Image of that airplane rendered over a blue sky; however, the blue sky image is not included as part of the 3D Model. Other prohibited examples include use of Promotional Images from movies or advertisements that may have used 3D Model. + +ii. 
You may NOT use any Promotional Image that has a logo, mark, watermark, attribution, copyright or other notice superimposed on the image without prior approval from TurboSquid Support. + +c. Business Logos. You may NOT use Imagery in any Creation that is a trademark, servicemark, or business logo. This restriction is included because the owners of these types of Creations typically seek exclusivity on the use of the imagery in their Creation, which is incompatible with the non-exclusive license granted to you under this agreement. + + +7. Creations of Computer Games and Software + +Permitted Uses in Creations of Computer Games and Software. Subject to the following restrictions, you may incorporate 3D Models in Creations of Computer Games, virtual worlds, simulation and training environments; mobile, desktop and web applications; and interactive electronic publications of literature such as e-books and electronic textbooks. + +Restrictions on Permitted Uses of 3D Models in Creations of Games and Software. + +a. Interactivity. Your inclusion of 3D Models within any such Creation is limited to uses where 3D Model is contained in an interactive experience for the user and not made available outside of the interactive experience. Such a permitted example of this use would be to include a 3D Model of human anatomy in a medical training application in a way that the 3D Model or its environment may be manipulated or interacted with. + +b. Access to 3D Models. You must take all reasonable and industry standard measures to incorporate 3D Models within Creations to prevent other parties from gaining access to 3D Models. 3D Models must be contained in proprietary formats so that they cannot be opened or imported in a publicly available software application or framework, or extracted without reverse engineering. WebGL exports from Unity, Unreal, and Lumberyard are permitted. 
Any other open format or format encrypted with decryptable open standards (such as an encrypted compression archive or other WebGL programs not listed here) are prohibited from using 3D Models. If your Creation uses WebGL and you are not sure if it qualifies, please contact use@turbosquid.com and describe your Creation in detail. + +c. Open Systems. You typically may NOT include 3D Models in Creations that have the general functionality for importing and/or exporting 3D Models. Please contact use@turbosquid.com and describe your Creation in detail if this is your desired use. An example of such a prohibited use is to include 3D Models as a starter library within a standard retail Software Creation that allows users to generally work with 3D Models, even if the 3D Model itself is somehow protected and is not capable of being exported. An allowed use is for custom or enterprise software in certain circumstances. + +d. Virtual Good Sales. You may NOT import, upload, reproduce, make available, publish, transmit, distribute, or sublicense 3D Models in Creations of virtual goods or worlds for any 3D community ("Virtual World"), unless you or your Client owns the Virtual World platform and it complies with the previous restrictions. + + +8. Creations of Physical Form. + +Permitted Uses in Creations of Physical Form. Subject to the following restrictions, you may use 3D Models to make Physical Creations such as 3D printed works, articles of manufacture, custom vehicles, furniture, jewelry, sculptural artwork, toys, and physical entertainment goods ("Creations of Physical Form"). + +Restrictions on Permitted Uses in Creations of Physical Form. + +a. Substantially Similar Creations. 
Permitted use of any Creation of Physical Form in which a 3D Model is untransformed or substantially similar to the 3D Model is limited to personal use, gifts, or charitable donations, with a maximum of 5 instances of such Creation per Purchase; unless the 3D Model is a small part of a much larger array of other physical objects in the Creation. For example, if you are creating a real-world, physical human skeleton for manufacture for sale, it is permitted to add a 3D printed human head that exactly resembles the Purchased 3D Model, but it is not permitted to sell the 3D printed head by itself. Another permitted example of a 3D Model being a small part of a larger array is using a 3D Model that ends up within an automobile as a part of the automobile. + +b. No Depicted Intellectual Property. You may NOT reproduce Depicted Intellectual Property in any Creation of Physical Form for any purpose. For example, you may NOT make Physical Form Creations of a copyrighted character (Spiderman, Elsa, Slimer), or branded technology (Apple, Toshiba, Samsung). + +9. 3D Industry Promotional Use. If TurboSquid has granted you, as a hardware or software partner, access to priced 3D Models on a free-of-charge basis, your use of 3D Models is restricted to internal testing for your 3D software or hardware products, and to the promotion of your software or hardware products with Creations of Imagery provided that an attribution of the artist's name and the Site are included. You agree that should any 3D Models be used outside of these purposes in ways that are normally allowed after a Purchase, that you will notify TurboSquid and promptly Purchase the 3D Models and otherwise comply with the terms herein. + +10. Unauthorized Use. If you use 3D Models in an unauthorized way, TurboSquid may terminate your account and pursue other penalties, damages, losses, and profits TurboSquid is entitled to under this agreement or at law or equity. 
The following are unauthorized uses that are explicitly prohibited: + +a. Competition. You may NOT use 3D Models in a way that competes with the Site, including distributing through 3D Model Clearinghouses. You may NOT publish, distribute, or make 3D Models available through any online clearinghouse infrastructure. You may not redistribute 3D Models as part of any design template, After Effects template, stock photography, video or clip art for distribution or licensing through any online stock media clearinghouse whatever. + +b. Re-Distribution. You may NOT re-distribute, publish, or make 3D Models available to any third party except in the form of a permitted Creation, or shared as authorized in this agreement. + +c. Group Buying. You may NOT aggregate funds to Purchase 3D Models with one or more other parties. An example of this prohibited use is a website membership where members pool their money to make a single Purchase that is shared by the members of the group. Each such member must Purchase individually. + +d. No Obscene or Unlawful Use. You may NOT use 3D Models for any defamatory, harassing, pornographic, obscene, or racist purpose, or to infringe any party's Depicted Intellectual Property rights. + +e. False Attribution. You may NOT misrepresent yourself as the creator of 3D Models. + +11. Resellers. The license granted herein is wholly transferable by an authorized reseller ("Reseller") to another party ("Transferee"). Each transferred license must be transferred entirely and all transferred 3D Models must be permanently deleted from the Reseller's systems after the transfer. When transferring the license, Reseller represents and warrants that the Reseller has the authority to bind the Transferee to these terms. The Reseller is jointly and severally responsible with any Transferee and each are liable for the transferee's use and compliance with TurboSquid's Terms of Use and Site's policies and procedures as well as any financial obligations hereunder. 
+III. License Term & Termination + +1. Term. Your right and license to 3D Models is perpetual, unless terminated as described herein. + +2. Termination. Your license grant is terminated immediately and without notice in the cases below. In such termination, you and any recipients of 3D Models must cease use, distribution, and destroy all copies of 3D Models. + +a. Reversal of Purchase. Your right and license to 3D Models are contingent on your Purchase of 3D Models. Any payment reversal of a Purchase for any reason immediately terminates all rights granted under this agreement. Potential Reasons for a payment reversal include: + +i. TurboSquid reverses your Purchase at your request. + +ii. TurboSquid receives a charge back or other notice from your bank or credit card cancelling your Purchase and/or withdrawing the funds used for your Purchase. + +iii. TurboSquid determines in its sole discretion that your Purchase was fraudulent. + +iv. When you are granted delayed payment terms, and fail to make payments such that TurboSquid sends you notice and terminates your account. + +b. Failure to Abide by the License Grant. Material failure to abide by the terms of this agreement immediately terminates your right and license to 3D Models. If you detect a violation of the license grant by you or any recipient of shared 3D Models, and promptly report the violation to agent@turbosquid.com, TurboSquid will make a good faith effort to find an appropriate remedy to preserve your license grant. +IV. Warranties + +You covenant, represent, and warrant to TurboSquid that: + + You have full right, power, legal capacity, and authority to enter into and perform this agreement, have obtained any third-party consent needed to do so, and, prior to any Purchase, had an opportunity to seek independent legal counsel. + You will not use 3D Models except pursuant to the terms of this agreement. 
Should you use 3D Models in an unauthorized way, you agree to any reasonable fee or penalty exercised by TurboSquid under this agreement or applicable law. + You will, prior to Purchase, determine the need for and, if appropriate, obtain any needed third-party clearance, consent, or release to use Depicted Intellectual Property shown in the digital rendering of 3D Models, and shall not use 3D Models to infringe any party's Depicted Intellectual Property rights. + You will immediately notify TurboSquid of any legal claim or challenge against your use of 3D Models or any other rights issue, before disclosing such issue to any third-party. + +V. Limitation of Liability + +1. 3D Models are provided on an "as is", "as available", and "with all faults" basis. TurboSquid makes no representations, warranties, conditions, or guarantees as to the usefulness, quality, suitability, truth, fitness for a particular purpose, non-infringement, merchantability, or cosmetic attributes of 3D Models, and does not guarantee the accuracy or completeness of specifications associated with 3D Models, including measurements, weight, durability, strength, materials, general physical properties, regulatory compliance, other engineering or construction attributes. + +2. TurboSquid disclaims all express or implied conditions, representations, and warranties of any kind regarding 3D Models, including any implied warranty or condition of merchantability. TurboSquid allows your Purchase to be refunded under certain reasonable time frames and conditions, subject to the Site's policies. + +3. You assume all risk for any damage to your computer systems and network for any damage to your computer system by obtaining 3D Models, including any damages resulting from computer viruses. + +4. 
To the fullest extent permitted by law, TurboSquid shall not be liable for (A) any direct, indirect, punitive, special, incidental, consequential, or exemplary damages (including loss of business, revenue, profits, goodwill, use, data, electronically transmitted orders, or other economic advantage) arising out of or in connection with 3D Models, even if TurboSquid has previously been advised of, or reasonably could have foreseen, the possibility of such damages, however they arise, whether in breach of contract or in tort (including negligence) or (B) any damages in excess of $1,000. To the extent that any jurisdiction does not allow the exclusion or limitation of direct, incidental, or consequential damages, portions of the preceding limitation or exclusion may not apply, but should be construed to the greatest extent applicable in such jurisdictions. Notwithstanding anything to the contrary herein, the TurboSquid indemnification obligation set forth below shall be limited to the following depending on the licensing tier: + +Tier 0: 3D Models acquired at free-of-charge are not indemnified. + +Tier 1: Standard License indemnity limitation is ten thousand ($10,000) dollars for all 3D Models acquired with payment. This indemnity is in aggregate for all 3D Models acquired under the Standard License. + +Tier 2: Small Business License indemnity limitation is two hundred and fifty thousand ($250,000) dollars for any 3D Model. This indemnity is in aggregate for all 3D Models acquired under the Small Business License. + +Tier 3: Enterprise License indemnity limitation is one million ($1,000,000) dollars for any 3D Model. This indemnity is in aggregate for all 3D Models acquired under the Enterprise License. + +For any 3D Model labeled Editorial, the above indemnities shall only apply if the model is properly used within the editorial license set forth herein (i.e. for news and editorial purposes in association with newsworthy media.) 
For use outside the Editorial scope, no indemnification from TurboSquid shall apply. + +5. You agree to indemnify and hold TurboSquid and its subsidiaries, affiliates, shareholders, officers, directors, agents, licensors, licensee, suppliers, alliance members, other partners, employees and representatives ("TurboSquid Parties") harmless from any claim or demand, including reasonable attorneys' fees, made by any third party due to, or arising out of your use of 3D Models or Creations. + +6. Subject to sections 4 and 5 above, TurboSquid shall indemnify, defend, and hold you harmless from and against any claim or demand, including reasonable attorneys' fees made by any third party for copyright or trademark infringement due to or arising out of your use of the 3D Models in accordance with these Terms, but excluding any modifications made by You, if such infringement was caused by the modification. This indemnity shall not apply to any 3D Model labeled for Editorial Use or a brand name, logo, or other Depicted Intellectual Property prior identified in a 3D Model. + +7. In the event of an indemnification claim by You, you agree to provide notice to TurboSquid within thirty days' of receiving any claim and allowing TurboSquid to fully control such claim, including but not limited to, selection of counsel, reasonable diligence into the claim, and if necessary litigation and/or settlement. Notice must be given via email to: agent@turbosquid.com. Notice is not considered made until it is acknowledged in writing by TurboSquid. +VI. Other Terms + +1. Entire Agreement. This agreement constitutes the entire agreement between you and TurboSquid relating to your Purchase, unless you have a corporate license agreement with TurboSquid. Corporate licenses are available with additional protections for additional fees. Please contact enterprise@turbosquid.com if your organization requires a corporate license. 
TurboSquid does not otherwise offer any other changes, additions, variations, or additional signed forms related to this agreement. No modification to this agreement will be binding, unless in writing and signed by an authorized TurboSquid representative. + +2. Material Breach and Injunction. + +Your rights hereunder vary by licensing tier as follows: + +For the Standard License, you agree that any material breach of these Terms will result in irreparable harm to TurboSquid for which damages would be an inadequate remedy and, therefore, in addition to its rights and remedies otherwise available at law, TurboSquid will be entitled to equitable relief, including both a preliminary and permanent injunction, if such a breach occurs. You waive any requirement for the posting of a bond or other security if TurboSquid seeks such an injunction. + +For the Enterprise License, TurboSquid may not seek injunctive relief hereunder for any 3D Model. It hereby waives all right to equitable and injunctive relief and its damages shall be limited to monetary damages. + +Notwithstanding anything to the contrary herein, TurboSquid would be irreparably harmed and shall be entitled to equitable relief including injunctive relief for any hacking, theft, or misuse of the Site. + +3. Import/Export Regulations. 3D Models may be subject to the U.S. export laws and the export or import laws of other countries. You agree to comply strictly with all such laws and, in particular, shall with 3D Models: (a) obtain any export, re-export, or import authorizations required by U.S. or Your local laws; (b) not design, develop or produce missile, chemical/biological, or nuclear weaponry; and (c) not provide 3D Models to prohibited countries and entities identified in the U.S. export regulations. + +4. Governing Law. This agreement is governed by New York law, excluding conflict of law principles. 
Any action or proceeding arising out of or related to this agreement must be brought in a state or federal court located in New York, New York, and both parties irrevocably submit to the exclusive jurisdiction of such courts. All notices, requests and other communications under this agreement must be in writing (e-mail messages shall be deemed writings). + +5. LIMITED INTERNAL USER ARBITRATION. You acknowledge and agree that TurboSquid may, in its sole discretion, arbitrate disputes between TurboSquid users involving 3D Models (including any purchaser or supplier of 3D Models), and such findings shall be final and non-appealable. Either party may request that TurboSquid arbitrate the dispute, or TurboSquid may elect, at its option, to arbitrate the dispute. After TurboSquid elects to arbitrate any dispute hereunder, TurboSquid will waive any rights to a commission from both the Purchase and arbitration, and the parties must keep the results and process confidential and may not disclose anything related to the dispute to any other party (whether by oral, written, or other type of disclosure). To resolve disputes, TurboSquid may decide to terminate or suspend users, revoke the license, offer replacement 3D Models, reestablish the licensee, or surrender or reallocate fees (whether by refund, charitable donation, or otherwise). TurboSquid may award up to 3X the Purchase price to either party depending on the circumstances. YOU UNDERSTAND, ACKNOWLEDGE, AND AGREE THAT ACCEPTING THIS ARBITRATION PROVISION WAIVES RIGHTS TO JUDICIAL RESOLUTION, TRIAL BY JURY AND RIGHTS YOU WOULD OTHERWISE HAVE IF YOU HAD NOT AGREED TO THIS ARBITRATION PROVISION. + +6. Notice. Any notice under this agreement shall be via email to agent@turbosquid.com, provided that you receive an acknowledgement email from a TurboSquid representative within 5 business days. If no such acknowledgement email is received, notice must be in writing and delivered by mail to the following address. 
+ +TurboSquid, Inc. +c/o TurboSquid Support +935 Gravier St., Suite 1600 +New Orleans, LA 70112 + +7. Assignment. TurboSquid may not assign its rights under this agreement without providing you notice, except in the case of a bankruptcy, merger, acquisition, sale of all or substantially all of TurboSquid's assets to a subsequent owner or operator, or similar event. + +Your assignment rights vary based on the licensing tier of your purchase: + +For the Standard License, you may not assign your rights under this agreement without the prior written consent of TurboSquid. + +For Small Business or Enterprise Licenses, you may assign your rights under this agreement without the notice and consent of TurboSquid. + +8. English. This agreement may be translated into other languages, but English is the official language of this agreement and in any conflict between the English language version and any other version, the English language version shall control. + +9. Publicity. The following advertising, marketing, and publicity rights are granted to TurboSquid for each licensing tier: + +Standard License purchases may be fully publicized by TurboSquid and you hereby grant TurboSquid the right to use you and your company's name, logo, and project name on the TurboSquid website and in its related marketing and advertising materials. + +Small Business and Enterprise License purchase may not be publicized by TurboSquid in any way without prior written permission of the purchaser. + +10. Time limitations on any claim hereunder. Any claim by you hereunder, including without limitation a claim for indemnification under section V must be made within two years of purchasing the 3D Model. + +This 3D Model License is effective for use with 3D Models for use on or after June 17, 2020. 
diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_c.npz b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_c.npz new file mode 100644 index 00000000..40826cd3 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_c.npz differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_d.npz b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_d.npz new file mode 100644 index 00000000..66479d1c Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_d.npz differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_p.npz b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_p.npz new file mode 100644 index 00000000..7eb9c308 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/cube_p.npz differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/earth.npz b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/earth.npz new file mode 100644 index 00000000..30df3782 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/earth.npz differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/envphong.npz b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/envphong.npz new file mode 100644 index 00000000..bc5fc9e3 Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/data/envphong.npz differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/cube.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/cube.py new file mode 100644 index 00000000..9ca5454a --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/cube.py @@ -0,0 +1,200 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import os +import sys +import pathlib + +import util +import tensorflow as tf + +sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast +import nvdiffrast.tensorflow as dr + +#---------------------------------------------------------------------------- +# Cube shape fitter. +#---------------------------------------------------------------------------- + +def fit_cube(max_iter = 5000, + resolution = 4, + discontinuous = False, + repeats = 1, + log_interval = 10, + display_interval = None, + display_res = 512, + out_dir = '.', + log_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + fn = 'cube_%s.npz' % ('d' if discontinuous else 'c') + with np.load(f'{datadir}/{fn}') as f: + pos_idx, vtxp, col_idx, vtxc = f.values() + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], vtxp.shape[0])) + + # Transformation matrix input to TF graph. + mtx_in = tf.placeholder(tf.float32, [4, 4]) + + # Setup TF graph for reference. + vtxw = np.concatenate([vtxp, np.ones([vtxp.shape[0], 1])], axis=1).astype(np.float32) + pos_clip = tf.matmul(vtxw, mtx_in, transpose_b=True)[tf.newaxis, ...] + rast_out, _ = dr.rasterize(pos_clip, pos_idx, resolution=[resolution, resolution], output_db=False) + color, _ = dr.interpolate(vtxc[tf.newaxis, ...], rast_out, col_idx) + color = dr.antialias(color, rast_out, pos_clip, pos_idx) + + # Optimized variables. 
+ vtxc_opt = tf.get_variable('vtxc', initializer=tf.zeros_initializer(), shape=vtxc.shape) + vtxp_opt = tf.get_variable('vtxp', initializer=tf.zeros_initializer(), shape=vtxp.shape) + + # Optimization variable setters for initialization. + vtxc_opt_in = tf.placeholder(tf.float32, vtxc.shape) + vtxp_opt_in = tf.placeholder(tf.float32, vtxp.shape) + opt_set = tf.group(tf.assign(vtxc_opt, vtxc_opt_in), tf.assign(vtxp_opt, vtxp_opt_in)) + + # Setup TF graph for what we optimize result. + vtxw_opt = tf.concat([vtxp_opt, tf.ones([vtxp.shape[0], 1], tf.float32)], axis=1) + pos_clip_opt = tf.matmul(vtxw_opt, mtx_in, transpose_b=True)[tf.newaxis, ...] + rast_out_opt, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[resolution, resolution], output_db=False) + color_opt, _ = dr.interpolate(vtxc_opt[tf.newaxis, ...], rast_out_opt, col_idx) + color_opt = dr.antialias(color_opt, rast_out_opt, pos_clip_opt, pos_idx) + + # Image-space loss and optimizer. + loss = tf.reduce_mean((color_opt - color)**2) + lr_in = tf.placeholder(tf.float32, []) + train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.999).minimize(loss, var_list=[vtxp_opt, vtxc_opt]) + + # Setup TF graph for display. + rast_out_disp, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[display_res, display_res], output_db=False) + color_disp, _ = dr.interpolate(vtxc_opt[tf.newaxis, ...], rast_out_disp, col_idx) + color_disp = dr.antialias(color_disp, rast_out_disp, pos_clip_opt, pos_idx) + rast_out_disp_ref, _ = dr.rasterize(pos_clip, pos_idx, resolution=[display_res, display_res], output_db=False) + color_disp_ref, _ = dr.interpolate(vtxc[tf.newaxis, ...], rast_out_disp_ref, col_idx) + color_disp_ref = dr.antialias(color_disp_ref, rast_out_disp_ref, pos_clip, pos_idx) + + # Geometric error calculation + geom_loss = tf.reduce_mean(tf.reduce_sum((tf.abs(vtxp_opt) - .5)**2, axis=1)**0.5) + + # Open log file. + log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None + + # Repeats. 
+ for rep in range(repeats): + + # Optimize. + ang = 0.0 + gl_avg = [] + util.init_uninitialized_vars() + for it in range(max_iter + 1): + # Initialize optimization. + if it == 0: + vtxp_init = np.random.uniform(-0.5, 0.5, size=vtxp.shape) + vtxp + vtxc_init = np.random.uniform(0.0, 1.0, size=vtxc.shape) + util.run(opt_set, {vtxc_opt_in: vtxc_init.astype(np.float32), vtxp_opt_in: vtxp_init.astype(np.float32)}) + + # Learning rate ramp. + lr = 1e-2 + lr = lr * max(0.01, 10**(-it*0.0005)) + + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4) + r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Run training and measure geometric error. + gl_val, _ = util.run([geom_loss, train_op], {mtx_in: r_mvp, lr_in: lr}) + gl_avg.append(gl_val) + + # Print/save log. + if log_interval and (it % log_interval == 0): + gl_val, gl_avg = np.mean(np.asarray(gl_avg)), [] + s = ("rep=%d," % rep) if repeats > 1 else "" + s += "iter=%d,err=%f" % (it, gl_val) + print(s) + if log_file: + log_file.write(s + "\n") + + # Show/save image. 
+ display_image = display_interval and (it % display_interval == 0) + save_image = imgsave_interval and (it % imgsave_interval == 0) + + if display_image or save_image: + ang = ang + 0.1 + img_o = util.run(color_opt, {mtx_in: r_mvp})[0] + img_b = util.run(color, {mtx_in: r_mvp})[0] + img_d = util.run(color_disp, {mtx_in: a_mvp})[0] + img_r = util.run(color_disp_ref, {mtx_in: a_mvp})[0] + + scl = display_res // img_o.shape[0] + img_b = np.repeat(np.repeat(img_b, scl, axis=0), scl, axis=1) + img_o = np.repeat(np.repeat(img_o, scl, axis=0), scl, axis=1) + result_image = np.concatenate([img_o, img_b, img_d, img_r], axis=1) + + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % it), result_image) + + # All repeats done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. +#---------------------------------------------------------------------------- + +def main(): + display_interval = 0 + discontinuous = False + resolution = 0 + + def usage(): + print("Usage: python cube.py [-v] [-discontinuous] resolution") + exit() + + for a in sys.argv[1:]: + if a == '-v': + display_interval = 100 + elif a == '-discontinuous': + discontinuous = True + elif a.isdecimal(): + resolution = int(a) + else: + usage() + + if resolution <= 0: + usage() + + # Initialize TensorFlow. + util.init_tf() + + # Run. + out_dir = 'out/cube_%s_%d' % (('d' if discontinuous else 'c'), resolution) + fit_cube(max_iter=5000, resolution=resolution, discontinuous=discontinuous, log_interval=10, display_interval=display_interval, out_dir=out_dir, log_fn='log.txt', imgsave_interval=1000, imgsave_fn='img_%06d.png') + + # Done. 
+ print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/earth.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/earth.py new file mode 100644 index 00000000..166cf458 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/earth.py @@ -0,0 +1,186 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import tensorflow as tf +import os +import sys +import pathlib + +import util + +sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast +import nvdiffrast.tensorflow as dr + +#---------------------------------------------------------------------------- +# Texture learning with/without mipmaps. 
+#---------------------------------------------------------------------------- + +def fit_earth(max_iter = 20000, + log_interval = 10, + display_interval = None, + display_res = 1024, + enable_mip = True, + res = 512, + ref_res = 4096, + lr_base = 1e-2, + lr_ramp = 0.1, + out_dir = '.', + log_fn = None, + texsave_interval = None, + texsave_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + # Mesh and texture adapted from "3D Earth Photorealistic 2K" model at + # https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125 + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/earth.npz') as f: + pos_idx, pos, uv_idx, uv, tex = f.values() + tex = tex.astype(np.float32)/255.0 + max_mip_level = 9 # Texture is a 4x3 atlas of 512x512 maps. + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Transformation matrix input to TF graph. + mtx_in = tf.placeholder(tf.float32, [4, 4]) + + # Learned texture. + tex_var = tf.get_variable('tex', initializer=tf.constant_initializer(0.2), shape=tex.shape) + + # Setup TF graph for reference rendering in high resolution. + pos_clip = tf.matmul(pos, mtx_in, transpose_b=True)[tf.newaxis, ...] + rast_out, rast_out_db = dr.rasterize(pos_clip, pos_idx, [ref_res, ref_res]) + texc, texd = dr.interpolate(uv[tf.newaxis, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all') + color = dr.texture(tex[np.newaxis], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + color = color * tf.clip_by_value(rast_out[..., -1:], 0, 1) # Mask out background. + + # Reduce the reference to correct size. + while color.shape[1] > res: + color = util.bilinear_downsample(color) + + # TF Graph for rendered candidate. + if enable_mip: + # With mipmaps. 
+ rast_out_opt, rast_out_db_opt = dr.rasterize(pos_clip, pos_idx, [res, res]) + texc_opt, texd_opt = dr.interpolate(uv[tf.newaxis, ...], rast_out_opt, uv_idx, rast_db=rast_out_db_opt, diff_attrs='all') + color_opt = dr.texture(tex_var[np.newaxis], texc_opt, texd_opt, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + else: + # No mipmaps: no image-space derivatives anywhere. + rast_out_opt, _ = dr.rasterize(pos_clip, pos_idx, [res, res], output_db=False) + texc_opt, _ = dr.interpolate(uv[tf.newaxis, ...], rast_out_opt, uv_idx) + color_opt = dr.texture(tex_var[np.newaxis], texc_opt, filter_mode='linear') + color_opt = color_opt * tf.clip_by_value(rast_out_opt[..., -1:], 0, 1) # Mask out background. + + # Measure only relevant portions of texture when calculating texture PSNR. + loss = tf.reduce_mean((color - color_opt)**2) + texmask = np.zeros_like(tex) + tr = tex.shape[1]//4 + texmask[tr+13:2*tr-13, 25:-25, :] += 1.0 + texmask[25:-25, tr+13:2*tr-13, :] += 1.0 + texloss = (tf.reduce_sum(texmask * (tex - tex_var)**2)/np.sum(texmask))**0.5 # RMSE within masked area. + + # Training driven by image-space loss. + lr_in = tf.placeholder(tf.float32, []) + train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.99).minimize(loss, var_list=[tex_var]) + + # Open log file. + log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None + + # Render. + ang = 0.0 + util.init_uninitialized_vars() + texloss_avg = [] + for it in range(max_iter + 1): + lr = lr_base * lr_ramp**(float(it)/float(max_iter)) + + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + ang = ang + 0.01 + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + dist = np.random.uniform(0.0, 48.5) + + # Modelview and modelview + projection matrices. 
+ proj = util.projection(x=0.4, n=1.0, f=200.0) + r_mv = np.matmul(util.translate(0, 0, -1.5-dist), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Run training and measure texture-space RMSE loss. + texloss_val, _ = util.run([texloss, train_op], {mtx_in: r_mvp, lr_in: lr}) + texloss_avg.append(texloss_val) + + # Print/save log. + if log_interval and (it % log_interval == 0): + texloss_val, texloss_avg = np.mean(np.asarray(texloss_avg)), [] + psnr = -10.0 * np.log10(texloss_val**2) # PSNR based on average RMSE. + s = "iter=%d,loss=%f,psnr=%f" % (it, texloss_val, psnr) + print(s) + if log_file: + log_file.write(s + '\n') + + # Show/save result images/textures. + display_image = display_interval and (it % display_interval) == 0 + save_image = imgsave_interval and (it % imgsave_interval) == 0 + save_texture = texsave_interval and (it % texsave_interval) == 0 + + if display_image or save_image: + result_image = util.run(color_opt, {mtx_in: a_mvp})[0] + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % it), result_image) + if save_texture: + util.save_image(out_dir + '/' + (texsave_fn % it), util.run(tex_var)[::-1]) + + # Done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. +#---------------------------------------------------------------------------- + +def main(): + display_interval = 0 + enable_mip = None + + def usage(): + print("Usage: python earth.py [-v] [-mip|-nomip]") + exit() + + for a in sys.argv[1:]: + if a == '-v': display_interval = 10 + elif a == '-mip': enable_mip = True + elif a == '-nomip': enable_mip = False + else: usage() + + if enable_mip is None: + usage() + + # Initialize TensorFlow. + util.init_tf() + + # Run. 
+ out_dir = 'out/earth_mip' if enable_mip else 'out/earth_nomip' + fit_earth(max_iter=20000, log_interval=10, display_interval=display_interval, enable_mip=enable_mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png') + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/envphong.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/envphong.py new file mode 100644 index 00000000..06b10218 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/envphong.py @@ -0,0 +1,181 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import tensorflow as tf +import os +import sys +import pathlib + +import util + +sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast +import nvdiffrast.tensorflow as dr + +#---------------------------------------------------------------------------- +# Environment map and Phong BRDF learning. 
+#---------------------------------------------------------------------------- + +def fit_env_phong(max_iter = 1000, + log_interval = 10, + display_interval = None, + display_res = 1024, + res = 1024, + lr_base = 1e-2, + lr_ramp = 1.0, + out_dir = '.', + log_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + # Texture adapted from https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/envphong.npz') as f: + pos_idx, pos, normals, env = f.values() + env = env.astype(np.float32)/255.0 + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Target Phong parameters. + phong_rgb = np.asarray([1.0, 0.8, 0.6], np.float32) + phong_exp = 25.0 + + # Inputs to TF graph. + mtx_in = tf.placeholder(tf.float32, [4, 4]) + invmtx_in = tf.placeholder(tf.float32, [4, 4]) # Inverse. + campos_in = tf.placeholder(tf.float32, [3]) # Camera position in world space. + lightdir_in = tf.placeholder(tf.float32, [3]) # Light direction. + + # Learned variables: environment maps, phong color, phong exponent. + env_var = tf.get_variable('env_var', initializer=tf.constant_initializer(0.5), shape=env.shape) + phong_var_raw = tf.get_variable('phong_var', initializer=tf.random_uniform_initializer(0.0, 1.0), shape=[4]) # R, G, B, exp. + phong_var = phong_var_raw * [1.0, 1.0, 1.0, 10.0] # Faster learning rate for the exponent. + + # Transform and rasterize. + viewvec = pos[..., :3] - campos_in[np.newaxis, np.newaxis, :] # View vectors at vertices. + reflvec = viewvec - 2.0 * normals[tf.newaxis, ...] * tf.reduce_sum(normals[tf.newaxis, ...] * viewvec, axis=-1, keepdims=True) # Reflection vectors at vertices. + reflvec = reflvec / tf.reduce_sum(reflvec**2, axis=-1, keepdims=True)**0.5 # Normalize. 
+ pos_clip = tf.matmul(pos, mtx_in, transpose_b=True)[tf.newaxis, ...] + rast_out, rast_out_db = dr.rasterize(pos_clip, pos_idx, [res, res]) + refl, refld = dr.interpolate(reflvec, rast_out, pos_idx, rast_db=rast_out_db, diff_attrs='all') # Interpolated reflection vectors. + + # Phong light. + refl = refl / tf.reduce_sum(refl**2, axis=-1, keepdims=True)**0.5 # Normalize. + ldotr = tf.reduce_sum(-lightdir_in * refl, axis=-1, keepdims=True) # L dot R. + + # Reference color. No need for AA because we are not learning geometry. + env = np.stack(env)[:, ::-1] + color = dr.texture(env[np.newaxis, ...], refl, refld, filter_mode='linear-mipmap-linear', boundary_mode='cube') + color = tf.reduce_sum(tf.stack(color), axis=0) + color = color + phong_rgb * tf.maximum(0.0, ldotr) ** phong_exp # Phong. + color = tf.maximum(color, 1.0 - tf.clip_by_value(rast_out[..., -1:], 0, 1)) # White background. + + # Candidate rendering same up to this point, but uses learned texture and Phong parameters instead. + color_opt = dr.texture(env_var[tf.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube') + color_opt = tf.reduce_sum(tf.stack(color_opt), axis=0) + color_opt = color_opt + phong_var[:3] * tf.maximum(0.0, ldotr) ** phong_var[3] # Phong. + color_opt = tf.maximum(color_opt, 1.0 - tf.clip_by_value(rast_out[..., -1:], 0, 1)) # White background. + + # Training. + loss = tf.reduce_mean((color - color_opt)**2) # L2 pixel loss. + lr_in = tf.placeholder(tf.float32, []) + train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.99).minimize(loss, var_list=[env_var, phong_var_raw]) + + # Open log file. + log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None + + # Render. + ang = 0.0 + util.init_uninitialized_vars() + imgloss_avg, phong_avg = [], [] + for it in range(max_iter + 1): + lr = lr_base * lr_ramp**(float(it)/float(max_iter)) + + # Random rotation/translation matrix for optimization. 
+ r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + ang = ang + 0.01 + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4, n=1.0, f=200.0) + r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Solve camera positions. + a_campos = np.linalg.inv(a_mv)[:3, 3] + r_campos = np.linalg.inv(r_mv)[:3, 3] + + # Random light direction. + lightdir = np.random.normal(size=[3]) + lightdir /= np.linalg.norm(lightdir) + 1e-8 + + # Run training and measure image-space RMSE loss. + imgloss_val, phong_val, _ = util.run([loss, phong_var, train_op], {mtx_in: r_mvp, invmtx_in: np.linalg.inv(r_mvp), campos_in: r_campos, lightdir_in: lightdir, lr_in: lr}) + imgloss_avg.append(imgloss_val**0.5) + phong_avg.append(phong_val) + + # Print/save log. + if log_interval and (it % log_interval == 0): + imgloss_val, imgloss_avg = np.mean(np.asarray(imgloss_avg, np.float32)), [] + phong_val, phong_avg = np.mean(np.asarray(phong_avg, np.float32), axis=0), [] + phong_rgb_rmse = np.mean((phong_val[:3] - phong_rgb)**2)**0.5 + phong_exp_rel_err = np.abs(phong_val[3] - phong_exp)/phong_exp + s = "iter=%d,phong_rgb_rmse=%f,phong_exp_rel_err=%f,img_rmse=%f" % (it, phong_rgb_rmse, phong_exp_rel_err, imgloss_val) + print(s) + if log_file: + log_file.write(s + '\n') + + # Show/save result image. 
+ display_image = display_interval and (it % display_interval == 0) + save_image = imgsave_interval and (it % imgsave_interval == 0) + + if display_image or save_image: + result_image = util.run(color_opt, {mtx_in: a_mvp, invmtx_in: np.linalg.inv(a_mvp), campos_in: a_campos, lightdir_in: lightdir})[0] + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % it), result_image) + + # Done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. +#---------------------------------------------------------------------------- + +def main(): + display_interval = 0 + for a in sys.argv[1:]: + if a == '-v': + display_interval = 10 + else: + print("Usage: python envphong.py [-v]") + exit() + + # Initialize TensorFlow. + util.init_tf() + + # Run. + fit_env_phong(max_iter=1500, log_interval=10, display_interval=display_interval, out_dir='out/env_phong', log_fn='log.txt', imgsave_interval=100, imgsave_fn='img_%06d.png') + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/pose.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/pose.py new file mode 100644 index 00000000..af8fca6e --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/pose.py @@ -0,0 +1,275 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. 
Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import tensorflow as tf +import os +import sys +import util +import pathlib + +sys.path.insert(0, os.path.join(sys.path[0], '../..')) # for nvdiffrast +import nvdiffrast.tensorflow as dr + +#---------------------------------------------------------------------------- +# Quaternion math. +#---------------------------------------------------------------------------- + +# Unit quaternion. +def q_unit(): + return np.asarray([1, 0, 0, 0], np.float32) + +# Get a random normalized quaternion. +def q_rnd(): + u, v, w = np.random.uniform(0.0, 1.0, size=[3]) + v *= 2.0 * np.pi + w *= 2.0 * np.pi + return np.asarray([(1.0-u)**0.5 * np.sin(v), (1.0-u)**0.5 * np.cos(v), u**0.5 * np.sin(w), u**0.5 * np.cos(w)], np.float32) + +# Get a random quaternion from the octahedral symmetric group S_4. +_r2 = 0.5**0.5 +_q_S4 = [[ 1.0, 0.0, 0.0, 0.0], [ 0.0, 1.0, 0.0, 0.0], [ 0.0, 0.0, 1.0, 0.0], [ 0.0, 0.0, 0.0, 1.0], + [-0.5, 0.5, 0.5, 0.5], [-0.5,-0.5,-0.5, 0.5], [ 0.5,-0.5, 0.5, 0.5], [ 0.5, 0.5,-0.5, 0.5], + [ 0.5, 0.5, 0.5, 0.5], [-0.5, 0.5,-0.5, 0.5], [ 0.5,-0.5,-0.5, 0.5], [-0.5,-0.5, 0.5, 0.5], + [ _r2,-_r2, 0.0, 0.0], [ _r2, _r2, 0.0, 0.0], [ 0.0, 0.0, _r2, _r2], [ 0.0, 0.0,-_r2, _r2], + [ 0.0, _r2, _r2, 0.0], [ _r2, 0.0, 0.0,-_r2], [ _r2, 0.0, 0.0, _r2], [ 0.0,-_r2, _r2, 0.0], + [ _r2, 0.0, _r2, 0.0], [ 0.0, _r2, 0.0, _r2], [ _r2, 0.0,-_r2, 0.0], [ 0.0,-_r2, 0.0, _r2]] +def q_rnd_S4(): + return np.asarray(_q_S4[np.random.randint(24)], np.float32) + +# Quaternion slerp. +def q_slerp(p, q, t): + d = np.dot(p, q) + if d < 0.0: + q = -q + d = -d + if d > 0.999: + a = p + t * (q-p) + return a / np.linalg.norm(a) + t0 = np.arccos(d) + tt = t0 * t + st = np.sin(tt) + st0 = np.sin(t0) + s1 = st / st0 + s0 = np.cos(tt) - d*s1 + return s0*p + s1*q + +# Quaterion scale (slerp vs. 
identity quaternion). +def q_scale(q, scl): + return q_slerp(q_unit(), q, scl) + +# Quaternion product. +def q_mul(p, q): + s1, V1 = p[0], p[1:] + s2, V2 = q[0], q[1:] + s = s1*s2 - np.dot(V1, V2) + V = s1*V2 + s2*V1 + np.cross(V1, V2) + return np.asarray([s, V[0], V[1], V[2]], np.float32) + +# Angular difference between two quaternions in degrees. +def q_angle_deg(p, q): + d = np.abs(np.dot(p, q)) + d = min(d, 1.0) + return np.degrees(2.0 * np.arccos(d)) + +# Quaternion product in TensorFlow. +def q_mul_tf(p, q): + a = p[0]*q[0] - p[1]*q[1] - p[2]*q[2] - p[3]*q[3] + b = p[0]*q[1] + p[1]*q[0] + p[2]*q[3] - p[3]*q[2] + c = p[0]*q[2] + p[2]*q[0] + p[3]*q[1] - p[1]*q[3] + d = p[0]*q[3] + p[3]*q[0] + p[1]*q[2] - p[2]*q[1] + return tf.stack([a, b, c, d]) + +# Convert quaternion to 4x4 rotation matrix. TensorFlow. +def q_to_mtx_tf(q): + r0 = tf.stack([1.0-2.0*q[1]**2 - 2.0*q[2]**2, 2.0*q[0]*q[1] - 2.0*q[2]*q[3], 2.0*q[0]*q[2] + 2.0*q[1]*q[3]]) + r1 = tf.stack([2.0*q[0]*q[1] + 2.0*q[2]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[2]**2, 2.0*q[1]*q[2] - 2.0*q[0]*q[3]]) + r2 = tf.stack([2.0*q[0]*q[2] - 2.0*q[1]*q[3], 2.0*q[1]*q[2] + 2.0*q[0]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[1]**2]) + rr = tf.transpose(tf.stack([r0, r1, r2]), [1, 0]) + rr = tf.concat([rr, tf.convert_to_tensor([[0], [0], [0]], tf.float32)], axis=1) # Pad right column. + rr = tf.concat([rr, tf.convert_to_tensor([[0, 0, 0, 1]], tf.float32)], axis=0) # Pad bottom row. + return rr + +#---------------------------------------------------------------------------- +# Cube pose fitter. 
+#---------------------------------------------------------------------------- + +def fit_pose(max_iter = 10000, + repeats = 1, + log_interval = 10, + display_interval = None, + display_res = 512, + lr_base = 0.01, + lr_falloff = 1.0, + nr_base = 1.0, + nr_falloff = 1e-4, + grad_phase_start = 0.5, + resolution = 256, + out_dir = '.', + log_fn = None, + imgsave_interval = None, + imgsave_fn = None): + + if out_dir: + os.makedirs(out_dir, exist_ok=True) + + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/cube_p.npz') as f: + pos_idx, pos, col_idx, col = f.values() + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Transformation matrix input to TF graph. + mtx_in = tf.placeholder(tf.float32, [4, 4]) + + # Pose matrix input to TF graph. + pose_in = tf.placeholder(tf.float32, [4]) # Quaternion. + noise_in = tf.placeholder(tf.float32, [4]) # Mollification noise. + + # Setup TF graph for reference. + mtx_total = tf.matmul(mtx_in, q_to_mtx_tf(pose_in)) + pos_clip = tf.matmul(pos, mtx_total, transpose_b=True)[tf.newaxis, ...] + rast_out, _ = dr.rasterize(pos_clip, pos_idx, resolution=[resolution, resolution], output_db=False) + color, _ = dr.interpolate(col[tf.newaxis, ...], rast_out, col_idx) + color = dr.antialias(color, rast_out, pos_clip, pos_idx) + + # Setup TF graph for optimization candidate. + pose_var = tf.get_variable('pose', initializer=tf.zeros_initializer(), shape=[4]) + pose_var_in = tf.placeholder(tf.float32, [4]) + pose_set = tf.assign(pose_var, pose_var_in) + pose_norm_op = tf.assign(pose_var, pose_var / tf.reduce_sum(pose_var**2)**0.5) # Normalization operation. + pose_total = q_mul_tf(pose_var, noise_in) + mtx_total_opt = tf.matmul(mtx_in, q_to_mtx_tf(pose_total)) + pos_clip_opt = tf.matmul(pos, mtx_total_opt, transpose_b=True)[tf.newaxis, ...] 
+ rast_out_opt, _ = dr.rasterize(pos_clip_opt, pos_idx, resolution=[resolution, resolution], output_db=False) + color_opt, _ = dr.interpolate(col[tf.newaxis, ...], rast_out_opt, col_idx) + color_opt = dr.antialias(color_opt, rast_out_opt, pos_clip_opt, pos_idx) + + # Image-space loss. + diff = (color_opt - color)**2 # L2 norm. + diff = tf.tanh(5.0 * tf.reduce_max(diff, axis=-1)) # Add some oomph to the loss. + loss = tf.reduce_mean(diff) + lr_in = tf.placeholder(tf.float32, []) + train_op = tf.train.AdamOptimizer(lr_in, 0.9, 0.999).minimize(loss, var_list=[pose_var]) + + # Open log file. + log_file = open(out_dir + '/' + log_fn, 'wt') if log_fn else None + + # Repeats. + for rep in range(repeats): + + # Optimize. + util.init_uninitialized_vars() + loss_best = np.inf + pose_best = None + for it in range(max_iter + 1): + # Modelview + projection matrix. + mvp = np.matmul(util.projection(x=0.4), util.translate(0, 0, -3.5)).astype(np.float32) + + # Learning and noise rate scheduling. + itf = 1.0 * it / max_iter + lr = lr_base * lr_falloff**itf + nr = nr_base * nr_falloff**itf + + # Noise input. + if itf >= grad_phase_start: + noise = q_unit() + else: + noise = q_scale(q_rnd(), nr) + noise = q_mul(noise, q_rnd_S4()) # Orientation noise. + + # Initialize optimization. + if it == 0: + pose_target = q_rnd() + util.run(pose_set, {pose_var_in: q_rnd()}) + util.run(pose_norm_op) + util.run(loss, {mtx_in: mvp, pose_in: pose_target, noise_in: noise}) # Pipecleaning pass. + + # Run gradient training step. + if itf >= grad_phase_start: + util.run(train_op, {mtx_in: mvp, pose_in: pose_target, noise_in: noise, lr_in: lr}) + util.run(pose_norm_op) + + # Measure image-space loss and update best found pose. + loss_val = util.run(loss, {mtx_in: mvp, pose_in: pose_target, noise_in: noise, lr_in: lr}) + if loss_val < loss_best: + pose_best = util.run(pose_total, {noise_in: noise}) + if loss_val > 0.0: + loss_best = loss_val + else: + # Return to best pose in the greedy phase. 
+ if itf < grad_phase_start: + util.run(pose_set, {pose_var_in: pose_best}) + + # Print/save log. + if log_interval and (it % log_interval == 0): + err = q_angle_deg(util.run(pose_var), pose_target) + ebest = q_angle_deg(pose_best, pose_target) + s = "rep=%d,iter=%d,err=%f,err_best=%f,loss=%f,loss_best=%f,lr=%f,nr=%f" % (rep, it, err, ebest, loss_val, loss_best, lr, nr) + print(s) + if log_file: + log_file.write(s + "\n") + + # Show/save image. + display_image = display_interval and (it % display_interval == 0) + save_image = imgsave_interval and (it % imgsave_interval == 0) + + if display_image or save_image: + img_ref, img_opt = util.run([color, color_opt], {mtx_in: mvp, pose_in: pose_target, noise_in: noise}) + img_best, = util.run([color_opt], {mtx_in: mvp, pose_in: pose_best, noise_in: q_unit()}) + img_ref = img_ref[0] + img_opt = img_opt[0] + img_best = img_best[0] + result_image = np.concatenate([img_ref, img_best, img_opt], axis=1) + + if display_image: + util.display_image(result_image, size=display_res, title='(%d) %d / %d' % (rep, it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % (rep, it)), result_image) + + # All repeats done. + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- +# Main function. +#---------------------------------------------------------------------------- + +def main(): + display_interval = 0 + repeats = 1 + + def usage(): + print("Usage: python pose.py [-v] [repeats]") + exit() + + for a in sys.argv[1:]: + if a == '-v': + display_interval = 10 + elif a.isascii() and a.isdecimal(): + repeats = int(a) + else: + usage() + + if repeats <= 0: + usage() + + # Initialize TensorFlow. + util.init_tf() + + # Run. + fit_pose(max_iter=1000, repeats=repeats, log_interval=100, display_interval=display_interval, out_dir='out/pose', log_fn='log.txt', imgsave_interval=1000, imgsave_fn='img_%03d_%06d.png') + + # Done. 
+ print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/triangle.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/triangle.py new file mode 100644 index 00000000..4d4c5442 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/triangle.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import imageio +import logging +import os +import numpy as np +import tensorflow as tf +import nvdiffrast.tensorflow as dr + +# Silence deprecation warnings and debug level logging +logging.getLogger('tensorflow').setLevel(logging.ERROR) +os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL', '1') + +pos = tf.convert_to_tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=tf.float32) +col = tf.convert_to_tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=tf.float32) +tri = tf.convert_to_tensor([[0, 1, 2]], dtype=tf.int32) + +rast, _ = dr.rasterize(pos, tri, resolution=[256, 256]) +out, _ = dr.interpolate(col, rast, tri) + +with tf.Session() as sess: + img = sess.run(out) + +img = img[0, ::-1, :, :] # Flip vertically. 
+img = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8) # Quantize to np.uint8 + +print("Saving to 'tri.png'.") +imageio.imsave('tri.png', img) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/util.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/util.py new file mode 100644 index 00000000..64fc2d9e --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/tensorflow/util.py @@ -0,0 +1,257 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + + +import os +import numpy as np +import tensorflow as tf + +# Silence deprecation warnings from TensorFlow 1.13 onwards +import logging +logging.getLogger('tensorflow').setLevel(logging.ERROR) + +from typing import Any, List + +#---------------------------------------------------------------------------- +# Projection and transformation matrix helpers. 
+#---------------------------------------------------------------------------- + +def projection(x=0.1, n=1.0, f=50.0): + return np.array([[n/x, 0, 0, 0], + [ 0, n/-x, 0, 0], + [ 0, 0, -(f+n)/(f-n), -(2*f*n)/(f-n)], + [ 0, 0, -1, 0]]).astype(np.float32) + +def translate(x, y, z): + return np.array([[1, 0, 0, x], + [0, 1, 0, y], + [0, 0, 1, z], + [0, 0, 0, 1]]).astype(np.float32) + +def rotate_x(a): + s, c = np.sin(a), np.cos(a) + return np.array([[1, 0, 0, 0], + [0, c, s, 0], + [0, -s, c, 0], + [0, 0, 0, 1]]).astype(np.float32) + +def rotate_y(a): + s, c = np.sin(a), np.cos(a) + return np.array([[ c, 0, s, 0], + [ 0, 1, 0, 0], + [-s, 0, c, 0], + [ 0, 0, 0, 1]]).astype(np.float32) + +def random_rotation_translation(t): + m = np.random.normal(size=[3, 3]) + m[1] = np.cross(m[0], m[2]) + m[2] = np.cross(m[0], m[1]) + m = m / np.linalg.norm(m, axis=1, keepdims=True) + m = np.pad(m, [[0, 1], [0, 1]], mode='constant') + m[3, 3] = 1.0 + m[:3, 3] = np.random.uniform(-t, t, size=[3]) + return m + +#---------------------------------------------------------------------------- +# Bilinear downsample by 2x. +#---------------------------------------------------------------------------- + +def bilinear_downsample(x): + w = tf.constant([[1, 3, 3, 1], [3, 9, 9, 3], [3, 9, 9, 3], [1, 3, 3, 1]], dtype=tf.float32) / 64.0 + w = w[..., tf.newaxis, tf.newaxis] * tf.eye(x.shape[-1].value, batch_shape=[1, 1]) + x = tf.nn.conv2d(x, w, strides=2, padding='SAME') + return x + +#---------------------------------------------------------------------------- +# Image display function using OpenGL. +#---------------------------------------------------------------------------- + +_glfw_window = None +def display_image(image, zoom=None, size=None, title=None): # HWC + # Import OpenGL and glfw. + import OpenGL.GL as gl + import glfw + + # Zoom image if requested. 
+ image = np.asarray(image) + if size is not None: + assert zoom is None + zoom = max(1, size // image.shape[0]) + if zoom is not None: + image = image.repeat(zoom, axis=0).repeat(zoom, axis=1) + height, width, channels = image.shape + + # Initialize window. + if title is None: + title = 'Debug window' + global _glfw_window + if _glfw_window is None: + glfw.init() + _glfw_window = glfw.create_window(width, height, title, None, None) + glfw.make_context_current(_glfw_window) + glfw.show_window(_glfw_window) + glfw.swap_interval(0) + else: + glfw.make_context_current(_glfw_window) + glfw.set_window_title(_glfw_window, title) + glfw.set_window_size(_glfw_window, width, height) + + # Update window. + glfw.poll_events() + gl.glClearColor(0, 0, 0, 1) + gl.glClear(gl.GL_COLOR_BUFFER_BIT) + gl.glWindowPos2f(0, 0) + gl.glPixelStorei(gl.GL_UNPACK_ALIGNMENT, 1) + gl_format = {3: gl.GL_RGB, 2: gl.GL_RG, 1: gl.GL_LUMINANCE}[channels] + gl_dtype = {'uint8': gl.GL_UNSIGNED_BYTE, 'float32': gl.GL_FLOAT}[image.dtype.name] + gl.glDrawPixels(width, height, gl_format, gl_dtype, image[::-1]) + glfw.swap_buffers(_glfw_window) + if glfw.window_should_close(_glfw_window): + return False + return True + +#---------------------------------------------------------------------------- +# Image save helper. +#---------------------------------------------------------------------------- + +def save_image(fn, x): + import imageio + x = np.rint(x * 255.0) + x = np.clip(x, 0, 255).astype(np.uint8) + imageio.imsave(fn, x) + +#---------------------------------------------------------------------------- + +# TensorFlow utilities + +#---------------------------------------------------------------------------- + +def _sanitize_tf_config(config_dict: dict = None) -> dict: + # Defaults. + cfg = dict() + cfg["rnd.np_random_seed"] = None # Random seed for NumPy. None = keep as is. + cfg["rnd.tf_random_seed"] = "auto" # Random seed for TensorFlow. 'auto' = derive from NumPy random state. None = keep as is. 
+ cfg["env.TF_CPP_MIN_LOG_LEVEL"] = "1" # 0 = Print all available debug info from TensorFlow. 1 = Print warnings and errors, but disable debug info. + cfg["env.HDF5_USE_FILE_LOCKING"] = "FALSE" # Disable HDF5 file locking to avoid concurrency issues with network shares. + cfg["graph_options.place_pruned_graph"] = True # False = Check that all ops are available on the designated device. True = Skip the check for ops that are not used. + cfg["gpu_options.allow_growth"] = True # False = Allocate all GPU memory at the beginning. True = Allocate only as much GPU memory as needed. + + # Remove defaults for environment variables that are already set. + for key in list(cfg): + fields = key.split(".") + if fields[0] == "env": + assert len(fields) == 2 + if fields[1] in os.environ: + del cfg[key] + + # User overrides. + if config_dict is not None: + cfg.update(config_dict) + return cfg + + +def init_tf(config_dict: dict = None) -> None: + """Initialize TensorFlow session using good default settings.""" + # Skip if already initialized. + if tf.get_default_session() is not None: + return + + # Setup config dict and random seeds. + cfg = _sanitize_tf_config(config_dict) + np_random_seed = cfg["rnd.np_random_seed"] + if np_random_seed is not None: + np.random.seed(np_random_seed) + tf_random_seed = cfg["rnd.tf_random_seed"] + if tf_random_seed == "auto": + tf_random_seed = np.random.randint(1 << 31) + if tf_random_seed is not None: + tf.set_random_seed(tf_random_seed) + + # Setup environment variables. + for key, value in cfg.items(): + fields = key.split(".") + if fields[0] == "env": + assert len(fields) == 2 + os.environ[fields[1]] = str(value) + + # Create default TensorFlow session. + create_session(cfg, force_as_default=True) + + +def assert_tf_initialized(): + """Check that TensorFlow session has been initialized.""" + if tf.get_default_session() is None: + raise RuntimeError("No default TensorFlow session found. 
Please call util.init_tf().") + + +def create_session(config_dict: dict = None, force_as_default: bool = False) -> tf.Session: + """Create tf.Session based on config dict.""" + # Setup TensorFlow config proto. + cfg = _sanitize_tf_config(config_dict) + config_proto = tf.ConfigProto() + for key, value in cfg.items(): + fields = key.split(".") + if fields[0] not in ["rnd", "env"]: + obj = config_proto + for field in fields[:-1]: + obj = getattr(obj, field) + setattr(obj, fields[-1], value) + + # Create session. + session = tf.Session(config=config_proto) + if force_as_default: + # pylint: disable=protected-access + session._default_session = session.as_default() + session._default_session.enforce_nesting = False + session._default_session.__enter__() + return session + + +def is_tf_expression(x: Any) -> bool: + """Check whether the input is a valid Tensorflow expression, i.e., Tensorflow Tensor, Variable, or Operation.""" + return isinstance(x, (tf.Tensor, tf.Variable, tf.Operation)) + + +def absolute_name_scope(scope: str) -> tf.name_scope: + """Forcefully enter the specified name scope, ignoring any surrounding scopes.""" + return tf.name_scope(scope + "/") + + +def init_uninitialized_vars(target_vars: List[tf.Variable] = None) -> None: + """Initialize all tf.Variables that have not already been initialized. + + Equivalent to the following, but more efficient and does not bloat the tf graph: + tf.variables_initializer(tf.report_uninitialized_variables()).run() + """ + assert_tf_initialized() + if target_vars is None: + target_vars = tf.global_variables() + + test_vars = [] + test_ops = [] + + with tf.control_dependencies(None): # ignore surrounding control_dependencies + for var in target_vars: + assert is_tf_expression(var) + + try: + tf.get_default_graph().get_tensor_by_name(var.name.replace(":0", "/IsVariableInitialized:0")) + except KeyError: + # Op does not exist => variable may be uninitialized. 
+ test_vars.append(var) + + with absolute_name_scope(var.name.split(":")[0]): + test_ops.append(tf.is_variable_initialized(var)) + + init_vars = [var for var, inited in zip(test_vars, run(test_ops)) if not inited] + run([var.initializer for var in init_vars]) + +def run(*args, **kwargs) -> Any: + """Run the specified ops in the default session.""" + assert_tf_initialized() + return tf.get_default_session().run(*args, **kwargs) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/cube.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/cube.py new file mode 100644 index 00000000..49e38768 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/cube.py @@ -0,0 +1,206 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import argparse +import os +import pathlib +import sys +import numpy as np +import torch +import imageio + +import util + +import nvdiffrast.torch as dr + +# Transform vertex positions to clip space +def transform_pos(mtx, pos): + t_mtx = torch.from_numpy(mtx).cuda() if isinstance(mtx, np.ndarray) else mtx + # (x,y,z) -> (x,y,z,1) + posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).cuda()], axis=1) + return torch.matmul(posw, t_mtx.t())[None, ...] 
+ +def render(glctx, mtx, pos, pos_idx, vtx_col, col_idx, resolution: int): + pos_clip = transform_pos(mtx, pos) + rast_out, _ = dr.rasterize(glctx, pos_clip, pos_idx, resolution=[resolution, resolution]) + color, _ = dr.interpolate(vtx_col[None, ...], rast_out, col_idx) + color = dr.antialias(color, rast_out, pos_clip, pos_idx) + return color + +def make_grid(arr, ncols=2): + n, height, width, nc = arr.shape + nrows = n//ncols + assert n == nrows*ncols + return arr.reshape(nrows, ncols, height, width, nc).swapaxes(1,2).reshape(height*nrows, width*ncols, nc) + +def fit_cube(max_iter = 5000, + resolution = 4, + discontinuous = False, + repeats = 1, + log_interval = 10, + display_interval = None, + display_res = 512, + out_dir = None, + log_fn = None, + mp4save_interval = None, + mp4save_fn = None, + use_opengl = False): + + log_file = None + writer = None + if out_dir: + os.makedirs(out_dir, exist_ok=True) + if log_fn: + log_file = open(f'{out_dir}/{log_fn}', 'wt') + if mp4save_interval != 0: + writer = imageio.get_writer(f'{out_dir}/{mp4save_fn}', mode='I', fps=30, codec='libx264', bitrate='16M') + else: + mp4save_interval = None + + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + fn = 'cube_%s.npz' % ('d' if discontinuous else 'c') + with np.load(f'{datadir}/{fn}') as f: + pos_idx, vtxp, col_idx, vtxc = f.values() + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], vtxp.shape[0])) + + # Create position/triangle index tensors + pos_idx = torch.from_numpy(pos_idx.astype(np.int32)).cuda() + col_idx = torch.from_numpy(col_idx.astype(np.int32)).cuda() + vtx_pos = torch.from_numpy(vtxp.astype(np.float32)).cuda() + vtx_col = torch.from_numpy(vtxc.astype(np.float32)).cuda() + + # Rasterizer context + # glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext() + glctx = dr.RasterizeCudaContext() + # Repeats. 
+ for rep in range(repeats): + + ang = 0.0 + gl_avg = [] + + vtx_pos_rand = np.random.uniform(-0.5, 0.5, size=vtxp.shape) + vtxp + vtx_col_rand = np.random.uniform(0.0, 1.0, size=vtxc.shape) + vtx_pos_opt = torch.tensor(vtx_pos_rand, dtype=torch.float32, device='cuda', requires_grad=True) + vtx_col_opt = torch.tensor(vtx_col_rand, dtype=torch.float32, device='cuda', requires_grad=True) + + # Adam optimizer for vertex position and color with a learning rate ramp. + optimizer = torch.optim.Adam([vtx_pos_opt, vtx_col_opt], lr=1e-2) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: max(0.01, 10**(-x*0.0005))) + + for it in range(max_iter + 1): + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4) + r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Compute geometric error for logging. + with torch.no_grad(): + geom_loss = torch.mean(torch.sum((torch.abs(vtx_pos_opt) - .5)**2, dim=1)**0.5) + gl_avg.append(float(geom_loss)) + + # Print/save log. + if log_interval and (it % log_interval == 0): + gl_val = np.mean(np.asarray(gl_avg)) + gl_avg = [] + s = ("rep=%d," % rep) if repeats > 1 else "" + s += "iter=%d,err=%f" % (it, gl_val) + print(s) + if log_file: + log_file.write(s + "\n") + + color = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_col, col_idx, resolution) + color_opt = render(glctx, r_mvp, vtx_pos_opt, pos_idx, vtx_col_opt, col_idx, resolution) + + # Compute loss and train. + loss = torch.mean((color - color_opt)**2) # L2 pixel loss. + optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() + + # Show/save image. 
+ display_image = display_interval and (it % display_interval == 0) + save_mp4 = mp4save_interval and (it % mp4save_interval == 0) + + if display_image or save_mp4: + ang = ang + 0.01 + + img_b = color[0].cpu().numpy()[::-1] + img_o = color_opt[0].detach().cpu().numpy()[::-1] + img_d = render(glctx, a_mvp, vtx_pos_opt, pos_idx, vtx_col_opt, col_idx, display_res)[0] + img_r = render(glctx, a_mvp, vtx_pos, pos_idx, vtx_col, col_idx, display_res)[0] + + scl = display_res // img_o.shape[0] + img_b = np.repeat(np.repeat(img_b, scl, axis=0), scl, axis=1) + img_o = np.repeat(np.repeat(img_o, scl, axis=0), scl, axis=1) + result_image = make_grid(np.stack([img_o, img_b, img_d.detach().cpu().numpy()[::-1], img_r.cpu().numpy()[::-1]])) + + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_mp4: + writer.append_data(np.clip(np.rint(result_image*255.0), 0, 255).astype(np.uint8)) + + # Done. + if writer is not None: + writer.close() + if log_file: + log_file.close() + +#---------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description='Cube fit example') + parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False) + parser.add_argument('--outdir', help='specify output directory', default='') + parser.add_argument('--discontinuous', action='store_true', default=False) + parser.add_argument('--resolution', type=int, default=0, required=True) + parser.add_argument('--display-interval', type=int, default=0) + parser.add_argument('--mp4save-interval', type=int, default=100) + parser.add_argument('--max-iter', type=int, default=1000) + args = parser.parse_args() + + # Set up logging. 
+ if args.outdir: + ds = 'd' if args.discontinuous else 'c' + out_dir = f'{args.outdir}/cube_{ds}_{args.resolution}' + print (f'Saving results under {out_dir}') + else: + out_dir = None + print ('No output directory specified, not saving log or images') + + # Run. + fit_cube( + max_iter=args.max_iter, + resolution=args.resolution, + discontinuous=args.discontinuous, + log_interval=10, + display_interval=args.display_interval, + out_dir=out_dir, + log_fn='log.txt', + mp4save_interval=args.mp4save_interval, + mp4save_fn='progress.mp4', + use_opengl=args.opengl + ) + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/earth.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/earth.py new file mode 100644 index 00000000..8101b96a --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/earth.py @@ -0,0 +1,209 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import argparse +import os +import pathlib +import sys +import numpy as np +import torch + +import util + +import nvdiffrast.torch as dr + +#---------------------------------------------------------------------------- +# Helpers. + +def transform_pos(mtx, pos): + t_mtx = torch.from_numpy(mtx).cuda() if isinstance(mtx, np.ndarray) else mtx + posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).cuda()], axis=1) + return torch.matmul(posw, t_mtx.t())[None, ...] 
+ +def render(glctx, mtx, pos, pos_idx, uv, uv_idx, tex, resolution, enable_mip, max_mip_level): + pos_clip = transform_pos(mtx, pos) + rast_out, rast_out_db = dr.rasterize(glctx, pos_clip, pos_idx, resolution=[resolution, resolution]) + + if enable_mip: + texc, texd = dr.interpolate(uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all') + color = dr.texture(tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level) + else: + texc, _ = dr.interpolate(uv[None, ...], rast_out, uv_idx) + color = dr.texture(tex[None, ...], texc, filter_mode='linear') + + color = color * torch.clamp(rast_out[..., -1:], 0, 1) # Mask out background. + return color + +#---------------------------------------------------------------------------- + +def fit_earth(max_iter = 20000, + log_interval = 10, + display_interval = None, + display_res = 1024, + enable_mip = True, + res = 512, + ref_res = 2048, # Dropped from 4096 to 2048 to allow using the Cuda rasterizer. + lr_base = 1e-2, + lr_ramp = 0.1, + out_dir = None, + log_fn = None, + texsave_interval = None, + texsave_fn = None, + imgsave_interval = None, + imgsave_fn = None, + use_opengl = False): + + log_file = None + if out_dir: + os.makedirs(out_dir, exist_ok=True) + if log_fn: + log_file = open(out_dir + '/' + log_fn, 'wt') + else: + imgsave_interval, texsave_interval = None, None + + # Mesh and texture adapted from "3D Earth Photorealistic 2K" model at + # https://www.turbosquid.com/3d-models/3d-realistic-earth-photorealistic-2k-1279125 + datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data' + with np.load(f'{datadir}/earth.npz') as f: + pos_idx, pos, uv_idx, uv, tex = f.values() + tex = tex.astype(np.float32)/255.0 + max_mip_level = 9 # Texture is a 4x3 atlas of 512x512 maps. + print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0])) + + # Some input geometry contains vertex positions in (N, 4) (with v[:,3]==1). 
Drop + # the last column in that case. + if pos.shape[1] == 4: pos = pos[:, 0:3] + + # Create position/triangle index tensors + pos_idx = torch.from_numpy(pos_idx.astype(np.int32)).cuda() + vtx_pos = torch.from_numpy(pos.astype(np.float32)).cuda() + uv_idx = torch.from_numpy(uv_idx.astype(np.int32)).cuda() + vtx_uv = torch.from_numpy(uv.astype(np.float32)).cuda() + + tex = torch.from_numpy(tex.astype(np.float32)).cuda() + tex_opt = torch.full(tex.shape, 0.2, device='cuda', requires_grad=True) + # glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext() + glctx = dr.RasterizeCudaContext() + + ang = 0.0 + + # Adam optimizer for texture with a learning rate ramp. + optimizer = torch.optim.Adam([tex_opt], lr=lr_base) + scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: lr_ramp**(float(x)/float(max_iter))) + + # Render. + ang = 0.0 + texloss_avg = [] + for it in range(max_iter + 1): + # Random rotation/translation matrix for optimization. + r_rot = util.random_rotation_translation(0.25) + + # Smooth rotation for display. + a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang)) + dist = np.random.uniform(0.0, 48.5) + + # Modelview and modelview + projection matrices. + proj = util.projection(x=0.4, n=1.0, f=200.0) + r_mv = np.matmul(util.translate(0, 0, -1.5-dist), r_rot) + r_mvp = np.matmul(proj, r_mv).astype(np.float32) + a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot) + a_mvp = np.matmul(proj, a_mv).astype(np.float32) + + # Measure texture-space RMSE loss + with torch.no_grad(): + texmask = torch.zeros_like(tex) + tr = tex.shape[1]//4 + texmask[tr+13:2*tr-13, 25:-25, :] += 1.0 + texmask[25:-25, tr+13:2*tr-13, :] += 1.0 + # Measure only relevant portions of texture when calculating texture + # PSNR. + texloss = (torch.sum(texmask * (tex - tex_opt)**2)/torch.sum(texmask))**0.5 # RMSE within masked area. + texloss_avg.append(float(texloss)) + + # Render reference and optimized frames. 
Always enable mipmapping for reference. + color = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex, ref_res, True, max_mip_level) + color_opt = render(glctx, r_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex_opt, res, enable_mip, max_mip_level) + + # Reduce the reference to correct size. + while color.shape[1] > res: + color = util.bilinear_downsample(color) + + # Compute loss and perform a training step. + loss = torch.mean((color - color_opt)**2) # L2 pixel loss. + optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() + + # Print/save log. + if log_interval and (it % log_interval == 0): + texloss_val = np.mean(np.asarray(texloss_avg)) + texloss_avg = [] + psnr = -10.0 * np.log10(texloss_val**2) # PSNR based on average RMSE. + s = "iter=%d,loss=%f,psnr=%f" % (it, texloss_val, psnr) + print(s) + if log_file: + log_file.write(s + '\n') + + # Show/save image. + display_image = display_interval and (it % display_interval == 0) + save_image = imgsave_interval and (it % imgsave_interval == 0) + save_texture = texsave_interval and (it % texsave_interval) == 0 + + if display_image or save_image: + ang = ang + 0.1 + + with torch.no_grad(): + result_image = render(glctx, a_mvp, vtx_pos, pos_idx, vtx_uv, uv_idx, tex_opt, res, enable_mip, max_mip_level)[0].cpu().numpy()[::-1] + + if display_image: + util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter)) + if save_image: + util.save_image(out_dir + '/' + (imgsave_fn % it), result_image) + + if save_texture: + texture = tex_opt.cpu().numpy()[::-1] + util.save_image(out_dir + '/' + (texsave_fn % it), texture) + + + # Done. 
+ if log_file: + log_file.close() + +#---------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser(description='Earth texture fitting example') + parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False) + parser.add_argument('--outdir', help='specify output directory', default='') + parser.add_argument('--mip', help='enable mipmapping', action='store_true', default=False) + parser.add_argument('--display-interval', type=int, default=0) + parser.add_argument('--max-iter', type=int, default=10000) + args = parser.parse_args() + + # Set up logging. + if args.outdir: + ms = 'mip' if args.mip else 'nomip' + out_dir = f'{args.outdir}/earth_{ms}' + print (f'Saving results under {out_dir}') + else: + out_dir = None + print ('No output directory specified, not saving log or images') + + # Run. + fit_earth(max_iter=args.max_iter, log_interval=10, display_interval=args.display_interval, enable_mip=args.mip, out_dir=out_dir, log_fn='log.txt', texsave_interval=1000, texsave_fn='tex_%06d.png', imgsave_interval=1000, imgsave_fn='img_%06d.png', use_opengl=args.opengl) + + # Done. + print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/envphong.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/envphong.py new file mode 100644 index 00000000..14732bf3 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/envphong.py @@ -0,0 +1,234 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. 
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.

import argparse
import numpy as np
import torch
import os
import sys
import pathlib
import imageio

import util

import nvdiffrast.torch as dr

#----------------------------------------------------------------------------
# Environment map and Phong BRDF learning.
#----------------------------------------------------------------------------

def fit_env_phong(max_iter          = 1000,
                  log_interval      = 10,
                  display_interval  = None,
                  display_res       = 1024,
                  res               = 1024,
                  lr_base           = 1e-2,
                  lr_ramp           = 1.0,
                  out_dir           = None,
                  log_fn            = None,
                  mp4save_interval  = None,
                  mp4save_fn        = None,
                  use_opengl        = False):
    """Jointly fit a cube-mapped environment texture and Phong BRDF parameters.

    Each iteration renders a reference image with the known environment map and
    fixed Phong terms, then optimizes a learned environment map (``env_var``)
    and a learned 4-vector (RGB reflection color + specular exponent) to match
    it under a random pose and random light direction.

    Args:
        max_iter: Number of optimization iterations (loop runs max_iter + 1 times).
        log_interval: Iterations between log lines; falsy disables logging.
        display_interval: Iterations between interactive previews; falsy disables.
        display_res: Size of the interactive preview window.
        res: Rendering resolution (square).
        lr_base: Initial Adam learning rate.
        lr_ramp: Total multiplicative LR decay over the run (1.0 = constant).
        out_dir: Output directory; None/'' disables all file output.
        log_fn: Log file name inside out_dir, or None.
        mp4save_interval: Iterations between saved video frames; 0 disables.
        mp4save_fn: Output video file name inside out_dir.
        use_opengl: Currently ignored; rasterization is forced to the CUDA backend below.

    NOTE(review): requires a CUDA device — every tensor is created on 'cuda'.
    """

    # Open the log file / video writer only when an output directory was given.
    log_file = None
    writer = None
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
        if log_fn:
            log_file = open(out_dir + '/' + log_fn, 'wt')
        if mp4save_interval != 0:
            writer = imageio.get_writer(f'{out_dir}/{mp4save_fn}', mode='I', fps=30, codec='libx264', bitrate='16M')
    else:
        mp4save_interval = None

    # Texture adapted from https://github.com/WaveEngine/Samples/tree/master/Materials/EnvironmentMap/Content/Assets/CubeMap.cubemap
    datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
    with np.load(f'{datadir}/envphong.npz') as f:
        pos_idx, pos, normals, env = f.values()
    env = env.astype(np.float32)/255.0
    # Flip vertically; presumably to match nvdiffrast's cube-map orientation — TODO confirm.
    env = np.stack(env)[:, ::-1].copy()
    print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0]))

    # Move all the stuff to GPU.
    pos_idx = torch.as_tensor(pos_idx, dtype=torch.int32, device='cuda')
    pos = torch.as_tensor(pos, dtype=torch.float32, device='cuda')
    normals = torch.as_tensor(normals, dtype=torch.float32, device='cuda')
    env = torch.as_tensor(env, dtype=torch.float32, device='cuda')

    # Target Phong parameters.
    phong_rgb = np.asarray([1.0, 0.8, 0.6], np.float32)
    phong_exp = 25.0
    phong_rgb_t = torch.as_tensor(phong_rgb, dtype=torch.float32, device='cuda')

    # Learned variables: environment maps, phong color, phong exponent.
    env_var = torch.ones_like(env) * .5
    env_var.requires_grad_()
    phong_var_raw = torch.as_tensor(np.random.uniform(size=[4]), dtype=torch.float32, device='cuda')
    phong_var_raw.requires_grad_()
    # Last channel (exponent) scaled by 10 — presumably to balance gradient magnitudes; confirm.
    phong_var_mul = torch.as_tensor([1.0, 1.0, 1.0, 10.0], dtype=torch.float32, device='cuda')

    # Render.
    ang = 0.0
    imgloss_avg, phong_avg = [], []

    # glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()
    # NOTE(review): CUDA rasterization is forced here, ignoring use_opengl (local WSL-compatibility change).
    glctx = dr.RasterizeCudaContext()
    zero_tensor = torch.as_tensor(0.0, dtype=torch.float32, device='cuda')
    one_tensor = torch.as_tensor(1.0, dtype=torch.float32, device='cuda')

    # Adam optimizer for environment map and phong with a learning rate ramp.
    optimizer = torch.optim.Adam([env_var, phong_var_raw], lr=lr_base)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda x: lr_ramp**(float(x)/float(max_iter)))

    for it in range(max_iter + 1):
        phong_var = phong_var_raw * phong_var_mul

        # Random rotation/translation matrix for optimization.
        r_rot = util.random_rotation_translation(0.25)

        # Smooth rotation for display.
        ang = ang + 0.01
        a_rot = np.matmul(util.rotate_x(-0.4), util.rotate_y(ang))

        # Modelview and modelview + projection matrices.
        proj = util.projection(x=0.4, n=1.0, f=200.0)
        r_mv = np.matmul(util.translate(0, 0, -3.5), r_rot)
        r_mvp = np.matmul(proj, r_mv).astype(np.float32)
        a_mv = np.matmul(util.translate(0, 0, -3.5), a_rot)
        a_mvp = np.matmul(proj, a_mv).astype(np.float32)
        # Keep a NumPy copy of the display MVP; used below to transform the display light direction.
        a_mvc = a_mvp
        r_mvp = torch.as_tensor(r_mvp, dtype=torch.float32, device='cuda')
        a_mvp = torch.as_tensor(a_mvp, dtype=torch.float32, device='cuda')

        # Solve camera positions.
        a_campos = torch.as_tensor(np.linalg.inv(a_mv)[:3, 3], dtype=torch.float32, device='cuda')
        r_campos = torch.as_tensor(np.linalg.inv(r_mv)[:3, 3], dtype=torch.float32, device='cuda')

        # Random light direction.
        lightdir = np.random.normal(size=[3])
        lightdir /= np.linalg.norm(lightdir) + 1e-8
        lightdir = torch.as_tensor(lightdir, dtype=torch.float32, device='cuda')

        def render_refl(ldir, cpos, mvp):
            # Transform and rasterize.
            viewvec = pos[..., :3] - cpos[np.newaxis, np.newaxis, :] # View vectors at vertices.
            reflvec = viewvec - 2.0 * normals[np.newaxis, ...] * torch.sum(normals[np.newaxis, ...] * viewvec, -1, keepdim=True) # Reflection vectors at vertices.
            reflvec = reflvec / torch.sum(reflvec**2, -1, keepdim=True)**0.5 # Normalize.
            pos_clip = torch.matmul(pos, mvp.t())[np.newaxis, ...]
            rast_out, rast_out_db = dr.rasterize(glctx, pos_clip, pos_idx, [res, res])
            refl, refld = dr.interpolate(reflvec, rast_out, pos_idx, rast_db=rast_out_db, diff_attrs='all') # Interpolated reflection vectors.

            # Phong light.
            refl = refl / (torch.sum(refl**2, -1, keepdim=True) + 1e-8)**0.5  # Normalize.
            ldotr = torch.sum(-ldir * refl, -1, keepdim=True) # L dot R.

            # Return reflections, their screen-space derivatives, L.R, and a background mask.
            return refl, refld, ldotr, (rast_out[..., -1:] == 0)

        # Render the reflections.
        refl, refld, ldotr, mask = render_refl(lightdir, r_campos, r_mvp)

        # Reference color. No need for AA because we are not learning geometry.
        color = dr.texture(env[np.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube')
        color = color + phong_rgb_t * torch.max(zero_tensor, ldotr) ** phong_exp # Phong.
        color = torch.where(mask, one_tensor, color) # White background.

        # Candidate rendering same up to this point, but uses learned texture and Phong parameters instead.
        color_opt = dr.texture(env_var[np.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube')
        color_opt = color_opt + phong_var[:3] * torch.max(zero_tensor, ldotr) ** phong_var[3] # Phong.
        color_opt = torch.where(mask, one_tensor, color_opt) # White background.

        # Compute loss and train.
        loss = torch.mean((color - color_opt)**2) # L2 pixel loss.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Collect losses.
        imgloss_avg.append(loss.detach().cpu().numpy())
        phong_avg.append(phong_var.detach().cpu().numpy())

        # Print/save log.
        if log_interval and (it % log_interval == 0):
            imgloss_val, imgloss_avg = np.mean(np.asarray(imgloss_avg, np.float32)), []
            phong_val, phong_avg = np.mean(np.asarray(phong_avg, np.float32), axis=0), []
            phong_rgb_rmse = np.mean((phong_val[:3] - phong_rgb)**2)**0.5
            phong_exp_rel_err = np.abs(phong_val[3] - phong_exp)/phong_exp
            s = "iter=%d,phong_rgb_rmse=%f,phong_exp_rel_err=%f,img_rmse=%f" % (it, phong_rgb_rmse, phong_exp_rel_err, imgloss_val)
            print(s)
            if log_file:
                log_file.write(s + '\n')

        # Show/save result image.
        display_image = display_interval and (it % display_interval == 0)
        save_mp4 = mp4save_interval and (it % mp4save_interval == 0)

        if display_image or save_mp4:
            # Fixed light direction for the display rendering, rotated into the display pose.
            lightdir = np.asarray([.8, -1., .5, 0.0])
            lightdir = np.matmul(a_mvc, lightdir)[:3]
            lightdir /= np.linalg.norm(lightdir)
            lightdir = torch.as_tensor(lightdir, dtype=torch.float32, device='cuda')
            refl, refld, ldotr, mask = render_refl(lightdir, a_campos, a_mvp)
            color_opt = dr.texture(env_var[np.newaxis, ...], refl, uv_da=refld, filter_mode='linear-mipmap-linear', boundary_mode='cube')
            color_opt = color_opt + phong_var[:3] * torch.max(zero_tensor, ldotr) ** phong_var[3]
            color_opt = torch.where(mask, one_tensor, color_opt)
            result_image = color_opt.detach()[0].cpu().numpy()[::-1]
            if display_image:
                util.display_image(result_image, size=display_res, title='%d / %d' % (it, max_iter))
            if save_mp4:
                writer.append_data(np.clip(np.rint(result_image*255.0), 0, 255).astype(np.uint8))

    # Done.
    if writer is not None:
        writer.close()
    if log_file:
        log_file.close()

#----------------------------------------------------------------------------
# Main function.
#----------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(description='Environment map fitting example')
    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
    parser.add_argument('--outdir', help='specify output directory', default='')
    parser.add_argument('--display-interval', type=int, default=0)
    parser.add_argument('--mp4save-interval', type=int, default=10)
    parser.add_argument('--max-iter', type=int, default=5000)
    args = parser.parse_args()

    # Set up logging.
    if args.outdir:
        out_dir = f'{args.outdir}/env_phong'
        print (f'Saving results under {out_dir}')
    else:
        out_dir = None
        print ('No output directory specified, not saving log or images')

    # Run.
# Unit quaternion.
def q_unit():
    """Return the identity quaternion [1, 0, 0, 0] as a float32 array."""
    return np.asarray([1, 0, 0, 0], np.float32)

# Get a random normalized quaternion.
def q_rnd():
    """Draw a random unit quaternion (uniform on the 3-sphere)."""
    u, v, w = np.random.uniform(0.0, 1.0, size=[3])
    theta1 = 2.0 * np.pi * v
    theta2 = 2.0 * np.pi * w
    s1 = (1.0 - u) ** 0.5
    s2 = u ** 0.5
    # |q|^2 = (1-u) + u = 1 by construction.
    return np.asarray([s1 * np.sin(theta1), s1 * np.cos(theta1), s2 * np.sin(theta2), s2 * np.cos(theta2)], np.float32)

# Get a random quaternion from the octahedral symmetric group S_4.
_r2 = 0.5**0.5
# The 24 rotations of the octahedral symmetry group, as unit quaternions.
_q_S4 = [[ 1.0, 0.0, 0.0, 0.0], [ 0.0, 1.0, 0.0, 0.0], [ 0.0, 0.0, 1.0, 0.0], [ 0.0, 0.0, 0.0, 1.0],
         [-0.5, 0.5, 0.5, 0.5], [-0.5,-0.5,-0.5, 0.5], [ 0.5,-0.5, 0.5, 0.5], [ 0.5, 0.5,-0.5, 0.5],
         [ 0.5, 0.5, 0.5, 0.5], [-0.5, 0.5,-0.5, 0.5], [ 0.5,-0.5,-0.5, 0.5], [-0.5,-0.5, 0.5, 0.5],
         [ _r2,-_r2, 0.0, 0.0], [ _r2, _r2, 0.0, 0.0], [ 0.0, 0.0, _r2, _r2], [ 0.0, 0.0,-_r2, _r2],
         [ 0.0, _r2, _r2, 0.0], [ _r2, 0.0, 0.0,-_r2], [ _r2, 0.0, 0.0, _r2], [ 0.0,-_r2, _r2, 0.0],
         [ _r2, 0.0, _r2, 0.0], [ 0.0, _r2, 0.0, _r2], [ _r2, 0.0,-_r2, 0.0], [ 0.0,-_r2, 0.0, _r2]]

def q_rnd_S4():
    """Pick one of the 24 octahedral symmetry rotations uniformly at random."""
    return np.asarray(_q_S4[np.random.randint(24)], np.float32)

def q_slerp(p, q, t):
    """Spherical linear interpolation from quaternion p to q at parameter t in [0, 1]."""
    cos_ang = np.dot(p, q)
    if cos_ang < 0.0:
        # Flip one endpoint so interpolation takes the shorter arc.
        q = -q
        cos_ang = -cos_ang
    if cos_ang > 0.999:
        # Nearly parallel: normalized linear interpolation avoids division by sin(~0).
        lerped = p + t * (q - p)
        return lerped / np.linalg.norm(lerped)
    ang = np.arccos(cos_ang)
    part_ang = ang * t
    w_q = np.sin(part_ang) / np.sin(ang)
    w_p = np.cos(part_ang) - cos_ang * w_q
    return w_p * p + w_q * q

def q_scale(q, scl):
    """Scale a rotation toward identity: slerp between the unit quaternion and q."""
    return q_slerp(q_unit(), q, scl)

def q_mul(p, q):
    """Hamilton product of two quaternions stored scalar-first."""
    w1, v1 = p[0], p[1:]
    w2, v2 = q[0], q[1:]
    w_out = w1 * w2 - np.dot(v1, v2)
    v_out = w1 * v2 + w2 * v1 + np.cross(v1, v2)
    return np.asarray([w_out, v_out[0], v_out[1], v_out[2]], np.float32)

def q_angle_deg(p, q):
    """Angular difference between two torch quaternion tensors, in degrees."""
    p = p.detach().cpu().numpy()
    q = q.detach().cpu().numpy()
    # |dot| folds q and -q (same rotation) together; clamp guards arccos domain.
    cos_half = min(np.abs(np.dot(p, q)), 1.0)
    return np.degrees(2.0 * np.arccos(cos_half))

def q_mul_torch(p, q):
    """Differentiable Hamilton product for torch quaternion tensors (scalar-first)."""
    w = p[0]*q[0] - p[1]*q[1] - p[2]*q[2] - p[3]*q[3]
    x = p[0]*q[1] + p[1]*q[0] + p[2]*q[3] - p[3]*q[2]
    y = p[0]*q[2] + p[2]*q[0] + p[3]*q[1] - p[1]*q[3]
    z = p[0]*q[3] + p[3]*q[0] + p[1]*q[2] - p[2]*q[1]
    return torch.stack([w, x, y, z])

# Convert quaternion to 4x4 rotation matrix.
def q_to_mtx(q):
    """Convert a quaternion tensor to a homogeneous 4x4 rotation matrix on CUDA.

    NOTE(review): the row formulas treat q[3] as the scalar part (x, y, z, w
    layout) and the 3x3 block is transposed afterwards, while q_mul/q_unit in
    this file use a scalar-first layout — confirm the conventions line up as
    intended before reusing these helpers elsewhere.
    """
    r0 = torch.stack([1.0-2.0*q[1]**2 - 2.0*q[2]**2, 2.0*q[0]*q[1] - 2.0*q[2]*q[3], 2.0*q[0]*q[2] + 2.0*q[1]*q[3]])
    r1 = torch.stack([2.0*q[0]*q[1] + 2.0*q[2]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[2]**2, 2.0*q[1]*q[2] - 2.0*q[0]*q[3]])
    r2 = torch.stack([2.0*q[0]*q[2] - 2.0*q[1]*q[3], 2.0*q[1]*q[2] + 2.0*q[0]*q[3], 1.0 - 2.0*q[0]**2 - 2.0*q[1]**2])
    rr = torch.transpose(torch.stack([r0, r1, r2]), 1, 0)
    rr = torch.cat([rr, torch.tensor([[0], [0], [0]], dtype=torch.float32).cuda()], dim=1) # Pad right column.
    rr = torch.cat([rr, torch.tensor([[0, 0, 0, 1]], dtype=torch.float32).cuda()], dim=0) # Pad bottom row.
    return rr

# Transform vertex positions to clip space
def transform_pos(mtx, pos):
    """Append w=1 to (N, 3) vertex positions and apply mtx; returns a (1, N, 4) batch."""
    # Accept either a NumPy matrix (moved to CUDA here) or an existing torch tensor.
    t_mtx = torch.from_numpy(mtx).cuda() if isinstance(mtx, np.ndarray) else mtx
    # (x,y,z) -> (x,y,z,1)
    posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).cuda()], axis=1)
    return torch.matmul(posw, t_mtx.t())[None, ...]

def render(glctx, mtx, pos, pos_idx, col, col_idx, resolution: int):
    """Rasterize the colored mesh under mtx; returns an antialiased (1, resolution, resolution, C) image."""
    # Setup TF graph for reference.
    pos_clip = transform_pos(mtx, pos)
    rast_out, _ = dr.rasterize(glctx, pos_clip, pos_idx, resolution=[resolution, resolution])
    color , _ = dr.interpolate(col[None, ...], rast_out, col_idx)
    color = dr.antialias(color, rast_out, pos_clip, pos_idx)
    return color

#----------------------------------------------------------------------------
# Cube pose fitter.
#----------------------------------------------------------------------------

def fit_pose(max_iter          = 10000,
             repeats           = 1,
             log_interval      = 10,
             display_interval  = None,
             display_res       = 512,
             lr_base           = 0.01,
             lr_falloff        = 1.0,
             nr_base           = 1.0,
             nr_falloff        = 1e-4,
             grad_phase_start  = 0.5,
             resolution        = 256,
             out_dir           = None,
             log_fn            = None,
             mp4save_interval  = None,
             mp4save_fn        = None,
             use_opengl        = False):
    """Recover a cube's pose (unit quaternion) from its rendered image.

    Two-phase optimization per repeat: a random-search phase that perturbs the
    best pose found so far with scaled random rotations, then — once the
    iteration fraction reaches grad_phase_start — a gradient phase that runs
    Adam on an image-space loss between target and candidate renderings.

    Args:
        max_iter: Iterations per repeat (loop runs max_iter + 1 times).
        repeats: Number of independent pose-fitting runs.
        log_interval: Iterations between log lines; falsy disables.
        display_interval: Iterations between interactive previews; falsy disables.
        display_res: Preview window size.
        lr_base, lr_falloff: Learning rate schedule lr_base * lr_falloff**itf.
        nr_base, nr_falloff: Noise magnitude schedule for the random-search phase.
        grad_phase_start: Fraction of the run after which gradient descent takes over.
        resolution: Rendering resolution (square).
        out_dir, log_fn, mp4save_interval, mp4save_fn: Output options; out_dir=None disables.
        use_opengl: Currently ignored; rasterization is forced to the CUDA backend below.

    NOTE(review): requires a CUDA device.
    """

    # Open the log file / video writer only when an output directory was given.
    log_file = None
    writer = None
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
        if log_fn:
            log_file = open(out_dir + '/' + log_fn, 'wt')
        if mp4save_interval != 0:
            writer = imageio.get_writer(f'{out_dir}/{mp4save_fn}', mode='I', fps=30, codec='libx264', bitrate='16M')
    else:
        mp4save_interval = None

    datadir = f'{pathlib.Path(__file__).absolute().parents[1]}/data'
    with np.load(f'{datadir}/cube_p.npz') as f:
        pos_idx, pos, col_idx, col = f.values()
    print("Mesh has %d triangles and %d vertices." % (pos_idx.shape[0], pos.shape[0]))

    # Some input geometry contains vertex positions in (N, 4) (with v[:,3]==1). Drop
    # the last column in that case.
    if pos.shape[1] == 4: pos = pos[:, 0:3]

    # Create position/triangle index tensors
    pos_idx = torch.from_numpy(pos_idx.astype(np.int32)).cuda()
    vtx_pos = torch.from_numpy(pos.astype(np.float32)).cuda()
    col_idx = torch.from_numpy(col_idx.astype(np.int32)).cuda()
    vtx_col = torch.from_numpy(col.astype(np.float32)).cuda()

    # glctx = dr.RasterizeGLContext() if use_opengl else dr.RasterizeCudaContext()
    # NOTE(review): CUDA rasterization is forced here, ignoring use_opengl (local WSL-compatibility change).
    glctx = dr.RasterizeCudaContext()

    for rep in range(repeats):
        # Random target pose and random (normalized) initial guess.
        pose_target = torch.tensor(q_rnd(), device='cuda')
        pose_init = q_rnd()
        pose_opt = torch.tensor(pose_init / np.sum(pose_init**2)**0.5, dtype=torch.float32, device='cuda', requires_grad=True)

        loss_best = np.inf
        pose_best = pose_opt.detach().clone()

        # Modelview + projection matrix.
        mvp = torch.tensor(np.matmul(util.projection(x=0.4), util.translate(0, 0, -3.5)).astype(np.float32), device='cuda')

        # Adam optimizer for texture with a learning rate ramp.
        optimizer = torch.optim.Adam([pose_opt], betas=(0.9, 0.999), lr=lr_base)
        # Render.
        for it in range(max_iter + 1):
            # Set learning rate.
            itf = 1.0 * it / max_iter
            nr = nr_base * nr_falloff**itf
            lr = lr_base * lr_falloff**itf
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # Noise input: no perturbation during the gradient phase.
            if itf >= grad_phase_start:
                noise = q_unit()
            else:
                noise = q_scale(q_rnd(), nr)
                noise = q_mul(noise, q_rnd_S4()) # Orientation noise.

            # Render.
            color = render(glctx, torch.matmul(mvp, q_to_mtx(pose_target)), vtx_pos, pos_idx, vtx_col, col_idx, resolution)
            pose_total_opt = q_mul_torch(pose_opt, noise)
            mtx_total_opt = torch.matmul(mvp, q_to_mtx(pose_total_opt))
            color_opt = render(glctx, mtx_total_opt, vtx_pos, pos_idx, vtx_col, col_idx, resolution)

            # Image-space loss.
            diff = (color_opt - color)**2 # L2 norm.
            diff = torch.tanh(5.0 * torch.max(diff, dim=-1)[0])
            loss = torch.mean(diff)

            # Measure image-space loss and update best found pose.
            loss_val = float(loss)
            if (loss_val < loss_best) and (loss_val > 0.0):
                pose_best = pose_total_opt.detach().clone()
                loss_best = loss_val
                if itf < grad_phase_start:
                    # During random search, jump the optimized pose to the new best.
                    with torch.no_grad(): pose_opt[:] = pose_best

            # Print/save log.
            if log_interval and (it % log_interval == 0):
                err = q_angle_deg(pose_opt, pose_target)
                ebest = q_angle_deg(pose_best, pose_target)
                s = "rep=%d,iter=%d,err=%f,err_best=%f,loss=%f,loss_best=%f,lr=%f,nr=%f" % (rep, it, err, ebest, loss_val, loss_best, lr, nr)
                print(s)
                if log_file:
                    log_file.write(s + "\n")

            # Run gradient training step.
            if itf >= grad_phase_start:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Re-normalize to keep pose_opt a unit quaternion.
            with torch.no_grad():
                pose_opt /= torch.sum(pose_opt**2)**0.5

            # Show/save image.
            display_image = display_interval and (it % display_interval == 0)
            save_mp4 = mp4save_interval and (it % mp4save_interval == 0)

            if display_image or save_mp4:
                img_ref = color[0].detach().cpu().numpy()
                img_opt = color_opt[0].detach().cpu().numpy()
                img_best = render(glctx, torch.matmul(mvp, q_to_mtx(pose_best)), vtx_pos, pos_idx, vtx_col, col_idx, resolution)[0].detach().cpu().numpy()
                result_image = np.concatenate([img_ref, img_best, img_opt], axis=1)[::-1]

                if display_image:
                    util.display_image(result_image, size=display_res, title='(%d) %d / %d' % (rep, it, max_iter))
                if save_mp4:
                    writer.append_data(np.clip(np.rint(result_image*255.0), 0, 255).astype(np.uint8))

    # Done.
    if writer is not None:
        writer.close()
    if log_file:
        log_file.close()

#----------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(description='Cube pose fitting example')
    parser.add_argument('--opengl', help='enable OpenGL rendering', action='store_true', default=False)
    parser.add_argument('--outdir', help='specify output directory', default='')
    parser.add_argument('--display-interval', type=int, default=0)
    parser.add_argument('--mp4save-interval', type=int, default=10)
    parser.add_argument('--max-iter', type=int, default=1000)
    parser.add_argument('--repeats', type=int, default=1)
    args = parser.parse_args()

    # Set up logging.
    if args.outdir:
        out_dir = f'{args.outdir}/pose'
        print (f'Saving results under {out_dir}')
    else:
        out_dir = None
        print ('No output directory specified, not saving log or images')

    # Run.
    fit_pose(
        max_iter=args.max_iter,
        repeats=args.repeats,
        log_interval=100,
        display_interval=args.display_interval,
        out_dir=out_dir,
        log_fn='log.txt',
        mp4save_interval=args.mp4save_interval,
        mp4save_fn='progress.mp4',
        use_opengl=args.opengl
    )

    # Done.
+ print("Done.") + +#---------------------------------------------------------------------------- + +if __name__ == "__main__": + main() + +#---------------------------------------------------------------------------- diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/triangle.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/triangle.py new file mode 100644 index 00000000..58ecc95a --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/triangle.py @@ -0,0 +1,39 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import imageio +import numpy as np +import torch +import nvdiffrast.torch as dr +import sys + +def tensor(*args, **kwargs): + return torch.tensor(*args, device='cuda', **kwargs) + +glctx = dr.RasterizeCudaContext() + +# if sys.argv[1:] == ['--cuda']: +# glctx = dr.RasterizeCudaContext() +# elif sys.argv[1:] == ['--opengl']: +# glctx = dr.RasterizeGLContext() +# else: +# print("Specify either --cuda or --opengl") +# exit(1) + +pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32) +col = tensor([[[1, 0, 0], [0, 1, 0], [0, 0, 1]]], dtype=torch.float32) +tri = tensor([[0, 1, 2]], dtype=torch.int32) + +rast, _ = dr.rasterize(glctx, pos, tri, resolution=[256, 256]) +out, _ = dr.interpolate(col, rast, tri) + +img = out.cpu().numpy()[0, ::-1, :, :] # Flip vertically. 
+img = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8) # Quantize to np.uint8 + +print("Saving to 'tri.png'.") +imageio.imsave('tri.png', img) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/util.py b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/util.py new file mode 100644 index 00000000..8c53bad8 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/nvdiffrast/samples/torch/util.py @@ -0,0 +1,120 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import numpy as np +import torch + +#---------------------------------------------------------------------------- +# Projection and transformation matrix helpers. 
#----------------------------------------------------------------------------

def projection(x=0.1, n=1.0, f=50.0):
    """Symmetric perspective projection matrix (OpenGL convention), float32 4x4."""
    za = -(f + n) / (f - n)      # depth scale
    zb = -(2 * f * n) / (f - n)  # depth offset
    return np.array([[n/x,   0,   0,   0],
                     [  0, n/x,   0,   0],
                     [  0,   0,  za,  zb],
                     [  0,   0,  -1,   0]], dtype=np.float32)

def translate(x, y, z):
    """Homogeneous 4x4 translation matrix."""
    m = np.eye(4, dtype=np.float32)
    m[0, 3], m[1, 3], m[2, 3] = x, y, z
    return m

def rotate_x(a):
    """Homogeneous 4x4 rotation about the x axis by angle a (radians)."""
    s, c = np.sin(a), np.cos(a)
    m = np.eye(4, dtype=np.float32)
    m[1, 1], m[1, 2] = c, s
    m[2, 1], m[2, 2] = -s, c
    return m

def rotate_y(a):
    """Homogeneous 4x4 rotation about the y axis by angle a (radians)."""
    s, c = np.sin(a), np.cos(a)
    m = np.eye(4, dtype=np.float32)
    m[0, 0], m[0, 2] = c, s
    m[2, 0], m[2, 2] = -s, c
    return m

def random_rotation_translation(t):
    """Random rigid transform: orthonormal rotation + uniform translation in [-t, t]^3."""
    basis = np.random.normal(size=[3, 3])
    # Two cross products make the rows mutually orthogonal; then normalize them.
    basis[1] = np.cross(basis[0], basis[2])
    basis[2] = np.cross(basis[0], basis[1])
    basis = basis / np.linalg.norm(basis, axis=1, keepdims=True)
    m = np.pad(basis, [[0, 1], [0, 1]], mode='constant')
    m[3, 3] = 1.0
    m[:3, 3] = np.random.uniform(-t, t, size=[3])
    return m

#----------------------------------------------------------------------------
# Bilinear downsample by 2x.
#----------------------------------------------------------------------------

def bilinear_downsample(x):
    """Downsample an NHWC float tensor by 2x with a 4x4 separable bilinear kernel."""
    k1 = torch.tensor([1.0, 3.0, 3.0, 1.0], dtype=torch.float32, device=x.device) / 8.0
    kern = k1[:, None] * k1[None, :]  # separable 4x4 kernel, sums to 1
    chans = x.shape[-1]
    w = kern.expand(chans, 1, 4, 4)
    # Depthwise convolution in NCHW, then back to NHWC.
    y = torch.nn.functional.conv2d(x.permute(0, 3, 1, 2), w, padding=1, stride=2, groups=chans)
    return y.permute(0, 2, 3, 1)

#----------------------------------------------------------------------------
# Image display function using OpenGL.
#----------------------------------------------------------------------------

_glfw_window = None
def display_image(image, zoom=None, size=None, title=None): # HWC
    # Import OpenGL and glfw.
    import OpenGL.GL as gl
    import glfw

    # Zoom image if requested.
#----------------------------------------------------------------------------
# Image save helper.
#----------------------------------------------------------------------------

def save_image(fn, x):
    """Quantize a float image in [0, 1] to uint8 and write it to file fn."""
    import imageio
    quantized = np.clip(np.rint(x * 255.0), 0, 255).astype(np.uint8)
    imageio.imsave(fn, quantized)

#----------------------------------------------------------------------------
+# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +import nvdiffrast +import setuptools +import os + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="nvdiffrast", + version=nvdiffrast.__version__, + author="Samuli Laine", + author_email="slaine@nvidia.com", + description="nvdiffrast - modular primitives for high-performance differentiable rendering", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/NVlabs/nvdiffrast", + packages=setuptools.find_packages(), + package_data={ + 'nvdiffrast': [ + 'common/*.h', + 'common/*.inl', + 'common/*.cu', + 'common/*.cpp', + 'common/cudaraster/*.hpp', + 'common/cudaraster/impl/*.cpp', + 'common/cudaraster/impl/*.hpp', + 'common/cudaraster/impl/*.inl', + 'common/cudaraster/impl/*.cu', + 'lib/*.h', + 'torch/*.h', + 'torch/*.inl', + 'torch/*.cpp', + 'tensorflow/*.cu', + ] + (['lib/*.lib'] if os.name == 'nt' else []) + }, + include_package_data=True, + install_requires=['numpy'], # note: can't require torch here as it will install torch even for a TensorFlow container + classifiers=[ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/options/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/options/__init__.py new file mode 100644 index 00000000..e7eedebe --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/options/__init__.py @@ -0,0 +1 @@ +"""This package options includes option modules: training options, test options, and basic options (used in both training and test).""" diff --git 
a/dreamtalk/Deep3DFaceRecon_pytorch/options/base_options.py b/dreamtalk/Deep3DFaceRecon_pytorch/options/base_options.py new file mode 100644 index 00000000..67375d08 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/options/base_options.py @@ -0,0 +1,169 @@ +"""This script contains base options for Deep3DFaceRecon_pytorch +""" + +import argparse +import os +from util import util +import numpy as np +import torch +import models +import data + + +class BaseOptions(): + """This class defines options used during both training and test time. + + It also implements several helper functions such as parsing, printing, and saving the options. + It also gathers additional options defined in functions in both dataset class and model class. + """ + + def __init__(self, cmd_line=None): + """Reset the class; indicates the class hasn't been initailized""" + self.initialized = False + self.cmd_line = None + if cmd_line is not None: + self.cmd_line = cmd_line.split() + + def initialize(self, parser): + """Define the common options that are used in both training and test.""" + # basic parameters + parser.add_argument('--name', type=str, default='face_recon', help='name of the experiment. It decides where to store samples and models') + parser.add_argument('--gpu_ids', type=str, default='0', help='gpu ids: e.g. 0 0,1,2, 0,2. 
use -1 for CPU') + parser.add_argument('--checkpoints_dir', type=str, default='./checkpoints', help='models are saved here') + parser.add_argument('--vis_batch_nums', type=float, default=1, help='batch nums of images for visulization') + parser.add_argument('--eval_batch_nums', type=float, default=float('inf'), help='batch nums of images for evaluation') + parser.add_argument('--use_ddp', type=util.str2bool, nargs='?', const=True, default=True, help='whether use distributed data parallel') + parser.add_argument('--ddp_port', type=str, default='12355', help='ddp port') + parser.add_argument('--display_per_batch', type=util.str2bool, nargs='?', const=True, default=True, help='whether use batch to show losses') + parser.add_argument('--add_image', type=util.str2bool, nargs='?', const=True, default=True, help='whether add image to tensorboard') + parser.add_argument('--world_size', type=int, default=1, help='batch nums of images for evaluation') + + # model parameters + parser.add_argument('--model', type=str, default='facerecon', help='chooses which model to use.') + + # additional parameters + parser.add_argument('--epoch', type=str, default='latest', help='which epoch to load? set to latest to use latest cached model') + parser.add_argument('--verbose', action='store_true', help='if specified, print more debugging information') + parser.add_argument('--suffix', default='', type=str, help='customized suffix: opt.name = opt.name + suffix: e.g., {model}_{netG}_size{load_size}') + + self.initialized = True + return parser + + def gather_options(self): + """Initialize our parser with basic options(only once). + Add additional model-specific and dataset-specific options. + These options are defined in the function + in model and dataset classes. 
+ """ + if not self.initialized: # check if it has been initialized + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = self.initialize(parser) + + # get the basic options + if self.cmd_line is None: + opt, _ = parser.parse_known_args() + else: + opt, _ = parser.parse_known_args(self.cmd_line) + + # set cuda visible devices + os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_ids + + # modify model-related parser options + model_name = opt.model + model_option_setter = models.get_option_setter(model_name) + parser = model_option_setter(parser, self.isTrain) + if self.cmd_line is None: + opt, _ = parser.parse_known_args() # parse again with new defaults + else: + opt, _ = parser.parse_known_args(self.cmd_line) # parse again with new defaults + + # modify dataset-related parser options + if opt.dataset_mode: + dataset_name = opt.dataset_mode + dataset_option_setter = data.get_option_setter(dataset_name) + parser = dataset_option_setter(parser, self.isTrain) + + # save and return the parser + self.parser = parser + if self.cmd_line is None: + return parser.parse_args() + else: + return parser.parse_args(self.cmd_line) + + def print_options(self, opt): + """Print and save options + + It will print both current options and default values(if different). 
+ It will save options into a text file / [checkpoints_dir] / opt.txt + """ + message = '' + message += '----------------- Options ---------------\n' + for k, v in sorted(vars(opt).items()): + comment = '' + default = self.parser.get_default(k) + if v != default: + comment = '\t[default: %s]' % str(default) + message += '{:>25}: {:<30}{}\n'.format(str(k), str(v), comment) + message += '----------------- End -------------------' + print(message) + + # save to the disk + expr_dir = os.path.join(opt.checkpoints_dir, opt.name) + util.mkdirs(expr_dir) + file_name = os.path.join(expr_dir, '{}_opt.txt'.format(opt.phase)) + try: + with open(file_name, 'wt') as opt_file: + opt_file.write(message) + opt_file.write('\n') + except PermissionError as error: + print("permission error {}".format(error)) + pass + + def parse(self): + """Parse our options, create checkpoints directory suffix, and set up gpu device.""" + opt = self.gather_options() + opt.isTrain = self.isTrain # train or test + + # process opt.suffix + if opt.suffix: + suffix = ('_' + opt.suffix.format(**vars(opt))) if opt.suffix != '' else '' + opt.name = opt.name + suffix + + + # set gpu ids + str_ids = opt.gpu_ids.split(',') + gpu_ids = [] + for str_id in str_ids: + id = int(str_id) + if id >= 0: + gpu_ids.append(id) + opt.world_size = len(gpu_ids) + # if len(opt.gpu_ids) > 0: + # torch.cuda.set_device(gpu_ids[0]) + if opt.world_size == 1: + opt.use_ddp = False + + if opt.phase != 'test': + # set continue_train automatically + if opt.pretrained_name is None: + model_dir = os.path.join(opt.checkpoints_dir, opt.name) + else: + model_dir = os.path.join(opt.checkpoints_dir, opt.pretrained_name) + if os.path.isdir(model_dir): + model_pths = [i for i in os.listdir(model_dir) if i.endswith('pth')] + if os.path.isdir(model_dir) and len(model_pths) != 0: + opt.continue_train= True + + # update the latest epoch count + if opt.continue_train: + if opt.epoch == 'latest': + epoch_counts = [int(i.split('.')[0].split('_')[-1]) 
for i in model_pths if 'latest' not in i] + if len(epoch_counts) != 0: + opt.epoch_count = max(epoch_counts) + 1 + else: + opt.epoch_count = int(opt.epoch) + 1 + + + self.print_options(opt) + self.opt = opt + return self.opt diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/options/inference_options.py b/dreamtalk/Deep3DFaceRecon_pytorch/options/inference_options.py new file mode 100644 index 00000000..cdc4b8ef --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/options/inference_options.py @@ -0,0 +1,24 @@ +from .base_options import BaseOptions + + +class InferenceOptions(BaseOptions): + """This class includes test options. + + It also includes shared options defined in BaseOptions. + """ + + def initialize(self, parser): + parser = BaseOptions.initialize(self, parser) # define shared options + parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') + parser.add_argument('--dataset_mode', type=str, default=None, help='chooses how datasets are loaded. [None | flist]') + + parser.add_argument('--input_dir', type=str, help='the folder of the input files') + parser.add_argument('--keypoint_dir', type=str, help='the folder of the keypoint files') + parser.add_argument('--output_dir', type=str, default='mp4', help='the output dir to save the extracted coefficients') + parser.add_argument('--preprocessed_data_dir', type=str, default='data/temp', help='the temp dir to save the extracted coefficients') + parser.add_argument('--save_split_files', action='store_true', help='save split files or not') + parser.add_argument('--inference_batch_size', type=int, default=8) + + # Dropout and Batchnorm has different behavior during training and test. 
+ self.isTrain = False + return parser diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/options/test_options.py b/dreamtalk/Deep3DFaceRecon_pytorch/options/test_options.py new file mode 100644 index 00000000..4ff3ad14 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/options/test_options.py @@ -0,0 +1,21 @@ +"""This script contains the test options for Deep3DFaceRecon_pytorch +""" + +from .base_options import BaseOptions + + +class TestOptions(BaseOptions): + """This class includes test options. + + It also includes shared options defined in BaseOptions. + """ + + def initialize(self, parser): + parser = BaseOptions.initialize(self, parser) # define shared options + parser.add_argument('--phase', type=str, default='test', help='train, val, test, etc') + parser.add_argument('--dataset_mode', type=str, default=None, help='chooses how datasets are loaded. [None | flist]') + parser.add_argument('--img_folder', type=str, default='examples', help='folder for test images.') + + # Dropout and Batchnorm has different behavior during training and test. + self.isTrain = False + return parser diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/options/train_options.py b/dreamtalk/Deep3DFaceRecon_pytorch/options/train_options.py new file mode 100644 index 00000000..1337bfdd --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/options/train_options.py @@ -0,0 +1,53 @@ +"""This script contains the training options for Deep3DFaceRecon_pytorch +""" + +from .base_options import BaseOptions +from util import util + +class TrainOptions(BaseOptions): + """This class includes training options. + + It also includes shared options defined in BaseOptions. 
+ """ + + def initialize(self, parser): + parser = BaseOptions.initialize(self, parser) + # dataset parameters + # for train + parser.add_argument('--data_root', type=str, default='./', help='dataset root') + parser.add_argument('--flist', type=str, default='datalist/train/masks.txt', help='list of mask names of training set') + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--dataset_mode', type=str, default='flist', help='chooses how datasets are loaded. [None | flist]') + parser.add_argument('--serial_batches', action='store_true', help='if true, takes images in order to make batches, otherwise takes them randomly') + parser.add_argument('--num_threads', default=4, type=int, help='# threads for loading data') + parser.add_argument('--max_dataset_size', type=int, default=float("inf"), help='Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded.') + parser.add_argument('--preprocess', type=str, default='shift_scale_rot_flip', help='scaling and cropping of images at load time [shift_scale_rot_flip | shift_scale | shift | shift_rot_flip ]') + parser.add_argument('--use_aug', type=util.str2bool, nargs='?', const=True, default=True, help='whether use data augmentation') + + # for val + parser.add_argument('--flist_val', type=str, default='datalist/val/masks.txt', help='list of mask names of val set') + parser.add_argument('--batch_size_val', type=int, default=32) + + + # visualization parameters + parser.add_argument('--display_freq', type=int, default=1000, help='frequency of showing training results on screen') + parser.add_argument('--print_freq', type=int, default=100, help='frequency of showing training results on console') + + # network saving and loading parameters + parser.add_argument('--save_latest_freq', type=int, default=5000, help='frequency of saving the latest results') + parser.add_argument('--save_epoch_freq', type=int, default=1, 
help='frequency of saving checkpoints at the end of epochs') + parser.add_argument('--evaluation_freq', type=int, default=5000, help='evaluation freq') + parser.add_argument('--save_by_iter', action='store_true', help='whether saves model by iteration') + parser.add_argument('--continue_train', action='store_true', help='continue training: load the latest model') + parser.add_argument('--epoch_count', type=int, default=1, help='the starting epoch count, we save the model by , +, ...') + parser.add_argument('--phase', type=str, default='train', help='train, val, test, etc') + parser.add_argument('--pretrained_name', type=str, default=None, help='resume training from another checkpoint') + + # training parameters + parser.add_argument('--n_epochs', type=int, default=20, help='number of epochs with the initial learning rate') + parser.add_argument('--lr', type=float, default=0.0001, help='initial learning rate for adam') + parser.add_argument('--lr_policy', type=str, default='step', help='learning rate policy. 
[linear | step | plateau | cosine]') + parser.add_argument('--lr_decay_epochs', type=int, default=10, help='multiply by a gamma every lr_decay_epochs epoches') + + self.isTrain = True + return parser diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/test.py b/dreamtalk/Deep3DFaceRecon_pytorch/test.py new file mode 100644 index 00000000..13e1a7d5 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/test.py @@ -0,0 +1,74 @@ +"""This script is the test script for Deep3DFaceRecon_pytorch +""" + +import os +from options.test_options import TestOptions +from data import create_dataset +from models import create_model +from util.visualizer import MyVisualizer +from util.preprocess import align_img +from PIL import Image +import numpy as np +from util.load_mats import load_lm3d +import torch +from data.flist_dataset import default_flist_reader +from scipy.io import loadmat, savemat + +def get_data_path(root='examples'): + + im_path = [os.path.join(root, i) for i in sorted(os.listdir(root)) if i.endswith('png') or i.endswith('jpg')] + lm_path = [i.replace('png', 'txt').replace('jpg', 'txt') for i in im_path] + lm_path = [os.path.join(i.replace(i.split(os.path.sep)[-1],''),'detections',i.split(os.path.sep)[-1]) for i in lm_path] + + return im_path, lm_path + +def read_data(im_path, lm_path, lm3d_std, to_tensor=True): + # to RGB + im = Image.open(im_path).convert('RGB') + W,H = im.size + lm = np.loadtxt(lm_path).astype(np.float32) + lm = lm.reshape([-1, 2]) + lm[:, -1] = H - 1 - lm[:, -1] + _, im, lm, _ = align_img(im, lm, lm3d_std) + if to_tensor: + im = torch.tensor(np.array(im)/255., dtype=torch.float32).permute(2, 0, 1).unsqueeze(0) + lm = torch.tensor(lm).unsqueeze(0) + return im, lm + +def main(rank, opt, name='examples'): + device = torch.device(rank) + torch.cuda.set_device(device) + model = create_model(opt) + model.setup(opt) + model.device = device + model.parallelize() + model.eval() + visualizer = MyVisualizer(opt) + + im_path, lm_path = get_data_path(name) + 
lm3d_std = load_lm3d(opt.bfm_folder) + + for i in range(len(im_path)): + print(i, im_path[i]) + img_name = im_path[i].split(os.path.sep)[-1].replace('.png','').replace('.jpg','') + if not os.path.isfile(lm_path[i]): + print("%s is not found !!!"%lm_path[i]) + continue + im_tensor, lm_tensor = read_data(im_path[i], lm_path[i], lm3d_std) + data = { + 'imgs': im_tensor, + 'lms': lm_tensor + } + model.set_input(data) # unpack data from data loader + model.test() # run inference + visuals = model.get_current_visuals() # get image results + visualizer.display_current_results(visuals, 0, opt.epoch, dataset=name.split(os.path.sep)[-1], + save_results=True, count=i, name=img_name, add_image=False) + + model.save_mesh(os.path.join(visualizer.img_dir, name.split(os.path.sep)[-1], 'epoch_%s_%06d'%(opt.epoch, 0),img_name+'.obj')) # save reconstruction meshes + model.save_coeff(os.path.join(visualizer.img_dir, name.split(os.path.sep)[-1], 'epoch_%s_%06d'%(opt.epoch, 0),img_name+'.mat')) # save predicted coefficients + +if __name__ == '__main__': + opt = TestOptions().parse() # get test options + main(0, opt,opt.img_folder) + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/train.py b/dreamtalk/Deep3DFaceRecon_pytorch/train.py new file mode 100644 index 00000000..26e856f4 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/train.py @@ -0,0 +1,166 @@ +"""This script is the training script for Deep3DFaceRecon_pytorch +""" + +import os +import time +import numpy as np +import torch +from options.train_options import TrainOptions +from data import create_dataset +from models import create_model +from util.visualizer import MyVisualizer +from util.util import genvalconf +import torch.multiprocessing as mp +import torch.distributed as dist + + +def setup(rank, world_size, port): + os.environ['MASTER_ADDR'] = 'localhost' + os.environ['MASTER_PORT'] = port + + # initialize the process group + dist.init_process_group("gloo", rank=rank, world_size=world_size) + +def cleanup(): + 
dist.destroy_process_group() + +def main(rank, world_size, train_opt): + val_opt = genvalconf(train_opt, isTrain=False) + + device = torch.device(rank) + torch.cuda.set_device(device) + use_ddp = train_opt.use_ddp + + if use_ddp: + setup(rank, world_size, train_opt.ddp_port) + + train_dataset, val_dataset = create_dataset(train_opt, rank=rank), create_dataset(val_opt, rank=rank) + train_dataset_batches, val_dataset_batches = \ + len(train_dataset) // train_opt.batch_size, len(val_dataset) // val_opt.batch_size + + model = create_model(train_opt) # create a model given train_opt.model and other options + model.setup(train_opt) + model.device = device + model.parallelize() + + if rank == 0: + print('The batch number of training images = %d\n, \ + the batch number of validation images = %d'% (train_dataset_batches, val_dataset_batches)) + model.print_networks(train_opt.verbose) + visualizer = MyVisualizer(train_opt) # create a visualizer that display/save images and plots + + total_iters = train_dataset_batches * (train_opt.epoch_count - 1) # the total number of training iterations + t_data = 0 + t_val = 0 + optimize_time = 0.1 + batch_size = 1 if train_opt.display_per_batch else train_opt.batch_size + + if use_ddp: + dist.barrier() + + times = [] + for epoch in range(train_opt.epoch_count, train_opt.n_epochs + 1): # outer loop for different epochs; we save the model by , + + epoch_start_time = time.time() # timer for entire epoch + iter_data_time = time.time() # timer for train_data loading per iteration + epoch_iter = 0 # the number of training iterations in current epoch, reset to 0 every epoch + + train_dataset.set_epoch(epoch) + for i, train_data in enumerate(train_dataset): # inner loop within one epoch + iter_start_time = time.time() # timer for computation per iteration + if total_iters % train_opt.print_freq == 0: + t_data = iter_start_time - iter_data_time + total_iters += batch_size + epoch_iter += batch_size + + torch.cuda.synchronize() + 
optimize_start_time = time.time() + + model.set_input(train_data) # unpack train_data from dataset and apply preprocessing + model.optimize_parameters() # calculate loss functions, get gradients, update network weights + + torch.cuda.synchronize() + optimize_time = (time.time() - optimize_start_time) / batch_size * 0.005 + 0.995 * optimize_time + + if use_ddp: + dist.barrier() + + if rank == 0 and (total_iters == batch_size or total_iters % train_opt.display_freq == 0): # display images on visdom and save images to a HTML file + model.compute_visuals() + visualizer.display_current_results(model.get_current_visuals(), total_iters, epoch, + save_results=True, + add_image=train_opt.add_image) + # (total_iters == batch_size or total_iters % train_opt.evaluation_freq == 0) + + if rank == 0 and (total_iters == batch_size or total_iters % train_opt.print_freq == 0): # print training losses and save logging information to the disk + losses = model.get_current_losses() + visualizer.print_current_losses(epoch, epoch_iter, losses, optimize_time, t_data) + visualizer.plot_current_losses(total_iters, losses) + + if total_iters == batch_size or total_iters % train_opt.evaluation_freq == 0: + with torch.no_grad(): + torch.cuda.synchronize() + val_start_time = time.time() + losses_avg = {} + model.eval() + for j, val_data in enumerate(val_dataset): + model.set_input(val_data) + model.optimize_parameters(isTrain=False) + if rank == 0 and j < train_opt.vis_batch_nums: + model.compute_visuals() + visualizer.display_current_results(model.get_current_visuals(), total_iters, epoch, + dataset='val', save_results=True, count=j * val_opt.batch_size, + add_image=train_opt.add_image) + + if j < train_opt.eval_batch_nums: + losses = model.get_current_losses() + for key, value in losses.items(): + losses_avg[key] = losses_avg.get(key, 0) + value + + for key, value in losses_avg.items(): + losses_avg[key] = value / min(train_opt.eval_batch_nums, val_dataset_batches) + + torch.cuda.synchronize() 
+ eval_time = time.time() - val_start_time + + if rank == 0: + visualizer.print_current_losses(epoch, epoch_iter, losses_avg, eval_time, t_data, dataset='val') # visualize training results + visualizer.plot_current_losses(total_iters, losses_avg, dataset='val') + model.train() + + if use_ddp: + dist.barrier() + + if rank == 0 and (total_iters == batch_size or total_iters % train_opt.save_latest_freq == 0): # cache our latest model every iterations + print('saving the latest model (epoch %d, total_iters %d)' % (epoch, total_iters)) + print(train_opt.name) # it's useful to occasionally show the experiment name on console + save_suffix = 'iter_%d' % total_iters if train_opt.save_by_iter else 'latest' + model.save_networks(save_suffix) + + if use_ddp: + dist.barrier() + + iter_data_time = time.time() + + print('End of epoch %d / %d \t Time Taken: %d sec' % (epoch, train_opt.n_epochs, time.time() - epoch_start_time)) + model.update_learning_rate() # update learning rates at the end of every epoch. 
+ + if rank == 0 and epoch % train_opt.save_epoch_freq == 0: # cache our model every epochs + print('saving the model at the end of epoch %d, iters %d' % (epoch, total_iters)) + model.save_networks('latest') + model.save_networks(epoch) + + if use_ddp: + dist.barrier() + +if __name__ == '__main__': + + import warnings + warnings.filterwarnings("ignore") + + train_opt = TrainOptions().parse() # get training options + world_size = train_opt.world_size + + if train_opt.use_ddp: + mp.spawn(main, args=(world_size, train_opt), nprocs=world_size, join=True) + else: + main(0, world_size, train_opt) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/util/BBRegressorParam_r.mat b/dreamtalk/Deep3DFaceRecon_pytorch/util/BBRegressorParam_r.mat new file mode 100644 index 00000000..1430a94e Binary files /dev/null and b/dreamtalk/Deep3DFaceRecon_pytorch/util/BBRegressorParam_r.mat differ diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/util/__init__.py b/dreamtalk/Deep3DFaceRecon_pytorch/util/__init__.py new file mode 100644 index 00000000..718f8f67 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/util/__init__.py @@ -0,0 +1,2 @@ +"""This package includes a miscellaneous collection of useful helper functions.""" +from util import * diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/util/detect_lm68.py b/dreamtalk/Deep3DFaceRecon_pytorch/util/detect_lm68.py new file mode 100644 index 00000000..b7e40997 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/util/detect_lm68.py @@ -0,0 +1,106 @@ +import os +import cv2 +import numpy as np +from scipy.io import loadmat +import tensorflow as tf +from util.preprocess import align_for_lm +from shutil import move + +mean_face = np.loadtxt('util/test_mean_face.txt') +mean_face = mean_face.reshape([68, 2]) + +def save_label(labels, save_path): + np.savetxt(save_path, labels) + +def draw_landmarks(img, landmark, save_name): + landmark = landmark + lm_img = np.zeros([img.shape[0], img.shape[1], 3]) + lm_img[:] = img.astype(np.float32) + landmark = 
np.round(landmark).astype(np.int32) + + for i in range(len(landmark)): + for j in range(-1, 1): + for k in range(-1, 1): + if img.shape[0] - 1 - landmark[i, 1]+j > 0 and \ + img.shape[0] - 1 - landmark[i, 1]+j < img.shape[0] and \ + landmark[i, 0]+k > 0 and \ + landmark[i, 0]+k < img.shape[1]: + lm_img[img.shape[0] - 1 - landmark[i, 1]+j, landmark[i, 0]+k, + :] = np.array([0, 0, 255]) + lm_img = lm_img.astype(np.uint8) + + cv2.imwrite(save_name, lm_img) + + +def load_data(img_name, txt_name): + return cv2.imread(img_name), np.loadtxt(txt_name) + +# create tensorflow graph for landmark detector +def load_lm_graph(graph_filename): + with tf.gfile.GFile(graph_filename, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name='net') + img_224 = graph.get_tensor_by_name('net/input_imgs:0') + output_lm = graph.get_tensor_by_name('net/lm:0') + lm_sess = tf.Session(graph=graph) + + return lm_sess,img_224,output_lm + +# landmark detection +def detect_68p(img_path,sess,input_op,output_op): + print('detecting landmarks......') + names = [i for i in sorted(os.listdir( + img_path)) if 'jpg' in i or 'png' in i or 'jpeg' in i or 'PNG' in i] + vis_path = os.path.join(img_path, 'vis') + remove_path = os.path.join(img_path, 'remove') + save_path = os.path.join(img_path, 'landmarks') + if not os.path.isdir(vis_path): + os.makedirs(vis_path) + if not os.path.isdir(remove_path): + os.makedirs(remove_path) + if not os.path.isdir(save_path): + os.makedirs(save_path) + + for i in range(0, len(names)): + name = names[i] + print('%05d' % (i), ' ', name) + full_image_name = os.path.join(img_path, name) + txt_name = '.'.join(name.split('.')[:-1]) + '.txt' + full_txt_name = os.path.join(img_path, 'detections', txt_name) # 5 facial landmark path for each image + + # if an image does not have detected 5 facial landmarks, remove it from the training list + if not os.path.isfile(full_txt_name): + 
move(full_image_name, os.path.join(remove_path, name)) + continue + + # load data + img, five_points = load_data(full_image_name, full_txt_name) + input_img, scale, bbox = align_for_lm(img, five_points) # align for 68 landmark detection + + # if the alignment fails, remove corresponding image from the training list + if scale == 0: + move(full_txt_name, os.path.join( + remove_path, txt_name)) + move(full_image_name, os.path.join(remove_path, name)) + continue + + # detect landmarks + input_img = np.reshape( + input_img, [1, 224, 224, 3]).astype(np.float32) + landmark = sess.run( + output_op, feed_dict={input_op: input_img}) + + # transform back to original image coordinate + landmark = landmark.reshape([68, 2]) + mean_face + landmark[:, 1] = 223 - landmark[:, 1] + landmark = landmark / scale + landmark[:, 0] = landmark[:, 0] + bbox[0] + landmark[:, 1] = landmark[:, 1] + bbox[1] + landmark[:, 1] = img.shape[0] - 1 - landmark[:, 1] + + if i % 100 == 0: + draw_landmarks(img, landmark, os.path.join(vis_path, name)) + save_label(landmark, os.path.join(save_path, txt_name)) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/util/generate_list.py b/dreamtalk/Deep3DFaceRecon_pytorch/util/generate_list.py new file mode 100644 index 00000000..943d9067 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/util/generate_list.py @@ -0,0 +1,34 @@ +"""This script is to generate training list files for Deep3DFaceRecon_pytorch +""" + +import os + +# save path to training data +def write_list(lms_list, imgs_list, msks_list, mode='train',save_folder='datalist', save_name=''): + save_path = os.path.join(save_folder, mode) + if not os.path.isdir(save_path): + os.makedirs(save_path) + with open(os.path.join(save_path, save_name + 'landmarks.txt'), 'w') as fd: + fd.writelines([i + '\n' for i in lms_list]) + + with open(os.path.join(save_path, save_name + 'images.txt'), 'w') as fd: + fd.writelines([i + '\n' for i in imgs_list]) + + with open(os.path.join(save_path, save_name + 'masks.txt'), 
'w') as fd: + fd.writelines([i + '\n' for i in msks_list]) + +# check if the path is valid +def check_list(rlms_list, rimgs_list, rmsks_list): + lms_list, imgs_list, msks_list = [], [], [] + for i in range(len(rlms_list)): + flag = 'false' + lm_path = rlms_list[i] + im_path = rimgs_list[i] + msk_path = rmsks_list[i] + if os.path.isfile(lm_path) and os.path.isfile(im_path) and os.path.isfile(msk_path): + flag = 'true' + lms_list.append(rlms_list[i]) + imgs_list.append(rimgs_list[i]) + msks_list.append(rmsks_list[i]) + print(i, rlms_list[i], flag) + return lms_list, imgs_list, msks_list diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/util/html.py b/dreamtalk/Deep3DFaceRecon_pytorch/util/html.py new file mode 100644 index 00000000..cc3262a1 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/util/html.py @@ -0,0 +1,86 @@ +import dominate +from dominate.tags import meta, h3, table, tr, td, p, a, img, br +import os + + +class HTML: + """This HTML class allows us to save images and write texts into a single HTML file. + + It consists of functions such as (add a text header to the HTML file), + (add a row of images to the HTML file), and (save the HTML to the disk). + It is based on Python library 'dominate', a Python library for creating and manipulating HTML documents using a DOM API. + """ + + def __init__(self, web_dir, title, refresh=0): + """Initialize the HTML classes + + Parameters: + web_dir (str) -- a directory that stores the webpage. 
HTML file will be created at /index.html; images will be saved at 0: + with self.doc.head: + meta(http_equiv="refresh", content=str(refresh)) + + def get_image_dir(self): + """Return the directory that stores images""" + return self.img_dir + + def add_header(self, text): + """Insert a header to the HTML file + + Parameters: + text (str) -- the header text + """ + with self.doc: + h3(text) + + def add_images(self, ims, txts, links, width=400): + """add images to the HTML file + + Parameters: + ims (str list) -- a list of image paths + txts (str list) -- a list of image names shown on the website + links (str list) -- a list of hyperref links; when you click an image, it will redirect you to a new page + """ + self.t = table(border=1, style="table-layout: fixed;") # Insert a table + self.doc.add(self.t) + with self.t: + with tr(): + for im, txt, link in zip(ims, txts, links): + with td(style="word-wrap: break-word;", halign="center", valign="top"): + with p(): + with a(href=os.path.join('images', link)): + img(style="width:%dpx" % width, src=os.path.join('images', im)) + br() + p(txt) + + def save(self): + """save the current content to the HMTL file""" + html_file = '%s/index.html' % self.web_dir + f = open(html_file, 'wt') + f.write(self.doc.render()) + f.close() + + +if __name__ == '__main__': # we show an example usage here. 
# load expression basis
def LoadExpBasis(bfm_folder='BFM'):
    """Load the expression PCA basis of Guo et al. from Exp_Pca.bin.

    Parameters:
        bfm_folder (str) -- folder containing Exp_Pca.bin and std_exp.txt

    Returns:
        expPC (np.ndarray) -- expression principal components, shape (3*53215, exp_dim)
        expEV (np.ndarray) -- per-component std deviations read from std_exp.txt
    """
    n_vertex = 53215
    # context manager releases the handle even if a fromfile call raises
    # (the original opened/closed manually and leaked on error)
    with open(osp.join(bfm_folder, 'Exp_Pca.bin'), 'rb') as exp_bin:
        exp_dim = array('i')
        exp_dim.fromfile(exp_bin, 1)          # first int32 is the basis dimension
        expMU = array('f')
        expPC = array('f')
        expMU.fromfile(exp_bin, 3 * n_vertex)
        expPC.fromfile(exp_bin, 3 * exp_dim[0] * n_vertex)

    expPC = np.array(expPC)
    expPC = np.reshape(expPC, [exp_dim[0], -1])
    expPC = np.transpose(expPC)

    expEV = np.loadtxt(osp.join(bfm_folder, 'std_exp.txt'))

    return expPC, expEV


# transfer original BFM09 to our face model
def transferBFM09(bfm_folder='BFM'):
    """Convert the original BFM09 morphable model into the cropped front model.

    Reads 01_MorphableModel.mat plus the expression basis, selects the 35709
    front-face vertices, rescales bases to decimeters, and writes
    BFM_model_front.mat into bfm_folder.
    """
    print('Transfer BFM09 to BFM_model_front......')
    original_BFM = loadmat(osp.join(bfm_folder, '01_MorphableModel.mat'))
    shapePC = original_BFM['shapePC']  # shape basis
    shapeEV = original_BFM['shapeEV']  # corresponding eigen value
    shapeMU = original_BFM['shapeMU']  # mean face
    texPC = original_BFM['texPC']      # texture basis
    texEV = original_BFM['texEV']      # eigen value
    texMU = original_BFM['texMU']      # mean texture

    expPC, expEV = LoadExpBasis(bfm_folder)

    # scale each principal component by its eigenvalue, then truncate the basis
    idBase = shapePC * np.reshape(shapeEV, [-1, 199])
    idBase = idBase / 1e5   # unify the scale to decimeter
    idBase = idBase[:, :80]  # use only first 80 basis

    exBase = expPC * np.reshape(expEV, [-1, 79])
    exBase = exBase / 1e5   # unify the scale to decimeter
    exBase = exBase[:, :64]  # use only first 64 basis

    texBase = texPC * np.reshape(texEV, [-1, 199])
    texBase = texBase[:, :80]  # use only first 80 basis

    # our face model is cropped along face landmarks and contains only 35709 vertex.
    # original BFM09 contains 53490 vertex, and expression basis provided by Guo et al.
    # contains 53215 vertex. thus we select corresponding vertex to get our face model.
    index_exp = loadmat(osp.join(bfm_folder, 'BFM_front_idx.mat'))
    index_exp = index_exp['idx'].astype(np.int32) - 1  # starts from 0 (to 53215)

    index_shape = loadmat(osp.join(bfm_folder, 'BFM_exp_idx.mat'))
    index_shape = index_shape['trimIndex'].astype(np.int32) - 1  # starts from 0 (to 53490)
    index_shape = index_shape[index_exp]

    idBase = np.reshape(idBase, [-1, 3, 80])
    idBase = idBase[index_shape, :, :]
    idBase = np.reshape(idBase, [-1, 80])

    texBase = np.reshape(texBase, [-1, 3, 80])
    texBase = texBase[index_shape, :, :]
    texBase = np.reshape(texBase, [-1, 80])

    exBase = np.reshape(exBase, [-1, 3, 64])
    exBase = exBase[index_exp, :, :]
    exBase = np.reshape(exBase, [-1, 64])

    meanshape = np.reshape(shapeMU, [-1, 3]) / 1e5
    meanshape = meanshape[index_shape, :]
    meanshape = np.reshape(meanshape, [1, -1])

    meantex = np.reshape(texMU, [-1, 3])
    meantex = meantex[index_shape, :]
    meantex = np.reshape(meantex, [1, -1])

    # other info contains triangles, region used for computing photometric loss,
    # region used for skin texture regularization, and 68 landmarks index etc.
    other_info = loadmat(osp.join(bfm_folder, 'facemodel_info.mat'))
    frontmask2_idx = other_info['frontmask2_idx']
    skinmask = other_info['skinmask']
    keypoints = other_info['keypoints']
    point_buf = other_info['point_buf']
    tri = other_info['tri']
    tri_mask2 = other_info['tri_mask2']

    # save our face model
    savemat(osp.join(bfm_folder, 'BFM_model_front.mat'),
            {'meanshape': meanshape, 'meantex': meantex, 'idBase': idBase,
             'exBase': exBase, 'texBase': texBase, 'tri': tri,
             'point_buf': point_buf, 'tri_mask2': tri_mask2,
             'keypoints': keypoints, 'frontmask2_idx': frontmask2_idx,
             'skinmask': skinmask})


# load landmarks for standard face, which is used for image preprocessing
def load_lm3d(bfm_folder):
    """Return the 5-point 3D landmark template derived from 68 landmarks.

    Parameters:
        bfm_folder (str) -- folder containing similarity_Lm3D_all.mat

    Returns:
        np.ndarray (5, 3) -- [left eye, right eye, nose tip, left mouth, right mouth]
    """
    Lm3D = loadmat(osp.join(bfm_folder, 'similarity_Lm3D_all.mat'))
    Lm3D = Lm3D['lm']

    # calculate 5 facial landmarks using 68 landmarks (1-based indices -> 0-based)
    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1
    Lm3D = np.stack([Lm3D[lm_idx[0], :],
                     np.mean(Lm3D[lm_idx[[1, 2]], :], 0),
                     np.mean(Lm3D[lm_idx[[3, 4]], :], 0),
                     Lm3D[lm_idx[5], :],
                     Lm3D[lm_idx[6], :]], axis=0)
    Lm3D = Lm3D[[1, 2, 0, 3, 4], :]  # reorder: eyes, nose, mouth corners

    return Lm3D
+""" + +import torch +import torch.nn.functional as F +import kornia +from kornia.geometry.camera import pixel2cam +import numpy as np +from typing import List +import nvdiffrast.torch as dr +from scipy.io import loadmat +from torch import nn + +def ndc_projection(x=0.1, n=1.0, f=50.0): + return np.array([[n/x, 0, 0, 0], + [ 0, n/-x, 0, 0], + [ 0, 0, -(f+n)/(f-n), -(2*f*n)/(f-n)], + [ 0, 0, -1, 0]]).astype(np.float32) + +class MeshRenderer(nn.Module): + def __init__(self, + rasterize_fov, + znear=0.1, + zfar=10, + rasterize_size=224, + use_opengl=False): + super(MeshRenderer, self).__init__() + + x = np.tan(np.deg2rad(rasterize_fov * 0.5)) * znear + self.ndc_proj = torch.tensor(ndc_projection(x=x, n=znear, f=zfar)).matmul( + torch.diag(torch.tensor([1., -1, -1, 1]))) + self.rasterize_size = rasterize_size + self.use_opengl = False + self.ctx = None + + def forward(self, vertex, tri, feat=None): + """ + Return: + mask -- torch.tensor, size (B, 1, H, W) + depth -- torch.tensor, size (B, 1, H, W) + features(optional) -- torch.tensor, size (B, C, H, W) if feat is not None + + Parameters: + vertex -- torch.tensor, size (B, N, 3) + tri -- torch.tensor, size (B, M, 3) or (M, 3), triangles + feat(optional) -- torch.tensor, size (B, C), features + """ + device = vertex.device + rsize = int(self.rasterize_size) + ndc_proj = self.ndc_proj.to(device) + # trans to homogeneous coordinates of 3d vertices, the direction of y is the same as v + if vertex.shape[-1] == 3: + vertex = torch.cat([vertex, torch.ones([*vertex.shape[:2], 1]).to(device)], dim=-1) + vertex[..., 1] = -vertex[..., 1] + + + vertex_ndc = vertex @ ndc_proj.t() + if self.ctx is None: + # if self.use_opengl: + # self.ctx = dr.RasterizeGLContext(device=device) + # ctx_str = "opengl" + # else: + self.ctx = dr.RasterizeCudaContext(device=device) + ctx_str = "cuda" + print("create %s ctx on device cuda:%d"%(ctx_str, device.index)) + + ranges = None + if isinstance(tri, List) or len(tri.shape) == 3: + vum = 
vertex_ndc.shape[1] + fnum = torch.tensor([f.shape[0] for f in tri]).unsqueeze(1).to(device) + fstartidx = torch.cumsum(fnum, dim=0) - fnum + ranges = torch.cat([fstartidx, fnum], axis=1).type(torch.int32).cpu() + for i in range(tri.shape[0]): + tri[i] = tri[i] + i*vum + vertex_ndc = torch.cat(vertex_ndc, dim=0) + tri = torch.cat(tri, dim=0) + + # for range_mode vetex: [B*N, 4], tri: [B*M, 3], for instance_mode vetex: [B, N, 4], tri: [M, 3] + tri = tri.type(torch.int32).contiguous() + rast_out, _ = dr.rasterize(self.ctx, vertex_ndc.contiguous(), tri, resolution=[rsize, rsize], ranges=ranges) + + depth, _ = dr.interpolate(vertex.reshape([-1,4])[...,2].unsqueeze(1).contiguous(), rast_out, tri) + depth = depth.permute(0, 3, 1, 2) + mask = (rast_out[..., 3] > 0).float().unsqueeze(1) + depth = mask * depth + + + image = None + if feat is not None: + image, _ = dr.interpolate(feat, rast_out, tri) + image = image.permute(0, 3, 1, 2) + image = mask * image + + return mask, depth, image + diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/util/preprocess.py b/dreamtalk/Deep3DFaceRecon_pytorch/util/preprocess.py new file mode 100644 index 00000000..abee9cf4 --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/util/preprocess.py @@ -0,0 +1,236 @@ +"""This script contains the image preprocessing code for Deep3DFaceRecon_pytorch +""" + +import numpy as np +from scipy.io import loadmat + +try: + from PIL.Image import Resampling + RESAMPLING_METHOD = Resampling.BICUBIC +except ImportError: + from PIL.Image import BICUBIC + RESAMPLING_METHOD = BICUBIC + +import cv2 +import os +from skimage import transform as trans +import torch +import warnings +warnings.filterwarnings("ignore") +warnings.filterwarnings("ignore", category=FutureWarning) + + +# calculating least square problem for image alignment +def POS(xp, x): + npts = xp.shape[1] + + A = np.zeros([2*npts, 8]) + + A[0:2*npts-1:2, 0:3] = x.transpose() + A[0:2*npts-1:2, 3] = 1 + + A[1:2*npts:2, 4:7] = x.transpose() + A[1:2*npts:2, 
# bounding box regression for 68-landmark detection
def BBRegression(points, params):
    """Predict a square face bounding box from 5 facial landmarks.

    Parameters:
        points (np.ndarray) -- 5 landmark coordinates, flattened to (10,)
        params (dict)       -- regressor weights 'W1', 'B1', 'W2', 'B2'

    Returns:
        np.ndarray (4,) -- [x, y, w, w] square box
    """
    w1, b1 = params['W1'], params['B1']
    w2, b2 = params['W2'], params['B2']

    # normalize landmarks: center on their mean, scale by RMS radius
    pts = points.copy().reshape([5, 2])
    center = np.mean(pts, axis=0)
    x_mean, y_mean = center[0], center[1]
    pts[:, 0] = pts[:, 0] - x_mean
    pts[:, 1] = pts[:, 1] - y_mean
    rms = np.sqrt(np.sum(pts ** 2) / 5)

    feat = np.transpose((pts / rms).reshape([1, 10]))
    hidden = np.matmul(w1, feat) + b1
    hidden = 2 / (1 + np.exp(-2 * hidden)) - 1  # tanh, written out
    out = np.transpose(np.matmul(w2, hidden) + b2)

    # denormalize the predicted box back to image coordinates
    x = out[:, 0] * rms + x_mean
    y = out[:, 1] * rms + y_mean
    w = 224 / out[:, 2] * rms
    return np.array([x, y, w, w]).reshape([4])


# utils for landmark detection
def img_padding(img, box):
    """Embed img in a 2x-sized zero canvas and shift the box accordingly.

    Returns (padded_image, shifted_box, success); success is False when the
    shifted box still starts at a negative coordinate.
    """
    shifted = box.copy()
    h, w = img.shape[0], img.shape[1]
    canvas = np.zeros([2 * h, 2 * w, 3])
    canvas[h // 2: h + h // 2, w // 2: w + w // 2] = img

    shifted[0] = shifted[0] + w // 2
    shifted[1] = shifted[1] + h // 2
    ok = not (shifted[0] < 0 or shifted[1] < 0)
    return canvas, shifted, ok


# utils for landmark detection
def crop(img, bbox):
    """Crop bbox out of a padded copy of img and resize it to 224x224.

    Returns (crop, scale); on padding failure returns (padded_img, 0).
    """
    padded, pbox, ok = img_padding(img, bbox)
    if not ok:
        return padded, 0
    patch = padded[pbox[1]: pbox[1] + pbox[3], pbox[0]: pbox[0] + pbox[2]]
    patch = cv2.resize(patch.astype(np.uint8), (224, 224),
                       interpolation=cv2.INTER_CUBIC)
    return patch, 224 / pbox[3]


# utils for landmark detection
def scale_trans(img, lm, t, s):
    """Recenter img on t, rescale by 100/s, and crop a centered 224 window.

    Returns (cropped_img, inv) where inv = (scale, translation) maps crop
    coordinates back to the raw image.
    """
    w0, h0 = img.shape[1], img.shape[0]
    # translate so that t moves to the image center (with the original 0.5 offset)
    M_s = np.array([[1, 0, -t[0] + w0 // 2 + 0.5], [0, 1, -h0 // 2 + t[1]]],
                   dtype=np.float32)
    img = cv2.warpAffine(img, M_s, (w0, h0))
    new_w = int(w0 / s * 100)
    new_h = int(h0 / s * 100)
    img = cv2.resize(img, (new_w, new_h))
    lm = np.stack([lm[:, 0] - t[0] + w0 // 2,
                   lm[:, 1] - t[1] + h0 // 2], axis=1) / s * 100

    bbox = [new_w // 2 - 112, new_h // 2 - 112, 224, 224]
    cropped_img, scale2 = crop(img, bbox)
    assert(scale2 != 0)

    # compose the inverse mapping: raw = scale * crop + translation
    t1 = np.array([new_w // 2 - 112, new_h // 2 - 112])
    scale = s / 100
    t2 = np.array([t[0] - w0 / 2, t[1] - h0 / 2])
    inv = (scale / scale2, scale * t1 + t2.reshape([2]))
    return cropped_img, inv


# utils for landmark detection
def align_for_lm(img, five_points):
    """Crop a regressor-predicted face box to feed the 68-landmark detector."""
    five_points = np.array(five_points).reshape([1, 10])
    params = loadmat('util/BBRegressorParam_r.mat')
    bbox = BBRegression(five_points, params)
    assert(bbox[2] != 0)
    bbox = np.round(bbox).astype(np.int32)
    crop_img, scale = crop(img, bbox)
    return crop_img, scale, bbox


# resize and crop images for face reconstruction
def resize_n_crop_img(img, lm, t, s, target_size=224., mask=None):
    """Rescale img by s and crop a target_size window centered on t.

    Landmarks are transformed into the crop's coordinate frame; the optional
    mask receives the same resize/crop.
    """
    w0, h0 = img.size
    w = (w0 * s).astype(np.int32)
    h = (h0 * s).astype(np.int32)
    left = (w / 2 - target_size / 2 + float((t[0] - w0 / 2) * s)).astype(np.int32)
    up = (h / 2 - target_size / 2 + float((h0 / 2 - t[1]) * s)).astype(np.int32)
    right = left + target_size
    below = up + target_size

    img = img.resize((w, h), resample=RESAMPLING_METHOD)
    img = img.crop((left, up, right, below))

    if mask is not None:
        mask = mask.resize((w, h), resample=RESAMPLING_METHOD)
        mask = mask.crop((left, up, right, below))

    lm = np.stack([lm[:, 0] - t[0] + w0 / 2,
                   lm[:, 1] - t[1] + h0 / 2], axis=1) * s
    lm = lm - np.reshape(
        np.array([(w / 2 - target_size / 2), (h / 2 - target_size / 2)]), [1, 2])

    return img, lm, mask


# utils for face reconstruction
def extract_5p(lm):
    """Reduce 68 landmarks to 5: eye centers, nose tip, mouth corners."""
    lm_idx = np.array([31, 37, 40, 43, 46, 49, 55]) - 1  # 1-based -> 0-based
    lm5p = np.stack([lm[lm_idx[0], :],
                     np.mean(lm[lm_idx[[1, 2]], :], 0),
                     np.mean(lm[lm_idx[[3, 4]], :], 0),
                     lm[lm_idx[5], :],
                     lm[lm_idx[6], :]], axis=0)
    return lm5p[[1, 2, 0, 3, 4], :]  # reorder: eyes, nose, mouth corners


# utils for face reconstruction
def align_img(img, lm, lm3D, mask=None, target_size=224., rescale_factor=102.):
    """
    Return:
        transparams --numpy.array  (raw_W, raw_H, scale, tx, ty)
        img_new     --PIL.Image  (target_size, target_size, 3)
        lm_new      --numpy.array  (68, 2), y direction is opposite to v direction
        mask_new    --PIL.Image  (target_size, target_size)

    Parameters:
        img         --PIL.Image  (raw_H, raw_W, 3)
        lm          --numpy.array  (68, 2), y direction is opposite to v direction
        lm3D        --numpy.array  (5, 3)
        mask        --PIL.Image  (raw_H, raw_W, 3)
    """
    w0, h0 = img.size
    lm5p = extract_5p(lm) if lm.shape[0] != 5 else lm

    # translation/scale from 5 landmarks against the standard 3D template
    t, s = POS(lm5p.transpose(), lm3D.transpose())
    s = rescale_factor / s

    img_new, lm_new, mask_new = resize_n_crop_img(
        img, lm, t, s, target_size=target_size, mask=mask)
    trans_params = np.array([w0, h0, s, t[0][0], t[1][0]])

    return trans_params, img_new, lm_new, mask_new


# utils for face recognition model
def estimate_norm(lm_68p, H):
    # from https://github.com/deepinsight/insightface/blob/c61d3cd208a603dfa4a338bd743b320ce3e94730/recognition/common/face_align.py#L68
    """
    Return:
        trans_m --numpy.array  (2, 3)
    Parameters:
        lm --numpy.array  (68, 2), y direction is opposite to v direction
        H  --int/float, image height
    """
    lm = extract_5p(lm_68p)
    lm[:, -1] = H - 1 - lm[:, -1]  # flip y into image v coordinates
    tform = trans.SimilarityTransform()
    src = np.array(
        [[38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366],
         [41.5493, 92.3655], [70.7299, 92.2041]],
        dtype=np.float32)
    tform.estimate(lm, src)
    M = tform.params
    if np.linalg.det(M) == 0:
        M = np.eye(3)  # degenerate estimate: fall back to identity
    return M[0:2, :]
M[0:2, :] + +def estimate_norm_torch(lm_68p, H): + lm_68p_ = lm_68p.detach().cpu().numpy() + M = [] + for i in range(lm_68p_.shape[0]): + M.append(estimate_norm(lm_68p_[i], H)) + M = torch.tensor(np.array(M), dtype=torch.float32).to(lm_68p.device) + return M diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/util/skin_mask.py b/dreamtalk/Deep3DFaceRecon_pytorch/util/skin_mask.py new file mode 100644 index 00000000..a8a74e4c --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/util/skin_mask.py @@ -0,0 +1,125 @@ +"""This script is to generate skin attention mask for Deep3DFaceRecon_pytorch +""" + +import math +import numpy as np +import os +import cv2 + +class GMM: + def __init__(self, dim, num, w, mu, cov, cov_det, cov_inv): + self.dim = dim # feature dimension + self.num = num # number of Gaussian components + self.w = w # weights of Gaussian components (a list of scalars) + self.mu= mu # mean of Gaussian components (a list of 1xdim vectors) + self.cov = cov # covariance matrix of Gaussian components (a list of dimxdim matrices) + self.cov_det = cov_det # pre-computed determinet of covariance matrices (a list of scalars) + self.cov_inv = cov_inv # pre-computed inverse covariance matrices (a list of dimxdim matrices) + + self.factor = [0]*num + for i in range(self.num): + self.factor[i] = (2*math.pi)**(self.dim/2) * self.cov_det[i]**0.5 + + def likelihood(self, data): + assert(data.shape[1] == self.dim) + N = data.shape[0] + lh = np.zeros(N) + + for i in range(self.num): + data_ = data - self.mu[i] + + tmp = np.matmul(data_,self.cov_inv[i]) * data_ + tmp = np.sum(tmp,axis=1) + power = -0.5 * tmp + + p = np.array([math.exp(power[j]) for j in range(N)]) + p = p/self.factor[i] + lh += p*self.w[i] + + return lh + + +def _rgb2ycbcr(rgb): + m = np.array([[65.481, 128.553, 24.966], + [-37.797, -74.203, 112], + [112, -93.786, -18.214]]) + shape = rgb.shape + rgb = rgb.reshape((shape[0] * shape[1], 3)) + ycbcr = np.dot(rgb, m.transpose() / 255.) + ycbcr[:, 0] += 16. 
+ ycbcr[:, 1:] += 128. + return ycbcr.reshape(shape) + + +def _bgr2ycbcr(bgr): + rgb = bgr[..., ::-1] + return _rgb2ycbcr(rgb) + + +gmm_skin_w = [0.24063933, 0.16365987, 0.26034665, 0.33535415] +gmm_skin_mu = [np.array([113.71862, 103.39613, 164.08226]), + np.array([150.19858, 105.18467, 155.51428]), + np.array([183.92976, 107.62468, 152.71820]), + np.array([114.90524, 113.59782, 151.38217])] +gmm_skin_cov_det = [5692842.5, 5851930.5, 2329131., 1585971.] +gmm_skin_cov_inv = [np.array([[0.0019472069, 0.0020450759, -0.00060243998],[0.0020450759, 0.017700525, 0.0051420014],[-0.00060243998, 0.0051420014, 0.0081308950]]), + np.array([[0.0027110141, 0.0011036990, 0.0023122299],[0.0011036990, 0.010707724, 0.010742856],[0.0023122299, 0.010742856, 0.017481629]]), + np.array([[0.0048026871, 0.00022935172, 0.0077668377],[0.00022935172, 0.011729696, 0.0081661865],[0.0077668377, 0.0081661865, 0.025374353]]), + np.array([[0.0011989699, 0.0022453172, -0.0010748957],[0.0022453172, 0.047758564, 0.020332102],[-0.0010748957, 0.020332102, 0.024502251]])] + +gmm_skin = GMM(3, 4, gmm_skin_w, gmm_skin_mu, [], gmm_skin_cov_det, gmm_skin_cov_inv) + +gmm_nonskin_w = [0.12791070, 0.31130761, 0.34245777, 0.21832393] +gmm_nonskin_mu = [np.array([99.200851, 112.07533, 140.20602]), + np.array([110.91392, 125.52969, 130.19237]), + np.array([129.75864, 129.96107, 126.96808]), + np.array([112.29587, 128.85121, 129.05431])] +gmm_nonskin_cov_det = [458703648., 6466488., 90611376., 133097.63] +gmm_nonskin_cov_inv = [np.array([[0.00085371657, 0.00071197288, 0.00023958916],[0.00071197288, 0.0025935620, 0.00076557708],[0.00023958916, 0.00076557708, 0.0015042332]]), + np.array([[0.00024650150, 0.00045542428, 0.00015019422],[0.00045542428, 0.026412144, 0.018419769],[0.00015019422, 0.018419769, 0.037497383]]), + np.array([[0.00037054974, 0.00038146760, 0.00040408765],[0.00038146760, 0.0085505722, 0.0079136286],[0.00040408765, 0.0079136286, 0.010982352]]), + np.array([[0.00013709733, 0.00051228428, 
0.00012777430],[0.00051228428, 0.28237113, 0.10528370],[0.00012777430, 0.10528370, 0.23468947]])] + +gmm_nonskin = GMM(3, 4, gmm_nonskin_w, gmm_nonskin_mu, [], gmm_nonskin_cov_det, gmm_nonskin_cov_inv) + +prior_skin = 0.8 +prior_nonskin = 1 - prior_skin + + +# calculate skin attention mask +def skinmask(imbgr): + im = _bgr2ycbcr(imbgr) + + data = im.reshape((-1,3)) + + lh_skin = gmm_skin.likelihood(data) + lh_nonskin = gmm_nonskin.likelihood(data) + + tmp1 = prior_skin * lh_skin + tmp2 = prior_nonskin * lh_nonskin + post_skin = tmp1 / (tmp1+tmp2) # posterior probability + + post_skin = post_skin.reshape((im.shape[0],im.shape[1])) + + post_skin = np.round(post_skin*255) + post_skin = post_skin.astype(np.uint8) + post_skin = np.tile(np.expand_dims(post_skin,2),[1,1,3]) # reshape to H*W*3 + + return post_skin + + +def get_skin_mask(img_path): + print('generating skin masks......') + names = [i for i in sorted(os.listdir( + img_path)) if 'jpg' in i or 'png' in i or 'jpeg' in i or 'PNG' in i] + save_path = os.path.join(img_path, 'mask') + if not os.path.isdir(save_path): + os.makedirs(save_path) + + for i in range(0, len(names)): + name = names[i] + print('%05d' % (i), ' ', name) + full_image_name = os.path.join(img_path, name) + img = cv2.imread(full_image_name).astype(np.float32) + skin_img = skinmask(img) + cv2.imwrite(os.path.join(save_path, name), skin_img.astype(np.uint8)) diff --git a/dreamtalk/Deep3DFaceRecon_pytorch/util/test_mean_face.txt b/dreamtalk/Deep3DFaceRecon_pytorch/util/test_mean_face.txt new file mode 100644 index 00000000..3a46d4db --- /dev/null +++ b/dreamtalk/Deep3DFaceRecon_pytorch/util/test_mean_face.txt @@ -0,0 +1,136 @@ +-5.228591537475585938e+01 +2.078247070312500000e-01 +-5.064269638061523438e+01 +-1.315765380859375000e+01 +-4.952939224243164062e+01 +-2.592591094970703125e+01 +-4.793047332763671875e+01 +-3.832135772705078125e+01 +-4.512159729003906250e+01 +-5.059623336791992188e+01 +-3.917720794677734375e+01 +-6.043736648559570312e+01 
+-2.929953765869140625e+01 +-6.861183166503906250e+01 +-1.719801330566406250e+01 +-7.572736358642578125e+01 +-1.961936950683593750e+00 +-7.862001037597656250e+01 +1.467941284179687500e+01 +-7.607844543457031250e+01 +2.744073486328125000e+01 +-6.915261840820312500e+01 +3.855677795410156250e+01 +-5.950350570678710938e+01 +4.478240966796875000e+01 +-4.867547225952148438e+01 +4.714337158203125000e+01 +-3.800830078125000000e+01 +4.940315246582031250e+01 +-2.496297454833984375e+01 +5.117234802246093750e+01 +-1.241538238525390625e+01 +5.190507507324218750e+01 +8.244247436523437500e-01 +-4.150688934326171875e+01 +2.386329650878906250e+01 +-3.570307159423828125e+01 +3.017010498046875000e+01 +-2.790358734130859375e+01 +3.212951660156250000e+01 +-1.941773223876953125e+01 +3.156523132324218750e+01 +-1.138106536865234375e+01 +2.841992187500000000e+01 +5.993263244628906250e+00 +2.895182800292968750e+01 +1.343590545654296875e+01 +3.189880371093750000e+01 +2.203153991699218750e+01 +3.302221679687500000e+01 +2.992478942871093750e+01 +3.099150085449218750e+01 +3.628388977050781250e+01 +2.765748596191406250e+01 +-1.933914184570312500e+00 +1.405374145507812500e+01 +-2.153038024902343750e+00 +5.772636413574218750e+00 +-2.270050048828125000e+00 +-2.121643066406250000e+00 +-2.218330383300781250e+00 +-1.068978118896484375e+01 +-1.187252044677734375e+01 +-1.997912597656250000e+01 +-6.879402160644531250e+00 +-2.143579864501953125e+01 +-1.227821350097656250e+00 +-2.193494415283203125e+01 +4.623237609863281250e+00 +-2.152721405029296875e+01 +9.721397399902343750e+00 +-1.953671264648437500e+01 +-3.648714447021484375e+01 +9.811126708984375000e+00 +-3.130242919921875000e+01 +1.422447967529296875e+01 +-2.212834930419921875e+01 +1.493019866943359375e+01 +-1.500880432128906250e+01 +1.073588562011718750e+01 +-2.095037078857421875e+01 +9.054298400878906250e+00 +-3.050099182128906250e+01 +8.704177856445312500e+00 +1.173237609863281250e+01 +1.054329681396484375e+01 +1.856353759765625000e+01 
def str2bool(v):
    """Parse a boolean-like string for argparse; passes real bools through.

    Raises:
        argparse.ArgumentTypeError -- on unrecognized values
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def copyconf(default_opt, **kwargs):
    """Return a shallow copy of default_opt with kwargs overriding fields."""
    conf = Namespace(**vars(default_opt))
    for key in kwargs:
        setattr(conf, key, kwargs[key])
    return conf


def genvalconf(train_opt, **kwargs):
    """Derive a validation config: every 'val'-named field whose stem exists
    (e.g. batch_size_val -> batch_size) overrides the stem; kwargs win last."""
    conf = Namespace(**vars(train_opt))
    attr_dict = train_opt.__dict__
    for key, value in attr_dict.items():
        if 'val' in key and key.split('_')[0] in attr_dict:
            setattr(conf, key.split('_')[0], value)

    for key in kwargs:
        setattr(conf, key, kwargs[key])

    return conf


def find_class_in_module(target_cls_name, module):
    """Locate a class in `module` whose lowercased, underscore-free name
    matches target_cls_name; asserts that one exists."""
    target_cls_name = target_cls_name.replace('_', '').lower()
    clslib = importlib.import_module(module)
    cls = None
    for name, clsobj in clslib.__dict__.items():
        if name.lower() == target_cls_name:
            cls = clsobj

    assert cls is not None, "In %s, there should be a class whose name matches %s in lowercase without underscore(_)" % (module, target_cls_name)

    return cls


def tensor2im(input_image, imtype=np.uint8):
    """Converts a Tensor array into a numpy image array.

    Parameters:
        input_image (tensor) --  the input image tensor array, range(0, 1)
        imtype (type)        --  the desired type of the converted numpy array
    """
    if not isinstance(input_image, np.ndarray):
        if isinstance(input_image, torch.Tensor):  # get the data from a variable
            image_tensor = input_image.data
        else:
            return input_image
        image_numpy = image_tensor.clamp(0.0, 1.0).cpu().float().numpy()
        if image_numpy.shape[0] == 1:  # grayscale to RGB
            image_numpy = np.tile(image_numpy, (3, 1, 1))
        # post-processing: transpose CHW -> HWC and scale to 0-255
        image_numpy = np.transpose(image_numpy, (1, 2, 0)) * 255.0
    else:  # if it is a numpy array, do nothing
        image_numpy = input_image
    return image_numpy.astype(imtype)


def diagnose_network(net, name='network'):
    """Calculate and print the mean of average absolute(gradients)

    Parameters:
        net (torch network) -- Torch network
        name (str)          -- the name of the network
    """
    mean = 0.0
    count = 0
    for param in net.parameters():
        if param.grad is not None:
            mean += torch.mean(torch.abs(param.grad.data))
            count += 1
    if count > 0:
        mean = mean / count
    print(name)
    print(mean)


def save_image(image_numpy, image_path, aspect_ratio=1.0):
    """Save a numpy image to the disk

    Parameters:
        image_numpy (numpy array) -- input numpy array
        image_path (str)          -- the path of the image
    """
    image_pil = Image.fromarray(image_numpy)
    h, w, _ = image_numpy.shape

    # NOTE(review): PIL.resize takes (width, height) but h is passed first,
    # so non-square inputs resize on swapped axes -- kept to match upstream;
    # confirm before changing
    if aspect_ratio is None:
        pass
    elif aspect_ratio > 1.0:
        image_pil = image_pil.resize((h, int(w * aspect_ratio)), RESAMPLING_METHOD)
    elif aspect_ratio < 1.0:
        image_pil = image_pil.resize((int(h / aspect_ratio), w), RESAMPLING_METHOD)
    image_pil.save(image_path)


def print_numpy(x, val=True, shp=False):
    """Print the mean, min, max, median, std, and size of a numpy array

    Parameters:
        val (bool) -- if print the values of the numpy array
        shp (bool) -- if print the shape of the numpy array
    """
    x = x.astype(np.float64)
    if shp:
        print('shape,', x.shape)
    if val:
        x = x.flatten()
        print('mean = %3.3f, min = %3.3f, max = %3.3f, median = %3.3f, std=%3.3f' % (
            np.mean(x), np.min(x), np.max(x), np.median(x), np.std(x)))


def mkdirs(paths):
    """create empty directories if they don't exist

    Parameters:
        paths (str list) -- a list of directory paths
    """
    if isinstance(paths, list) and not isinstance(paths, str):
        for path in paths:
            mkdir(path)
    else:
        mkdir(paths)


def mkdir(path):
    """create a single empty directory if it didn't exist

    Parameters:
        path (str) -- a single directory path
    """
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(path): os.makedirs(path)` pattern
    os.makedirs(path, exist_ok=True)


def correct_resize_label(t, size):
    """Resize a batch of single-channel label maps with nearest-neighbor
    sampling (labels must not be interpolated); returns a long tensor."""
    device = t.device
    t = t.detach().cpu()
    resized = []
    for i in range(t.size(0)):
        one_t = t[i, :1]
        one_np = np.transpose(one_t.numpy().astype(np.uint8), (1, 2, 0))
        one_np = one_np[:, :, 0]
        one_image = Image.fromarray(one_np).resize(size, Image.NEAREST)
        resized_t = torch.from_numpy(np.array(one_image)).long()
        resized.append(resized_t)
    return torch.stack(resized, dim=0).to(device)


def correct_resize(t, size, mode=None):
    """Resize a batch of image tensors via PIL; output is rescaled to [-1, 1].

    NOTE(review): `mode` is accepted for API compatibility but unused --
    resizing always uses RESAMPLING_METHOD, as in the original.
    """
    device = t.device
    t = t.detach().cpu()
    resized = []
    for i in range(t.size(0)):
        one_t = t[i:i + 1]
        one_image = Image.fromarray(tensor2im(one_t)).resize(size, RESAMPLING_METHOD)
        resized_t = torchvision.transforms.functional.to_tensor(one_image) * 2 - 1.0
        resized.append(resized_t)
    return torch.stack(resized, dim=0).to(device)


def draw_landmarks(img, landmark, color='r', step=2):
    """
    Return:
        img -- numpy.array, (B, H, W, 3) img with landmark, RGB order, range (0, 255)

    Parameters:
        img      -- numpy.array, (B, H, W, 3), RGB order, range (0, 255)
        landmark -- numpy.array, (B, 68, 2), y direction is opposite to v direction
        color    -- str, 'r' or 'b' (red or blue)
    """
    if color == 'r':
        c = np.array([255., 0, 0])
    else:
        c = np.array([0, 0, 255.])

    _, H, W, _ = img.shape
    img, landmark = img.copy(), landmark.copy()  # do not mutate the caller's arrays
    landmark[..., 1] = H - 1 - landmark[..., 1]  # flip y into image v coordinates
    landmark = np.round(landmark).astype(np.int32)
    for i in range(landmark.shape[1]):
        x, y = landmark[:, i, 0], landmark[:, i, 1]
        # stamp a (2*step)x(2*step) square around each point, clipped to bounds
        for j in range(-step, step):
            for k in range(-step, step):
                u = np.clip(x + j, 0, W - 1)
                v = np.clip(y + k, 0, H - 1)
                for m in range(landmark.shape[0]):
                    img[m, v[m], u[m]] = c
    return img
def save_images(webpage, visuals, image_path, aspect_ratio=1.0, width=256):
    """Save images to the disk.

    Parameters:
        webpage (the HTML class)  -- the HTML webpage class that stores these images (see html.py for more details)
        visuals (OrderedDict)     -- an ordered dictionary that stores (name, images (either tensor or numpy)) pairs
        image_path (str)          -- the string is used to create image paths
        aspect_ratio (float)      -- the aspect ratio of saved images
        width (int)               -- the images will be resized to width x width

    This function will save images stored in 'visuals' to the HTML file specified by 'webpage'.
    """
    image_dir = webpage.get_image_dir()
    short_path = ntpath.basename(image_path[0])
    name = os.path.splitext(short_path)[0]

    webpage.add_header(name)
    ims, txts, links = [], [], []

    for label, im_data in visuals.items():
        im = util.tensor2im(im_data)
        image_name = '%s/%s.png' % (label, name)
        os.makedirs(os.path.join(image_dir, label), exist_ok=True)
        save_path = os.path.join(image_dir, image_name)
        util.save_image(im, save_path, aspect_ratio=aspect_ratio)
        ims.append(image_name)
        txts.append(label)
        links.append(image_name)
    webpage.add_images(ims, txts, links, width=width)


class Visualizer():
    """This class includes several functions that can display/save images and print/save logging information.

    It uses tensorboard for display, and a Python library 'dominate' (wrapped
    in 'HTML') for creating HTML files with images.
    """

    def __init__(self, opt):
        """Initialize the Visualizer class

        Parameters:
            opt -- stores all the experiment flags; needs to be a subclass of BaseOptions
        Step 1: Cache the training/test options
        Step 2: create a tensorboard writer
        Step 3: create an HTML object for saving HTML filters
        Step 4: create a logging file to store training losses
        """
        self.opt = opt  # cache the option
        self.use_html = opt.isTrain and not opt.no_html
        self.writer = SummaryWriter(os.path.join(opt.checkpoints_dir, 'logs', opt.name))
        self.win_size = opt.display_winsize
        self.name = opt.name
        self.saved = False
        if self.use_html:  # create an HTML object at <checkpoints_dir>/web/; images under <.../web>/images/
            self.web_dir = os.path.join(opt.checkpoints_dir, opt.name, 'web')
            self.img_dir = os.path.join(self.web_dir, 'images')
            print('create web directory %s...' % self.web_dir)
            util.mkdirs([self.web_dir, self.img_dir])
        # create a logging file to store training losses
        self.log_name = os.path.join(opt.checkpoints_dir, opt.name, 'loss_log.txt')
        with open(self.log_name, "a") as log_file:
            now = time.strftime("%c")
            log_file.write('================ Training Loss (%s) ================\n' % now)

    def reset(self):
        """Reset the self.saved status"""
        self.saved = False

    def display_current_results(self, visuals, total_iters, epoch, save_result):
        """Display current results on tensorboard; save current results to an HTML file.

        Parameters:
            visuals (OrderedDict) - - dictionary of images to display or save
            total_iters (int)     -- total iterations
            epoch (int)           - - the current epoch
            save_result (bool)    - - if save the current results to an HTML file
        """
        for label, image in visuals.items():
            self.writer.add_image(label, util.tensor2im(image), total_iters, dataformats='HWC')

        if self.use_html and (save_result or not self.saved):  # save images to an HTML file if they haven't been saved.
            self.saved = True
            # save images to the disk
            for label, image in visuals.items():
                image_numpy = util.tensor2im(image)
                img_path = os.path.join(self.img_dir, 'epoch%.3d_%s.png' % (epoch, label))
                util.save_image(image_numpy, img_path)

            # update website: each epoch entry links the images saved above
            webpage = html.HTML(self.web_dir, 'Experiment name = %s' % self.name, refresh=0)
            for n in range(epoch, 0, -1):
                webpage.add_header('epoch [%d]' % n)
                ims, txts, links = [], [], []

                # BUGFIX: the original recomputed `util.tensor2im(image)` here
                # from the stale `image` variable of the loop above and never
                # used the result -- dead, misleading work; only paths matter
                for label in visuals:
                    img_path = 'epoch%.3d_%s.png' % (n, label)
                    ims.append(img_path)
                    txts.append(label)
                    links.append(img_path)
                webpage.add_images(ims, txts, links, width=self.win_size)
            webpage.save()

    def plot_current_losses(self, total_iters, losses):
        """Write each scalar loss to tensorboard at the given iteration."""
        for name, value in losses.items():
            self.writer.add_scalar(name, value, total_iters)

    # losses: same format as |losses| of plot_current_losses
    def print_current_losses(self, epoch, iters, losses, t_comp, t_data):
        """print current losses on console; also save the losses to the disk

        Parameters:
            epoch (int)           -- current epoch
            iters (int)           -- current training iteration during this epoch (reset to 0 at the end of every epoch)
            losses (OrderedDict)  -- training losses stored in the format of (name, float) pairs
            t_comp (float)        -- computational time per data point (normalized by batch_size)
            t_data (float)        -- data loading time per data point (normalized by batch_size)
        """
        message = '(epoch: %d, iters: %d, time: %.3f, data: %.3f) ' % (epoch, iters, t_comp, t_data)
        for k, v in losses.items():
            message += '%s: %.3f ' % (k, v)

        print(message)  # print the message
        with open(self.log_name, "a") as log_file:
            log_file.write('%s\n' % message)  # save the message
+ + Parameters: + visuals (OrderedDict) - - dictionary of images to display or save + total_iters (int) -- total iterations + epoch (int) - - the current epoch + dataset (str) - - 'train' or 'val' or 'test' + """ + # if (not add_image) and (not save_results): return + + for label, image in visuals.items(): + for i in range(image.shape[0]): + image_numpy = util.tensor2im(image[i]) + if add_image: + self.writer.add_image(label + '%s_%02d'%(dataset, i + count), + image_numpy, total_iters, dataformats='HWC') + + if save_results: + save_path = os.path.join(self.img_dir, dataset, 'epoch_%s_%06d'%(epoch, total_iters)) + if not os.path.isdir(save_path): + os.makedirs(save_path) + + if name is not None: + img_path = os.path.join(save_path, '%s.png' % name) + else: + img_path = os.path.join(save_path, '%s_%03d.png' % (label, i + count)) + util.save_image(image_numpy, img_path) + + + def plot_current_losses(self, total_iters, losses, dataset='train'): + for name, value in losses.items(): + self.writer.add_scalar(name + '/%s'%dataset, value, total_iters) + + # losses: same format as |losses| of plot_current_losses + def print_current_losses(self, epoch, iters, losses, t_comp, t_data, dataset='train'): + """print current losses on console; also save the losses to the disk + + Parameters: + epoch (int) -- current epoch + iters (int) -- current training iteration during this epoch (reset to 0 at the end of every epoch) + losses (OrderedDict) -- training losses stored in the format of (name, float) pairs + t_comp (float) -- computational time per data point (normalized by batch_size) + t_data (float) -- data loading time per data point (normalized by batch_size) + """ + message = '(dataset: %s, epoch: %d, iters: %d, time: %.3f, data: %.3f) ' % ( + dataset, epoch, iters, t_comp, t_data) + for k, v in losses.items(): + message += '%s: %.3f ' % (k, v) + + print(message) # print the message + with open(self.log_name, "a") as log_file: + log_file.write('%s\n' % message) # save the 
message diff --git a/dreamtalk/MetaPortrait/README.md b/dreamtalk/MetaPortrait/README.md new file mode 100644 index 00000000..ffd658df --- /dev/null +++ b/dreamtalk/MetaPortrait/README.md @@ -0,0 +1,52 @@ +### MetaPortrait: + +|||| +|:--:|:--:|:--:| +| **[论文网址](https://arxiv.org/abs/2212.08062)** | **[项目网页](https://meta-portrait.github.io/)** | **[Github](https://github.com/Meta-Portrait/MetaPortrait/tree/v0.0.1)** | + +### 部署项目 + +部署可以按照 Github 的说明进行 ( v0.0.1 ),且此模块并非必须项目,因此在此处不再提供部署说明。 + +如果出现问题可以考虑通过 conda 使用文件夹下的 `environment.yml` 进行环境配置,`environment.yml` 由本机完成部署的 `WSL ( Ubuntu-20.4 )` 中直接导出。 + +> 使用其超分辨率模型,在本机 WSL 上推理时间较长, 原作者使用 8 张 A100 进行推理训练,他说推理的很快啊,笑 ^_^ + +```shell +python -m torch.distributed.run --nproc_per_node=1 --master_port=4321 Experimental_root/test.py -opt options/test/same_id.yml --launcher pytorch +``` + +#### 效果展示 + +只在其提供的 demo 上进行了推理测试,为了便于评估最后没有使用 ( 用于评估的视频有的分辨率小于 512*512 ) + +**单帧修复** + +|原始低分辨率图像|生成图像|原始高分辨率图像| +|:--:|:--:|:--:| +|![](./demo/lq.png)|![](./demo/gq.png)|![](./demo/gt.png)| + +**视频修复** + +
+ +
+ + +效果不错,但是感觉时间不是很够,而且不适合评估(论文中有提到说使用超分辨模型提高分辨率会造成一定程度的面部运动的准确性降低) + +### 另一个解决分辨率的方法 CodeFormer + +|||| +|:--:|:--:|:--:| +| **[论文地址](https://arxiv.org/abs/2206.11253)** | **[项目网页](https://shangchenzhou.com/projects/CodeFormer/)** | **[Github](https://github.com/sczhou/CodeFormer)** | + +dreamtalk 作者说 CodeFormer 更慢而且存在时间一致性问题,故不采用(面部波动很大,在服务器上试了试,面部很容易变形或者说一眼 AI) + +> 而且据说 A100 GPU 上每秒只能处理一帧,相当耗时 + +#### 效果展示 + +
+ +
diff --git a/dreamtalk/MetaPortrait/demo/CF.mp4 b/dreamtalk/MetaPortrait/demo/CF.mp4 new file mode 100644 index 00000000..239f1adc Binary files /dev/null and b/dreamtalk/MetaPortrait/demo/CF.mp4 differ diff --git a/dreamtalk/MetaPortrait/demo/MP.mp4 b/dreamtalk/MetaPortrait/demo/MP.mp4 new file mode 100644 index 00000000..ec46c837 Binary files /dev/null and b/dreamtalk/MetaPortrait/demo/MP.mp4 differ diff --git a/dreamtalk/MetaPortrait/demo/gq.png b/dreamtalk/MetaPortrait/demo/gq.png new file mode 100644 index 00000000..8693376e Binary files /dev/null and b/dreamtalk/MetaPortrait/demo/gq.png differ diff --git a/dreamtalk/MetaPortrait/demo/gt.png b/dreamtalk/MetaPortrait/demo/gt.png new file mode 100644 index 00000000..26be3338 Binary files /dev/null and b/dreamtalk/MetaPortrait/demo/gt.png differ diff --git a/dreamtalk/MetaPortrait/demo/lq.png b/dreamtalk/MetaPortrait/demo/lq.png new file mode 100644 index 00000000..6cea4e2b Binary files /dev/null and b/dreamtalk/MetaPortrait/demo/lq.png differ diff --git a/dreamtalk/MetaPortrait/environment.yml b/dreamtalk/MetaPortrait/environment.yml new file mode 100644 index 00000000..8c8d60b4 --- /dev/null +++ b/dreamtalk/MetaPortrait/environment.yml @@ -0,0 +1,161 @@ +name: MP +channels: + - pytorch + - nvidia + - defaults + - conda-forge + - https://repo.anaconda.com/pkgs/main + - https://repo.anaconda.com/pkgs/r +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - blas=1.0=mkl + - brotli-python=1.0.9=py39h6a678d5_8 + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2024.11.26=h06a4308_0 + - certifi=2024.8.30=py39h06a4308_0 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - cuda-cudart=12.4.127=0 + - cuda-cupti=12.4.127=0 + - cuda-libraries=12.4.1=0 + - cuda-nvrtc=12.4.127=0 + - cuda-nvtx=12.4.127=0 + - cuda-opencl=12.6.77=0 + - cuda-runtime=12.4.1=0 + - cuda-version=12.6=3 + - ffmpeg=4.3=hf484d3e_0 + - filelock=3.13.1=py39h06a4308_0 + - freetype=2.12.1=h4a9f257_0 + - giflib=5.2.2=h5eee18b_0 + - 
gmp=6.2.1=h295c915_3 + - gmpy2=2.1.2=py39heeb90bb_0 + - gnutls=3.6.15=he1e5248_0 + - idna=3.7=py39h06a4308_0 + - intel-openmp=2023.1.0=hdb19cb5_46306 + - jinja2=3.1.4=py39h06a4308_1 + - jpeg=9e=h5eee18b_3 + - lame=3.100=h7b6447c_0 + - lcms2=2.16=hb9589c4_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - lerc=4.0.0=h6a678d5_0 + - libcublas=12.4.5.8=0 + - libcufft=11.2.1.3=0 + - libcufile=1.11.1.6=0 + - libcurand=10.3.7.77=0 + - libcusolver=11.6.1.9=0 + - libcusparse=12.3.1.170=0 + - libdeflate=1.22=h5eee18b_0 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h5eee18b_3 + - libidn2=2.3.4=h5eee18b_0 + - libjpeg-turbo=2.0.0=h9bf148f_0 + - libnpp=12.2.5.30=0 + - libnvfatbin=12.6.77=0 + - libnvjitlink=12.4.127=0 + - libnvjpeg=12.3.1.117=0 + - libpng=1.6.39=h5eee18b_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtasn1=4.19.0=h5eee18b_0 + - libtiff=4.5.1=hffd6297_1 + - libunistring=0.9.10=h27cfd23_0 + - libwebp=1.3.2=h11a3e52_0 + - libwebp-base=1.3.2=h5eee18b_1 + - llvm-openmp=14.0.6=h9e868ea_0 + - lz4-c=1.9.4=h6a678d5_1 + - markupsafe=2.1.3=py39h5eee18b_0 + - mkl=2023.1.0=h213fc3f_46344 + - mkl-service=2.4.0=py39h5eee18b_1 + - mkl_fft=1.3.11=py39h5eee18b_0 + - mkl_random=1.2.8=py39h1128e8f_0 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpmath=1.3.0=py39h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - nettle=3.7.3=hbbd107a_1 + - networkx=3.2.1=py39h06a4308_0 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.5.2=he7f1fd0_0 + - openssl=3.0.15=h5eee18b_0 + - pillow=11.0.0=py39hcea889d_1 + - pip=24.2=py39h06a4308_0 + - pysocks=1.7.1=py39h06a4308_0 + - python=3.9.21=he870216_1 + - pytorch=2.5.1=py3.9_cuda12.4_cudnn9.1.0_0 + - pytorch-cuda=12.4=hc786d27_7 + - pytorch-mutex=1.0=cuda + - pyyaml=6.0.2=py39h5eee18b_0 + - readline=8.2=h5eee18b_0 + - requests=2.32.3=py39h06a4308_1 + - setuptools=75.1.0=py39h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tbb=2021.8.0=hdb19cb5_0 + - tk=8.6.14=h39e8969_0 + - torchaudio=2.5.1=py39_cu124 + - 
torchtriton=3.1.0=py39 + - torchvision=0.20.1=py39_cu124 + - typing_extensions=4.11.0=py39h06a4308_0 + - tzdata=2024b=h04d1e81_0 + - urllib3=2.2.3=py39h06a4308_0 + - wheel=0.44.0=py39h06a4308_0 + - xz=5.4.6=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.6=hc292b87_0 + - pip: + - absl-py==2.1.0 + - addict==2.4.0 + - contourpy==1.2.1 + - cycler==0.12.1 + - decorator==4.4.2 + - easydict==1.13 + - einops==0.8.0 + - facexlib==0.3.0 + - filterpy==1.4.5 + - fonttools==4.55.3 + - fsspec==2024.10.0 + - future==1.0.0 + - gfpgan==1.3.8 + - grpcio==1.68.1 + - huggingface-hub==0.27.0 + - imageio==2.36.1 + - imageio-ffmpeg==0.5.1 + - importlib-metadata==8.5.0 + - joblib==1.4.2 + - kiwisolver==1.4.7 + - lazy-loader==0.4 + - llvmlite==0.39.1 + - lmdb==1.5.1 + - lpips==0.1.4 + - markdown==3.7 + - matplotlib==3.7.5 + - moviepy==1.0.3 + - numba==0.56.4 + - numpy==1.20.3 + - opencv-python==4.10.0.84 + - packaging==24.2 + - platformdirs==4.3.6 + - proglog==0.1.10 + - protobuf==5.29.1 + - pyparsing==3.2.0 + - python-dateutil==2.9.0.post0 + - pywavelets==1.4.1 + - realesrgan==0.3.0 + - safetensors==0.4.5 + - scikit-image==0.19.3 + - scikit-learn==1.6.0 + - scipy==1.10.1 + - six==1.17.0 + - sympy==1.13.1 + - tb-nightly==2.19.0a20241217 + - tensorboard-data-server==0.7.2 + - threadpoolctl==3.5.0 + - tifffile==2024.8.30 + - timm==1.0.12 + - tomli==2.2.1 + - tqdm==4.67.1 + - werkzeug==3.1.3 + - yapf==0.43.0 + - zipp==3.21.0 +prefix: /root/anaconda3/envs/MP diff --git a/dreamtalk/README.md b/dreamtalk/README.md new file mode 100644 index 00000000..7be26ddc --- /dev/null +++ b/dreamtalk/README.md @@ -0,0 +1,27 @@ +## 语音识别大作业 Dreamtalk + +联系方式: + +Github: 通过 issue 即可 +邮件: wincosmo9@outlook.com + +**项目结构说明**: + ++ `/dreamtalk/`:项目源代码,其下 [README.md](./dreamtalk/README.md) 文件包含了项目部署及 `Docker` 使用说明,同时也是项目的主要指引文档 + ++ `/eval/`: 评估时用到的脚本,用于加快 `WSL` 开发,可以部署到任意项目中进行便捷操作,对环境几乎无依赖 + ++ `/Deep3DFaceRecon_pytorch/`: `3DMM` 参数的提取项目代码,是项目评估视频生成的基础 + + > 项目依赖于硬件驱动进行渲染,无法通过 
`Docker` 保证兼容性 ( 详细原因参见其 [README.md](./Deep3DFaceRecon_pytorch/README.md) 文件 ),因此只提供了 `WSL` ( 理论上 `Ubuntu` 等效 ) 的环境配置及使用方法 + ++ `/MetaPortrait/`: 扩展超分辨率项目,已经完成部署,还没写配置文档和 `Docker`,用于提高视频分辨率,推理巨慢 + ++ `/syncnet_python/`: 定量评估 `LSE-C & LSE-D` 的相关代码,已封装 `Docker`,详细参考对应 [README.md](./syncnet_python/README.md) + +### 文件下载 + +> 后来者提示:使用 google 云盘分享需要改权限哦,要不然需要审批每个请求很烦的 + +你可以通过这个[链接](https://drive.google.com/drive/folders/1FHesuin8l2HyQHf1vQT2CGCsZ4gejSuR?usp=drive_link)下载项目内给出需要下载的全部资源 + diff --git a/dreamtalk/dreamtalk/.dockerfile b/dreamtalk/dreamtalk/.dockerfile new file mode 100644 index 00000000..01edac70 --- /dev/null +++ b/dreamtalk/dreamtalk/.dockerfile @@ -0,0 +1,36 @@ +FROM continuumio/anaconda3:latest + +RUN mkdir dreamtalk + +# 设置工作目录 +WORKDIR /dreamtalk + +# 复制需要的文件 +COPY environment.yml /dreamtalk/ + +RUN apt-get update + +RUN apt-get install build-essential libgl1 dialog libssl-dev g++ cmake -y + +RUN conda env create -f environment.yml + +SHELL ["/bin/bash", "-c"] + +RUN conda init bash && \ + source ~/.bashrc && \ + conda activate dt && \ + pip install torchaudio --index-url https://download.pytorch.org/whl/cu121 && \ + pip install sox + + +COPY inference_for_demo_video.py /dreamtalk/ +COPY generators /dreamtalk/generators +COPY core /dreamtalk/core +COPY configs /dreamtalk/configs +COPY media /dreamtalk/media + + +# 设置默认命令 +ENTRYPOINT ["conda", "run", "-n", "dt", "python", "inference_for_demo_video.py"] + + diff --git a/dreamtalk/dreamtalk/README.md b/dreamtalk/dreamtalk/README.md new file mode 100644 index 00000000..0f2fdd96 --- /dev/null +++ b/dreamtalk/dreamtalk/README.md @@ -0,0 +1,162 @@ +### 部署到本地 + +```shell +git clone https://github.com/Academic-Hammer/talkingface-kit/dreamtalk.git +cd dreamtalk +mkdir output_video +mkdir jonatasgrosman/wav2vec2-large-xlsr-53-english +``` + +### 下载检查点与模型 + +在构筑 docker 前需要下载[检查点文件](https://drive.google.com/drive/folders/1MUrhcxLSLwcv76QSgtT-aV9c24DMBxiP)到 `/dreamtalk/checkpoints/`, 文件目录格式如下: + +📦checkpoints + ┣ 
📜denoising_network.pth + ┗ 📜renderer.pt + +同时您还需要下载用于英语语音识别的微调 [XLSR-53 模型](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english/tree/main)到 `/jonatasgrosman/` 目录下,必要的文件目录如下: + +> 不下全会在加载模型时产生警告信息(参数加载的警告,有剩余的参数好像是,不影响使用),据说将链接内的全下下来可以解决,未经测试 + +📦jonatasgrosman + ┗ 📂wav2vec2-large-xlsr-53-english + ┃ ┣ 📜config.json + ┃ ┣ 📜preprocessor_config.json + ┃ ┣ 📜pytorch_model.bin + ┃ ┗ 📜vocab.json + +#### 下载镜像 + +> 不建议,镜像文件很大,建议本地构筑,其实我传了一个早期的版本在 dockerhub 上,不知道搜 dreamtalk 能不能找到,不过好像是私有的 + +好像我构筑的有亿点点大,虽然是两位数,但是单位是 GB hh ( 上传好浪费时间,我的建议是自己构筑,如果一定需要可以通过 GitHub 或者邮箱: wincomso9@outlook.com 联系我 ) + +下载好后放到 `dreamtalk` 目录下,然后执行以下命令 + +> 可以 `docker images` 看加载是否成功 + +```bash + docker load -i dreamtalk.tar +``` + +### demo Docker + +#### 构筑 Docker + +``` +docker build -f .dockerfile -t dreamtalk:v1 . +``` + +#### 运行 demo + +```bash +docker run --gpus all ` + -v ${PWD}/data:/dreamtalk/data ` + -v ${PWD}/output_video:/dreamtalk/output_video ` + -v ${PWD}/checkpoints:/dreamtalk/checkpoints ` + -v ${PWD}/jonatasgrosman:/dreamtalk/jonatasgrosman ` + -v ${PWD}/tmp:/dreamtalk/tmp ` + dreamtalk:v1 ` + --wav_path /dreamtalk/data/audio/acknowledgement_chinese.m4a ` + --style_clip_path /dreamtalk/data/style_clip/3DMM/M030_front_neutral_level1_001.mat ` + --pose_path /dreamtalk/data/pose/RichardShelby_front_neutral_level1_001.mat ` + --image_path /dreamtalk/data/src_img/uncropped/male_face.png ` + --cfg_scale 1.0 ` + --max_gen_len 30 ` + --output_name demo +``` + +也即 + +```bash +docker run --gpus all -v ${PWD}/data:/dreamtalk/data -v ${PWD}/output_video:/dreamtalk/output_video -v ${PWD}/checkpoints:/dreamtalk/checkpoints -v ${PWD}/jonatasgrosman:/dreamtalk/jonatasgrosman -v ${PWD}/tmp:/dreamtalk/tmp dreamtalk:v1 --wav_path /dreamtalk/data/audio/acknowledgement_chinese.m4a --style_clip_path /dreamtalk/data/style_clip/3DMM/M030_front_neutral_level1_001.mat --pose_path /dreamtalk/data/pose/RichardShelby_front_neutral_level1_001.mat --image_path 
/dreamtalk/data/src_img/uncropped/male_face.png --cfg_scale 1.0 --max_gen_len 30 --output_name demo +``` + +#### 参数说明 + ++ `--wav_path` + + 输入音频 ( 视频也可, 会从其中提取音频, 兼容格式: m4a, wav, mp4 等 ) + ++ `--style_clip_path` + + 面部运动参考 + ++ `--pose_path` + + 头部姿势参考 + ++ `--image_path` + + 输入图像 + ++ `--cfg_scale` + + 风格的强度参数 + ++ `--max_gen_len` + + 生成视频的最大长度,单位是秒 + ++ `--output_name demo` + + 输出文件名 + +### 从评估视频中提取 3DMM 参数 + +#### 环境配置 + +需要与底层交互,环境依赖于 CUDA 驱动 ( 而不是 pytorch 安装的 cuda-tookit ), docker 环境也需要参考 linux 或 WSL 安装的驱动 ( WSL 调用的实际是 windows 的显卡驱动 ) + +> 如果提示 pytorch 支持的算力和当前显卡不匹配需要升级 pytorch: + +```bash +conda create -n DP python=3.9 +conda activate DP +conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia +# 不要按照论文项目网站使用对应的代码,存在版本问题 +pip install face_alignment +pip install ffmpeg-python +pip install kornia +pip install trimesh +pip install Ninja + +# nvdiffrast 经过个人修改,现在在 WSL 支持使用 CUDA,而不是 OpenGL 进行渲染( +# WSL 上 OpenGL 存在版本限制,无法满足 nvdiffrast 的版本需求,通过微软的强制指令升级后也会与 CUDA 驱动冲突 +# 而项目只要检测到 OpenGL 就会使用 OpenGL 而不是 CUDA +# 因此我修改了 nvdiffrast 的源码,将其改为仅使用 CUDA 进行渲染 +cd nvdiffrast +pip install . 
+``` + +按照[教程](https://blog.csdn.net/Sihang_Xie/article/details/127347139)安装 cuda 官网给的安装包后添加环境变量 + +> 很详细的博客,WSL 照着做就是了,云服务器需要选择 cuda=12.4 的预装驱动,要不还要自己升级很麻烦 + +#### 提取 3DMM 参数 + +将视频放入对应文件夹运行即可,我在 16G/i9/4060 下使用的参数如下,在 `face_recon_videos.py` 可以进行第二段命令的参数调整: + +> 需要较多的内存,WSL 环境需要修改 .wslconfig 文件中的限制,评估视频的使用的最多内存大概在 26G 左右 + +```bash +python extract_kp_videos.py --input_dir data/input --output_dir data/keypoint --device_ids 0 --workers 6 +python face_recon_videos.py --input_dir data/input --keypoint_dir data/keypoint --output_dir data/output --inference_batch_size 200 --name=test --epoch=20 --model facerecon +``` + +### 生成评估视频 + +将上面生成的 mat 文件放入 dreamtalk/data/eval/pose/ 下即可,在推理时修改参数即可 + +需要注意的是,dreamtalk 的推理几乎无法在本地运行( 单卡推理且未分段,直接推理长视频 ),最短的视频使用显存也超过了 24G ( 还是 32G ),我在 A800 ( 80G ) 上完成了推理,但是其中 Macron 的视频需要的显存在 95 G 左右,因此我将其分段后重新提取了两段视频的 3DMM 参数,推理后再将其拼接成一个视频。 + +> 分段推理在理论上是可行的,但是这样在提取 3DMM 的时候就需要进行分段 +> ( 或许有时间看看3DMM 的论文看看 mat 里面的数据结构就不需要在提取时分段,可以在推理时处理数据,但是来不及了hh ) + + +### 模型评估 + +具体参见 `/syncnet_python/` 和 `eval/` + +此处不加描述 \ No newline at end of file diff --git a/dreamtalk/dreamtalk/config/default.py b/dreamtalk/dreamtalk/config/default.py new file mode 100644 index 00000000..46c3063d --- /dev/null +++ b/dreamtalk/dreamtalk/config/default.py @@ -0,0 +1,91 @@ +from yacs.config import CfgNode as CN + + +_C = CN() +_C.TAG = "style_id_emotion" +_C.DECODER_TYPE = "DisentangleDecoder" +_C.CONTENT_ENCODER_TYPE = "ContentW2VEncoder" +_C.STYLE_ENCODER_TYPE = "StyleEncoder" + +_C.DIFFNET_TYPE = "DiffusionNet" + +_C.WIN_SIZE = 5 +_C.D_MODEL = 256 + +_C.DATASET = CN() +_C.DATASET.FACE3D_DIM = 64 +_C.DATASET.NUM_FRAMES = 64 +_C.DATASET.STYLE_MAX_LEN = 256 + +_C.TRAIN = CN() +_C.TRAIN.FACE3D_LATENT = CN() +_C.TRAIN.FACE3D_LATENT.TYPE = "face3d" + +_C.DIFFUSION = CN() +_C.DIFFUSION.PREDICT_WHAT = "x0" # noise | x0 +_C.DIFFUSION.SCHEDULE = CN() +_C.DIFFUSION.SCHEDULE.NUM_STEPS = 1000 +_C.DIFFUSION.SCHEDULE.BETA_1 = 1e-4 +_C.DIFFUSION.SCHEDULE.BETA_T = 0.02 
+_C.DIFFUSION.SCHEDULE.MODE = "linear" + +_C.CONTENT_ENCODER = CN() +_C.CONTENT_ENCODER.d_model = _C.D_MODEL +_C.CONTENT_ENCODER.nhead = 8 +_C.CONTENT_ENCODER.num_encoder_layers = 3 +_C.CONTENT_ENCODER.dim_feedforward = 4 * _C.D_MODEL +_C.CONTENT_ENCODER.dropout = 0.1 +_C.CONTENT_ENCODER.activation = "relu" +_C.CONTENT_ENCODER.normalize_before = False +_C.CONTENT_ENCODER.pos_embed_len = 2 * _C.WIN_SIZE + 1 + +_C.STYLE_ENCODER = CN() +_C.STYLE_ENCODER.d_model = _C.D_MODEL +_C.STYLE_ENCODER.nhead = 8 +_C.STYLE_ENCODER.num_encoder_layers = 3 +_C.STYLE_ENCODER.dim_feedforward = 4 * _C.D_MODEL +_C.STYLE_ENCODER.dropout = 0.1 +_C.STYLE_ENCODER.activation = "relu" +_C.STYLE_ENCODER.normalize_before = False +_C.STYLE_ENCODER.pos_embed_len = _C.DATASET.STYLE_MAX_LEN +_C.STYLE_ENCODER.aggregate_method = ( + "self_attention_pooling" # average | self_attention_pooling +) +# _C.STYLE_ENCODER.input_dim = _C.DATASET.FACE3D_DIM + +_C.DECODER = CN() +_C.DECODER.d_model = _C.D_MODEL +_C.DECODER.nhead = 8 +_C.DECODER.num_decoder_layers = 3 +_C.DECODER.dim_feedforward = 4 * _C.D_MODEL +_C.DECODER.dropout = 0.1 +_C.DECODER.activation = "relu" +_C.DECODER.normalize_before = False +_C.DECODER.return_intermediate_dec = False +_C.DECODER.pos_embed_len = 2 * _C.WIN_SIZE + 1 +_C.DECODER.network_type = "TransformerDecoder" +_C.DECODER.dynamic_K = None +_C.DECODER.dynamic_ratio = None +# _C.DECODER.output_dim = _C.DATASET.FACE3D_DIM +# LSFM basis: +# _C.DECODER.upper_face3d_indices = tuple(list(range(19)) + list(range(46, 51))) +# _C.DECODER.lower_face3d_indices = tuple(range(19, 46)) +# BFM basis: +# fmt: off +_C.DECODER.upper_face3d_indices = [6, 8, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63] +# fmt: on +_C.DECODER.lower_face3d_indices = [0, 1, 2, 3, 4, 5, 7, 9, 10, 11, 12, 13, 14] + +_C.CF_GUIDANCE = CN() +_C.CF_GUIDANCE.TRAINING = True 
+_C.CF_GUIDANCE.INFERENCE = True +_C.CF_GUIDANCE.NULL_PROB = 0.1 +_C.CF_GUIDANCE.SCALE = 1.0 + +_C.INFERENCE = CN() +_C.INFERENCE.CHECKPOINT = "checkpoints/denoising_network.pth" + + +def get_cfg_defaults(): + """Get a yacs CfgNode object with default values for my_project.""" + return _C.clone() diff --git a/dreamtalk/dreamtalk/core/networks/__init__.py b/dreamtalk/dreamtalk/core/networks/__init__.py new file mode 100644 index 00000000..e75bb644 --- /dev/null +++ b/dreamtalk/dreamtalk/core/networks/__init__.py @@ -0,0 +1,14 @@ +from core.networks.generator import ( + StyleEncoder, + Decoder, + ContentW2VEncoder, +) +from core.networks.disentangle_decoder import DisentangleDecoder + + +def get_network(name: str): + obj = globals().get(name) + if obj is None: + raise KeyError("Unknown Network: %s" % name) + else: + return obj diff --git a/dreamtalk/dreamtalk/core/networks/diffusion_net.py b/dreamtalk/dreamtalk/core/networks/diffusion_net.py new file mode 100644 index 00000000..e8401000 --- /dev/null +++ b/dreamtalk/dreamtalk/core/networks/diffusion_net.py @@ -0,0 +1,353 @@ +import math +import torch +import torch.nn.functional as F +from torch.nn import Module +from core.networks.diffusion_util import VarianceSchedule +import numpy as np + + +def face3d_raw_to_norm(face3d_raw, exp_min, exp_max): + """ + + Args: + face3d_raw (_type_): (B, L, C_face3d) + exp_min (_type_): (C_face3d) + exp_max (_type_): (C_face3d) + + Returns: + _type_: (B, L, C_face3d) in [-1, 1] + """ + exp_min_expand = exp_min[None, None, :] + exp_max_expand = exp_max[None, None, :] + face3d_norm_01 = (face3d_raw - exp_min_expand) / \ + (exp_max_expand - exp_min_expand) + face3d_norm = face3d_norm_01 * 2 - 1 + return face3d_norm + + +def face3d_norm_to_raw(face3d_norm, exp_min, exp_max): + """ + + Args: + face3d_norm (_type_): (B, L, C_face3d) + exp_min (_type_): (C_face3d) + exp_max (_type_): (C_face3d) + + Returns: + _type_: (B, L, C_face3d) + """ + exp_min_expand = exp_min[None, None, :] + 
exp_max_expand = exp_max[None, None, :] + face3d_norm_01 = (face3d_norm + 1) / 2 + face3d_raw = face3d_norm_01 * \ + (exp_max_expand - exp_min_expand) + exp_min_expand + return face3d_raw + + +class DiffusionNet(Module): + def __init__(self, cfg, net, var_sched: VarianceSchedule): + super().__init__() + self.cfg = cfg + self.net = net + self.var_sched = var_sched + self.face3d_latent_type = self.cfg.TRAIN.FACE3D_LATENT.TYPE + self.predict_what = self.cfg.DIFFUSION.PREDICT_WHAT + + if self.cfg.CF_GUIDANCE.TRAINING: + null_style_clip = torch.zeros( + self.cfg.DATASET.STYLE_MAX_LEN, self.cfg.DATASET.FACE3D_DIM + ) + self.register_buffer("null_style_clip", null_style_clip) + + null_pad_mask = torch.tensor( + [False] * self.cfg.DATASET.STYLE_MAX_LEN) + self.register_buffer("null_pad_mask", null_pad_mask) + + def _face3d_to_latent(self, face3d): + latent = None + if self.face3d_latent_type == "face3d": + latent = face3d + elif self.face3d_latent_type == "normalized_face3d": + latent = face3d_raw_to_norm( + face3d, exp_min=self.exp_min, exp_max=self.exp_max + ) + else: + raise ValueError( + f"Invalid face3d latent type: {self.face3d_latent_type}") + return latent + + def _latent_to_face3d(self, latent): + face3d = None + if self.face3d_latent_type == "face3d": + face3d = latent + elif self.face3d_latent_type == "normalized_face3d": + latent = torch.clamp(latent, min=-1, max=1) + face3d = face3d_norm_to_raw( + latent, exp_min=self.exp_min, exp_max=self.exp_max + ) + else: + raise ValueError( + f"Invalid face3d latent type: {self.face3d_latent_type}") + return face3d + + def ddim_sample( + self, + audio, + style_clip, + style_pad_mask, + output_dim, + flexibility=0.0, + ret_traj=False, + use_cf_guidance=False, + cfg_scale=2.0, + ddim_num_step=50, + ready_style_code=None, + ): + """ + + Args: + audio (_type_): (B, L, W) or (B, L, W, C) + style_clip (_type_): (B, L_clipmax, C_face3d) + style_pad_mask : (B, L_clipmax) + pose_dim (_type_): int + flexibility (float, optional): 
_description_. Defaults to 0.0. + ret_traj (bool, optional): _description_. Defaults to False. + + + Returns: + _type_: (B, L, C_face) + """ + if self.predict_what != "x0": + raise NotImplementedError(self.predict_what) + + if ready_style_code is not None and use_cf_guidance: + raise NotImplementedError("not implement cfg for ready style code") + + c = self.var_sched.num_steps // ddim_num_step + time_steps = torch.tensor( + np.asarray(list(range(0, self.var_sched.num_steps, c))) + 1 + ) + assert len(time_steps) == ddim_num_step + prev_time_steps = torch.cat((torch.tensor([0]), time_steps[:-1])) + + batch_size, output_len = audio.shape[:2] + # batch_size = context.size(0) + context = { + "audio": audio, + "style_clip": style_clip, + "style_pad_mask": style_pad_mask, + "ready_style_code": ready_style_code, + } + if use_cf_guidance: + uncond_style_clip = self.null_style_clip.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + uncond_pad_mask = self.null_pad_mask.unsqueeze( + 0).repeat(batch_size, 1) + + context_double = { + "audio": torch.cat([audio] * 2, dim=0), + "style_clip": torch.cat([style_clip, uncond_style_clip], dim=0), + "style_pad_mask": torch.cat([style_pad_mask, uncond_pad_mask], dim=0), + "ready_style_code": None + if ready_style_code is None + else torch.cat( + [ + ready_style_code, + self.net.style_encoder( + uncond_style_clip, uncond_pad_mask), + ], + dim=0, + ), + } + + x_t = torch.randn( + [batch_size, output_len, output_dim]).to(audio.device) + + for idx in list(range(ddim_num_step))[::-1]: + t = time_steps[idx] + t_prev = prev_time_steps[idx] + ddim_alpha = self.var_sched.alpha_bars[t] + ddim_alpha_prev = self.var_sched.alpha_bars[t_prev] + + t_tensor = torch.tensor([t] * batch_size).to(audio.device).float() + if use_cf_guidance: + x_t_double = torch.cat([x_t] * 2, dim=0) + t_tensor_double = torch.cat([t_tensor] * 2, dim=0) + cond_output, uncond_output = self.net( + x_t_double, t=t_tensor_double, **context_double + ).chunk(2) + diff_output = 
uncond_output + cfg_scale * \ + (cond_output - uncond_output) + else: + diff_output = self.net(x_t, t=t_tensor, **context) + + pred_x0 = diff_output + eps = (x_t - torch.sqrt(ddim_alpha) * pred_x0) / \ + torch.sqrt(1 - ddim_alpha) + c1 = torch.sqrt(ddim_alpha_prev) + c2 = torch.sqrt(1 - ddim_alpha_prev) + + x_t = c1 * pred_x0 + c2 * eps + + latent_output = x_t + face3d_output = self._latent_to_face3d(latent_output) + return face3d_output + + def sample( + self, + audio, + style_clip, + style_pad_mask, + output_dim, + flexibility=0.0, + ret_traj=False, + use_cf_guidance=False, + cfg_scale=2.0, + sample_method="ddpm", + ddim_num_step=50, + ready_style_code=None, + ): + # sample_method = kwargs["sample_method"] + if sample_method == "ddpm": + if ready_style_code is not None: + raise NotImplementedError("ready style code in ddpm") + return self.ddpm_sample( + audio, + style_clip, + style_pad_mask, + output_dim, + flexibility=flexibility, + ret_traj=ret_traj, + use_cf_guidance=use_cf_guidance, + cfg_scale=cfg_scale, + ) + elif sample_method == "ddim": + return self.ddim_sample( + audio, + style_clip, + style_pad_mask, + output_dim, + flexibility=flexibility, + ret_traj=ret_traj, + use_cf_guidance=use_cf_guidance, + cfg_scale=cfg_scale, + ddim_num_step=ddim_num_step, + ready_style_code=ready_style_code, + ) + + def ddpm_sample( + self, + audio, + style_clip, + style_pad_mask, + output_dim, + flexibility=0.0, + ret_traj=False, + use_cf_guidance=False, + cfg_scale=2.0, + ): + """ + + Args: + audio (_type_): (B, L, W) or (B, L, W, C) + style_clip (_type_): (B, L_clipmax, C_face3d) + style_pad_mask : (B, L_clipmax) + pose_dim (_type_): int + flexibility (float, optional): _description_. Defaults to 0.0. + ret_traj (bool, optional): _description_. Defaults to False. 
+ + + Returns: + _type_: (B, L, C_face) + """ + batch_size, output_len = audio.shape[:2] + # batch_size = context.size(0) + context = { + "audio": audio, + "style_clip": style_clip, + "style_pad_mask": style_pad_mask, + } + if use_cf_guidance: + uncond_style_clip = self.null_style_clip.unsqueeze(0).repeat( + batch_size, 1, 1 + ) + uncond_pad_mask = self.null_pad_mask.unsqueeze( + 0).repeat(batch_size, 1) + context_double = { + "audio": torch.cat([audio] * 2, dim=0), + "style_clip": torch.cat([style_clip, uncond_style_clip], dim=0), + "style_pad_mask": torch.cat([style_pad_mask, uncond_pad_mask], dim=0), + } + + x_T = torch.randn( + [batch_size, output_len, output_dim]).to(audio.device) + traj = {self.var_sched.num_steps: x_T} + for t in range(self.var_sched.num_steps, 0, -1): + alpha = self.var_sched.alphas[t] + alpha_bar = self.var_sched.alpha_bars[t] + alpha_bar_prev = self.var_sched.alpha_bars[t - 1] + sigma = self.var_sched.get_sigmas(t, flexibility) + + z = torch.randn_like(x_T) if t > 1 else torch.zeros_like(x_T) + x_t = traj[t] + t_tensor = torch.tensor([t] * batch_size).to(audio.device).float() + if use_cf_guidance: + x_t_double = torch.cat([x_t] * 2, dim=0) + t_tensor_double = torch.cat([t_tensor] * 2, dim=0) + cond_output, uncond_output = self.net( + x_t_double, t=t_tensor_double, **context_double + ).chunk(2) + diff_output = uncond_output + cfg_scale * \ + (cond_output - uncond_output) + else: + diff_output = self.net(x_t, t=t_tensor, **context) + + if self.predict_what == "noise": + c0 = 1.0 / torch.sqrt(alpha) + c1 = (1 - alpha) / torch.sqrt(1 - alpha_bar) + x_next = c0 * (x_t - c1 * diff_output) + sigma * z + elif self.predict_what == "x0": + d0 = torch.sqrt(alpha) * (1 - alpha_bar_prev) / (1 - alpha_bar) + d1 = torch.sqrt(alpha_bar_prev) * (1 - alpha) / (1 - alpha_bar) + x_next = d0 * x_t + d1 * diff_output + sigma * z + traj[t - 1] = x_next.detach() + traj[t] = traj[t].cpu() + if not ret_traj: + del traj[t] + + if ret_traj: + raise 
NotImplementedError + return traj + else: + latent_output = traj[0] + face3d_output = self._latent_to_face3d(latent_output) + return face3d_output + + +if __name__ == "__main__": + from core.networks.diffusion_util import NoisePredictor, VarianceSchedule + + diffnet = DiffusionNet( + net=NoisePredictor(), + var_sched=VarianceSchedule( + num_steps=500, beta_1=1e-4, beta_T=0.02, mode="linear" + ), + ) + + import torch + + gt_face3d = torch.randn(16, 64, 64) + audio = torch.randn(16, 64, 11) + style_clip = torch.randn(16, 256, 64) + style_pad_mask = torch.ones(16, 256) + + context = { + "audio": audio, + "style_clip": style_clip, + "style_pad_mask": style_pad_mask, + } + + loss = diffnet.get_loss(gt_face3d, context) + + print("hello") diff --git a/dreamtalk/dreamtalk/core/networks/diffusion_util.py b/dreamtalk/dreamtalk/core/networks/diffusion_util.py new file mode 100644 index 00000000..b77f39f6 --- /dev/null +++ b/dreamtalk/dreamtalk/core/networks/diffusion_util.py @@ -0,0 +1,131 @@ +import numpy as np +import torch +import torch.nn as nn +from torch.nn import Module +from core.networks import get_network +from core.utils import sinusoidal_embedding + + +class VarianceSchedule(Module): + def __init__(self, num_steps, beta_1, beta_T, mode="linear"): + super().__init__() + assert mode in ("linear",) + self.num_steps = num_steps + self.beta_1 = beta_1 + self.beta_T = beta_T + self.mode = mode + + if mode == "linear": + betas = torch.linspace(beta_1, beta_T, steps=num_steps) + + betas = torch.cat([torch.zeros([1]), betas], dim=0) # Padding + + alphas = 1 - betas + log_alphas = torch.log(alphas) + for i in range(1, log_alphas.size(0)): # 1 to T + log_alphas[i] += log_alphas[i - 1] + alpha_bars = log_alphas.exp() + + sigmas_flex = torch.sqrt(betas) + sigmas_inflex = torch.zeros_like(sigmas_flex) + for i in range(1, sigmas_flex.size(0)): + sigmas_inflex[i] = ((1 - alpha_bars[i - 1]) / (1 - alpha_bars[i])) * betas[ + i + ] + sigmas_inflex = torch.sqrt(sigmas_inflex) + + 
self.register_buffer("betas", betas) + self.register_buffer("alphas", alphas) + self.register_buffer("alpha_bars", alpha_bars) + self.register_buffer("sigmas_flex", sigmas_flex) + self.register_buffer("sigmas_inflex", sigmas_inflex) + + def uniform_sample_t(self, batch_size): + ts = np.random.choice(np.arange(1, self.num_steps + 1), batch_size) + return ts.tolist() + + def get_sigmas(self, t, flexibility): + assert 0 <= flexibility and flexibility <= 1 + sigmas = self.sigmas_flex[t] * flexibility + self.sigmas_inflex[t] * ( + 1 - flexibility + ) + return sigmas + + +class NoisePredictor(nn.Module): + def __init__(self, cfg): + super().__init__() + + content_encoder_class = get_network(cfg.CONTENT_ENCODER_TYPE) + self.content_encoder = content_encoder_class(**cfg.CONTENT_ENCODER) + + style_encoder_class = get_network(cfg.STYLE_ENCODER_TYPE) + cfg.defrost() + cfg.STYLE_ENCODER.input_dim = cfg.DATASET.FACE3D_DIM + cfg.freeze() + self.style_encoder = style_encoder_class(**cfg.STYLE_ENCODER) + + decoder_class = get_network(cfg.DECODER_TYPE) + cfg.defrost() + cfg.DECODER.output_dim = cfg.DATASET.FACE3D_DIM + cfg.freeze() + self.decoder = decoder_class(**cfg.DECODER) + + self.content_xt_to_decoder_input_wo_time = nn.Sequential( + nn.Linear(cfg.D_MODEL + cfg.DATASET.FACE3D_DIM, cfg.D_MODEL), + nn.ReLU(), + nn.Linear(cfg.D_MODEL, cfg.D_MODEL), + nn.ReLU(), + nn.Linear(cfg.D_MODEL, cfg.D_MODEL), + ) + + self.time_sinusoidal_dim = cfg.D_MODEL + self.time_embed_net = nn.Sequential( + nn.Linear(cfg.D_MODEL, cfg.D_MODEL), + nn.SiLU(), + nn.Linear(cfg.D_MODEL, cfg.D_MODEL), + ) + + def forward(self, x_t, t, audio, style_clip, style_pad_mask, ready_style_code=None): + """_summary_ + + Args: + x_t (_type_): (B, L, C_face) + t (_type_): (B,) dtype:float32 + audio (_type_): (B, L, W) + style_clip (_type_): (B, L_clipmax, C_face3d) + style_pad_mask : (B, L_clipmax) + ready_style_code: (B, C_model) + Returns: + e_theta : (B, L, C_face) + """ + W = audio.shape[2] + content = 
def get_decoder_network(
    network_type,
    d_model,
    nhead,
    dim_feedforward,
    dropout,
    activation,
    normalize_before,
    num_decoder_layers,
    return_intermediate_dec,
    dynamic_K,
    dynamic_ratio,
):
    """Build the decoder stack selected by ``network_type``.

    Args:
        network_type: "TransformerDecoder", "DynamicFCDecoder" or
            "DynamicFCEncoder".
        d_model, nhead, dim_feedforward, dropout, activation,
        normalize_before: standard transformer layer hyper-parameters.
        num_decoder_layers: number of stacked layers.
        return_intermediate_dec: if True the stack returns per-layer outputs.
        dynamic_K, dynamic_ratio: extra knobs for the dynamic-FC variants
            (unused by the plain TransformerDecoder).

    Returns:
        An ``nn.Module`` decoder stack.

    Raises:
        ValueError: for an unknown ``network_type``.
    """
    if network_type == "TransformerDecoder":
        decoder_layer = TransformerDecoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        return TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            nn.LayerNorm(d_model),
            return_intermediate_dec,
        )
    if network_type == "DynamicFCDecoder":
        # The style vector has the same width as the model features.
        d_style = d_model
        decoder_layer = DynamicFCDecoderLayer(
            d_model,
            nhead,
            d_style,
            dynamic_K,
            dynamic_ratio,
            dim_feedforward,
            dropout,
            activation,
            normalize_before,
        )
        return DynamicFCDecoder(
            decoder_layer, num_decoder_layers, nn.LayerNorm(d_model), return_intermediate_dec
        )
    if network_type == "DynamicFCEncoder":
        # NOTE(review): DynamicFCEncoderLayer/DynamicFCEncoder are not imported
        # in this module, so selecting this branch raises NameError at runtime.
        # Confirm where these classes live and import them before relying on it.
        d_style = d_model
        decoder_layer = DynamicFCEncoderLayer(
            d_model,
            nhead,
            d_style,
            dynamic_K,
            dynamic_ratio,
            dim_feedforward,
            dropout,
            activation,
            normalize_before,
        )
        return DynamicFCEncoder(decoder_layer, num_decoder_layers, nn.LayerNorm(d_model))
    raise ValueError(f"Invalid network_type {network_type}")
    def forward(self, content, style_code):
        """Decode upper- and lower-face 3DMM coefficients from content + style.

        Args:
            content: (B, num_frames, window, C_dmodel) windowed content features.
            style_code: (B, C_dmodel) one style vector per batch element.

        Returns:
            face3d: (B, num_frames, C_exp) where
                C_exp = len(upper_face3d_indices) + len(lower_face3d_indices);
                upper and lower coefficients are scattered into their index sets.
        """
        B, N, W, C = content.shape
        # Broadcast the per-sample style code over every frame/window position.
        style = style_code.reshape(B, 1, 1, C).expand(B, N, W, C)
        # Fold (B, N) into one batch axis; sequence axis is the window W.
        style = style.permute(2, 0, 1, 3).reshape(W, B * N, C)
        # (W, B*N, C)

        content = content.permute(2, 0, 1, 3).reshape(W, B * N, C)
        # (W, B*N, C)
        # Queries start at zero; positional/style information is added inside
        # the decoder (see DynamicFCDecoder/TransformerDecoder.forward).
        tgt = torch.zeros_like(style)
        pos_embed = self.pos_embed(W)
        pos_embed = pos_embed.permute(1, 0, 2)

        upper_face3d_feat = self.upper_decoder(
            tgt, content, pos=pos_embed, query_pos=style
        )[0]
        # (W, B*N, C)
        # Keep only the center frame of each window (W // 2).
        upper_face3d_feat = upper_face3d_feat.permute(1, 0, 2).reshape(B, N, W, C)[
            :, :, W // 2, :
        ]
        # (B, N, C)
        upper_face3d = self.upper_tail_fc(upper_face3d_feat)
        # (B, N, C_exp_upper)

        # Same pipeline with the independently-parameterized lower-face decoder.
        lower_face3d_feat = self.lower_decoder(
            tgt, content, pos=pos_embed, query_pos=style
        )[0]
        lower_face3d_feat = lower_face3d_feat.permute(1, 0, 2).reshape(B, N, W, C)[
            :, :, W // 2, :
        ]
        lower_face3d = self.lower_tail_fc(lower_face3d_feat)
        C_exp = len(self.upper_face3d_indices) + len(self.lower_face3d_indices)
        # .to(upper_face3d) matches both dtype and device of the predictions.
        face3d = torch.zeros(B, N, C_exp).to(upper_face3d)
        face3d[:, :, self.upper_face3d_indices] = upper_face3d
        face3d[:, :, self.lower_face3d_indices] = lower_face3d
        return face3d
class DynamicConv(nn.Module):
    # Dynamic convolution: K candidate kernels are mixed per-sample with
    # attention weights computed from a conditioning vector, then applied as a
    # single grouped convolution over the whole batch.
    def __init__(
        self,
        in_planes,
        out_planes,
        cond_planes,
        kernel_size,
        stride,
        padding=0,
        dilation=1,
        groups=1,
        bias=True,
        K=4,
        temperature=30,
        ratio=4,
        init_weight=True,
    ):
        super().__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.cond_planes = cond_planes
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        # NOTE: self.bias first stores the bool flag, then is overwritten below
        # with either an nn.Parameter or None.
        self.bias = bias
        self.K = K
        self.init_weight = init_weight
        # Attention net maps the conditioning vector to K mixture weights.
        self.attention = Attention(
            cond_planes=cond_planes, ratio=ratio, K=K, temperature=temperature, init_weight=init_weight
        )

        # K candidate kernels, mixed per sample in forward().
        self.weight = nn.Parameter(
            torch.randn(K, out_planes, in_planes // groups, kernel_size, kernel_size), requires_grad=True
        )
        if bias:
            self.bias = nn.Parameter(torch.randn(
                K, out_planes), requires_grad=True)
        else:
            self.bias = None

        if self.init_weight:
            self._initialize_weights()

    def _initialize_weights(self):
        # Kaiming init per candidate kernel; bias bound follows the usual
        # nn.Linear/Conv recipe (all candidates share one fan_in since their
        # shapes are identical).
        for i in range(self.K):
            nn.init.kaiming_uniform_(self.weight[i], a=math.sqrt(5))
            if self.bias is not None:
                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(
                    self.weight[i])
                if fan_in != 0:
                    bound = 1 / math.sqrt(fan_in)
                    nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, x, cond):
        """Apply the condition-mixed convolution.

        Args:
            x: (B, C_in, L, 1) input features (sequence laid out as height).
            cond: (B, C_style) conditioning vector.

        Returns:
            (B, C_out, L, 1) convolved features.
        """
        bs, in_planels, h, w = x.shape
        softmax_att = self.attention(cond)  # bs,K
        # Grouped-conv batching trick: fold the batch into the channel axis and
        # run one conv with groups = bs * groups so each sample gets its own
        # mixed kernel.
        x = x.view(1, -1, h, w)
        weight = self.weight.view(self.K, -1)  # K,-1
        aggregate_weight = torch.mm(softmax_att, weight).view(
            bs * self.out_planes, self.in_planes // self.groups, self.kernel_size, self.kernel_size
        )  # bs*out_p,in_p,k,k

        if self.bias is not None:
            # Mix the K candidate biases with the same attention weights.
            bias = self.bias.view(self.K, -1)  # K,out_p
            aggregate_bias = torch.mm(softmax_att, bias).view(-1)  # bs*out_p
            output = F.conv2d(
                x,  # 1, bs*in_p, L, 1
                weight=aggregate_weight,
                bias=aggregate_bias,
                stride=self.stride,
                padding=self.padding,
                groups=self.groups * bs,
                dilation=self.dilation,
            )
        else:
            output = F.conv2d(
                x,
                weight=aggregate_weight,
                bias=None,
                stride=self.stride,
                padding=self.padding,
                groups=self.groups * bs,
                dilation=self.dilation,
            )

        # NOTE(review): the final view assumes the conv preserves the spatial
        # size (h, w) -- true for the kernel_size=1/stride=1 use in
        # DynamicLinear; confirm for other configurations.
        output = output.view(bs, self.out_planes, h, w)
        return output
d_style, + dynamic_K, + dynamic_ratio, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + # self.linear1 = nn.Linear(d_model, dim_feedforward) + self.linear1 = DynamicLinear( + d_model, dim_feedforward, d_style, K=dynamic_K, ratio=dynamic_ratio) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + # self.linear2 = DynamicLinear(dim_feedforward, d_model, d_style, K=dynamic_K, ratio=dynamic_ratio) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + tgt, + memory, + style, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None, + ): + # q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn(tgt, tgt, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + query=tgt, key=memory, value=memory, attn_mask=memory_mask, key_padding_mask=memory_key_padding_mask + )[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + # tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt, style))), style) + tgt2 = self.linear2(self.dropout( + self.activation(self.linear1(tgt, style)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + # def forward_pre( + # self, + # tgt, + 
class DynamicFCDecoder(nn.Module):
    """Stack of DynamicFCDecoderLayer modules sharing one style vector."""

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(
        self,
        tgt,
        memory,
        tgt_mask=None,
        memory_mask=None,
        tgt_key_padding_mask=None,
        memory_key_padding_mask=None,
        pos=None,
        query_pos=None,
    ):
        # The style code is constant along the sequence axis, so the first
        # time-step of query_pos carries it for every (B*N) element.
        style = query_pos[0]
        # Positional and style embeddings are added once, up front.
        output = tgt + pos + query_pos

        per_layer = []
        for layer in self.layers:
            output = layer(
                output,
                memory,
                style,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
                pos=pos,
                query_pos=query_pos,
            )
            if self.return_intermediate:
                per_layer.append(self.norm(output))

        if self.norm is not None:
            output = self.norm(output)
            if self.return_intermediate:
                # The last entry was already normed in the loop; replace it so
                # the final output is normed exactly once.
                per_layer[-1] = output

        if self.return_intermediate:
            return torch.stack(per_layer)

        # Leading layer axis keeps the return shape consistent with the
        # return_intermediate case.
        return output.unsqueeze(0)
class ContentW2VEncoder(nn.Module):
    # Transformer encoder over windowed wav2vec audio features: each frame's
    # window of W feature vectors is encoded independently.
    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
        pos_embed_len=80,
        ph_embed_dim=128,  # NOTE(review): unused in this class -- kept for config parity?
    ):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(
            encoder_layer, num_encoder_layers, encoder_norm
        )

        _reset_parameters(self.encoder)

        self.pos_embed = PositionalEncoding(d_model, pos_embed_len)

        # Projects 1024-dim input features (C_wav2vec per the forward docstring)
        # down to the model width.
        self.increase_embed_dim = nn.Linear(1024, d_model)

    def forward(self, x):
        """
        Args:
            x (_type_): (B, num_frames, window, C_wav2vec)

        Returns:
            content: (B, num_frames, window, C_dmodel)
        """
        x_embedding = self.increase_embed_dim(
            x
        )  # [16, 64, 11, 1024] -> [16, 64, 11, 256]
        # (B, N, W, C)
        B, N, W, C = x_embedding.shape
        # Fold (B, N) into one batch axis; the transformer sequence is the window.
        x_embedding = x_embedding.reshape(B * N, W, C)
        x_embedding = x_embedding.permute(1, 0, 2)  # [11, 1024, 256]
        # (W, B*N, C)

        pos = self.pos_embed(W)
        pos = pos.permute(1, 0, 2)  # [11, 1, 256]
        # (W, 1, C) -- broadcast over the batch inside the encoder

        content = self.encoder(x_embedding, pos=pos)  # [11, 1024, 256]
        # (W, B*N, C)
        content = content.permute(1, 0, 2).reshape(B, N, W, C)
        # (B, N, W, C)

        return content
= PositionalEncoding(d_model, pos_embed_len) + + self.increase_embed_dim = nn.Linear(input_dim, d_model) + + self.aggregate_method = None + if aggregate_method == "self_attention_pooling": + self.aggregate_method = SelfAttentionPooling(d_model) + elif aggregate_method == "average": + pass + else: + raise ValueError(f"Invalid aggregate method {aggregate_method}") + + def forward(self, x, pad_mask=None): + """ + + Args: + x (_type_): (B, num_frames(L), C_exp) + pad_mask: (B, num_frames) + + Returns: + style_code: (B, C_model) + """ + x = self.increase_embed_dim(x) + # (B, L, C) + x = x.permute(1, 0, 2) + # (L, B, C) + + pos = self.pos_embed(x.shape[0]) + pos = pos.permute(1, 0, 2) + # (L, 1, C) + + style = self.encoder(x, pos=pos, src_key_padding_mask=pad_mask) + # (L, B, C) + + if self.aggregate_method is not None: + permute_style = style.permute(1, 0, 2) + # (B, L, C) + style_code = self.aggregate_method(permute_style, pad_mask) + return style_code + + if pad_mask is None: + style = style.permute(1, 2, 0) + # (B, C, L) + style_code = style.mean(2) + # (B, C) + else: + permute_style = style.permute(1, 0, 2) + # (B, L, C) + permute_style[pad_mask] = 0 + sum_style_code = permute_style.sum(dim=1) + # (B, C) + valid_token_num = (~pad_mask).sum(dim=1).unsqueeze(-1) + # (B, 1) + style_code = sum_style_code / valid_token_num + # (B, C) + + return style_code + + +class Decoder(nn.Module): + def __init__( + self, + d_model=512, + nhead=8, + num_decoder_layers=3, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + pos_embed_len=80, + output_dim=64, + **_, + ) -> None: + super().__init__() + + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + ) + 
_reset_parameters(self.decoder) + + self.pos_embed = PositionalEncoding(d_model, pos_embed_len) + + tail_hidden_dim = d_model // 2 + self.tail_fc = nn.Sequential( + nn.Linear(d_model, tail_hidden_dim), + nn.ReLU(), + nn.Linear(tail_hidden_dim, tail_hidden_dim), + nn.ReLU(), + nn.Linear(tail_hidden_dim, output_dim), + ) + + def forward(self, content, style_code): + """ + + Args: + content (_type_): (B, num_frames, window, C_dmodel) + style_code (_type_): (B, C_dmodel) + + Returns: + face3d: (B, num_frames, C_3dmm) + """ + B, N, W, C = content.shape + style = style_code.reshape(B, 1, 1, C).expand(B, N, W, C) + style = style.permute(2, 0, 1, 3).reshape(W, B * N, C) + # (W, B*N, C) + + content = content.permute(2, 0, 1, 3).reshape(W, B * N, C) + # (W, B*N, C) + tgt = torch.zeros_like(style) + pos_embed = self.pos_embed(W) + pos_embed = pos_embed.permute(1, 0, 2) + face3d_feat = self.decoder( + tgt, content, pos=pos_embed, query_pos=style)[0] + # (W, B*N, C) + face3d_feat = face3d_feat.permute(1, 0, 2).reshape(B, N, W, C)[ + :, :, W // 2, :] + # (B, N, C) + face3d = self.tail_fc(face3d_feat) + # (B, N, C_exp) + return face3d + + +if __name__ == "__main__": + import sys + + sys.path.append("/home/mayifeng/Research/styleTH") + + from configs.default import get_cfg_defaults + + cfg = get_cfg_defaults() + cfg.merge_from_file("configs/styleTH_bp.yaml") + cfg.freeze() + + # content_encoder = ContentEncoder(**cfg.CONTENT_ENCODER) + + # dummy_audio = torch.randint(0, 41, (5, 64, 11)) + # dummy_content = content_encoder(dummy_audio) + + # style_encoder = StyleEncoder(**cfg.STYLE_ENCODER) + # dummy_face3d_seq = torch.randn(5, 64, 64) + # dummy_style_code = style_encoder(dummy_face3d_seq) + + decoder = Decoder(**cfg.DECODER) + dummy_content = torch.randn(5, 64, 11, 512) + dummy_style = torch.randn(5, 512) + dummy_output = decoder(dummy_content, dummy_style) + + print("hello") diff --git a/dreamtalk/dreamtalk/core/networks/mish.py b/dreamtalk/dreamtalk/core/networks/mish.py new 
"""
Applies the mish function element-wise:
mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
"""

import torch
import torch.nn.functional as F
from torch import nn


@torch.jit.script
def mish(input):
    """
    Applies the mish function element-wise:
    mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
    See additional documentation for mish class.
    """
    return input * torch.tanh(F.softplus(input))


# torch.nn.functional.mish was added in torch 1.9. Feature-detect it instead of
# comparing version strings: the original `torch.__version__ >= "1.9"` was a
# lexicographic comparison that wrongly treats e.g. "1.10" or "1.13" as older
# than "1.9" and silently falls back to the slower scripted implementation.
_HAS_NATIVE_MISH = hasattr(F, "mish")


class Mish(nn.Module):
    """
    Applies the mish function element-wise:
    mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))

    Shape:
        - Input: (N, *) where * means, any number of additional
          dimensions
        - Output: (N, *), same shape as the input

    Examples:
        >>> m = Mish()
        >>> input = torch.randn(2)
        >>> output = m(input)

    Reference: https://pytorch.org/docs/stable/generated/torch.nn.Mish.html
    """

    def __init__(self):
        """
        Init method.
        """
        super().__init__()

    def forward(self, input):
        """
        Forward pass: prefer the native C++ implementation when available,
        otherwise fall back to the scripted formula above.
        """
        if _HAS_NATIVE_MISH:
            return F.mish(input)
        return mish(input)
+ return: + utter_rep: size (N, H) + """ + + att_logits = self.W(batch_rep).squeeze(-1) + # (N, T) + if att_mask is not None: + att_mask_logits = att_mask.to(dtype=batch_rep.dtype) * -100000.0 + # (N, T) + att_logits = att_mask_logits + att_logits + + att_w = self.softmax(att_logits, dim=-1).unsqueeze(-1) + utter_rep = torch.sum(batch_rep * att_w, dim=1) + + return utter_rep + + +if __name__ == "__main__": + batch = torch.randn(8, 64, 256) + self_attn_pool = SelfAttentionPooling(256) + att_mask = torch.zeros(8, 64) + att_mask[:, 60:] = 1 + att_mask = att_mask.to(torch.bool) + output = self_attn_pool(batch, att_mask) + # (8, 256) + + print("hello") diff --git a/dreamtalk/dreamtalk/core/networks/transformer.py b/dreamtalk/dreamtalk/core/networks/transformer.py new file mode 100644 index 00000000..fd0269fc --- /dev/null +++ b/dreamtalk/dreamtalk/core/networks/transformer.py @@ -0,0 +1,297 @@ +import torch.nn as nn +import torch +import numpy as np +import torch.nn.functional as F +import copy + + +class PositionalEncoding(nn.Module): + + def __init__(self, d_hid, n_position=200): + super(PositionalEncoding, self).__init__() + + # Not a parameter + self.register_buffer( + 'pos_table', self._get_sinusoid_encoding_table(n_position, d_hid)) + + def _get_sinusoid_encoding_table(self, n_position, d_hid): + ''' Sinusoid position encoding table ''' + # TODO: make it with torch instead of numpy + + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) + for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + return torch.FloatTensor(sinusoid_table).unsqueeze(0) + + def forward(self, winsize): + return self.pos_table[:, :winsize].clone().detach() + + +def _get_activation_fn(activation): + """Return an activation function given 
a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class Transformer(nn.Module): + + def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, + num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False, + return_intermediate_dec=True): + super().__init__() + + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder( + encoder_layer, num_encoder_layers, encoder_norm) + + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, + return_intermediate=return_intermediate_dec) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, opt, src, query_embed, pos_embed): + # flatten NxCxHxW to HWxNxC + + src = src.permute(1, 0, 2) + pos_embed = pos_embed.permute(1, 0, 2) + query_embed = query_embed.permute(1, 0, 2) + + tgt = torch.zeros_like(query_embed) + memory = self.encoder(src, pos=pos_embed) + + hs = self.decoder(tgt, memory, + pos=pos_embed, query_pos=query_embed) + return hs + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, mask=None, src_key_padding_mask=None, 
pos=None): + output = src+pos + + for layer in self.layers: + output = layer(output, src_mask=mask, + src_key_padding_mask=src_key_padding_mask, pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None): + output = tgt+pos+query_pos + + intermediate = [] + + for layer in self.layers: + output = layer(output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, query_pos=query_pos) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def 
with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask=None, + src_key_padding_mask=None, + pos=None): + # q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn(src, src, value=src, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, src, + src_mask=None, + src_key_padding_mask=None, + pos=None): + src2 = self.norm1(src) + # q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn(src2, src2, value=src2, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, src, + src_mask=None, + src_key_padding_mask=None, + pos=None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + 
self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt, memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None): + # q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn(tgt, tgt, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn(query=tgt, + key=memory, + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre(self, tgt, memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None): + tgt2 = self.norm1(tgt) + # q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn(tgt2, tgt2, value=tgt2, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn(query=tgt2, + key=memory, + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, tgt, memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None, + pos=None, + query_pos=None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, 
memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) diff --git a/dreamtalk/dreamtalk/core/utils.py b/dreamtalk/dreamtalk/core/utils.py new file mode 100644 index 00000000..6bf92818 --- /dev/null +++ b/dreamtalk/dreamtalk/core/utils.py @@ -0,0 +1,467 @@ +import random +import os +import argparse +from collections import defaultdict +import logging +import pickle +import json + +import numpy as np +import torch +from torch import nn +from scipy.io import loadmat + +from configs.default import get_cfg_defaults +import dlib +import cv2 + + +def _reset_parameters(model): + for p in model.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + +def get_video_style(video_name, style_type): + person_id, direction, emotion, level, *_ = video_name.split("_") + if style_type == "id_dir_emo_level": + style = "_".join([person_id, direction, emotion, level]) + elif style_type == "emotion": + style = emotion + elif style_type == "id": + style = person_id + else: + raise ValueError("Unknown style type") + + return style + + +def get_style_video_lists(video_list, style_type): + style2video_list = defaultdict(list) + for video in video_list: + style = get_video_style(video, style_type) + style2video_list[style].append(video) + + return style2video_list + + +def get_face3d_clip( + video_name, video_root_dir, num_frames, start_idx, dtype=torch.float32 +): + """_summary_ + + Args: + video_name (_type_): _description_ + video_root_dir (_type_): _description_ + num_frames (_type_): _description_ + start_idx (_type_): "random" , middle, int + dtype (_type_, optional): _description_. Defaults to torch.float32. 
+ + Raises: + ValueError: _description_ + ValueError: _description_ + + Returns: + _type_: _description_ + """ + video_path = os.path.join(video_root_dir, video_name) + if video_path[-3:] == "mat": + face3d_all = loadmat(video_path)["coeff"] + face3d_exp = face3d_all[:, 80:144] # expression 3DMM range + elif video_path[-3:] == "txt": + face3d_exp = np.loadtxt(video_path) + else: + raise ValueError("Invalid 3DMM file extension") + + length = face3d_exp.shape[0] + clip_num_frames = num_frames + if start_idx == "random": + clip_start_idx = np.random.randint( + low=0, high=length - clip_num_frames + 1) + elif start_idx == "middle": + clip_start_idx = (length - clip_num_frames + 1) // 2 + elif isinstance(start_idx, int): + clip_start_idx = start_idx + else: + raise ValueError(f"Invalid start_idx {start_idx}") + + face3d_clip = face3d_exp[clip_start_idx: clip_start_idx + clip_num_frames] + face3d_clip = torch.tensor(face3d_clip, dtype=dtype) + + return face3d_clip + + +def get_video_style_clip( + video_name, + video_root_dir, + style_max_len, + start_idx="random", + dtype=torch.float32, + return_start_idx=False, +): + video_path = os.path.join(video_root_dir, video_name) + if video_path[-3:] == "mat": + face3d_all = loadmat(video_path)["coeff"] + face3d_exp = face3d_all[:, 80:144] # expression 3DMM range + elif video_path[-3:] == "txt": + face3d_exp = np.loadtxt(video_path) + else: + raise ValueError("Invalid 3DMM file extension") + + face3d_exp = torch.tensor(face3d_exp, dtype=dtype) + + length = face3d_exp.shape[0] + if length >= style_max_len: + clip_num_frames = style_max_len + if start_idx == "random": + clip_start_idx = np.random.randint( + low=0, high=length - clip_num_frames + 1) + elif start_idx == "middle": + clip_start_idx = (length - clip_num_frames + 1) // 2 + elif isinstance(start_idx, int): + clip_start_idx = start_idx + else: + raise ValueError(f"Invalid start_idx {start_idx}") + + face3d_clip = face3d_exp[clip_start_idx: clip_start_idx + clip_num_frames] 
+ pad_mask = torch.tensor([False] * style_max_len) + else: + clip_start_idx = None + padding = torch.zeros(style_max_len - length, face3d_exp.shape[1]) + face3d_clip = torch.cat((face3d_exp, padding), dim=0) + pad_mask = torch.tensor( + [False] * length + [True] * (style_max_len - length)) + + if return_start_idx: + return face3d_clip, pad_mask, clip_start_idx + else: + return face3d_clip, pad_mask + + +def get_video_style_clip_from_np( + face3d_exp, + style_max_len, + start_idx="random", + dtype=torch.float32, + return_start_idx=False, +): + face3d_exp = torch.tensor(face3d_exp, dtype=dtype) + + length = face3d_exp.shape[0] + if length >= style_max_len: + clip_num_frames = style_max_len + if start_idx == "random": + clip_start_idx = np.random.randint( + low=0, high=length - clip_num_frames + 1) + elif start_idx == "middle": + clip_start_idx = (length - clip_num_frames + 1) // 2 + elif isinstance(start_idx, int): + clip_start_idx = start_idx + else: + raise ValueError(f"Invalid start_idx {start_idx}") + + face3d_clip = face3d_exp[clip_start_idx: clip_start_idx + clip_num_frames] + pad_mask = torch.tensor([False] * style_max_len) + else: + clip_start_idx = None + padding = torch.zeros(style_max_len - length, face3d_exp.shape[1]) + face3d_clip = torch.cat((face3d_exp, padding), dim=0) + pad_mask = torch.tensor( + [False] * length + [True] * (style_max_len - length)) + + if return_start_idx: + return face3d_clip, pad_mask, clip_start_idx + else: + return face3d_clip, pad_mask + + +def get_wav2vec_audio_window(audio_feat, start_idx, num_frames, win_size): + """ + + Args: + audio_feat (np.ndarray): (N, 1024) + start_idx (_type_): _description_ + num_frames (_type_): _description_ + """ + center_idx_list = [ + 2 * idx for idx in range(start_idx, start_idx + num_frames)] + audio_window_list = [] + padding = np.zeros(audio_feat.shape[1], dtype=np.float32) + for center_idx in center_idx_list: + cur_audio_window = [] + for i in range(center_idx - win_size, center_idx + 
win_size + 1): + if i < 0: + cur_audio_window.append(padding) + elif i >= len(audio_feat): + cur_audio_window.append(padding) + else: + cur_audio_window.append(audio_feat[i]) + cur_audio_win_array = np.stack(cur_audio_window, axis=0) + audio_window_list.append(cur_audio_win_array) + + audio_window_array = np.stack(audio_window_list, axis=0) + return audio_window_array + + +def setup_config(): + parser = argparse.ArgumentParser(description="voice2pose main program") + parser.add_argument( + "--config_file", default="", metavar="FILE", help="path to config file" + ) + parser.add_argument( + "--resume_from", type=str, default=None, help="the checkpoint to resume from" + ) + parser.add_argument( + "--test_only", action="store_true", help="perform testing and evaluation only" + ) + parser.add_argument( + "--demo_input", type=str, default=None, help="path to input for demo" + ) + parser.add_argument( + "--checkpoint", type=str, default=None, help="the checkpoint to test with" + ) + parser.add_argument("--tag", type=str, default="", + help="tag for the experiment") + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + parser.add_argument( + "--local_rank", + type=int, + help="local rank for DistributedDataParallel", + ) + parser.add_argument( + "--master_port", + type=str, + default="12345", + ) + parser.add_argument( + "--max_audio_len", + type=int, + default=450, + help="max_audio_len for inference", + ) + parser.add_argument( + "--ddim_num_step", + type=int, + default=10, + ) + parser.add_argument( + "--inference_seed", + type=int, + default=1, + ) + parser.add_argument( + "--inference_sample_method", + type=str, + default="ddim", + ) + args = parser.parse_args() + + cfg = get_cfg_defaults() + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + return args, cfg + + +def setup_logger(base_path, exp_name): + rootLogger = logging.getLogger() + 
rootLogger.setLevel(logging.INFO) + + logFormatter = logging.Formatter( + "%(asctime)s [%(levelname)-0.5s] %(message)s") + + log_path = "{0}/{1}.log".format(base_path, exp_name) + fileHandler = logging.FileHandler(log_path) + fileHandler.setFormatter(logFormatter) + rootLogger.addHandler(fileHandler) + + consoleHandler = logging.StreamHandler() + consoleHandler.setFormatter(logFormatter) + rootLogger.addHandler(consoleHandler) + rootLogger.handlers[0].setLevel(logging.INFO) + + logging.info("log path: %s" % log_path) + + +def cosine_loss(a, v, y, logloss=nn.BCELoss()): + d = nn.functional.cosine_similarity(a, v) + loss = logloss(d.unsqueeze(1), y) + return loss + + +def get_pose_params(mat_path): + """Get pose parameters from mat file + + Args: + mat_path (str): path of mat file + + Returns: + pose_params (numpy.ndarray): shape (L_video, 9), angle, translation, crop paramters + """ + mat_dict = loadmat(mat_path) + + np_3dmm = mat_dict["coeff"] + angles = np_3dmm[:, 224:227] + translations = np_3dmm[:, 254:257] + + np_trans_params = mat_dict["transform_params"] + crop = np_trans_params[:, -3:] + + pose_params = np.concatenate((angles, translations, crop), axis=1) + + return pose_params + + +def sinusoidal_embedding(timesteps, dim): + """ + + Args: + timesteps (_type_): (B,) + dim (_type_): (C_embed) + + Returns: + _type_: (B, C_embed) + """ + # check input + half = dim // 2 + timesteps = timesteps.float() + + # compute sinusoidal embedding + sinusoid = torch.outer( + timesteps, torch.pow( + 10000, -torch.arange(half).to(timesteps).div(half)) + ) + x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1) + if dim % 2 != 0: + x = torch.cat([x, torch.zeros_like(x[:, :1])], dim=1) + return x + + +def get_wav2vec_audio_window(audio_feat, start_idx, num_frames, win_size): + """ + + Args: + audio_feat (np.ndarray): (250, 1024) + start_idx (_type_): _description_ + num_frames (_type_): _description_ + """ + center_idx_list = [ + 2 * idx for idx in range(start_idx, 
start_idx + num_frames)] + audio_window_list = [] + padding = np.zeros(audio_feat.shape[1], dtype=np.float32) + for center_idx in center_idx_list: + cur_audio_window = [] + for i in range(center_idx - win_size, center_idx + win_size + 1): + if i < 0: + cur_audio_window.append(padding) + elif i >= len(audio_feat): + cur_audio_window.append(padding) + else: + cur_audio_window.append(audio_feat[i]) + cur_audio_win_array = np.stack(cur_audio_window, axis=0) + audio_window_list.append(cur_audio_win_array) + + audio_window_array = np.stack(audio_window_list, axis=0) + return audio_window_array + + +def reshape_audio_feat(style_audio_all_raw, stride): + """_summary_ + + Args: + style_audio_all_raw (_type_): (stride * L, C) + stride (_type_): int + + Returns: + _type_: (L, C * stride) + """ + style_audio_all_raw = style_audio_all_raw[ + : style_audio_all_raw.shape[0] // stride * stride + ] + style_audio_all_raw = style_audio_all_raw.reshape( + style_audio_all_raw.shape[0] // stride, stride, style_audio_all_raw.shape[1] + ) + style_audio_all = style_audio_all_raw.reshape( + style_audio_all_raw.shape[0], -1) + return style_audio_all + + +def get_derangement_tuple(n): + while True: + v = [i for i in range(n)] + for j in range(n - 1, -1, -1): + p = random.randint(0, j) + if v[p] == j: + break + else: + v[j], v[p] = v[p], v[j] + else: + if v[0] != 0: + return tuple(v) + + +def compute_aspect_preserved_bbox(bbox, increase_area, h, w): + left, top, right, bot = bbox + width = right - left + height = bot - top + + width_increase = max( + increase_area, ((1 + 2 * increase_area) * height - width) / (2 * width) + ) + height_increase = max( + increase_area, ((1 + 2 * increase_area) * + width - height) / (2 * height) + ) + + left_t = int(left - width_increase * width) + top_t = int(top - height_increase * height) + right_t = int(right + width_increase * width) + bot_t = int(bot + height_increase * height) + + left_oob = -min(0, left_t) + right_oob = right - min(right_t, w) + top_oob = 
-min(0, top_t) + bot_oob = bot - min(bot_t, h) + + if max(left_oob, right_oob, top_oob, bot_oob) > 0: + max_w = max(left_oob, right_oob) + max_h = max(top_oob, bot_oob) + if max_w > max_h: + return left_t + max_w, top_t + max_w, right_t - max_w, bot_t - max_w + else: + return left_t + max_h, top_t + max_h, right_t - max_h, bot_t - max_h + + else: + return (left_t, top_t, right_t, bot_t) + + +def crop_src_image(src_img, save_img, increase_ratio, detector=None): + if detector is None: + detector = dlib.get_frontal_face_detector() + + img = cv2.imread(src_img) + faces = detector(img, 0) + h, width, _ = img.shape + if len(faces) > 0: + bbox = [faces[0].left(), faces[0].top(), + faces[0].right(), faces[0].bottom()] + l = bbox[3] - bbox[1] + bbox[1] = bbox[1] - l * 0.1 + bbox[3] = bbox[3] - l * 0.1 + bbox[1] = max(0, bbox[1]) + bbox[3] = min(h, bbox[3]) + bbox = compute_aspect_preserved_bbox( + tuple(bbox), increase_ratio, img.shape[0], img.shape[1] + ) + img = img[bbox[1]: bbox[3], bbox[0]: bbox[2]] + img = cv2.resize(img, (256, 256)) + cv2.imwrite(save_img, img) + else: + raise ValueError("No face detected in the input image") + # img = cv2.resize(img, (256, 256)) + # cv2.imwrite(save_img, img) diff --git a/dreamtalk/dreamtalk/data/README.md b/dreamtalk/dreamtalk/data/README.md new file mode 100644 index 00000000..15300f3e --- /dev/null +++ b/dreamtalk/dreamtalk/data/README.md @@ -0,0 +1,11 @@ +### 简单介绍 + ++ `audio` : 作者使用的输入音频目录,支持多种格式的输入(保险起见我都使用 `m4a` 作为输入) + ++ `eval` : 评估使用的文件,用于生成评估视频(需要注意的是,只有 `pose` 的 `mat` 文件,作者论文中提到的嘴唇风格预测器给出的推理代码中并未使用(参数模型也没开源),也就是说,`style_clip` 是必须输入的参数,而不是如论文中所说可以从参考视频中获取风格 `mat`, 因此我们使用作者提供的 `style_clip` 参数,也即 `style_clip` 下的 `3DMM` ) + ++ `pose` : `demo` 视频的头部参考 `3DMM` 参数文件,需要使用 `Deep3DFaceRecon_pytorch` 提取视频中的 3DMM 参数 + ++ `src_img` : 参考图片,视频的第一帧 + ++ `style_clip` : 面部风格参数,用于指定情绪,使用 `M030_front_neutral_level1_001.mat` 作为常规风格 \ No newline at end of file diff --git a/dreamtalk/dreamtalk/data/audio/acknowledgement_chinese.m4a 
b/dreamtalk/dreamtalk/data/audio/acknowledgement_chinese.m4a new file mode 100644 index 00000000..229cfe91 Binary files /dev/null and b/dreamtalk/dreamtalk/data/audio/acknowledgement_chinese.m4a differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/Jae-in_256.mat b/dreamtalk/dreamtalk/data/eval/pose/Jae-in_256.mat new file mode 100644 index 00000000..65d66554 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/Jae-in_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/Lieu_256.mat b/dreamtalk/dreamtalk/data/eval/pose/Lieu_256.mat new file mode 100644 index 00000000..31bc7d12 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/Lieu_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/Macron1_256.mat b/dreamtalk/dreamtalk/data/eval/pose/Macron1_256.mat new file mode 100644 index 00000000..17403b3f Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/Macron1_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/Macron2_256.mat b/dreamtalk/dreamtalk/data/eval/pose/Macron2_256.mat new file mode 100644 index 00000000..7f5a47a6 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/Macron2_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/May_256.mat b/dreamtalk/dreamtalk/data/eval/pose/May_256.mat new file mode 100644 index 00000000..d158c813 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/May_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/Obama1_256.mat b/dreamtalk/dreamtalk/data/eval/pose/Obama1_256.mat new file mode 100644 index 00000000..f361226f Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/Obama1_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/Obama2_256.mat b/dreamtalk/dreamtalk/data/eval/pose/Obama2_256.mat new file mode 100644 index 00000000..195b6ba0 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/Obama2_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/Obama_256.mat 
b/dreamtalk/dreamtalk/data/eval/pose/Obama_256.mat new file mode 100644 index 00000000..74900421 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/Obama_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/pose/Shaheen_256.mat b/dreamtalk/dreamtalk/data/eval/pose/Shaheen_256.mat new file mode 100644 index 00000000..60537539 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/pose/Shaheen_256.mat differ diff --git a/dreamtalk/dreamtalk/data/eval/src_img/Jae-in_256_first/frame_0000.png b/dreamtalk/dreamtalk/data/eval/src_img/Jae-in_256_first/frame_0000.png new file mode 100644 index 00000000..61ecff21 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/src_img/Jae-in_256_first/frame_0000.png differ diff --git a/dreamtalk/dreamtalk/data/eval/src_img/Lieu_256_first/frame_0000.png b/dreamtalk/dreamtalk/data/eval/src_img/Lieu_256_first/frame_0000.png new file mode 100644 index 00000000..e923b4f6 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/src_img/Lieu_256_first/frame_0000.png differ diff --git a/dreamtalk/dreamtalk/data/eval/src_img/Macron_256_first/frame_0000.png b/dreamtalk/dreamtalk/data/eval/src_img/Macron_256_first/frame_0000.png new file mode 100644 index 00000000..31a86668 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/src_img/Macron_256_first/frame_0000.png differ diff --git a/dreamtalk/dreamtalk/data/eval/src_img/May_256_first/frame_0000.png b/dreamtalk/dreamtalk/data/eval/src_img/May_256_first/frame_0000.png new file mode 100644 index 00000000..8dbdb0f6 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/src_img/May_256_first/frame_0000.png differ diff --git a/dreamtalk/dreamtalk/data/eval/src_img/Obama1_256_first/frame_0000.png b/dreamtalk/dreamtalk/data/eval/src_img/Obama1_256_first/frame_0000.png new file mode 100644 index 00000000..fe8855eb Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/src_img/Obama1_256_first/frame_0000.png differ diff --git 
a/dreamtalk/dreamtalk/data/eval/src_img/Obama2_256_first/frame_0000.png b/dreamtalk/dreamtalk/data/eval/src_img/Obama2_256_first/frame_0000.png new file mode 100644 index 00000000..a4ff9526 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/src_img/Obama2_256_first/frame_0000.png differ diff --git a/dreamtalk/dreamtalk/data/eval/src_img/Obama_256_first/frame_0000.png b/dreamtalk/dreamtalk/data/eval/src_img/Obama_256_first/frame_0000.png new file mode 100644 index 00000000..3d9f07f5 Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/src_img/Obama_256_first/frame_0000.png differ diff --git a/dreamtalk/dreamtalk/data/eval/src_img/Shaheen_256_first/frame_0000.png b/dreamtalk/dreamtalk/data/eval/src_img/Shaheen_256_first/frame_0000.png new file mode 100644 index 00000000..096087aa Binary files /dev/null and b/dreamtalk/dreamtalk/data/eval/src_img/Shaheen_256_first/frame_0000.png differ diff --git a/dreamtalk/dreamtalk/data/pose/RichardShelby_front_neutral_level1_001.mat b/dreamtalk/dreamtalk/data/pose/RichardShelby_front_neutral_level1_001.mat new file mode 100644 index 00000000..81568c5d Binary files /dev/null and b/dreamtalk/dreamtalk/data/pose/RichardShelby_front_neutral_level1_001.mat differ diff --git a/dreamtalk/dreamtalk/data/src_img/cropped/chpa5.png b/dreamtalk/dreamtalk/data/src_img/cropped/chpa5.png new file mode 100644 index 00000000..28d92086 Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/cropped/chpa5.png differ diff --git a/dreamtalk/dreamtalk/data/src_img/cropped/cut_img.png b/dreamtalk/dreamtalk/data/src_img/cropped/cut_img.png new file mode 100644 index 00000000..327972e6 Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/cropped/cut_img.png differ diff --git a/dreamtalk/dreamtalk/data/src_img/cropped/f30.png b/dreamtalk/dreamtalk/data/src_img/cropped/f30.png new file mode 100644 index 00000000..5c5118ef Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/cropped/f30.png differ diff --git 
a/dreamtalk/dreamtalk/data/src_img/cropped/menglu2.png b/dreamtalk/dreamtalk/data/src_img/cropped/menglu2.png new file mode 100644 index 00000000..7c869aaa Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/cropped/menglu2.png differ diff --git a/dreamtalk/dreamtalk/data/src_img/cropped/nscu2.png b/dreamtalk/dreamtalk/data/src_img/cropped/nscu2.png new file mode 100644 index 00000000..4b16dc9c Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/cropped/nscu2.png differ diff --git a/dreamtalk/dreamtalk/data/src_img/cropped/zp1.png b/dreamtalk/dreamtalk/data/src_img/cropped/zp1.png new file mode 100644 index 00000000..20997f79 Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/cropped/zp1.png differ diff --git a/dreamtalk/dreamtalk/data/src_img/cropped/zt12.png b/dreamtalk/dreamtalk/data/src_img/cropped/zt12.png new file mode 100644 index 00000000..998551d5 Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/cropped/zt12.png differ diff --git a/dreamtalk/dreamtalk/data/src_img/uncropped/face3.png b/dreamtalk/dreamtalk/data/src_img/uncropped/face3.png new file mode 100644 index 00000000..f9962172 Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/uncropped/face3.png differ diff --git a/dreamtalk/dreamtalk/data/src_img/uncropped/male_face.png b/dreamtalk/dreamtalk/data/src_img/uncropped/male_face.png new file mode 100644 index 00000000..5f62eba0 Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/uncropped/male_face.png differ diff --git a/dreamtalk/dreamtalk/data/src_img/uncropped/uncut_src_img.jpg b/dreamtalk/dreamtalk/data/src_img/uncropped/uncut_src_img.jpg new file mode 100644 index 00000000..607873fb Binary files /dev/null and b/dreamtalk/dreamtalk/data/src_img/uncropped/uncut_src_img.jpg differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_angry_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_angry_level3_001.mat new file mode 100644 index 00000000..111bb62c 
Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_angry_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_contempt_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_contempt_level3_001.mat new file mode 100644 index 00000000..99bc6b0f Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_contempt_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_disgusted_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_disgusted_level3_001.mat new file mode 100644 index 00000000..22defd94 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_disgusted_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_fear_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_fear_level3_001.mat new file mode 100644 index 00000000..bca8ca65 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_fear_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_happy_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_happy_level3_001.mat new file mode 100644 index 00000000..e5698aab Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_happy_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_neutral_level1_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_neutral_level1_001.mat new file mode 100644 index 00000000..a89c415a Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_neutral_level1_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_sad_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_sad_level3_001.mat new file mode 100644 index 00000000..575299be Binary files /dev/null and 
b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_sad_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_surprised_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_surprised_level3_001.mat new file mode 100644 index 00000000..cc06d238 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/M030_front_surprised_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_angry_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_angry_level3_001.mat new file mode 100644 index 00000000..c00e7169 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_angry_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_contempt_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_contempt_level3_001.mat new file mode 100644 index 00000000..83643c97 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_contempt_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_disgusted_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_disgusted_level3_001.mat new file mode 100644 index 00000000..8f60c904 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_disgusted_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_fear_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_fear_level3_001.mat new file mode 100644 index 00000000..6553751c Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_fear_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_happy_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_happy_level3_001.mat new file mode 100644 index 00000000..9ec3cc30 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_happy_level3_001.mat 
differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_neutral_level1_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_neutral_level1_001.mat new file mode 100644 index 00000000..91ec8d75 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_neutral_level1_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_sad_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_sad_level3_001.mat new file mode 100644 index 00000000..75fb8f25 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_sad_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_surprised_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_surprised_level3_001.mat new file mode 100644 index 00000000..8abea0f5 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W009_front_surprised_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_angry_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_angry_level3_001.mat new file mode 100644 index 00000000..2b7a3f7d Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_angry_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_contempt_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_contempt_level3_001.mat new file mode 100644 index 00000000..b17954df Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_contempt_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_disgusted_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_disgusted_level3_001.mat new file mode 100644 index 00000000..ef801454 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_disgusted_level3_001.mat differ diff --git 
a/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_fear_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_fear_level3_001.mat new file mode 100644 index 00000000..ec1938f6 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_fear_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_happy_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_happy_level3_001.mat new file mode 100644 index 00000000..a5d2adff Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_happy_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_neutral_level1_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_neutral_level1_001.mat new file mode 100644 index 00000000..f4e5cfab Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_neutral_level1_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_sad_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_sad_level3_001.mat new file mode 100644 index 00000000..ec6ee03f Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_sad_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_surprised_level3_001.mat b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_surprised_level3_001.mat new file mode 100644 index 00000000..7df4523e Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/3DMM/W011_front_surprised_level3_001.mat differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/M030_front_angry_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_angry_level3_001.mp4 new file mode 100644 index 00000000..c760ca66 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_angry_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/M030_front_contempt_level3_001.mp4 
b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_contempt_level3_001.mp4 new file mode 100644 index 00000000..7b829124 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_contempt_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/M030_front_disgusted_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_disgusted_level3_001.mp4 new file mode 100644 index 00000000..8fecdabe Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_disgusted_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/M030_front_fear_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_fear_level3_001.mp4 new file mode 100644 index 00000000..3f6a7061 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_fear_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/M030_front_happy_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_happy_level3_001.mp4 new file mode 100644 index 00000000..c2f7f561 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_happy_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/M030_front_neutral_level1_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_neutral_level1_001.mp4 new file mode 100644 index 00000000..45711ed9 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_neutral_level1_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/M030_front_sad_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_sad_level3_001.mp4 new file mode 100644 index 00000000..a8180468 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_sad_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/M030_front_surprised_level3_001.mp4 
b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_surprised_level3_001.mp4 new file mode 100644 index 00000000..88398342 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/M030_front_surprised_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W009_front_angry_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_angry_level3_001.mp4 new file mode 100644 index 00000000..45f6df25 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_angry_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W009_front_contempt_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_contempt_level3_001.mp4 new file mode 100644 index 00000000..0668cd1d Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_contempt_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W009_front_disgusted_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_disgusted_level3_001.mp4 new file mode 100644 index 00000000..9a78a57c Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_disgusted_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W009_front_fear_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_fear_level3_001.mp4 new file mode 100644 index 00000000..3a288bd8 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_fear_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W009_front_happy_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_happy_level3_001.mp4 new file mode 100644 index 00000000..e713c3a0 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_happy_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W009_front_neutral_level1_001.mp4 
b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_neutral_level1_001.mp4 new file mode 100644 index 00000000..95627a64 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_neutral_level1_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W009_front_sad_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_sad_level3_001.mp4 new file mode 100644 index 00000000..d2fe6774 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_sad_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W009_front_surprised_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_surprised_level3_001.mp4 new file mode 100644 index 00000000..959b327e Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W009_front_surprised_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W011_front_angry_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_angry_level3_001.mp4 new file mode 100644 index 00000000..a3c115ce Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_angry_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W011_front_contempt_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_contempt_level3_001.mp4 new file mode 100644 index 00000000..f72b5a69 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_contempt_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W011_front_disgusted_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_disgusted_level3_001.mp4 new file mode 100644 index 00000000..e6d11603 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_disgusted_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W011_front_fear_level3_001.mp4 
b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_fear_level3_001.mp4 new file mode 100644 index 00000000..da8172c3 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_fear_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W011_front_happy_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_happy_level3_001.mp4 new file mode 100644 index 00000000..c72089c1 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_happy_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W011_front_neutral_level1_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_neutral_level1_001.mp4 new file mode 100644 index 00000000..0e892074 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_neutral_level1_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W011_front_sad_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_sad_level3_001.mp4 new file mode 100644 index 00000000..b67568a4 Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_sad_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/data/style_clip/video/W011_front_surprised_level3_001.mp4 b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_surprised_level3_001.mp4 new file mode 100644 index 00000000..c1e62ffe Binary files /dev/null and b/dreamtalk/dreamtalk/data/style_clip/video/W011_front_surprised_level3_001.mp4 differ diff --git a/dreamtalk/dreamtalk/dreamtalk_gradio_colab.ipynb b/dreamtalk/dreamtalk/dreamtalk_gradio_colab.ipynb new file mode 100644 index 00000000..944ac4d4 --- /dev/null +++ b/dreamtalk/dreamtalk/dreamtalk_gradio_colab.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github" + }, + "source": [ + "[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/dreamtalk-colab/blob/main/dreamtalk_gradio_colab.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VjYy0F2gZIPR" + }, + "outputs": [], + "source": [ + "%cd /content\n", + "!git clone -b dev https://github.com/camenduru/dreamtalk\n", + "%cd /content/dreamtalk\n", + "\n", + "!wget https://huggingface.co/camenduru/dreamtalk/resolve/main/damo/dreamtalk/checkpoints/denoising_network.pth -O /content/dreamtalk/checkpoints/denoising_network.pth\n", + "!wget https://huggingface.co/camenduru/dreamtalk/resolve/main/damo/dreamtalk/checkpoints/renderer.pt -O /content/dreamtalk/checkpoints/renderer.pt\n", + "\n", + "!pip install -q yacs av gradio\n", + "\n", + "# https://huggingface.co/spaces/fffiloni/dreamtalk/blob/main/app.py modified\n", + "\n", + "import gradio as gr\n", + "import subprocess\n", + "from moviepy.editor import VideoFileClip\n", + "import datetime\n", + "\n", + "def convert_to_mp4_with_aac(input_path, output_path):\n", + " video = VideoFileClip(input_path)\n", + " video.write_videofile(output_path, codec=\"libx264\", audio_codec=\"aac\")\n", + " return output_path\n", + "\n", + "def check_file_exists(file_path, audio_list):\n", + " return file_path in audio_list\n", + "\n", + "def load_audio(audio_listed):\n", + " if audio_listed is None:\n", + " return None\n", + " else:\n", + " return f\"data/audio/{audio_listed}\"\n", + "\n", + "def execute_command(command: str) -> None:\n", + " subprocess.run(command, check=True)\n", + "\n", + "def infer(audio_input, image_path, emotional_style):\n", + " timestamp = datetime.datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", + " output_name = f\"lipsynced_result_{timestamp}\"\n", + " command = [\n", + " f\"python\",\n", + " f\"inference_for_demo_video.py\",\n", + " f\"--wav_path={audio_input}\",\n", + " f\"--style_clip_path=data/style_clip/3DMM/{emotional_style}\",\n", 
+ " f\"--pose_path=data/pose/RichardShelby_front_neutral_level1_001.mat\",\n", + " f\"--image_path={image_path}\",\n", + " f\"--cfg_scale=1.0\",\n", + " f\"--max_gen_len=30\",\n", + " f\"--output_name={output_name}\"\n", + " ]\n", + "\n", + " execute_command(command)\n", + " input_file = f\"output_video/{output_name}.mp4\"\n", + " output_file = f\"{output_name}.mp4\"\n", + " result = convert_to_mp4_with_aac(input_file, output_file)\n", + " return result\n", + "\n", + "css=\"\"\"\n", + "#project-links{\n", + " margin: 0 0 12px !important;\n", + " column-gap: 8px;\n", + " display: flex;\n", + " justify-content: center;\n", + " flex-wrap: nowrap;\n", + " flex-direction: row;\n", + " align-items: center;\n", + "}\n", + "\"\"\"\n", + "with gr.Blocks(css=css) as demo:\n", + " with gr.Column(elem_id=\"col-container\"):\n", + " gr.HTML(\"\"\"\n", + "

DreamTalk

\n", + " \"\"\")\n", + " with gr.Row():\n", + " with gr.Column():\n", + " image_path = gr.Image(label=\"肖像\", type=\"filepath\", sources=[\"upload\"])\n", + " audio_input = gr.Audio(label=\"输入音频\", type=\"filepath\", sources=[\"upload\"], value=\"data/audio/acknowledgement_english.m4a\")\n", + " with gr.Row():\n", + " audio_list = gr.Dropdown(\n", + " label=\"选择音频\",\n", + " choices=[\n", + " \"German1.wav\", \"German2.wav\", \"German3.wav\", \"German4.wav\",\n", + " \"acknowledgement_chinese.m4a\", \"acknowledgement_english.m4a\",\n", + " \"chinese1_haierlizhi.wav\", \"chinese2_guanyu.wav\",\n", + " \"french1.wav\", \"french2.wav\", \"french3.wav\",\n", + " \"italian1.wav\", \"italian2.wav\", \"italian3.wav\",\n", + " \"japan1.wav\", \"japan2.wav\", \"japan3.wav\",\n", + " \"korean1.wav\", \"korean2.wav\", \"korean3.wav\",\n", + " \"noisy_audio_cafeter_snr_0.wav\", \"noisy_audio_meeting_snr_0.wav\", \"noisy_audio_meeting_snr_10.wav\", \"noisy_audio_meeting_snr_20.wav\", \"noisy_audio_narrative.wav\", \"noisy_audio_office_snr_0.wav\", \"out_of_domain_narrative.wav\",\n", + " \"spanish1.wav\", \"spanish2.wav\", \"spanish3.wav\"\n", + " ],\n", + " value = \"acknowledgement_english.m4a\"\n", + " )\n", + " audio_list.change(\n", + " fn = load_audio,\n", + " inputs = [audio_list],\n", + " outputs = [audio_input]\n", + " )\n", + " emotional_style = gr.Dropdown(\n", + " label = \"情绪特征\",\n", + " choices = [\n", + " \"M030_front_angry_level3_001.mat\",\n", + " \"M030_front_contempt_level3_001.mat\",\n", + " \"M030_front_disgusted_level3_001.mat\",\n", + " \"M030_front_fear_level3_001.mat\",\n", + " \"M030_front_happy_level3_001.mat\",\n", + " \"M030_front_neutral_level1_001.mat\",\n", + " \"M030_front_sad_level3_001.mat\",\n", + " \"M030_front_surprised_level3_001.mat\",\n", + " \"W009_front_angry_level3_001.mat\",\n", + " \"W009_front_contempt_level3_001.mat\",\n", + " \"W009_front_disgusted_level3_001.mat\",\n", + " \"W009_front_fear_level3_001.mat\",\n", + " 
\"W009_front_happy_level3_001.mat\",\n", + " \"W009_front_neutral_level1_001.mat\",\n", + " \"W009_front_sad_level3_001.mat\",\n", + " \"W009_front_surprised_level3_001.mat\",\n", + " \"W011_front_angry_level3_001.mat\",\n", + " \"W011_front_contempt_level3_001.mat\",\n", + " \"W011_front_disgusted_level3_001.mat\",\n", + " \"W011_front_fear_level3_001.mat\",\n", + " \"W011_front_happy_level3_001.mat\",\n", + " \"W011_front_neutral_level1_001.mat\",\n", + " \"W011_front_sad_level3_001.mat\",\n", + " \"W011_front_surprised_level3_001.mat\"\n", + " ],\n", + " value = \"M030_front_neutral_level1_001.mat\"\n", + " )\n", + " gr.Examples(\n", + " examples = [\n", + " \"data/src_img/uncropped/face3.png\",\n", + " \"data/src_img/uncropped/male_face.png\",\n", + " \"data/src_img/uncropped/uncut_src_img.jpg\",\n", + " \"data/src_img/cropped/chpa5.png\",\n", + " \"data/src_img/cropped/cut_img.png\",\n", + " \"data/src_img/cropped/f30.png\",\n", + " \"data/src_img/cropped/menglu2.png\",\n", + " \"data/src_img/cropped/nscu2.png\",\n", + " \"data/src_img/cropped/zp1.png\",\n", + " \"data/src_img/cropped/zt12.png\"\n", + " ],\n", + " inputs=[image_path],\n", + " examples_per_page=5\n", + " )\n", + " with gr.Row():\n", + " gr.ClearButton([audio_input, image_path, audio_list])\n", + " run_btn = gr.Button(\"Run\", elem_id=\"run-btn\")\n", + " with gr.Column():\n", + " output_video = gr.Video(format=\"mp4\")\n", + " \n", + " run_btn.click(\n", + " fn = infer,\n", + " inputs = [audio_input, image_path, emotional_style],\n", + " outputs = [output_video]\n", + " )\n", + "\n", + "demo.queue().launch(inline=False, share=True, debug=True)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/dreamtalk/dreamtalk/environment.yml 
b/dreamtalk/dreamtalk/environment.yml new file mode 100644 index 00000000..19bc0ac9 --- /dev/null +++ b/dreamtalk/dreamtalk/environment.yml @@ -0,0 +1,170 @@ +name: dt +channels: + - pytorch + - nvidia + - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free + - defaults + - conda-forge + - https://repo.anaconda.com/pkgs/main + - https://repo.anaconda.com/pkgs/r +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - aom=3.6.0=h6a678d5_0 + - blas=1.0=mkl + - brotli-python=1.0.9=py39h6a678d5_8 + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2024.11.26=h06a4308_0 + - cairo=1.16.0=hb05425b_5 + - certifi=2024.8.30=py39h06a4308_0 + - charset-normalizer=3.3.2=pyhd3eb1b0_0 + - cuda-cccl=12.6.77=0 + - cuda-cccl_linux-64=12.6.77=0 + - cuda-cudart=12.1.105=0 + - cuda-cudart-dev=12.1.105=0 + - cuda-cupti=12.1.105=0 + - cuda-driver-dev=12.6.77=0 + - cuda-driver-dev_linux-64=12.6.77=0 + - cuda-libraries=12.1.0=0 + - cuda-libraries-dev=12.1.0=0 + - cuda-nvrtc=12.1.105=0 + - cuda-nvrtc-dev=12.1.105=0 + - cuda-nvtx=12.1.105=0 + - cuda-opencl=12.6.77=0 + - cuda-opencl-dev=12.6.77=0 + - cuda-profiler-api=12.6.77=0 + - cuda-runtime=12.1.0=0 + - cuda-version=12.6=3 + - dav1d=1.2.1=h5eee18b_0 + - expat=2.6.4=h6a678d5_0 + - ffmpeg=6.1.1=h4c62175_0 + - filelock=3.13.1=py39h06a4308_0 + - fontconfig=2.14.1=h4c34cd2_2 + - freetype=2.12.1=h4a9f257_0 + - giflib=5.2.2=h5eee18b_0 + - glib=2.78.4=h6a678d5_0 + - glib-tools=2.78.4=h6a678d5_0 + - gmp=6.2.1=h295c915_3 + - gmpy2=2.1.2=py39heeb90bb_0 + - graphite2=1.3.14=h295c915_1 + - harfbuzz=4.3.0=hf52aaf7_2 + - icu=73.1=h6a678d5_0 + - idna=3.7=py39h06a4308_0 + - intel-openmp=2023.1.0=hdb19cb5_46306 + - jinja2=3.1.4=py39h06a4308_1 + - jpeg=9e=h5eee18b_3 + - lame=3.100=h7b6447c_0 + - lcms2=2.12=h3be6417_0 + - ld_impl_linux-64=2.40=h12ee557_0 + - leptonica=1.82.0=h42c8aad_2 + - lerc=3.0=h295c915_0 + - libarchive=3.6.2=h6ac8c49_3 + - libcublas=12.1.0.26=0 + - libcublas-dev=12.1.0.26=0 + - libcufft=11.0.2.4=0 + - 
libcufft-dev=11.0.2.4=0 + - libcufile=1.11.1.6=0 + - libcufile-dev=1.11.1.6=0 + - libcurand=10.3.7.77=0 + - libcurand-dev=10.3.7.77=0 + - libcusolver=11.4.4.55=0 + - libcusolver-dev=11.4.4.55=0 + - libcusparse=12.0.2.55=0 + - libcusparse-dev=12.0.2.55=0 + - libdeflate=1.17=h5eee18b_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libglib=2.78.4=hdc74915_0 + - libgomp=11.2.0=h1234567_1 + - libiconv=1.16=h5eee18b_3 + - libjpeg-turbo=2.0.0=h9bf148f_0 + - libnpp=12.0.2.50=0 + - libnpp-dev=12.0.2.50=0 + - libnvjitlink=12.1.105=0 + - libnvjitlink-dev=12.1.105=0 + - libnvjpeg=12.1.1.14=0 + - libnvjpeg-dev=12.1.1.14=0 + - libogg=1.3.5=h27cfd23_1 + - libopus=1.3.1=h5eee18b_1 + - libpng=1.6.39=h5eee18b_0 + - libstdcxx-ng=11.2.0=h1234567_1 + - libtheora=1.1.1=h7f8727e_3 + - libtiff=4.5.1=h6a678d5_0 + - libuuid=1.41.5=h5eee18b_0 + - libuv=1.48.0=h5eee18b_0 + - libvorbis=1.3.7=h7b6447c_0 + - libvpx=1.13.1=h6a678d5_0 + - libwebp=1.3.2=h11a3e52_0 + - libwebp-base=1.3.2=h5eee18b_1 + - libxcb=1.15=h7f8727e_0 + - libxml2=2.10.4=hfdd30dd_2 + - llvm-openmp=14.0.6=h9e868ea_0 + - lz4-c=1.9.4=h6a678d5_1 + - markupsafe=2.1.3=py39h5eee18b_0 + - mkl=2023.1.0=h213fc3f_46344 + - mkl-service=2.4.0=py39h5eee18b_1 + - mkl_fft=1.3.11=py39h5eee18b_0 + - mkl_random=1.2.8=py39h1128e8f_0 + - mpc=1.1.0=h10f8cd9_1 + - mpfr=4.0.2=hb69a4c5_1 + - mpir=3.0.0=h3e5f119_1 + - mpmath=1.3.0=py39h06a4308_0 + - ncurses=6.4=h6a678d5_0 + - networkx=3.2.1=py39h06a4308_0 + - numpy=1.21.5=py39hf6e8229_4 + - numpy-base=1.21.5=py39h060ed82_4 + - openh264=2.1.1=h4ff587b_0 + - openjpeg=2.5.2=he7f1fd0_0 + - openssl=3.0.15=h5eee18b_0 + - pcre2=10.42=hebb0a14_1 + - pip=24.2=py39h06a4308_0 + - pixman=0.40.0=h7f8727e_1 + - pysocks=1.7.1=py39h06a4308_0 + - python=3.9.21=he870216_1 + - pytorch=2.5.1=py3.9_cuda12.1_cudnn9.1.0_0 + - pytorch-cuda=12.1=ha16c6d3_6 + - pytorch-mutex=1.0=cuda + - readline=8.2=h5eee18b_0 + - requests=2.32.3=py39h06a4308_1 + - setuptools=75.1.0=py39h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 
+ - sympy=1.13.1=pyh04b8f61_3 + - tbb=2021.8.0=hdb19cb5_0 + - tesseract=5.2.0=h6a678d5_0 + - tk=8.6.14=h39e8969_0 + - torchtriton=3.1.0=py39 + - typing_extensions=4.11.0=py39h06a4308_0 + - tzdata=2024b=h04d1e81_0 + - wheel=0.44.0=py39h06a4308_0 + - xz=5.4.6=h5eee18b_1 + - yaml=0.2.5=h7b6447c_0 + - zlib=1.2.13=h5eee18b_1 + - zstd=1.5.6=hc292b87_0 + - pip: + - av==10.0.0 + - colorama==0.4.6 + - dlib==19.24.6 + - ffmpeg-python==0.2.0 + - fsspec==2024.10.0 + - future==1.0.0 + - huggingface-hub==0.26.5 + - imageio==2.18.0 + - joblib==1.4.2 + - opencv-python==4.4.0.46 + - packaging==24.2 + - pillow==9.1.0 + - pywavelets==1.4.1 + - pyyaml==6.0 + - regex==2024.11.6 + - scikit-image==0.19.3 + - scikit-learn==1.0.2 + - scipy==1.7.3 + - threadpoolctl==3.5.0 + - tifffile==2024.8.30 + - tokenizers==0.13.3 + - torchvision==0.20.1 + - tqdm==4.67.1 + - transformers==4.28.1 + - urllib3==1.26.6 + - yacs==0.1.8 +prefix: /root/anaconda3/envs/dt diff --git a/dreamtalk/dreamtalk/generators/base_function.py b/dreamtalk/dreamtalk/generators/base_function.py new file mode 100644 index 00000000..802618a1 --- /dev/null +++ b/dreamtalk/dreamtalk/generators/base_function.py @@ -0,0 +1,412 @@ +import sys +import math + +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Function +from torch.nn.utils.spectral_norm import spectral_norm as SpectralNorm + + +class LayerNorm2d(nn.Module): + def __init__(self, n_out, affine=True): + super(LayerNorm2d, self).__init__() + self.n_out = n_out + self.affine = affine + + if self.affine: + self.weight = nn.Parameter(torch.ones(n_out, 1, 1)) + self.bias = nn.Parameter(torch.zeros(n_out, 1, 1)) + + def forward(self, x): + normalized_shape = x.size()[1:] + if self.affine: + return F.layer_norm(x, normalized_shape, + self.weight.expand(normalized_shape), + self.bias.expand(normalized_shape)) + + else: + return F.layer_norm(x, normalized_shape) + + +class ADAINHourglass(nn.Module): + def __init__(self, image_nc, 
pose_nc, ngf, img_f, encoder_layers, decoder_layers, nonlinearity, use_spect): + super(ADAINHourglass, self).__init__() + self.encoder = ADAINEncoder( + image_nc, pose_nc, ngf, img_f, encoder_layers, nonlinearity, use_spect) + self.decoder = ADAINDecoder( + pose_nc, ngf, img_f, encoder_layers, decoder_layers, True, nonlinearity, use_spect) + self.output_nc = self.decoder.output_nc + + def forward(self, x, z): + return self.decoder(self.encoder(x, z), z) + + +class ADAINEncoder(nn.Module): + def __init__(self, image_nc, pose_nc, ngf, img_f, layers, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(ADAINEncoder, self).__init__() + self.layers = layers + self.input_layer = nn.Conv2d( + image_nc, ngf, kernel_size=7, stride=1, padding=3) + for i in range(layers): + in_channels = min(ngf * (2**i), img_f) + out_channels = min(ngf * (2**(i+1)), img_f) + model = ADAINEncoderBlock( + in_channels, out_channels, pose_nc, nonlinearity, use_spect) + setattr(self, 'encoder' + str(i), model) + self.output_nc = out_channels + + def forward(self, x, z): + out = self.input_layer(x) + out_list = [out] + for i in range(self.layers): + model = getattr(self, 'encoder' + str(i)) + out = model(out, z) + out_list.append(out) + return out_list + + +class ADAINDecoder(nn.Module): + """docstring for ADAINDecoder""" + + def __init__(self, pose_nc, ngf, img_f, encoder_layers, decoder_layers, skip_connect=True, + nonlinearity=nn.LeakyReLU(), use_spect=False): + + super(ADAINDecoder, self).__init__() + self.encoder_layers = encoder_layers + self.decoder_layers = decoder_layers + self.skip_connect = skip_connect + use_transpose = True + + for i in range(encoder_layers-decoder_layers, encoder_layers)[::-1]: + in_channels = min(ngf * (2**(i+1)), img_f) + in_channels = in_channels * \ + 2 if i != (encoder_layers - + 1) and self.skip_connect else in_channels + out_channels = min(ngf * (2**i), img_f) + model = ADAINDecoderBlock( + in_channels, out_channels, out_channels, pose_nc, use_transpose, 
nonlinearity, use_spect) + setattr(self, 'decoder' + str(i), model) + + self.output_nc = out_channels*2 if self.skip_connect else out_channels + + def forward(self, x, z): + out = x.pop() if self.skip_connect else x + for i in range(self.encoder_layers-self.decoder_layers, self.encoder_layers)[::-1]: + model = getattr(self, 'decoder' + str(i)) + out = model(out, z) + out = torch.cat([out, x.pop()], 1) if self.skip_connect else out + return out + + +class ADAINEncoderBlock(nn.Module): + def __init__(self, input_nc, output_nc, feature_nc, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(ADAINEncoderBlock, self).__init__() + kwargs_down = {'kernel_size': 4, 'stride': 2, 'padding': 1} + kwargs_fine = {'kernel_size': 3, 'stride': 1, 'padding': 1} + + self.conv_0 = spectral_norm( + nn.Conv2d(input_nc, output_nc, **kwargs_down), use_spect) + self.conv_1 = spectral_norm( + nn.Conv2d(output_nc, output_nc, **kwargs_fine), use_spect) + + self.norm_0 = ADAIN(input_nc, feature_nc) + self.norm_1 = ADAIN(output_nc, feature_nc) + self.actvn = nonlinearity + + def forward(self, x, z): + x = self.conv_0(self.actvn(self.norm_0(x, z))) + x = self.conv_1(self.actvn(self.norm_1(x, z))) + return x + + +class ADAINDecoderBlock(nn.Module): + def __init__(self, input_nc, output_nc, hidden_nc, feature_nc, use_transpose=True, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(ADAINDecoderBlock, self).__init__() + # Attributes + self.actvn = nonlinearity + hidden_nc = min( + input_nc, output_nc) if hidden_nc is None else hidden_nc + + kwargs_fine = {'kernel_size': 3, 'stride': 1, 'padding': 1} + if use_transpose: + kwargs_up = {'kernel_size': 3, 'stride': 2, + 'padding': 1, 'output_padding': 1} + else: + kwargs_up = {'kernel_size': 3, 'stride': 1, 'padding': 1} + + # create conv layers + self.conv_0 = spectral_norm( + nn.Conv2d(input_nc, hidden_nc, **kwargs_fine), use_spect) + if use_transpose: + self.conv_1 = spectral_norm(nn.ConvTranspose2d( + hidden_nc, output_nc, **kwargs_up), 
use_spect) + self.conv_s = spectral_norm(nn.ConvTranspose2d( + input_nc, output_nc, **kwargs_up), use_spect) + else: + self.conv_1 = nn.Sequential(spectral_norm(nn.Conv2d(hidden_nc, output_nc, **kwargs_up), use_spect), + nn.Upsample(scale_factor=2)) + self.conv_s = nn.Sequential(spectral_norm(nn.Conv2d(input_nc, output_nc, **kwargs_up), use_spect), + nn.Upsample(scale_factor=2)) + # define normalization layers + self.norm_0 = ADAIN(input_nc, feature_nc) + self.norm_1 = ADAIN(hidden_nc, feature_nc) + self.norm_s = ADAIN(input_nc, feature_nc) + + def forward(self, x, z): + x_s = self.shortcut(x, z) + dx = self.conv_0(self.actvn(self.norm_0(x, z))) + dx = self.conv_1(self.actvn(self.norm_1(dx, z))) + out = x_s + dx + return out + + def shortcut(self, x, z): + x_s = self.conv_s(self.actvn(self.norm_s(x, z))) + return x_s + + +def spectral_norm(module, use_spect=True): + """use spectral normal layer to stable the training process""" + if use_spect: + return SpectralNorm(module) + else: + return module + + +class ADAIN(nn.Module): + def __init__(self, norm_nc, feature_nc): + super().__init__() + + self.param_free_norm = nn.InstanceNorm2d(norm_nc, affine=False) + + nhidden = 128 + use_bias = True + + self.mlp_shared = nn.Sequential( + nn.Linear(feature_nc, nhidden, bias=use_bias), + nn.ReLU() + ) + self.mlp_gamma = nn.Linear(nhidden, norm_nc, bias=use_bias) + self.mlp_beta = nn.Linear(nhidden, norm_nc, bias=use_bias) + + def forward(self, x, feature): + + # Part 1. generate parameter-free normalized activations + normalized = self.param_free_norm(x) + + # Part 2. 
produce scaling and bias conditioned on feature + feature = feature.view(feature.size(0), -1) + actv = self.mlp_shared(feature) + gamma = self.mlp_gamma(actv) + beta = self.mlp_beta(actv) + + # apply scale and bias + gamma = gamma.view(*gamma.size()[:2], 1, 1) + beta = beta.view(*beta.size()[:2], 1, 1) + out = normalized * (1 + gamma) + beta + return out + + +class FineEncoder(nn.Module): + """docstring for Encoder""" + + def __init__(self, image_nc, ngf, img_f, layers, norm_layer=nn.BatchNorm2d, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(FineEncoder, self).__init__() + self.layers = layers + self.first = FirstBlock2d( + image_nc, ngf, norm_layer, nonlinearity, use_spect) + for i in range(layers): + in_channels = min(ngf*(2**i), img_f) + out_channels = min(ngf*(2**(i+1)), img_f) + model = DownBlock2d(in_channels, out_channels, + norm_layer, nonlinearity, use_spect) + setattr(self, 'down' + str(i), model) + self.output_nc = out_channels + + def forward(self, x): + x = self.first(x) + out = [x] + for i in range(self.layers): + model = getattr(self, 'down'+str(i)) + x = model(x) + out.append(x) + return out + + +class FineDecoder(nn.Module): + """docstring for FineDecoder""" + + def __init__(self, image_nc, feature_nc, ngf, img_f, layers, num_block, norm_layer=nn.BatchNorm2d, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(FineDecoder, self).__init__() + self.layers = layers + for i in range(layers)[::-1]: + in_channels = min(ngf*(2**(i+1)), img_f) + out_channels = min(ngf*(2**i), img_f) + up = UpBlock2d(in_channels, out_channels, + norm_layer, nonlinearity, use_spect) + res = FineADAINResBlocks( + num_block, in_channels, feature_nc, norm_layer, nonlinearity, use_spect) + jump = Jump(out_channels, norm_layer, nonlinearity, use_spect) + + setattr(self, 'up' + str(i), up) + setattr(self, 'res' + str(i), res) + setattr(self, 'jump' + str(i), jump) + + self.final = FinalBlock2d(out_channels, image_nc, use_spect, 'tanh') + + self.output_nc = out_channels 
+ + def forward(self, x, z): + out = x.pop() + for i in range(self.layers)[::-1]: + res_model = getattr(self, 'res' + str(i)) + up_model = getattr(self, 'up' + str(i)) + jump_model = getattr(self, 'jump' + str(i)) + out = res_model(out, z) + out = up_model(out) + out = jump_model(x.pop()) + out + out_image = self.final(out) + return out_image + + +class FirstBlock2d(nn.Module): + """ + Downsampling block for use in encoder. + """ + + def __init__(self, input_nc, output_nc, norm_layer=nn.BatchNorm2d, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(FirstBlock2d, self).__init__() + kwargs = {'kernel_size': 7, 'stride': 1, 'padding': 3} + conv = spectral_norm( + nn.Conv2d(input_nc, output_nc, **kwargs), use_spect) + + if type(norm_layer) == type(None): + self.model = nn.Sequential(conv, nonlinearity) + else: + self.model = nn.Sequential( + conv, norm_layer(output_nc), nonlinearity) + + def forward(self, x): + out = self.model(x) + return out + + +class DownBlock2d(nn.Module): + def __init__(self, input_nc, output_nc, norm_layer=nn.BatchNorm2d, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(DownBlock2d, self).__init__() + + kwargs = {'kernel_size': 3, 'stride': 1, 'padding': 1} + conv = spectral_norm( + nn.Conv2d(input_nc, output_nc, **kwargs), use_spect) + pool = nn.AvgPool2d(kernel_size=(2, 2)) + + if type(norm_layer) == type(None): + self.model = nn.Sequential(conv, nonlinearity, pool) + else: + self.model = nn.Sequential( + conv, norm_layer(output_nc), nonlinearity, pool) + + def forward(self, x): + out = self.model(x) + return out + + +class UpBlock2d(nn.Module): + def __init__(self, input_nc, output_nc, norm_layer=nn.BatchNorm2d, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(UpBlock2d, self).__init__() + kwargs = {'kernel_size': 3, 'stride': 1, 'padding': 1} + conv = spectral_norm( + nn.Conv2d(input_nc, output_nc, **kwargs), use_spect) + if type(norm_layer) == type(None): + self.model = nn.Sequential(conv, nonlinearity) + else: + self.model 
= nn.Sequential( + conv, norm_layer(output_nc), nonlinearity) + + def forward(self, x): + out = self.model(F.interpolate(x, scale_factor=2)) + return out + + +class FineADAINResBlocks(nn.Module): + def __init__(self, num_block, input_nc, feature_nc, norm_layer=nn.BatchNorm2d, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(FineADAINResBlocks, self).__init__() + self.num_block = num_block + for i in range(num_block): + model = FineADAINResBlock2d( + input_nc, feature_nc, norm_layer, nonlinearity, use_spect) + setattr(self, 'res'+str(i), model) + + def forward(self, x, z): + for i in range(self.num_block): + model = getattr(self, 'res'+str(i)) + x = model(x, z) + return x + + +class Jump(nn.Module): + def __init__(self, input_nc, norm_layer=nn.BatchNorm2d, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(Jump, self).__init__() + kwargs = {'kernel_size': 3, 'stride': 1, 'padding': 1} + conv = spectral_norm( + nn.Conv2d(input_nc, input_nc, **kwargs), use_spect) + + if type(norm_layer) == type(None): + self.model = nn.Sequential(conv, nonlinearity) + else: + self.model = nn.Sequential( + conv, norm_layer(input_nc), nonlinearity) + + def forward(self, x): + out = self.model(x) + return out + + +class FineADAINResBlock2d(nn.Module): + """ + Define an Residual block for different types + """ + + def __init__(self, input_nc, feature_nc, norm_layer=nn.BatchNorm2d, nonlinearity=nn.LeakyReLU(), use_spect=False): + super(FineADAINResBlock2d, self).__init__() + + kwargs = {'kernel_size': 3, 'stride': 1, 'padding': 1} + + self.conv1 = spectral_norm( + nn.Conv2d(input_nc, input_nc, **kwargs), use_spect) + self.conv2 = spectral_norm( + nn.Conv2d(input_nc, input_nc, **kwargs), use_spect) + self.norm1 = ADAIN(input_nc, feature_nc) + self.norm2 = ADAIN(input_nc, feature_nc) + + self.actvn = nonlinearity + + def forward(self, x, z): + dx = self.actvn(self.norm1(self.conv1(x), z)) + dx = self.norm2(self.conv2(x), z) + out = dx + x + return out + + +class 
FinalBlock2d(nn.Module): + """ + Define the output layer + """ + + def __init__(self, input_nc, output_nc, use_spect=False, tanh_or_sigmoid='tanh'): + super(FinalBlock2d, self).__init__() + + kwargs = {'kernel_size': 7, 'stride': 1, 'padding': 3} + conv = spectral_norm( + nn.Conv2d(input_nc, output_nc, **kwargs), use_spect) + + if tanh_or_sigmoid == 'sigmoid': + out_nonlinearity = nn.Sigmoid() + else: + out_nonlinearity = nn.Tanh() + + self.model = nn.Sequential(conv, out_nonlinearity) + + def forward(self, x): + out = self.model(x) + return out diff --git a/dreamtalk/dreamtalk/generators/face_model.py b/dreamtalk/dreamtalk/generators/face_model.py new file mode 100644 index 00000000..5518ff3d --- /dev/null +++ b/dreamtalk/dreamtalk/generators/face_model.py @@ -0,0 +1,136 @@ +import functools +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import generators.flow_util as flow_util +from generators.base_function import LayerNorm2d, ADAINHourglass, FineEncoder, FineDecoder + + +class FaceGenerator(nn.Module): + def __init__( + self, + mapping_net, + warpping_net, + editing_net, + common + ): + super(FaceGenerator, self).__init__() + self.mapping_net = MappingNet(**mapping_net) + self.warpping_net = WarpingNet(**warpping_net, **common) + self.editing_net = EditingNet(**editing_net, **common) + + def forward( + self, + input_image, + driving_source, + stage=None + ): + if stage == 'warp': + descriptor = self.mapping_net(driving_source) + output = self.warpping_net(input_image, descriptor) + else: + descriptor = self.mapping_net(driving_source) + output = self.warpping_net(input_image, descriptor) + output['fake_image'] = self.editing_net( + input_image, output['warp_image'], descriptor) + return output + + +class MappingNet(nn.Module): + def __init__(self, coeff_nc, descriptor_nc, layer): + super(MappingNet, self).__init__() + + self.layer = layer + nonlinearity = nn.LeakyReLU(0.1) + + self.first = nn.Sequential( + 
torch.nn.Conv1d(coeff_nc, descriptor_nc, kernel_size=7, padding=0, bias=True)) + + for i in range(layer): + net = nn.Sequential(nonlinearity, + torch.nn.Conv1d(descriptor_nc, descriptor_nc, kernel_size=3, padding=0, dilation=3)) + setattr(self, 'encoder' + str(i), net) + + self.pooling = nn.AdaptiveAvgPool1d(1) + self.output_nc = descriptor_nc + + def forward(self, input_3dmm): + out = self.first(input_3dmm) + for i in range(self.layer): + model = getattr(self, 'encoder' + str(i)) + out = model(out) + out[:, :, 3:-3] + out = self.pooling(out) + return out + + +class WarpingNet(nn.Module): + def __init__( + self, + image_nc, + descriptor_nc, + base_nc, + max_nc, + encoder_layer, + decoder_layer, + use_spect + ): + super(WarpingNet, self).__init__() + + nonlinearity = nn.LeakyReLU(0.1) + norm_layer = functools.partial(LayerNorm2d, affine=True) + kwargs = {'nonlinearity': nonlinearity, 'use_spect': use_spect} + + self.descriptor_nc = descriptor_nc + self.hourglass = ADAINHourglass(image_nc, self.descriptor_nc, base_nc, + max_nc, encoder_layer, decoder_layer, **kwargs) + + self.flow_out = nn.Sequential(norm_layer(self.hourglass.output_nc), + nonlinearity, + nn.Conv2d(self.hourglass.output_nc, 2, kernel_size=7, stride=1, padding=3)) + + self.pool = nn.AdaptiveAvgPool2d(1) + + def forward(self, input_image, descriptor): + final_output = {} + output = self.hourglass(input_image, descriptor) + final_output['flow_field'] = self.flow_out(output) + + deformation = flow_util.convert_flow_to_deformation( + final_output['flow_field']) + final_output['warp_image'] = flow_util.warp_image( + input_image, deformation) + return final_output + + +class EditingNet(nn.Module): + def __init__( + self, + image_nc, + descriptor_nc, + layer, + base_nc, + max_nc, + num_res_blocks, + use_spect): + super(EditingNet, self).__init__() + + nonlinearity = nn.LeakyReLU(0.1) + norm_layer = functools.partial(LayerNorm2d, affine=True) + kwargs = {'norm_layer': norm_layer, + 'nonlinearity': 
nonlinearity, 'use_spect': use_spect} + self.descriptor_nc = descriptor_nc + + # encoder part + self.encoder = FineEncoder( + image_nc*2, base_nc, max_nc, layer, **kwargs) + self.decoder = FineDecoder( + image_nc, self.descriptor_nc, base_nc, max_nc, layer, num_res_blocks, **kwargs) + + def forward(self, input_image, warp_image, descriptor): + x = torch.cat([input_image, warp_image], 1) + x = self.encoder(x) + gen_image = self.decoder(x, descriptor) + return gen_image diff --git a/dreamtalk/dreamtalk/generators/flow_util.py b/dreamtalk/dreamtalk/generators/flow_util.py new file mode 100644 index 00000000..8bdc9e3a --- /dev/null +++ b/dreamtalk/dreamtalk/generators/flow_util.py @@ -0,0 +1,60 @@ +import torch + + +def convert_flow_to_deformation(flow): + r"""convert flow fields to deformations. + + Args: + flow (tensor): Flow field obtained by the model + Returns: + deformation (tensor): The deformation used for warpping + """ + b, c, h, w = flow.shape + flow_norm = 2 * \ + torch.cat([flow[:, :1, ...]/(w-1), flow[:, 1:, ...]/(h-1)], 1) + grid = make_coordinate_grid(flow) + deformation = grid + flow_norm.permute(0, 2, 3, 1) + return deformation + + +def make_coordinate_grid(flow): + r"""obtain coordinate grid with the same size as the flow filed. 
+ + Args: + flow (tensor): Flow field obtained by the model + Returns: + grid (tensor): The grid with the same size as the input flow + """ + b, c, h, w = flow.shape + + x = torch.arange(w).to(flow) + y = torch.arange(h).to(flow) + + x = (2 * (x / (w - 1)) - 1) + y = (2 * (y / (h - 1)) - 1) + + yy = y.view(-1, 1).repeat(1, w) + xx = x.view(1, -1).repeat(h, 1) + + meshed = torch.cat([xx.unsqueeze_(2), yy.unsqueeze_(2)], 2) + meshed = meshed.expand(b, -1, -1, -1) + return meshed + + +def warp_image(source_image, deformation): + r"""warp the input image according to the deformation + + Args: + source_image (tensor): source images to be warpped + deformation (tensor): deformations used to warp the images; value in range (-1, 1) + Returns: + output (tensor): the warpped images + """ + _, h_old, w_old, _ = deformation.shape + _, _, h, w = source_image.shape + if h_old != h or w_old != w: + deformation = deformation.permute(0, 3, 1, 2) + deformation = torch.nn.functional.interpolate( + deformation, size=(h, w), mode='bilinear') + deformation = deformation.permute(0, 2, 3, 1) + return torch.nn.functional.grid_sample(source_image, deformation) diff --git a/dreamtalk/dreamtalk/generators/renderer_conf.yaml b/dreamtalk/dreamtalk/generators/renderer_conf.yaml new file mode 100644 index 00000000..a95fe3b8 --- /dev/null +++ b/dreamtalk/dreamtalk/generators/renderer_conf.yaml @@ -0,0 +1,17 @@ +common: + descriptor_nc: 256 + image_nc: 3 + max_nc: 256 + use_spect: false +editing_net: + base_nc: 64 + layer: 3 + num_res_blocks: 2 +mapping_net: + coeff_nc: 73 + descriptor_nc: 256 + layer: 3 +warpping_net: + base_nc: 32 + decoder_layer: 3 + encoder_layer: 5 diff --git a/dreamtalk/dreamtalk/generators/utils.py b/dreamtalk/dreamtalk/generators/utils.py new file mode 100644 index 00000000..a26b3648 --- /dev/null +++ b/dreamtalk/dreamtalk/generators/utils.py @@ -0,0 +1,121 @@ +import argparse +import json +import os + +import cv2 +import numpy as np +import torch +import torchvision 
+import torchvision.transforms as transforms +from PIL import Image + + +def obtain_seq_index(index, num_frames, radius): + seq = list(range(index - radius, index + radius + 1)) + seq = [min(max(item, 0), num_frames - 1) for item in seq] + return seq + + +@torch.no_grad() +def get_netG(checkpoint_path, device): + import yaml + + from generators.face_model import FaceGenerator + + with open("generators/renderer_conf.yaml", "r") as f: + renderer_config = yaml.load(f, Loader=yaml.FullLoader) + + renderer = FaceGenerator(**renderer_config).to(device) + + checkpoint = torch.load( + checkpoint_path, map_location=lambda storage, loc: storage) + renderer.load_state_dict(checkpoint["net_G_ema"], strict=False) + + renderer.eval() + + return renderer + + +@torch.no_grad() +def render_video( + net_G, + src_img_path, + exp_path, + wav_path, + output_path, + device, + silent=False, + semantic_radius=13, + fps=30, + split_size=16, + no_move=False, +): + """ + exp: (N, 73) + """ + target_exp_seq = np.load(exp_path) + if target_exp_seq.shape[1] == 257: + exp_coeff = target_exp_seq[:, 80:144] + angle_trans_crop = np.array( + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9370641, 126.84911, 129.03864], + dtype=np.float32, + ) + target_exp_seq = np.concatenate( + [exp_coeff, angle_trans_crop[None, ...].repeat( + exp_coeff.shape[0], axis=0)], + axis=1, + ) + # (L, 73) + elif target_exp_seq.shape[1] == 73: + if no_move: + target_exp_seq[:, 64:] = np.array( + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.9370641, 126.84911, 129.03864], + dtype=np.float32, + ) + else: + raise NotImplementedError + + frame = cv2.imread(src_img_path) + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + src_img_raw = Image.fromarray(frame) + image_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True), + ] + ) + src_img = image_transform(src_img_raw) + + target_win_exps = [] + for frame_idx in range(len(target_exp_seq)): + win_indices = obtain_seq_index( + 
frame_idx, target_exp_seq.shape[0], semantic_radius + ) + win_exp = torch.tensor(target_exp_seq[win_indices]).permute(1, 0) + # (73, 27) + target_win_exps.append(win_exp) + + target_exp_concat = torch.stack(target_win_exps, dim=0) + target_splited_exps = torch.split(target_exp_concat, split_size, dim=0) + output_imgs = [] + for win_exp in target_splited_exps: + win_exp = win_exp.to(device) + cur_src_img = src_img.expand(win_exp.shape[0], -1, -1, -1).to(device) + output_dict = net_G(cur_src_img, win_exp) + output_imgs.append(output_dict["fake_image"].cpu().clamp_(-1, 1)) + + output_imgs = torch.cat(output_imgs, 0) + transformed_imgs = ((output_imgs + 1) / 2 * + 255).to(torch.uint8).permute(0, 2, 3, 1) + + if silent: + torchvision.io.write_video(output_path, transformed_imgs.cpu(), fps) + else: + silent_video_path = f"{output_path}-silent.mp4" + torchvision.io.write_video( + silent_video_path, transformed_imgs.cpu(), fps) + os.system( + f"ffmpeg -loglevel quiet -y -i {silent_video_path} -i {wav_path} -shortest {output_path}" + ) + os.remove(silent_video_path) diff --git a/dreamtalk/dreamtalk/inference_for_demo_video.py b/dreamtalk/dreamtalk/inference_for_demo_video.py new file mode 100644 index 00000000..afe94d99 --- /dev/null +++ b/dreamtalk/dreamtalk/inference_for_demo_video.py @@ -0,0 +1,255 @@ +import argparse +import json +import os +import shutil +import subprocess +import urllib + +import numpy as np +import torch +import torchaudio +from scipy.io import loadmat +from transformers import Wav2Vec2Processor +from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model + +from configs.default import get_cfg_defaults +from core.networks.diffusion_net import DiffusionNet +from core.networks.diffusion_util import NoisePredictor, VarianceSchedule +from core.utils import ( + crop_src_image, + get_pose_params, + get_video_style_clip, + get_wav2vec_audio_window, +) +from generators.utils import get_netG, render_video + +import warnings 
+warnings.filterwarnings("ignore", category=FutureWarning) +warnings.filterwarnings("ignore", category=UserWarning) + +@torch.no_grad() +def get_diff_net(cfg, device): + diff_net = DiffusionNet( + cfg=cfg, + net=NoisePredictor(cfg), + var_sched=VarianceSchedule( + num_steps=cfg.DIFFUSION.SCHEDULE.NUM_STEPS, + beta_1=cfg.DIFFUSION.SCHEDULE.BETA_1, + beta_T=cfg.DIFFUSION.SCHEDULE.BETA_T, + mode=cfg.DIFFUSION.SCHEDULE.MODE, + ), + ) + checkpoint = torch.load(cfg.INFERENCE.CHECKPOINT, map_location=device) + model_state_dict = checkpoint["model_state_dict"] + diff_net_dict = { + k[9:]: v for k, v in model_state_dict.items() if k[:9] == "diff_net." + } + diff_net.load_state_dict(diff_net_dict, strict=True) + diff_net.eval() + + return diff_net + + +@torch.no_grad() +def get_audio_feat(wav_path, output_name, wav2vec_model): + audio_feat_dir = os.path.dirname(audio_feat_path) + + pass + + +@torch.no_grad() +def inference_one_video( + cfg, + audio_path, + style_clip_path, + pose_path, + output_path, + diff_net, + device, + max_audio_len=None, + sample_method="ddim", + ddim_num_step=10, +): + audio_raw = audio_data = np.load(audio_path) + + if max_audio_len is not None: + audio_raw = audio_raw[: max_audio_len * 50] + gen_num_frames = len(audio_raw) // 2 + + audio_win_array = get_wav2vec_audio_window( + audio_raw, + start_idx=0, + num_frames=gen_num_frames, + win_size=cfg.WIN_SIZE, + ) + + audio_win = torch.tensor(audio_win_array).to(device) + audio = audio_win.unsqueeze(0) + + # the second parameter is "" because of bad interface design... 
+ style_clip_raw, style_pad_mask_raw = get_video_style_clip( + style_clip_path, "", style_max_len=256, start_idx=0 + ) + + style_clip = style_clip_raw.unsqueeze(0).to(device) + style_pad_mask = ( + style_pad_mask_raw.unsqueeze(0).to(device) + if style_pad_mask_raw is not None + else None + ) + + gen_exp_stack = diff_net.sample( + audio, + style_clip, + style_pad_mask, + output_dim=cfg.DATASET.FACE3D_DIM, + use_cf_guidance=cfg.CF_GUIDANCE.INFERENCE, + cfg_scale=cfg.CF_GUIDANCE.SCALE, + sample_method=sample_method, + ddim_num_step=ddim_num_step, + ) + gen_exp = gen_exp_stack[0].cpu().numpy() + + pose_ext = pose_path[-3:] + pose = None + pose = get_pose_params(pose_path) + # (L, 9) + + selected_pose = None + if len(pose) >= len(gen_exp): + selected_pose = pose[: len(gen_exp)] + else: + pose_tensor = torch.from_numpy(pose) + selected_pose = pose_tensor[-1].unsqueeze(0).repeat(len(gen_exp), 1) + selected_pose[: len(pose)] = pose_tensor + + gen_exp_pose = np.concatenate((gen_exp, selected_pose), axis=1) + np.save(output_path, gen_exp_pose) + return output_path + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="inference for demo") + parser.add_argument("--wav_path", type=str, default="", help="path for wav") + parser.add_argument("--image_path", type=str, default="", help="path for image") + parser.add_argument("--disable_img_crop", dest="img_crop", action="store_false") + parser.set_defaults(img_crop=True) + + parser.add_argument( + "--style_clip_path", type=str, default="", help="path for style_clip_mat" + ) + parser.add_argument("--pose_path", type=str, default="", help="path for pose") + parser.add_argument( + "--max_gen_len", + type=int, + default=1000, + help="The maximum length (seconds) limitation for generating videos", + ) + parser.add_argument( + "--cfg_scale", + type=float, + default=1.0, + help="The scale of classifier-free guidance", + ) + parser.add_argument( + "--output_name", + type=str, + default="test", + ) + 
parser.add_argument( + "--device", + type=str, + default="cuda", + ) + args = parser.parse_args() + + if args.device == "cuda" and not torch.cuda.is_available(): + print("CUDA is not available, set --device=cpu to use CPU.") + exit(1) + + device = torch.device(args.device) + + cfg = get_cfg_defaults() + cfg.CF_GUIDANCE.SCALE = args.cfg_scale + cfg.freeze() + + tmp_dir = f"tmp/{args.output_name}" + os.makedirs(tmp_dir, exist_ok=True) + + # get audio in 16000Hz + wav_16k_path = os.path.join(tmp_dir, f"{args.output_name}_16K.wav") + command = f"ffmpeg -y -i {args.wav_path} -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 {wav_16k_path}" + subprocess.run(command.split()) + + # get wav2vec feat from audio + wav2vec_processor = Wav2Vec2Processor.from_pretrained( + "jonatasgrosman/wav2vec2-large-xlsr-53-english", + ignore_mismatched_sizes=True + ) + + wav2vec_model = ( + Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") + .eval() + .to(device) + ) + + speech_array, sampling_rate = torchaudio.load(wav_16k_path) + audio_data = speech_array.squeeze().numpy() + inputs = wav2vec_processor( + audio_data, sampling_rate=16_000, return_tensors="pt", padding=True + ) + + with torch.no_grad(): + audio_embedding = wav2vec_model( + inputs.input_values.to(device), return_dict=False + )[0] + + audio_feat_path = os.path.join(tmp_dir, f"{args.output_name}_wav2vec.npy") + np.save(audio_feat_path, audio_embedding[0].cpu().numpy()) + + # get src image + src_img_path = os.path.join(tmp_dir, "src_img.png") + if args.img_crop: + crop_src_image(args.image_path, src_img_path, 0.4) + else: + shutil.copy(args.image_path, src_img_path) + + with torch.no_grad(): + # get diff model and load checkpoint + diff_net = get_diff_net(cfg, device).to(device) + # generate face motion + face_motion_path = os.path.join(tmp_dir, f"{args.output_name}_facemotion.npy") + inference_one_video( + cfg, + audio_feat_path, + args.style_clip_path, + args.pose_path, + face_motion_path, + diff_net, 
+ device, + max_audio_len=args.max_gen_len, + ) + # get renderer + renderer = get_netG("checkpoints/renderer.pt", device) + # render video + output_video_path = f"output_video/{args.output_name}.mp4" + render_video( + renderer, + src_img_path, + face_motion_path, + wav_16k_path, + output_video_path, + device, + fps=25, + no_move=False, + ) + + # add watermark + # if you want to generate videos with no watermark (for evaluation), remove this code block. + # no_watermark_video_path = f"{output_video_path}-no_watermark.mp4" + # shutil.move(output_video_path, no_watermark_video_path) + # os.system( + # f'ffmpeg -y -i {no_watermark_video_path} -vf "movie=media/watermark.png,scale= 120: 36[watermask]; [in] [watermask] overlay=140:220 [out]" {output_video_path}' + # ) + # os.remove(no_watermark_video_path) + print(f"output video: {output_video_path}") \ No newline at end of file diff --git a/dreamtalk/dreamtalk/output_video/demo.mp4 b/dreamtalk/dreamtalk/output_video/demo.mp4 new file mode 100644 index 00000000..cdfe7381 Binary files /dev/null and b/dreamtalk/dreamtalk/output_video/demo.mp4 differ diff --git a/dreamtalk/dreamtalk/tmp/demo/demo_16K.wav b/dreamtalk/dreamtalk/tmp/demo/demo_16K.wav new file mode 100644 index 00000000..c6196e73 Binary files /dev/null and b/dreamtalk/dreamtalk/tmp/demo/demo_16K.wav differ diff --git a/dreamtalk/dreamtalk/tmp/demo/demo_facemotion.npy b/dreamtalk/dreamtalk/tmp/demo/demo_facemotion.npy new file mode 100644 index 00000000..e774998c Binary files /dev/null and b/dreamtalk/dreamtalk/tmp/demo/demo_facemotion.npy differ diff --git a/dreamtalk/dreamtalk/tmp/demo/demo_wav2vec.npy b/dreamtalk/dreamtalk/tmp/demo/demo_wav2vec.npy new file mode 100644 index 00000000..634deb6a Binary files /dev/null and b/dreamtalk/dreamtalk/tmp/demo/demo_wav2vec.npy differ diff --git a/dreamtalk/dreamtalk/tmp/demo/src_img.png b/dreamtalk/dreamtalk/tmp/demo/src_img.png new file mode 100644 index 00000000..58ca7bcd Binary files /dev/null and 
b/dreamtalk/dreamtalk/tmp/demo/src_img.png differ diff --git a/dreamtalk/eval/CSIM.py b/dreamtalk/eval/CSIM.py new file mode 100644 index 00000000..ea6c483d --- /dev/null +++ b/dreamtalk/eval/CSIM.py @@ -0,0 +1,41 @@ +# 余弦相似度计算 +from PIL import Image +from numpy import average, dot, linalg +# 对图片进行统一化处理 + + +def get_thum(image, size=(256, 256), greyscale=False): + # 利用image对图像大小重新设置 + image = image.resize(size) + if greyscale: + # 将图片转换为L模式,其为灰度图,其每个像素用8个bit表示 + image = image.convert('L') + return image +# 计算图片的余弦距离 + + +def image_similarity_vectors_via_numpy(image1, image2): + image1 = get_thum(image1) + image2 = get_thum(image2) + images = [image1, image2] + vectors = [] + norms = [] + for image in images: + vector = [] + for pixel_tuple in image.getdata(): + vector.append(average(pixel_tuple)) + vectors.append(vector) + # linalg=linear(线性)+algebra(代数),norm则表示范数 + # 求图片的范数 + norms.append(linalg.norm(vector, 2)) + a, b = vectors + a_norm, b_norm = norms + # dot返回的是点积,对二维数组(矩阵)进行计算 + res = dot(a / a_norm, b / b_norm) + return res + + +image1 = Image.open('output/Jae-in_256_first/frame_0000.png') +image2 = Image.open('output/frame_0000.png') +cosin = image_similarity_vectors_via_numpy(image1, image2) +print('图片余弦相似度', cosin) diff --git a/dreamtalk/eval/LPIPS.py b/dreamtalk/eval/LPIPS.py new file mode 100644 index 00000000..c4e9aec6 --- /dev/null +++ b/dreamtalk/eval/LPIPS.py @@ -0,0 +1,19 @@ +import torch +import lpips +from PIL import Image +import numpy as np + +image1 = Image.open('output/Jae-in_256_first/frame_0000.png') +image2 = Image.open('output/frame_0000.png') + +# 加载预训练的LPIPS模型 +lpips_model = lpips.LPIPS(net="alex") + +# 将图像转换为PyTorch的Tensor格式 +image1_tensor = torch.tensor(np.array(image1)).permute(2, 0, 1).unsqueeze(0).float() / 255.0 +image2_tensor = torch.tensor(np.array(image2)).permute(2, 0, 1).unsqueeze(0).float() / 255.0 + +# 使用LPIPS模型计算距离 +distance = lpips_model(image1_tensor, image2_tensor) + +print("LPIPS distance:", distance.item()) diff --git 
a/dreamtalk/eval/README.md b/dreamtalk/eval/README.md new file mode 100644 index 00000000..ccaee33c --- /dev/null +++ b/dreamtalk/eval/README.md @@ -0,0 +1,25 @@ +### 评估的时候写的脚本 + +> 环境配置相当简单,可以随意嵌入这里面项目使用 ( 基本不会存在冲突问题 ) + +跑项目的时候用的脚本,基本上可以任意嵌入,如果是 WSL 环境可以直接在 Windows 下用这里的脚本进行简单处理: + +> 使用方法在代码中有注释 + ++ `splite.py`: 将视频分割为等长的两部分 + ++ `concat.py`: 合并两个视频 + ++ `eval.py`: 进行视频定量评估 `PSNR & SSIM` + ++ `m4a.py`: 将视频转化为 m4a 格式 + ++ `pair_videos.py`: 从两组等长视频中提取指定数目相同随机帧 + ++ `to_frame.py`: 提供三种不同的提取帧方式 ( 第一帧,全部帧,指定数目随机帧 ) + ++ `video.py`: 将视频转化为 256*256 分辨率 + ++ `CSIM.py` : 单帧图像 `CSIM` 分数计算脚本 + ++ `LPIPS` : 单帧图像 `LPIPS` 分数计算脚本 diff --git a/dreamtalk/eval/concat.py b/dreamtalk/eval/concat.py new file mode 100644 index 00000000..098b68b1 --- /dev/null +++ b/dreamtalk/eval/concat.py @@ -0,0 +1,33 @@ +import ffmpeg +import os + +def merge_videos(input1, input2, output): + # 确保输入文件存在 + if not os.path.exists(input1): + raise FileNotFoundError(f"输入文件不存在:{input1}") + if not os.path.exists(input2): + raise FileNotFoundError(f"输入文件不存在:{input2}") + + # 创建一个中间列表文件 + concat_file = "concat_list.txt" + with open(concat_file, "w") as f: + f.write(f"file '{os.path.abspath(input1)}'\n") + f.write(f"file '{os.path.abspath(input2)}'\n") + + try: + ffmpeg.input(concat_file, format='concat', safe=0).output(output, c='copy').run(overwrite_output=True) + print(f"视频已成功合并为:{output}") + except ffmpeg.Error as e: + print("FFmpeg 错误:", e.stderr.decode('utf-8')) + raise + finally: + # 清理中间文件 + if os.path.exists(concat_file): + os.remove(concat_file) + +if __name__ == "__main__": + input_video1 = "eval/Macron1.mp4" # 输入视频1 + input_video2 = "eval/Macron2.mp4" # 输入视频2 + output_video = "eval/Macron.mp4" + + merge_videos(input_video1, input_video2, output_video) diff --git a/dreamtalk/eval/eval.py b/dreamtalk/eval/eval.py new file mode 100644 index 00000000..72a42ee8 --- /dev/null +++ b/dreamtalk/eval/eval.py @@ -0,0 +1,43 @@ +import cv2 +import numpy as np +from skimage.metrics import 
peak_signal_noise_ratio as psnr +from skimage.metrics import structural_similarity as ssim + +orig_video_path = 'ref/Shaheen_256.mp4' +gen_video_path = 'eval/mine/Shaheen.mp4' + +cap_orig = cv2.VideoCapture(orig_video_path) +cap_gen = cv2.VideoCapture(gen_video_path) + +# 存储 PSNR 和 SSIM 值的列表 +psnr_values = [] +ssim_values = [] + +while cap_orig.isOpened() and cap_gen.isOpened(): + ret_orig, frame_orig = cap_orig.read() + ret_gen, frame_gen = cap_gen.read() + + if not ret_orig or not ret_gen: + break + + # 转换为灰度图 + gray_orig = cv2.cvtColor(frame_orig, cv2.COLOR_BGR2GRAY) + gray_gen = cv2.cvtColor(frame_gen, cv2.COLOR_BGR2GRAY) + + # 计算 PSNR + psnr_value = psnr(gray_orig, gray_gen) + psnr_values.append(psnr_value) + + # 计算 SSIM + ssim_value, _ = ssim(gray_orig, gray_gen, full=True) + ssim_values.append(ssim_value) + +cap_orig.release() +cap_gen.release() + +# 计算平均 PSNR 和 SSIM +average_psnr = np.mean(psnr_values) +average_ssim = np.mean(ssim_values) + +print('Average PSNR:', average_psnr) +print('Average SSIM:', average_ssim) \ No newline at end of file diff --git a/dreamtalk/eval/m4a.py b/dreamtalk/eval/m4a.py new file mode 100644 index 00000000..408f3e85 --- /dev/null +++ b/dreamtalk/eval/m4a.py @@ -0,0 +1,40 @@ +import os +import subprocess + +def convert_mp4_to_m4a(input_folder, output_folder): + """ + 将指定文件夹中的所有 MP4 文件转换为 M4A 音频文件。 + """ + if not os.path.exists(output_folder): + os.makedirs(output_folder) + + # 遍历输入文件夹中的所有文件 + for file_name in os.listdir(input_folder): + # 检查文件是否为 MP4 格式 + if file_name.lower().endswith(".mp4"): + input_path = os.path.join(input_folder, file_name) + output_name = os.path.splitext(file_name)[0] + ".m4a" + output_path = os.path.join(output_folder, output_name) + + command = [ + "ffmpeg", + "-i", input_path, # 输入文件 + "-vn", # 禁用视频 + "-acodec", "aac", # 指定音频编码器为 AAC + "-b:a", "192k", # 设置音频比特率 + output_path # 输出文件 + ] + + # 执行 ffmpeg 命令 + try: + subprocess.run(command, check=True) + print(f"Converted: {input_path} -> {output_path}") + 
except subprocess.CalledProcessError as e: + print(f"Error converting {input_path}: {e}") + + print("All files processed.") + +input_folder = "videos" # 输入文件夹,包含 MP4 文件 +output_folder = "m4a" # 输出文件夹,存放转换后的 M4A 文件 + +convert_mp4_to_m4a(input_folder, output_folder) diff --git a/dreamtalk/eval/pair_videos.py b/dreamtalk/eval/pair_videos.py new file mode 100644 index 00000000..120a22a4 --- /dev/null +++ b/dreamtalk/eval/pair_videos.py @@ -0,0 +1,104 @@ +import cv2 +import os +import random + +def extract_random_frames_pair(video_paths1, video_paths2, output_folder, num_frames=10): + """ + 从两组视频中随机选择指定数量的帧并保存为 256x256 分辨率的图片, + 确保两组视频提取的随机帧在时间序号上是相同的。 + """ + # 确保两组视频路径长度相同 + if len(video_paths1) != len(video_paths2): + print("Error: The two video groups must have the same length.") + return + + # 获取随机选择的帧索引 + total_frames = min( + int(cv2.VideoCapture(video_paths1[0]).get(cv2.CAP_PROP_FRAME_COUNT)), + int(cv2.VideoCapture(video_paths2[0]).get(cv2.CAP_PROP_FRAME_COUNT)) + ) + + # 如果请求的帧数大于总帧数,调整为总帧数 + num_frames = min(num_frames, total_frames) + + # 随机选择不重复的帧索引 + frame_indices = random.sample(range(total_frames), num_frames) + + # 遍历视频路径组,处理每个视频 + for idx, (video_path1, video_path2) in enumerate(zip(video_paths1, video_paths2)): + # 获取视频名称并创建输出文件夹 + video_name1 = os.path.splitext(os.path.basename(video_path1))[0] + video_name2 = os.path.splitext(os.path.basename(video_path2))[0] + + video_output_folder1 = os.path.join(output_folder, video_name1 + "_random_" + str(num_frames)) + video_output_folder2 = os.path.join(output_folder, video_name2 + "_random_" + str(num_frames)) + + if not os.path.exists(video_output_folder1): + os.makedirs(video_output_folder1) + if not os.path.exists(video_output_folder2): + os.makedirs(video_output_folder2) + + video_capture1 = cv2.VideoCapture(video_path1) + video_capture2 = cv2.VideoCapture(video_path2) + + # 逐帧处理并保存 + for i, frame_index in enumerate(frame_indices): + # 设置视频捕捉对象的位置 + video_capture1.set(cv2.CAP_PROP_POS_FRAMES, 
frame_index) + video_capture2.set(cv2.CAP_PROP_POS_FRAMES, frame_index) + + # 读取指定帧 + ret1, frame1 = video_capture1.read() + ret2, frame2 = video_capture2.read() + + if ret1 and ret2: + # 构建帧的输出文件路径 + frame_filename1 = os.path.join(video_output_folder1, f"frame_{frame_index:04d}_ref.png") + frame_filename2 = os.path.join(video_output_folder2, f"frame_{frame_index:04d}_eval.png") + + # 保存帧为图片 + cv2.imwrite(frame_filename1, frame1) + cv2.imwrite(frame_filename2, frame2) + + print(f"Saved: {frame_filename1}") + print(f"Saved: {frame_filename2}") + + video_capture1.release() + video_capture2.release() + + print(f"Finished extracting {num_frames} random frames for each video pair.") + +def process_videos(input_folder1, input_folder2, output_folder, num_random_frames=10): + """ + 遍历两个输入文件夹中的所有同名视频文件,提取指定数量的随机帧。 + """ + # 获取两个文件夹中的所有视频文件名 + videos1 = set(os.listdir(input_folder1)) + videos2 = set(os.listdir(input_folder2)) + + # 只选择两个文件夹中都有的同名视频 + common_videos = videos1.intersection(videos2) + + # 遍历所有同名视频 + for file_name in common_videos: + # 只获取 256 x 256 分辨率的视频 + if '_256' in file_name: + video_path1 = os.path.join(input_folder1, file_name) + video_path2 = os.path.join(input_folder2, file_name) + + if os.path.isfile(video_path1) and os.path.isfile(video_path2) and file_name.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')): + print(f"Processing video: {file_name}") + + # 从两组视频中提取共同的随机帧 + extract_random_frames_pair([video_path1], [video_path2], output_folder, num_random_frames) + + print("Finished processing all videos.") + +input_folder1 = "ref" # 第一个视频文件夹 +input_folder2 = "eval" # 第二个视频文件夹 +output_folder = "output" # 输出文件夹 +num_random_frames = 1500 # 提取的随机帧数 + +# 从每对同名视频中提取随机帧 +process_videos(input_folder1, input_folder2, output_folder, num_random_frames) + diff --git a/dreamtalk/eval/splite.py b/dreamtalk/eval/splite.py new file mode 100644 index 00000000..bc36327e --- /dev/null +++ b/dreamtalk/eval/splite.py @@ -0,0 +1,33 @@ +import ffmpeg +import os + +def 
split_video(input_path, output_path1, output_path2): + # 确保输入路径有效 + if not os.path.exists(input_path): + raise FileNotFoundError(f"输入文件不存在:{input_path}") + + try: + # 获取视频时长 + probe = ffmpeg.probe(input_path) + duration = float(probe['format']['duration']) # 视频总时长 + mid_point = duration / 2 # 中间时间点 + + ffmpeg.input(input_path, ss=0, t=mid_point).output(output_path1, c='copy').run(overwrite_output=True) + + ffmpeg.input(input_path, ss=mid_point).output(output_path2, c='copy').run(overwrite_output=True) + + print(f"视频已成功分割:\n1. {output_path1}\n2. {output_path2}") + except ffmpeg.Error as e: + print("FFmpeg 错误:", e.stderr.decode('utf-8')) + raise + +# 使用示例 +if __name__ == "__main__": + input_video = "videos/Macron.mp4" # 输入视频路径 + output_video1 = "videos/Macron1.mp4" # 输出第一部分路径 + output_video2 = "videos/Macron2.mp4" # 输出第二部分路径 + + split_video(input_video, output_video1, output_video2) + + + diff --git a/dreamtalk/eval/to_frames.py b/dreamtalk/eval/to_frames.py new file mode 100644 index 00000000..8a9adb09 --- /dev/null +++ b/dreamtalk/eval/to_frames.py @@ -0,0 +1,161 @@ +import cv2 +import os +import random + +def extract_frames(video_path, output_folder): + """ + 提取视频中的所有帧 + """ + video_capture = cv2.VideoCapture(video_path) + + # 检查视频是否成功打开 + if not video_capture.isOpened(): + print(f"Error opening video file: {video_path}") + return + + # 获取视频文件名(不带扩展名) + video_name = os.path.splitext(os.path.basename(video_path))[0] + + # 为每个视频创建一个独立的输出文件夹 + video_output_folder = os.path.join(output_folder, video_name) + if not os.path.exists(video_output_folder): + os.makedirs(video_output_folder) + + frame_count = 0 + while True: + # 逐帧读取视频 + ret, frame = video_capture.read() + + # 如果读取失败,退出循环 + if not ret: + break + + + # 构建帧的输出文件路径 + frame_filename = os.path.join(video_output_folder, f"frame_{frame_count:04d}.png") + + # 保存帧 + cv2.imwrite(frame_filename, frame) + + print(f"Saved: {frame_filename}") + + frame_count += 1 + + video_capture.release() + print(f"Finished extracting 
frames from {video_path}.") + +def extract_first_frame(video_path, output_folder): + """ + 只提取视频的第一帧 + """ + video_capture = cv2.VideoCapture(video_path) + + if not video_capture.isOpened(): + print(f"Error opening video file: {video_path}") + return + + video_name = os.path.splitext(os.path.basename(video_path))[0] + + video_output_folder = os.path.join(output_folder, video_name + "_first") + if not os.path.exists(video_output_folder): + os.makedirs(video_output_folder) + + # 读取第一帧 + ret, frame = video_capture.read() + + if ret: + + frame_filename = os.path.join(video_output_folder, f"frame_0000.png") + + cv2.imwrite(frame_filename, frame) + + print(f"Saved: {frame_filename}") + + video_capture.release() + print(f"Finished extracting the first frame from {video_path}.") + +def extract_random_frames(video_path, output_folder, num_frames=10): + """ + 从视频中随机选择指定数量的帧并保存为 256x256 分辨率的图片。 + """ + video_capture = cv2.VideoCapture(video_path) + + if not video_capture.isOpened(): + print(f"Error opening video file: {video_path}") + return + + video_name = os.path.splitext(os.path.basename(video_path))[0] + + video_output_folder = os.path.join(output_folder, video_name + "_random_" + str(num_frames)) + if not os.path.exists(video_output_folder): + os.makedirs(video_output_folder) + + # 获取视频的总帧数 + total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT)) + + # 如果请求的帧数大于总帧数,调整为总帧数 + num_frames = min(num_frames, total_frames) + + # 随机选择不重复的帧索引 + frame_indices = random.sample(range(total_frames), num_frames) + + # 提取指定的随机帧 + for i, frame_index in enumerate(frame_indices): + # 设置视频捕捉对象的位置 + video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index) + + # 读取指定帧 + ret, frame = video_capture.read() + + if ret: + frame_filename = os.path.join(video_output_folder, f"frame_{frame_index:04d}.png") + + cv2.imwrite(frame_filename, frame) + + print(f"Saved: {frame_filename}") + + video_capture.release() + print(f"Finished extracting {num_frames} random frames from {video_path}.") + 
+def process_videos(input_folder, output_folder, mode=1, num_random_frames=10): + """ + 遍历输入文件夹中的所有视频文件,按指定模式提取视频帧。 + - mode=1: 只提取第一帧 + - mode=2: 提取所有帧 + - mode=3: 随机提取指定数量的帧 + """ + # 遍历输入文件夹中的所有文件 + for file_name in os.listdir(input_folder): + # 只获取 256 x 256 分辨率的视频 + if '_256' in file_name: + video_path = os.path.join(input_folder, file_name) + + if os.path.isfile(video_path) and file_name.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')): + print(f"Processing video: {file_name}") + + if mode == 1: + # 只提取第一帧 + extract_first_frame(video_path, output_folder) + elif mode == 2: + # 提取所有帧 + extract_frames(video_path, output_folder) + elif mode == 3: + # 随机提取指定数量的帧 + extract_random_frames(video_path, output_folder, num_random_frames) + else: + print(f"Invalid mode: {mode}. Please use mode 1, 2, or 3.") + + print("Finished processing all videos.") + + +input_folder = "ref" +output_folder = "output" + +# 只提取每个视频的第一帧 +# process_videos(input_folder, output_folder, mode=1) + +# 提取所有帧, 太大了 +# process_videos(input_folder, output_folder, mode=2) + +# 从每个视频中随机提取 x 帧,25 的帧速率的话 1500 刚好 60s +# process_videos(input_folder, output_folder, mode=3, num_random_frames=1500) \ No newline at end of file diff --git a/dreamtalk/eval/video.py b/dreamtalk/eval/video.py new file mode 100644 index 00000000..0725e8bb --- /dev/null +++ b/dreamtalk/eval/video.py @@ -0,0 +1,38 @@ +import cv2 +import os + +input_folder = 'videos' # 改为你的文件夹路径 + +def resize_videos(path): + # 遍历文件夹中的所有文件 + for filename in os.listdir(path): + if filename.endswith('.mp4'): # 仅处理 MP4 文件 + input_video_path = os.path.join(path, filename) + + # 创建输出视频文件的路径,添加 "_256" 后缀 + output_video_path = os.path.join(path, filename.replace('.mp4', '_256.mp4')) + + cap = cv2.VideoCapture(input_video_path) + + # 获取输入视频的帧率和原始分辨率 + fps = cap.get(cv2.CAP_PROP_FPS) + + # 设置输出视频的编码方式、帧率和分辨率 + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(output_video_path, fourcc, fps, (256, 256)) + + while cap.isOpened(): + ret, frame = 
cap.read() + if not ret: + break + + resized_frame = cv2.resize(frame, (256, 256)) + + out.write(resized_frame) + + cap.release() + out.release() + print(f"Processed video: {input_video_path} -> {output_video_path}") + print("All videos processed!") + +resize_videos(input_folder) diff --git a/dreamtalk/syncnet_python/.dockerfile b/dreamtalk/syncnet_python/.dockerfile new file mode 100644 index 00000000..66a7e26b --- /dev/null +++ b/dreamtalk/syncnet_python/.dockerfile @@ -0,0 +1,33 @@ +FROM continuumio/anaconda3:latest + +WORKDIR /syncnet_python + +# 复制需要的文件 +COPY requirements.txt /syncnet_python/ + +RUN apt-get update + +# 确保依赖项不会缺失,不一定需要,只是个人习惯下一些依赖 +RUN apt-get install build-essential g++ cmake ffmpeg libgl1 -y + +RUN conda create -n SP python=3.9 -y + +SHELL ["/bin/bash", "-c"] + +RUN conda init bash && \ + source ~/.bashrc && \ + conda activate SP && \ + pip install -r requirements.txt + +RUN pip install opencv-python-headless + + +COPY . /syncnet_python/ + +RUN sh download_model.sh + +# 添加脚本 +RUN chmod +x /syncnet_python/run_commands.sh + +# 设置默认命令 +ENTRYPOINT ["conda", "run", "-n", "SP", "bash", "/syncnet_python/run_commands.sh"] diff --git a/dreamtalk/syncnet_python/LICENSE.md b/dreamtalk/syncnet_python/LICENSE.md new file mode 100644 index 00000000..de4a5458 --- /dev/null +++ b/dreamtalk/syncnet_python/LICENSE.md @@ -0,0 +1,19 @@ +Copyright (c) 2016-present Joon Son Chung. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/dreamtalk/syncnet_python/README.md b/dreamtalk/syncnet_python/README.md new file mode 100644 index 00000000..4a13c1f7 --- /dev/null +++ b/dreamtalk/syncnet_python/README.md @@ -0,0 +1,166 @@ +### syncnet_python + +||| +|:--:|:--:| +| **[论文网址](https://link.springer.com/chapter/10.1007/978-3-319-54427-4_19)** | **[GitHub](https://github.com/joonson/syncnet_python)** | + + +### 使用 Docker 镜像运行 + +#### 构筑镜像 + +好像每次构筑都会有重复的下载 ( 捂脸 ), 不过貌似是不占大小,是覆写已经存在的文件,主要构筑里面写了下载模型和 demo 视频的执行脚本,自己下的话可以删掉对应部分 + +在项目目录下构筑哦! + +```bash + docker build -f .dockerfile -t syncnet:v1 . 
+``` + +#### 下载镜像 + +> 不建议,镜像文件很大,建议本地构筑 + +好像我构筑的有亿点点大,虽然是两位数,但是单位是 GB hh ( 上传好浪费时间,我的建议是自己构筑,如果一定需要可以通过 GitHub 或者邮箱: wincomso9@outlook.com 联系我 ) + +下载好后放到 `syncnet_python` 目录下,然后执行以下命令 + +> 可以 `docker images` 看加载是否成功 + +```bash + docker load -i syncnet.tar +``` + +#### 运行镜像 + +> 默认执行的评估是 `eval/data/demo.mp4`, 不存在的话就修改`/syncnet_python/run_commands.sh` 为自定义的路径, + +```bash + # 本地待测评的数据放到 /syncnet_python/eval/data 下,修改 run_commands.sh 对应的路径 + docker run --rm --gpus all -v ${PWD}/eval/data:/syncnet_python/eval/data -v ${PWD}/tmp:/syncnet_python/tmp -v ${PWD}/all_scores.txt:/syncnet_python/all_scores.txt syncnet:v1 +``` + +### 本机 WSL 下配置 ( 和在Ubuntu 下配置一样 ) + +```bash + git clone https://github.com/joonson/syncnet_python.git + cd syncnet_python + conda create -n SP python=3.9 + conda activate SP + pip install -r requirements.txt + sh download_model.sh +``` + +#### 运行测试用例 + +```bash + python demo_syncnet.py --videofile data/example.avi --tmp_dir /path/to/temp/directory +``` + +#### 进行评估 + +使用 [Wav2Lip](https://github.com/Rudrabha/Wav2Lip/tree/master/evaluation) 的评估脚本,按照说明将 `Wav2Lip/evaluation/scores_LSE/` 下的脚本放入 `/syncnet_python/` 目录下 ( 直接下载传入最好,懒得再克隆一个项目 ) + +本机跑建议运行下面的命令逐一推理: + +```bash + python run_pipeline.py --videofile eval/Jae-in.mp4 --reference t1 --data_dir tmp + python calculate_scores_real_videos.py --videofile eval/Jae-in.mp4 --reference t1 --data_dir tmp >> all_scores.txt +``` + +参数说明: + +`--videofile`: 视频文件路径 + +`--reference`: 类似于视频标号, 其实完全可以用视频名字不需要单独传参。。懒得改了 + +`--data_dir`: 输出的临时存放目录(在该文件夹下的子目录下存放生成的帧),同时作为第二条命令的输入 + + +### 问题与解决方案: + ++ `numpy` 没有 `int` 属性,较新的版本是 `int_` + + ```bash + pip install numpy==1.22 + ``` + + 或者找到`\syncnet_python\detectors\s3fd\box_utils.py` 38 行, 修改为 `return np.array(keep).astype(np.int_)` + ++ `scenedetect` 包的较老版本和新版 `python` 存在兼容性问题 ( 具体来讲就是 `python` 里面约束规范加强了, `scenedetect` 之前的版本存在新版本不允许的操作 ( 对于元组 ) ) + + ```bash + pip install av + pip install scenedetect==0.6.0 + ``` + +### 定量评估结果 ( LSE-D & LSE-C ) + +在项目目录 `syncnet_python` 
下找到 `all.txt` 即可, 输出的结果为 `Min dist & Confidence` 也就是 `LSE-D & LSE-C`, 顺序和执行评估的顺序相同, 后评估的在末尾添加 + +``` + 评估视频 LSE-D LSE-C(论文里的SyncNet 置信度得分(Sync_conf)) +``` +--- +``` + Jae-in 9.679484 3.81083 + + Lieu 8.466651 6.7788954 + + Macron 9.462689 4.1416864 + + May 8.08356 6.0931616 + + Obama 8.022001 6.5014324 + + Obama1 8.213086 6.4382563 + + Obama2 7.1203218 6.8484406 + + Shaheen 7.524077 7.74151 +``` + + +### 定量评估结果 ( PSNR & SSIM ) + +在 /eval/ 中进行评估,环境比较简单,结果放在这里了 + +使用 `eval.py`: 进行视频定量评估 `PSNR & SSIM` + +``` + 评估视频 PSNR SSIM +``` +--- +``` + Jae-in 19.6442 0.7636 + + Lieu 25.0265 0.8276 + + Macron 24.6637 0.8391 + + May 25.0020 0.7674 + + Obama 25.1191 0.8459 + + Obama1 22.6824 0.7921 + + Obama2 22.4471 0.8374 + + Shaheen 26.6465 0.8514 +``` + +原论文没有 PSNR 评估,SSIM 评估的分数为 0.86/0.85/0.69 ( MEAD / HDTF / Voxceleb2 ) + +### 定性评估 + +针对源码中给出的 happy, angry 的情绪风格 mat 进行了评估,评估结果如下: + +|Ground Truth| Reference | Result | +|:---:|:---:|:---:| +|![](./emotion/gt.png)|![](./emotion/output/angry_256_random_15/frame_0059_eval.png)|![](./emotion/output/angry_256_random_15/frame_0059_ref.png)| + +|Ground Truth| Reference | Result | +|:---:|:---:|:---:| +|![](./emotion/gt.png)|![](./emotion/output/happy_256_random_15/frame_0066_eval.png)|![](./emotion/output/happy_256_random_15/frame_0066_ref.png)| + + diff --git a/dreamtalk/syncnet_python/SyncNetInstance.py b/dreamtalk/syncnet_python/SyncNetInstance.py new file mode 100644 index 00000000..497d44fc --- /dev/null +++ b/dreamtalk/syncnet_python/SyncNetInstance.py @@ -0,0 +1,208 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +# Video 25 FPS, Audio 16000HZ + +import torch +import numpy +import time, pdb, argparse, subprocess, os, math, glob +import cv2 +import python_speech_features + +from scipy import signal +from scipy.io import wavfile +from SyncNetModel import * +from shutil import rmtree + + +# ==================== Get OFFSET ==================== + +def calc_pdist(feat1, feat2, vshift=10): + + win_size = vshift*2+1 + + feat2p = 
torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) + + dists = [] + + for i in range(0,len(feat1)): + + dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) + + return dists + +# ==================== MAIN DEF ==================== + +class SyncNetInstance(torch.nn.Module): + + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): + super(SyncNetInstance, self).__init__(); + + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + + def evaluate(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Convert files + # ========== ========== + + if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + + os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + + command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) + output = subprocess.call(command, shell=True, stdout=None) + + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) + output = subprocess.call(command, shell=True, stdout=None) + + # ========== ========== + # Load video + # ========== ========== + + images = [] + + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) + flist.sort() + + for fname in flist: + images.append(cv2.imread(fname)) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Load audio + # ========== ========== + + sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) + mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) + mfcc = numpy.stack([numpy.array(i) for i in mfcc]) + + cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) + cct = 
torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + + # ========== ========== + # Check audio and video input length + # ========== ========== + + if (float(len(audio))/16000) != (float(len(images))/25) : + print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + + min_length = min(len(images),math.floor(len(audio)/640)) + + # ========== ========== + # Generate video and audio feats + # ========== ========== + + lastframe = min_length-5 + im_feat = [] + cc_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lip(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + cc_in = torch.cat(cc_batch,0) + cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_feat.append(cc_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + cc_feat = torch.cat(cc_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' 
% (time.time()-tS)) + + dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) + mdist = torch.mean(torch.stack(dists,1),1) + + minval, minidx = torch.min(mdist,0) + + offset = opt.vshift-minidx + conf = torch.median(mdist) - minval + + fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) + # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) + fconf = torch.median(mdist).numpy() - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) + print('Framewise conf: ') + print(fconfm) + print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + + dists_npy = numpy.array([ dist.numpy() for dist in dists ]) + return offset.numpy(), conf.numpy(), dists_npy + + def extract_feature(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Load video + # ========== ========== + cap = cv2.VideoCapture(videofile) + + frame_num = 1; + images = [] + while frame_num: + frame_num += 1 + ret, image = cap.read() + if ret == 0: + break + + images.append(image) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Generate video feats + # ========== ========== + + lastframe = len(images)-4 + im_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' 
% (time.time()-tS)) + + return im_feat + + + def loadParameters(self, path): + loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + + self_state = self.__S__.state_dict(); + + for name, param in loaded_state.items(): + + self_state[name].copy_(param); diff --git a/dreamtalk/syncnet_python/SyncNetInstance_calc_scores.py b/dreamtalk/syncnet_python/SyncNetInstance_calc_scores.py new file mode 100644 index 00000000..64906e25 --- /dev/null +++ b/dreamtalk/syncnet_python/SyncNetInstance_calc_scores.py @@ -0,0 +1,210 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +# Video 25 FPS, Audio 16000HZ + +import torch +import numpy +import time, pdb, argparse, subprocess, os, math, glob +import cv2 +import python_speech_features + +from scipy import signal +from scipy.io import wavfile +from SyncNetModel import * +from shutil import rmtree + + +# ==================== Get OFFSET ==================== + +def calc_pdist(feat1, feat2, vshift=10): + + win_size = vshift*2+1 + + feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) + + dists = [] + + for i in range(0,len(feat1)): + + dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) + + return dists + +# ==================== MAIN DEF ==================== + +class SyncNetInstance(torch.nn.Module): + + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): + super(SyncNetInstance, self).__init__(); + + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + + def evaluate(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Convert files + # ========== ========== + + if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + + os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + + command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) + output = 
subprocess.call(command, shell=True, stdout=None) + + command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) + output = subprocess.call(command, shell=True, stdout=None) + + # ========== ========== + # Load video + # ========== ========== + + images = [] + + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) + flist.sort() + + for fname in flist: + img_input = cv2.imread(fname) + img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE + images.append(img_input) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Load audio + # ========== ========== + + sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) + mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) + mfcc = numpy.stack([numpy.array(i) for i in mfcc]) + + cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) + cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + + # ========== ========== + # Check audio and video input length + # ========== ========== + + #if (float(len(audio))/16000) != (float(len(images))/25) : + # print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + + min_length = min(len(images),math.floor(len(audio)/640)) + + # ========== ========== + # Generate video and audio feats + # ========== ========== + + lastframe = min_length-5 + im_feat = [] + cc_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lip(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + 
+ cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + cc_in = torch.cat(cc_batch,0) + cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_feat.append(cc_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + cc_feat = torch.cat(cc_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + #print('Compute time %.3f sec.' % (time.time()-tS)) + + dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) + mdist = torch.mean(torch.stack(dists,1),1) + + minval, minidx = torch.min(mdist,0) + + offset = opt.vshift-minidx + conf = torch.median(mdist) - minval + + fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) + # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) + fconf = torch.median(mdist).numpy() - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) + #print('Framewise conf: ') + #print(fconfm) + #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + + dists_npy = numpy.array([ dist.numpy() for dist in dists ]) + return offset.numpy(), conf.numpy(), minval.numpy() + + def extract_feature(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Load video + # ========== ========== + cap = cv2.VideoCapture(videofile) + + frame_num = 1; + images = [] + while frame_num: + frame_num += 1 + ret, image = cap.read() + if ret == 0: + break + + images.append(image) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Generate video feats + # ========== ========== + + lastframe = len(images)-4 + im_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = 
torch.cat(im_batch,0) + im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' % (time.time()-tS)) + + return im_feat + + + def loadParameters(self, path): + loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + + self_state = self.__S__.state_dict(); + + for name, param in loaded_state.items(): + + self_state[name].copy_(param); diff --git a/dreamtalk/syncnet_python/SyncNetModel.py b/dreamtalk/syncnet_python/SyncNetModel.py new file mode 100644 index 00000000..c21ce25c --- /dev/null +++ b/dreamtalk/syncnet_python/SyncNetModel.py @@ -0,0 +1,117 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import torch +import torch.nn as nn + +def save(model, filename): + with open(filename, "wb") as f: + torch.save(model, f); + print("%s saved."%filename); + +def load(filename): + net = torch.load(filename) + return net; + +class S(nn.Module): + def __init__(self, num_layers_in_fc_layers = 1024): + super(S, self).__init__(); + + self.__nFeatures__ = 24; + self.__nChs__ = 32; + self.__midChs__ = 32; + + self.netcnnaud = nn.Sequential( + nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)), + nn.BatchNorm2d(64), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(1,1), stride=(1,1)), + + nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)), + nn.BatchNorm2d(192), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)), + + nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(384), + nn.ReLU(inplace=True), + + nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + + nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), + + nn.Conv2d(256, 512, kernel_size=(5,4), padding=(0,0)), 
+ nn.BatchNorm2d(512), + nn.ReLU(), + ); + + self.netfcaud = nn.Sequential( + nn.Linear(512, 512), + nn.BatchNorm1d(512), + nn.ReLU(), + nn.Linear(512, num_layers_in_fc_layers), + ); + + self.netfclip = nn.Sequential( + nn.Linear(512, 512), + nn.BatchNorm1d(512), + nn.ReLU(), + nn.Linear(512, num_layers_in_fc_layers), + ); + + self.netcnnlip = nn.Sequential( + nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=0), + nn.BatchNorm3d(96), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)), + + nn.Conv3d(96, 256, kernel_size=(1,5,5), stride=(1,2,2), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)), + + nn.Conv3d(256, 512, kernel_size=(1,6,6), padding=0), + nn.BatchNorm3d(512), + nn.ReLU(inplace=True), + ); + + def forward_aud(self, x): + + mid = self.netcnnaud(x); # N x ch x 24 x M + mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) + out = self.netfcaud(mid); + + return out; + + def forward_lip(self, x): + + mid = self.netcnnlip(x); + mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) + out = self.netfclip(mid); + + return out; + + def forward_lipfeat(self, x): + + mid = self.netcnnlip(x); + out = mid.view((mid.size()[0], -1)); # N x (ch x 24) + + return out; \ No newline at end of file diff --git a/dreamtalk/syncnet_python/all_scores.txt b/dreamtalk/syncnet_python/all_scores.txt new file mode 100644 index 00000000..9a82e1ca --- /dev/null +++ b/dreamtalk/syncnet_python/all_scores.txt @@ -0,0 +1,8 @@ +9.679484 3.81083 +8.466651 6.7788954 +9.462689 4.1416864 
+8.08356 6.0931616 +8.022001 6.5014324 +8.213086 6.4382563 +7.1203218 6.8484406 +7.524077 7.74151 diff --git a/dreamtalk/syncnet_python/calculate_scores_LRS.py b/dreamtalk/syncnet_python/calculate_scores_LRS.py new file mode 100644 index 00000000..eda02b8f --- /dev/null +++ b/dreamtalk/syncnet_python/calculate_scores_LRS.py @@ -0,0 +1,53 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess +import glob +import os +from tqdm import tqdm + +from SyncNetInstance_calc_scores import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_root', type=str, required=True, help=''); +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); +parser.add_argument('--reference', type=str, default="demo", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +#print("Model %s loaded."%opt.initial_model); +path = os.path.join(opt.data_root, "*.mp4") + +all_videos = glob.glob(path) + +prog_bar = tqdm(range(len(all_videos))) +avg_confidence = 0. +avg_min_distance = 0. 
+ + +for videofile_idx in prog_bar: + videofile = all_videos[videofile_idx] + offset, confidence, min_distance = s.evaluate(opt, videofile=videofile) + avg_confidence += confidence + avg_min_distance += min_distance + prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / (videofile_idx + 1), 3), round(avg_min_distance / (videofile_idx + 1), 3))) + prog_bar.refresh() + +print ('Average Confidence: {}'.format(avg_confidence/len(all_videos))) +print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos))) + + + diff --git a/dreamtalk/syncnet_python/calculate_scores_real_videos.py b/dreamtalk/syncnet_python/calculate_scores_real_videos.py new file mode 100644 index 00000000..09622584 --- /dev/null +++ b/dreamtalk/syncnet_python/calculate_scores_real_videos.py @@ -0,0 +1,45 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess, pickle, os, gzip, glob + +from SyncNetInstance_calc_scores import * + +# ==================== PARSE ARGUMENT ==================== + +parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_dir', type=str, default='data/work', help=''); +parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) + + +# ==================== LOAD MODEL AND FILE LIST ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +#print("Model %s 
loaded."%opt.initial_model); + +flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) +flist.sort() + +# ==================== GET OFFSETS ==================== + +dists = [] +for idx, fname in enumerate(flist): + offset, conf, dist = s.evaluate(opt,videofile=fname) + print (str(dist)+" "+str(conf)) + +# ==================== PRINT RESULTS TO FILE ==================== + +#with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: +# pickle.dump(dists, fil) diff --git a/dreamtalk/syncnet_python/calculate_scores_real_videos.sh b/dreamtalk/syncnet_python/calculate_scores_real_videos.sh new file mode 100644 index 00000000..4a45cd56 --- /dev/null +++ b/dreamtalk/syncnet_python/calculate_scores_real_videos.sh @@ -0,0 +1,8 @@ +rm all_scores.txt +yourfilenames=`ls $1` + +for eachfile in $yourfilenames +do + python run_pipeline.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir + python calculate_scores_real_videos.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir >> all_scores.txt +done diff --git a/dreamtalk/syncnet_python/demo_feature.py b/dreamtalk/syncnet_python/demo_feature.py new file mode 100644 index 00000000..e3bd290e --- /dev/null +++ b/dreamtalk/syncnet_python/demo_feature.py @@ -0,0 +1,32 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess + +from SyncNetInstance import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); +parser.add_argument('--tmp_dir', type=str, default="data", help=''); +parser.add_argument('--save_as', type=str, default="data/features.pt", help=''); + +opt 
= parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +feats = s.extract_feature(opt, videofile=opt.videofile) + +torch.save(feats, opt.save_as) diff --git a/dreamtalk/syncnet_python/demo_syncnet.py b/dreamtalk/syncnet_python/demo_syncnet.py new file mode 100644 index 00000000..01c25a6f --- /dev/null +++ b/dreamtalk/syncnet_python/demo_syncnet.py @@ -0,0 +1,30 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess + +from SyncNetInstance import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); +parser.add_argument('--reference', type=str, default="demo", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +s.evaluate(opt, videofile=opt.videofile) diff --git a/dreamtalk/syncnet_python/detectors/README.md b/dreamtalk/syncnet_python/detectors/README.md new file mode 100644 index 00000000..f5a8d4fe --- /dev/null +++ b/dreamtalk/syncnet_python/detectors/README.md @@ -0,0 +1,3 @@ +# Face detector + +This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`. 
diff --git a/dreamtalk/syncnet_python/detectors/__init__.py b/dreamtalk/syncnet_python/detectors/__init__.py new file mode 100644 index 00000000..059d49bf --- /dev/null +++ b/dreamtalk/syncnet_python/detectors/__init__.py @@ -0,0 +1 @@ +from .s3fd import S3FD \ No newline at end of file diff --git a/dreamtalk/syncnet_python/detectors/s3fd/__init__.py b/dreamtalk/syncnet_python/detectors/s3fd/__init__.py new file mode 100644 index 00000000..d7f35e05 --- /dev/null +++ b/dreamtalk/syncnet_python/detectors/s3fd/__init__.py @@ -0,0 +1,61 @@ +import time +import numpy as np +import cv2 +import torch +from torchvision import transforms +from .nets import S3FDNet +from .box_utils import nms_ + +PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth' +img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + + +class S3FD(): + + def __init__(self, device='cuda'): + + tstamp = time.time() + self.device = device + + print('[S3FD] loading with', self.device) + self.net = S3FDNet(device=self.device).to(self.device) + state_dict = torch.load(PATH_WEIGHT, map_location=self.device) + self.net.load_state_dict(state_dict) + self.net.eval() + print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + + def detect_faces(self, image, conf_th=0.8, scales=[1]): + + w, h = image.shape[1], image.shape[0] + + bboxes = np.empty(shape=(0, 5)) + + with torch.no_grad(): + for s in scales: + scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) + + scaled_img = np.swapaxes(scaled_img, 1, 2) + scaled_img = np.swapaxes(scaled_img, 1, 0) + scaled_img = scaled_img[[2, 1, 0], :, :] + scaled_img = scaled_img.astype('float32') + scaled_img -= img_mean + scaled_img = scaled_img[[2, 1, 0], :, :] + x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device) + y = self.net(x) + + detections = y.data + scale = torch.Tensor([w, h, w, h]) + + for i in range(detections.size(1)): + j = 0 + while detections[0, i, j, 0] > conf_th: + score 
= detections[0, i, j, 0] + pt = (detections[0, i, j, 1:] * scale).cpu().numpy() + bbox = (pt[0], pt[1], pt[2], pt[3], score) + bboxes = np.vstack((bboxes, bbox)) + j += 1 + + keep = nms_(bboxes, 0.1) + bboxes = bboxes[keep] + + return bboxes diff --git a/dreamtalk/syncnet_python/detectors/s3fd/box_utils.py b/dreamtalk/syncnet_python/detectors/s3fd/box_utils.py new file mode 100644 index 00000000..7f71805f --- /dev/null +++ b/dreamtalk/syncnet_python/detectors/s3fd/box_utils.py @@ -0,0 +1,217 @@ +import numpy as np +from itertools import product as product +import torch +from torch.autograd import Function + + +def nms_(dets, thresh): + """ + Courtesy of Ross Girshick + [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(int(i)) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return np.array(keep).astype(np.int_) + + +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat(( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. + """ + + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + 
h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count + + +class Detect(object): + + def __init__(self, num_classes=2, + top_k=750, nms_thresh=0.3, conf_thresh=0.05, + variance=[0.1, 0.2], nms_top_k=5000): + + self.num_classes = num_classes + self.top_k = top_k + self.nms_thresh = nms_thresh + self.conf_thresh = conf_thresh + self.variance = variance + self.nms_top_k = nms_top_k + + def forward(self, loc_data, conf_data, prior_data): + + num = loc_data.size(0) + num_priors = prior_data.size(0) + + conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) + batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) + batch_priors = batch_priors.contiguous().view(-1, 4) + + decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance) + decoded_boxes = decoded_boxes.view(num, num_priors, 4) + + output = torch.zeros(num, self.num_classes, self.top_k, 5) + + for i in range(num): + boxes = decoded_boxes[i].clone() + conf_scores = conf_preds[i].clone() + + for cl in range(1, self.num_classes): + c_mask = conf_scores[cl].gt(self.conf_thresh) + scores = conf_scores[cl][c_mask] + + if scores.dim() == 0: + continue + l_mask = c_mask.unsqueeze(1).expand_as(boxes) + boxes_ = boxes[l_mask].view(-1, 4) + ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k) + count = count if count < self.top_k else self.top_k + + output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) + + return output + + +class PriorBox(object): + + def __init__(self, input_size, feature_maps, + 
variance=[0.1, 0.2], + min_sizes=[16, 32, 64, 128, 256, 512], + steps=[4, 8, 16, 32, 64, 128], + clip=False): + + super(PriorBox, self).__init__() + + self.imh = input_size[0] + self.imw = input_size[1] + self.feature_maps = feature_maps + + self.variance = variance + self.min_sizes = min_sizes + self.steps = steps + self.clip = clip + + def forward(self): + mean = [] + for k, fmap in enumerate(self.feature_maps): + feath = fmap[0] + featw = fmap[1] + for i, j in product(range(feath), range(featw)): + f_kw = self.imw / self.steps[k] + f_kh = self.imh / self.steps[k] + + cx = (j + 0.5) / f_kw + cy = (i + 0.5) / f_kh + + s_kw = self.min_sizes[k] / self.imw + s_kh = self.min_sizes[k] / self.imh + + mean += [cx, cy, s_kw, s_kh] + + output = torch.FloatTensor(mean).view(-1, 4) + + if self.clip: + output.clamp_(max=1, min=0) + + return output diff --git a/dreamtalk/syncnet_python/detectors/s3fd/nets.py b/dreamtalk/syncnet_python/detectors/s3fd/nets.py new file mode 100644 index 00000000..85b5c82c --- /dev/null +++ b/dreamtalk/syncnet_python/detectors/s3fd/nets.py @@ -0,0 +1,174 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from .box_utils import Detect, PriorBox + + +class L2Norm(nn.Module): + + def __init__(self, n_channels, scale): + super(L2Norm, self).__init__() + self.n_channels = n_channels + self.gamma = scale or None + self.eps = 1e-10 + self.weight = nn.Parameter(torch.Tensor(self.n_channels)) + self.reset_parameters() + + def reset_parameters(self): + init.constant_(self.weight, self.gamma) + + def forward(self, x): + norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps + x = torch.div(x, norm) + out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x + return out + + +class S3FDNet(nn.Module): + + def __init__(self, device='cuda'): + super(S3FDNet, self).__init__() + self.device = device + + self.vgg = nn.ModuleList([ + nn.Conv2d(3, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), 
+ nn.Conv2d(64, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(64, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(128, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2, ceil_mode=True), + + nn.Conv2d(256, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 1, 1), + nn.ReLU(inplace=True), + ]) + + self.L2Norm3_3 = L2Norm(256, 10) + self.L2Norm4_3 = L2Norm(512, 8) + self.L2Norm5_3 = L2Norm(512, 5) + + self.extras = nn.ModuleList([ + nn.Conv2d(1024, 256, 1, 1), + nn.Conv2d(256, 512, 3, 2, padding=1), + nn.Conv2d(512, 128, 1, 1), + nn.Conv2d(128, 256, 3, 2, padding=1), + ]) + + self.loc = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(1024, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(256, 4, 3, 1, padding=1), + ]) + + self.conf = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(1024, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(256, 2, 3, 1, padding=1), + ]) + + self.softmax = nn.Softmax(dim=-1) + self.detect = Detect() + + def forward(self, x): + size = x.size()[2:] + sources = list() + 
loc = list() + conf = list() + + for k in range(16): + x = self.vgg[k](x) + s = self.L2Norm3_3(x) + sources.append(s) + + for k in range(16, 23): + x = self.vgg[k](x) + s = self.L2Norm4_3(x) + sources.append(s) + + for k in range(23, 30): + x = self.vgg[k](x) + s = self.L2Norm5_3(x) + sources.append(s) + + for k in range(30, len(self.vgg)): + x = self.vgg[k](x) + sources.append(x) + + # apply extra layers and cache source layer outputs + for k, v in enumerate(self.extras): + x = F.relu(v(x), inplace=True) + if k % 2 == 1: + sources.append(x) + + # apply multibox head to source layers + loc_x = self.loc[0](sources[0]) + conf_x = self.conf[0](sources[0]) + + max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) + conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) + + loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) + conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) + + for i in range(1, len(sources)): + x = sources[i] + conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) + loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) + + features_maps = [] + for i in range(len(loc)): + feat = [] + feat += [loc[i].size(1), loc[i].size(2)] + features_maps += [feat] + + loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) + conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) + + with torch.no_grad(): + self.priorbox = PriorBox(size, features_maps) + self.priors = self.priorbox.forward() + + output = self.detect.forward( + loc.view(loc.size(0), -1, 4), + self.softmax(conf.view(conf.size(0), -1, 2)), + self.priors.type(type(x.data)).to(self.device) + ) + + return output diff --git a/dreamtalk/syncnet_python/download_model.sh b/dreamtalk/syncnet_python/download_model.sh new file mode 100644 index 00000000..3e3a9dc2 --- /dev/null +++ b/dreamtalk/syncnet_python/download_model.sh @@ -0,0 +1,9 @@ +# SyncNet model + +mkdir data +wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model 
+wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi + +# For the pre-processing pipeline +mkdir detectors/s3fd/weights +wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth \ No newline at end of file diff --git a/dreamtalk/syncnet_python/emotion/gt.png b/dreamtalk/syncnet_python/emotion/gt.png new file mode 100644 index 00000000..480e4c8b Binary files /dev/null and b/dreamtalk/syncnet_python/emotion/gt.png differ diff --git a/dreamtalk/syncnet_python/emotion/output/angry_256_random_15/frame_0059_eval.png b/dreamtalk/syncnet_python/emotion/output/angry_256_random_15/frame_0059_eval.png new file mode 100644 index 00000000..bbbf46b9 Binary files /dev/null and b/dreamtalk/syncnet_python/emotion/output/angry_256_random_15/frame_0059_eval.png differ diff --git a/dreamtalk/syncnet_python/emotion/output/angry_256_random_15/frame_0059_ref.png b/dreamtalk/syncnet_python/emotion/output/angry_256_random_15/frame_0059_ref.png new file mode 100644 index 00000000..945a0be9 Binary files /dev/null and b/dreamtalk/syncnet_python/emotion/output/angry_256_random_15/frame_0059_ref.png differ diff --git a/dreamtalk/syncnet_python/emotion/output/happy_256_random_15/frame_0066_eval.png b/dreamtalk/syncnet_python/emotion/output/happy_256_random_15/frame_0066_eval.png new file mode 100644 index 00000000..8f2e8633 Binary files /dev/null and b/dreamtalk/syncnet_python/emotion/output/happy_256_random_15/frame_0066_eval.png differ diff --git a/dreamtalk/syncnet_python/emotion/output/happy_256_random_15/frame_0066_ref.png b/dreamtalk/syncnet_python/emotion/output/happy_256_random_15/frame_0066_ref.png new file mode 100644 index 00000000..a9958fef Binary files /dev/null and b/dreamtalk/syncnet_python/emotion/output/happy_256_random_15/frame_0066_ref.png differ diff --git a/dreamtalk/syncnet_python/eval/data/demo.mp4 b/dreamtalk/syncnet_python/eval/data/demo.mp4 new file mode 100644 index 
00000000..cdfe7381 Binary files /dev/null and b/dreamtalk/syncnet_python/eval/data/demo.mp4 differ diff --git a/dreamtalk/syncnet_python/requirements.txt b/dreamtalk/syncnet_python/requirements.txt new file mode 100644 index 00000000..11fea2db --- /dev/null +++ b/dreamtalk/syncnet_python/requirements.txt @@ -0,0 +1,8 @@ +torch==2.5.1 +torchvision==0.20.1 +numpy==2.0.2 +scipy==1.13.1 +av==14.0.1 +scenedetect==0.6.0 +opencv-contrib-python==4.10.0.84 +python_speech_features==0.6 diff --git a/dreamtalk/syncnet_python/run_commands.sh b/dreamtalk/syncnet_python/run_commands.sh new file mode 100644 index 00000000..f52a795c --- /dev/null +++ b/dreamtalk/syncnet_python/run_commands.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# 执行第一个命令 +python run_pipeline.py --videofile eval/data/demo.mp4 --reference t1 --data_dir tmp + +# 执行第二个命令并追加输出到 all_scores.txt +python calculate_scores_real_videos.py --videofile eval/data/demo.mp4 --reference t1 --data_dir tmp >> all_scores.txt \ No newline at end of file diff --git a/dreamtalk/syncnet_python/run_pipeline.py b/dreamtalk/syncnet_python/run_pipeline.py new file mode 100644 index 00000000..f5fc22e0 --- /dev/null +++ b/dreamtalk/syncnet_python/run_pipeline.py @@ -0,0 +1,322 @@ +#!/usr/bin/python + +import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2 +import numpy as np +from shutil import rmtree + +import scenedetect +from scenedetect.video_manager import VideoManager +from scenedetect.scene_manager import SceneManager +from scenedetect.frame_timecode import FrameTimecode +from scenedetect.stats_manager import StatsManager +from scenedetect.detectors import ContentDetector + +from scipy.interpolate import interp1d +from scipy.io import wavfile +from scipy import signal + +from detectors import S3FD + +# ========== ========== ========== ========== +# # PARSE ARGS +# ========== ========== ========== ========== + +parser = argparse.ArgumentParser(description = "FaceTracker"); +parser.add_argument('--data_dir', type=str, 
default='data/work', help='Output directory'); +parser.add_argument('--videofile', type=str, default='', help='Input video file'); +parser.add_argument('--reference', type=str, default='', help='Video reference'); +parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection'); +parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box'); +parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration'); +parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); +parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped'); +parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels'); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) +setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) + +# ========== ========== ========== ========== +# # IOU FUNCTION +# ========== ========== ========== ========== + +def bb_intersection_over_union(boxA, boxB): + + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + + interArea = max(0, xB - xA) * max(0, yB - yA) + + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) + + iou = interArea / float(boxAArea + boxBArea - interArea) + + return iou + +# ========== ========== ========== ========== +# # FACE TRACKING +# ========== ========== ========== ========== + +def track_shot(opt,scenefaces): + + iouThres = 0.5 # Minimum IOU between consecutive face detections + tracks = [] + + while True: + track = [] + for framefaces in scenefaces: + for face in framefaces: + if
track == []: + track.append(face) + framefaces.remove(face) + elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det: + iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) + if iou > iouThres: + track.append(face) + framefaces.remove(face) + continue + else: + break + + if track == []: + break + elif len(track) > opt.min_track: + + framenum = np.array([ f['frame'] for f in track ]) + bboxes = np.array([np.array(f['bbox']) for f in track]) + + frame_i = np.arange(framenum[0],framenum[-1]+1) + + bboxes_i = [] + for ij in range(0,4): + interpfn = interp1d(framenum, bboxes[:,ij]) + bboxes_i.append(interpfn(frame_i)) + bboxes_i = np.stack(bboxes_i, axis=1) + + if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size: + tracks.append({'frame':frame_i,'bbox':bboxes_i}) + + return tracks + +# ========== ========== ========== ========== +# # VIDEO CROP AND SAVE +# ========== ========== ========== ========== + +def crop_video(opt,track,cropfile): + + flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) + flist.sort() + + fourcc = cv2.VideoWriter_fourcc(*'XVID') + vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224)) + + dets = {'x':[], 'y':[], 's':[]} + + for det in track['bbox']: + + dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center y + dets['x'].append((det[0]+det[2])/2) # crop center x + + # Smooth detections + dets['s'] = signal.medfilt(dets['s'],kernel_size=13) + dets['x'] = signal.medfilt(dets['x'],kernel_size=13) + dets['y'] = signal.medfilt(dets['y'],kernel_size=13) + + for fidx, frame in enumerate(track['frame']): + + cs = opt.crop_scale + + bs = dets['s'][fidx] # Detection box size + bsi = int(bs*(1+2*cs)) # Pad videos by this amount + + image = cv2.imread(flist[frame]) + + frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110)) + my = dets['y'][fidx]+bsi # BBox center Y
+ mx = dets['x'][fidx]+bsi # BBox center X + + face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + + vOut.write(cv2.resize(face,(224,224))) + + audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav') + audiostart = (track['frame'][0])/opt.frame_rate + audioend = (track['frame'][-1]+1)/opt.frame_rate + + vOut.release() + + # ========== CROP AUDIO FILE ========== + + command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp)) + output = subprocess.call(command, shell=True, stdout=None) + + if output != 0: + pdb.set_trace() + + sample_rate, audio = wavfile.read(audiotmp) + + # ========== COMBINE AUDIO AND VIDEO FILES ========== + + command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile)) + output = subprocess.call(command, shell=True, stdout=None) + + if output != 0: + pdb.set_trace() + + print('Written %s'%cropfile) + + os.remove(cropfile+'t.avi') + + print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s']))) + + return {'track':track, 'proc_track':dets} + +# ========== ========== ========== ========== +# # FACE DETECTION +# ========== ========== ========== ========== + +def inference_video(opt): + + DET = S3FD(device='cuda') + + flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) + flist.sort() + + dets = [] + + for fidx, fname in enumerate(flist): + + start_time = time.time() + + image = cv2.imread(fname) + + image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale]) + + dets.append([]); + for bbox in bboxes: + dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) + + elapsed_time = time.time() - start_time + + print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time))) + + savepath = 
os.path.join(opt.work_dir,opt.reference,'faces.pckl') + + with open(savepath, 'wb') as fil: + pickle.dump(dets, fil) + + return dets + +# ========== ========== ========== ========== +# # SCENE DETECTION +# ========== ========== ========== ========== + +def scene_detect(opt): + + video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')]) + stats_manager = StatsManager() + scene_manager = SceneManager(stats_manager) + # Add ContentDetector algorithm (constructor takes detector options like threshold). + scene_manager.add_detector(ContentDetector()) + base_timecode = video_manager.get_base_timecode() + + video_manager.set_downscale_factor() + + video_manager.start() + + scene_manager.detect_scenes(frame_source=video_manager) + + scene_list = scene_manager.get_scene_list(base_timecode) + + savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl') + + if scene_list == []: + scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())] + + with open(savepath, 'wb') as fil: + pickle.dump(scene_list, fil) + + print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list))) + + return scene_list + + +# ========== ========== ========== ========== +# # EXECUTE DEMO +# ========== ========== ========== ========== + +# ========== DELETE EXISTING DIRECTORIES ========== + +if os.path.exists(os.path.join(opt.work_dir,opt.reference)): + rmtree(os.path.join(opt.work_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.crop_dir,opt.reference)): + rmtree(os.path.join(opt.crop_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.avi_dir,opt.reference)): + rmtree(os.path.join(opt.avi_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.frames_dir,opt.reference)): + rmtree(os.path.join(opt.frames_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + +# ========== MAKE NEW DIRECTORIES ========== + 
+os.makedirs(os.path.join(opt.work_dir,opt.reference)) +os.makedirs(os.path.join(opt.crop_dir,opt.reference)) +os.makedirs(os.path.join(opt.avi_dir,opt.reference)) +os.makedirs(os.path.join(opt.frames_dir,opt.reference)) +os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + +# ========== CONVERT VIDEO AND EXTRACT FRAMES ========== + +command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi'))) +output = subprocess.call(command, shell=True, stdout=None) + +command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg'))) +output = subprocess.call(command, shell=True, stdout=None) + +command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'))) +output = subprocess.call(command, shell=True, stdout=None) + +# ========== FACE DETECTION ========== + +faces = inference_video(opt) + +# ========== SCENE DETECTION ========== + +scene = scene_detect(opt) + +# ========== FACE TRACKING ========== + +alltracks = [] +vidtracks = [] + +for shot in scene: + + if shot[1].frame_num - shot[0].frame_num >= opt.min_track : + alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num])) + +# ========== FACE TRACK CROP ========== + +for ii, track in enumerate(alltracks): + vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii))) + +# ========== SAVE RESULTS ========== + +savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl') + +with open(savepath, 'wb') as fil: + pickle.dump(vidtracks, fil) + +rmtree(os.path.join(opt.tmp_dir,opt.reference)) diff --git a/dreamtalk/syncnet_python/run_syncnet.py b/dreamtalk/syncnet_python/run_syncnet.py new file mode 100644 index 00000000..45099fd6 --- /dev/null +++ 
b/dreamtalk/syncnet_python/run_syncnet.py @@ -0,0 +1,45 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess, pickle, os, gzip, glob + +from SyncNetInstance import * + +# ==================== PARSE ARGUMENT ==================== + +parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_dir', type=str, default='data/work', help=''); +parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) + + +# ==================== LOAD MODEL AND FILE LIST ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) +flist.sort() + +# ==================== GET OFFSETS ==================== + +dists = [] +for idx, fname in enumerate(flist): + offset, conf, dist = s.evaluate(opt,videofile=fname) + dists.append(dist) + +# ==================== PRINT RESULTS TO FILE ==================== + +with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: + pickle.dump(dists, fil) diff --git a/dreamtalk/syncnet_python/run_visualise.py b/dreamtalk/syncnet_python/run_visualise.py new file mode 100644 index 00000000..85d89253 --- /dev/null +++ b/dreamtalk/syncnet_python/run_visualise.py @@ -0,0 +1,88 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import torch +import numpy +import time, pdb, 
argparse, subprocess, pickle, os, glob +import cv2 + +from scipy import signal + +# ==================== PARSE ARGUMENT ==================== + +parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--data_dir', type=str, default='data/work', help=''); +parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) +setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) + +# ==================== LOAD FILES ==================== + +with open(os.path.join(opt.work_dir,opt.reference,'tracks.pckl'), 'rb') as fil: + tracks = pickle.load(fil, encoding='latin1') + +with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'rb') as fil: + dists = pickle.load(fil, encoding='latin1') + +flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) +flist.sort() + +# ==================== SMOOTH FACES ==================== + +faces = [[] for i in range(len(flist))] + +for tidx, track in enumerate(tracks): + + mean_dists = numpy.mean(numpy.stack(dists[tidx],1),1) + minidx = numpy.argmin(mean_dists,0) + minval = mean_dists[minidx] + + fdist = numpy.stack([dist[minidx] for dist in dists[tidx]]) + fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=10) + + fconf = numpy.median(mean_dists) - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + for fidx, frame in enumerate(track['track']['frame'].tolist()) : + faces[frame].append({'track': tidx, 'conf':fconfm[fidx], 's':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) + +# ==================== ADD 
DETECTIONS TO VIDEO ==================== + +first_image = cv2.imread(flist[0]) + +fw = first_image.shape[1] +fh = first_image.shape[0] + +fourcc = cv2.VideoWriter_fourcc(*'XVID') +vOut = cv2.VideoWriter(os.path.join(opt.avi_dir,opt.reference,'video_only.avi'), fourcc, opt.frame_rate, (fw,fh)) + +for fidx, fname in enumerate(flist): + + image = cv2.imread(fname) + + for face in faces[fidx]: + + clr = max(min(face['conf']*25,255),0) + + cv2.rectangle(image,(int(face['x']-face['s']),int(face['y']-face['s'])),(int(face['x']+face['s']),int(face['y']+face['s'])),(0,clr,255-clr),3) + cv2.putText(image,'Track %d, Conf %.3f'%(face['track'],face['conf']), (int(face['x']-face['s']),int(face['y']-face['s'])),cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,255,255),2) + + vOut.write(image) + + print('Frame %d'%fidx) + +vOut.release() + +# ========== COMBINE AUDIO AND VIDEO FILES ========== + +command = ("ffmpeg -y -i %s -i %s -c:v copy -c:a copy %s" % (os.path.join(opt.avi_dir,opt.reference,'video_only.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'),os.path.join(opt.avi_dir,opt.reference,'video_out.avi'))) #-async 1 +output = subprocess.call(command, shell=True, stdout=None) + +