diff --git a/AniTalker-kit/AniTalker-judge/Dockerfile b/AniTalker-kit/AniTalker-judge/Dockerfile new file mode 100644 index 00000000..7d75f3c6 --- /dev/null +++ b/AniTalker-kit/AniTalker-judge/Dockerfile @@ -0,0 +1,46 @@ +# 使用nvidia的CUDA基础镜像 +FROM nvidia/cuda:11.7.1-runtime-ubuntu22.04 + +# 设置工作目录 +WORKDIR /app + +# 安装依赖,同时设置非交互式前端和时区 +RUN apt-get update && apt-get install -y \ + software-properties-common \ + build-essential \ + libgl1-mesa-glx \ + && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y \ + python3.9 \ + python3.9-distutils \ + python3-pip \ + && rm -rf /var/lib/apt/lists/* + +# 升级pip到最新版本,确保使用的是python3.9的pip +RUN python3.9 -m pip install --upgrade pip + +# 设置Python版本为3.9 +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1 + + +# 安装Python依赖 +COPY requirements.txt /app/requirements.txt +RUN pip install --no-cache-dir -r requirements.txt + + +# 将Python脚本复制到容器中 +COPY PSNR_SSIM_FID_NIQE.py /app/PSNR_SSIM_FID_NIQE.py + +COPY inception_v3_google-0cc3c7bd.pth /app/inception_v3_google-0cc3c7bd.pth +COPY niqe_modelparameters.mat /root/.cache/torch/hub/pyiqa/niqe_modelparameters.mat + +# 使脚本可执行 +RUN chmod +x /app/PSNR_SSIM_FID_NIQE.py + +# 设置CUDA_HOME环境变量 +ENV CUDA_HOME=/usr/local/cuda + +# 运行脚本 +ENTRYPOINT ["python3.9", "/app/PSNR_SSIM_FID_NIQE.py"] + diff --git a/AniTalker-kit/AniTalker-judge/PSNR_SSIM_FID_NIQE.ipynb b/AniTalker-kit/AniTalker-judge/PSNR_SSIM_FID_NIQE.ipynb new file mode 100644 index 00000000..71f23840 --- /dev/null +++ b/AniTalker-kit/AniTalker-judge/PSNR_SSIM_FID_NIQE.ipynb @@ -0,0 +1,476 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "f:\\Anaconda\\envs\\AniTalker_test\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. 
Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Video Name PSNR Score SSIM Score FID Score NIQE Score \n", + "May.mp4 29.772568 0.652101 42.338212 5.151919 \n", + "Jae-in.mp4 29.750509 0.630766 89.133744 6.13143 \n", + "Lieu.mp4 30.694761 0.759348 30.025657 5.937247 \n", + "Macron.mp4 30.332479 0.733842 28.131079 5.964817 \n", + "Obama.mp4 30.766657 0.758716 37.66468 7.44125 \n", + "Obama1.mp4 29.598956 0.68332 47.335445 6.626041 \n", + "Obama2.mp4 30.605588 0.733553 30.716751 6.35921 \n", + "Shaheen.mp4 30.629148 0.748107 19.622528 5.441265 \n" + ] + } + ], + "source": [ + "import imageio\n", + "import torch\n", + "from pytorch_msssim import ssim\n", + "import numpy as np\n", + "from PIL import Image\n", + "import cv2\n", + "import pyiqa\n", + "from torchvision.models import inception_v3\n", + "from torchvision import transforms\n", + "import scipy.linalg as linalg\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# 视频路径\n", + "base_path = 'videos'\n", + "raw_videos_path = f'{base_path}/raw'\n", + "synthetic_videos_path = f'{base_path}/synthetic'\n", + "\n", + "# 视频列表\n", + "video_names = [\n", + " 'May.mp4', 'Jae-in.mp4', 'Lieu.mp4', 'Macron.mp4', 'Obama.mp4', \n", + " 'Obama1.mp4', 'Obama2.mp4', 'Shaheen.mp4'\n", + "]\n", + "\n", + "# 加载视频帧\n", + "def load_video_frames(video_path):\n", + " reader = imageio.get_reader(video_path)\n", + " frames = [frame for frame in reader]\n", + " reader.close() # 关闭reader释放资源\n", + " return frames\n", + "\n", + "# 调整视频尺寸\n", + "def resize_video(generated_frames, standard_size):\n", + " resized_generated_frames = [cv2.resize(frame, (standard_size[1], standard_size[0])) for frame in generated_frames]\n", + " return resized_generated_frames\n", + "\n", + "# 计算PSNR\n", + "def calculate_psnr(standard_frames, 
generated_frames):\n", + " psnr_values = []\n", + " for i in range(len(standard_frames)):\n", + " mse = np.mean((standard_frames[i] - generated_frames[i]) ** 2)\n", + " if mse == 0:\n", + " psnr = float('inf') # 对于完全相同的图像,PSNR为无穷大\n", + " else:\n", + " max_pixel = 255.0\n", + " psnr = 20 * np.log10(max_pixel / np.sqrt(mse))\n", + " psnr_values.append(psnr)\n", + " return np.mean(psnr_values)\n", + "\n", + "# 计算SSIM\n", + "def calculate_ssim(standard_frames, generated_frames):\n", + " ssim_values = []\n", + " for i in range(len(standard_frames)):\n", + " # 规范化帧数据到[0, 1]并转换为float32\n", + " standard_frame = torch.tensor(standard_frames[i]).permute(2, 0, 1).unsqueeze(0).float() / 255.0\n", + " generated_frame = torch.tensor(generated_frames[i]).permute(2, 0, 1).unsqueeze(0).float() / 255.0\n", + " ssim_val = ssim(standard_frame, generated_frame, data_range=1.0, size_average=True) # 正确的data_range\n", + " ssim_values.append(ssim_val.item())\n", + " return np.mean(ssim_values)\n", + "\n", + "# 检查是否有可用的 GPU\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "# 加载预训练的Inception模型\n", + "inception_model = inception_v3(pretrained=True).to(device).eval()\n", + "\n", + "# 定义一个函数来提取视频帧的特征,使用批量处理\n", + "def extract_features(video_path, model, device, batch_size=16):\n", + " frames = []\n", + " cap = cv2.VideoCapture(video_path)\n", + " batch = []\n", + " \n", + " while True:\n", + " ret, frame = cap.read()\n", + " if not ret:\n", + " break\n", + " \n", + " img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n", + " img = transforms.ToTensor()(img).unsqueeze(0).to(device)\n", + " batch.append(img)\n", + " \n", + " if len(batch) == batch_size:\n", + " with torch.no_grad():\n", + " batch_tensor = torch.cat(batch, dim=0)\n", + " features = model(batch_tensor)\n", + " frames.extend(features.cpu().numpy())\n", + " batch = [] # 重置批量\n", + " \n", + " # 处理剩余的帧\n", + " if batch:\n", + " with torch.no_grad():\n", + " batch_tensor = 
torch.cat(batch, dim=0)\n", + " features = model(batch_tensor)\n", + " frames.extend(features.cpu().numpy())\n", + " \n", + " cap.release() # 释放视频捕获资源\n", + " return frames\n", + "\n", + "# 计算每个视频特征的均值和协方差\n", + "def calculate_statistics(features):\n", + " mean = np.mean(features, axis=0)\n", + " cov = np.cov(features, rowvar=False)\n", + " return mean, cov\n", + "\n", + "# 使用FID公式计算两个视频特征分布之间的FID分数\n", + "def calculate_fid(standard_mean, standard_cov, generated_mean, generated_cov):\n", + " # 计算均值差\n", + " mean_diff = standard_mean - generated_mean\n", + " # 计算协方差矩阵的平方根\n", + " cov_sqrt = linalg.sqrtm(standard_cov @ generated_cov)\n", + " \n", + " # 计算FID\n", + " fid = np.sum(mean_diff ** 2) + np.trace(standard_cov + generated_cov - 2 * cov_sqrt)\n", + " \n", + " # 检查是否出现NaN或负值\n", + " if np.isnan(fid) or fid < 0:\n", + " fid = 0 # 或者返回一个合理的默认值\n", + " return fid\n", + "\n", + "# 计算NIQE得分\n", + "def calculate_niqe_score(frames):\n", + " niqe_metric = pyiqa.create_metric('niqe')\n", + " scores = []\n", + " for frame in frames:\n", + " frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).unsqueeze(0).float() / 255.0 # 规范化到[0, 1]\n", + " score = niqe_metric(frame_tensor)\n", + " scores.append(score)\n", + " del niqe_metric # 删除NIQE指标以释放资源\n", + " return torch.tensor(scores).mean().item()\n", + "\n", + "# 输出指标名作为表头,设置列宽\n", + "header = \"Video Name\".ljust(20) + \"PSNR Score\".ljust(12) + \"SSIM Score\".ljust(12) + \"FID Score\".ljust(12) + \"NIQE Score\".ljust(12)\n", + "print(header)\n", + "\n", + "# 循环处理每个视频\n", + "for video_name in video_names:\n", + " path_standard_video = f'{raw_videos_path}/{video_name}'\n", + " path_generated_video = f'{synthetic_videos_path}/{video_name}'\n", + " \n", + " standard_frames = load_video_frames(path_standard_video)\n", + " generated_frames = load_video_frames(path_generated_video)\n", + " \n", + " # 检查尺寸是否一致,不一致则调整尺寸\n", + " if generated_frames[0].shape != standard_frames[0].shape:\n", + " generated_frames = 
resize_video(generated_frames, standard_frames[0].shape)\n", + " \n", + " psnr_score = round(calculate_psnr(standard_frames, generated_frames), 6)\n", + " ssim_score = round(calculate_ssim(standard_frames, generated_frames), 6)\n", + " \n", + " standard_features = extract_features(path_standard_video, inception_model, device)\n", + " generated_features = extract_features(path_generated_video, inception_model, device)\n", + " \n", + " standard_mean, standard_cov = calculate_statistics(standard_features)\n", + " generated_mean, generated_cov = calculate_statistics(generated_features)\n", + " \n", + " fid_score = round(calculate_fid(standard_mean, standard_cov, generated_mean, generated_cov), 6)\n", + " generated_niqe = round(calculate_niqe_score(generated_frames), 6)\n", + " \n", + " # 格式化输出,确保列对齐\n", + " row = f\"{video_name.ljust(20)}{psnr_score:<12}{ssim_score:<12}{fid_score:<12}{generated_niqe:<12}\"\n", + " print(row)\n", + " \n", + " # 释放不再需要的资源\n", + " del standard_frames, generated_frames, standard_features, generated_features\n", + " torch.cuda.empty_cache() # 释放GPU缓存\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\86188\\AppData\\Local\\Temp\\ipykernel_33716\\1210497081.py:29: DeprecationWarning: __array_wrap__ must accept context and return_scalar arguments (positionally) in the future. 
(Deprecated NumPy 2.0)\n", + " mse = np.mean((standard_frames[i] - generated_frames[i]) ** 2)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PSNR Score: 29.772568152345716\n", + "SSIM Score: 0.6521003624369437\n" + ] + }, + { + "ename": "TypeError", + "evalue": "a bytes-like object is required, not 'Image'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[1], line 63\u001b[0m\n\u001b[0;32m 60\u001b[0m fid_value \u001b[38;5;241m=\u001b[39m calculate_fid(standard_images, generated_images)\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fid_value\n\u001b[1;32m---> 63\u001b[0m fid_score \u001b[38;5;241m=\u001b[39m \u001b[43mcalculate_fid\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstandard_frames\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgenerated_frames\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 65\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFID Score: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfid_score\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 67\u001b[0m \u001b[38;5;66;03m# 计算NIQE\u001b[39;00m\n", + "Cell \u001b[1;32mIn[1], line 60\u001b[0m, in \u001b[0;36mcalculate_fid\u001b[1;34m(standard_frames, generated_frames)\u001b[0m\n\u001b[0;32m 58\u001b[0m standard_images \u001b[38;5;241m=\u001b[39m [Image\u001b[38;5;241m.\u001b[39mfromarray(frame) \u001b[38;5;28;01mfor\u001b[39;00m frame \u001b[38;5;129;01min\u001b[39;00m standard_frames]\n\u001b[0;32m 59\u001b[0m generated_images \u001b[38;5;241m=\u001b[39m [Image\u001b[38;5;241m.\u001b[39mfromarray(frame) \u001b[38;5;28;01mfor\u001b[39;00m frame \u001b[38;5;129;01min\u001b[39;00m generated_frames]\n\u001b[1;32m---> 60\u001b[0m fid_value \u001b[38;5;241m=\u001b[39m 
\u001b[43mcalculate_fid\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstandard_images\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgenerated_images\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 61\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m fid_value\n", + "Cell \u001b[1;32mIn[1], line 58\u001b[0m, in \u001b[0;36mcalculate_fid\u001b[1;34m(standard_frames, generated_frames)\u001b[0m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcalculate_fid\u001b[39m(standard_frames, generated_frames):\n\u001b[1;32m---> 58\u001b[0m standard_images \u001b[38;5;241m=\u001b[39m [Image\u001b[38;5;241m.\u001b[39mfromarray(frame) \u001b[38;5;28;01mfor\u001b[39;00m frame \u001b[38;5;129;01min\u001b[39;00m standard_frames]\n\u001b[0;32m 59\u001b[0m generated_images \u001b[38;5;241m=\u001b[39m [Image\u001b[38;5;241m.\u001b[39mfromarray(frame) \u001b[38;5;28;01mfor\u001b[39;00m frame \u001b[38;5;129;01min\u001b[39;00m generated_frames]\n\u001b[0;32m 60\u001b[0m fid_value \u001b[38;5;241m=\u001b[39m calculate_fid(standard_images, generated_images)\n", + "Cell \u001b[1;32mIn[1], line 58\u001b[0m, in \u001b[0;36m\u001b[1;34m(.0)\u001b[0m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcalculate_fid\u001b[39m(standard_frames, generated_frames):\n\u001b[1;32m---> 58\u001b[0m standard_images \u001b[38;5;241m=\u001b[39m [\u001b[43mImage\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfromarray\u001b[49m\u001b[43m(\u001b[49m\u001b[43mframe\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m frame \u001b[38;5;129;01min\u001b[39;00m standard_frames]\n\u001b[0;32m 59\u001b[0m generated_images \u001b[38;5;241m=\u001b[39m [Image\u001b[38;5;241m.\u001b[39mfromarray(frame) \u001b[38;5;28;01mfor\u001b[39;00m frame \u001b[38;5;129;01min\u001b[39;00m generated_frames]\n\u001b[0;32m 60\u001b[0m fid_value \u001b[38;5;241m=\u001b[39m calculate_fid(standard_images, generated_images)\n", + "File 
\u001b[1;32mf:\\Anaconda\\envs\\AniTalker_judge\\lib\\site-packages\\PIL\\Image.py:3342\u001b[0m, in \u001b[0;36mfromarray\u001b[1;34m(obj, mode)\u001b[0m\n\u001b[0;32m 3339\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstrides\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m requires either tobytes() or tostring()\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 3340\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(msg)\n\u001b[1;32m-> 3342\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfrombuffer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mraw\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrawmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mf:\\Anaconda\\envs\\AniTalker_judge\\lib\\site-packages\\PIL\\Image.py:3244\u001b[0m, in \u001b[0;36mfrombuffer\u001b[1;34m(mode, size, data, decoder_name, *args)\u001b[0m\n\u001b[0;32m 3241\u001b[0m im\u001b[38;5;241m.\u001b[39mreadonly \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m 3242\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m im\n\u001b[1;32m-> 3244\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfrombytes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msize\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mf:\\Anaconda\\envs\\AniTalker_judge\\lib\\site-packages\\PIL\\Image.py:3181\u001b[0m, in \u001b[0;36mfrombytes\u001b[1;34m(mode, size, data, decoder_name, *args)\u001b[0m\n\u001b[0;32m 3178\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m decoder_name \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraw\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m decoder_args \u001b[38;5;241m==\u001b[39m ():\n\u001b[0;32m 3179\u001b[0m decoder_args \u001b[38;5;241m=\u001b[39m mode\n\u001b[1;32m-> 3181\u001b[0m \u001b[43mim\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrombytes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecoder_args\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3182\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m im\n", + "File \u001b[1;32mf:\\Anaconda\\envs\\AniTalker_judge\\lib\\site-packages\\PIL\\Image.py:882\u001b[0m, in \u001b[0;36mImage.frombytes\u001b[1;34m(self, data, decoder_name, *args)\u001b[0m\n\u001b[0;32m 880\u001b[0m d \u001b[38;5;241m=\u001b[39m _getdecoder(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmode, decoder_name, decoder_args)\n\u001b[0;32m 881\u001b[0m d\u001b[38;5;241m.\u001b[39msetimage(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mim)\n\u001b[1;32m--> 882\u001b[0m s \u001b[38;5;241m=\u001b[39m \u001b[43md\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 884\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m s[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 885\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnot enough image 
data\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[1;31mTypeError\u001b[0m: a bytes-like object is required, not 'Image'" + ] + } + ], + "source": [ + "import imageio\n", + "import torch\n", + "from pytorch_msssim import ssim\n", + "import numpy as np\n", + "\n", + "# 视频路径\n", + "path_standard_video = 'data-20241209T141743Z-001/data/raw/videos/May.mp4'\n", + "path_generated_video = 'data-20241209T141743Z-001/data/synthetic/videos/May.mp4'\n", + "\n", + "# 加载视频\n", + "def load_video_frames(video_path):\n", + " reader = imageio.get_reader(video_path)\n", + " frames = []\n", + " for frame in reader:\n", + " frames.append(frame)\n", + " return frames\n", + "\n", + "standard_frames = load_video_frames(path_standard_video)\n", + "generated_frames = load_video_frames(path_generated_video)\n", + "\n", + "# 计算PSNR\n", + "def calculate_psnr(standard_frames, generated_frames):\n", + " psnr_values = []\n", + " for i in range(len(standard_frames)):\n", + " mse = np.mean((standard_frames[i] - generated_frames[i]) ** 2)\n", + " if mse == 0:\n", + " psnr = 100\n", + " else:\n", + " max_pixel = 255.0\n", + " psnr = 20 * np.log10(max_pixel / np.sqrt(mse))\n", + " psnr_values.append(psnr)\n", + " return np.mean(psnr_values)\n", + "\n", + "psnr_score = calculate_psnr(standard_frames, generated_frames)\n", + "\n", + "print(f\"PSNR Score: {psnr_score}\")\n", + "\n", + "# 计算SSIM\n", + "def calculate_ssim(standard_frames, generated_frames):\n", + " ssim_values = []\n", + " for i in range(len(standard_frames)):\n", + " standard_frame = torch.tensor(standard_frames[i]).permute(2, 0, 1).unsqueeze(0).float()\n", + " generated_frame = torch.tensor(generated_frames[i]).permute(2, 0, 1).unsqueeze(0).float()\n", + " ssim_val = ssim(standard_frame, generated_frame, data_range=255, size_average=True)\n", + " ssim_values.append(ssim_val.item())\n", + " return np.mean(ssim_values)\n", + "\n", + "ssim_score = calculate_ssim(standard_frames, generated_frames)\n", + "\n", + "print(f\"SSIM Score: 
{ssim_score}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FID score: 52.521025925726924\n" + ] + } + ], + "source": [ + "import torch\n", + "from torchvision.models import inception_v3\n", + "from torchvision import transforms\n", + "from PIL import Image\n", + "import numpy as np\n", + "from torch.utils.data import Dataset, DataLoader\n", + "import os\n", + "import cv2 \n", + "\n", + "# 检查是否有可用的 GPU\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "# 视频路径\n", + "path_standard_video = 'data-20241209T141743Z-001/data/raw/videos/May.mp4'\n", + "path_generated_video = 'data-20241209T141743Z-001/data/synthetic/videos/May.mp4'\n", + "\n", + "# 加载预训练的Inception模型\n", + "inception_model = inception_v3(pretrained=True).to(device).eval()\n", + "\n", + "# 定义一个函数来提取视频帧的特征\n", + "def extract_features(video_path, model, device):\n", + " # 视频帧提取和预处理\n", + " frames = []\n", + " cap = cv2.VideoCapture(video_path)\n", + " while True:\n", + " ret, frame = cap.read()\n", + " if not ret:\n", + " break\n", + " img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))\n", + " img = transforms.ToTensor()(img).unsqueeze(0).to(device)\n", + " with torch.no_grad():\n", + " features = model(img)\n", + " frames.append(features.squeeze(0).cpu().numpy())\n", + " cap.release()\n", + " return frames\n", + "\n", + "# 从每个视频中提取帧并计算特征\n", + "standard_frames = extract_features(path_standard_video, inception_model, device)\n", + "generated_frames = extract_features(path_generated_video, inception_model, device)\n", + "\n", + "# 计算每个视频特征的均值和协方差\n", + "def calculate_statistics(features):\n", + " mean = np.mean(features, axis=0)\n", + " cov = np.cov(features, rowvar=False)\n", + " return mean, cov\n", + "\n", + 
"standard_mean, standard_cov = calculate_statistics(standard_frames)\n", + "generated_mean, generated_cov = calculate_statistics(generated_frames)\n", + "\n", + "# 使用FID公式计算两个视频特征分布之间的FID分数\n", + "def calculate_fid(standard_mean, standard_cov, generated_mean, generated_cov):\n", + " # 计算均值差异\n", + " mean_diff = standard_mean - generated_mean\n", + " # 计算协方差矩阵差异\n", + " cov_diff = standard_cov + generated_cov - 2 * np.dot(standard_cov, generated_cov)\n", + " # 计算FID\n", + " fid = np.dot(mean_diff, mean_diff) + np.trace(cov_diff)\n", + " return fid\n", + "\n", + "fid_score = calculate_fid(standard_mean, standard_cov, generated_mean, generated_cov)\n", + "print(f'FID score: {fid_score}')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 6074/6074 [08:01<00:00, 12.62it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated Video NIQE Score: 5.151919020330636\n" + ] + } + ], + "source": [ + "import cv2\n", + "import torch\n", + "import pyiqa\n", + "import numpy\n", + "from tqdm import tqdm\n", + "\n", + "# 设定视频路径\n", + "standard_video_path = 'data-20241209T141743Z-001/data/raw/videos/May.mp4'\n", + "generated_video_path = 'data-20241209T141743Z-001/data/synthetic/videos/May.mp4'\n", + "\n", + "# 读取视频\n", + "def read_video(video_path):\n", + " cap = cv2.VideoCapture(video_path)\n", + " frames = []\n", + " while True:\n", + " ret, frame = cap.read()\n", + " if not ret:\n", + " break\n", + " frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n", + " frames.append(frame)\n", + " cap.release()\n", + " return frames\n", + "\n", + "# 计算NIQE得分\n", + "def calculate_niqe_score(frames):\n", + " niqe_metric = pyiqa.create_metric('niqe')\n", + " scores = []\n", + " for frame in tqdm(frames):\n", + " # 将帧转换为张量\n", + " frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).unsqueeze(0).float() / 255.0\n", + " # 计算NIQE得分\n", + " 
score = niqe_metric(frame_tensor)\n", + " scores.append(score)\n", + " return torch.tensor(scores).mean().item()\n", + "\n", + "\n", + "# 主函数\n", + "def main():\n", + " # 读取视频帧\n", + " #standard_frames = read_video(standard_video_path)\n", + " generated_frames = read_video(generated_video_path)\n", + "\n", + " # 计算NIQE得分\n", + " #standard_niqe = calculate_niqe_score(standard_frames)\n", + " generated_niqe = calculate_niqe_score(generated_frames)\n", + "\n", + " # 输出评价得分\n", + " #print(f\"Standard Video NIQE Score: {standard_niqe}\")\n", + " print(f\"Generated Video NIQE Score: {generated_niqe}\")\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "AniTalker_test", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.21" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/AniTalker-kit/AniTalker-judge/PSNR_SSIM_FID_NIQE.py b/AniTalker-kit/AniTalker-judge/PSNR_SSIM_FID_NIQE.py new file mode 100644 index 00000000..cfe36190 --- /dev/null +++ b/AniTalker-kit/AniTalker-judge/PSNR_SSIM_FID_NIQE.py @@ -0,0 +1,199 @@ +import imageio +import torch +from pytorch_msssim import ssim +import numpy as np +from PIL import Image +import cv2 +import pyiqa +from torchvision.models import inception_v3 +from torchvision import transforms +import scipy.linalg as linalg +import warnings +import argparse +import os + +# 忽略警告 +warnings.filterwarnings('ignore') + +# 加载视频帧 +def load_video_frames(video_path): + reader = imageio.get_reader(video_path) + frames = [frame for frame in reader] + reader.close() # 关闭reader释放资源 + return frames + +def resize_video(generated_frames, standard_size): + resized_generated_frames = [cv2.resize(frame, (standard_size[1], 
# ---------------------------------------------------------------------------
# Video quality metrics: PSNR, SSIM, FID, NIQE.
#
# Heavy optional dependencies (imageio, cv2, PIL, pytorch_msssim, pyiqa,
# torchvision) are imported lazily inside the functions that need them, so
# the pure-numpy/scipy metrics remain usable without the full stack.
# ---------------------------------------------------------------------------
import argparse
import os
import warnings

import numpy as np
import scipy.linalg as linalg
import torch

warnings.filterwarnings('ignore')

# Set by main() after loading the weights; evaluate_video() falls back to
# this when no model is passed explicitly.
inception_model = None


def load_video_frames(video_path):
    """Decode every frame of *video_path* into a list of HxWxC uint8 arrays."""
    import imageio
    reader = imageio.get_reader(video_path)
    try:
        return [frame for frame in reader]
    finally:
        reader.close()  # always release the decoder, even on error


def resize_video(generated_frames, standard_size):
    """Resize each generated frame to the reference shape (H, W, ...)."""
    import cv2
    # cv2.resize expects (width, height); standard_size is (H, W, C).
    return [cv2.resize(frame, (standard_size[1], standard_size[0]))
            for frame in generated_frames]


def calculate_psnr(standard_frames, generated_frames):
    """Mean PSNR in dB over paired frames (inf for identical frames).

    Frames are assumed to be 8-bit images (max pixel value 255).
    """
    psnr_values = []
    for ref, gen in zip(standard_frames, generated_frames):
        # Cast to float BEFORE subtracting: uint8 arithmetic wraps modulo
        # 256, which silently corrupts the MSE for large pixel differences.
        diff = ref.astype(np.float64) - gen.astype(np.float64)
        mse = np.mean(diff ** 2)
        if mse == 0:
            psnr_values.append(float('inf'))  # identical frames
        else:
            psnr_values.append(20 * np.log10(255.0 / np.sqrt(mse)))
    return np.mean(psnr_values)


def calculate_ssim(standard_frames, generated_frames):
    """Mean SSIM over paired frames, computed on [0, 1]-normalized tensors."""
    from pytorch_msssim import ssim
    ssim_values = []
    for ref, gen in zip(standard_frames, generated_frames):
        ref_t = torch.tensor(ref).permute(2, 0, 1).unsqueeze(0).float() / 255.0
        gen_t = torch.tensor(gen).permute(2, 0, 1).unsqueeze(0).float() / 255.0
        ssim_values.append(
            ssim(ref_t, gen_t, data_range=1.0, size_average=True).item())
    return np.mean(ssim_values)


def extract_features(video_path, model, device, batch_size=16):
    """Run *model* over every frame of the video, in batches of *batch_size*.

    Returns a list of per-frame feature vectors (numpy arrays).

    NOTE(review): frames are fed at native resolution without the Inception
    normalization/resize, and the classifier logits (not pool3 features)
    are used — this deviates from the canonical FID pipeline; confirm that
    this is intended before comparing scores with published FID numbers.
    """
    import cv2
    from PIL import Image
    from torchvision import transforms

    features_out = []
    batch = []
    cap = cv2.VideoCapture(video_path)

    def flush(pending):
        # One forward pass over the accumulated frames; no autograd needed.
        with torch.no_grad():
            out = model(torch.cat(pending, dim=0))
        features_out.extend(out.cpu().numpy())

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        batch.append(transforms.ToTensor()(img).unsqueeze(0).to(device))
        if len(batch) == batch_size:
            flush(batch)
            batch = []

    if batch:  # leftover frames smaller than one full batch
        flush(batch)

    cap.release()  # release the video-capture resource
    return features_out


def calculate_statistics(features):
    """Mean vector and covariance matrix of a feature set (rows = samples)."""
    mean = np.mean(features, axis=0)
    cov = np.cov(features, rowvar=False)
    return mean, cov


def calculate_fid(standard_mean, standard_cov, generated_mean, generated_cov):
    """Frechet distance between two Gaussians fitted to feature sets.

    NaN or negative results (numerical noise from sqrtm) are clamped to 0,
    matching the original script's behavior.
    """
    mean_diff = standard_mean - generated_mean

    # Regularize so the product is well conditioned for sqrtm.  Use
    # out-of-place addition: the caller's arrays must NOT be mutated
    # (the original '+=' silently modified them in place).
    epsilon = 1e-6
    standard_cov = standard_cov + epsilon * np.eye(standard_cov.shape[0])
    generated_cov = generated_cov + epsilon * np.eye(generated_cov.shape[0])

    cov_sqrt = linalg.sqrtm(standard_cov @ generated_cov)
    if np.iscomplexobj(cov_sqrt):
        cov_sqrt = cov_sqrt.real  # discard numerical imaginary noise

    fid = np.sum(mean_diff ** 2) + np.trace(
        standard_cov + generated_cov - 2 * cov_sqrt)

    if np.isnan(fid) or fid < 0:
        fid = 0  # clamp numerical-noise results to a sane default
    return fid


def calculate_niqe_score(frames):
    """Mean no-reference NIQE score over the given frames."""
    import pyiqa
    niqe_metric = pyiqa.create_metric('niqe')
    scores = []
    for frame in frames:
        # Normalize to [0, 1], NCHW layout as expected by pyiqa.
        frame_tensor = (torch.from_numpy(frame)
                        .permute(2, 0, 1).unsqueeze(0).float() / 255.0)
        scores.append(niqe_metric(frame_tensor))
    del niqe_metric  # free the metric's resources promptly
    return torch.tensor(scores).mean().item()


def evaluate_video(video_name, standard_path, generated_path, device, model=None):
    """Compute (PSNR, SSIM, FID, NIQE) for one reference/generated pair.

    *model* defaults to the module-level ``inception_model`` set by main(),
    keeping the original call signature backward compatible.
    """
    if model is None:
        model = inception_model

    path_standard_video = os.path.join(standard_path, video_name)
    path_generated_video = os.path.join(generated_path, video_name)

    standard_frames = load_video_frames(path_standard_video)
    generated_frames = load_video_frames(path_generated_video)

    # Per-pixel metrics require identical frame shapes.
    if generated_frames[0].shape != standard_frames[0].shape:
        generated_frames = resize_video(generated_frames,
                                        standard_frames[0].shape)

    psnr_score = round(calculate_psnr(standard_frames, generated_frames), 6)
    ssim_score = round(calculate_ssim(standard_frames, generated_frames), 6)

    standard_features = extract_features(path_standard_video, model, device)
    generated_features = extract_features(path_generated_video, model, device)

    standard_mean, standard_cov = calculate_statistics(standard_features)
    generated_mean, generated_cov = calculate_statistics(generated_features)

    fid_score = round(calculate_fid(standard_mean, standard_cov,
                                    generated_mean, generated_cov), 6)
    generated_niqe = round(calculate_niqe_score(generated_frames), 6)

    return psnr_score, ssim_score, fid_score, generated_niqe


def main():
    """CLI entry point: evaluate every video name present in both folders."""
    parser = argparse.ArgumentParser(
        description='Evaluate videos using various metrics.')
    parser.add_argument('stand_path', type=str,
                        help='Path to the standard videos folder')
    parser.add_argument('generate_path', type=str,
                        help='Path to the generated videos folder')
    parser.add_argument('--device', type=str, default='cuda:0',
                        help='Device to use for computations (default: cuda:0)')
    args = parser.parse_args()

    device = torch.device(args.device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        raise ValueError("CUDA is not available, but device set to cuda. Please check your device settings.")

    # Load the Inception weights baked into the Docker image at /app.
    from torchvision.models import inception_v3
    global inception_model
    inception_model = inception_v3(pretrained=False)
    inception_model.load_state_dict(
        torch.load('/app/inception_v3_google-0cc3c7bd.pth'))
    inception_model.to(device).eval()

    # Only names present in BOTH folders are evaluated; sorted for a
    # deterministic report order.
    video_names = sorted(set(os.listdir(args.stand_path)) &
                         set(os.listdir(args.generate_path)))

    # Column headers, fixed-width for alignment.
    header = ("Video Name".ljust(20) + "PSNR Score".ljust(12) +
              "SSIM Score".ljust(12) + "FID Score".ljust(12) +
              "NIQE Score".ljust(12))
    print(header)

    for video_name in video_names:
        psnr_score, ssim_score, fid_score, niqe_score = evaluate_video(
            video_name, args.stand_path, args.generate_path, device)
        print(f"{video_name.ljust(20)}{psnr_score:<12}{ssim_score:<12}"
              f"{fid_score:<12}{niqe_score:<12}")
        torch.cuda.empty_cache()  # release cached GPU memory between videos


if __name__ == "__main__":
    main()
00000000..a4d28fd2 --- /dev/null +++ b/AniTalker-kit/AniTalker-judge/README.md @@ -0,0 +1,86 @@ +# Judge + +这一部分用于评价指标PSNR、SSIM、FID、NIQE的计算。 + +仓库中提供了代码、样例和Dockerfile文件,其中缺少了模型参数文件inception_v3_google-0cc3c7bd.pth,可以在[Google Drive](https://drive.google.com/file/d/1urWE3mUroo2rOn-A2nBXWhuKQ2I1M2zd/view?usp=drive_link)或[下载链接](https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth)下载。 + +## Quick Start + +1. 安装docker,宿主机CUDA版本为11.7及以上 +2. 拉取镜像: + +``` +docker pull gre123/anitalkerjudge:v1 +``` + +3. 参考以下docker run命令运行: + +``` +docker run --rm --gpus all \ +-v your_stand_videos_dir_path:/app/stand_videos_dir_path \ +-v your_generate_videos_dir_path:/app/generate_videos_dir_path \ +gre123/anitalkerjudge:v1 \ +/app/stand_videos_dir_path \ +/app/generate_videos_dir_path +[--device your_device](默认值为cuda:0) +``` + +其中,your_stand_videos_dir_path、your_generate_videos_dir_path为文件夹的绝对路径。 +注意:评测程序会对两个文件夹中名称相同的视频进行评测指标的计算,运行该程序需要确保对应的参照视频和生成视频名称相同且时长相同。 + +## Install + +如果想要自己手动构建镜像,请按照下列步骤进行: + +1. 将项目代码拉取到本地 + +``` +git clone https://github.com/RubyZh/talkingface-kit.git +cd talkingface-kit/AniTalker-kit/AniTalker-judge +``` + +2. 下载缺少的模型参数文件inception_v3_google-0cc3c7bd.pth,放在根目录下。(可以在[Google Drive](https://drive.google.com/file/d/1urWE3mUroo2rOn-A2nBXWhuKQ2I1M2zd/view?usp=drive_link)或[下载链接](https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth)下载) +3. 在项目根目录下打开终端,运行以下命令: + +``` +docker build --rm -f "Dockerfile" -t : "." +``` + +如果ubuntu镜像无法拉取,可尝试先单独拉取对应镜像: + +``` +docker pull nvidia/cuda:11.7.1-runtime-ubuntu22.04 +``` + +3. 构建成功后,参考上面的docker run命令运行。 + +## Run + +如果想要直接在本地运行,请按照下列步骤进行: + +1. 将项目代码拉取到本地 + +``` +git clone https://github.com/RubyZh/talkingface-kit.git +cd talkingface-kit/AniTalker-kit/AniTalker-judge +``` + +2. 
下载缺少的模型参数文件inception_v3_google-0cc3c7bd.pth,放在根目录下。(可以在[Google Drive](https://drive.google.com/file/d/1urWE3mUroo2rOn-A2nBXWhuKQ2I1M2zd/view?usp=drive_link)或[下载链接](https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth)下载) +3. 打开Anaconda Prompt Shell,创建conda环境: + +``` +conda create -n AniTalker_judge python=3.9.21 +conda activate AniTalker_judge +``` + +3. 安装依赖: + +``` +pip install -r requirements.txt +``` + +4. 将视频放到项目文件夹下,在项目根目录下运行: + +``` +python PSNR_SSIM_FID_NIQE.py /path/to/raw/video/dir /path/to/generate/video/dir +``` diff --git a/AniTalker-kit/AniTalker-judge/demo/raw/short_May.mp4 b/AniTalker-kit/AniTalker-judge/demo/raw/short_May.mp4 new file mode 100644 index 00000000..5a963d96 Binary files /dev/null and b/AniTalker-kit/AniTalker-judge/demo/raw/short_May.mp4 differ diff --git a/AniTalker-kit/AniTalker-judge/demo/synthetic/short_May.mp4 b/AniTalker-kit/AniTalker-judge/demo/synthetic/short_May.mp4 new file mode 100644 index 00000000..6ff5db39 Binary files /dev/null and b/AniTalker-kit/AniTalker-judge/demo/synthetic/short_May.mp4 differ diff --git a/AniTalker-kit/AniTalker-judge/niqe_modelparameters.mat b/AniTalker-kit/AniTalker-judge/niqe_modelparameters.mat new file mode 100644 index 00000000..3b212e86 Binary files /dev/null and b/AniTalker-kit/AniTalker-judge/niqe_modelparameters.mat differ diff --git a/AniTalker-kit/AniTalker-judge/requirements.txt b/AniTalker-kit/AniTalker-judge/requirements.txt new file mode 100644 index 00000000..9293acfe --- /dev/null +++ b/AniTalker-kit/AniTalker-judge/requirements.txt @@ -0,0 +1,13 @@ +# python==3.9.21 +imageio[ffmpeg]==2.36.1 +torch==2.0.1 +torchvision==0.15.2 +torchaudio==2.0.2 +pytorch_msssim==1.0.0 +pillow==11.0.0 +opencv-python-headless==4.10.0.84 +pyiqa==0.1.13 +scipy==1.13.1 +ipykernel==6.29.5 +ipywidgets==8.1.5 +numpy==1.23.0 \ No newline at end of file diff --git a/AniTalker-kit/AniTalker-judge/run_judge.txt b/AniTalker-kit/AniTalker-judge/run_judge.txt new file mode 100644 index 
00000000..7f1048bd --- /dev/null +++ b/AniTalker-kit/AniTalker-judge/run_judge.txt @@ -0,0 +1,19 @@ +首先拉取镜像: +docker pull gre123/anitalkerjudge:v1 + +接着运行docker run命令: +docker run --rm --gpus all \ +-v your_stand_videos_dir_path:/app/stand_videos_dir_path \ +-v your_generate_videos_dir_path:/app/generate_videos_dir_path \ +gre123/anitalkerjudge:v1 \ +/app/stand_videos_dir_path \ +/app/generate_videos_dir_path +[--device your_device](默认值为cuda:0) + +其中,your_stand_videos_dir_path、your_generate_videos_dir_path为文件夹的绝对路径。 + +注意:评测程序会对两个文件夹中名称相同的视频进行评测指标的计算,运行该程序需要确保对应的参照视频和生成视频名称相同且时长相同。 + +例: + +docker run --rm --gpus all --memory="64g" --cpus="4" -v F:/AniTalker/AniTalker-judge/demo/raw:/app/demo/raw -v F:/AniTalker/AniTalker-judge/demo/synthetic:/app/demo/synthetic gre123/anitalkerjudge:v1 /app/demo/raw /app/demo/synthetic diff --git a/AniTalker-kit/AniTalker/.gitignore b/AniTalker-kit/AniTalker/.gitignore new file mode 100644 index 00000000..27042046 --- /dev/null +++ b/AniTalker-kit/AniTalker/.gitignore @@ -0,0 +1,16 @@ +.DS_Store +README.md.bak +__pycache__ +results +*.ckpt +ckpts +audios_hubert +scripts +*TEMP* +gfpgan +enhance_face_test.py +checkpoints +tmp +train_stage2.py +espnet +*.dat \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/Dockerfile b/AniTalker-kit/AniTalker/Dockerfile new file mode 100644 index 00000000..329c55e7 --- /dev/null +++ b/AniTalker-kit/AniTalker/Dockerfile @@ -0,0 +1,58 @@ +# 使用 nvidia/cuda:11.7.1-runtime-ubuntu22.04 作为基础镜像 +FROM nvidia/cuda:11.7.1-runtime-ubuntu22.04 + +# 设置工作目录 +WORKDIR /app + +# 设置非交互模式 +ENV DEBIAN_FRONTEND=noninteractive + + + +# 更新APT源为官方Ubuntu源并安装系统依赖 +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + ffmpeg \ + libboost-all-dev \ + curl \ + software-properties-common \ + && rm -rf /var/lib/apt/lists/* # 清理apt缓存,以减少镜像大小 + +# 安装 Python 3.9 +RUN add-apt-repository ppa:deadsnakes/ppa && apt-get 
update && apt-get install -y python3.9 python3.9-dev python3.9-distutils + +# 安装指定版本的 pip 23.1 +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 && \ + python3.9 -m pip install --upgrade pip==23.1 + +# 检查 pip3.9 是否存在并设置 python 和 pip 为默认版本 +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1 \ + && ln -sf /usr/local/bin/pip3 /usr/bin/pip \ + && update-alternatives --install /usr/bin/pip pip /usr/local/bin/pip3 1 + +# 复制 requirements.txt 到容器中 +COPY requirements.txt /app/requirements.txt + +COPY detection_Resnet50_Final.pth /app/gfpgan/weights/detection_Resnet50_Final.pth + +COPY parsing_parsenet.pth /app/gfpgan/weights/parsing_parsenet.pth + +COPY GFPGANv1.4.pth /usr/local/lib/python3.9/dist-packages/gfpgan/weights/GFPGANv1.4.pth + + +# 安装其他 Python 依赖 +RUN pip install --no-cache-dir -r requirements.txt + +# 复制项目文件到容器中 +COPY . /app + +# 暴露容器端口(根据需要设置) +EXPOSE 3001 + +# 设置默认执行命令(根据项目实际情况修改) +ENTRYPOINT ["python", "code/demo_final.py"] diff --git a/AniTalker-kit/AniTalker/LICENSE b/AniTalker-kit/AniTalker/LICENSE new file mode 100644 index 00000000..4c9ad980 --- /dev/null +++ b/AniTalker-kit/AniTalker/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/AniTalker-kit/AniTalker/README.md b/AniTalker-kit/AniTalker/README.md new file mode 100644 index 00000000..8594be57 --- /dev/null +++ b/AniTalker-kit/AniTalker/README.md @@ -0,0 +1,104 @@ +# AniTalker + +项目基于[AniTalker](https://github.com/X-LANCE/AniTalker) + +## Change + +项目通过身份解耦面部运动编码,实现音频、图片生成视频。 + +在原项目的基础上,进行了下列修改: + +1. 修改code/demo.py代码,改变了接口的名称,保存为code/demo_final.py代码。 +2. 在test_demo/portraits/中添加了May.png,在test_demo/audios/中添加了short_May.wav,用于测试封装后的镜像。 +3. 新增了run_output文件夹,该文件夹为运行封装后的镜像时挂载的输出文件夹,其中的视频文件是通过运行封装后的镜像生成的视频。 +4. 新增了Dockerfile,用于构建docker镜像。 +5. 新增了run_main.txt,给出docker镜像的运行示例。 +6. 修改了requirements.txt,修改了torch、torchvision、torchaudio的版本,以适配cuda11.7,增加了gfpgan包,以便在运行时使用 --face_sr参数,生成512*512的视频。 +7. 新增了parsing_parsenet.pth,将模型参数保存在本地。 + +## Quick Start + +1. 安装docker,宿主机CUDA版本为11.7及以上 +2. 从Dockerhub拉取构建好的镜像 +``` +docker pull gre123/anitalkermodel:v1 +``` +3. 
拉取镜像后使用docker命令运行(如果为本地运行加 --gpus all) +``` +docker run --rm –gpus all \ +-v your_image_path:/app/image_path \ +-v your_audio_path:/app/audio_path \ +-v your_output_dir_path:/app/output_dir_path \ +gre123/anitalkermodel:v1 \ +--input_image /app/image_path \ +--input_audio_text /app/audio_path \ +--output_dir /app/output_dir_path +[--face_sr](没有该参数时生成256*256的视频,加上该参数后生成512*512的视频) +[--device your_device](默认值为cuda:0) +``` +其中,your_image_path、your_audio_path为文件的绝对路径,your_output_dir_path为文件夹的绝对路径。 + +例如: +``` +docker run --rm --gpus all --memory="64g" --cpus="4" -v F:/AniTalker/AniTalker-main/test_demos/portraits/May.png:/app/May.png -v F:/AniTalker/AniTalker-main/test_demos/audios/short_May.wav:/app/short_May.wav -v F:/AniTalker/AniTalker-main/run_output:/app/run_output gre123/anitalkermodel:v1 --input_image /app/May.png --input_audio_text /app/short_May.wav --output_dir /app/run_output +``` + +4. 生成视频可在对应文件夹下查看 + +## Install + +如果想要自己手动构建镜像,请按照下列步骤进行: + +1. 将项目代码拉取到本地 +``` +git clone https://github.com/RubyZh/talkingface-kit.git +cd talkingface-kit/AniTalker-kit/AniTalker +``` +2. 下载检查点 +``` +git lfs install +git clone https://huggingface.co/taocode/anitalker_ckpts ckpts +``` +3. 下载模型权重 [GFPGANv1.4.pth](https://drive.google.com/file/d/1cVAYvvMJQoX9Jbvj08EiWDOJzRWBhL1V/view?usp=drive_link),[detection_Resnet50_Final.pth](https://drive.google.com/file/d/13P3bCDXAAFvcQ5lxkzlby11U0WFZpymF/view?usp=drive_link),将文件放至AniTalker根目录下 + +4. 在项目根目录下打开终端,运行 +``` +docker build -t . +``` +如果ubuntu镜像无法拉取,可尝试先单独拉取对应镜像。 +``` +docker pull nvidia/cuda:11.7.1-runtime-ubuntu22.04 +``` +该镜像可以尝试修改为对应的CUDA版本(同时修改Dockerfile中相关版本及对应的依赖) + +5. 运行成功后,使用docker命令运行 + +## Run + +如果不通过docker,直接运行,请按照下列步骤进行: + +1. 将项目代码拉取到本地 +``` +git clone https://github.com/RubyZh/talkingface-kit.git +cd talkingface-kit/AniTalker-kit/AniTalker +``` +2. 打开Anaconda Prompt,运行 +``` +conda create -n anitalker python=3.9.0 +conda activate anitalker +``` +3. 安装必要依赖 +``` +pip install -r requirements.txt +``` +4. 
下载检查点 +``` +git lfs install +git clone https://huggingface.co/taocode/anitalker_ckpts ckpts +``` +5. 下载模型权重 [GFPGANv1.4.pth](https://drive.google.com/file/d/1cVAYvvMJQoX9Jbvj08EiWDOJzRWBhL1V/view?usp=drive_link),[detection_Resnet50_Final.pth](https://drive.google.com/file/d/13P3bCDXAAFvcQ5lxkzlby11U0WFZpymF/view?usp=drive_link),将文件放至AniTalker根目录下 + +6. 运行,打开web界面交互 +``` +python code/webgui.py +``` diff --git a/AniTalker-kit/AniTalker/assets/aiface3.png b/AniTalker-kit/AniTalker/assets/aiface3.png new file mode 100644 index 00000000..6edb97dc Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/aiface3.png differ diff --git a/AniTalker-kit/AniTalker/assets/monalisa-monalisa.gif b/AniTalker-kit/AniTalker/assets/monalisa-monalisa.gif new file mode 100644 index 00000000..8195c288 Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/monalisa-monalisa.gif differ diff --git a/AniTalker-kit/AniTalker/assets/monalisa_facing_forward.gif b/AniTalker-kit/AniTalker/assets/monalisa_facing_forward.gif new file mode 100644 index 00000000..98e49305 Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/monalisa_facing_forward.gif differ diff --git a/AniTalker-kit/AniTalker/assets/monalisa_free_style.gif b/AniTalker-kit/AniTalker/assets/monalisa_free_style.gif new file mode 100644 index 00000000..6c95f9db Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/monalisa_free_style.gif differ diff --git a/AniTalker-kit/AniTalker/assets/monalisa_more_control.gif b/AniTalker-kit/AniTalker/assets/monalisa_more_control.gif new file mode 100644 index 00000000..ef91e64b Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/monalisa_more_control.gif differ diff --git a/AniTalker-kit/AniTalker/assets/monalisa_turn_head_right.gif b/AniTalker-kit/AniTalker/assets/monalisa_turn_head_right.gif new file mode 100644 index 00000000..80ad593b Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/monalisa_turn_head_right.gif differ diff --git 
a/AniTalker-kit/AniTalker/assets/results_run_on_macOS_m1.jpg b/AniTalker-kit/AniTalker/assets/results_run_on_macOS_m1.jpg new file mode 100644 index 00000000..567fdfd3 Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/results_run_on_macOS_m1.jpg differ diff --git a/AniTalker-kit/AniTalker/assets/results_run_on_macOS_m3.png b/AniTalker-kit/AniTalker/assets/results_run_on_macOS_m3.png new file mode 100644 index 00000000..1ddbebf2 Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/results_run_on_macOS_m3.png differ diff --git a/AniTalker-kit/AniTalker/assets/sad.gif b/AniTalker-kit/AniTalker/assets/sad.gif new file mode 100644 index 00000000..37e7974d Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/sad.gif differ diff --git a/AniTalker-kit/AniTalker/assets/statue2.gif b/AniTalker-kit/AniTalker/assets/statue2.gif new file mode 100644 index 00000000..be930d15 Binary files /dev/null and b/AniTalker-kit/AniTalker/assets/statue2.gif differ diff --git a/AniTalker-kit/AniTalker/code/LIA_Model.py b/AniTalker-kit/AniTalker/code/LIA_Model.py new file mode 100644 index 00000000..0de914ad --- /dev/null +++ b/AniTalker-kit/AniTalker/code/LIA_Model.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn +from networks.encoder import Encoder +from networks.styledecoder import Synthesis + +# This part is modified from: https://github.com/wyhsirius/LIA +class LIA_Model(torch.nn.Module): + def __init__(self, size = 256, style_dim = 512, motion_dim = 20, channel_multiplier=1, blur_kernel=[1, 3, 3, 1], fusion_type=''): + super().__init__() + self.enc = Encoder(size, style_dim, motion_dim, fusion_type) + self.dec = Synthesis(size, style_dim, motion_dim, blur_kernel, channel_multiplier) + + def get_start_direction_code(self, x_start, x_target, x_face, x_aug): + enc_dic = self.enc(x_start, x_target, x_face, x_aug) + + wa, alpha, feats = enc_dic['h_source'], enc_dic['h_motion'], enc_dic['feats'] + + return wa, alpha, feats + + def render(self, start, 
direction, feats): + return self.dec(start, direction, feats) + + def load_lightning_model(self, lia_pretrained_model_path): + selfState = self.state_dict() + + state = torch.load(lia_pretrained_model_path, map_location='cpu') + for name, param in state.items(): + origName = name; + + if name not in selfState: + name = name.replace("lia.", "") + if name not in selfState: + print("%s is not in the model."%origName) + # You can ignore those errors as some parameters are only used for training + continue + if selfState[name].size() != state[origName].size(): + print("Wrong parameter length: %s, model: %s, loaded: %s"%(origName, selfState[name].size(), state[origName].size())) + continue + selfState[name].copy_(param) diff --git a/AniTalker-kit/AniTalker/code/choices.py b/AniTalker-kit/AniTalker/code/choices.py new file mode 100644 index 00000000..740552ae --- /dev/null +++ b/AniTalker-kit/AniTalker/code/choices.py @@ -0,0 +1,179 @@ +from enum import Enum +from torch import nn + + +class TrainMode(Enum): + # manipulate mode = training the classifier + manipulate = 'manipulate' + # default trainin mode! + diffusion = 'diffusion' + # default latent training mode! + # fitting the a DDPM to a given latent + latent_diffusion = 'latentdiffusion' + + def is_manipulate(self): + return self in [ + TrainMode.manipulate, + ] + + def is_diffusion(self): + return self in [ + TrainMode.diffusion, + TrainMode.latent_diffusion, + ] + + def is_autoenc(self): + # the network possibly does autoencoding + return self in [ + TrainMode.diffusion, + ] + + def is_latent_diffusion(self): + return self in [ + TrainMode.latent_diffusion, + ] + + def use_latent_net(self): + return self.is_latent_diffusion() + + def require_dataset_infer(self): + """ + whether training in this mode requires the latent variables to be available? 
+ """ + # this will precalculate all the latents before hand + # and the dataset will be all the predicted latents + return self in [ + TrainMode.latent_diffusion, + TrainMode.manipulate, + ] + + +class ManipulateMode(Enum): + """ + how to train the classifier to manipulate + """ + # train on whole celeba attr dataset + celebahq_all = 'celebahq_all' + # celeba with D2C's crop + d2c_fewshot = 'd2cfewshot' + d2c_fewshot_allneg = 'd2cfewshotallneg' + + def is_celeba_attr(self): + return self in [ + ManipulateMode.d2c_fewshot, + ManipulateMode.d2c_fewshot_allneg, + ManipulateMode.celebahq_all, + ] + + def is_single_class(self): + return self in [ + ManipulateMode.d2c_fewshot, + ManipulateMode.d2c_fewshot_allneg, + ] + + def is_fewshot(self): + return self in [ + ManipulateMode.d2c_fewshot, + ManipulateMode.d2c_fewshot_allneg, + ] + + def is_fewshot_allneg(self): + return self in [ + ManipulateMode.d2c_fewshot_allneg, + ] + + +class ModelType(Enum): + """ + Kinds of the backbone models + """ + + # unconditional ddpm + ddpm = 'ddpm' + # autoencoding ddpm cannot do unconditional generation + autoencoder = 'autoencoder' + + def has_autoenc(self): + return self in [ + ModelType.autoencoder, + ] + + def can_sample(self): + return self in [ModelType.ddpm] + + +class ModelName(Enum): + """ + List of all supported model classes + """ + + beatgans_ddpm = 'beatgans_ddpm' + beatgans_autoenc = 'beatgans_autoenc' + + +class ModelMeanType(Enum): + """ + Which type of output the model predicts. + """ + + eps = 'eps' # the model predicts epsilon + + +class ModelVarType(Enum): + """ + What is used as the model's output variance. + + The LEARNED_RANGE option has been added to allow the model to predict + values between FIXED_SMALL and FIXED_LARGE, making its job easier. 
+ """ + + # posterior beta_t + fixed_small = 'fixed_small' + # beta_t + fixed_large = 'fixed_large' + + +class LossType(Enum): + mse = 'mse' # use raw MSE loss (and KL when learning variances) + l1 = 'l1' + + +class GenerativeType(Enum): + """ + How's a sample generated + """ + + ddpm = 'ddpm' + ddim = 'ddim' + + +class OptimizerType(Enum): + adam = 'adam' + adamw = 'adamw' + + +class Activation(Enum): + none = 'none' + relu = 'relu' + lrelu = 'lrelu' + silu = 'silu' + tanh = 'tanh' + + def get_act(self): + if self == Activation.none: + return nn.Identity() + elif self == Activation.relu: + return nn.ReLU() + elif self == Activation.lrelu: + return nn.LeakyReLU(negative_slope=0.2) + elif self == Activation.silu: + return nn.SiLU() + elif self == Activation.tanh: + return nn.Tanh() + else: + raise NotImplementedError() + + +class ManipulateLossType(Enum): + bce = 'bce' + mse = 'mse' \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/code/config.py b/AniTalker-kit/AniTalker/code/config.py new file mode 100644 index 00000000..9a2b3558 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/config.py @@ -0,0 +1,388 @@ +from model.unet import ScaleAt +from model.latentnet import * +from diffusion.resample import UniformSampler +from diffusion.diffusion import space_timesteps +from typing import Tuple + +from torch.utils.data import DataLoader + +from config_base import BaseConfig +from diffusion import * +from diffusion.base import GenerativeType, LossType, ModelMeanType, ModelVarType, get_named_beta_schedule +from model import * +from choices import * +from multiprocessing import get_context +import os +from dataset_util import * +from torch.utils.data.distributed import DistributedSampler +from dataset import LatentDataLoader + +@dataclass +class PretrainConfig(BaseConfig): + name: str + path: str + + +@dataclass +class TrainConfig(BaseConfig): + # random seed + seed: int = 0 + train_mode: TrainMode = TrainMode.diffusion + train_cond0_prob: float = 0 + 
train_pred_xstart_detach: bool = True + train_interpolate_prob: float = 0 + train_interpolate_img: bool = False + manipulate_mode: ManipulateMode = ManipulateMode.celebahq_all + manipulate_cls: str = None + manipulate_shots: int = None + manipulate_loss: ManipulateLossType = ManipulateLossType.bce + manipulate_znormalize: bool = False + manipulate_seed: int = 0 + accum_batches: int = 1 + autoenc_mid_attn: bool = True + batch_size: int = 16 + batch_size_eval: int = None + beatgans_gen_type: GenerativeType = GenerativeType.ddim + beatgans_loss_type: LossType = LossType.mse + beatgans_model_mean_type: ModelMeanType = ModelMeanType.eps + beatgans_model_var_type: ModelVarType = ModelVarType.fixed_large + beatgans_rescale_timesteps: bool = False + latent_infer_path: str = None + latent_znormalize: bool = False + latent_gen_type: GenerativeType = GenerativeType.ddim + latent_loss_type: LossType = LossType.mse + latent_model_mean_type: ModelMeanType = ModelMeanType.eps + latent_model_var_type: ModelVarType = ModelVarType.fixed_large + latent_rescale_timesteps: bool = False + latent_T_eval: int = 1_000 + latent_clip_sample: bool = False + latent_beta_scheduler: str = 'linear' + beta_scheduler: str = 'linear' + data_name: str = '' + data_val_name: str = None + diffusion_type: str = None + dropout: float = 0.1 + ema_decay: float = 0.9999 + eval_num_images: int = 5_000 + eval_every_samples: int = 200_000 + eval_ema_every_samples: int = 200_000 + fid_use_torch: bool = True + fp16: bool = False + grad_clip: float = 1 + img_size: int = 64 + lr: float = 0.0001 + optimizer: OptimizerType = OptimizerType.adam + weight_decay: float = 0 + model_conf: ModelConfig = None + model_name: ModelName = None + model_type: ModelType = None + net_attn: Tuple[int] = None + net_beatgans_attn_head: int = 1 + # not necessarily the same as the the number of style channels + net_beatgans_embed_channels: int = 512 + net_resblock_updown: bool = True + net_enc_use_time: bool = False + net_enc_pool: str = 
'adaptivenonzero' + net_beatgans_gradient_checkpoint: bool = False + net_beatgans_resnet_two_cond: bool = False + net_beatgans_resnet_use_zero_module: bool = True + net_beatgans_resnet_scale_at: ScaleAt = ScaleAt.after_norm + net_beatgans_resnet_cond_channels: int = None + net_ch_mult: Tuple[int] = None + net_ch: int = 64 + net_enc_attn: Tuple[int] = None + net_enc_k: int = None + # number of resblocks for the encoder (half-unet) + net_enc_num_res_blocks: int = 2 + net_enc_channel_mult: Tuple[int] = None + net_enc_grad_checkpoint: bool = False + net_autoenc_stochastic: bool = False + net_latent_activation: Activation = Activation.silu + net_latent_channel_mult: Tuple[int] = (1, 2, 4) + net_latent_condition_bias: float = 0 + net_latent_dropout: float = 0 + net_latent_layers: int = None + net_latent_net_last_act: Activation = Activation.none + net_latent_net_type: LatentNetType = LatentNetType.none + net_latent_num_hid_channels: int = 1024 + net_latent_num_time_layers: int = 2 + net_latent_skip_layers: Tuple[int] = None + net_latent_time_emb_channels: int = 64 + net_latent_use_norm: bool = False + net_latent_time_last_act: bool = False + net_num_res_blocks: int = 2 + # number of resblocks for the UNET + net_num_input_res_blocks: int = None + net_enc_num_cls: int = None + num_workers: int = 4 + parallel: bool = False + postfix: str = '' + sample_size: int = 64 + sample_every_samples: int = 20_000 + save_every_samples: int = 100_000 + style_ch: int = 512 + T_eval: int = 1_000 + T_sampler: str = 'uniform' + T: int = 1_000 + total_samples: int = 10_000_000 + warmup: int = 0 + pretrain: PretrainConfig = None + continue_from: PretrainConfig = None + eval_programs: Tuple[str] = None + # if present load the checkpoint from this path instead + eval_path: str = None + base_dir: str = 'checkpoints' + use_cache_dataset: bool = False + data_cache_dir: str = os.path.expanduser('~/cache') + work_cache_dir: str = os.path.expanduser('~/mycache') + # to be overridden + name: str = '' 
def scale_up_gpus(self, num_gpus, num_nodes=1):
    """Rescale per-sample schedules and batch sizes for multi-GPU / multi-node runs.

    Every "*_every_samples" interval and both batch sizes grow by the total
    number of workers, keeping the per-worker workload constant.
    Returns self so calls can be chained.
    """
    world = num_gpus * num_nodes
    for attr in ('eval_ema_every_samples', 'eval_every_samples',
                 'sample_every_samples', 'batch_size', 'batch_size_eval'):
        setattr(self, attr, getattr(self, attr) * world)
    return self
def _make_latent_diffusion_conf(self, T=None):
    """Build the diffusion config for the latent DDPM.

    Evaluation may use T < self.T; timesteps are spaced evenly, following
    the guided-diffusion repo conventions.
    """
    if self.latent_gen_type == GenerativeType.ddpm:
        counts = [T]
    elif self.latent_gen_type == GenerativeType.ddim:
        counts = f'ddim{T}'
    else:
        raise NotImplementedError()

    return SpacedDiffusionBeatGansConfig(
        train_pred_xstart_detach=self.train_pred_xstart_detach,
        gen_type=self.latent_gen_type,
        # the latent model is always a DDPM
        model_type=ModelType.ddpm,
        # the latent shares the beta scheduler and the full T
        betas=get_named_beta_schedule(self.latent_beta_scheduler, self.T),
        model_mean_type=self.latent_model_mean_type,
        model_var_type=self.latent_model_var_type,
        loss_type=self.latent_loss_type,
        rescale_timesteps=self.latent_rescale_timesteps,
        use_timesteps=space_timesteps(num_timesteps=self.T,
                                      section_counts=counts),
        fp16=self.fp16,
    )
def make_loader(self,
                dataset,
                shuffle: bool,
                num_worker: int = None,
                drop_last: bool = True,
                batch_size: int = None,
                parallel: bool = False):
    """Wrap `dataset` in a torch DataLoader.

    Args:
        dataset: any torch-compatible dataset.
        shuffle: whether to shuffle (handled by the sampler when distributed).
        num_worker: worker-process count override; None -> self.num_workers.
            (Fixed: was annotated `bool`, and `num_worker or ...` silently
            replaced an explicit 0 with the config default.)
        drop_last: drop the trailing partial batch.
        batch_size: override; None -> self.batch_size.
        parallel: use a DistributedSampler when torch.distributed is up.

    Returns:
        A configured torch.utils.data.DataLoader.
    """
    if parallel and distributed.is_initialized():
        # drop last to make sure that there is no added special indexes
        sampler = DistributedSampler(dataset,
                                     shuffle=shuffle,
                                     drop_last=True)
    else:
        sampler = None
    workers = num_worker if num_worker is not None else self.num_workers
    return DataLoader(
        dataset,
        batch_size=batch_size if batch_size is not None else self.batch_size,
        sampler=sampler,
        # with a sampler, shuffling is the sampler's job
        shuffle=False if sampler else shuffle,
        num_workers=workers,
        pin_memory=True,
        drop_last=drop_last,
        # DataLoader rejects a multiprocessing context when num_workers == 0,
        # so only pass one when worker processes will actually be spawned
        multiprocessing_context=get_context('fork') if workers > 0 else None,
    )
+ net_beatgans_resnet_use_zero_module, + ) + elif self.model_name in [ + ModelName.beatgans_autoenc, + ]: + cls = BeatGANsAutoencConfig + # supports both autoenc and vaeddpm + if self.model_name == ModelName.beatgans_autoenc: + self.model_type = ModelType.autoencoder + else: + raise NotImplementedError() + + if self.net_latent_net_type == LatentNetType.none: + latent_net_conf = None + elif self.net_latent_net_type == LatentNetType.skip: + latent_net_conf = MLPSkipNetConfig( + num_channels=self.style_ch, + skip_layers=self.net_latent_skip_layers, + num_hid_channels=self.net_latent_num_hid_channels, + num_layers=self.net_latent_layers, + num_time_emb_channels=self.net_latent_time_emb_channels, + activation=self.net_latent_activation, + use_norm=self.net_latent_use_norm, + condition_bias=self.net_latent_condition_bias, + dropout=self.net_latent_dropout, + last_act=self.net_latent_net_last_act, + num_time_layers=self.net_latent_num_time_layers, + time_last_act=self.net_latent_time_last_act, + ) + else: + raise NotImplementedError() + + self.model_conf = cls( + attention_resolutions=self.net_attn, + channel_mult=self.net_ch_mult, + conv_resample=True, + dims=2, + dropout=self.dropout, + embed_channels=self.net_beatgans_embed_channels, + enc_out_channels=self.style_ch, + enc_pool=self.net_enc_pool, + enc_num_res_block=self.net_enc_num_res_blocks, + enc_channel_mult=self.net_enc_channel_mult, + enc_grad_checkpoint=self.net_enc_grad_checkpoint, + enc_attn_resolutions=self.net_enc_attn, + image_size=self.img_size, + in_channels=3, + model_channels=self.net_ch, + num_classes=None, + num_head_channels=-1, + num_heads_upsample=-1, + num_heads=self.net_beatgans_attn_head, + num_res_blocks=self.net_num_res_blocks, + num_input_res_blocks=self.net_num_input_res_blocks, + out_channels=self.model_out_channels, + resblock_updown=self.net_resblock_updown, + use_checkpoint=self.net_beatgans_gradient_checkpoint, + use_new_attention_order=False, + 
@dataclass
class BaseConfig:
    """Base class for JSON-(de)serializable configuration dataclasses.

    Subclasses declare dataclass fields; this base provides cloning,
    config inheritance/propagation, and JSON load/save helpers.
    """
    def clone(self):
        """Return an independent deep copy of this config."""
        return deepcopy(self)

    def inherit(self, another):
        """Inherit common keys from a given config."""
        common_keys = set(self.__dict__.keys()) & set(another.__dict__.keys())
        for k in common_keys:
            setattr(self, k, getattr(another, k))

    def propagate(self):
        """Push down the configuration to all nested BaseConfig members."""
        for k, v in self.__dict__.items():
            if isinstance(v, BaseConfig):
                v.inherit(self)
                v.propagate()

    def save(self, save_path):
        """Save the config to a JSON file, creating parent dirs as needed.

        Fixed: the original called os.makedirs(dirname) unconditionally,
        which raised for bare filenames (dirname == '') and raced between
        the exists() check and the mkdir; exist_ok=True removes the race.
        """
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        conf = self.as_dict_jsonable()
        with open(save_path, 'w') as f:
            json.dump(conf, f)

    def load(self, load_path):
        """Load a JSON config file into this instance (in place)."""
        with open(load_path) as f:
            conf = json.load(f)
        self.from_dict(conf)

    def from_dict(self, dict, strict=False):
        """Copy known keys from a plain dict; unknown keys raise when strict,
        otherwise they are reported and skipped."""
        for k, v in dict.items():
            if not hasattr(self, k):
                if strict:
                    raise ValueError(f"loading extra '{k}'")
                else:
                    print(f"loading extra '{k}'")
                continue
            if isinstance(self.__dict__[k], BaseConfig):
                self.__dict__[k].from_dict(v)
            else:
                self.__dict__[k] = v

    def as_dict_jsonable(self):
        """Return a dict of this config keeping only JSON-serializable values;
        nested BaseConfigs are converted recursively."""
        conf = {}
        for k, v in self.__dict__.items():
            if isinstance(v, BaseConfig):
                conf[k] = v.as_dict_jsonable()
            else:
                if jsonable(v):
                    conf[k] = v
                else:
                    # silently drop values json cannot represent
                    pass
        return conf


def jsonable(x):
    """Return True if x can be serialized by the json module."""
    try:
        json.dumps(x)
        return True
    except TypeError:
        return False
def shape_to_np(shape, dtype="int"):
    """Convert a dlib landmark detection into an (num_parts, 2) array.

    Row i holds the (x, y) pixel coordinates of landmark i.
    """
    pairs = [(shape.part(i).x, shape.part(i).y)
             for i in range(shape.num_parts)]
    # reshape keeps the (0, 2) shape even when there are no parts
    return np.array(pairs, dtype=dtype).reshape(shape.num_parts, 2)
def proc_audio(src_mouth_path, dst_audio_path):
    """Extract 16 kHz mono signed-16-bit PCM WAV audio via ffmpeg.

    Args:
        src_mouth_path: input media file to read audio from.
        dst_audio_path: output .wav path (overwritten if present).

    Fixed: the original built a shell string for os.system(), so paths
    containing quotes or shell metacharacters could break the command or
    inject arbitrary shell code. The list form bypasses the shell entirely.
    """
    import subprocess  # local import: keeps the module import block untouched
    subprocess.run(
        ['ffmpeg', '-i', src_mouth_path, '-loglevel', 'error', '-y',
         '-f', 'wav', '-acodec', 'pcm_s16le', '-ar', '16000',
         dst_audio_path],
        check=False,  # match os.system(): exit status is ignored
    )
torchvision.transforms.Compose([ + transforms.Resize((size, size)), + transforms.ToTensor(), + transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))] + ) + + self.data = [] + for db_name in [ 'VoxCeleb2', 'HDTF' ]: + db_png_path = os.path.join(frame_jpgs, db_name) + for clip_name in tqdm(os.listdir(db_png_path)): + + item_dict = dict() + item_dict['clip_name'] = clip_name + item_dict['frame_count'] = len(list(os.listdir(os.path.join(frame_jpgs, db_name, clip_name)))) + item_dict['hubert_path'] = os.path.join(audio_prefix, db_name, clip_name +".npy") + item_dict['wav_path'] = os.path.join(raw_audio_prefix, db_name, clip_name +".wav") + + item_dict['yaw_pitch_roll_path'] = os.path.join(pose_prefix, db_name, 'raw_videos_pose_yaw_pitch_roll', clip_name +".npy") + if not os.path.exists(item_dict['yaw_pitch_roll_path']): + print(f"{db_name}'s {clip_name} miss yaw_pitch_roll_path") + continue + + item_dict['yaw_pitch_roll'] = np.load(item_dict['yaw_pitch_roll_path']) + item_dict['yaw_pitch_roll'] = np.clip(item_dict['yaw_pitch_roll'], -90, 90) / 90.0 + + if not os.path.exists(item_dict['wav_path']): + print(f"{db_name}'s {clip_name} miss wav_path") + continue + + if not os.path.exists(item_dict['hubert_path']): + print(f"{db_name}'s {clip_name} miss hubert_path") + continue + + + if self.mfcc_mode: + wav, sr = librosa.load(item_dict['wav_path'], sr=16000) + input_values = python_speech_features.mfcc(signal=wav,samplerate=sr,numcep=13,winlen=0.025,winstep=0.01) + d_mfcc_feat = python_speech_features.base.delta(input_values, 1) + d_mfcc_feat2 = python_speech_features.base.delta(input_values, 2) + input_values = np.hstack((input_values, d_mfcc_feat, d_mfcc_feat2)) + item_dict['hubert_obj'] = input_values + else: + item_dict['hubert_obj'] = np.load(item_dict['hubert_path'], mmap_mode='r') + item_dict['lmd_path'] = os.path.join(lmd_feats_prefix, db_name, clip_name +".txt") + item_dict['lmd_obj_full'] = self.read_landmark_info(item_dict['lmd_path'], upper_face=False) 
def get_single_image(self, image_path):
    """Load one frame from disk and return it as a transformed tensor.

    Fixed: the original left the PIL file handle open; the context manager
    closes it. convert('RGB') forces the pixel data to load, so the returned
    image is safe to use after the file is closed.
    """
    with Image.open(image_path) as raw:
        img_source = raw.convert('RGB')
    return self.transform(img_source)
def read_landmark_info(self, lmd_path, upper_face=True):
    """Parse a landmark text file into an (n_frames, n_points, 2) float32 array.

    Each line is "<name> x_y x_y ..."; coordinates are normalized by 512
    (the landmark extraction resolution — TODO confirm against the extractor).
    When upper_face is True only the (0,3), (14,27), (36,48) ranges are kept
    (28 points: brows, eyes and nose region).
    """
    with open(lmd_path, 'r') as fh:
        lines = sorted(fh.readlines())

    frames = []
    for line in lines:
        # split, drop empty tokens, and skip the leading file name
        tokens = [t for t in line.strip().split(' ') if t][1:]
        if upper_face:
            tokens = self.get_multiple_ranges(tokens,
                                              [(0, 3), (14, 27), (36, 48)])
        points = []
        for pair in tokens:
            x, y = pair.split('_')
            points.append((int(x) / 512, int(y) / 512))
        frames.append(points)

    return np.array(frames, dtype=np.float32)
def use_cached_dataset_path(source_path, cache_path):
    """Mirror the dataset into a local cache directory and return the cache path.

    Only rank 0 performs the copy (and only when the cache is absent); all
    ranks then synchronize on the barrier before using the cached path.
    """
    if get_rank() == 0 and not os.path.exists(cache_path):
        print(f'copying the data: {source_path} to {cache_path}')
        shutil.copytree(source_path, cache_path)
    barrier()
    return cache_path
def check_package_installed(package_name):
    """Return True when `package_name` is importable, printing the verdict."""
    if importlib.util.find_spec(package_name) is not None:
        print(f"{package_name} is installed.")
        return True
    print(f"{package_name} is not installed.")
    return False
lia.load_lightning_model(args.stage1_checkpoint_path) + lia.to(args.device) + #============================ + + conf = ffhq256_autoenc() + conf.seed = args.seed + conf.decoder_layers = args.decoder_layers + conf.infer_type = args.infer_type + conf.motion_dim = args.motion_dim + + if args.infer_type == 'mfcc_full_control': + conf.face_location=True + conf.face_scale=True + conf.mfcc = True + + elif args.infer_type == 'mfcc_pose_only': + conf.face_location=False + conf.face_scale=False + conf.mfcc = True + + elif args.infer_type == 'hubert_pose_only': + conf.face_location=False + conf.face_scale=False + conf.mfcc = False + + elif args.infer_type == 'hubert_audio_only': + conf.face_location=False + conf.face_scale=False + conf.mfcc = False + + elif args.infer_type == 'hubert_full_control': + conf.face_location=True + conf.face_scale=True + conf.mfcc = False + + else: + print('Type NOT Found!') + exit(0) + + if not os.path.exists(args.test_image_path): + print(f'{args.test_image_path} does not exist!') + exit(0) + + if not os.path.exists(args.test_audio_path): + print(f'{args.test_audio_path} does not exist!') + exit(0) + + img_source = img_preprocessing(args.test_image_path, args.image_size).to(args.device) + one_shot_lia_start, one_shot_lia_direction, feats = lia.get_start_direction_code(img_source, img_source, img_source, img_source) + + + #======Loading Stage 2 model========= + model = LitModel(conf) + state = torch.load(args.stage2_checkpoint_path, map_location='cpu') + model.load_state_dict(state, strict=True) + model.ema_model.eval() + model.ema_model.to(args.device); + #================================= + + + #======Audio Input========= + if conf.infer_type.startswith('mfcc'): + # MFCC features + wav, sr = librosa.load(args.test_audio_path, sr=16000) + input_values = python_speech_features.mfcc(signal=wav, samplerate=sr, numcep=13, winlen=0.025, winstep=0.01) + d_mfcc_feat = python_speech_features.base.delta(input_values, 1) + d_mfcc_feat2 = 
python_speech_features.base.delta(input_values, 2) + audio_driven_obj = np.hstack((input_values, d_mfcc_feat, d_mfcc_feat2)) + frame_start, frame_end = 0, int(audio_driven_obj.shape[0]/4) + audio_start, audio_end = int(frame_start * 4), int(frame_end * 4) # The video frame is fixed to 25 hz and the audio is fixed to 100 hz + + audio_driven = torch.Tensor(audio_driven_obj[audio_start:audio_end,:]).unsqueeze(0).float().to(args.device) + + elif conf.infer_type.startswith('hubert'): + # Hubert features + if not os.path.exists(args.test_hubert_path): + + if not check_package_installed('transformers'): + print('Please install transformers module first.') + exit(0) + hubert_model_path = 'ckpts/chinese-hubert-large' + if not os.path.exists(hubert_model_path): + print('Please download the hubert weight into the ckpts path first.') + exit(0) + print('You did not extract the audio features in advance, extracting online now, which will increase processing delay') + + start_time = time.time() + + # load hubert model + from transformers import Wav2Vec2FeatureExtractor, HubertModel + audio_model = HubertModel.from_pretrained(hubert_model_path).to(args.device) + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model_path) + audio_model.feature_extractor._freeze_parameters() + audio_model.eval() + + # hubert model forward pass + audio, sr = librosa.load(args.test_audio_path, sr=16000) + input_values = feature_extractor(audio, sampling_rate=16000, padding=True, do_normalize=True, return_tensors="pt").input_values + input_values = input_values.to(args.device) + ws_feats = [] + with torch.no_grad(): + outputs = audio_model(input_values, output_hidden_states=True) + for i in range(len(outputs.hidden_states)): + ws_feats.append(outputs.hidden_states[i].detach().cpu().numpy()) + ws_feat_obj = np.array(ws_feats) + ws_feat_obj = np.squeeze(ws_feat_obj, 1) + ws_feat_obj = np.pad(ws_feat_obj, ((0, 0), (0, 1), (0, 0)), 'edge') # align the audio length with video frame + + 
execution_time = time.time() - start_time + print(f"Extraction Audio Feature: {execution_time:.2f} Seconds") + + audio_driven_obj = ws_feat_obj + else: + print(f'Using audio feature from path: {args.test_hubert_path}') + audio_driven_obj = np.load(args.test_hubert_path) + + frame_start, frame_end = 0, int(audio_driven_obj.shape[1]/2) + audio_start, audio_end = int(frame_start * 2), int(frame_end * 2) # The video frame is fixed to 25 hz and the audio is fixed to 50 hz + + audio_driven = torch.Tensor(audio_driven_obj[:,audio_start:audio_end,:]).unsqueeze(0).float().to(args.device) + #============================ + + # Diffusion Noise + noisyT = th.randn((1,frame_end, args.motion_dim)).to(args.device) + + #======Inputs for Attribute Control========= + if os.path.exists(args.pose_driven_path): + pose_obj = np.load(args.pose_driven_path) + + + if len(pose_obj.shape) != 2: + print('please check your pose information. The shape must be like (T, 3).') + exit(0) + if pose_obj.shape[1] != 3: + print('please check your pose information. 
The shape must be like (T, 3).') + exit(0) + + if pose_obj.shape[0] >= frame_end: + pose_obj = pose_obj[:frame_end,:] + else: + padding = np.tile(pose_obj[-1, :], (frame_end - pose_obj.shape[0], 1)) + pose_obj = np.vstack((pose_obj, padding)) + + pose_signal = torch.Tensor(pose_obj).unsqueeze(0).to(args.device)/ 90 # 90 is for normalization here + else: + yaw_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_yaw + pitch_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_pitch + roll_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_roll + pose_signal = torch.cat((yaw_signal, pitch_signal, roll_signal), dim=-1) + + pose_signal = torch.clamp(pose_signal, -1, 1) + + face_location_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.face_location + face_scae_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.face_scale + #=========================================== + + start_time = time.time() + + #======Diffusion Denosing Process========= + generated_directions = model.render(one_shot_lia_start, one_shot_lia_direction, audio_driven, face_location_signal, face_scae_signal, pose_signal, noisyT, args.step_T, control_flag=args.control_flag) + #========================================= + + execution_time = time.time() - start_time + print(f"Motion Diffusion Model: {execution_time:.2f} Seconds") + + generated_directions = generated_directions.detach().cpu().numpy() + + start_time = time.time() + #======Rendering images frame-by-frame========= + for pred_index in tqdm(range(generated_directions.shape[1])): + ori_img_recon = lia.render(one_shot_lia_start, torch.Tensor(generated_directions[:,pred_index,:]).to(args.device), feats) + ori_img_recon = ori_img_recon.clamp(-1, 1) + wav_pred = (ori_img_recon.detach() + 1) / 2 + saved_image(wav_pred, os.path.join(frames_result_saved_path, "%06d.png"%(pred_index))) + #============================================== + + execution_time = time.time() - start_time + 
print(f"Renderer Model: {execution_time:.2f} Seconds") + + frames_to_video(frames_result_saved_path, args.test_audio_path, predicted_video_256_path) + + shutil.rmtree(frames_result_saved_path) + + + # Enhancer + # Code is modified from https://github.com/OpenTalker/SadTalker/blob/cd4c0465ae0b54a6f85af57f5c65fec9fe23e7f8/src/utils/face_enhancer.py#L26 + + if args.face_sr and check_package_installed('gfpgan'): + from face_sr.face_enhancer import enhancer_list + import imageio + + # Super-resolution + imageio.mimsave(predicted_video_512_path+'.tmp.mp4', enhancer_list(predicted_video_256_path, method='gfpgan', bg_upsampler=None), fps=float(25)) + + # Merge audio and video + video_clip = VideoFileClip(predicted_video_512_path+'.tmp.mp4') + audio_clip = AudioFileClip(predicted_video_256_path) + final_clip = video_clip.set_audio(audio_clip) + final_clip.write_videofile(predicted_video_512_path, codec='libx264', audio_codec='aac') + + os.remove(predicted_video_512_path+'.tmp.mp4') + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--infer_type', type=str, default='mfcc_pose_only', help='mfcc_pose_only or mfcc_full_control') + parser.add_argument('--test_image_path', type=str, default='./test_demos/portraits/monalisa.jpg', help='Path to the portrait') + parser.add_argument('--test_audio_path', type=str, default='./test_demos/audios/english_female.wav', help='Path to the driven audio') + parser.add_argument('--test_hubert_path', type=str, default='./test_demos/audios_hubert/english_female.npy', help='Path to the driven audio(hubert type). 
Not needed for MFCC') + parser.add_argument('--result_path', type=str, default='./results/', help='Type of inference') + parser.add_argument('--stage1_checkpoint_path', type=str, default='./ckpts/stage1.ckpt', help='Path to the checkpoint of Stage1') + parser.add_argument('--stage2_checkpoint_path', type=str, default='./ckpts/pose_only.ckpt', help='Path to the checkpoint of Stage2') + parser.add_argument('--seed', type=int, default=0, help='seed for generations') + parser.add_argument('--control_flag', action='store_true', help='Whether to use control signal or not') + parser.add_argument('--pose_yaw', type=float, default=0.25, help='range from -1 to 1 (-90 ~ 90 angles)') + parser.add_argument('--pose_pitch', type=float, default=0, help='range from -1 to 1 (-90 ~ 90 angles)') + parser.add_argument('--pose_roll', type=float, default=0, help='range from -1 to 1 (-90 ~ 90 angles)') + parser.add_argument('--face_location', type=float, default=0.5, help='range from 0 to 1 (from left to right)') + parser.add_argument('--pose_driven_path', type=str, default='xxx', help='path to pose numpy, shape is (T, 3). You can check the following code https://github.com/liutaocode/talking_face_preprocessing to extract the yaw, pitch and roll.') + parser.add_argument('--face_scale', type=float, default=0.5, help='range from 0 to 1 (from small to large)') + parser.add_argument('--step_T', type=int, default=50, help='Step T for diffusion denoising process') + parser.add_argument('--image_size', type=int, default=256, help='Size of the image. Do not change.') + parser.add_argument('--device', type=str, default='cuda:0', help='Device for computation') + parser.add_argument('--motion_dim', type=int, default=20, help='Dimension of motion. Do not change.') + parser.add_argument('--decoder_layers', type=int, default=2, help='Layer number for the conformer.') + parser.add_argument('--face_sr', action='store_true', help='Face super-resolution (Optional). 
Please install GFPGAN first') + + + + args = parser.parse_args() + + # macOS Config + # Check if MPS is available + if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): + args.device = torch.device("mps") + print("MPS backend is available.") + + main(args) \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/code/demo_final.py b/AniTalker-kit/AniTalker/code/demo_final.py new file mode 100644 index 00000000..c7b8f0d1 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/demo_final.py @@ -0,0 +1,304 @@ +from LIA_Model import LIA_Model +import torch +import numpy as np +import os +from PIL import Image +from tqdm import tqdm +import argparse +import numpy as np +from torchvision import transforms +from templates import * +import argparse +import shutil +from moviepy.editor import * +import librosa +import python_speech_features +import importlib.util +import time + +def check_package_installed(package_name): + package_spec = importlib.util.find_spec(package_name) + if package_spec is None: + print(f"{package_name} is not installed.") + return False + else: + print(f"{package_name} is installed.") + return True + +def frames_to_video(input_path, audio_path, output_path, fps=float(25)): + image_files = [os.path.join(input_path, img) for img in sorted(os.listdir(input_path))] + clips = [ImageClip(m).set_duration(1/fps) for m in image_files] + video = concatenate_videoclips(clips, method="compose") + + audio = AudioFileClip(audio_path) + final_video = video.set_audio(audio) + final_video.write_videofile(output_path, fps=float(25), codec='libx264', audio_codec='aac') + +def load_image(filename, size): + img = Image.open(filename).convert('RGB') + img = img.resize((size, size)) + img = np.asarray(img) + img = np.transpose(img, (2, 0, 1)) # 3 x 256 x 256 + return img / 255.0 + +def img_preprocessing(img_path, size): + img = load_image(img_path, size) # [0, 1] + img = torch.from_numpy(img).unsqueeze(0).float() # [0, 1] + imgs_norm = (img - 0.5) * 
2.0 # [-1, 1] + return imgs_norm + +def saved_image(img_tensor, img_path): + toPIL = transforms.ToPILImage() + img = toPIL(img_tensor.detach().cpu().squeeze(0)) # 使用squeeze(0)来移除批次维度 + img.save(img_path) + +def main(args): + frames_result_saved_path = os.path.join(args.result_path, 'frames') + os.makedirs(frames_result_saved_path, exist_ok=True) + test_image_name = os.path.splitext(os.path.basename(args.test_image_path))[0] + audio_name = os.path.splitext(os.path.basename(args.test_audio_path))[0] + predicted_video_256_path = os.path.join(args.result_path, f'{test_image_name}-{audio_name}.mp4') + predicted_video_512_path = os.path.join(args.result_path, f'{test_image_name}-{audio_name}_SR.mp4') + + #======Loading Stage 1 model========= + lia = LIA_Model(motion_dim=args.motion_dim, fusion_type='weighted_sum') + lia.load_lightning_model(args.stage1_checkpoint_path) + lia.to(args.device) + #============================ + + conf = ffhq256_autoenc() + conf.seed = args.seed + conf.decoder_layers = args.decoder_layers + conf.infer_type = args.infer_type + conf.motion_dim = args.motion_dim + + if args.infer_type == 'mfcc_full_control': + conf.face_location=True + conf.face_scale=True + conf.mfcc = True + + elif args.infer_type == 'mfcc_pose_only': + conf.face_location=False + conf.face_scale=False + conf.mfcc = True + + elif args.infer_type == 'hubert_pose_only': + conf.face_location=False + conf.face_scale=False + conf.mfcc = False + + elif args.infer_type == 'hubert_audio_only': + conf.face_location=False + conf.face_scale=False + conf.mfcc = False + + elif args.infer_type == 'hubert_full_control': + conf.face_location=True + conf.face_scale=True + conf.mfcc = False + + else: + print('Type NOT Found!') + exit(0) + + if not os.path.exists(args.test_image_path): + print(f'{args.test_image_path} does not exist!') + exit(0) + + if not os.path.exists(args.test_audio_path): + print(f'{args.test_audio_path} does not exist!') + exit(0) + + img_source = 
img_preprocessing(args.test_image_path, args.image_size).to(args.device) + one_shot_lia_start, one_shot_lia_direction, feats = lia.get_start_direction_code(img_source, img_source, img_source, img_source) + + + #======Loading Stage 2 model========= + model = LitModel(conf) + state = torch.load(args.stage2_checkpoint_path, map_location='cpu') + model.load_state_dict(state, strict=True) + model.ema_model.eval() + model.ema_model.to(args.device); + #================================= + + + #======Audio Input========= + if conf.infer_type.startswith('mfcc'): + # MFCC features + wav, sr = librosa.load(args.test_audio_path, sr=16000) + input_values = python_speech_features.mfcc(signal=wav, samplerate=sr, numcep=13, winlen=0.025, winstep=0.01) + d_mfcc_feat = python_speech_features.base.delta(input_values, 1) + d_mfcc_feat2 = python_speech_features.base.delta(input_values, 2) + audio_driven_obj = np.hstack((input_values, d_mfcc_feat, d_mfcc_feat2)) + frame_start, frame_end = 0, int(audio_driven_obj.shape[0]/4) + audio_start, audio_end = int(frame_start * 4), int(frame_end * 4) # The video frame is fixed to 25 hz and the audio is fixed to 100 hz + + audio_driven = torch.Tensor(audio_driven_obj[audio_start:audio_end,:]).unsqueeze(0).float().to(args.device) + + elif conf.infer_type.startswith('hubert'): + # Hubert features + if not os.path.exists(args.test_hubert_path): + + if not check_package_installed('transformers'): + print('Please install transformers module first.') + exit(0) + hubert_model_path = 'ckpts/chinese-hubert-large' + if not os.path.exists(hubert_model_path): + print('Please download the hubert weight into the ckpts path first.') + exit(0) + print('You did not extract the audio features in advance, extracting online now, which will increase processing delay') + + start_time = time.time() + + # load hubert model + from transformers import Wav2Vec2FeatureExtractor, HubertModel + audio_model = HubertModel.from_pretrained(hubert_model_path).to(args.device) + 
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model_path) + audio_model.feature_extractor._freeze_parameters() + audio_model.eval() + + # hubert model forward pass + audio, sr = librosa.load(args.test_audio_path, sr=16000) + input_values = feature_extractor(audio, sampling_rate=16000, padding=True, do_normalize=True, return_tensors="pt").input_values + input_values = input_values.to(args.device) + ws_feats = [] + with torch.no_grad(): + outputs = audio_model(input_values, output_hidden_states=True) + for i in range(len(outputs.hidden_states)): + ws_feats.append(outputs.hidden_states[i].detach().cpu().numpy()) + ws_feat_obj = np.array(ws_feats) + ws_feat_obj = np.squeeze(ws_feat_obj, 1) + ws_feat_obj = np.pad(ws_feat_obj, ((0, 0), (0, 1), (0, 0)), 'edge') # align the audio length with video frame + + execution_time = time.time() - start_time + print(f"Extraction Audio Feature: {execution_time:.2f} Seconds") + + audio_driven_obj = ws_feat_obj + else: + print(f'Using audio feature from path: {args.test_hubert_path}') + audio_driven_obj = np.load(args.test_hubert_path) + + frame_start, frame_end = 0, int(audio_driven_obj.shape[1]/2) + audio_start, audio_end = int(frame_start * 2), int(frame_end * 2) # The video frame is fixed to 25 hz and the audio is fixed to 50 hz + + audio_driven = torch.Tensor(audio_driven_obj[:,audio_start:audio_end,:]).unsqueeze(0).float().to(args.device) + #============================ + + # Diffusion Noise + noisyT = th.randn((1,frame_end, args.motion_dim)).to(args.device) + + #======Inputs for Attribute Control========= + if os.path.exists(args.pose_driven_path): + pose_obj = np.load(args.pose_driven_path) + + + if len(pose_obj.shape) != 2: + print('please check your pose information. The shape must be like (T, 3).') + exit(0) + if pose_obj.shape[1] != 3: + print('please check your pose information. 
The shape must be like (T, 3).') + exit(0) + + if pose_obj.shape[0] >= frame_end: + pose_obj = pose_obj[:frame_end,:] + else: + padding = np.tile(pose_obj[-1, :], (frame_end - pose_obj.shape[0], 1)) + pose_obj = np.vstack((pose_obj, padding)) + + pose_signal = torch.Tensor(pose_obj).unsqueeze(0).to(args.device)/ 90 # 90 is for normalization here + else: + yaw_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_yaw + pitch_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_pitch + roll_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_roll + pose_signal = torch.cat((yaw_signal, pitch_signal, roll_signal), dim=-1) + + pose_signal = torch.clamp(pose_signal, -1, 1) + + face_location_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.face_location + face_scae_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.face_scale + #=========================================== + + start_time = time.time() + + #======Diffusion Denosing Process========= + generated_directions = model.render(one_shot_lia_start, one_shot_lia_direction, audio_driven, face_location_signal, face_scae_signal, pose_signal, noisyT, args.step_T, control_flag=args.control_flag) + #========================================= + + execution_time = time.time() - start_time + print(f"Motion Diffusion Model: {execution_time:.2f} Seconds") + + generated_directions = generated_directions.detach().cpu().numpy() + + start_time = time.time() + #======Rendering images frame-by-frame========= + for pred_index in tqdm(range(generated_directions.shape[1])): + ori_img_recon = lia.render(one_shot_lia_start, torch.Tensor(generated_directions[:,pred_index,:]).to(args.device), feats) + ori_img_recon = ori_img_recon.clamp(-1, 1) + wav_pred = (ori_img_recon.detach() + 1) / 2 + saved_image(wav_pred, os.path.join(frames_result_saved_path, "%06d.png"%(pred_index))) + #============================================== + + execution_time = time.time() - start_time + 
print(f"Renderer Model: {execution_time:.2f} Seconds") + + frames_to_video(frames_result_saved_path, args.test_audio_path, predicted_video_256_path) + + shutil.rmtree(frames_result_saved_path) + + + # Enhancer + # Code is modified from https://github.com/OpenTalker/SadTalker/blob/cd4c0465ae0b54a6f85af57f5c65fec9fe23e7f8/src/utils/face_enhancer.py#L26 + + if args.face_sr and check_package_installed('gfpgan'): + from face_sr.face_enhancer import enhancer_list + import imageio + + # Super-resolution + imageio.mimsave(predicted_video_512_path+'.tmp.mp4', enhancer_list(predicted_video_256_path, method='gfpgan', bg_upsampler=None), fps=float(25)) + + # Merge audio and video + video_clip = VideoFileClip(predicted_video_512_path+'.tmp.mp4') + audio_clip = AudioFileClip(predicted_video_256_path) + final_clip = video_clip.set_audio(audio_clip) + final_clip.write_videofile(predicted_video_512_path, fps=float(25), codec='libx264', audio_codec='aac') + + os.remove(predicted_video_512_path+'.tmp.mp4') + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--infer_type', type=str, default='hubert_audio_only', help='mfcc_pose_only or mfcc_full_control') + parser.add_argument('--input_image', type=str, required=True, help='Path to the input image') + parser.add_argument('--input_audio_text', type=str, required=True, help='Path to the input audio or text') + parser.add_argument('--output_dir', type=str, required=True, help='Path to the output directory') + parser.add_argument('--test_hubert_path', type=str, default='test_demos/audios_hubert/monalisa.npy', help='Path to the driven audio(hubert type). 
Not needed for MFCC') + parser.add_argument('--stage1_checkpoint_path', type=str, default='ckpts/stage1.ckpt', help='Path to the checkpoint of Stage1') + parser.add_argument('--stage2_checkpoint_path', type=str, default='ckpts/stage2_audio_only_hubert.ckpt', help='Path to the checkpoint of Stage2') + parser.add_argument('--seed', type=int, default=0, help='seed for generations') + parser.add_argument('--control_flag', action='store_true', help='Whether to use control signal or not') + parser.add_argument('--pose_yaw', type=float, default=0.25, help='range from -1 to 1 (-90 ~ 90 angles)') + parser.add_argument('--pose_pitch', type=float, default=0, help='range from -1 to 1 (-90 ~ 90 angles)') + parser.add_argument('--pose_roll', type=float, default=0, help='range from -1 to 1 (-90 ~ 90 angles)') + parser.add_argument('--face_location', type=float, default=0.5, help='range from 0 to 1 (from left to right)') + parser.add_argument('--pose_driven_path', type=str, default='xxx', help='path to pose numpy, shape is (T, 3). You can check the following code https://github.com/liutaocode/talking_face_preprocessing to extract the yaw, pitch and roll.') + parser.add_argument('--face_scale', type=float, default=0.5, help='range from 0 to 1 (from small to large)') + parser.add_argument('--step_T', type=int, default=50, help='Step T for diffusion denoising process') + parser.add_argument('--image_size', type=int, default=256, help='Size of the image. Do not change.') + parser.add_argument('--device', type=str, default='cuda:0', help='Device for computation') + parser.add_argument('--motion_dim', type=int, default=20, help='Dimension of motion. Do not change.') + parser.add_argument('--decoder_layers', type=int, default=2, help='Layer number for the conformer.') + parser.add_argument('--face_sr', action='store_true', help='Face super-resolution (Optional). 
Please install GFPGAN first') + + + + args = parser.parse_args() + args.test_image_path = args.input_image + args.test_audio_path = args.input_audio_text + args.result_path = args.output_dir + + # macOS Config + # Check if MPS is available + if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): + args.device = torch.device("mps") + print("MPS backend is available.") + + main(args) \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/code/diffusion/__init__.py b/AniTalker-kit/AniTalker/code/diffusion/__init__.py new file mode 100644 index 00000000..4e0838ce --- /dev/null +++ b/AniTalker-kit/AniTalker/code/diffusion/__init__.py @@ -0,0 +1,6 @@ +from typing import Union + +from .diffusion import SpacedDiffusionBeatGans, SpacedDiffusionBeatGansConfig + +Sampler = Union[SpacedDiffusionBeatGans] +SamplerConfig = Union[SpacedDiffusionBeatGansConfig] diff --git a/AniTalker-kit/AniTalker/code/diffusion/base.py b/AniTalker-kit/AniTalker/code/diffusion/base.py new file mode 100644 index 00000000..e20f855f --- /dev/null +++ b/AniTalker-kit/AniTalker/code/diffusion/base.py @@ -0,0 +1,1135 @@ +""" +This code started out as a PyTorch port of Ho et al's diffusion models: +https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py + +Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules. 
+""" + +from model.unet_autoenc import AutoencReturn +from config_base import BaseConfig +import enum +import math + +import numpy as np +import torch as th +from model import * +from model.nn import mean_flat +from typing import NamedTuple, Tuple +from choices import * +from torch.cuda.amp import autocast +import torch.nn.functional as F + +from dataclasses import dataclass + + +@dataclass +class GaussianDiffusionBeatGansConfig(BaseConfig): + gen_type: GenerativeType + betas: Tuple[float] + model_type: ModelType + model_mean_type: ModelMeanType + model_var_type: ModelVarType + loss_type: LossType + rescale_timesteps: bool + fp16: bool + train_pred_xstart_detach: bool = True + + def make_sampler(self): + return GaussianDiffusionBeatGans(self) + + +class GaussianDiffusionBeatGans: + """ + Utilities for training and sampling diffusion models. + + Ported directly from here, and then adapted over time to further experimentation. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42 + + :param betas: a 1-D numpy array of betas for each diffusion timestep, + starting at T and going to 1. + :param model_mean_type: a ModelMeanType determining what the model outputs. + :param model_var_type: a ModelVarType determining how variance is output. + :param loss_type: a LossType determining the loss function to use. + :param rescale_timesteps: if True, pass floating point timesteps into the + model so that they are always scaled like in the + original paper (0 to 1000). + """ + def __init__(self, conf: GaussianDiffusionBeatGansConfig): + self.conf = conf + self.model_mean_type = conf.model_mean_type + self.model_var_type = conf.model_var_type + self.loss_type = conf.loss_type + self.rescale_timesteps = conf.rescale_timesteps + + # Use float64 for accuracy. 
+ betas = np.array(conf.betas, dtype=np.float64) + self.betas = betas + assert len(betas.shape) == 1, "betas must be 1-D" + assert (betas > 0).all() and (betas <= 1).all() + + self.num_timesteps = int(betas.shape[0]) + + alphas = 1.0 - betas + self.alphas_cumprod = np.cumprod(alphas, axis=0) + self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1]) + self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0) + assert self.alphas_cumprod_prev.shape == (self.num_timesteps, ) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod) + self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod) + self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod) + self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - + 1) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + self.posterior_variance = (betas * (1.0 - self.alphas_cumprod_prev) / + (1.0 - self.alphas_cumprod)) + # log calculation clipped because the posterior variance is 0 at the + # beginning of the diffusion chain. + self.posterior_log_variance_clipped = np.log( + np.append(self.posterior_variance[1], self.posterior_variance[1:])) + self.posterior_mean_coef1 = (betas * + np.sqrt(self.alphas_cumprod_prev) / + (1.0 - self.alphas_cumprod)) + self.posterior_mean_coef2 = ((1.0 - self.alphas_cumprod_prev) * + np.sqrt(alphas) / + (1.0 - self.alphas_cumprod)) + + def training_losses(self, + model, + motion_direction_start: th.Tensor, + motion_target: th.Tensor, + motion_start: th.Tensor, + audio_feats: th.Tensor, + face_location: th.Tensor, + face_scale: th.Tensor, + yaw_pitch_roll: th.Tensor, + t: th.Tensor, + model_kwargs=None, + noise: th.Tensor = None): + """ + Compute training losses for a single timestep. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. 
+ :param t: a batch of timestep indices. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param noise: if specified, the specific Gaussian noise to try to remove. + :return: a dict with the key "loss" containing a tensor of shape [N]. + Some mean or variance settings may also have other keys. + """ + if model_kwargs is None: + model_kwargs = {} + if noise is None: + noise = th.randn_like(motion_target) + + x_t = self.q_sample(motion_target, t, noise=noise) + + terms = {'x_t': x_t} + + if self.loss_type in [ + LossType.mse, + LossType.l1, + ]: + with autocast(self.conf.fp16): + # x_t is static wrt. to the diffusion process + predicted_direction, predicted_location, predicted_scale, predicted_pose = model.forward(motion_start, + motion_direction_start, + audio_feats, + face_location, + face_scale, + yaw_pitch_roll, + x_t.detach(), + self._scale_timesteps(t), + control_flag=False) + + + target_types = { + ModelMeanType.eps: noise, + } + target = target_types[self.model_mean_type] + assert predicted_direction.shape == target.shape == motion_target.shape + + if self.loss_type == LossType.mse: + if self.model_mean_type == ModelMeanType.eps: + + direction_loss = mean_flat((target - predicted_direction)**2) + # import pdb;pdb.set_trace() + location_loss = mean_flat((face_location.unsqueeze(-1) - predicted_location)**2) + scale_loss = mean_flat((face_scale - predicted_scale)**2) + pose_loss = mean_flat((yaw_pitch_roll - predicted_pose)**2) + + terms["mse"] = direction_loss + location_loss + scale_loss + pose_loss + + else: + raise NotImplementedError() + elif self.loss_type == LossType.l1: + # (n, c, h, w) => (n, ) + terms["mse"] = mean_flat((target - predicted_direction).abs()) + else: + raise NotImplementedError() + + if "vb" in terms: + # if learning the variance also use the vlb loss + terms["loss"] = terms["mse"] + terms["vb"] + else: + terms["loss"] = terms["mse"] + else: + raise 
NotImplementedError(self.loss_type) + + + return terms + + def sample(self, + model: Model, + shape=None, + noise=None, + cond=None, + x_start=None, + clip_denoised=True, + model_kwargs=None, + progress=False): + """ + Args: + x_start: given for the autoencoder + """ + if model_kwargs is None: + model_kwargs = {} + if self.conf.model_type.has_autoenc(): + model_kwargs['x_start'] = x_start + model_kwargs['cond'] = cond + + if self.conf.gen_type == GenerativeType.ddpm: + return self.p_sample_loop(model, + shape=shape, + noise=noise, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + progress=progress) + elif self.conf.gen_type == GenerativeType.ddim: + return self.ddim_sample_loop(model, + shape=shape, + noise=noise, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + progress=progress) + else: + raise NotImplementedError() + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * + x_start) + variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, + x_start.shape) + log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, + t, x_start.shape) + return mean, variance, log_variance + + def q_sample(self, x_start, t, noise=None): + """ + Diffuse the data for a given number of diffusion steps. + + In other words, sample from q(x_t | x_0). + + :param x_start: the initial data batch. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :param noise: if specified, the split-out normal noise. + :return: A noisy version of x_start. 
+ """ + if noise is None: + noise = th.randn_like(x_start) + assert noise.shape == x_start.shape + return ( + _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * + x_start + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, + t, x_start.shape) * noise) + + def q_posterior_mean_variance(self, x_start, x_t, t): + """ + Compute the mean and variance of the diffusion posterior: + + q(x_{t-1} | x_t, x_0) + + """ + assert x_start.shape == x_t.shape + posterior_mean = ( + _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * + x_start + + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * + x_t) + posterior_variance = _extract_into_tensor(self.posterior_variance, t, + x_t.shape) + posterior_log_variance_clipped = _extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape) + assert (posterior_mean.shape[0] == posterior_variance.shape[0] == + posterior_log_variance_clipped.shape[0] == x_start.shape[0]) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance(self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None): + """ + Apply the model to get p(x_{t-1} | x_t), as well as a prediction of + the initial x, x_0. + + :param model: the model, which takes a signal and a batch of timesteps + as input. + :param x: the [N x C x ...] tensor at time t. + :param t: a 1-D Tensor of timesteps. + :param clip_denoised: if True, clip the denoised signal into [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. Applies before + clip_denoised. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict with the following keys: + - 'mean': the model mean output. + - 'variance': the model variance output. + - 'log_variance': the log of 'variance'. + - 'pred_xstart': the prediction for x_0. 
+ """ + if model_kwargs is None: + model_kwargs = {} + + motion_start = model_kwargs['start'] + audio_feats = model_kwargs['audio_driven'] + face_location = model_kwargs['face_location'] + face_scale = model_kwargs['face_scale'] + yaw_pitch_roll = model_kwargs['yaw_pitch_roll'] + motion_direction_start = model_kwargs['motion_direction_start'] + control_flag = model_kwargs['control_flag'] + + B, C = x.shape[:2] + assert t.shape == (B, ) + with autocast(self.conf.fp16): + model_forward, _, _, _ = model.forward(motion_start, + motion_direction_start, + audio_feats, + face_location, + face_scale, + yaw_pitch_roll, + x, + self._scale_timesteps(t), + control_flag) + model_output = model_forward + + if self.model_var_type in [ + ModelVarType.fixed_large, ModelVarType.fixed_small + ]: + model_variance, model_log_variance = { + # for fixedlarge, we set the initial (log-)variance like so + # to get a better decoder log likelihood. + ModelVarType.fixed_large: ( + np.append(self.posterior_variance[1], self.betas[1:]), + np.log( + np.append(self.posterior_variance[1], self.betas[1:])), + ), + ModelVarType.fixed_small: ( + self.posterior_variance, + self.posterior_log_variance_clipped, + ), + }[self.model_var_type] + model_variance = _extract_into_tensor(model_variance, t, x.shape) + model_log_variance = _extract_into_tensor(model_log_variance, t, + x.shape) + + def process_xstart(x): + if denoised_fn is not None: + x = denoised_fn(x) + if clip_denoised: + return x.clamp(-1, 1) + return x + + if self.model_mean_type in [ + ModelMeanType.eps, + ]: + if self.model_mean_type == ModelMeanType.eps: + pred_xstart = process_xstart( + self._predict_xstart_from_eps(x_t=x, t=t, + eps=model_output)) + else: + raise NotImplementedError() + model_mean, _, _ = self.q_posterior_mean_variance( + x_start=pred_xstart, x_t=x, t=t) + else: + raise NotImplementedError(self.model_mean_type) + + assert (model_mean.shape == model_log_variance.shape == + pred_xstart.shape == x.shape) + return { + 
"mean": model_mean, + "variance": model_variance, + "log_variance": model_log_variance, + "pred_xstart": pred_xstart, + 'model_forward': model_forward, + } + + def _predict_xstart_from_eps(self, x_t, t, eps): + assert x_t.shape == eps.shape + return (_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, + x_t.shape) * x_t - + _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, + x_t.shape) * eps) + + def _predict_xstart_from_xprev(self, x_t, t, xprev): + assert x_t.shape == xprev.shape + return ( # (xprev - coef2*x_t) / coef1 + _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) + * xprev - _extract_into_tensor( + self.posterior_mean_coef2 / self.posterior_mean_coef1, t, + x_t.shape) * x_t) + + def _predict_xstart_from_scaled_xstart(self, t, scaled_xstart): + return scaled_xstart * _extract_into_tensor( + self.sqrt_recip_alphas_cumprod, t, scaled_xstart.shape) + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + return (_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, + x_t.shape) * x_t - + pred_xstart) / _extract_into_tensor( + self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + + def _predict_eps_from_scaled_xstart(self, x_t, t, scaled_xstart): + """ + Args: + scaled_xstart: is supposed to be sqrt(alphacum) * x_0 + """ + # 1 / sqrt(1-alphabar) * (x_t - scaled xstart) + return (x_t - scaled_xstart) / _extract_into_tensor( + self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) + + def _scale_timesteps(self, t): + if self.rescale_timesteps: + # scale t to be maxed out at 1000 steps + return t.float() * (1000.0 / self.num_timesteps) + return t + + def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute the mean for the previous step, given a function cond_fn that + computes the gradient of a conditional log probability with respect to + x. In particular, cond_fn computes grad(log(p(y|x))), and we want to + condition on y. + + This uses the conditioning strategy from Sohl-Dickstein et al. (2015). 
+ """ + gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs) + new_mean = (p_mean_var["mean"].float() + + p_mean_var["variance"] * gradient.float()) + return new_mean + + def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None): + """ + Compute what the p_mean_variance output would have been, should the + model's score function be conditioned by cond_fn. + + See condition_mean() for details on cond_fn. + + Unlike condition_mean(), this instead uses the conditioning strategy + from Song et al (2020). + """ + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + + eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"]) + eps = eps - (1 - alpha_bar).sqrt() * cond_fn( + x, self._scale_timesteps(t), **model_kwargs) + + out = p_mean_var.copy() + out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps) + out["mean"], _, _ = self.q_posterior_mean_variance( + x_start=out["pred_xstart"], x_t=x, t=t) + return out + + def p_sample( + self, + model: Model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + ): + """ + Sample x_{t-1} from the model at the given timestep. + + :param model: the model to sample from. + :param x: the current tensor at x_{t-1}. + :param t: the value of t, starting at 0 for the first diffusion step. + :param clip_denoised: if True, clip the x_start prediction to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :return: a dict containing the following keys: + - 'sample': a random sample from the model. + - 'pred_xstart': a prediction of x_0. 
+ """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + noise = th.randn_like(x) + nonzero_mask = ((t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + if cond_fn is not None: + out["mean"] = self.condition_mean(cond_fn, + out, + x, + t, + model_kwargs=model_kwargs) + sample = out["mean"] + nonzero_mask * th.exp( + 0.5 * out["log_variance"]) * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def p_sample_loop( + self, + model: Model, + shape=None, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + ): + """ + Generate samples from the model. + + :param model: the model module. + :param shape: the shape of the samples, (N, C, H, W). + :param noise: if specified, the noise from the encoder to sample. + Should be of the same shape as `shape`. + :param clip_denoised: if True, clip x_start predictions to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param cond_fn: if not None, this is a gradient function that acts + similarly to the model. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. + :param device: if specified, the device to create the samples on. + If not specified, use a model parameter's device. + :param progress: if True, show a tqdm progress bar. + :return: a non-differentiable batch of samples. 
+ """ + final = None + for sample in self.p_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + ): + final = sample + return final["sample"] + + def p_sample_loop_progressive( + self, + model: Model, + shape=None, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + ): + """ + Generate samples from the model and yield intermediate samples from + each timestep of diffusion. + + Arguments are the same as p_sample_loop(). + Returns a generator over dicts, where each dict is the return value of + p_sample(). + """ + if device is None: + device = next(model.parameters()).device + if noise is not None: + img = noise + else: + assert isinstance(shape, (tuple, list)) + img = th.randn(*shape, device=device) + indices = list(range(self.num_timesteps))[::-1] + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices) + + for i in indices: + # t = th.tensor([i] * shape[0], device=device) + t = th.tensor([i] * len(img), device=device) + with th.no_grad(): + out = self.p_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + ) + yield out + img = out["sample"] + + def ddim_sample( + self, + model: Model, + x, + t, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). 
+ """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + if cond_fn is not None: + out = self.condition_score(cond_fn, + out, + x, + t, + model_kwargs=model_kwargs) + + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, + x.shape) + sigma = (eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * + th.sqrt(1 - alpha_bar / alpha_bar_prev)) + # Equation 12. + noise = th.randn_like(x) + mean_pred = (out["pred_xstart"] * th.sqrt(alpha_bar_prev) + + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps) + nonzero_mask = ((t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) + ) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def ddim_reverse_sample( + self, + model: Model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t+1} from the model using DDIM reverse ODE. + NOTE: never used ? + """ + assert eta == 0.0, "Reverse ODE only for deterministic path" + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = (_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) + * x - out["pred_xstart"]) / _extract_into_tensor( + self.sqrt_recipm1_alphas_cumprod, t, x.shape) + alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, + x.shape) + + # Equation 12. 
reversed (DDIM paper) (th.sqrt == torch.sqrt) + mean_pred = (out["pred_xstart"] * th.sqrt(alpha_bar_next) + + th.sqrt(1 - alpha_bar_next) * eps) + + return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} + + def ddim_reverse_sample_loop( + self, + model: Model, + x, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + device=None, + ): + if device is None: + device = next(model.parameters()).device + sample_t = [] + xstart_t = [] + T = [] + indices = list(range(self.num_timesteps)) + sample = x + for i in indices: + t = th.tensor([i] * len(sample), device=device) + with th.no_grad(): + out = self.ddim_reverse_sample(model, + sample, + t=t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + eta=eta) + sample = out['sample'] + # [1, ..., T] + sample_t.append(sample) + # [0, ...., T-1] + xstart_t.append(out['pred_xstart']) + # [0, ..., T-1] ready to use + T.append(t) + + return { + # xT " + 'sample': sample, + # (1, ..., T) + 'sample_t': sample_t, + # xstart here is a bit different from sampling from T = T-1 to T = 0 + # may not be exact + 'xstart_t': xstart_t, + 'T': T, + } + + def ddim_sample_loop( + self, + model: Model, + shape=None, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + ): + """ + Generate samples from the model using DDIM. + + Same usage as p_sample_loop(). 
+ """ + final = None + for sample in self.ddim_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + eta=eta, + ): + final = sample + return final["sample"] + + def ddim_sample_loop_progressive( + self, + model: Model, + shape=None, + noise=None, + clip_denoised=True, + denoised_fn=None, + cond_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + ): + """ + Use DDIM to sample from the model and yield intermediate samples from + each timestep of DDIM. + + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = next(model.parameters()).device + if noise is not None: + img = noise + else: + assert isinstance(shape, (tuple, list)) + img = th.randn(*shape, device=device) + indices = list(range(self.num_timesteps))[::-1] + + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices) + + for i in indices: + + if isinstance(model_kwargs, list): + # index dependent model kwargs + # (T-1, ..., 0) + _kwargs = model_kwargs[i] + else: + _kwargs = model_kwargs + + t = th.tensor([i] * len(img), device=device) + with th.no_grad(): + out = self.ddim_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + cond_fn=cond_fn, + model_kwargs=_kwargs, + eta=eta, + ) + out['t'] = t + yield out + img = out["sample"] + + def _vb_terms_bpd(self, + model: Model, + x_start, + x_t, + t, + clip_denoised=True, + model_kwargs=None): + """ + Get a term for the variational lower-bound. + + The resulting units are bits (rather than nats, as one might expect). + This allows for comparison to other papers. + + :return: a dict with the following keys: + - 'output': a shape [N] tensor of NLLs or KLs. + - 'pred_xstart': the x_0 predictions. 
+ """ + true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance( + x_start=x_start, x_t=x_t, t=t) + out = self.p_mean_variance(model, + x_t, + t, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs) + kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], + out["log_variance"]) + kl = mean_flat(kl) / np.log(2.0) + + decoder_nll = -discretized_gaussian_log_likelihood( + x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]) + assert decoder_nll.shape == x_start.shape + decoder_nll = mean_flat(decoder_nll) / np.log(2.0) + + # At the first timestep return the decoder NLL, + # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) + output = th.where((t == 0), decoder_nll, kl) + return { + "output": output, + "pred_xstart": out["pred_xstart"], + 'model_forward': out['model_forward'], + } + + def _prior_bpd(self, x_start): + """ + Get the prior KL term for the variational lower-bound, measured in + bits-per-dim. + + This term can't be optimized, as it only depends on the encoder. + + :param x_start: the [N x C x ...] tensor of inputs. + :return: a batch of [N] KL values (in bits), one per batch element. + """ + batch_size = x_start.shape[0] + t = th.tensor([self.num_timesteps - 1] * batch_size, + device=x_start.device) + qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) + kl_prior = normal_kl(mean1=qt_mean, + logvar1=qt_log_variance, + mean2=0.0, + logvar2=0.0) + return mean_flat(kl_prior) / np.log(2.0) + + def calc_bpd_loop(self, + model: Model, + x_start, + clip_denoised=True, + model_kwargs=None): + """ + Compute the entire variational lower-bound, measured in bits-per-dim, + as well as other related quantities. + + :param model: the model to evaluate loss on. + :param x_start: the [N x C x ...] tensor of inputs. + :param clip_denoised: if True, clip denoised samples. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. 
+ + :return: a dict containing the following keys: + - total_bpd: the total variational lower-bound, per batch element. + - prior_bpd: the prior term in the lower-bound. + - vb: an [N x T] tensor of terms in the lower-bound. + - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep. + - mse: an [N x T] tensor of epsilon MSEs for each timestep. + """ + device = x_start.device + batch_size = x_start.shape[0] + + vb = [] + xstart_mse = [] + mse = [] + for t in list(range(self.num_timesteps))[::-1]: + t_batch = th.tensor([t] * batch_size, device=device) + noise = th.randn_like(x_start) + x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise) + # Calculate VLB term at the current timestep + with th.no_grad(): + out = self._vb_terms_bpd( + model, + x_start=x_start, + x_t=x_t, + t=t_batch, + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + ) + vb.append(out["output"]) + xstart_mse.append(mean_flat((out["pred_xstart"] - x_start)**2)) + eps = self._predict_eps_from_xstart(x_t, t_batch, + out["pred_xstart"]) + mse.append(mean_flat((eps - noise)**2)) + + vb = th.stack(vb, dim=1) + xstart_mse = th.stack(xstart_mse, dim=1) + mse = th.stack(mse, dim=1) + + prior_bpd = self._prior_bpd(x_start) + total_bpd = vb.sum(dim=1) + prior_bpd + return { + "total_bpd": total_bpd, + "prior_bpd": prior_bpd, + "vb": vb, + "xstart_mse": xstart_mse, + "mse": mse, + } + + +def _extract_into_tensor(arr, timesteps, broadcast_shape): + """ + Extract values from a 1-D numpy array for a batch of indices. + + :param arr: the 1-D numpy array. + :param timesteps: a tensor of indices into the array to extract. + :param broadcast_shape: a larger shape of K dimensions with the batch + dimension equal to the length of timesteps. + :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. 
+ """ + + if hasattr(th.backends, 'mps') and th.backends.mps.is_available(): + arr = arr.astype(np.float32) + # Convert the numpy array to a tensor and then move to the device + res = th.from_numpy(arr).to(device=timesteps.device)[timesteps] + else: + res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + + while len(res.shape) < len(broadcast_shape): + res = res[..., None] + return res.expand(broadcast_shape) + + +def get_named_beta_schedule(schedule_name, num_diffusion_timesteps): + """ + Get a pre-defined beta schedule for the given name. + + The beta schedule library consists of beta schedules which remain similar + in the limit of num_diffusion_timesteps. + Beta schedules may be added, but should not be removed or changed once + they are committed to maintain backwards compatibility. + """ + if schedule_name == "linear": + # Linear schedule from Ho et al, extended to work for any number of + # diffusion steps. + scale = 1000 / num_diffusion_timesteps + beta_start = scale * 0.0001 + beta_end = scale * 0.02 + return np.linspace(beta_start, + beta_end, + num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "cosine": + return betas_for_alpha_bar( + num_diffusion_timesteps, + lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2)**2, + ) + elif schedule_name == "const0.01": + scale = 1000 / num_diffusion_timesteps + return np.array([scale * 0.01] * num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "const0.015": + scale = 1000 / num_diffusion_timesteps + return np.array([scale * 0.015] * num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "const0.008": + scale = 1000 / num_diffusion_timesteps + return np.array([scale * 0.008] * num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "const0.0065": + scale = 1000 / num_diffusion_timesteps + return np.array([scale * 0.0065] * num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "const0.0055": + scale = 1000 / 
num_diffusion_timesteps + return np.array([scale * 0.0055] * num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "const0.0045": + scale = 1000 / num_diffusion_timesteps + return np.array([scale * 0.0045] * num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "const0.0035": + scale = 1000 / num_diffusion_timesteps + return np.array([scale * 0.0035] * num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "const0.0025": + scale = 1000 / num_diffusion_timesteps + return np.array([scale * 0.0025] * num_diffusion_timesteps, + dtype=np.float64) + elif schedule_name == "const0.0015": + scale = 1000 / num_diffusion_timesteps + return np.array([scale * 0.0015] * num_diffusion_timesteps, + dtype=np.float64) + else: + raise NotImplementedError(f"unknown beta schedule: {schedule_name}") + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. + """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + Compute the KL divergence between two gaussians. + + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. 
+ """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, th.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for th.exp(). + logvar1, logvar2 = [ + x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + + ((mean1 - mean2)**2) * th.exp(-logvar2)) + + +def approx_standard_normal_cdf(x): + """ + A fast approximation of the cumulative distribution function of the + standard normal. + """ + return 0.5 * ( + 1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3)))) + + +def discretized_gaussian_log_likelihood(x, *, means, log_scales): + """ + Compute the log-likelihood of a Gaussian distribution discretizing to a + given image. + + :param x: the target images. It is assumed that this was uint8 values, + rescaled to the range [-1, 1]. + :param means: the Gaussian mean Tensor. + :param log_scales: the Gaussian log stddev Tensor. + :return: a tensor like x of log probabilities (in nats). 
+ """ + assert x.shape == means.shape == log_scales.shape + centered_x = x - means + inv_stdv = th.exp(-log_scales) + plus_in = inv_stdv * (centered_x + 1.0 / 255.0) + cdf_plus = approx_standard_normal_cdf(plus_in) + min_in = inv_stdv * (centered_x - 1.0 / 255.0) + cdf_min = approx_standard_normal_cdf(min_in) + log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12)) + log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12)) + cdf_delta = cdf_plus - cdf_min + log_probs = th.where( + x < -0.999, + log_cdf_plus, + th.where(x > 0.999, log_one_minus_cdf_min, + th.log(cdf_delta.clamp(min=1e-12))), + ) + assert log_probs.shape == x.shape + return log_probs + + +class DummyModel(th.nn.Module): + def __init__(self, pred): + super().__init__() + self.pred = pred + + def forward(self, *args, **kwargs): + return DummyReturn(pred=self.pred) + + +class DummyReturn(NamedTuple): + pred: th.Tensor \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/code/diffusion/diffusion.py b/AniTalker-kit/AniTalker/code/diffusion/diffusion.py new file mode 100644 index 00000000..d960c904 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/diffusion/diffusion.py @@ -0,0 +1,156 @@ +from .base import * +from dataclasses import dataclass + + +def space_timesteps(num_timesteps, section_counts): + """ + Create a list of timesteps to use from an original diffusion process, + given the number of timesteps we want to take from equally-sized portions + of the original process. + + For example, if there's 300 timesteps and the section counts are [10,15,20] + then the first 100 timesteps are strided to be 10 timesteps, the second 100 + are strided to be 15 timesteps, and the final 100 are strided to be 20. + + If the stride is a string starting with "ddim", then the fixed striding + from the DDIM paper is used, and only one section is allowed. + + :param num_timesteps: the number of diffusion steps in the original + process to divide up. 
+ :param section_counts: either a list of numbers, or a string containing + comma-separated numbers, indicating the step count + per section. As a special case, use "ddimN" where N + is a number of steps to use the striding from the + DDIM paper. + :return: a set of diffusion steps from the original process to use. + """ + if isinstance(section_counts, str): + if section_counts.startswith("ddim"): + desired_count = int(section_counts[len("ddim"):]) + for i in range(1, num_timesteps): + if len(range(0, num_timesteps, i)) == desired_count: + return set(range(0, num_timesteps, i)) + raise ValueError( + f"cannot create exactly {num_timesteps} steps with an integer stride" + ) + section_counts = [int(x) for x in section_counts.split(",")] + size_per = num_timesteps // len(section_counts) + extra = num_timesteps % len(section_counts) + start_idx = 0 + all_steps = [] + for i, section_count in enumerate(section_counts): + size = size_per + (1 if i < extra else 0) + if size < section_count: + raise ValueError( + f"cannot divide section of {size} steps into {section_count}") + if section_count <= 1: + frac_stride = 1 + else: + frac_stride = (size - 1) / (section_count - 1) + cur_idx = 0.0 + taken_steps = [] + for _ in range(section_count): + taken_steps.append(start_idx + round(cur_idx)) + cur_idx += frac_stride + all_steps += taken_steps + start_idx += size + return set(all_steps) + + +@dataclass +class SpacedDiffusionBeatGansConfig(GaussianDiffusionBeatGansConfig): + use_timesteps: Tuple[int] = None + + def make_sampler(self): + return SpacedDiffusionBeatGans(self) + + +class SpacedDiffusionBeatGans(GaussianDiffusionBeatGans): + """ + A diffusion process which can skip steps in a base diffusion process. + + :param use_timesteps: a collection (sequence or set) of timesteps from the + original diffusion process to retain. + :param kwargs: the kwargs to create the base diffusion process. 
+ """ + def __init__(self, conf: SpacedDiffusionBeatGansConfig): + self.conf = conf + self.use_timesteps = set(conf.use_timesteps) + # how the new t's mapped to the old t's + self.timestep_map = [] + self.original_num_steps = len(conf.betas) + + base_diffusion = GaussianDiffusionBeatGans(conf) # pylint: disable=missing-kwoa + last_alpha_cumprod = 1.0 + new_betas = [] + for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod): + if i in self.use_timesteps: + # getting the new betas of the new timesteps + new_betas.append(1 - alpha_cumprod / last_alpha_cumprod) + last_alpha_cumprod = alpha_cumprod + self.timestep_map.append(i) + conf.betas = np.array(new_betas) + super().__init__(conf) + + def p_mean_variance(self, model: Model, *args, **kwargs): # pylint: disable=signature-differs + return super().p_mean_variance(self._wrap_model(model), *args, + **kwargs) + + def training_losses(self, model: Model, *args, **kwargs): # pylint: disable=signature-differs + return super().training_losses(self._wrap_model(model), *args, + **kwargs) + + def condition_mean(self, cond_fn, *args, **kwargs): + return super().condition_mean(self._wrap_model(cond_fn), *args, + **kwargs) + + def condition_score(self, cond_fn, *args, **kwargs): + return super().condition_score(self._wrap_model(cond_fn), *args, + **kwargs) + + def _wrap_model(self, model: Model): + if isinstance(model, _WrappedModel): + return model + return _WrappedModel(model, self.timestep_map, self.rescale_timesteps, + self.original_num_steps) + + def _scale_timesteps(self, t): + # Scaling is done by the wrapped model. + return t + + +class _WrappedModel: + """ + converting the supplied t's to the old t's scales. 
+ """ + def __init__(self, model, timestep_map, rescale_timesteps, + original_num_steps): + self.model = model + self.timestep_map = timestep_map + self.rescale_timesteps = rescale_timesteps + self.original_num_steps = original_num_steps + + def forward(self,motion_start, motion_direction_start, audio_feats,face_location, face_scale,yaw_pitch_roll, x_t, t, control_flag=False): + """ + Args: + t: t's with differrent ranges (can be << T due to smaller eval T) need to be converted to the original t's + t_cond: the same as t but can be of different values + """ + map_tensor = th.tensor(self.timestep_map, + device=t.device, + dtype=t.dtype) + + def do(t): + new_ts = map_tensor[t] + if self.rescale_timesteps: + new_ts = new_ts.float() * (1000.0 / self.original_num_steps) + return new_ts + + return self.model(motion_start, motion_direction_start, audio_feats,face_location, face_scale,yaw_pitch_roll, x_t,do(t), control_flag=control_flag) + + def __getattr__(self, name): + # allow for calling the model's methods + if hasattr(self.model, name): + func = getattr(self.model, name) + return func + raise AttributeError(name) diff --git a/AniTalker-kit/AniTalker/code/diffusion/resample.py b/AniTalker-kit/AniTalker/code/diffusion/resample.py new file mode 100644 index 00000000..15c3c097 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/diffusion/resample.py @@ -0,0 +1,63 @@ +from abc import ABC, abstractmethod + +import numpy as np +import torch as th +import torch.distributed as dist + + +def create_named_schedule_sampler(name, diffusion): + """ + Create a ScheduleSampler from a library of pre-defined samplers. + + :param name: the name of the sampler. + :param diffusion: the diffusion object to sample for. 
+ """ + if name == "uniform": + return UniformSampler(diffusion) + else: + raise NotImplementedError(f"unknown schedule sampler: {name}") + + +class ScheduleSampler(ABC): + """ + A distribution over timesteps in the diffusion process, intended to reduce + variance of the objective. + + By default, samplers perform unbiased importance sampling, in which the + objective's mean is unchanged. + However, subclasses may override sample() to change how the resampled + terms are reweighted, allowing for actual changes in the objective. + """ + @abstractmethod + def weights(self): + """ + Get a numpy array of weights, one per diffusion step. + + The weights needn't be normalized, but must be positive. + """ + + def sample(self, batch_size, device): + """ + Importance-sample timesteps for a batch. + + :param batch_size: the number of timesteps. + :param device: the torch device to save to. + :return: a tuple (timesteps, weights): + - timesteps: a tensor of timestep indices. + - weights: a tensor of weights to scale the resulting losses. 
+ """ + w = self.weights() + p = w / np.sum(w) + indices_np = np.random.choice(len(p), size=(batch_size, ), p=p) + indices = th.from_numpy(indices_np).long().to(device) + weights_np = 1 / (len(p) * p[indices_np]) + weights = th.from_numpy(weights_np).float().to(device) + return indices, weights + + +class UniformSampler(ScheduleSampler): + def __init__(self, num_timesteps): + self._weights = np.ones([num_timesteps]) + + def weights(self): + return self._weights diff --git a/AniTalker-kit/AniTalker/code/dist_utils.py b/AniTalker-kit/AniTalker/code/dist_utils.py new file mode 100644 index 00000000..88bb4467 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/dist_utils.py @@ -0,0 +1,42 @@ +from typing import List +from torch import distributed + + +def barrier(): + if distributed.is_initialized(): + distributed.barrier() + else: + pass + + +def broadcast(data, src): + if distributed.is_initialized(): + distributed.broadcast(data, src) + else: + pass + + +def all_gather(data: List, src): + if distributed.is_initialized(): + distributed.all_gather(data, src) + else: + data[0] = src + + +def get_rank(): + if distributed.is_initialized(): + return distributed.get_rank() + else: + return 0 + + +def get_world_size(): + if distributed.is_initialized(): + return distributed.get_world_size() + else: + return 1 + + +def chunk_size(size, rank, world_size): + extra = rank < size % world_size + return size // world_size + extra \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/code/experiment.py b/AniTalker-kit/AniTalker/code/experiment.py new file mode 100644 index 00000000..b09e26d9 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/experiment.py @@ -0,0 +1,356 @@ +import copy +import os + +import numpy as np +import pytorch_lightning as pl +import torch +from pytorch_lightning import loggers as pl_loggers +from pytorch_lightning.callbacks import * +from torch.cuda import amp +from torch.optim.optimizer import Optimizer +from torch.utils.data.dataset import 
TensorDataset +from model.seq2seq import DiffusionPredictor + +from config import * +from dist_utils import * +from renderer import * + +# This part is modified from: https://github.com/phizaz/diffae/blob/master/experiment.py +class LitModel(pl.LightningModule): + def __init__(self, conf: TrainConfig): + super().__init__() + assert conf.train_mode != TrainMode.manipulate + if conf.seed is not None: + pl.seed_everything(conf.seed) + + self.save_hyperparameters(conf.as_dict_jsonable()) + + self.conf = conf + + self.model = DiffusionPredictor(conf) + + self.ema_model = copy.deepcopy(self.model) + self.ema_model.requires_grad_(False) + self.ema_model.eval() + + self.sampler = conf.make_diffusion_conf().make_sampler() + self.eval_sampler = conf.make_eval_diffusion_conf().make_sampler() + + # this is shared for both model and latent + self.T_sampler = conf.make_T_sampler() + + if conf.train_mode.use_latent_net(): + self.latent_sampler = conf.make_latent_diffusion_conf( + ).make_sampler() + self.eval_latent_sampler = conf.make_latent_eval_diffusion_conf( + ).make_sampler() + else: + self.latent_sampler = None + self.eval_latent_sampler = None + + # initial variables for consistent sampling + self.register_buffer( + 'x_T', + torch.randn(conf.sample_size, 3, conf.img_size, conf.img_size)) + + + def render(self, start, motion_direction_start, audio_driven, face_location, face_scale, ypr_info, noisyT, step_T, control_flag): + if step_T is None: + sampler = self.eval_sampler + else: + sampler = self.conf._make_diffusion_conf(step_T).make_sampler() + + pred_img = render_condition(self.conf, + self.ema_model, + sampler, start, motion_direction_start, audio_driven, face_location, face_scale, ypr_info, noisyT, control_flag) + return pred_img + + def forward(self, noise=None, x_start=None, ema_model: bool = False): + with amp.autocast(False): + if not self.disable_ema: + model = self.ema_model + else: + model = self.model + gen = self.eval_sampler.sample(model=model, + noise=noise, 
+ x_start=x_start) + return gen + + def setup(self, stage=None) -> None: + """ + make datasets & seeding each worker separately + """ + ############################################## + # NEED TO SET THE SEED SEPARATELY HERE + if self.conf.seed is not None: + seed = self.conf.seed * get_world_size() + self.global_rank + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + print('local seed:', seed) + ############################################## + + self.train_data = self.conf.make_dataset() + print('train data:', len(self.train_data)) + self.val_data = self.train_data + print('val data:', len(self.val_data)) + + def _train_dataloader(self, drop_last=True): + """ + really make the dataloader + """ + # make sure to use the fraction of batch size + # the batch size is global! + conf = self.conf.clone() + conf.batch_size = self.batch_size + + dataloader = conf.make_loader(self.train_data, + shuffle=True, + drop_last=drop_last) + return dataloader + + def train_dataloader(self): + """ + return the dataloader, if diffusion mode => return image dataset + if latent mode => return the inferred latent dataset + """ + print('on train dataloader start ...') + if self.conf.train_mode.require_dataset_infer(): + if self.conds is None: + # usually we load self.conds from a file + # so we do not need to do this again! + self.conds = self.infer_whole_dataset() + # need to use float32! unless the mean & std will be off! 
+ # (1, c) + self.conds_mean.data = self.conds.float().mean(dim=0, + keepdim=True) + self.conds_std.data = self.conds.float().std(dim=0, + keepdim=True) + print('mean:', self.conds_mean.mean(), 'std:', + self.conds_std.mean()) + + # return the dataset with pre-calculated conds + conf = self.conf.clone() + conf.batch_size = self.batch_size + data = TensorDataset(self.conds) + return conf.make_loader(data, shuffle=True) + else: + return self._train_dataloader() + + @property + def batch_size(self): + """ + local batch size for each worker + """ + ws = get_world_size() + assert self.conf.batch_size % ws == 0 + return self.conf.batch_size // ws + + @property + def num_samples(self): + """ + (global) batch size * iterations + """ + # batch size here is global! + # global_step already takes into account the accum batches + return self.global_step * self.conf.batch_size_effective + + def is_last_accum(self, batch_idx): + """ + is it the last gradient accumulation loop? + used with gradient_accum > 1 and to see if the optimizer will perform "step" in this iteration or not + """ + return (batch_idx + 1) % self.conf.accum_batches == 0 + + def training_step(self, batch, batch_idx): + """ + given an input, calculate the loss function + no optimization at this stage. + """ + with amp.autocast(False): + motion_start = batch['motion_start'] # torch.Size([B, 512]) + motion_direction = batch['motion_direction'] # torch.Size([B, 125, 20]) + audio_feats = batch['audio_feats'].float() # torch.Size([B, 25, 250, 1024]) + face_location = batch['face_location'].float() # torch.Size([B, 125]) + face_scale = batch['face_scale'].float() # torch.Size([B, 125, 1]) + yaw_pitch_roll = batch['yaw_pitch_roll'].float() # torch.Size([B, 125, 3]) + motion_direction_start = batch['motion_direction_start'].float() # torch.Size([B, 20]) + + # import pdb; pdb.set_trace() + if self.conf.train_mode == TrainMode.diffusion: + """ + main training mode!!! 
+ """ + # with numpy seed we have the problem that the sample t's are related! + t, weight = self.T_sampler.sample(len(motion_start), motion_start.device) + losses = self.sampler.training_losses(model=self.model, + motion_direction_start=motion_direction_start, + motion_target=motion_direction, + motion_start=motion_start, + audio_feats=audio_feats, + face_location=face_location, + face_scale=face_scale, + yaw_pitch_roll=yaw_pitch_roll, + t=t) + else: + raise NotImplementedError() + + loss = losses['loss'].mean() + # divide by accum batches to make the accumulated gradient exact! + for key in losses.keys(): + losses[key] = self.all_gather(losses[key]).mean() + + if self.global_rank == 0: + self.logger.experiment.add_scalar('loss', losses['loss'], + self.num_samples) + for key in losses: + self.logger.experiment.add_scalar( + f'loss/{key}', losses[key], self.num_samples) + + return {'loss': loss} + + def on_train_batch_end(self, outputs, batch, batch_idx: int, + dataloader_idx: int) -> None: + """ + after each training step ... 
+ """ + if self.is_last_accum(batch_idx): + + if self.conf.train_mode == TrainMode.latent_diffusion: + # it trains only the latent hence change only the latent + ema(self.model.latent_net, self.ema_model.latent_net, + self.conf.ema_decay) + else: + ema(self.model, self.ema_model, self.conf.ema_decay) + + def on_before_optimizer_step(self, optimizer: Optimizer, + optimizer_idx: int) -> None: + # fix the fp16 + clip grad norm problem with pytorch lightinng + # this is the currently correct way to do it + if self.conf.grad_clip > 0: + # from trainer.params_grads import grads_norm, iter_opt_params + params = [ + p for group in optimizer.param_groups for p in group['params'] + ] + torch.nn.utils.clip_grad_norm_(params, + max_norm=self.conf.grad_clip) + def configure_optimizers(self): + out = {} + if self.conf.optimizer == OptimizerType.adam: + optim = torch.optim.Adam(self.model.parameters(), + lr=self.conf.lr, + weight_decay=self.conf.weight_decay) + elif self.conf.optimizer == OptimizerType.adamw: + optim = torch.optim.AdamW(self.model.parameters(), + lr=self.conf.lr, + weight_decay=self.conf.weight_decay) + else: + raise NotImplementedError() + out['optimizer'] = optim + if self.conf.warmup > 0: + sched = torch.optim.lr_scheduler.LambdaLR(optim, + lr_lambda=WarmupLR( + self.conf.warmup)) + out['lr_scheduler'] = { + 'scheduler': sched, + 'interval': 'step', + } + return out + + def split_tensor(self, x): + """ + extract the tensor for a corresponding "worker" in the batch dimension + + Args: + x: (n, c) + + Returns: x: (n_local, c) + """ + n = len(x) + rank = self.global_rank + world_size = get_world_size() + # print(f'rank: {rank}/{world_size}') + per_rank = n // world_size + return x[rank * per_rank:(rank + 1) * per_rank] + +def ema(source, target, decay): + source_dict = source.state_dict() + target_dict = target.state_dict() + for key in source_dict.keys(): + target_dict[key].data.copy_(target_dict[key].data * decay + + source_dict[key].data * (1 - decay)) + + 
+class WarmupLR: + def __init__(self, warmup) -> None: + self.warmup = warmup + + def __call__(self, step): + return min(step, self.warmup) / self.warmup + + +def is_time(num_samples, every, step_size): + closest = (num_samples // every) * every + return num_samples - closest < step_size + + +def train(conf: TrainConfig, gpus, nodes=1, mode: str = 'train'): + print('conf:', conf.name) + # assert not (conf.fp16 and conf.grad_clip > 0 + # ), 'pytorch lightning has bug with amp + gradient clipping' + model = LitModel(conf) + + if not os.path.exists(conf.logdir): + os.makedirs(conf.logdir) + checkpoint = ModelCheckpoint(dirpath=f'{conf.logdir}', + save_last=True, + save_top_k=-1, + every_n_epochs=10) + checkpoint_path = f'{conf.logdir}/last.ckpt' + print('ckpt path:', checkpoint_path) + if os.path.exists(checkpoint_path): + resume = checkpoint_path + print('resume!') + else: + if conf.continue_from is not None: + # continue from a checkpoint + resume = conf.continue_from.pathcd + else: + resume = None + + tb_logger = pl_loggers.TensorBoardLogger(save_dir=conf.logdir, + name=None, + version='') + + # from pytorch_lightning. 
+ + plugins = [] + if len(gpus) == 1 and nodes == 1: + accelerator = None + else: + accelerator = 'ddp' + from pytorch_lightning.plugins import DDPPlugin + + # important for working with gradient checkpoint + plugins.append(DDPPlugin(find_unused_parameters=True)) + + trainer = pl.Trainer( + max_steps=conf.total_samples // conf.batch_size_effective, + resume_from_checkpoint=resume, + gpus=gpus, + num_nodes=nodes, + accelerator=accelerator, + precision=16 if conf.fp16 else 32, + callbacks=[ + checkpoint, + LearningRateMonitor(), + ], + # clip in the model instead + # gradient_clip_val=conf.grad_clip, + replace_sampler_ddp=True, + logger=tb_logger, + accumulate_grad_batches=conf.accum_batches, + plugins=plugins, + ) + + trainer.fit(model) diff --git a/AniTalker-kit/AniTalker/code/face_sr/face_enhancer.py b/AniTalker-kit/AniTalker/code/face_sr/face_enhancer.py new file mode 100644 index 00000000..c09c4dae --- /dev/null +++ b/AniTalker-kit/AniTalker/code/face_sr/face_enhancer.py @@ -0,0 +1,123 @@ +import os +import torch + +from gfpgan import GFPGANer + +from tqdm import tqdm + +from .videoio import load_video_to_cv2 + +import cv2 + + +class GeneratorWithLen(object): + """ From https://stackoverflow.com/a/7460929 """ + + def __init__(self, gen, length): + self.gen = gen + self.length = length + + def __len__(self): + return self.length + + def __iter__(self): + return self.gen + +def enhancer_list(images, method='gfpgan', bg_upsampler='realesrgan'): + gen = enhancer_generator_no_len(images, method=method, bg_upsampler=bg_upsampler) + return list(gen) + +def enhancer_generator_with_len(images, method='gfpgan', bg_upsampler='realesrgan'): + """ Provide a generator with a __len__ method so that it can passed to functions that + call len()""" + + if os.path.isfile(images): # handle video to images + # TODO: Create a generator version of load_video_to_cv2 + images = load_video_to_cv2(images) + + gen = enhancer_generator_no_len(images, method=method, 
bg_upsampler=bg_upsampler) + gen_with_len = GeneratorWithLen(gen, len(images)) + return gen_with_len + +def enhancer_generator_no_len(images, method='gfpgan', bg_upsampler='realesrgan'): + """ Provide a generator function so that all of the enhanced images don't need + to be stored in memory at the same time. This can save tons of RAM compared to + the enhancer function. """ + + print('face enhancer....') + if not isinstance(images, list) and os.path.isfile(images): # handle video to images + images = load_video_to_cv2(images) + + # ------------------------ set up GFPGAN restorer ------------------------ + if method == 'gfpgan': + arch = 'clean' + channel_multiplier = 2 + model_name = 'GFPGANv1.4' + url = 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth' + elif method == 'RestoreFormer': + arch = 'RestoreFormer' + channel_multiplier = 2 + model_name = 'RestoreFormer' + url = 'https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/RestoreFormer.pth' + elif method == 'codeformer': # TODO: + arch = 'CodeFormer' + channel_multiplier = 2 + model_name = 'CodeFormer' + url = 'https://github.com/sczhou/CodeFormer/releases/download/v0.1.0/codeformer.pth' + else: + raise ValueError(f'Wrong model version {method}.') + + + # ------------------------ set up background upsampler ------------------------ + if bg_upsampler == 'realesrgan': + if not torch.cuda.is_available(): # CPU + import warnings + warnings.warn('The unoptimized RealESRGAN is slow on CPU. We do not use it. 
' + 'If you really want to use it, please modify the corresponding codes.') + bg_upsampler = None + else: + from basicsr.archs.rrdbnet_arch import RRDBNet + from realesrgan import RealESRGANer + model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2) + bg_upsampler = RealESRGANer( + scale=2, + model_path='https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.1/RealESRGAN_x2plus.pth', + model=model, + tile=400, + tile_pad=10, + pre_pad=0, + half=True) # need to set False in CPU mode + else: + bg_upsampler = None + + # determine model paths + model_path = os.path.join('gfpgan/weights', model_name + '.pth') + + if not os.path.isfile(model_path): + model_path = os.path.join('checkpoints', model_name + '.pth') + + if not os.path.isfile(model_path): + # download pre-trained models from url + model_path = url + + restorer = GFPGANer( + model_path=model_path, + upscale=2, + arch=arch, + channel_multiplier=channel_multiplier, + bg_upsampler=bg_upsampler) + + # ------------------------ restore ------------------------ + for idx in tqdm(range(len(images)), 'Face Enhancer:'): + + img = cv2.cvtColor(images[idx], cv2.COLOR_RGB2BGR) + + # restore faces and background if necessary + cropped_faces, restored_faces, r_img = restorer.enhance( + img, + has_aligned=False, + only_center_face=False, + paste_back=True) + + r_img = cv2.cvtColor(r_img, cv2.COLOR_BGR2RGB) + yield r_img diff --git a/AniTalker-kit/AniTalker/code/face_sr/videoio.py b/AniTalker-kit/AniTalker/code/face_sr/videoio.py new file mode 100644 index 00000000..08bfbdd7 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/face_sr/videoio.py @@ -0,0 +1,41 @@ +import shutil +import uuid + +import os + +import cv2 + +def load_video_to_cv2(input_path): + video_stream = cv2.VideoCapture(input_path) + fps = video_stream.get(cv2.CAP_PROP_FPS) + full_frames = [] + while 1: + still_reading, frame = video_stream.read() + if not still_reading: + video_stream.release() + break + 
full_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + return full_frames + +def save_video_with_watermark(video, audio, save_path, watermark=False): + temp_file = str(uuid.uuid4())+'.mp4' + cmd = r'ffmpeg -y -hide_banner -loglevel error -i "%s" -i "%s" -vcodec copy "%s"' % (video, audio, temp_file) + os.system(cmd) + + if watermark is False: + shutil.move(temp_file, save_path) + else: + # watermark + try: + ##### check if stable-diffusion-webui + import webui + from modules import paths + watarmark_path = paths.script_path+"/extensions/SadTalker/docs/sadtalker_logo.png" + except: + # get the root path of sadtalker. + dir_path = os.path.dirname(os.path.realpath(__file__)) + watarmark_path = dir_path+"/../../docs/sadtalker_logo.png" + + cmd = r'ffmpeg -y -hide_banner -loglevel error -i "%s" -i "%s" -filter_complex "[1]scale=100:-1[wm];[0][wm]overlay=(main_w-overlay_w)-10:10" "%s"' % (temp_file, watarmark_path, save_path) + os.system(cmd) + os.remove(temp_file) \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/code/model/__init__.py b/AniTalker-kit/AniTalker/code/model/__init__.py new file mode 100644 index 00000000..6a501aa1 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/model/__init__.py @@ -0,0 +1,6 @@ +from typing import Union +from .unet import BeatGANsUNetModel, BeatGANsUNetConfig +from .unet_autoenc import BeatGANsAutoencConfig, BeatGANsAutoencModel + +Model = Union[BeatGANsUNetModel, BeatGANsAutoencModel] +ModelConfig = Union[BeatGANsUNetConfig, BeatGANsAutoencConfig] diff --git a/AniTalker-kit/AniTalker/code/model/base.py b/AniTalker-kit/AniTalker/code/model/base.py new file mode 100644 index 00000000..efc5d42d --- /dev/null +++ b/AniTalker-kit/AniTalker/code/model/base.py @@ -0,0 +1,37 @@ +# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. +# This program is free software; you can redistribute it and/or modify +# it under the terms of the MIT License. 
+# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# MIT License for more details. + +import numpy as np +import torch + + +class BaseModule(torch.nn.Module): + def __init__(self): + super(BaseModule, self).__init__() + + @property + def nparams(self): + """ + Returns number of trainable parameters of the module. + """ + num_params = 0 + for name, param in self.named_parameters(): + if param.requires_grad: + num_params += np.prod(param.detach().cpu().numpy().shape) + return num_params + + + def relocate_input(self, x: list): + """ + Relocates provided tensors to the same device set for the module. + """ + device = next(self.parameters()).device + for i in range(len(x)): + if isinstance(x[i], torch.Tensor) and x[i].device != device: + x[i] = x[i].to(device) + return x diff --git a/AniTalker-kit/AniTalker/code/model/blocks.py b/AniTalker-kit/AniTalker/code/model/blocks.py new file mode 100644 index 00000000..9b4e11b5 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/model/blocks.py @@ -0,0 +1,567 @@ +import math +from abc import abstractmethod +from dataclasses import dataclass +from numbers import Number + +import torch as th +import torch.nn.functional as F +from choices import * +from config_base import BaseConfig +from torch import nn + +from .nn import (avg_pool_nd, conv_nd, linear, normalization, + timestep_embedding, torch_checkpoint, zero_module) + + +class ScaleAt(Enum): + after_norm = 'afternorm' + + +class TimestepBlock(nn.Module): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + @abstractmethod + def forward(self, x, emb=None, cond=None, lateral=None): + """ + Apply the module to `x` given `emb` timestep embeddings. 
+ """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + def forward(self, x, emb=None, cond=None, lateral=None): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb=emb, cond=cond, lateral=lateral) + else: + x = layer(x) + return x + + +@dataclass +class ResBlockConfig(BaseConfig): + channels: int + emb_channels: int + dropout: float + out_channels: int = None + # condition the resblock with time (and encoder's output) + use_condition: bool = True + # whether to use 3x3 conv for skip path when the channels aren't matched + use_conv: bool = False + # dimension of conv (always 2 = 2d) + dims: int = 2 + # gradient checkpoint + use_checkpoint: bool = False + up: bool = False + down: bool = False + # whether to condition with both time & encoder's output + two_cond: bool = False + # number of encoders' output channels + cond_emb_channels: int = None + # suggest: False + has_lateral: bool = False + lateral_channels: int = None + # whether to init the convolution with zero weights + # this is default from BeatGANs and seems to help learning + use_zero_module: bool = True + + def __post_init__(self): + self.out_channels = self.out_channels or self.channels + self.cond_emb_channels = self.cond_emb_channels or self.emb_channels + + def make_model(self): + return ResBlock(self) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. 
+ + total layers: + in_layers + - norm + - act + - conv + out_layers + - norm + - (modulation) + - act + - conv + """ + def __init__(self, conf: ResBlockConfig): + super().__init__() + self.conf = conf + + ############################# + # IN LAYERS + ############################# + assert conf.lateral_channels is None + layers = [ + normalization(conf.channels), + nn.SiLU(), + conv_nd(conf.dims, conf.channels, conf.out_channels, 3, padding=1) + ] + self.in_layers = nn.Sequential(*layers) + + self.updown = conf.up or conf.down + + if conf.up: + self.h_upd = Upsample(conf.channels, False, conf.dims) + self.x_upd = Upsample(conf.channels, False, conf.dims) + elif conf.down: + self.h_upd = Downsample(conf.channels, False, conf.dims) + self.x_upd = Downsample(conf.channels, False, conf.dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + ############################# + # OUT LAYERS CONDITIONS + ############################# + if conf.use_condition: + # condition layers for the out_layers + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear(conf.emb_channels, 2 * conf.out_channels), + ) + + if conf.two_cond: + self.cond_emb_layers = nn.Sequential( + nn.SiLU(), + linear(conf.cond_emb_channels, conf.out_channels), + ) + ############################# + # OUT LAYERS (ignored when there is no condition) + ############################# + # original version + conv = conv_nd(conf.dims, + conf.out_channels, + conf.out_channels, + 3, + padding=1) + if conf.use_zero_module: + # zere out the weights + # it seems to help training + conv = zero_module(conv) + + # construct the layers + # - norm + # - (modulation) + # - act + # - dropout + # - conv + layers = [] + layers += [ + normalization(conf.out_channels), + nn.SiLU(), + nn.Dropout(p=conf.dropout), + conv, + ] + self.out_layers = nn.Sequential(*layers) + + ############################# + # SKIP LAYERS + ############################# + if conf.out_channels == conf.channels: + # cannot be used with gatedconv, also 
gatedconv is alsways used as the first block + self.skip_connection = nn.Identity() + else: + if conf.use_conv: + kernel_size = 3 + padding = 1 + else: + kernel_size = 1 + padding = 0 + + self.skip_connection = conv_nd(conf.dims, + conf.channels, + conf.out_channels, + kernel_size, + padding=padding) + + def forward(self, x, emb=None, cond=None, lateral=None): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + + Args: + x: input + lateral: lateral connection from the encoder + """ + return torch_checkpoint(self._forward, (x, emb, cond, lateral), + self.conf.use_checkpoint) + + def _forward( + self, + x, + emb=None, + cond=None, + lateral=None, + ): + """ + Args: + lateral: required if "has_lateral" and non-gated, with gated, it can be supplied optionally + """ + if self.conf.has_lateral: + # lateral may be supplied even if it doesn't require + # the model will take the lateral only if "has_lateral" + assert lateral is not None + x = th.cat([x, lateral], dim=1) + + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + + if self.conf.use_condition: + # it's possible that the network may not receieve the time emb + # this happens with autoenc and setting the time_at + if emb is not None: + emb_out = self.emb_layers(emb).type(h.dtype) + else: + emb_out = None + + if self.conf.two_cond: + # it's possible that the network is two_cond + # but it doesn't get the second condition + # in which case, we ignore the second condition + # and treat as if the network has one condition + if cond is None: + cond_out = None + else: + cond_out = self.cond_emb_layers(cond).type(h.dtype) + + if cond_out is not None: + while len(cond_out.shape) < len(h.shape): + cond_out = cond_out[..., None] + else: + cond_out = None + + # this is the new refactored code + h = apply_conditions( + h=h, + emb=emb_out, + cond=cond_out, + 
layers=self.out_layers, + scale_bias=1, + in_channels=self.conf.out_channels, + up_down_layer=None, + ) + + return self.skip_connection(x) + h + + +def apply_conditions( + h, + emb=None, + cond=None, + layers: nn.Sequential = None, + scale_bias: float = 1, + in_channels: int = 512, + up_down_layer: nn.Module = None, +): + """ + apply conditions on the feature maps + + Args: + emb: time conditional (ready to scale + shift) + cond: encoder's conditional (read to scale + shift) + """ + two_cond = emb is not None and cond is not None + + if emb is not None: + # adjusting shapes + while len(emb.shape) < len(h.shape): + emb = emb[..., None] + + if two_cond: + # adjusting shapes + while len(cond.shape) < len(h.shape): + cond = cond[..., None] + # time first + scale_shifts = [emb, cond] + else: + # "cond" is not used with single cond mode + scale_shifts = [emb] + + # support scale, shift or shift only + for i, each in enumerate(scale_shifts): + if each is None: + # special case: the condition is not provided + a = None + b = None + else: + if each.shape[1] == in_channels * 2: + a, b = th.chunk(each, 2, dim=1) + else: + a = each + b = None + scale_shifts[i] = (a, b) + + # condition scale bias could be a list + if isinstance(scale_bias, Number): + biases = [scale_bias] * len(scale_shifts) + else: + # a list + biases = scale_bias + + # default, the scale & shift are applied after the group norm but BEFORE SiLU + pre_layers, post_layers = layers[0], layers[1:] + + # spilt the post layer to be able to scale up or down before conv + # post layers will contain only the conv + mid_layers, post_layers = post_layers[:-2], post_layers[-2:] + + h = pre_layers(h) + # scale and shift for each condition + for i, (scale, shift) in enumerate(scale_shifts): + # if scale is None, it indicates that the condition is not provided + if scale is not None: + h = h * (biases[i] + scale) + if shift is not None: + h = h + shift + h = mid_layers(h) + + # upscale or downscale if any just before the 
last conv + if up_down_layer is not None: + h = up_down_layer(h) + h = post_layers(h) + return h + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + def __init__(self, channels, use_conv, dims=2, out_channels=None): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd(dims, + self.channels, + self.out_channels, + 3, + padding=1) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), + mode="nearest") + else: + x = F.interpolate(x, scale_factor=2, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. 
+ """ + def __init__(self, channels, use_conv, dims=2, out_channels=None): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd(dims, + self.channels, + self.out_channels, + 3, + stride=stride, + padding=1) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. + """ + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return torch_checkpoint(self._forward, (x, ), self.use_checkpoint) + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + 
h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial**2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, + dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", q * scale, + k * scale) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """ + A module which performs QKV attention and splits in a different order. + """ + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. 
+        :return: an [N x (H * C) x T] tensor after attention.
+        """
+        bs, width, length = qkv.shape
+        assert width % (3 * self.n_heads) == 0
+        ch = width // (3 * self.n_heads)
+        # split qkv BEFORE splitting heads (cf. QKVAttentionLegacy, which
+        # splits heads first) — hence the "different order" in the class doc
+        q, k, v = qkv.chunk(3, dim=1)
+        scale = 1 / math.sqrt(math.sqrt(ch))
+        weight = th.einsum(
+            "bct,bcs->bts",
+            (q * scale).view(bs * self.n_heads, ch, length),
+            (k * scale).view(bs * self.n_heads, ch, length),
+        )  # More stable with f16 than dividing afterwards
+        # softmax in fp32 for numerical stability, then cast back
+        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
+        a = th.einsum("bts,bcs->bct", weight,
+                      v.reshape(bs * self.n_heads, ch, length))
+        return a.reshape(bs, -1, length)
+
+    @staticmethod
+    def count_flops(model, _x, y):
+        return count_flops_attn(model, _x, y)
+
+
+class AttentionPool2d(nn.Module):
+    """
+    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
+    """
+    def __init__(
+        self,
+        spacial_dim: int,
+        embed_dim: int,
+        num_heads_channels: int,
+        output_dim: int = None,
+    ):
+        super().__init__()
+        # +1 position for the mean token prepended in forward()
+        self.positional_embedding = nn.Parameter(
+            th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5)
+        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
+        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
+        self.num_heads = embed_dim // num_heads_channels
+        self.attention = QKVAttention(self.num_heads)
+
+    def forward(self, x):
+        b, c, *_spatial = x.shape
+        x = x.reshape(b, c, -1)  # NC(HW)
+        # prepend the spatial mean as a query token; its attention output
+        # (index 0) becomes the pooled representation returned below
+        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
+        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
+        x = self.qkv_proj(x)
+        x = self.attention(x)
+        x = self.c_proj(x)
+        return x[:, :, 0]
diff --git a/AniTalker-kit/AniTalker/code/model/diffusion.py b/AniTalker-kit/AniTalker/code/model/diffusion.py
new file mode 100644
index 00000000..9717d3a3
--- /dev/null
+++ b/AniTalker-kit/AniTalker/code/model/diffusion.py
@@ -0,0 +1,294 @@
+# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
+# This program is free software; you can redistribute it and/or modify +# it under the terms of the MIT License. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# MIT License for more details. + +import math +import torch +from einops import rearrange + +from model.base import BaseModule + + +class Mish(BaseModule): + def forward(self, x): + return x * torch.tanh(torch.nn.functional.softplus(x)) + + +class Upsample(BaseModule): + def __init__(self, dim): + super(Upsample, self).__init__() + self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1) + + def forward(self, x): + return self.conv(x) + + +class Downsample(BaseModule): + def __init__(self, dim): + super(Downsample, self).__init__() + self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1) + + def forward(self, x): + return self.conv(x) + + +class Rezero(BaseModule): + def __init__(self, fn): + super(Rezero, self).__init__() + self.fn = fn + self.g = torch.nn.Parameter(torch.zeros(1)) + + def forward(self, x): + return self.fn(x) * self.g + + +class Block(BaseModule): + def __init__(self, dim, dim_out, groups=8): + super(Block, self).__init__() + self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3, + padding=1), torch.nn.GroupNorm( + groups, dim_out), Mish()) + + def forward(self, x, mask): + output = self.block(x * mask) + return output * mask + + +class ResnetBlock(BaseModule): + def __init__(self, dim, dim_out, time_emb_dim, groups=8): + super(ResnetBlock, self).__init__() + self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim, + dim_out)) + + self.block1 = Block(dim, dim_out, groups=groups) + self.block2 = Block(dim_out, dim_out, groups=groups) + if dim != dim_out: + self.res_conv = torch.nn.Conv2d(dim, dim_out, 1) + else: + self.res_conv = torch.nn.Identity() + + def forward(self, x, mask, time_emb): + h = self.block1(x, mask) + h += 
self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1) + h = self.block2(h, mask) + output = h + self.res_conv(x * mask) + return output + + +class LinearAttention(BaseModule): + def __init__(self, dim, heads=4, dim_head=32): + super(LinearAttention, self).__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', + heads = self.heads, qkv=3) + k = k.softmax(dim=-1) + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', + heads=self.heads, h=h, w=w) + return self.to_out(out) + + +class Residual(BaseModule): + def __init__(self, fn): + super(Residual, self).__init__() + self.fn = fn + + def forward(self, x, *args, **kwargs): + output = self.fn(x, *args, **kwargs) + x + return output + + +class SinusoidalPosEmb(BaseModule): + def __init__(self, dim): + super(SinusoidalPosEmb, self).__init__() + self.dim = dim + + def forward(self, x, scale=1000): + device = x.device + half_dim = self.dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb) + emb = scale * x.unsqueeze(1) * emb.unsqueeze(0) + emb = torch.cat((emb.sin(), emb.cos()), dim=-1) + return emb + + +class GradLogPEstimator2d(BaseModule): + def __init__(self, dim, dim_mults=(1, 2, 4), groups=8, + n_spks=None, spk_emb_dim=64, n_feats=80, pe_scale=1000): + super(GradLogPEstimator2d, self).__init__() + self.dim = dim + self.dim_mults = dim_mults + self.groups = groups + self.n_spks = n_spks if not isinstance(n_spks, type(None)) else 1 + self.spk_emb_dim = spk_emb_dim + self.pe_scale = pe_scale + + if n_spks > 1: + self.spk_mlp = torch.nn.Sequential(torch.nn.Linear(spk_emb_dim, spk_emb_dim 
* 4), Mish(), + torch.nn.Linear(spk_emb_dim * 4, n_feats)) + self.time_pos_emb = SinusoidalPosEmb(dim) + self.mlp = torch.nn.Sequential(torch.nn.Linear(dim, dim * 4), Mish(), + torch.nn.Linear(dim * 4, dim)) + + dims = [2 + (1 if n_spks > 1 else 0), *map(lambda m: dim * m, dim_mults)] + in_out = list(zip(dims[:-1], dims[1:])) + self.downs = torch.nn.ModuleList([]) + self.ups = torch.nn.ModuleList([]) + num_resolutions = len(in_out) + + for ind, (dim_in, dim_out) in enumerate(in_out): + is_last = ind >= (num_resolutions - 1) + self.downs.append(torch.nn.ModuleList([ + ResnetBlock(dim_in, dim_out, time_emb_dim=dim), + ResnetBlock(dim_out, dim_out, time_emb_dim=dim), + Residual(Rezero(LinearAttention(dim_out))), + Downsample(dim_out) if not is_last else torch.nn.Identity()])) + + mid_dim = dims[-1] + self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim) + self.mid_attn = Residual(Rezero(LinearAttention(mid_dim))) + self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim) + + for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])): + self.ups.append(torch.nn.ModuleList([ + ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim), + ResnetBlock(dim_in, dim_in, time_emb_dim=dim), + Residual(Rezero(LinearAttention(dim_in))), + Upsample(dim_in)])) + self.final_block = Block(dim, dim) + self.final_conv = torch.nn.Conv2d(dim, 1, 1) + + def forward(self, x, mask, mu, t, spk=None): + if not isinstance(spk, type(None)): + s = self.spk_mlp(spk) + + t = self.time_pos_emb(t, scale=self.pe_scale) + t = self.mlp(t) + + if self.n_spks < 2: + x = torch.stack([mu, x], 1) + else: + s = s.unsqueeze(-1).repeat(1, 1, x.shape[-1]) + x = torch.stack([mu, x, s], 1) + mask = mask.unsqueeze(1) + + hiddens = [] + masks = [mask] + for resnet1, resnet2, attn, downsample in self.downs: + mask_down = masks[-1] + x = resnet1(x, mask_down, t) + x = resnet2(x, mask_down, t) + x = attn(x) + hiddens.append(x) + x = downsample(x * mask_down) + masks.append(mask_down[:, :, :, ::2]) + + 
masks = masks[:-1] + mask_mid = masks[-1] + x = self.mid_block1(x, mask_mid, t) + x = self.mid_attn(x) + x = self.mid_block2(x, mask_mid, t) + + for resnet1, resnet2, attn, upsample in self.ups: + mask_up = masks.pop() + x = torch.cat((x, hiddens.pop()), dim=1) + x = resnet1(x, mask_up, t) + x = resnet2(x, mask_up, t) + x = attn(x) + x = upsample(x * mask_up) + + x = self.final_block(x, mask) + output = self.final_conv(x * mask) + + return (output * mask).squeeze(1) + + +def get_noise(t, beta_init, beta_term, cumulative=False): + if cumulative: + noise = beta_init*t + 0.5*(beta_term - beta_init)*(t**2) + else: + noise = beta_init + (beta_term - beta_init)*t + return noise + + +class Diffusion(BaseModule): + def __init__(self, n_feats, dim, + n_spks=1, spk_emb_dim=64, + beta_min=0.05, beta_max=20, pe_scale=1000): + super(Diffusion, self).__init__() + self.n_feats = n_feats + self.dim = dim + self.n_spks = n_spks + self.spk_emb_dim = spk_emb_dim + self.beta_min = beta_min + self.beta_max = beta_max + self.pe_scale = pe_scale + + self.estimator = GradLogPEstimator2d(dim, n_spks=n_spks, + spk_emb_dim=spk_emb_dim, + pe_scale=pe_scale) + + def forward_diffusion(self, x0, mask, mu, t): + time = t.unsqueeze(-1).unsqueeze(-1) + cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True) + mean = x0*torch.exp(-0.5*cum_noise) + mu*(1.0 - torch.exp(-0.5*cum_noise)) + variance = 1.0 - torch.exp(-cum_noise) + z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device, + requires_grad=False) + xt = mean + z * torch.sqrt(variance) + return xt * mask, z * mask + + @torch.no_grad() + def reverse_diffusion(self, z, mask, mu, n_timesteps, stoc=False, spk=None): + h = 1.0 / n_timesteps + xt = z * mask + for i in range(n_timesteps): + t = (1.0 - (i + 0.5)*h) * torch.ones(z.shape[0], dtype=z.dtype, + device=z.device) + time = t.unsqueeze(-1).unsqueeze(-1) + noise_t = get_noise(time, self.beta_min, self.beta_max, + cumulative=False) + if stoc: # adds stochastic term + 
dxt_det = 0.5 * (mu - xt) - self.estimator(xt, mask, mu, t, spk) + dxt_det = dxt_det * noise_t * h + dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device, + requires_grad=False) + dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h) + dxt = dxt_det + dxt_stoc + else: + dxt = 0.5 * (mu - xt - self.estimator(xt, mask, mu, t, spk)) + dxt = dxt * noise_t * h + xt = (xt - dxt) * mask + return xt + + @torch.no_grad() + def forward(self, z, mask, mu, n_timesteps, stoc=False, spk=None): + return self.reverse_diffusion(z, mask, mu, n_timesteps, stoc, spk) + + def loss_t(self, x0, mask, mu, t, spk=None): + xt, z = self.forward_diffusion(x0, mask, mu, t) + time = t.unsqueeze(-1).unsqueeze(-1) + cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True) + noise_estimation = self.estimator(xt, mask, mu, t, spk) + noise_estimation *= torch.sqrt(1.0 - torch.exp(-cum_noise)) + loss = torch.sum((noise_estimation + z)**2) / (torch.sum(mask)*self.n_feats) + return loss, xt + + def compute_loss(self, x0, mask, mu, spk=None, offset=1e-5): + t = torch.rand(x0.shape[0], dtype=x0.dtype, device=x0.device, + requires_grad=False) + t = torch.clamp(t, offset, 1.0 - offset) + return self.loss_t(x0, mask, mu, t, spk) diff --git a/AniTalker-kit/AniTalker/code/model/latentnet.py b/AniTalker-kit/AniTalker/code/model/latentnet.py new file mode 100644 index 00000000..6de03456 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/model/latentnet.py @@ -0,0 +1,193 @@ +import math +from dataclasses import dataclass +from enum import Enum +from typing import NamedTuple, Tuple + +import torch +from choices import * +from config_base import BaseConfig +from torch import nn +from torch.nn import init + +from .blocks import * +from .nn import timestep_embedding +from .unet import * + + +class LatentNetType(Enum): + none = 'none' + # injecting inputs into the hidden layers + skip = 'skip' + + +class LatentNetReturn(NamedTuple): + pred: torch.Tensor = None + + +@dataclass +class 
MLPSkipNetConfig(BaseConfig): + """ + default MLP for the latent DPM in the paper! + """ + num_channels: int + skip_layers: Tuple[int] + num_hid_channels: int + num_layers: int + num_time_emb_channels: int = 64 + activation: Activation = Activation.silu + use_norm: bool = True + condition_bias: float = 1 + dropout: float = 0 + last_act: Activation = Activation.none + num_time_layers: int = 2 + time_last_act: bool = False + + def make_model(self): + return MLPSkipNet(self) + + +class MLPSkipNet(nn.Module): + """ + concat x to hidden layers + + default MLP for the latent DPM in the paper! + """ + def __init__(self, conf: MLPSkipNetConfig): + super().__init__() + self.conf = conf + + layers = [] + for i in range(conf.num_time_layers): + if i == 0: + a = conf.num_time_emb_channels + b = conf.num_channels + else: + a = conf.num_channels + b = conf.num_channels + layers.append(nn.Linear(a, b)) + if i < conf.num_time_layers - 1 or conf.time_last_act: + layers.append(conf.activation.get_act()) + self.time_embed = nn.Sequential(*layers) + + self.layers = nn.ModuleList([]) + for i in range(conf.num_layers): + if i == 0: + act = conf.activation + norm = conf.use_norm + cond = True + a, b = conf.num_channels, conf.num_hid_channels + dropout = conf.dropout + elif i == conf.num_layers - 1: + act = Activation.none + norm = False + cond = False + a, b = conf.num_hid_channels, conf.num_channels + dropout = 0 + else: + act = conf.activation + norm = conf.use_norm + cond = True + a, b = conf.num_hid_channels, conf.num_hid_channels + dropout = conf.dropout + + if i in conf.skip_layers: + a += conf.num_channels + + self.layers.append( + MLPLNAct( + a, + b, + norm=norm, + activation=act, + cond_channels=conf.num_channels, + use_cond=cond, + condition_bias=conf.condition_bias, + dropout=dropout, + )) + self.last_act = conf.last_act.get_act() + + def forward(self, x, t, **kwargs): + t = timestep_embedding(t, self.conf.num_time_emb_channels) + cond = self.time_embed(t) + h = x + for i in 
range(len(self.layers)): + if i in self.conf.skip_layers: + # injecting input into the hidden layers + h = torch.cat([h, x], dim=1) + h = self.layers[i].forward(x=h, cond=cond) + h = self.last_act(h) + return LatentNetReturn(h) + + +class MLPLNAct(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + norm: bool, + use_cond: bool, + activation: Activation, + cond_channels: int, + condition_bias: float = 0, + dropout: float = 0, + ): + super().__init__() + self.activation = activation + self.condition_bias = condition_bias + self.use_cond = use_cond + + self.linear = nn.Linear(in_channels, out_channels) + self.act = activation.get_act() + if self.use_cond: + self.linear_emb = nn.Linear(cond_channels, out_channels) + self.cond_layers = nn.Sequential(self.act, self.linear_emb) + if norm: + self.norm = nn.LayerNorm(out_channels) + else: + self.norm = nn.Identity() + + if dropout > 0: + self.dropout = nn.Dropout(p=dropout) + else: + self.dropout = nn.Identity() + + self.init_weights() + + def init_weights(self): + for module in self.modules(): + if isinstance(module, nn.Linear): + if self.activation == Activation.relu: + init.kaiming_normal_(module.weight, + a=0, + nonlinearity='relu') + elif self.activation == Activation.lrelu: + init.kaiming_normal_(module.weight, + a=0.2, + nonlinearity='leaky_relu') + elif self.activation == Activation.silu: + init.kaiming_normal_(module.weight, + a=0, + nonlinearity='relu') + else: + # leave it as default + pass + + def forward(self, x, cond=None): + x = self.linear(x) + if self.use_cond: + # (n, c) or (n, c * 2) + cond = self.cond_layers(cond) + cond = (cond, None) + + # scale shift first + x = x * (self.condition_bias + cond[0]) + if cond[1] is not None: + x = x + cond[1] + # then norm + x = self.norm(x) + else: + # no condition + x = self.norm(x) + x = self.act(x) + x = self.dropout(x) + return x \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/code/model/nn.py 
# b/AniTalker-kit/AniTalker/code/model/nn.py
# new file mode 100644
# index 00000000..a940ac2b
# --- /dev/null
# +++ b/AniTalker-kit/AniTalker/code/model/nn.py
# @@ -0,0 +1,137 @@
"""
Various utilities for neural networks.
"""

from enum import Enum
import math
from typing import Optional

import torch as th
import torch.nn as nn
import torch.utils.checkpoint

import torch.nn.functional as F


# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
class SiLU(nn.Module):
    # @th.jit.script
    def forward(self, x):
        return x * th.sigmoid(x)


class GroupNorm32(nn.GroupNorm):
    # runs the norm in float32 and casts back (mixed-precision safety)
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    if dims == 1:
        return nn.Conv1d(*args, **kwargs)
    elif dims == 2:
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def linear(*args, **kwargs):
    """
    Create a linear module.
    """
    return nn.Linear(*args, **kwargs)


def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    if dims == 1:
        return nn.AvgPool1d(*args, **kwargs)
    elif dims == 2:
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def update_ema(target_params, source_params, rate=0.99):
    """
    Update target parameters to be closer to those of source parameters using
    an exponential moving average.

    :param target_params: the target parameter sequence.
    :param source_params: the source parameter sequence.
    :param rate: the EMA rate (closer to 1 means slower).
    """
    for targ, src in zip(target_params, source_params):
        targ.detach().mul_(rate).add_(src, alpha=1 - rate)


def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().zero_()
    return module


def scale_module(module, scale):
    """
    Scale the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().mul_(scale)
    return module


def mean_flat(tensor):
    """
    Take the mean over all non-batch dimensions.
    """
    return tensor.mean(dim=list(range(1, len(tensor.shape))))


def normalization(channels):
    """
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    return GroupNorm32(min(32, channels), channels)


def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    half = dim // 2
    freqs = th.exp(-math.log(max_period) *
                   th.arange(start=0, end=half, dtype=th.float32) /
                   half).to(device=timesteps.device)
    args = timesteps[:, None].float() * freqs[None]
    embedding = th.cat([th.cos(args), th.sin(args)], dim=-1)
    if dim % 2:
        # odd dim: pad with a zero column
        embedding = th.cat(
            [embedding, th.zeros_like(embedding[:, :1])], dim=-1)
    return embedding


def torch_checkpoint(func, args, flag, preserve_rng_state=False):
    # torch's gradient checkpoint works with automatic mixed precision, given torch >= 1.8
    # (`torch` is bound here by `import torch.utils.checkpoint` above)
    if flag:
        return torch.utils.checkpoint.checkpoint(
            func, *args, preserve_rng_state=preserve_rng_state)
    else:
        return func(*args)

# diff --git a/AniTalker-kit/AniTalker/code/model/seq2seq.py b/AniTalker-kit/AniTalker/code/model/seq2seq.py
# new file mode 100644
# index 00000000..bb5ae51d
# --- /dev/null
# +++ b/AniTalker-kit/AniTalker/code/model/seq2seq.py
# @@ -0,0 +1,141 @@
import torch
from torch import nn
from model.base import BaseModule
# NOTE(review): dangling 'from' token below is the head of the espnet import,
# split off by the garbling; commented out so it cannot shadow anything.
# from
from espnet.nets.pytorch_backend.conformer.encoder import Encoder as ConformerEncoder
import torch.nn.functional as F

class LSTM(nn.Module):
    """Thin LSTM + linear head, used as predictor/encoder in the variance adapter."""
    def __init__(self, motion_dim, output_dim, num_layers=2, hidden_dim=128):
        super().__init__()
        self.lstm = nn.LSTM(input_size=motion_dim, hidden_size=hidden_dim,
                            num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x, _ = self.lstm(x)
        return self.fc(x)

class DiffusionPredictor(BaseModule):
    """Speech-to-motion predictor: encodes speech features, optionally adjusts
    them with pose/location/scale signals, and decodes a motion sequence."""

    def __init__(self, conf):
        super(DiffusionPredictor, self).__init__()

        self.infer_type = conf.infer_type

        self.initialize_layers(conf)
        print(f'infer_type: {self.infer_type}')

    def create_conformer_encoder(self, attention_dim, num_blocks):
        # conformer on pre-projected features (input_layer=None, idim unused)
        return ConformerEncoder(
            idim=0, attention_dim=attention_dim, attention_heads=2, linear_units=attention_dim,
            num_blocks=num_blocks, input_layer=None, dropout_rate=0.2, positional_dropout_rate=0.2,
            attention_dropout_rate=0.2, normalize_before=False, concat_after=False,
            positionwise_layer_type="linear", positionwise_conv_kernel_size=3, macaron_style=True,
            pos_enc_layer_type="rel_pos", selfattention_layer_type="rel_selfattn", use_cnn_module=True,
            cnn_module_kernel=13)

    def initialize_layers(self, conf, mfcc_dim=39, hubert_dim=1024, speech_layers=4, speech_dim=512, decoder_dim=1024, motion_start_dim=512, HAL_layers=25):

        self.conf = conf
        # Speech downsampling
        if self.infer_type.startswith('mfcc'):
            # from 100 hz to 25 hz
            self.down_sample1 = nn.Conv1d(mfcc_dim, 256, kernel_size=3, stride=2, padding=1)
            self.down_sample2 = nn.Conv1d(256, speech_dim, kernel_size=3, stride=2, padding=1)
        elif self.infer_type.startswith('hubert'):
            # from 50 hz to 25 hz
            self.down_sample1 = nn.Conv1d(hubert_dim, speech_dim, kernel_size=3, stride=2, padding=1)
            # learnable softmax weights over the hubert hidden layers
            self.weights = nn.Parameter(torch.zeros(HAL_layers))
            self.speech_encoder = self.create_conformer_encoder(speech_dim, speech_layers)
        else:
            # BUGFIX: fail fast -- the original only printed a message and
            # continued with a half-initialized module.
            raise ValueError(f'infer_type not supported: {self.infer_type}')

        # Encoders & Decoders
        self.coarse_decoder = self.create_conformer_encoder(decoder_dim, conf.decoder_layers)

        # LSTM predictors for Variance Adapter
        if self.infer_type != 'hubert_audio_only':
            self.pose_predictor = LSTM(speech_dim, 3)
            self.pose_encoder = LSTM(3, speech_dim)

        if 'full_control' in self.infer_type:
            self.location_predictor = LSTM(speech_dim, 1)
            self.location_encoder = LSTM(1, speech_dim)
            self.face_scale_predictor = LSTM(speech_dim, 1)
            self.face_scale_encoder = LSTM(1, speech_dim)

        # Linear transformations
        self.init_code_proj = nn.Sequential(nn.Linear(motion_start_dim, 128))
        self.noisy_encoder = nn.Sequential(nn.Linear(conf.motion_dim, 128))
        self.t_encoder = nn.Sequential(nn.Linear(1, 128))
        self.encoder_direction_code = nn.Linear(conf.motion_dim, 128)

        self.out_proj = nn.Linear(decoder_dim, conf.motion_dim)


    def forward(self, initial_code, direction_code, seq_input_vector, face_location, face_scale, yaw_pitch_roll, noisy_x, t_emb, control_flag=False):

        if self.infer_type.startswith('mfcc'):
            x = self.mfcc_speech_downsample(seq_input_vector)
        elif self.infer_type.startswith('hubert'):
            # softmax-weighted sum over hubert layers, then conv downsample
            norm_weights = F.softmax(self.weights, dim=-1)
            weighted_feature = (norm_weights.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) * seq_input_vector).sum(dim=1)
            x = self.down_sample1(weighted_feature.transpose(1,2)).transpose(1,2)
            # NOTE(review): reconstruction places the conformer inside the
            # hubert branch, since speech_encoder only exists for hubert
            # configurations -- confirm against the original file.
            x, _ = self.speech_encoder(x, masks=None)
        predicted_location, predicted_scale, predicted_pose = face_location, face_scale, yaw_pitch_roll
        if self.infer_type != 'hubert_audio_only':
            print(f'pose controllable. control_flag: {control_flag}')
            x, predicted_location, predicted_scale, predicted_pose = self.adjust_features(x, face_location, face_scale, yaw_pitch_roll, control_flag)
        concatenated_features = self.combine_features(x, initial_code, direction_code, noisy_x, t_emb)  # initial_code and direction_code serve as a motion guide extracted from the reference image. This aims to tell the model what the starting motion should be.
        outputs = self.decode_features(concatenated_features)
        return outputs, predicted_location, predicted_scale, predicted_pose

    def mfcc_speech_downsample(self, seq_input_vector):
        # two stride-2 convs: 100 Hz -> 25 Hz
        x = self.down_sample1(seq_input_vector.transpose(1,2))
        return self.down_sample2(x).transpose(1,2)

    def adjust_features(self, x, face_location, face_scale, yaw_pitch_roll, control_flag):
        predicted_location, predicted_scale = 0, 0
        if 'full_control' in self.infer_type:
            print(f'full controllable. control_flag: {control_flag}')
            x_residual, predicted_location = self.adjust_location(x, face_location, control_flag)
            x = x + x_residual

            x_residual, predicted_scale = self.adjust_scale(x, face_scale, control_flag)
            x = x + x_residual

        x_residual, predicted_pose = self.adjust_pose(x, yaw_pitch_roll, control_flag)
        x = x + x_residual
        return x, predicted_location, predicted_scale, predicted_pose

    def adjust_location(self, x, face_location, control_flag):
        # control_flag=True: use the user-supplied value; else predict from speech
        if control_flag:
            predicted_location = face_location
        else:
            predicted_location = self.location_predictor(x)
        return self.location_encoder(predicted_location), predicted_location

    def adjust_scale(self, x, face_scale, control_flag):
        if control_flag:
            predicted_face_scale = face_scale
        else:
            predicted_face_scale = self.face_scale_predictor(x)
        return self.face_scale_encoder(predicted_face_scale), predicted_face_scale

    def adjust_pose(self, x, yaw_pitch_roll, control_flag):
        if control_flag:
            predicted_pose = yaw_pitch_roll
        else:
            predicted_pose = self.pose_predictor(x)
        return self.pose_encoder(predicted_pose), predicted_pose

    def combine_features(self, x, initial_code, direction_code, noisy_x, t_emb):
        init_code_proj = self.init_code_proj(initial_code).unsqueeze(1).repeat(1, x.size(1), 1)
        noisy_feature = self.noisy_encoder(noisy_x)
        t_emb_feature = self.t_encoder(t_emb.unsqueeze(1).float()).unsqueeze(1).repeat(1, x.size(1), 1)
        direction_code_feature = self.encoder_direction_code(direction_code).unsqueeze(1).repeat(1, x.size(1), 1)
        return torch.cat((x, direction_code_feature, init_code_proj, noisy_feature, t_emb_feature), dim=-1)

    def decode_features(self, concatenated_features):
        outputs, _ = self.coarse_decoder(concatenated_features, masks=None)
        return self.out_proj(outputs)
# \ No newline at end of file
# diff --git a/AniTalker-kit/AniTalker/code/model/unet.py b/AniTalker-kit/AniTalker/code/model/unet.py
# new file mode 100644
# index 00000000..9cf90d54
# --- /dev/null
# +++ b/AniTalker-kit/AniTalker/code/model/unet.py
# @@ -0,0 +1,552 @@
import math
from dataclasses import dataclass
from numbers import Number
from typing import NamedTuple, Tuple, Union

import numpy as np
import torch as th
from torch import nn
import torch.nn.functional as F
from choices import *
from config_base import BaseConfig
from .blocks import *

from .nn import (conv_nd, linear, normalization, timestep_embedding,
                 torch_checkpoint, zero_module)


@dataclass
class BeatGANsUNetConfig(BaseConfig):
    image_size: int = 64
    in_channels: int = 3
    # base channels, will be multiplied
    model_channels: int = 64
    # output of the unet
    # suggest: 3
    # you only need 6 if you also model the variance of the noise prediction (usually we use an analytical variance hence 3)
    out_channels: int = 3
    # how many repeating resblocks per resolution
    # the decoding side would have "one more" resblock
    # default: 2
    num_res_blocks: int = 2
    # you can also set the number of resblocks specifically for the input blocks
    # default: None = above
num_input_res_blocks: int = None + # number of time embed channels and style channels + embed_channels: int = 512 + # at what resolutions you want to do self-attention of the feature maps + # attentions generally improve performance + # default: [16] + # beatgans: [32, 16, 8] + attention_resolutions: Tuple[int] = (16, ) + # number of time embed channels + time_embed_channels: int = None + # dropout applies to the resblocks (on feature maps) + dropout: float = 0.1 + channel_mult: Tuple[int] = (1, 2, 4, 8) + input_channel_mult: Tuple[int] = None + conv_resample: bool = True + # always 2 = 2d conv + dims: int = 2 + # don't use this, legacy from BeatGANs + num_classes: int = None + use_checkpoint: bool = False + # number of attention heads + num_heads: int = 1 + # or specify the number of channels per attention head + num_head_channels: int = -1 + # what's this? + num_heads_upsample: int = -1 + # use resblock for upscale/downscale blocks (expensive) + # default: True (BeatGANs) + resblock_updown: bool = True + # never tried + use_new_attention_order: bool = False + resnet_two_cond: bool = False + resnet_cond_channels: int = None + # init the decoding conv layers with zero weights, this speeds up training + # default: True (BeattGANs) + resnet_use_zero_module: bool = True + # gradient checkpoint the attention operation + attn_checkpoint: bool = False + + def make_model(self): + return BeatGANsUNetModel(self) + + +class BeatGANsUNetModel(nn.Module): + def __init__(self, conf: BeatGANsUNetConfig): + super().__init__() + self.conf = conf + + if conf.num_heads_upsample == -1: + self.num_heads_upsample = conf.num_heads + + self.dtype = th.float32 + + self.time_emb_channels = conf.time_embed_channels or conf.model_channels + self.time_embed = nn.Sequential( + linear(self.time_emb_channels, conf.embed_channels), + nn.SiLU(), + linear(conf.embed_channels, conf.embed_channels), + ) + + if conf.num_classes is not None: + self.label_emb = nn.Embedding(conf.num_classes, + 
conf.embed_channels) + + ch = input_ch = int(conf.channel_mult[0] * conf.model_channels) + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(conf.dims, conf.in_channels, ch, 3, padding=1)) + ]) + + kwargs = dict( + use_condition=True, + two_cond=conf.resnet_two_cond, + use_zero_module=conf.resnet_use_zero_module, + # style channels for the resnet block + cond_emb_channels=conf.resnet_cond_channels, + ) + + self._feature_size = ch + + # input_block_chans = [ch] + input_block_chans = [[] for _ in range(len(conf.channel_mult))] + input_block_chans[0].append(ch) + + # number of blocks at each resolution + self.input_num_blocks = [0 for _ in range(len(conf.channel_mult))] + self.input_num_blocks[0] = 1 + self.output_num_blocks = [0 for _ in range(len(conf.channel_mult))] + + ds = 1 + resolution = conf.image_size + for level, mult in enumerate(conf.input_channel_mult + or conf.channel_mult): + for _ in range(conf.num_input_res_blocks or conf.num_res_blocks): + layers = [ + ResBlockConfig( + ch, + conf.embed_channels, + conf.dropout, + out_channels=int(mult * conf.model_channels), + dims=conf.dims, + use_checkpoint=conf.use_checkpoint, + **kwargs, + ).make_model() + ] + ch = int(mult * conf.model_channels) + if resolution in conf.attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=conf.use_checkpoint + or conf.attn_checkpoint, + num_heads=conf.num_heads, + num_head_channels=conf.num_head_channels, + use_new_attention_order=conf. 
+ use_new_attention_order, + )) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + # input_block_chans.append(ch) + input_block_chans[level].append(ch) + self.input_num_blocks[level] += 1 + # print(input_block_chans) + if level != len(conf.channel_mult) - 1: + resolution //= 2 + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlockConfig( + ch, + conf.embed_channels, + conf.dropout, + out_channels=out_ch, + dims=conf.dims, + use_checkpoint=conf.use_checkpoint, + down=True, + **kwargs, + ).make_model() if conf. + resblock_updown else Downsample(ch, + conf.conv_resample, + dims=conf.dims, + out_channels=out_ch))) + ch = out_ch + # input_block_chans.append(ch) + input_block_chans[level + 1].append(ch) + self.input_num_blocks[level + 1] += 1 + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlockConfig( + ch, + conf.embed_channels, + conf.dropout, + dims=conf.dims, + use_checkpoint=conf.use_checkpoint, + **kwargs, + ).make_model(), + AttentionBlock( + ch, + use_checkpoint=conf.use_checkpoint or conf.attn_checkpoint, + num_heads=conf.num_heads, + num_head_channels=conf.num_head_channels, + use_new_attention_order=conf.use_new_attention_order, + ), + ResBlockConfig( + ch, + conf.embed_channels, + conf.dropout, + dims=conf.dims, + use_checkpoint=conf.use_checkpoint, + **kwargs, + ).make_model(), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(conf.channel_mult))[::-1]: + for i in range(conf.num_res_blocks + 1): + # print(input_block_chans) + # ich = input_block_chans.pop() + try: + ich = input_block_chans[level].pop() + except IndexError: + # this happens only when num_res_block > num_enc_res_block + # we will not have enough lateral (skip) connecions for all decoder blocks + ich = 0 + # print('pop:', ich) + layers = [ + ResBlockConfig( + # only direct channels when gated + channels=ch + ich, + 
emb_channels=conf.embed_channels, + dropout=conf.dropout, + out_channels=int(conf.model_channels * mult), + dims=conf.dims, + use_checkpoint=conf.use_checkpoint, + # lateral channels are described here when gated + has_lateral=True if ich > 0 else False, + lateral_channels=None, + **kwargs, + ).make_model() + ] + ch = int(conf.model_channels * mult) + if resolution in conf.attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=conf.use_checkpoint + or conf.attn_checkpoint, + num_heads=self.num_heads_upsample, + num_head_channels=conf.num_head_channels, + use_new_attention_order=conf. + use_new_attention_order, + )) + if level and i == conf.num_res_blocks: + resolution *= 2 + out_ch = ch + layers.append( + ResBlockConfig( + ch, + conf.embed_channels, + conf.dropout, + out_channels=out_ch, + dims=conf.dims, + use_checkpoint=conf.use_checkpoint, + up=True, + **kwargs, + ).make_model() if ( + conf.resblock_updown + ) else Upsample(ch, + conf.conv_resample, + dims=conf.dims, + out_channels=out_ch)) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self.output_num_blocks[level] += 1 + self._feature_size += ch + + # print(input_block_chans) + # print('inputs:', self.input_num_blocks) + # print('outputs:', self.output_num_blocks) + + if conf.resnet_use_zero_module: + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module( + conv_nd(conf.dims, + input_ch, + conf.out_channels, + 3, + padding=1)), + ) + else: + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + conv_nd(conf.dims, input_ch, conf.out_channels, 3, padding=1), + ) + + def forward(self, x, t, y=None, **kwargs): + """ + Apply the model to an input batch. + + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. 
# NOTE(review): this chunk opens inside BeatGANsUNetModel.forward — the
# signature below is reconstructed from the visible body (x, t, y are all
# used); confirm against the full file.
def forward(self, x, t, y=None, **kwargs):
    """
    Apply the UNet to a batch, collecting per-level lateral activations on
    the way down and feeding them back (in reverse) on the way up.

    :param x: an [N x C x ...] Tensor of inputs.
    :param t: a 1-D batch of timesteps.
    :param y: class labels; only valid for class-conditional models.
    :return: a ``Return`` named tuple with the prediction tensor.
    """
    assert (y is not None) == (
        self.conf.num_classes is not None
    ), "must specify y if and only if the model is class-conditional"

    # One lateral stack per resolution level; this supports
    # input_num_blocks != output_num_blocks (unlike a single flat list).
    hs = [[] for _ in range(len(self.conf.channel_mult))]
    emb = self.time_embed(timestep_embedding(t, self.time_emb_channels))

    if self.conf.num_classes is not None:
        raise NotImplementedError()
        # assert y.shape == (x.shape[0], )
        # emb = emb + self.label_emb(y)

    h = x.type(self.dtype)
    k = 0
    for i in range(len(self.input_num_blocks)):
        for j in range(self.input_num_blocks[i]):
            h = self.input_blocks[k](h, emb=emb)
            hs[i].append(h)
            k += 1
    assert k == len(self.input_blocks)

    h = self.middle_block(h, emb=emb)
    k = 0
    for i in range(len(self.output_num_blocks)):
        for j in range(self.output_num_blocks[i]):
            # Take the lateral connection from the matching level (in
            # reverse order); once that stack is exhausted, pass None.
            try:
                lateral = hs[-i - 1].pop()
            except IndexError:
                lateral = None
            h = self.output_blocks[k](h, emb=emb, lateral=lateral)
            k += 1

    h = h.type(x.dtype)
    pred = self.out(h)
    return Return(pred=pred)


class Return(NamedTuple):
    # Prediction of the UNet for one batch.
    pred: th.Tensor


@dataclass
class BeatGANsEncoderConfig(BaseConfig):
    """Configuration for the half-UNet encoder (``BeatGANsEncoderModel``)."""
    image_size: int
    in_channels: int
    model_channels: int
    out_hid_channels: int
    out_channels: int
    num_res_blocks: int
    attention_resolutions: Tuple[int]
    dropout: float = 0
    channel_mult: Tuple[int] = (1, 2, 4, 8)
    use_time_condition: bool = True
    conv_resample: bool = True
    dims: int = 2
    use_checkpoint: bool = False
    num_heads: int = 1
    num_head_channels: int = -1
    resblock_updown: bool = False
    use_new_attention_order: bool = False
    pool: str = 'adaptivenonzero'

    def make_model(self):
        return BeatGANsEncoderModel(self)


class BeatGANsEncoderModel(nn.Module):
    """
    The half UNet model with attention and timestep embedding.

    For usage, see UNet.
    """
    def __init__(self, conf: BeatGANsEncoderConfig):
        super().__init__()
        self.conf = conf
        self.dtype = th.float32

        if conf.use_time_condition:
            time_embed_dim = conf.model_channels * 4
            self.time_embed = nn.Sequential(
                linear(conf.model_channels, time_embed_dim),
                nn.SiLU(),
                linear(time_embed_dim, time_embed_dim),
            )
        else:
            # No time conditioning: ResBlocks are built without an emb input.
            time_embed_dim = None

        ch = int(conf.channel_mult[0] * conf.model_channels)
        self.input_blocks = nn.ModuleList([
            TimestepEmbedSequential(
                conv_nd(conf.dims, conf.in_channels, ch, 3, padding=1))
        ])
        self._feature_size = ch
        input_block_chans = [ch]
        ds = 1
        resolution = conf.image_size
        for level, mult in enumerate(conf.channel_mult):
            for _ in range(conf.num_res_blocks):
                layers = [
                    ResBlockConfig(
                        ch,
                        time_embed_dim,
                        conf.dropout,
                        out_channels=int(mult * conf.model_channels),
                        dims=conf.dims,
                        use_condition=conf.use_time_condition,
                        use_checkpoint=conf.use_checkpoint,
                    ).make_model()
                ]
                ch = int(mult * conf.model_channels)
                if resolution in conf.attention_resolutions:
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=conf.use_checkpoint,
                            num_heads=conf.num_heads,
                            num_head_channels=conf.num_head_channels,
                            use_new_attention_order=conf.
                            use_new_attention_order,
                        ))
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level != len(conf.channel_mult) - 1:
                # Downsample between levels (except after the last one).
                resolution //= 2
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlockConfig(
                            ch,
                            time_embed_dim,
                            conf.dropout,
                            out_channels=out_ch,
                            dims=conf.dims,
                            use_condition=conf.use_time_condition,
                            use_checkpoint=conf.use_checkpoint,
                            down=True,
                        ).make_model() if (
                            conf.resblock_updown
                        ) else Downsample(ch,
                                          conf.conv_resample,
                                          dims=conf.dims,
                                          out_channels=out_ch)))
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2
                self._feature_size += ch

        self.middle_block = TimestepEmbedSequential(
            ResBlockConfig(
                ch,
                time_embed_dim,
                conf.dropout,
                dims=conf.dims,
                use_condition=conf.use_time_condition,
                use_checkpoint=conf.use_checkpoint,
            ).make_model(),
            AttentionBlock(
                ch,
                use_checkpoint=conf.use_checkpoint,
                num_heads=conf.num_heads,
                num_head_channels=conf.num_head_channels,
                use_new_attention_order=conf.use_new_attention_order,
            ),
            ResBlockConfig(
                ch,
                time_embed_dim,
                conf.dropout,
                dims=conf.dims,
                use_condition=conf.use_time_condition,
                use_checkpoint=conf.use_checkpoint,
            ).make_model(),
        )
        self._feature_size += ch
        if conf.pool == "adaptivenonzero":
            self.out = nn.Sequential(
                normalization(ch),
                nn.SiLU(),
                nn.AdaptiveAvgPool2d((1, 1)),
                conv_nd(conf.dims, ch, conf.out_channels, 1),
                nn.Flatten(),
            )
        else:
            raise NotImplementedError(f"Unexpected {conf.pool} pooling")

    def forward(self, x, t=None, return_2d_feature=False):
        """
        Apply the model to an input batch.

        :param x: an [N x C x ...] Tensor of inputs.
        :param t: a 1-D batch of timesteps (only used when
            ``conf.use_time_condition`` is True).
        :param return_2d_feature: also return the pre-pooling feature map.
        :return: an [N x K] Tensor of outputs (plus the 2d feature if asked).
        """
        if self.conf.use_time_condition:
            # BUG FIX: this model never assigns `self.model_channels`; the
            # width lives on the config (the original raised AttributeError).
            emb = self.time_embed(
                timestep_embedding(t, self.conf.model_channels))
        else:
            emb = None

        # NOTE(review): the "spatial*" pooling branches below are dead with
        # the current __init__, which only accepts 'adaptivenonzero'.
        results = []
        h = x.type(self.dtype)
        for module in self.input_blocks:
            h = module(h, emb=emb)
            if self.conf.pool.startswith("spatial"):
                results.append(h.type(x.dtype).mean(dim=(2, 3)))
        h = self.middle_block(h, emb=emb)
        if self.conf.pool.startswith("spatial"):
            results.append(h.type(x.dtype).mean(dim=(2, 3)))
            h = th.cat(results, axis=-1)
        else:
            h = h.type(x.dtype)

        h_2d = h
        h = self.out(h)

        if return_2d_feature:
            return h, h_2d
        else:
            return h

    def forward_flatten(self, x):
        """
        Transform the last 2d feature into a flat vector.
        """
        h = self.out(x)
        return h


class SuperResModel(BeatGANsUNetModel):
    """
    A UNetModel that performs super-resolution.

    Expects an extra kwarg `low_res` to condition on a low-resolution image.
    """
    def __init__(self, image_size, in_channels, *args, **kwargs):
        # Double the input channels: [image, upsampled low-res] are stacked.
        super().__init__(image_size, in_channels * 2, *args, **kwargs)

    def forward(self, x, timesteps, low_res=None, **kwargs):
        _, _, new_height, new_width = x.shape
        # Upsample the conditioning image to the target resolution and stack
        # it with the input along the channel dimension.
        upsampled = F.interpolate(low_res, (new_height, new_width),
                                  mode="bilinear")
        x = th.cat([x, upsampled], dim=1)
        return super().forward(x, timesteps, **kwargs)


# ===========================================================================
# (new file in the original patch)
# AniTalker-kit/AniTalker/code/model/unet_autoenc.py
# ===========================================================================
from enum import Enum

import torch
from torch import Tensor
from torch.nn.functional import silu

from .latentnet import *
from .unet import *
from choices import *


@dataclass
class BeatGANsAutoencConfig(BeatGANsUNetConfig):
    """UNet-autoencoder config: UNet plus a semantic (style) encoder."""
    # number of style channels
    enc_out_channels: int = 512
    enc_attn_resolutions: Tuple[int] = None
    enc_pool: str = 'depthconv'
    enc_num_res_block: int = 2
    enc_channel_mult: Tuple[int] = None
    enc_grad_checkpoint: bool = False
    latent_net_conf: MLPSkipNetConfig = None

    def make_model(self):
        return BeatGANsAutoencModel(self)


class BeatGANsAutoencModel(BeatGANsUNetModel):
    """UNet conditioned on a semantic code produced by an image encoder."""
    def __init__(self, conf: BeatGANsAutoencConfig):
        super().__init__(conf)
        self.conf = conf

        # having only time, cond
        self.time_embed = TimeStyleSeperateEmbed(
            time_channels=conf.model_channels,
            time_out_channels=conf.embed_channels,
        )

        self.encoder = BeatGANsEncoderConfig(
            image_size=conf.image_size,
            in_channels=conf.in_channels,
            model_channels=conf.model_channels,
            out_hid_channels=conf.enc_out_channels,
            out_channels=conf.enc_out_channels,
            num_res_blocks=conf.enc_num_res_block,
            attention_resolutions=(conf.enc_attn_resolutions
                                   or conf.attention_resolutions),
            dropout=conf.dropout,
            channel_mult=conf.enc_channel_mult or conf.channel_mult,
            use_time_condition=False,
            conv_resample=conf.conv_resample,
            dims=conf.dims,
            use_checkpoint=conf.use_checkpoint or conf.enc_grad_checkpoint,
            num_heads=conf.num_heads,
            num_head_channels=conf.num_head_channels,
            resblock_updown=conf.resblock_updown,
            use_new_attention_order=conf.use_new_attention_order,
            pool=conf.enc_pool,
        ).make_model()

        if conf.latent_net_conf is not None:
            self.latent_net = conf.latent_net_conf.make_model()

    def reparameterize(self, mu: Tensor, logvar: Tensor) -> Tensor:
        """
        Reparameterization trick to sample from N(mu, var) using N(0, 1).

        :param mu: (Tensor) Mean of the latent Gaussian [B x D]
        :param logvar: (Tensor) Log-variance of the latent Gaussian [B x D]
            (the original docstring called this a standard deviation)
        :return: (Tensor) [B x D]
        """
        assert self.conf.is_stochastic
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return eps * std + mu

    def sample_z(self, n: int, device):
        """Draw n standard-normal latent codes (stochastic models only)."""
        assert self.conf.is_stochastic
        return torch.randn(n, self.conf.enc_out_channels, device=device)

    def noise_to_cond(self, noise: Tensor):
        # Intentionally disabled; the code below it is kept for reference.
        raise NotImplementedError()
        assert self.conf.noise_net_conf is not None
        return self.noise_net.forward(noise)

    def encode(self, x):
        """Encode an image batch into the semantic condition vector."""
        cond = self.encoder.forward(x)
        return {'cond': cond}

    @property
    def stylespace_sizes(self):
        # Width of every per-ResBlock style projection, in model order.
        modules = list(self.input_blocks.modules()) + list(
            self.middle_block.modules()) + list(self.output_blocks.modules())
        sizes = []
        for module in modules:
            if isinstance(module, ResBlock):
                linear = module.cond_emb_layers[-1]
                sizes.append(linear.weight.shape[0])
        return sizes

    def encode_stylespace(self, x, return_vector: bool = True):
        """
        Encode to style space: project the semantic code through every
        ResBlock's conditioning layers.
        """
        modules = list(self.input_blocks.modules()) + list(
            self.middle_block.modules()) + list(self.output_blocks.modules())
        # (n, c)
        cond = self.encoder.forward(x)
        S = []
        for module in modules:
            if isinstance(module, ResBlock):
                # (n, c')
                s = module.cond_emb_layers.forward(cond)
                S.append(s)

        if return_vector:
            # (n, sum_c)
            return torch.cat(S, dim=1)
        else:
            return S

    def forward(self,
                x,
                t,
                y=None,
                x_start=None,
                cond=None,
                style=None,
                noise=None,
                t_cond=None,
                **kwargs):
        """
        Apply the model to an input batch.

        Args:
            x_start: the original image to encode
            cond: output of the encoder
            noise: random noise (to predict the cond)
        """
        if t_cond is None:
            t_cond = t

        if noise is not None:
            # if the noise is given, we predict the cond from noise
            cond = self.noise_to_cond(noise)

        if cond is None:
            if x is not None:
                assert len(x) == len(x_start), f'{len(x)} != {len(x_start)}'

            tmp = self.encode(x_start)
            cond = tmp['cond']

        if t is not None:
            _t_emb = timestep_embedding(t, self.conf.model_channels)
            _t_cond_emb = timestep_embedding(t_cond, self.conf.model_channels)
        else:
            # this happens when training only the autoenc
            _t_emb = None
            _t_cond_emb = None

        if self.conf.resnet_two_cond:
            res = self.time_embed.forward(
                time_emb=_t_emb,
                cond=cond,
                time_cond_emb=_t_cond_emb,
            )
        else:
            raise NotImplementedError()

        if self.conf.resnet_two_cond:
            # two cond: first = time emb, second = cond_emb
            emb = res.time_emb
            cond_emb = res.emb
        else:
            # one cond = combined of both time and cond
            emb = res.emb
            cond_emb = None

        # BUG FIX: the original used `style = style or res.style`, which
        # evaluates tensor truthiness and raises "Boolean value of Tensor is
        # ambiguous" for any multi-element style tensor. Compare with None.
        style = res.style if style is None else style

        assert (y is not None) == (
            self.conf.num_classes is not None
        ), "must specify y if and only if the model is class-conditional"

        if self.conf.num_classes is not None:
            raise NotImplementedError()
            # assert y.shape == (x.shape[0], )
            # emb = emb + self.label_emb(y)

        # where in the model to supply time conditions
        enc_time_emb = emb
        mid_time_emb = emb
        dec_time_emb = emb
        # where in the model to supply style conditions
        enc_cond_emb = cond_emb
        mid_cond_emb = cond_emb
        dec_cond_emb = cond_emb

        # one lateral stack per resolution level
        hs = [[] for _ in range(len(self.conf.channel_mult))]

        if x is not None:
            h = x.type(self.dtype)

            # input blocks
            k = 0
            for i in range(len(self.input_num_blocks)):
                for j in range(self.input_num_blocks[i]):
                    h = self.input_blocks[k](h,
                                             emb=enc_time_emb,
                                             cond=enc_cond_emb)
                    hs[i].append(h)
                    k += 1
            assert k == len(self.input_blocks)

            # middle blocks
            h = self.middle_block(h, emb=mid_time_emb, cond=mid_cond_emb)
        else:
            # no lateral connections; happens when training only the
            # autoencoder
            h = None
            hs = [[] for _ in range(len(self.conf.channel_mult))]

        # output blocks
        k = 0
        for i in range(len(self.output_num_blocks)):
            for j in range(self.output_num_blocks[i]):
                # take the lateral connection from the same level (in
                # reverse) until there is no more, then use None
                try:
                    lateral = hs[-i - 1].pop()
                except IndexError:
                    lateral = None

                h = self.output_blocks[k](h,
                                          emb=dec_time_emb,
                                          cond=dec_cond_emb,
                                          lateral=lateral)
                k += 1

        pred = self.out(h)
        return AutoencReturn(pred=pred, cond=cond)


class AutoencReturn(NamedTuple):
    # Prediction plus the semantic condition used to produce it.
    pred: Tensor
    cond: Tensor = None


class EmbedReturn(NamedTuple):
    # style and time
    emb: Tensor = None
    # time only
    time_emb: Tensor = None
    # style only (but could depend on time)
    style: Tensor = None


class TimeStyleSeperateEmbed(nn.Module):
    """Embed time with an MLP; pass the style condition through unchanged."""
    def __init__(self, time_channels, time_out_channels):
        super().__init__()
        self.time_embed = nn.Sequential(
            linear(time_channels, time_out_channels),
            nn.SiLU(),
            linear(time_out_channels, time_out_channels),
        )
        self.style = nn.Identity()

    def forward(self, time_emb=None, cond=None, **kwargs):
        # time_emb is None when training only the autoencoder; the original
        # had a no-op `time_emb = None` branch here.
        if time_emb is not None:
            time_emb = self.time_embed(time_emb)
        style = self.style(cond)
        return EmbedReturn(emb=style, time_emb=time_emb, style=style)
b/AniTalker-kit/AniTalker/code/networks/discriminator.py @@ -0,0 +1,259 @@ +import math +import torch +from torch.nn import functional as F +from torch import nn + + +def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5): + return F.leaky_relu(input + bias, negative_slope) * scale + + +class FusedLeakyReLU(nn.Module): + def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5): + super().__init__() + self.bias = nn.Parameter(torch.zeros(1, channel, 1, 1)) + self.negative_slope = negative_slope + self.scale = scale + + def forward(self, input): + # print("FusedLeakyReLU: ", input.abs().mean()) + out = fused_leaky_relu(input, self.bias, self.negative_slope, self.scale) + # print("FusedLeakyReLU: ", out.abs().mean()) + return out + + +def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1): + _, minor, in_h, in_w = input.shape + kernel_h, kernel_w = kernel.shape + + out = input.view(-1, minor, in_h, 1, in_w, 1) + out = F.pad(out, [0, up_x - 1, 0, 0, 0, up_y - 1, 0, 0]) + out = out.view(-1, minor, in_h * up_y, in_w * up_x) + + out = F.pad(out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]) + out = out[:, :, max(-pad_y0, 0): out.shape[2] - max(-pad_y1, 0), max(-pad_x0, 0): out.shape[3] - max(-pad_x1, 0), ] + + # out = out.permute(0, 3, 1, 2) + out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) + out = F.conv2d(out, w) + out = out.reshape(-1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, ) + # out = out.permute(0, 2, 3, 1) + + return out[:, :, ::down_y, ::down_x] + + +def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): + return upfirdn2d_native(input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1]) + + +def make_kernel(k): + k = torch.tensor(k, dtype=torch.float32) + + if k.ndim == 1: + k = k[None, :] * k[:, None] + + k 
/= k.sum() + + return k + + +class Blur(nn.Module): + def __init__(self, kernel, pad, upsample_factor=1): + super().__init__() + + kernel = make_kernel(kernel) + + if upsample_factor > 1: + kernel = kernel * (upsample_factor ** 2) + + self.register_buffer('kernel', kernel) + + self.pad = pad + + def forward(self, input): + return upfirdn2d(input, self.kernel, pad=self.pad) + + +class ScaledLeakyReLU(nn.Module): + def __init__(self, negative_slope=0.2): + super().__init__() + + self.negative_slope = negative_slope + + def forward(self, input): + return F.leaky_relu(input, negative_slope=self.negative_slope) + + +class EqualConv2d(nn.Module): + def __init__(self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True): + super().__init__() + + self.weight = nn.Parameter(torch.randn(out_channel, in_channel, kernel_size, kernel_size)) + self.scale = 1 / math.sqrt(in_channel * kernel_size ** 2) + + self.stride = stride + self.padding = padding + + if bias: + self.bias = nn.Parameter(torch.zeros(out_channel)) + else: + self.bias = None + + def forward(self, input): + + return F.conv2d(input, self.weight * self.scale, bias=self.bias, stride=self.stride, + padding=self.padding, ) + + def __repr__(self): + return ( + f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},' + f' {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})' + ) + + +class EqualLinear(nn.Module): + def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, activation=None): + super().__init__() + + self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) + + if bias: + self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) + else: + self.bias = None + + self.activation = activation + + self.scale = (1 / math.sqrt(in_dim)) * lr_mul + self.lr_mul = lr_mul + + def forward(self, input): + + if self.activation: + out = F.linear(input, self.weight * self.scale) + out = fused_leaky_relu(out, self.bias * self.lr_mul) + else: + 
out = F.linear(input, self.weight * self.scale, bias=self.bias * self.lr_mul) + + return out + + def __repr__(self): + return (f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})') + + +class ConvLayer(nn.Sequential): + def __init__( + self, + in_channel, + out_channel, + kernel_size, + downsample=False, + blur_kernel=[1, 3, 3, 1], + bias=True, + activate=True, + ): + layers = [] + + if downsample: + factor = 2 + p = (len(blur_kernel) - factor) + (kernel_size - 1) + pad0 = (p + 1) // 2 + pad1 = p // 2 + + layers.append(Blur(blur_kernel, pad=(pad0, pad1))) + + stride = 2 + self.padding = 0 + + else: + stride = 1 + self.padding = kernel_size // 2 + + layers.append(EqualConv2d(in_channel, out_channel, kernel_size, padding=self.padding, stride=stride, + bias=bias and not activate)) + + if activate: + if bias: + layers.append(FusedLeakyReLU(out_channel)) + else: + layers.append(ScaledLeakyReLU(0.2)) + + super().__init__(*layers) + + +class ResBlock(nn.Module): + def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1]): + super().__init__() + + self.conv1 = ConvLayer(in_channel, in_channel, 3) + self.conv2 = ConvLayer(in_channel, out_channel, 3, downsample=True) + + self.skip = ConvLayer(in_channel, out_channel, 1, downsample=True, activate=False, bias=False) + + def forward(self, input): + out = self.conv1(input) + out = self.conv2(out) + + skip = self.skip(input) + out = (out + skip) / math.sqrt(2) + + return out + + +class Discriminator(nn.Module): + def __init__(self, size, channel_multiplier=1, blur_kernel=[1, 3, 3, 1]): + super().__init__() + + self.size = size + + channels = { + 4: 512, + 8: 512, + 16: 512, + 32: 512, + 64: 256 * channel_multiplier, + 128: 128 * channel_multiplier, + 256: 64 * channel_multiplier, + 512: 32 * channel_multiplier, + 1024: 16 * channel_multiplier, + } + + convs = [ConvLayer(3, channels[size], 1)] + log_size = int(math.log(size, 2)) + in_channel = channels[size] + + for i in range(log_size, 2, -1): 
+ out_channel = channels[2 ** (i - 1)] + convs.append(ResBlock(in_channel, out_channel, blur_kernel)) + in_channel = out_channel + + self.convs = nn.Sequential(*convs) + + self.stddev_group = 4 + self.stddev_feat = 1 + + self.final_conv = ConvLayer(in_channel + 1, channels[4], 3) + self.final_linear = nn.Sequential( + EqualLinear(channels[4] * 4 * 4, channels[4], activation='fused_lrelu'), + EqualLinear(channels[4], 1), + ) + + def forward(self, input): + out = self.convs(input) + batch, channel, height, width = out.shape + + group = min(batch, self.stddev_group) + stddev = out.view(group, -1, self.stddev_feat, channel // self.stddev_feat, height, width) + stddev = torch.sqrt(stddev.var(0, unbiased=False) + 1e-8) + stddev = stddev.mean([2, 3, 4], keepdims=True).squeeze(2) + stddev = stddev.repeat(group, 1, height, width) + out = torch.cat([out, stddev], 1) + + out = self.final_conv(out) + + out = out.view(batch, -1) + out = self.final_linear(out) + + return out diff --git a/AniTalker-kit/AniTalker/code/networks/encoder.py b/AniTalker-kit/AniTalker/code/networks/encoder.py new file mode 100644 index 00000000..ede8bae2 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/networks/encoder.py @@ -0,0 +1,374 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + +def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5): + return F.leaky_relu(input + bias, negative_slope) * scale + +class FusedLeakyReLU(nn.Module): + def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5): + super().__init__() + self.bias = nn.Parameter(torch.zeros(1, channel, 1, 1)) + self.negative_slope = negative_slope + self.scale = scale + + def forward(self, input): + out = fused_leaky_relu(input, self.bias, self.negative_slope, self.scale) + return out + + +def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1): + _, minor, in_h, in_w = input.shape + kernel_h, kernel_w = kernel.shape + + out = 
input.view(-1, minor, in_h, 1, in_w, 1) + out = F.pad(out, [0, up_x - 1, 0, 0, 0, up_y - 1, 0, 0]) + out = out.view(-1, minor, in_h * up_y, in_w * up_x) + + out = F.pad(out, [max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]) + out = out[:, :, max(-pad_y0, 0): out.shape[2] - max(-pad_y1, 0), + max(-pad_x0, 0): out.shape[3] - max(-pad_x1, 0), ] + + out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) + out = F.conv2d(out, w) + out = out.reshape(-1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, ) + + return out[:, :, ::down_y, ::down_x] + + +def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): + return upfirdn2d_native(input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1]) + + +def make_kernel(k): + k = torch.tensor(k, dtype=torch.float32) + + if k.ndim == 1: + k = k[None, :] * k[:, None] + + k /= k.sum() + + return k + + +class Blur(nn.Module): + def __init__(self, kernel, pad, upsample_factor=1): + super().__init__() + + kernel = make_kernel(kernel) + + if upsample_factor > 1: + kernel = kernel * (upsample_factor ** 2) + + self.register_buffer('kernel', kernel) + + self.pad = pad + + def forward(self, input): + return upfirdn2d(input, self.kernel, pad=self.pad) + + +class ScaledLeakyReLU(nn.Module): + def __init__(self, negative_slope=0.2): + super().__init__() + + self.negative_slope = negative_slope + + def forward(self, input): + return F.leaky_relu(input, negative_slope=self.negative_slope) + + +class EqualConv2d(nn.Module): + def __init__(self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True): + super().__init__() + + self.weight = nn.Parameter(torch.randn(out_channel, in_channel, kernel_size, kernel_size)) + self.scale = 1 / math.sqrt(in_channel * kernel_size ** 2) + + self.stride = stride + self.padding = padding + + if bias: + self.bias = 
nn.Parameter(torch.zeros(out_channel)) + else: + self.bias = None + + def forward(self, input): + + return F.conv2d(input, self.weight * self.scale, bias=self.bias, stride=self.stride, padding=self.padding) + + def __repr__(self): + return ( + f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},' + f' {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})' + ) + + +class EqualLinear(nn.Module): + def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, activation=None): + super().__init__() + + self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) + + if bias: + self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) + else: + self.bias = None + + self.activation = activation + + self.scale = (1 / math.sqrt(in_dim)) * lr_mul + self.lr_mul = lr_mul + + def forward(self, input): + + if self.activation: + out = F.linear(input, self.weight * self.scale) + out = fused_leaky_relu(out, self.bias * self.lr_mul) + else: + out = F.linear(input, self.weight * self.scale, bias=self.bias * self.lr_mul) + + return out + + def __repr__(self): + return (f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})') + + +class ConvLayer(nn.Sequential): + def __init__( + self, + in_channel, + out_channel, + kernel_size, + downsample=False, + blur_kernel=[1, 3, 3, 1], + bias=True, + activate=True, + ): + layers = [] + + if downsample: + factor = 2 + p = (len(blur_kernel) - factor) + (kernel_size - 1) + pad0 = (p + 1) // 2 + pad1 = p // 2 + + layers.append(Blur(blur_kernel, pad=(pad0, pad1))) + + stride = 2 + self.padding = 0 + + else: + stride = 1 + self.padding = kernel_size // 2 + + layers.append(EqualConv2d(in_channel, out_channel, kernel_size, padding=self.padding, stride=stride, + bias=bias and not activate)) + + if activate: + if bias: + layers.append(FusedLeakyReLU(out_channel)) + else: + layers.append(ScaledLeakyReLU(0.2)) + + super().__init__(*layers) + + +class ResBlock(nn.Module): + 
class WeightedSumLayer(nn.Module):
    """Learnable convex combination of a list of same-shaped tensors.

    Raw weights are passed through a softmax so the mixing coefficients are
    positive and sum to one.
    """

    def __init__(self, num_tensors=8):
        super(WeightedSumLayer, self).__init__()

        # One raw (pre-softmax) weight per input tensor.
        self.weights = nn.Parameter(torch.randn(num_tensors))

    def forward(self, tensor_list):
        coeffs = torch.softmax(self.weights, dim=0)

        mixed = torch.zeros_like(tensor_list[0])
        for tensor, coeff in zip(tensor_list, coeffs):
            mixed = mixed + tensor * coeff

        return mixed
class DecouplingModel(nn.Module):
    """Disentangles an appearance feature into identity and motion embeddings.

    Three two-layer MLPs share the same architecture:
      * ``identity_net``          -- the "identity encoder" of the paper;
      * ``identity_net_density``  -- refines the identity embedding;
      * ``identity_excluded_net`` -- the "motion encoder" of the paper.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(DecouplingModel, self).__init__()

        def make_mlp():
            # Shared two-layer MLP used by all three branches.
            return nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, output_dim),
            )

        # identity_net is called the identity encoder in the paper
        self.identity_net = make_mlp()
        self.identity_net_density = make_mlp()
        # identity_excluded_net is called the motion encoder in the paper
        self.identity_excluded_net = make_mlp()

    def forward(self, x):
        id_emb = self.identity_net(x)
        motion_emb = self.identity_excluded_net(x)
        # The density branch consumes the identity embedding, not the raw input,
        # so it only works when input_dim == output_dim.
        id_density = self.identity_net_density(id_emb)
        return id_emb, motion_emb, id_density
x): + + h, _ = self.net_app(x) + h_motion = self.fc(h) + + return h_motion + + def encode_image_obj(self, image_obj): + feat, _ = self.net_app(image_obj) + id_emb, idrm_emb, id_density_emb = self.net_decouping(feat) + return id_emb, idrm_emb, id_density_emb + + def forward(self, input_source, input_target, input_face, input_aug): + + + if input_target is not None: + + h_source, feats = self.net_app(input_source) + h_target, _ = self.net_app(input_target) + h_face, _ = self.net_app(input_face) + h_aug, _ = self.net_app(input_aug) + + h_source_id_emb, h_source_idrm_emb, h_source_id_density_emb = self.net_decouping(h_source) + h_target_id_emb, h_target_idrm_emb, h_target_id_density_emb = self.net_decouping(h_target) + h_face_id_emb, h_face_idrm_emb, h_face_id_density_emb = self.net_decouping(h_face) + h_aug_id_emb, h_aug_idrm_emb, h_aug_id_density_emb = self.net_decouping(h_aug) + + h_target_motion_target = self.fc(h_target_idrm_emb) + h_another_face_target = self.fc(h_face_idrm_emb) + + else: + h_source, feats = self.net_app(input_source) + + + return {'h_source':h_source, 'h_motion':h_target_motion_target, 'feats':feats, 'h_another_face_target':h_another_face_target, 'h_face':h_face, \ + 'h_source_id_emb':h_source_id_emb, 'h_source_idrm_emb':h_source_idrm_emb, 'h_source_id_density_emb':h_source_id_density_emb, \ + 'h_target_id_emb':h_target_id_emb, 'h_target_idrm_emb':h_target_idrm_emb, 'h_target_id_density_emb':h_target_id_density_emb, \ + 'h_face_id_emb':h_face_id_emb, 'h_face_idrm_emb':h_face_idrm_emb, 'h_face_id_density_emb':h_face_id_density_emb, \ + 'h_aug_id_emb':h_aug_id_emb, 'h_aug_idrm_emb':h_aug_idrm_emb ,'h_aug_id_density_emb':h_aug_id_density_emb, \ + } diff --git a/AniTalker-kit/AniTalker/code/networks/generator.py b/AniTalker-kit/AniTalker/code/networks/generator.py new file mode 100644 index 00000000..0b46d4a4 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/networks/generator.py @@ -0,0 +1,27 @@ +from torch import nn +from .encoder import Encoder 
class Generator(nn.Module):
    """Face animation model: appearance/motion encoder plus a StyleGAN-like
    flow-warping decoder."""

    def __init__(self, size, style_dim=512, motion_dim=20, channel_multiplier=1, blur_kernel=[1, 3, 3, 1]):
        super(Generator, self).__init__()

        # Encoder maps images to latent codes; decoder re-synthesizes frames.
        self.enc = Encoder(size, style_dim, motion_dim)
        self.dec = Synthesis(size, style_dim, motion_dim, blur_kernel, channel_multiplier)

    def get_direction(self):
        # With a None input the direction module returns its orthogonal basis.
        return self.dec.direction(None)

    def synthesis(self, wa, alpha, feat):
        return self.dec(wa, alpha, feat)

    def forward(self, img_source, img_drive, h_start=None):
        wa, alpha, feats = self.enc(img_source, img_drive, h_start)
        return self.dec(wa, alpha, feats)
class MotionPixelNorm(nn.Module):
    """Pixel-norm variant that normalizes along dim=2 (the feature axis of a
    (batch, time, dim) motion tensor)."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        # RMS over dim 2, with a small epsilon for numerical stability.
        mean_sq = torch.mean(input * input, dim=2, keepdim=True)
        return input * torch.rsqrt(mean_sq + 1e-8)
pad=self.pad) + + +class Blur(nn.Module): + def __init__(self, kernel, pad, upsample_factor=1): + super().__init__() + + kernel = make_kernel(kernel) + + if upsample_factor > 1: + kernel = kernel * (upsample_factor ** 2) + + self.register_buffer('kernel', kernel) + + self.pad = pad + + def forward(self, input): + return upfirdn2d(input, self.kernel, pad=self.pad) + + +class EqualConv2d(nn.Module): + def __init__(self, in_channel, out_channel, kernel_size, stride=1, padding=0, bias=True): + super().__init__() + + self.weight = nn.Parameter(torch.randn(out_channel, in_channel, kernel_size, kernel_size)) + self.scale = 1 / math.sqrt(in_channel * kernel_size ** 2) + + self.stride = stride + self.padding = padding + + if bias: + self.bias = nn.Parameter(torch.zeros(out_channel)) + else: + self.bias = None + + def forward(self, input): + + return F.conv2d(input, self.weight * self.scale, bias=self.bias, stride=self.stride, padding=self.padding, ) + + def __repr__(self): + return ( + f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]},' + f' {self.weight.shape[2]}, stride={self.stride}, padding={self.padding})' + ) + + +class EqualLinear(nn.Module): + def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1, activation=None): + super().__init__() + + self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul)) + + if bias: + self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init)) + else: + self.bias = None + + self.activation = activation + + self.scale = (1 / math.sqrt(in_dim)) * lr_mul + self.lr_mul = lr_mul + + def forward(self, input): + + if self.activation: + out = F.linear(input, self.weight * self.scale) + out = fused_leaky_relu(out, self.bias * self.lr_mul) + else: + out = F.linear(input, self.weight * self.scale, bias=self.bias * self.lr_mul) + + return out + + def __repr__(self): + return (f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})') + + +class ScaledLeakyReLU(nn.Module): + def 
    def __init__(self, in_channel, out_channel, kernel_size, style_dim, demodulate=True, upsample=False,
                 downsample=False, blur_kernel=[1, 3, 3, 1], ):
        """Style-modulated convolution (StyleGAN2).

        Args:
            in_channel: number of input feature channels.
            out_channel: number of output feature channels.
            kernel_size: spatial size of the square conv kernel.
            style_dim: dimension of the style vector fed to the modulation FC.
            demodulate: apply per-sample weight demodulation in forward().
            upsample: use a stride-2 transposed conv followed by a blur.
            downsample: blur the input, then apply a stride-2 conv.
            blur_kernel: 1-D FIR taps for the anti-aliasing blur.
        """
        super().__init__()

        self.eps = 1e-8
        self.kernel_size = kernel_size
        self.in_channel = in_channel
        self.out_channel = out_channel
        self.upsample = upsample
        self.downsample = downsample

        if upsample:
            factor = 2
            # Blur padding for the upsample path; presumably sized so the
            # blurred output matches the x2-upsampled resolution — TODO confirm.
            p = (len(blur_kernel) - factor) - (kernel_size - 1)
            pad0 = (p + 1) // 2 + factor - 1
            pad1 = p // 2 + 1

            self.blur = Blur(blur_kernel, pad=(pad0, pad1), upsample_factor=factor)

        if downsample:
            factor = 2
            p = (len(blur_kernel) - factor) + (kernel_size - 1)
            pad0 = (p + 1) // 2
            pad1 = p // 2

            self.blur = Blur(blur_kernel, pad=(pad0, pad1))

        # Equalized learning rate: runtime weight scale of 1/sqrt(fan_in).
        fan_in = in_channel * kernel_size ** 2
        self.scale = 1 / math.sqrt(fan_in)
        self.padding = kernel_size // 2

        # Leading singleton dim lets the weight broadcast over the batch
        # when modulated per-sample in forward().
        self.weight = nn.Parameter(torch.randn(1, out_channel, in_channel, kernel_size, kernel_size))

        # bias_init=1 so modulation starts as an identity scaling of channels.
        self.modulation = EqualLinear(style_dim, in_channel, bias_init=1)
        self.demodulate = demodulate
class NoiseInjection(nn.Module):
    """Adds noise with a learnable scalar gain; a no-op when noise is None."""

    def __init__(self):
        super().__init__()

        # Scalar gain, initialized to zero so the noise path starts disabled.
        self.weight = nn.Parameter(torch.zeros(1))

    def forward(self, image, noise=None):
        if noise is None:
            # Unlike stock StyleGAN2, no random noise is sampled here.
            return image
        return image + self.weight * noise
class ToRGB(nn.Module):
    """Projects a feature map to a 3-channel RGB image, optionally merging an
    upsampled skip image from the previous (lower) resolution."""

    def __init__(self, in_channel, style_dim, upsample=True, blur_kernel=[1, 3, 3, 1]):
        super().__init__()

        # The upsampler only exists when a lower-resolution skip is expected.
        if upsample:
            self.upsample = Upsample(blur_kernel)

        self.conv = ConvLayer(in_channel, 3, 1)
        self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1))

    def forward(self, input, skip=None):
        rgb = self.conv(input) + self.bias

        if skip is None:
            return rgb

        # Bring the previous-resolution RGB up to this scale before adding.
        return rgb + self.upsample(skip)
class Direction(nn.Module):
    """Linear Motion Decomposition (LMD, from LIA): a learnable bank of
    motion directions.

    A 512 x motion_dim weight matrix is orthogonalized with a reduced QR
    decomposition so its columns form an orthonormal motion basis.
    """

    def __init__(self, motion_dim):
        super(Direction, self).__init__()

        self.weight = nn.Parameter(torch.randn(512, motion_dim))

    def forward(self, input):
        """Return the orthonormal basis Q (512 x motion_dim) when ``input`` is
        None; otherwise combine the basis columns with the per-sample
        magnitudes ``input`` of shape (bs*t) x motion_dim, yielding motion
        codes of shape (bs*t) x 512 (equivalent to ``input @ Q.T``).
        """
        # Small offset guards against an exactly-zero weight matrix.
        weight = self.weight + 1e-8
        # torch.qr is deprecated; torch.linalg.qr in reduced mode is the
        # documented drop-in replacement.
        Q, R = torch.linalg.qr(weight, mode='reduced')  # orthogonal basis [n1, n2, ...]

        if input is None:
            return Q

        input_diag = torch.diag_embed(input)  # alpha magnitudes as a diagonal matrix
        out = torch.matmul(input_diag, Q.T)
        out = torch.sum(out, dim=1)

        return out
    def forward(self, source_before_decoupling, target_motion, feats):
        """Synthesize an image from an appearance latent, a motion code, and
        encoder skip features.

        Args:
            source_before_decoupling: appearance latent ``wa``, shape (bs, style_dim).
            target_motion: motion magnitudes fed to the LMD direction bank.
            feats: per-resolution encoder feature maps used for flow warping
                (coarse to fine, aligned with ``self.to_flows``).
        """
        # Motion code = learned orthogonal directions scaled by target_motion.
        directions = self.direction(target_motion)
        latent = source_before_decoupling + directions # wa + directions

        # Broadcast the single latent to every style-injection layer.
        inject_index = self.n_latent
        latent = latent.unsqueeze(1).repeat(1, inject_index, 1)

        out = self.input(latent)
        out = self.conv1(out, latent[:, 0])

        i = 1
        # convs holds (upsample, refine) pairs per resolution; walk them in
        # lockstep with the RGB/flow heads and the encoder features.
        for conv1, conv2, to_rgb, to_flow, feat in zip(self.convs[::2], self.convs[1::2], self.to_rgbs,
                                                       self.to_flows, feats):
            out = conv1(out, latent[:, i])
            out = conv2(out, latent[:, i + 1])
            if out.size(2) == 8:
                # First (8x8) level: no lower-resolution flow/RGB to merge yet.
                out_warp, out, skip_flow = to_flow(out, latent[:, i + 2], feat)
                skip = to_rgb(out_warp)
            else:
                # Later levels reuse the previous level's flow and RGB skips.
                out_warp, out, skip_flow = to_flow(out, latent[:, i + 2], feat, skip_flow)
                skip = to_rgb(out_warp, skip)
            i += 2

        img = skip

        return img
+ """ + + def __init__(self, channels, scale): + super(AntiAliasInterpolation2d, self).__init__() + sigma = (1 / scale - 1) / 2 + kernel_size = 2 * round(sigma * 4) + 1 + self.ka = kernel_size // 2 + self.kb = self.ka - 1 if kernel_size % 2 == 0 else self.ka + + kernel_size = [kernel_size, kernel_size] + sigma = [sigma, sigma] + # The gaussian kernel is the product of the + # gaussian function of each dimension. + kernel = 1 + meshgrids = torch.meshgrid( + [ + torch.arange(size, dtype=torch.float32) + for size in kernel_size + ] + ) + for size, std, mgrid in zip(kernel_size, sigma, meshgrids): + mean = (size - 1) / 2 + kernel *= torch.exp(-(mgrid - mean) ** 2 / (2 * std ** 2)) + + # Make sure sum of values in gaussian kernel equals 1. + kernel = kernel / torch.sum(kernel) + # Reshape to depthwise convolutional weight + kernel = kernel.view(1, 1, *kernel.size()) + kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1)) + + self.register_buffer('weight', kernel) + self.groups = channels + self.scale = scale + inv_scale = 1 / scale + self.int_inv_scale = int(inv_scale) + + def forward(self, input): + if self.scale == 1.0: + return input + + out = F.pad(input, (self.ka, self.kb, self.ka, self.kb)) + out = F.conv2d(out, weight=self.weight, groups=self.groups) + out = out[:, :, ::self.int_inv_scale, ::self.int_inv_scale] + + return out diff --git a/AniTalker-kit/AniTalker/code/renderer.py b/AniTalker-kit/AniTalker/code/renderer.py new file mode 100644 index 00000000..adfe0cd9 --- /dev/null +++ b/AniTalker-kit/AniTalker/code/renderer.py @@ -0,0 +1,25 @@ +from config import * + +def render_condition( + conf: TrainConfig, + model, + sampler, start, motion_direction_start, audio_driven, \ + face_location, face_scale, \ + yaw_pitch_roll, noisyT, control_flag, +): + if conf.train_mode == TrainMode.diffusion: + assert conf.model_type.has_autoenc() + + return sampler.sample(model=model, + noise=noisyT, + model_kwargs={ + 'motion_direction_start': motion_direction_start, + 
def check_package_installed(package_name):
    """Return True if *package_name* is importable, printing the result."""
    installed = importlib.util.find_spec(package_name) is not None
    if installed:
        print(f"{package_name} is installed.")
    else:
        print(f"{package_name} is not installed.")
    return installed
def img_preprocessing(img_path, size):
    """Load an image and return a (1, 3, size, size) float tensor in [-1, 1]."""
    array = load_image(img_path, size)  # channel-first array in [0, 1]
    tensor = torch.from_numpy(array).unsqueeze(0).float()
    # Shift from [0, 1] to the [-1, 1] range the renderer expects.
    return (tensor - 0.5) * 2.0
not os.path.exists(args.test_image_path): + print(f'{args.test_image_path} does not exist!') + exit(0) + + if not os.path.exists(args.test_audio_path): + print(f'{args.test_audio_path} does not exist!') + exit(0) + + img_source = img_preprocessing(args.test_image_path, args.image_size).to(args.device) + one_shot_lia_start, one_shot_lia_direction, feats = lia.get_start_direction_code(img_source, img_source, img_source, img_source) + + #======Loading Stage 2 model========= + model = LitModel(conf) + state = torch.load(args.stage2_checkpoint_path, map_location='cpu') + model.load_state_dict(state, strict=True) + model.ema_model.eval() + model.ema_model.to(args.device) + #================================= + + #======Audio Input========= + if conf.infer_type.startswith('mfcc'): + # MFCC features + wav, sr = librosa.load(args.test_audio_path, sr=16000) + input_values = python_speech_features.mfcc(signal=wav, samplerate=sr, numcep=13, winlen=0.025, winstep=0.01) + d_mfcc_feat = python_speech_features.base.delta(input_values, 1) + d_mfcc_feat2 = python_speech_features.base.delta(input_values, 2) + audio_driven_obj = np.hstack((input_values, d_mfcc_feat, d_mfcc_feat2)) + frame_start, frame_end = 0, int(audio_driven_obj.shape[0]/4) + audio_start, audio_end = int(frame_start * 4), int(frame_end * 4) # The video frame is fixed to 25 hz and the audio is fixed to 100 hz + + audio_driven = torch.Tensor(audio_driven_obj[audio_start:audio_end,:]).unsqueeze(0).float().to(args.device) + + elif conf.infer_type.startswith('hubert'): + # Hubert features + if not os.path.exists(args.test_hubert_path): + + if not check_package_installed('transformers'): + print('Please install transformers module first.') + exit(0) + hubert_model_path = './ckpts/chinese-hubert-large' + if not os.path.exists(hubert_model_path): + print('Please download the hubert weight into the ckpts path first.') + exit(0) + print('You did not extract the audio features in advance, extracting online now, which will 
increase processing delay') + + start_time = time.time() + + # load hubert model + from transformers import Wav2Vec2FeatureExtractor, HubertModel + audio_model = HubertModel.from_pretrained(hubert_model_path).to(args.device) + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_model_path) + audio_model.feature_extractor._freeze_parameters() + audio_model.eval() + + # hubert model forward pass + audio, sr = librosa.load(args.test_audio_path, sr=16000) + input_values = feature_extractor(audio, sampling_rate=16000, padding=True, do_normalize=True, return_tensors="pt").input_values + input_values = input_values.to(args.device) + ws_feats = [] + with torch.no_grad(): + outputs = audio_model(input_values, output_hidden_states=True) + for i in range(len(outputs.hidden_states)): + ws_feats.append(outputs.hidden_states[i].detach().cpu().numpy()) + ws_feat_obj = np.array(ws_feats) + ws_feat_obj = np.squeeze(ws_feat_obj, 1) + ws_feat_obj = np.pad(ws_feat_obj, ((0, 0), (0, 1), (0, 0)), 'edge') # align the audio length with video frame + + execution_time = time.time() - start_time + print(f"Extraction Audio Feature: {execution_time:.2f} Seconds") + + audio_driven_obj = ws_feat_obj + else: + print(f'Using audio feature from path: {args.test_hubert_path}') + audio_driven_obj = np.load(args.test_hubert_path) + + frame_start, frame_end = 0, int(audio_driven_obj.shape[1]/2) + audio_start, audio_end = int(frame_start * 2), int(frame_end * 2) # The video frame is fixed to 25 hz and the audio is fixed to 50 hz + + audio_driven = torch.Tensor(audio_driven_obj[:,audio_start:audio_end,:]).unsqueeze(0).float().to(args.device) + #============================ + + # Diffusion Noise + noisyT = torch.randn((1,frame_end, args.motion_dim)).to(args.device) + + #======Inputs for Attribute Control========= + if os.path.exists(args.pose_driven_path): + pose_obj = np.load(args.pose_driven_path) + + if len(pose_obj.shape) != 2: + print('please check your pose information. 
The shape must be like (T, 3).') + exit(0) + if pose_obj.shape[1] != 3: + print('please check your pose information. The shape must be like (T, 3).') + exit(0) + + if pose_obj.shape[0] >= frame_end: + pose_obj = pose_obj[:frame_end,:] + else: + padding = np.tile(pose_obj[-1, :], (frame_end - pose_obj.shape[0], 1)) + pose_obj = np.vstack((pose_obj, padding)) + + pose_signal = torch.Tensor(pose_obj).unsqueeze(0).to(args.device) / 90 # 90 is for normalization here + else: + yaw_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_yaw + pitch_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_pitch + roll_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.pose_roll + pose_signal = torch.cat((yaw_signal, pitch_signal, roll_signal), dim=-1) + + pose_signal = torch.clamp(pose_signal, -1, 1) + + face_location_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.face_location + face_scae_signal = torch.zeros(1, frame_end, 1).to(args.device) + args.face_scale + #=========================================== + + start_time = time.time() + + #======Diffusion Denosing Process========= + generated_directions = model.render(one_shot_lia_start, one_shot_lia_direction, audio_driven, face_location_signal, face_scae_signal, pose_signal, noisyT, args.step_T, control_flag=args.control_flag) + #========================================= + + execution_time = time.time() - start_time + print(f"Motion Diffusion Model: {execution_time:.2f} Seconds") + + generated_directions = generated_directions.detach().cpu().numpy() + + start_time = time.time() + #======Rendering images frame-by-frame========= + for pred_index in tqdm(range(generated_directions.shape[1])): + ori_img_recon = lia.render(one_shot_lia_start, torch.Tensor(generated_directions[:,pred_index,:]).to(args.device), feats) + ori_img_recon = ori_img_recon.clamp(-1, 1) + wav_pred = (ori_img_recon.detach() + 1) / 2 + saved_image(wav_pred, os.path.join(frames_result_saved_path, 
"%06d.png"%(pred_index))) + #============================================== + + execution_time = time.time() - start_time + print(f"Renderer Model: {execution_time:.2f} Seconds") + + frames_to_video(frames_result_saved_path, args.test_audio_path, predicted_video_256_path) + + shutil.rmtree(frames_result_saved_path) + + # Enhancer + if args.face_sr and check_package_installed('gfpgan'): + from face_sr.face_enhancer import enhancer_list + import imageio + + # Super-resolution + imageio.mimsave(predicted_video_512_path+'.tmp.mp4', enhancer_list(predicted_video_256_path, method='gfpgan', bg_upsampler=None), fps=float(25)) + + # Merge audio and video + video_clip = VideoFileClip(predicted_video_512_path+'.tmp.mp4') + audio_clip = AudioFileClip(predicted_video_256_path) + final_clip = video_clip.set_audio(audio_clip) + final_clip.write_videofile(predicted_video_512_path, codec='libx264', audio_codec='aac') + + os.remove(predicted_video_512_path+'.tmp.mp4') + + if args.face_sr: + return predicted_video_256_path, predicted_video_512_path + else: + return predicted_video_256_path, predicted_video_256_path + +def generate_video(uploaded_img, uploaded_audio, infer_type, + pose_yaw, pose_pitch, pose_roll, face_location, face_scale, step_T, device, face_sr, seed, face_crop): + if uploaded_img is None or uploaded_audio is None: + return None, gr.Markdown("Error: Input image or audio file is empty. 
Please check and upload both files.") + + model_mapping = { + "mfcc_pose_only": "./ckpts/stage2_pose_only_mfcc.ckpt", + "mfcc_full_control": "./ckpts/stage2_more_controllable_mfcc.ckpt", + "hubert_audio_only": "./ckpts/stage2_audio_only_hubert.ckpt", + "hubert_pose_only": "./ckpts/stage2_pose_only_hubert.ckpt", + "hubert_full_control": "./ckpts/stage2_full_control_hubert.ckpt", + } + + if face_crop: + from data_preprocess.crop_image2 import crop_image + print("==> croping source_img") + crop_path = os.path.join(os.path.dirname(uploaded_img), 'crop_'+os.path.basename(uploaded_img)) + try: + crop_image(uploaded_img, crop_path) + if os.path.exists(crop_path): + uploaded_img = crop_path + except: + print('==> crop image failed, use original source for animate') + + stage2_checkpoint_path = model_mapping.get(infer_type, "default_checkpoint.ckpt") + try: + args = argparse.Namespace( + infer_type=infer_type, + test_image_path=uploaded_img, + test_audio_path=uploaded_audio, + test_hubert_path='', + result_path='./outputs/', + stage1_checkpoint_path='./ckpts/stage1.ckpt', + stage2_checkpoint_path=stage2_checkpoint_path, + seed=seed, + control_flag=True, + pose_yaw=pose_yaw, + pose_pitch=pose_pitch, + pose_roll=pose_roll, + face_location=face_location, + pose_driven_path='not_supported_in_this_mode', + face_scale=face_scale, + step_T=step_T, + image_size=256, + device=device, + motion_dim=20, + decoder_layers=2, + face_sr=face_sr + ) + + # Save the uploaded audio to the expected path + # shutil.copy(uploaded_audio, args.test_audio_path) + + # Run the main function + output_256_video_path, output_512_video_path = main(args) + + # Check if the output video file exists + if not os.path.exists(output_256_video_path): + return None, gr.Markdown("Error: Video generation failed. 
Please check your inputs and try again.") + if output_256_video_path == output_512_video_path: + return gr.Video(value=output_256_video_path), None, gr.Markdown("Video (256*256 only) generated successfully!") + return gr.Video(value=output_256_video_path), gr.Video(value=output_512_video_path), gr.Markdown("Video generated successfully!") + + except Exception as e: + return None, None, gr.Markdown(f"Error: An unexpected error occurred - {str(e)}") + +default_values = { + "pose_yaw": 0, + "pose_pitch": 0, + "pose_roll": 0, + "face_location": 0.5, + "face_scale": 0.5, + "step_T": 50, + "seed": 0, + "device": "cuda" +} + +with gr.Blocks() as demo: + gr.Markdown('# AniTalker') + gr.Markdown('![]()') + with gr.Row(): + with gr.Column(): + uploaded_img = gr.Image(type="filepath", label="Reference Image") + face_crop = gr.Checkbox(label="Face Crop (dlib)", value=False) + uploaded_audio = gr.Audio(type="filepath", label="Input Audio") + with gr.Column(): + output_video_256 = gr.Video(label="Generated Video (256)") + output_video_512 = gr.Video(label="Generated Video (512)") + output_message = gr.Markdown() + + + + generate_button = gr.Button("Generate Video") + + with gr.Accordion("Configuration", open=True): + infer_type = gr.Dropdown( + label="Inference Type", + choices=['mfcc_pose_only', 'mfcc_full_control', 'hubert_audio_only', 'hubert_pose_only'], + value='hubert_audio_only' + ) + face_sr = gr.Checkbox(label="Enable Face Super-Resolution (512*512)", value=False) + seed = gr.Number(label="Seed", value=default_values["seed"]) + pose_yaw = gr.Slider(label="pose_yaw", minimum=-1, maximum=1, value=default_values["pose_yaw"]) + pose_pitch = gr.Slider(label="pose_pitch", minimum=-1, maximum=1, value=default_values["pose_pitch"]) + pose_roll = gr.Slider(label="pose_roll", minimum=-1, maximum=1, value=default_values["pose_roll"]) + face_location = gr.Slider(label="face_location", minimum=0, maximum=1, value=default_values["face_location"]) + face_scale = 
gr.Slider(label="face_scale", minimum=0, maximum=1, value=default_values["face_scale"]) + step_T = gr.Slider(label="step_T", minimum=1, maximum=100, step=1, value=default_values["step_T"]) + device = gr.Radio(label="Device", choices=["cuda", "cpu"], value=default_values["device"]) + + + generate_button.click( + generate_video, + inputs=[ + uploaded_img, uploaded_audio, infer_type, + pose_yaw, pose_pitch, pose_roll, face_location, face_scale, step_T, device, face_sr, seed, + face_crop + ], + outputs=[output_video_256, output_video_512, output_message] + ) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='EchoMimic') + parser.add_argument('--server_name', type=str, default='0.0.0.0', help='Server name') + parser.add_argument('--server_port', type=int, default=3001, help='Server port') + args = parser.parse_args() + + demo.launch(server_name=args.server_name, server_port=args.server_port, inbrowser=True) \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/demo.ipynb b/AniTalker-kit/AniTalker/demo.ipynb new file mode 100644 index 00000000..5604b03c --- /dev/null +++ b/AniTalker-kit/AniTalker/demo.ipynb @@ -0,0 +1,633 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Cloning code and models" + ], + "metadata": { + "id": "S7agBOp_UaAK" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EgexeiqnS62o", + "collapsed": true + }, + "outputs": [], + "source": [ + "!git clone https://github.com/X-LANCE/AniTalker.git\n", + "!pip install pytorch_lightning torchmetrics 
espnet moviepy python_speech_features gradio" + ] + }, + { + "cell_type": "code", + "source": [ + "%cd AniTalker/\n", + "!git clone https://huggingface.co/taocode/anitalker_ckpts\n", + "!mv anitalker_ckpts ckpts" + ], + "metadata": { + "id": "ZfNzsb6sUBmC", + "collapsed": true + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Run the demo gradio gui" + ], + "metadata": { + "id": "eCtHYoZZU2tT" + } + }, + { + "cell_type": "code", + "source": [ + "!python ./code/webgui.py" + ], + "metadata": { + "collapsed": true, + "id": "30_dXL7Ju7Ph" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Change to your own" + ], + "metadata": { + "id": "Gv2cTkSkzhyB" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Upload your_image.jpg" + ], + "metadata": { + "id": "87JLKPUk7l8Q" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "from google.colab import files\n", + "uploaded_image = files.upload()\n", + "image_filename = list(uploaded_image.keys())[0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 73 + }, + "id": "SV5_LKjO7j9I", + "outputId": "7b275fb4-dbf7-47f7-a770-36453b35ea05" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. 
Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving a.jpg to a (1).jpg\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Upload your_voise.wav" + ], + "metadata": { + "id": "CaTJCKZ19MYx" + } + }, + { + "cell_type": "code", + "source": [ + "uploaded_audio = files.upload()\n", + "audio_filename = list(uploaded_audio.keys())[0]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 73 + }, + "id": "9qBMAMDl9Hpx", + "outputId": "97123af9-24db-4e99-d6e7-8c729aa510ce" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + " \n", + " Upload widget is only available when the cell has been executed in the\n", + " current browser session. Please rerun this cell to enable.\n", + " \n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving b.wav to b (1).wav\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "folder_name = input(\"Enter the output folder name: \") # enter non-empty string and click Enter key" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NCEQB1QR9kWP", + "outputId": "659fe1d6-d9d3-4050-bca8-7c06bac27f03" + }, + "execution_count": null, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Enter the output folder name: your_demo\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Run your demo and wait for 3min" + ], + "metadata": { + "id": "M2wwP7Z_-fyP" + } + }, + { + "cell_type": "code", + "source": [ + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "command = f\"python ./code/demo.py --infer_type 'hubert_audio_only' --stage1_checkpoint_path 'ckpts/stage1.ckpt' --stage2_checkpoint_path 
'ckpts/stage2_audio_only_hubert.ckpt' --test_image_path '{image_filename}' --test_audio_path '{audio_filename}' --result_path '{folder_name}'\"\n", + "os.system(command)\n", + "image_name = list(uploaded_image.keys())[0].split('.')[0]\n", + "audio_name = list(uploaded_audio.keys())[0].split('.')[0]\n", + "video_path = f'/content/AniTalker/{folder_name}/{image_name}-{audio_name}.mp4'\n", + "video = open(video_path, \"rb\").read()\n", + "video_encoded = b64encode(video).decode('ascii')\n", + "HTML(data=f'''\n", + " \n", + "''')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 621 + }, + "id": "tHer8O1Y_OOv", + "outputId": "11fa9395-6a53-442d-9b43-6e157d4b725f" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n" + ] + }, + "metadata": {}, + "execution_count": 38 + } + ] + } + ] +} \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/docs/_config.yml b/AniTalker-kit/AniTalker/docs/_config.yml new file mode 100644 index 00000000..c4192631 --- /dev/null +++ b/AniTalker-kit/AniTalker/docs/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-cayman \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/docs/css/styles.css b/AniTalker-kit/AniTalker/docs/css/styles.css new file mode 100644 index 00000000..9fa5f46d --- /dev/null +++ b/AniTalker-kit/AniTalker/docs/css/styles.css @@ -0,0 +1,18 @@ +@import "https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css"; +@import "https://cdn.jsdelivr.net/npm/@creativebulma/bulma-tooltip@1.2.0/dist/bulma-tooltip.min.css"; +@media screen and (min-width: 1024px) { + .container { + max-width: 860px; + } +} + +.authors>span { + padding: 0 0.5rem; + display: inline-block; +} + +@media only screen and (max-width: 480px) { + a.button.is-rounded.is-link.is-light:not(:last-child) { + margin-bottom: 0.75em; + } +} diff --git a/AniTalker-kit/AniTalker/docs/img/generated_result.png 
b/AniTalker-kit/AniTalker/docs/img/generated_result.png new file mode 100644 index 00000000..e13335b5 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/img/generated_result.png differ diff --git a/AniTalker-kit/AniTalker/docs/img/method_overview.png b/AniTalker-kit/AniTalker/docs/img/method_overview.png new file mode 100644 index 00000000..7aeeb336 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/img/method_overview.png differ diff --git a/AniTalker-kit/AniTalker/docs/index.html b/AniTalker-kit/AniTalker/docs/index.html new file mode 100644 index 00000000..a5810c89 --- /dev/null +++ b/AniTalker-kit/AniTalker/docs/index.html @@ -0,0 +1,269 @@ + + + + + + + AniTalker + + + + + + + + +
+
+

+ AniTalker: Animate Vivid and Diverse Talking Faces through Identity-Decoupled Facial Motion Encoding +

+ +

+ Codes - Paper +

+ + +
+
+ +
+
+ +
+ +
+

+ " We introduce AniTalker, a framework that transforms a single static portrait and input audio into animated talking videos with naturally flowing movements. Each column of generated results utilizes identical control signals with similar poses and expressions but incorporates some random variations, demonstrating the diversity of our generated outcomes. " +

+
+
+ +
+
+

+ Abstract +

+
+ +

+ The paper introduces AniTalker, an innovative framework designed to generate lifelike talking faces from a single portrait. Unlike existing models that primarily focus on verbal cues such as lip synchronization and fail to capture the complex dynamics of facial expressions and nonverbal cues, AniTalker employs a universal motion representation. This innovative representation effectively captures a wide range of facial dynamics, including subtle expressions and head movements. AniTalker enhances motion depiction through two self-supervised learning strategies: the first involves reconstructing target video frames from source frames within the same identity to learn subtle motion representations, and the second develops an identity encoder using metric learning while actively minimizing mutual information between the identity and motion encoders. This approach ensures that the motion representation is dynamic and devoid of identity-specific details, significantly reducing the need for labeled data. Additionally, the integration of a diffusion model with a variance adapter allows for the generation of diverse and controllable facial animations. This method not only demonstrates AniTalker’s capability to create detailed and realistic facial movements but also underscores its potential in crafting dynamic avatars for real-world applications. +

+ +
+
+
+ + +
+
+

+ Architecture +

+
+ +
+ "Our framework comprises two main components: learning a universal motion representation and then generating and manipulating this representation through a sequence model. Specifically, the first part aims to learn a robust motion representation by employing metric learning (ML), mutual information disentanglement (MID), and Hierarchical Aggregation Layer (HAL). Subsequently, this motion representation can be used for further generation and manipulation. " +
+
+ + +
+ +

+ Video Demos +

+

+ [NOTE] The videos below are generated content, and the audio does not represent the speaker's opinion. +

+
+ +
+ +
+

Audio-driven Talking Face Generation (Realism)

+

+ + + + + + + + + + + + +
+
+
+ +

Audio-driven Talking Face Generation (Statue/Cartoon)

+

+ + + + + + + + + + + + +
+
+
+ +

Video-driven Talking Face Generation (Cross/Self Reenactment)

+

+ + + + + + + + + + + + +
+
+
+ + + + + + + + + + + + + +
Diversity
Controllability
+
+
+ + + + + + + + + + + + + + + + + +
Long Video Generation (Mars Story)
Long Video Generation (Mona Lisa)
Prompt: + "Tell a story about a wonderful journey + exploring Mars to a 5-year-old kid."
Prompt: + "If you were Mona Lisa and lived in the present, + tell us about your thoughts."
+
+
+ + + + + + + + + + + + + + +
Method Comparsion (Audio-driven)
Method Comparsion (Video-driven)
+
+
+ +
+
+
+
+
+ + +
+
+
+

Ablation Studies

+ +
+
+
+ + +
+
+

+ Ethical Consideration +

+
+

+ + The potential misuse of lifelike digital human face generation, such as for creating fraudulent identities or disseminating misinformation, necessitates preemptive ethical measures. Before utilizing these models, it is crucial for organizations to integrate ethical guidelines into their policies, ensuring the application of this technology emphasizes consent, transparency, and accountability. Furthermore, it is recommended to embed visible or invisible digital watermarks in any generated content. + +

+ +
+ +

+ Removal Policy +

+
+ +

+ + Please be aware that all videos on this page are algorithmically generated from publicly available sources and are intended solely for academic demonstrations and algorithm comparisons. Any other form of usage is prohibited. If you feel uncomfortable or have any questions, please raise an issue , and we will address your request promptly. Besides, if required by the original image owner or in the case of misuse of the models, the images, models, and codes associated with this project may be removed at any time. +

+
+
+
+ + + + + diff --git a/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_1_realism.mp4 b/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_1_realism.mp4 new file mode 100644 index 00000000..26b3fd30 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_1_realism.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_2_realism.mp4 b/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_2_realism.mp4 new file mode 100644 index 00000000..46e29a31 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_2_realism.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_3_cartoon.mp4 b/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_3_cartoon.mp4 new file mode 100644 index 00000000..091e35ec Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_3_cartoon.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_4_statue.mp4 b/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_4_statue.mp4 new file mode 100644 index 00000000..0b5ce534 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/1_audio_driven_4_statue.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/2_video_driven_cross_identity.mp4 b/AniTalker-kit/AniTalker/docs/videos/2_video_driven_cross_identity.mp4 new file mode 100644 index 00000000..92c6a1ff Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/2_video_driven_cross_identity.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/2_video_driven_same_identity.mp4 b/AniTalker-kit/AniTalker/docs/videos/2_video_driven_same_identity.mp4 new file mode 100644 index 00000000..4b55e151 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/2_video_driven_same_identity.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/4_diversity_demo.mp4 b/AniTalker-kit/AniTalker/docs/videos/4_diversity_demo.mp4 new file mode 100644 index 00000000..e9628f15 Binary files /dev/null and 
b/AniTalker-kit/AniTalker/docs/videos/4_diversity_demo.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/5_controllability_demo.mp4 b/AniTalker-kit/AniTalker/docs/videos/5_controllability_demo.mp4 new file mode 100644 index 00000000..c2898d4c Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/5_controllability_demo.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/6_long_video_generation-1.mp4 b/AniTalker-kit/AniTalker/docs/videos/6_long_video_generation-1.mp4 new file mode 100644 index 00000000..50993ec2 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/6_long_video_generation-1.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/6_long_video_generation-2.mp4 b/AniTalker-kit/AniTalker/docs/videos/6_long_video_generation-2.mp4 new file mode 100644 index 00000000..4a6e4557 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/6_long_video_generation-2.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/7_method_comparsion_audio_driven.mp4 b/AniTalker-kit/AniTalker/docs/videos/7_method_comparsion_audio_driven.mp4 new file mode 100644 index 00000000..47922076 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/7_method_comparsion_audio_driven.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/7_method_comparsion_video_driven.mp4 b/AniTalker-kit/AniTalker/docs/videos/7_method_comparsion_video_driven.mp4 new file mode 100644 index 00000000..99361fb9 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/7_method_comparsion_video_driven.mp4 differ diff --git a/AniTalker-kit/AniTalker/docs/videos/8_ablation_studies.mp4 b/AniTalker-kit/AniTalker/docs/videos/8_ablation_studies.mp4 new file mode 100644 index 00000000..ca88f631 Binary files /dev/null and b/AniTalker-kit/AniTalker/docs/videos/8_ablation_studies.mp4 differ diff --git a/AniTalker-kit/AniTalker/md_docs/config.md b/AniTalker-kit/AniTalker/md_docs/config.md new file mode 100644 index 00000000..d8117cb1 --- 
/dev/null +++ b/AniTalker-kit/AniTalker/md_docs/config.md @@ -0,0 +1,31 @@ +# Explanation of Parameters for demo.py + +| index | Name | Type | Description | +| --- | --- | --- | --- | +| 1 | infer_type | String | Single choices: ['mfcc_pose_only', 'mfcc_full_control', 'hubert_audio_only', 'hubert_pose_only'] | +| 2 | test_image_path | String | Path to the portrait (.jpg or .png) | +| 3 | test_audio_path | String | Path to the driven audio (.wav or .mp3) | +| 4 | test_hubert_path | String | Path to the Hubert feature of the driven audio (.npy). Not needed for MFCC model | +| 5 | result_path | String | The result will be saved to this folder | +| 6 | stage1_checkpoint_path | String | The model path for the first stage. Fixed to ./ckpts/stage1.ckpt in our experiment | +| 7 | stage2_checkpoint_path | String | The model path for the second stage. This model will change with infer_type, see the table below for specific relationships | +| 8 | seed | integer | The seed of the second model to control diversity. | +| 9 | control_flag | boolean | Whether to enable control signals, does not work for audio-only models. | +| 10 | pose_yaw | float | Yaw angle for head pose. Already normalized, ranging from -1 to 1, representing -90° to 90°, only effective when control_flag is enabled and for pose designated models | +| 11 | pose_pitch | float | Pitch angle for head pose. Already normalized, ranging from -1 to 1, representing -90° to 90°, only effective when control_flag is enabled and for pose designated models | +| 12 | pose_roll | float | Roll angle for head pose. Already normalized, ranging from -1 to 1, representing -90° to 90°, only effective when control_flag is enabled and for pose designated models | +| 13 | pose_driven_path | str | [Optional] Path to pose numpy, shape is (T, 3). You can check the following code https://github.com/liutaocode/talking_face_preprocessing to extract the yaw, pitch and roll. 
| +| 14 | face_location | float | x-coordinate of the nose (screen coordinate). Already normalized, ranging from 0 to 1, representing from the leftmost to the rightmost of the screen, with most values centered around 0.5 (i.e., centered), effective when control_flag is enabled and only for models with the more_controllable identifier | +| 15 | face_scale | float | Size of the face (or distance from the camera). Already normalized, ranging from 0 to 1, representing the size of the face, effective when control_flag is enabled and only for models with the more_controllable identifier | +| 16 | step_T | integer | Number of diffusion denoising steps, default 50. | +| 17 | image_size | integer | Image size, currently only supports 256 in our experiment | +| 18 | motion_dim | integer | Dimension of motion, currently only supports 20 in our experiment | + +Current model mapping table: + +| index | infer_type | stage2_checkpoint_name | +| --- | --- | --- | +| 1 | mfcc_pose_only | stage2_pose_only_mfcc.ckpt | +| 2 | mfcc_full_control | stage2_more_controllable_mfcc.ckpt | +| 3 | hubert_audio_only | stage2_audio_only_hubert.ckpt | +| 4 | hubert_pose_only | stage2_pose_only_hubert.ckpt | \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/md_docs/mac_os_env_list/conda_environment.yml b/AniTalker-kit/AniTalker/md_docs/mac_os_env_list/conda_environment.yml new file mode 100644 index 00000000..53d9438a --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/mac_os_env_list/conda_environment.yml @@ -0,0 +1,155 @@ +name: anitalker +channels: + - pytorch + - conda-forge + - defaults +dependencies: + - brotli-python=1.1.0=py39hb198ff7_1 + - ca-certificates=2024.7.4=hf0a4a13_0 + - certifi=2024.7.4=pyhd8ed1ab_0 + - cffi=1.17.0=py39h210d88a_0 + - charset-normalizer=3.3.2=pyhd8ed1ab_0 + - filelock=3.15.4=pyhd8ed1ab_0 + - freetype=2.12.1=hadb7bae_2 + - gmp=6.3.0=h7bae524_2 + - gmpy2=2.1.5=py39h9bb7c0c_1 + - h2=4.1.0=pyhd8ed1ab_0 + - hpack=4.0.0=pyh9f0ad1d_0 + - 
hyperframe=6.0.1=pyhd8ed1ab_0 + - idna=3.7=pyhd8ed1ab_0 + - jinja2=3.1.4=pyhd8ed1ab_0 + - lcms2=2.16=ha0e7c42_0 + - lerc=4.0.0=h9a09cb3_0 + - libblas=3.9.0=19_osxarm64_openblas + - libcblas=3.9.0=19_osxarm64_openblas + - libcxx=18.1.8=h5a72898_2 + - libdeflate=1.20=h93a5062_0 + - libffi=3.4.2=h3422bc3_5 + - libgfortran=5.0.0=13_2_0_hd922786_3 + - libgfortran5=13.2.0=hf226fd6_3 + - libjpeg-turbo=3.0.0=hb547adb_1 + - liblapack=3.9.0=19_osxarm64_openblas + - libopenblas=0.3.24=openmp_hd76b1f2_0 + - libpng=1.6.43=h091b4b1_0 + - libsqlite=3.46.0=hfb93653_0 + - libtiff=4.6.0=h07db509_3 + - libwebp-base=1.4.0=h93a5062_0 + - libxcb=1.15=hf346824_0 + - libzlib=1.2.13=hfb2fe0b_6 + - llvm-openmp=15.0.7=h7cfbb63_0 + - markupsafe=2.1.5=py39h17cfd9d_0 + - mpc=1.3.1=h91ba8db_0 + - mpfr=4.2.1=h1cfca0a_2 + - mpmath=1.3.0=pyhd8ed1ab_0 + - ncurses=6.5=hb89a1cb_0 + - networkx=3.2=pyhd8ed1ab_0 + - openjpeg=2.5.2=h9f1df11_0 + - openssl=3.3.1=hfb2fe0b_2 + - pillow=10.3.0=py39h3352c98_0 + - pthread-stubs=0.4=h27ca646_1001 + - pycparser=2.22=pyhd8ed1ab_0 + - pysocks=1.7.1=pyha2e5f31_6 + - python=3.9.7=hc0da0df_3_cpython + - python_abi=3.9=4_cp39 + - pytorch=2.4.0=py3.9_0 + - pyyaml=6.0.2=py39hfea33bf_0 + - readline=8.2=h92ec313_1 + - requests=2.32.3=pyhd8ed1ab_0 + - setuptools=72.1.0=pyhd8ed1ab_0 + - sqlite=3.46.0=h5838104_0 + - sympy=1.13.1=pypyh2585a3b_103 + - tk=8.6.13=h5083fa2_1 + - torchaudio=2.4.0=py39_cpu + - torchvision=0.19.0=py39_cpu + - typing_extensions=4.12.2=pyha770c72_0 + - tzdata=2024a=h0c530f3_0 + - urllib3=2.2.2=pyhd8ed1ab_1 + - wheel=0.44.0=pyhd8ed1ab_0 + - xorg-libxau=1.0.11=hb547adb_0 + - xorg-libxdmcp=1.1.3=h27ca646_0 + - xz=5.2.6=h57fd34a_0 + - yaml=0.2.5=h3422bc3_2 + - zlib=1.2.13=hfb2fe0b_6 + - zstandard=0.22.0=py39h0b77d07_1 + - zstd=1.5.6=hb46c0d2_0 + - pip: + - absl-py==2.1.0 + - aiohappyeyeballs==2.3.5 + - aiohttp==3.10.2 + - aiosignal==1.3.1 + - antlr4-python3-runtime==4.9.3 + - asteroid-filterbanks==0.4.0 + - async-timeout==4.0.3 + - attrs==24.2.0 + - 
audioread==3.0.1 + - ci-sdr==0.0.2 + - click==8.1.7 + - configargparse==1.7 + - ctc-segmentation==1.7.4 + - cython==3.0.11 + - decorator==4.4.2 + - distance==0.1.3 + - editdistance==0.8.1 + - einops==0.8.0 + - espnet-tts-frontend==0.0.3 + - fast-bss-eval==0.1.3 + - frozenlist==1.4.1 + - fsspec==2024.6.1 + - g2p-en==2.1.0 + - grpcio==1.65.4 + - h5py==3.11.0 + - huggingface-hub==0.24.5 + - humanfriendly==10.0 + - hydra-core==1.3.2 + - imageio==2.34.2 + - imageio-ffmpeg==0.5.1 + - importlib-metadata==4.13.0 + - inflect==7.3.1 + - jaconv==0.4.0 + - jamo==0.4.1 + - joblib==1.4.2 + - kaldiio==2.18.0 + - librosa==0.9.2 + - llvmlite==0.43.0 + - markdown==3.6 + - more-itertools==10.4.0 + - moviepy==1.0.3 + - multidict==6.0.5 + - nltk==3.8.2 + - numba==0.60.0 + - numpy==2.0.1 + - omegaconf==2.3.0 + - opt-einsum==3.3.0 + - packaging==24.1 + - pip==24.0 + - platformdirs==4.2.2 + - pooch==1.8.2 + - proglog==0.1.10 + - protobuf==3.20.1 + - pydeprecate==0.3.2 + - pypinyin==0.44.0 + - python-speech-features==0.6 + - pytorch-lightning==1.6.5 + - pyworld==0.3.4 + - regex==2024.7.24 + - resampy==0.4.3 + - safetensors==0.4.4 + - scikit-learn==1.5.1 + - scipy==1.13.1 + - sentencepiece==0.1.97 + - six==1.16.0 + - soundfile==0.12.1 + - tensorboard==2.17.0 + - tensorboard-data-server==0.7.2 + - threadpoolctl==3.5.0 + - tokenizers==0.19.1 + - torch-complex==0.4.4 + - torchmetrics==0.5.0 + - tqdm==4.66.5 + - transformers==4.44.0 + - typeguard==4.3.0 + - unidecode==1.3.8 + - werkzeug==3.0.3 + - yarl==1.9.4 + - zipp==3.19.2 +prefix: /opt/anaconda3/envs/anitalker diff --git a/AniTalker-kit/AniTalker/md_docs/mac_os_env_list/pip_requirements.txt b/AniTalker-kit/AniTalker/md_docs/mac_os_env_list/pip_requirements.txt new file mode 100644 index 00000000..944fee4e --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/mac_os_env_list/pip_requirements.txt @@ -0,0 +1,105 @@ +absl-py==2.1.0 +aiohappyeyeballs==2.3.5 +aiohttp==3.10.2 +aiosignal==1.3.1 +antlr4-python3-runtime==4.9.3 
+asteroid-filterbanks==0.4.0 +async-timeout==4.0.3 +attrs==24.2.0 +audioread==3.0.1 +Brotli @ file:///Users/runner/miniforge3/conda-bld/brotli-split_1695989934239/work +certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1720457958366/work/certifi +cffi @ file:///Users/runner/miniforge3/conda-bld/cffi_1723018415051/work +charset-normalizer @ file:///home/conda/feedstock_root/build_artifacts/charset-normalizer_1698833585322/work +ci_sdr==0.0.2 +click==8.1.7 +ConfigArgParse==1.7 +ctc_segmentation==1.7.4 +Cython==3.0.11 +decorator==4.4.2 +Distance==0.1.3 +editdistance==0.8.1 +einops==0.8.0 +-e git+https://github.com/espnet/espnet.git@b1046403ec7a20469594cb9f6ad3cbe58a7e6c81#egg=espnet +espnet-tts-frontend==0.0.3 +fast_bss_eval==0.1.3 +filelock @ file:///home/conda/feedstock_root/build_artifacts/filelock_1719088281970/work +frozenlist==1.4.1 +fsspec==2024.6.1 +g2p-en==2.1.0 +gmpy2 @ file:///Users/runner/miniforge3/conda-bld/gmpy2_1715527352339/work +grpcio==1.65.4 +h2 @ file:///home/conda/feedstock_root/build_artifacts/h2_1634280454336/work +h5py==3.11.0 +hpack==4.0.0 +huggingface-hub==0.24.5 +humanfriendly==10.0 +hydra-core==1.3.2 +hyperframe @ file:///home/conda/feedstock_root/build_artifacts/hyperframe_1619110129307/work +idna @ file:///home/conda/feedstock_root/build_artifacts/idna_1713279365350/work +imageio==2.34.2 +imageio-ffmpeg==0.5.1 +importlib-metadata==4.13.0 +inflect==7.3.1 +jaconv==0.4.0 +jamo==0.4.1 +Jinja2 @ file:///home/conda/feedstock_root/build_artifacts/jinja2_1715127149914/work +joblib==1.4.2 +kaldiio==2.18.0 +librosa==0.9.2 +llvmlite==0.43.0 +Markdown==3.6 +MarkupSafe @ file:///Users/runner/miniforge3/conda-bld/markupsafe_1706900018209/work +more-itertools==10.4.0 +moviepy==1.0.3 +mpmath @ file:///home/conda/feedstock_root/build_artifacts/mpmath_1678228039184/work +multidict==6.0.5 +networkx @ file:///home/conda/feedstock_root/build_artifacts/networkx_1697694336581/work +nltk==3.8.2 +numba==0.60.0 +numpy==2.0.1 +omegaconf==2.3.0 
+opt-einsum==3.3.0 +packaging==24.1 +pillow @ file:///Users/runner/miniforge3/conda-bld/pillow_1712154530882/work +platformdirs==4.2.2 +pooch==1.8.2 +proglog==0.1.10 +protobuf==3.20.1 +pycparser @ file:///home/conda/feedstock_root/build_artifacts/pycparser_1711811537435/work +pyDeprecate==0.3.2 +pypinyin==0.44.0 +PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work +python_speech_features==0.6 +pytorch-lightning==1.6.5 +pyworld==0.3.4 +PyYAML @ file:///Users/runner/miniforge3/conda-bld/pyyaml_1723018221953/work +regex==2024.7.24 +requests @ file:///home/conda/feedstock_root/build_artifacts/requests_1717057054362/work +resampy==0.4.3 +safetensors==0.4.4 +scikit-learn==1.5.1 +scipy==1.13.1 +sentencepiece==0.1.97 +six==1.16.0 +soundfile==0.12.1 +sympy @ file:///home/conda/feedstock_root/build_artifacts/sympy_1723268786625/work +tensorboard==2.17.0 +tensorboard-data-server==0.7.2 +threadpoolctl==3.5.0 +tokenizers==0.19.1 +torch==2.4.0 +torch-complex==0.4.4 +torchaudio==2.4.0 +torchmetrics==0.5.0 +torchvision==0.19.0 +tqdm==4.66.5 +transformers==4.44.0 +typeguard==4.3.0 +typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1717802530399/work +Unidecode==1.3.8 +urllib3 @ file:///home/conda/feedstock_root/build_artifacts/urllib3_1719391292974/work +Werkzeug==3.0.3 +yarl==1.9.4 +zipp==3.19.2 +zstandard==0.22.0 diff --git a/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_audio_only.md b/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_audio_only.md new file mode 100644 index 00000000..43d7ce2c --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_audio_only.md @@ -0,0 +1,92 @@ +## More hubert cases (Audio-only Model) + +**Features of this model include:** +- The driving signals require only one image plus an audio segment. +- It offers good visual stability. +- It forces the face to orient forward. +- The expressiveness is moderate. 
+ +## Einstein + +``` +python ./code/demo.py \ + --infer_type 'hubert_audio_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_audio_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/Einstein.png' \ + --test_audio_path 'test_demos/audios/english_male.mp3' \ + --test_hubert_path 'test_demos/audios_hubert/english_male.npy' \ + --result_path 'outputs/Einstein_hubert/' +``` + + +- The generated video of this sample will be saved to [outputs/Einstein_hubert/Einstein-english_male.mp4](outputs/Einstein_hubert/Einstein-english_male.mp4). + + +### Storytelling (Chinese) + +``` +python ./code/demo.py \ + --infer_type 'hubert_audio_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_audio_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/girl.png' \ + --test_audio_path 'test_demos/audios/lianliru.wav' \ + --test_hubert_path 'test_demos/audios_hubert/lianliru.npy' \ + --result_path 'outputs/lianliru_hubert/' +``` + +- The generated video of this sample will be saved to [outputs/lianliru_hubert/girl-lianliru.mp4](outputs/lianliru_hubert/girl-lianliru.mp4). + + +## Long Story Generation + +``` +python ./code/demo.py \ + --infer_type 'hubert_audio_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_audio_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/cartoon_girl.png' \ + --test_audio_path 'test_demos/audios/mars.wav' \ + --test_hubert_path 'test_demos/audios_hubert/mars.npy' \ + --result_path 'outputs/cartoon_girl_mars_story_hubert/' +``` +- The generated video of this sample will be saved to [outputs/cartoon_girl_mars_story_hubert/cartoon_girl-mars.mp4](outputs/cartoon_girl_mars_story_hubert/cartoon_girl-mars.mp4). 
+ +## Statue + +``` +python ./code/demo.py \ + --infer_type 'hubert_audio_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_audio_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/statue.jpg' \ + --test_audio_path 'test_demos/audios/statue.wav' \ + --test_hubert_path 'test_demos/audios_hubert/statue.npy' \ + --result_path 'outputs/statue_hubert/' +``` +- The generated video of this sample will be saved to [outputs/statue_hubert/statue-statue.mp4](outputs/statue_hubert/statue-statue.mp4). + +## Your own case + +``` +python ./code/demo.py \ + --infer_type 'hubert_audio_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_audio_only_hubert.ckpt' \ + --test_image_path '/path/to/image_path(png or jpg)' \ + --test_audio_path '/path/to/audio_path(wav or mp3)' \ + --test_hubert_path '/path/to/hubert_feature_path(npy)' \ + --result_path '/path/to/saved_folder/' +``` + +Change the above `/path/to/xxx` to your own path or folder. + + + +## Reference + +- Image of `Einstein.png` is from [GAIA](https://gaiavatar.github.io/gaia/) +- 'lianliru.wav' is from [StoryTTS](https://github.com/X-LANCE/StoryTTS) dataset. +- Image of `girl.png` and `cartoon_girl.png` were generated by [yiyan](https://yiyan.baidu.com/). + diff --git a/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_more_control.md b/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_more_control.md new file mode 100644 index 00000000..69e37e82 --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_more_control.md @@ -0,0 +1,106 @@ +## More hubert cases (More-controllable Model) + +**Features of this model include:** +- The driving signals require one image plus an audio segment. +- You can also adjust the pose_yaw, pose_pitch, and pose_roll. +- It offers moderate visual stability. +- The expressiveness is better. 
+
+### Storytelling (Chinese)
+
+```
+python ./code/demo.py \
+ --infer_type 'hubert_full_control' \
+ --stage1_checkpoint_path 'ckpts/stage1.ckpt' \
+ --stage2_checkpoint_path 'ckpts/stage2_full_control_hubert.ckpt' \
+ --test_image_path 'test_demos/portraits/sad.jpg' \
+ --test_audio_path 'test_demos/audios/lianliru.wav' \
+ --test_hubert_path 'test_demos/audios_hubert/lianliru.npy' \
+ --result_path 'outputs/lianliru_hubert_full_control/' \
+ --control_flag \
+ --seed 0 \
+ --pose_yaw 0 \
+ --pose_pitch 0 \
+ --pose_roll 0 \
+ --face_location 0.5 \
+ --face_scale 0.5 \
+ --face_sr
+```
+
+- The generated video of this sample will be saved to [outputs/lianliru_hubert_full_control/sad-lianliru.mp4](../outputs/lianliru_hubert_full_control/sad-lianliru.mp4).
+- 'lianliru.wav' is from [StoryTTS](https://github.com/X-LANCE/StoryTTS) dataset.
+
+## Einstein
+
+```
+python ./code/demo.py \
+ --infer_type 'hubert_full_control' \
+ --stage1_checkpoint_path 'ckpts/stage1.ckpt' \
+ --stage2_checkpoint_path 'ckpts/stage2_full_control_hubert.ckpt' \
+ --test_image_path 'test_demos/portraits/Einstein.png' \
+ --test_audio_path 'test_demos/audios/english_male.mp3' \
+ --test_hubert_path 'test_demos/audios_hubert/english_male.npy' \
+ --result_path 'outputs/Einstein_hubert_full_control/' \
+ --control_flag \
+ --seed 0 \
+ --pose_yaw 0 \
+ --pose_pitch 0 \
+ --pose_roll 0 \
+ --face_location 0.5 \
+ --face_scale 0.5 \
+ --face_sr
+```
+
+
+- The generated video of this sample will be saved to [outputs/Einstein_hubert_full_control/Einstein-english_male.mp4](../outputs/Einstein_hubert_full_control/Einstein-english_male.mp4).
+- Image of `Einstein.png` is from [GAIA](https://gaiavatar.github.io/gaia/)
+
+
+## Long Story Generation
+
+```
+python ./code/demo.py \
+ --infer_type 'hubert_full_control' \
+ --stage1_checkpoint_path 'ckpts/stage1.ckpt' \
+ --stage2_checkpoint_path 'ckpts/stage2_full_control_hubert.ckpt' \
+ --test_image_path 'test_demos/portraits/cartoon_girl.png' \
+ --test_audio_path 'test_demos/audios/mars.wav' \
+ --test_hubert_path 'test_demos/audios_hubert/mars.npy' \
+ --result_path 'outputs/cartoon_girl_mars_story_hubert_full_control/' \
+ --control_flag \
+ --seed 0 \
+ --pose_yaw 0 \
+ --pose_pitch 0 \
+ --pose_roll 0 \
+ --face_location 0.5 \
+ --face_scale 0.5 \
+ --face_sr
+
+```
+- The generated video of this sample will be saved to [outputs/cartoon_girl_mars_story_hubert_full_control/cartoon_girl-mars.mp4](../outputs/cartoon_girl_mars_story_hubert_full_control/cartoon_girl-mars.mp4).
+
+## Statue
+
+```
+python ./code/demo.py \
+ --infer_type 'hubert_full_control' \
+ --stage1_checkpoint_path 'ckpts/stage1.ckpt' \
+ --stage2_checkpoint_path 'ckpts/stage2_full_control_hubert.ckpt' \
+ --test_image_path 'test_demos/portraits/statue.jpg' \
+ --test_audio_path 'test_demos/audios/statue.wav' \
+ --test_hubert_path 'test_demos/audios_hubert/statue.npy' \
+ --result_path 'outputs/statue_hubert_full_control/' \
+ --control_flag \
+ --seed 0 \
+ --pose_yaw 0 \
+ --pose_pitch 0 \
+ --pose_roll 0 \
+ --face_location 0.5 \
+ --face_scale 0.5 \
+ --face_sr
+
+```
+- The generated video of this sample will be saved to [outputs/statue_hubert_full_control/statue-statue.mp4](../outputs/statue_hubert_full_control/statue-statue.mp4).
+ + + diff --git a/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_pose_only.md b/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_pose_only.md new file mode 100644 index 00000000..c164f6d9 --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/more_hubert_cases_pose_only.md @@ -0,0 +1,98 @@ +## More hubert cases (Pose-controllable Model) + +**Features of this model include:** +- The driving signals require one image plus an audio segment. +- You can also adjust the pose_yaw, pose_pitch, and pose_roll. +- It offers moderate visual stability. +- The expressiveness is better. + +### Storytelling (Chinese) + +``` +python ./code/demo.py \ + --infer_type 'hubert_pose_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_pose_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/sad.jpg' \ + --test_audio_path 'test_demos/audios/lianliru.wav' \ + --test_hubert_path 'test_demos/audios_hubert/lianliru.npy' \ + --result_path 'outputs/lianliru_hubert_with_pose/' \ + --control_flag \ + --seed 0 \ + --pose_yaw 0 \ + --pose_pitch 0 \ + --pose_roll 0 \ + --face_sr +``` + +- The generated video of this sample will be saved to [outputs/lianliru_hubert_with_pose/girl-lianliru.mp4](../outputs/lianliru_hubert_with_pose/girl-lianliru.mp4). +- 'lianliru.wav' is from [StoryTTS](https://github.com/X-LANCE/StoryTTS) dataset. 
+ +## Einstein + +``` +python ./code/demo.py \ + --infer_type 'hubert_pose_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_pose_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/Einstein.png' \ + --test_audio_path 'test_demos/audios/english_male.mp3' \ + --test_hubert_path 'test_demos/audios_hubert/english_male.npy' \ + --result_path 'outputs/Einstein_hubert_pose/' \ + --control_flag \ + --seed 0 \ + --pose_yaw 0 \ + --pose_pitch 0 \ + --pose_roll 0 \ + --face_sr +``` + + +- The generated video of this sample will be saved to [outputs/Einstein_hubert_pose/Einstein-english_male.mp4](../outputs/Einstein_hubert_pose/Einstein-english_male.mp4). +- Image of `Einstein.png` is from [GAIA](https://gaiavatar.github.io/gaia/) + + +## Long Story Generation + +``` +python ./code/demo.py \ + --infer_type 'hubert_pose_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_pose_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/cartoon_girl.png' \ + --test_audio_path 'test_demos/audios/mars.wav' \ + --test_hubert_path 'test_demos/audios_hubert/mars.npy' \ + --result_path 'outputs/cartoon_girl_mars_story_hubert_pose/' \ + --control_flag \ + --seed 0 \ + --pose_yaw 0 \ + --pose_pitch 0 \ + --pose_roll 0 \ + --face_sr + +``` +- The generated video of this sample will be saved to [outputs/cartoon_girl_mars_story_hubert_pose/cartoon_girl-mars.mp4](../outputs/cartoon_girl_mars_story_hubert_pose/cartoon_girl-mars.mp4). 
+ +## Statue + +``` +python ./code/demo.py \ + --infer_type 'hubert_pose_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_pose_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/statue.jpg' \ + --test_audio_path 'test_demos/audios/statue.wav' \ + --test_hubert_path 'test_demos/audios_hubert/statue.npy' \ + --result_path 'outputs/statue_hubert_pose/' \ + --control_flag \ + --seed 0 \ + --pose_yaw 0 \ + --pose_pitch 0 \ + --pose_roll 0 \ + --face_sr + +``` +- The generated video of this sample will be saved to [outputs/statue_hubert_pose/statue-statue.mp4](../outputs/statue_hubert_pose/statue-statue.mp4). + + + diff --git a/AniTalker-kit/AniTalker/md_docs/more_mfcc_cases.md b/AniTalker-kit/AniTalker/md_docs/more_mfcc_cases.md new file mode 100644 index 00000000..4180e5cb --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/more_mfcc_cases.md @@ -0,0 +1,88 @@ +## More MFCC cases + +### Storytelling (Chinese) + +``` +python ./code/demo.py \ + --infer_type 'mfcc_pose_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_pose_only_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/sad.jpg' \ + --test_audio_path 'test_demos/audios/lianliru.wav' \ + --test_hubert_path 'test_demos/audios_hubert/lianliru.npy' \ + --result_path 'outputs/lianliru_mfcc/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw -0.0427 \ + --pose_pitch -0.0536 \ + --pose_roll 0.0434 +``` + +- The generated video of this sample will be saved to [outputs/lianliru_mfcc/sad-lianliru.mp4](../outputs/lianliru_mfcc/sad-lianliru.mp4). +- 'lianliru.wav' is from [StoryTTS](https://github.com/X-LANCE/StoryTTS) dataset. 
+ +## Einstein + +``` +python ./code/demo.py \ + --infer_type 'mfcc_pose_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_pose_only_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/Einstein.png' \ + --test_audio_path 'test_demos/audios/english_male.mp3' \ + --test_hubert_path 'test_demos/audios_hubert/english_male.npy' \ + --result_path 'outputs/Einstein_mfcc/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw 0.0277 \ + --pose_pitch 0.0252 \ + --pose_roll 0.0308 +``` + + +- The generated video of this sample will be saved to [outputs/Einstein_mfcc/Einstein-english_male.mp4](../outputs/Einstein_mfcc/Einstein-english_male.mp4). +- Image of `Einstein.png` is from [GAIA](https://gaiavatar.github.io/gaia/) +- There is a bad case occurring in the middle of the generated video; this may be caused by the lack of robustness of MFCC. + + +## Long Story Generation + +``` +python ./code/demo.py \ + --infer_type 'mfcc_pose_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_pose_only_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/cartoon_girl.png' \ + --test_audio_path 'test_demos/audios/mars.wav' \ + --test_hubert_path 'test_demos/audios_hubert/mars.npy' \ + --result_path 'outputs/cartoon_girl_mars_story_mfcc/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw 0.0302 \ + --pose_pitch 0.164 \ + --pose_roll 0.0415 + + +``` +- The generated video of this sample will be saved to [outputs/cartoon_girl_mars_story_mfcc/cartoon_girl-mars.mp4](../outputs/cartoon_girl_mars_story_mfcc/cartoon_girl-mars.mp4). 
+ +## Statue + +``` +python ./code/demo.py \ + --infer_type 'mfcc_pose_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_pose_only_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/statue.jpg' \ + --test_audio_path 'test_demos/audios/statue.wav' \ + --test_hubert_path 'test_demos/audios_hubert/statue.npy' \ + --result_path 'outputs/statue_mfcc/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw -0.0363 \ + --pose_pitch 0.0123 \ + --pose_roll -0.0031 + +``` +- The generated video of this sample will be saved to [outputs/statue_mfcc/statue-statue.mp4](../outputs/statue_mfcc/statue-statue.mp4). + diff --git a/AniTalker-kit/AniTalker/md_docs/more_mfcc_cases_controllable.md b/AniTalker-kit/AniTalker/md_docs/more_mfcc_cases_controllable.md new file mode 100644 index 00000000..019fb85d --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/more_mfcc_cases_controllable.md @@ -0,0 +1,98 @@ +## More MFCC cases + +**Features of this model include:** +- The driving signals require one image plus an audio segment. +- You can also adjust pose_yaw, pose_pitch, pose_roll, face_location, and face_scale. +- Overall performance is not as good as Hubert. + + +### Storytelling (Chinese) + +``` +python ./code/demo.py \ + --infer_type 'mfcc_full_control' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/sad.jpg' \ + --test_audio_path 'test_demos/audios/lianliru.wav' \ + --test_hubert_path 'test_demos/audios_hubert/lianliru.npy' \ + --result_path 'outputs/lianliru_mfcc_full_control/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw -0.0427 \ + --pose_pitch -0.0536 \ + --pose_roll 0.0434 \ + --face_location 0.6 \ + --face_scale 0.4 +``` + +- The generated video of this sample will be saved to [outputs/lianliru_mfcc/sad-lianliru.mp4](../outputs/lianliru_mfcc/sad-lianliru.mp4). 
+- 'lianliru.wav' is from [StoryTTS](https://github.com/X-LANCE/StoryTTS) dataset.
+
+## Einstein
+
+```
+python ./code/demo.py \
+ --infer_type 'mfcc_full_control' \
+ --stage1_checkpoint_path 'ckpts/stage1.ckpt' \
+ --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \
+ --test_image_path 'test_demos/portraits/Einstein.png' \
+ --test_audio_path 'test_demos/audios/english_male.mp3' \
+ --test_hubert_path 'test_demos/audios_hubert/english_male.npy' \
+ --result_path 'outputs/Einstein_mfcc_full_control/' \
+ --control_flag True \
+ --seed 0 \
+ --pose_yaw 0.0277 \
+ --pose_pitch 0.0252 \
+ --pose_roll 0.0308 \
+ --face_location 0.6 \
+ --face_scale 0.4
+```
+
+
+- The generated video of this sample will be saved to [outputs/Einstein_mfcc_full_control/Einstein-english_male.mp4](../outputs/Einstein_mfcc_full_control/Einstein-english_male.mp4).
+- Image of `Einstein.png` is from [GAIA](https://gaiavatar.github.io/gaia/)
+- There is a bad case occurring in the middle of the generated video; this may be caused by the lack of robustness of MFCC.
+
+
+## Long Story Generation
+
+```
+python ./code/demo.py \
+ --infer_type 'mfcc_full_control' \
+ --stage1_checkpoint_path 'ckpts/stage1.ckpt' \
+ --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \
+ --test_image_path 'test_demos/portraits/cartoon_girl.png' \
+ --test_audio_path 'test_demos/audios/mars.wav' \
+ --test_hubert_path 'test_demos/audios_hubert/mars.npy' \
+ --result_path 'outputs/cartoon_girl_mars_story_full_control/' \
+ --control_flag True \
+ --seed 0 \
+ --pose_yaw 0.0302 \
+ --pose_pitch 0.164 \
+ --pose_roll 0.0415
+
+
+```
+- The generated video of this sample will be saved to [outputs/cartoon_girl_mars_story_full_control/cartoon_girl-mars.mp4](../outputs/cartoon_girl_mars_story_full_control/cartoon_girl-mars.mp4).
+
+## Statue
+
+```
+python ./code/demo.py \
+ --infer_type 'mfcc_full_control' \
+ --stage1_checkpoint_path 'ckpts/stage1.ckpt' \
+ --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \
+ --test_image_path 'test_demos/portraits/statue.jpg' \
+ --test_audio_path 'test_demos/audios/statue.wav' \
+ --test_hubert_path 'test_demos/audios_hubert/statue.npy' \
+ --result_path 'outputs/statue_mfcc_full_control/' \
+ --control_flag True \
+ --seed 0 \
+ --pose_yaw -0.0363 \
+ --pose_pitch 0.0123 \
+ --pose_roll -0.0031
+
+```
+- The generated video of this sample will be saved to [outputs/statue_mfcc_full_control/statue-statue.mp4](../outputs/statue_mfcc_full_control/statue-statue.mp4).
+
diff --git a/AniTalker-kit/AniTalker/md_docs/more_mfcc_controlable_cases.md b/AniTalker-kit/AniTalker/md_docs/more_mfcc_controlable_cases.md new file mode 100644 index 00000000..0e04e94e --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/more_mfcc_controlable_cases.md @@ -0,0 +1,126 @@ +## More controllable scripts (under MFCC)
+
+**Features of this model include:**
+- The driving signals require one image plus an audio segment.
+- You can also adjust pose_yaw, pose_pitch, pose_roll, face_location, and face_scale.
+- Overall performance is not as good as Hubert.
+
+
+![monalisa_free_style](assets/monalisa_more_control.gif)
+
+## Neural Face for Comparison
+
+```
+python ./code/demo.py \
+ --infer_type 'mfcc_full_control' \
+ --stage1_checkpoint_path 'ckpts/stage1.ckpt' \
+ --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \
+ --test_image_path 'test_demos/portraits/monalisa.jpg' \
+ --test_audio_path 'test_demos/audios/english_female.wav' \
+ --result_path 'outputs/monalisa_case4/' \
+ --control_flag True \
+ --seed 0 \
+ --pose_yaw 0.1 \
+ --pose_pitch 0 \
+ --pose_roll 0 \
+ --face_location 0.5 \
+ --face_scale 0.5
+```
+
+- The generated video of this sample will be saved to [outputs/monalisa_case4/monalisa-english_female.mp4](../outputs/monalisa_case4/monalisa-english_female.mp4).
+ +### Adjust head location to the left + +``` +python ./code/demo.py \ + --infer_type 'mfcc_full_control' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/monalisa.jpg' \ + --test_audio_path 'test_demos/audios/english_female.wav' \ + --result_path 'outputs/monalisa_case5/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw 0.1 \ + --pose_pitch 0 \ + --pose_roll 0 \ + --face_location 0.45 \ + --face_scale 0.5 +``` + +- The generated video of this sample will be saved to [outputs/monalisa_case5/monalisa-english_female.mp4](../outputs/monalisa_case5/monalisa-english_female.mp4). + + +### Adjust head location to the right + +``` +python ./code/demo.py \ + --infer_type 'mfcc_full_control' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/monalisa.jpg' \ + --test_audio_path 'test_demos/audios/english_female.wav' \ + --result_path 'outputs/monalisa_case6/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw 0.1 \ + --pose_pitch 0 \ + --pose_roll 0 \ + --face_location 0.55 \ + --face_scale 0.5 +``` + +- The generated video of this sample will be saved to [outputs/monalisa_case6/monalisa-english_female.mp4](../outputs/monalisa_case6/monalisa-english_female.mp4). 
+ + +### Adjust head to larger scale + +``` +python ./code/demo.py \ + --infer_type 'mfcc_full_control' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/monalisa.jpg' \ + --test_audio_path 'test_demos/audios/english_female.wav' \ + --result_path 'outputs/monalisa_case7/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw 0.1 \ + --pose_pitch 0 \ + --pose_roll 0 \ + --face_location 0.5 \ + --face_scale 0.55 +``` + +- The generated video of this sample will be saved to [outputs/monalisa_case7/monalisa-english_female.mp4](../outputs/monalisa_case7/monalisa-english_female.mp4). + + + +### Adjust head to smaller scale + +``` +python ./code/demo.py \ + --infer_type 'mfcc_full_control' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_more_controllable_mfcc.ckpt' \ + --test_image_path 'test_demos/portraits/monalisa.jpg' \ + --test_audio_path 'test_demos/audios/english_female.wav' \ + --result_path 'outputs/monalisa_case8/' \ + --control_flag True \ + --seed 0 \ + --pose_yaw 0.1 \ + --pose_pitch 0 \ + --pose_roll 0 \ + --face_location 0.5 \ + --face_scale 0.40 +``` + +- The generated video of this sample will be saved to [outputs/monalisa_case8/monalisa-english_female.mp4](../outputs/monalisa_case8/monalisa-english_female.mp4). + + +**Explanation:** + +- Regarding face location and face scale, please be aware that only minor adjustments can be made. Broad adjustments may impact other facial movements, such as the movements of the lips. This limitation is primarily because during data processing, we ensured that the face is centered and scaled to a certain proportion as much as possible, as detailed in the [facial cropping code](https://github.com/liutaocode/talking_face_preprocessing?tab=readme-ov-file#facial-part-cropping). Consequently, the network has limited capability for extensive adjustments in terms of angle. 
+ +- If you require significant changes in these attributes, you may consider reprocessing the training data. For instance, you could allow a wider range of movement and adjust the camera distance to a larger extent. By doing so, the network will be exposed to more diverse data during training, enabling it to handle more substantial variations in face location and scale. + diff --git a/AniTalker-kit/AniTalker/md_docs/overall_pipeline.md b/AniTalker-kit/AniTalker/md_docs/overall_pipeline.md new file mode 100644 index 00000000..ea4f423c --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/overall_pipeline.md @@ -0,0 +1,98 @@ + +# Main Pipeline + +## Extract the Head Region + +### Prepare Your Image + +Using the image at [this URL](https://civitai.com/images/1022064) as an example: + + + +### Crop Out the Face Part + +Ensure that the head is centered in the image, not too large or too small, as shown in the following image: + + + +### Prepare Your Audio + +#### Generate or Prepare an Audio File +Use a TTS tool to generate or prepare your own audio file. + +We recommend using [edge-tts](https://github.com/rany2/edge-tts) with the `en-US-AriaNeural` voice, as it pairs well with our model. Here is an example script: +``` +edge-tts --voice en-US-AriaNeural --text "In the mosaic of life, each moment weaves itself into a grand tapestry, one that captures not just our highest peaks but also our lowest valleys. This intricate interplay of experiences shapes our wisdom and resilience. As we journey through life, we realize that our true wealth lies not in material possessions but in the richness of our relationships and the depth of our reflections. Life, then, is less about what we accumulate and more about what we discover within ourselves and share with others." 
--write-media /path/to/your/audio/path/audio_name.wav --write-subtitles /path/to/your/audio/path/audio_name.vtt +``` + +### Run the inference + + + + +[Source](https://civitai.com/images/6525430) + +``` +python ./code/demo.py \ + --infer_type 'hubert_full_control' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_full_control_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/aiface3.png' \ + --test_audio_path 'test_demos/audios/speech4.wav' \ + --test_hubert_path 'test_demos/audios_hubert/speech4.npy' \ + --result_path 'outputs/pipeline_samples/' \ + --face_sr +``` + + +[1. Result (with face super-resolution)](../outputs/pipeline_samples/aiface3-speech4_SR.mp4) + +[2. Result (without face super-resolution)](../outputs/pipeline_samples/aiface3-speech4.mp4) + + +### Other samples 1 + + + + +``` +python ./code/demo.py \ + --infer_type 'hubert_full_control' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_full_control_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/aiface4.png' \ + --test_audio_path 'test_demos/audios/speech4.wav' \ + --test_hubert_path 'test_demos/audios_hubert/speech4.npy' \ + --result_path 'outputs/pipeline_samples/' \ + --face_sr +``` + +[1. Result (with face super-resolution)](../outputs/pipeline_samples/aiface4-speech4_SR.mp4) + +[2. Result (without face super-resolution)](../outputs/pipeline_samples/aiface4-speech4.mp4) + + + + +### Other samples 2 + + + +[Source](https://civitai.com/images/1520359) + +``` +python ./code/demo.py \ + --infer_type 'hubert_full_control' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_full_control_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/aiface1.png' \ + --test_audio_path 'test_demos/audios/speech4.wav' \ + --test_hubert_path 'test_demos/audios_hubert/speech4.npy' \ + --result_path 'outputs/pipeline_samples/' \ + --face_sr +``` + + +[1. 
Result (with face super-resolution)](../outputs/pipeline_samples/aiface1-speech4_SR.mp4) + +[2. Result (without face super-resolution)](../outputs/pipeline_samples/aiface1-speech4.mp4) diff --git a/AniTalker-kit/AniTalker/md_docs/run_on_macOS.md b/AniTalker-kit/AniTalker/md_docs/run_on_macOS.md new file mode 100644 index 00000000..2ce7a6bf --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/run_on_macOS.md @@ -0,0 +1,96 @@ +# AniTalker macOS Setup Guide + +## Tested Environments + +We have successfully tested the environment on the following devices: + +- MacBook Pro M3 Max (128GB RAM, 8TB Storage), macOS Sonoma 14.6.1 +- MacBook Pro M1 Pro (16GB RAM, 2TB Storage), macOS Sonoma 14.5 + +**Note:** We currently lack access to an Intel-based Mac. If you have one available, we warmly welcome your contributions regarding the testing environment and results. + +## 1. Project Download + +Clone the repository: + +```bash +git clone https://github.com/X-LANCE/AniTalker.git +``` + +## 2. Dependencies Installation + +Follow these steps to set up your environment: + +```bash +# Create and activate a Conda environment +conda create -n anitalker python=3.9.0 -c conda-forge +conda activate anitalker + +# Install PyTorch +conda install pytorch torchvision torchaudio -c pytorch + +# Install ESPnet +git clone https://github.com/espnet/espnet.git +cd espnet +git checkout b10464 +pip install -e . + +# Install additional dependencies +conda install -c conda-forge pytorch-lightning=1.6.5 torchmetrics=0.5.0 transformers=4.19.2 moviepy numpy tokenizers scipy tqdm libffi +pip install python_speech_features + +# [Optional] Install Rust if you encounter warnings +# curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +``` + +If you encounter issues during installation, please refer to our detailed environment files: +- [Conda environment file](../md_docs/mac_os_env_list/conda_environment.yml) +- [Pip requirements file](../md_docs/mac_os_env_list/pip_requirements.txt) + +## 3. 
Model Download + +Please follow the instructions in the `README.md` file to download all required models, including the HuBERT model. + +## 4. Running the Demo + +Execute the following command to run the demo: + +``` + PYTORCH_ENABLE_MPS_FALLBACK=1 python ./code/demo.py \ + --infer_type 'hubert_audio_only' \ + --stage1_checkpoint_path 'ckpts/stage1.ckpt' \ + --stage2_checkpoint_path 'ckpts/stage2_audio_only_hubert.ckpt' \ + --test_image_path 'test_demos/portraits/monalisa.jpg' \ + --test_audio_path 'test_demos/audios/monalisa.wav' \ + --test_hubert_path 'test_demos/audios_hubert/monalisa.npy' \ + --result_path 'outputs/monalisa_hubert/' +``` + +### Results + +- MacBook Pro M3 Max (128GB RAM, 8TB Storage), macOS Sonoma 14.6.1: + ![Results on M3 Max](../assets/results_run_on_macOS_m3.png) + +- MacBook Pro M1 Pro (16GB RAM, 2TB Storage), macOS Sonoma 14.5: + ![Results on M1 Pro](../assets/results_run_on_macOS_m1.jpg) + +## 5. Notable Modifications + +To ensure compatibility with macOS, we've made the following adjustments: + +1. Updated dependencies in `requirements.txt` +2. Utilized MPS (Metal Performance Shaders) instead of CUDA for GPU acceleration +3. Changed data types from `float64` to `float32` for better performance +4. Added `PYTORCH_ENABLE_MPS_FALLBACK=1` environment variable to enable fallback to CPU when necessary + + +- Macbook pro M1 Pro (16GB + 2TB), Sonoma 14.5: + +![](../assets/results_run_on_macOS_m1.jpg) + +# 5. 
Modify log + +- dependencies: requirements.txt +- use mps insted of cuda +- change float64 to float32 +- PYTORCH_ENABLE_MPS_FALLBACK=1 diff --git a/AniTalker-kit/AniTalker/md_docs/run_on_windows.md b/AniTalker-kit/AniTalker/md_docs/run_on_windows.md new file mode 100644 index 00000000..11c8adce --- /dev/null +++ b/AniTalker-kit/AniTalker/md_docs/run_on_windows.md @@ -0,0 +1,49 @@ + +## Windows Installation + +Step 1: Clone repository +``` +git clone https://github.com/X-LANCE/AniTalker/ +``` + +Step 2: Navigate inside cloned repository + +``` +cd AniTalker +``` + +Step 3: Create virtual environment using conda +``` +conda create -n anitalker python==3.9.0 +``` + +Step 4: Activate virtual environment +``` +conda activate anitalker +``` + +Step 5: Install dependencies +``` +pip install -r requirements_windows.txt +``` + +Step 6: Download checkpoints +``` +git lfs install +``` +``` +git clone https://huggingface.co/taocode/anitalker_ckpts ckpts +``` + +Step 7: Download additional files for auto-cropping on source image +``` +wget -O code/data_preprocess/shape_predictor_68_face_landmarks.dat https://github.com/italojs/facial-landmarks-recognition/raw/master/shape_predictor_68_face_landmarks.dat + +wget -O code/data_preprocess/M003_template.npy https://raw.githubusercontent.com/tanshuai0219/EDTalk/main/data_preprocess/M003_template.npy + +``` + +Step 8: Launch WebUI +``` +python code/webgui.py +``` \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/outputs/Einstein_hubert/Einstein-english_male.mp4 b/AniTalker-kit/AniTalker/outputs/Einstein_hubert/Einstein-english_male.mp4 new file mode 100644 index 00000000..55879ff2 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/Einstein_hubert/Einstein-english_male.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/Einstein_hubert_full_control/Einstein-english_male.mp4 b/AniTalker-kit/AniTalker/outputs/Einstein_hubert_full_control/Einstein-english_male.mp4 new file mode 100644 index 00000000..ac6ef06a 
Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/Einstein_hubert_full_control/Einstein-english_male.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/Einstein_hubert_full_control/Einstein-english_male_SR.mp4 b/AniTalker-kit/AniTalker/outputs/Einstein_hubert_full_control/Einstein-english_male_SR.mp4 new file mode 100644 index 00000000..1cb8a159 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/Einstein_hubert_full_control/Einstein-english_male_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/Einstein_hubert_pose/Einstein-english_male.mp4 b/AniTalker-kit/AniTalker/outputs/Einstein_hubert_pose/Einstein-english_male.mp4 new file mode 100644 index 00000000..9cdd9c53 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/Einstein_hubert_pose/Einstein-english_male.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/Einstein_hubert_pose/Einstein-english_male_SR.mp4 b/AniTalker-kit/AniTalker/outputs/Einstein_hubert_pose/Einstein-english_male_SR.mp4 new file mode 100644 index 00000000..334d5d7b Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/Einstein_hubert_pose/Einstein-english_male_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/aiface1-english_male.mp4 b/AniTalker-kit/AniTalker/outputs/aiface1-english_male.mp4 new file mode 100644 index 00000000..11be1369 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/aiface1-english_male.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-AniTalker_intro_audio.mp4 b/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-AniTalker_intro_audio.mp4 new file mode 100644 index 00000000..349bbef1 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-AniTalker_intro_audio.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-AniTalker_intro_audio_SR.mp4 b/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-AniTalker_intro_audio_SR.mp4 new file mode 100644 index 
00000000..3b98bb1b Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-AniTalker_intro_audio_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-english_female.mp4 b/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-english_female.mp4 new file mode 100644 index 00000000..0034ae99 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-english_female.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-english_female_SR.mp4 b/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-english_female_SR.mp4 new file mode 100644 index 00000000..029a7481 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/aiface2_hubert/aiface2-english_female_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert/cartoon_girl-mars.mp4 b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert/cartoon_girl-mars.mp4 new file mode 100644 index 00000000..7360c341 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert/cartoon_girl-mars.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_full_control/cartoon_girl-mars.mp4 b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_full_control/cartoon_girl-mars.mp4 new file mode 100644 index 00000000..3f15c2b4 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_full_control/cartoon_girl-mars.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_full_control/cartoon_girl-mars_SR.mp4 b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_full_control/cartoon_girl-mars_SR.mp4 new file mode 100644 index 00000000..b4f8aa6f Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_full_control/cartoon_girl-mars_SR.mp4 differ diff --git 
a/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_pose/cartoon_girl-mars.mp4 b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_pose/cartoon_girl-mars.mp4 new file mode 100644 index 00000000..c71b1a0e Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_pose/cartoon_girl-mars.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_pose/cartoon_girl-mars_SR.mp4 b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_pose/cartoon_girl-mars_SR.mp4 new file mode 100644 index 00000000..380c25ac Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/cartoon_girl_mars_story_hubert_pose/cartoon_girl-mars_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/lianliru_hubert/girl-lianliru.mp4 b/AniTalker-kit/AniTalker/outputs/lianliru_hubert/girl-lianliru.mp4 new file mode 100644 index 00000000..a5dcf586 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/lianliru_hubert/girl-lianliru.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/lianliru_hubert_full_control/sad-lianliru.mp4 b/AniTalker-kit/AniTalker/outputs/lianliru_hubert_full_control/sad-lianliru.mp4 new file mode 100644 index 00000000..a11a080a Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/lianliru_hubert_full_control/sad-lianliru.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/lianliru_hubert_full_control/sad-lianliru_SR.mp4 b/AniTalker-kit/AniTalker/outputs/lianliru_hubert_full_control/sad-lianliru_SR.mp4 new file mode 100644 index 00000000..489e5c53 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/lianliru_hubert_full_control/sad-lianliru_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/lianliru_hubert_with_pose/sad-lianliru.mp4 b/AniTalker-kit/AniTalker/outputs/lianliru_hubert_with_pose/sad-lianliru.mp4 new file mode 100644 index 00000000..80551819 Binary files /dev/null and 
b/AniTalker-kit/AniTalker/outputs/lianliru_hubert_with_pose/sad-lianliru.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/lianliru_hubert_with_pose/sad-lianliru_SR.mp4 b/AniTalker-kit/AniTalker/outputs/lianliru_hubert_with_pose/sad-lianliru_SR.mp4 new file mode 100644 index 00000000..7d58fd15 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/lianliru_hubert_with_pose/sad-lianliru_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/monalisa-statue.mp4 b/AniTalker-kit/AniTalker/outputs/monalisa-statue.mp4 new file mode 100644 index 00000000..0dc2cdd6 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/monalisa-statue.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/monalisa_hubert/monalisa-monalisa.mp4 b/AniTalker-kit/AniTalker/outputs/monalisa_hubert/monalisa-monalisa.mp4 new file mode 100644 index 00000000..e9a4959e Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/monalisa_hubert/monalisa-monalisa.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/monalisa_hubert/monalisa-monalisa_SR.mp4 b/AniTalker-kit/AniTalker/outputs/monalisa_hubert/monalisa-monalisa_SR.mp4 new file mode 100644 index 00000000..77747e1b Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/monalisa_hubert/monalisa-monalisa_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/monalisa_mfcc/monalisa-monalisa.mp4 b/AniTalker-kit/AniTalker/outputs/monalisa_mfcc/monalisa-monalisa.mp4 new file mode 100644 index 00000000..80c995c1 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/monalisa_mfcc/monalisa-monalisa.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface1-speech4.mp4 b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface1-speech4.mp4 new file mode 100644 index 00000000..c1292d9d Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface1-speech4.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface1-speech4_SR.mp4 
b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface1-speech4_SR.mp4 new file mode 100644 index 00000000..50c11d2c Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface1-speech4_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface3-speech4.mp4 b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface3-speech4.mp4 new file mode 100644 index 00000000..af3ba0ea Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface3-speech4.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface3-speech4_SR.mp4 b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface3-speech4_SR.mp4 new file mode 100644 index 00000000..361d0a2e Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface3-speech4_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface4-speech4.mp4 b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface4-speech4.mp4 new file mode 100644 index 00000000..adf99acb Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface4-speech4.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface4-speech4_SR.mp4 b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface4-speech4_SR.mp4 new file mode 100644 index 00000000..6d772679 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/pipeline_samples/aiface4-speech4_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/statue_hubert_full_control/statue-statue.mp4 b/AniTalker-kit/AniTalker/outputs/statue_hubert_full_control/statue-statue.mp4 new file mode 100644 index 00000000..ec41539f Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/statue_hubert_full_control/statue-statue.mp4 differ diff --git a/AniTalker-kit/AniTalker/outputs/statue_hubert_full_control/statue-statue_SR.mp4 b/AniTalker-kit/AniTalker/outputs/statue_hubert_full_control/statue-statue_SR.mp4 new file mode 100644 index 
00000000..6a9f6ce5 Binary files /dev/null and b/AniTalker-kit/AniTalker/outputs/statue_hubert_full_control/statue-statue_SR.mp4 differ diff --git a/AniTalker-kit/AniTalker/parsing_parsenet.pth b/AniTalker-kit/AniTalker/parsing_parsenet.pth new file mode 100644 index 00000000..a786d164 Binary files /dev/null and b/AniTalker-kit/AniTalker/parsing_parsenet.pth differ diff --git a/AniTalker-kit/AniTalker/requirements.txt b/AniTalker-kit/AniTalker/requirements.txt new file mode 100644 index 00000000..990ee1a1 --- /dev/null +++ b/AniTalker-kit/AniTalker/requirements.txt @@ -0,0 +1,15 @@ +torchmetrics==0.5.0 +torch==2.0.1 +torchvision==0.15.2 +torchaudio==2.0.2 +pytorch-lightning==1.6.5 +scipy==1.5.4 +numpy==1.19.5 +tqdm==4.66.4 +espnet==202301 +moviepy==1.0.3 +python_speech_features +transformers==4.19.2 +gradio +gfpgan +dlib==19.24.5 \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/requirements_windows.txt b/AniTalker-kit/AniTalker/requirements_windows.txt new file mode 100644 index 00000000..51d3bc88 --- /dev/null +++ b/AniTalker-kit/AniTalker/requirements_windows.txt @@ -0,0 +1,19 @@ +--extra-index-url https://download.pytorch.org/whl/cu116 +pytorch-lightning==1.6.5 +torchmetrics==0.5.0 +torch==1.13.1+cu116 +torchvision==0.14.1+cu116 +torchaudio==0.13.1+cu116 +scipy==1.5.4 +numpy==1.23.5 +pillow==9.0.0 +tqdm==4.66.4 +espnet==202301 +moviepy==1.0.3 +python_speech_features +transformers==4.19.2 +facexlib +tb-nightly --index-url https://mirrors.aliyun.com/pypi/simple +gfpgan +gradio +dlib==19.24.5 \ No newline at end of file diff --git a/AniTalker-kit/AniTalker/run_main.txt b/AniTalker-kit/AniTalker/run_main.txt new file mode 100644 index 00000000..2291b22e --- /dev/null +++ b/AniTalker-kit/AniTalker/run_main.txt @@ -0,0 +1,21 @@ +首先拉取镜像: +docker pull gre123/anitalkermodel:v1 + +接着运行docker run命令: +docker run --rm –gpus all \ +-v your_image_path:/app/image_path \ +-v your_audio_path:/app/audio_path \ +-v your_output_dir_path:/app/output_dir_path \ 
+gre123/anitalkermodel:v1 \ +--input_image /app/image_path \ +--input_audio_text /app/audio_path \ +--output_dir /app/output_dir_path +[--face_sr](没有该参数时生成256*256的视频,加上该参数后生成512*512的视频) +[--device your_device](默认值为cuda:0) + +其中,your_image_path、your_audio_path为文件的绝对路径,your_output_dir_path为文件夹的绝对路径。 + +例: + +docker run --rm --gpus all --memory="64g" --cpus="4" -v F:/AniTalker/AniTalker-main/test_demos/portraits/May.png:/app/May.png -v F:/AniTalker/AniTalker-main/test_demos/audios/short_May.wav:/app/short_May.wav -v F:/AniTalker/AniTalker-main/run_output:/app/run_output gre123/anitalkermodel:v1 --input_image /app/May.png --input_audio_text /app/short_May.wav --output_dir /app/run_output + diff --git a/AniTalker-kit/AniTalker/run_output/May-short_May.mp4 b/AniTalker-kit/AniTalker/run_output/May-short_May.mp4 new file mode 100644 index 00000000..51fa156b Binary files /dev/null and b/AniTalker-kit/AniTalker/run_output/May-short_May.mp4 differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/AniTalker_intro_audio.wav b/AniTalker-kit/AniTalker/test_demos/audios/AniTalker_intro_audio.wav new file mode 100644 index 00000000..6e6d6b98 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/audios/AniTalker_intro_audio.wav differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/english_female.wav b/AniTalker-kit/AniTalker/test_demos/audios/english_female.wav new file mode 100644 index 00000000..d13c4b75 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/audios/english_female.wav differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/english_male.mp3 b/AniTalker-kit/AniTalker/test_demos/audios/english_male.mp3 new file mode 100644 index 00000000..db37e744 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/audios/english_male.mp3 differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/lianliru.wav b/AniTalker-kit/AniTalker/test_demos/audios/lianliru.wav new file mode 100644 index 00000000..ae670cc9 Binary files /dev/null 
and b/AniTalker-kit/AniTalker/test_demos/audios/lianliru.wav differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/mars.wav b/AniTalker-kit/AniTalker/test_demos/audios/mars.wav new file mode 100644 index 00000000..50b5e565 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/audios/mars.wav differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/monalisa.wav b/AniTalker-kit/AniTalker/test_demos/audios/monalisa.wav new file mode 100644 index 00000000..dbd52f7f Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/audios/monalisa.wav differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/short_May.wav b/AniTalker-kit/AniTalker/test_demos/audios/short_May.wav new file mode 100644 index 00000000..0ec1f156 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/audios/short_May.wav differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/speech4.wav b/AniTalker-kit/AniTalker/test_demos/audios/speech4.wav new file mode 100644 index 00000000..ec6a7da6 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/audios/speech4.wav differ diff --git a/AniTalker-kit/AniTalker/test_demos/audios/statue.wav b/AniTalker-kit/AniTalker/test_demos/audios/statue.wav new file mode 100644 index 00000000..129784ab Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/audios/statue.wav differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/Einstein.png b/AniTalker-kit/AniTalker/test_demos/portraits/Einstein.png new file mode 100644 index 00000000..ec661474 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/Einstein.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/May.png b/AniTalker-kit/AniTalker/test_demos/portraits/May.png new file mode 100644 index 00000000..3449aa32 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/May.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/aiface1.png 
b/AniTalker-kit/AniTalker/test_demos/portraits/aiface1.png new file mode 100644 index 00000000..19c1f1c7 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/aiface1.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/aiface2.png b/AniTalker-kit/AniTalker/test_demos/portraits/aiface2.png new file mode 100644 index 00000000..dec4d69e Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/aiface2.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/aiface3.png b/AniTalker-kit/AniTalker/test_demos/portraits/aiface3.png new file mode 100644 index 00000000..73c0ca5f Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/aiface3.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/aiface4.png b/AniTalker-kit/AniTalker/test_demos/portraits/aiface4.png new file mode 100644 index 00000000..a4970705 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/aiface4.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/aiface6.png b/AniTalker-kit/AniTalker/test_demos/portraits/aiface6.png new file mode 100644 index 00000000..176a4c30 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/aiface6.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/aiface7.png b/AniTalker-kit/AniTalker/test_demos/portraits/aiface7.png new file mode 100644 index 00000000..9996c74d Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/aiface7.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/aiface8.png b/AniTalker-kit/AniTalker/test_demos/portraits/aiface8.png new file mode 100644 index 00000000..39dff750 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/aiface8.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/cartoon_girl.png b/AniTalker-kit/AniTalker/test_demos/portraits/cartoon_girl.png new file mode 100644 index 00000000..c7827d75 Binary 
files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/cartoon_girl.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/girl.png b/AniTalker-kit/AniTalker/test_demos/portraits/girl.png new file mode 100644 index 00000000..d0b61f51 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/girl.png differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/monalisa.jpg b/AniTalker-kit/AniTalker/test_demos/portraits/monalisa.jpg new file mode 100644 index 00000000..efd4f1f9 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/monalisa.jpg differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/sad.jpg b/AniTalker-kit/AniTalker/test_demos/portraits/sad.jpg new file mode 100644 index 00000000..0fa52f7a Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/sad.jpg differ diff --git a/AniTalker-kit/AniTalker/test_demos/portraits/statue.jpg b/AniTalker-kit/AniTalker/test_demos/portraits/statue.jpg new file mode 100644 index 00000000..0be9e950 Binary files /dev/null and b/AniTalker-kit/AniTalker/test_demos/portraits/statue.jpg differ diff --git a/AniTalker-kit/README.md b/AniTalker-kit/README.md new file mode 100644 index 00000000..d05ae99a --- /dev/null +++ b/AniTalker-kit/README.md @@ -0,0 +1,124 @@ +# talkingface-kit + +本项目基于[AniTalker](https://github.com/X-LANCE/AniTalker),[SyncNet](https://github.com/joonson/syncnet_python/tree/master) + +项目包括三个部分:AniTalker、Judge和Syncnet + + +## AniTalker + +项目通过身份解耦面部运动编码,实现音频、图片生成视频。 + +在原项目的基础上,进行了下列修改: + +1. 修改code/demo.py代码,改变了接口的名称,保存为code/demo_final.py代码。 +2. 在test_demo/portraits/中添加了May.png,在test_demo/audios/中添加了short_May.wav,用于测试封装后的镜像。 +3. 新增了run_output文件夹,该文件夹为运行封装后的镜像时挂载的输出文件夹,其中的视频文件是通过运行封装后的镜像生成的视频。 +4. 新增了Dockerfile,用于构建docker镜像。 +5. 新增了run_main.txt,给出docker镜像的运行示例。 +6. 修改了requirements.txt,修改了torch、torchvision、torchaudio的版本,以适配cuda11.7,增加了gfpgan包,以便在运行时使用 --face_sr参数,生成512*512的视频。 +7. 
新增了parsing_parsenet.pth,将模型参数保存在本地。 + +### Quick Start + +1. 安装docker,宿主机CUDA版本为11.7及以上 +2. 从Dockerhub拉取构建好的镜像 +``` +docker pull gre123/anitalkermodel:v1 +``` +3. 拉取镜像后使用docker命令运行(如果为本地运行加 --gpus all) +``` +docker run --rm –gpus all \ +-v your_image_path:/app/image_path \ +-v your_audio_path:/app/audio_path \ +-v your_output_dir_path:/app/output_dir_path \ +gre123/anitalkermodel:v1 \ +--input_image /app/image_path \ +--input_audio_text /app/audio_path \ +--output_dir /app/output_dir_path +[--face_sr](没有该参数时生成256*256的视频,加上该参数后生成512*512的视频) +[--device your_device](默认值为cuda:0) +``` +其中,your_image_path、your_audio_path为文件的绝对路径,your_output_dir_path为文件夹的绝对路径。 + +例如: +``` +docker run --rm --gpus all --memory="64g" --cpus="4" -v F:/AniTalker/AniTalker-main/test_demos/portraits/May.png:/app/May.png -v F:/AniTalker/AniTalker-main/test_demos/audios/short_May.wav:/app/short_May.wav -v F:/AniTalker/AniTalker-main/run_output:/app/run_output gre123/anitalkermodel:v1 --input_image /app/May.png --input_audio_text /app/short_May.wav --output_dir /app/run_output +``` + +4. 生成视频可在对应文件夹下查看 + +### 更多食用方法 + +[AniTalker](https://github.com/RubyZh/talkingface-kit/blob/main/AniTalker-kit/AniTalker/README.md) + +## Judge + +用于计算生成视频的PSNR、SSIM、FID、NIQE,用于定量评估视频生成效果。 + +在计算PSNR、SSIM、NIQE时,分别计算每一帧的指标结果,再取平均值。 + +在计算FID时,对视频的每一帧利用inception_v3模型提取特征,将一个视频所有帧的特征向量求均值和协方差,进行计算。 + +### Quick Start + +1. 安装docker,宿主机CUDA版本为11.7及以上 +2. 拉取镜像: +``` +docker pull gre123/anitalkerjudge:v1 +``` +3. 
参考以下docker run命令运行: +``` +docker run --rm --gpus all \ +-v your_stand_videos_dir_path:/app/stand_videos_dir_path \ +-v your_generate_videos_dir_path:/app/generate_videos_dir_path \ +gre123/anitalkerjudge:v1 \ +/app/stand_videos_dir_path \ +/app/generate_videos_dir_path +[--device your_device](默认值为cuda:0) +``` +其中,your_stand_videos_dir_path、your_generate_videos_dir_path为文件夹的绝对路径。 +注意:评测程序会对两个文件夹中名称相同的视频进行评测指标的计算,运行该程序需要确保对应的参照视频和生成视频名称相同且时长相同。 + +### 其他食用方法 + +[Judge](https://github.com/RubyZh/talkingface-kit/blob/main/AniTalker-kit/AniTalker-judge/README.md) + +## Syncnet + +用于计算生成视频的LSE-D LSE-C,用于定量评估生成视频效果,无需ground-truth + +在原项目的基础上,进行了下列修改: + +1. 增加了demo.py、Dockerfile,修改了calculate_scores_real_videos.sh,方便构建docker镜像,并在构建完成后使用docker命令直接获取评估结果。 +2. 修改了代码中部分无法兼容numpy较高版本导致的问题(e.g. int32) +3. 修改了requirements.txt,提供了必要的、相对较高版本的依赖。 + +### Quick Start + +1. 安装docker,宿主机CUDA版本为11.7及以上 +2. 从Dockerhub拉取构建好的镜像 +``` +docker pull bellacora/syncnet-image:v4 +``` +3. 拉取镜像后使用docker命令运行(如果为本地运行加 --gpus all) +``` +docker run --rm --gpus all -v path:/app/videos --folderpath /app/videos +``` +其中path部分应当替换为宿主机上视频文件夹(注意是文件夹)的绝对路径,将会评估文件夹内所有视频,并按照文件夹内视频名称的字典顺序输出评估结果 + +### 更多食用方法 + +[Syncnet](https://github.com/RubyZh/talkingface-kit/blob/main/AniTalker-kit/syncnet_python/README.md) + +## Questions + +常见问题及解决方式可参考[Q&A](https://github.com/RubyZh/talkingface-kit/blob/main/AniTalker-kit/docs/Questions.md) + +## Contributors + + + + + +Made with [contrib.rocks](https://contrib.rocks). \ No newline at end of file diff --git a/AniTalker-kit/docs/Questions.md b/AniTalker-kit/docs/Questions.md new file mode 100644 index 00000000..1f5ed73f --- /dev/null +++ b/AniTalker-kit/docs/Questions.md @@ -0,0 +1,71 @@ +# 常见问题及解决方案 + +## AniTalker + +1. 
模型权重下载失败 +如果无法下载模型权重,也能够生成512×512的视频(运行时自动下载),可以删除Dockerfile对应的COPY命令 +``` +COPY detection_Resnet50_Final.pth /app/gfpgan/weights/detection_Resnet50_Final.pth +COPY GFPGANv1.4.pth /usr/local/lib/python3.9/dist-packages/gfpgan/weights/GFPGANv1.4.pth +``` +随后构建镜像 + +2. 如果在本地不使用docker运行,可以打开web窗口进行视频生成。 + +也可以使用命令行: +``` +python demo_final.py \ +--input_image image_path \ +--input_audio_text audio_path \ +--output_dir output_dir_path +[--face_sr](没有该参数时生成256*256的视频,加上该参数后生成512*512的视频) +[--device your_device](默认值为cuda:0) +``` +(上述路径均为绝对路径) + +3. ubuntu镜像无法拉取,可尝试先单独拉取对应镜像 +``` +docker pull nvidia/cuda:11.7.1-runtime-ubuntu22.04 +``` +4. 检查点ckpts下载失败,使用huggingface镜像源进行下载[huggingface](https://hf-mirror.com/taocode/anitalker_ckpts) + +5. 安装 dlib 时,构建其所需的 wheel 失败 + +核心原因是 dlib 的构建需要 CMake,但系统中未安装 CMake,或 CMake 没有正确配置。 + +解决步骤如下: + +(1) 安装 CMake:在系统中安装 CMake,根据操作系统选择合适的安装方法。从CMake Download官方网站中下载对应版本并进行安装即可,安装时注意勾选添加到环境变量。 + +(2) 安装构建工具(Windows):由于使用的是 Windows,dlib 还需要 Microsoft C++ Build Tools。下载 Microsoft C++ 生成工具 - Visual Studio,在安装向导中选择 C++ Desktop Development 工作负载 + +(3) 安装完成后,重新打开终端 + +6. 执行 python code/webgui.py 并选择mp3格式的音频文件时出现报错 + +原因是系统中缺少 ffmpeg,gradio 库在尝试加载非 WAV 格式的音频文件时依赖于 ffmpeg + +解决方法:在官网 [ffmpeg](https://ffmpeg.org) 下载ffmpeg,安装时勾选添加到环境变量,随后重新打开终端 + +## Judge + +1. 模型权重下载失败 + +如果无法下载模型权重,也能够进行计算(运行时自动下载),可以删除Dockerfile对应的COPY命令 +``` +COPY inception_v3_google-0cc3c7bd.pth /app/inception_v3_google-0cc3c7bd.pth +``` +随后构建镜像 + +2. ubuntu镜像无法拉取,可尝试先单独拉取对应镜像 +``` +docker pull nvidia/cuda:11.7.1-runtime-ubuntu22.04 +``` + +## Syncnet + +1. ubuntu镜像无法拉取,可尝试先单独拉取对应镜像 +``` +docker pull nvidia/cuda:11.7.1-runtime-ubuntu22.04 +``` +2. 
本地运行时需要ffmpeg, 在官网 [ffmpeg](https://ffmpeg.org) 下载ffmpeg,安装时勾选添加到环境变量,随后重新打开终端 diff --git a/AniTalker-kit/syncnet_python/.gitignore b/AniTalker-kit/syncnet_python/.gitignore new file mode 100644 index 00000000..350ada00 --- /dev/null +++ b/AniTalker-kit/syncnet_python/.gitignore @@ -0,0 +1,45 @@ +# Compiled source # +################### +*.com +*.class +*.dll +*.exe +*.o +*.so +*.pyc + +# Packages # +############ +# it's better to unpack these files and commit the raw source +# git has its own built in compression methods +*.7z +*.dmg +*.gz +*.iso +*.jar +*.rar +*.tar +*.zip + +# Logs and databases # +###################### +*.log +*.sql +*.sqlite + +# OS generated files # +###################### +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Specific to this demo # +######################### +data/ +protos/ +utils/ +*.pth diff --git a/AniTalker-kit/syncnet_python/Dockerfile b/AniTalker-kit/syncnet_python/Dockerfile new file mode 100644 index 00000000..9bf94773 --- /dev/null +++ b/AniTalker-kit/syncnet_python/Dockerfile @@ -0,0 +1,35 @@ +# 使用 NVIDIA 官方 CUDA 12.3 镜像作为基础镜像 +FROM nvidia/cuda:11.7.1-runtime-ubuntu22.04 + +# 设置工作目录 +WORKDIR /app + +# 设置环境变量以避免交互式安装提示 +ENV DEBIAN_FRONTEND=noninteractive + +# 更新包索引并安装依赖项(包括 wget、git、Python 及其他工具) +RUN apt-get update && apt-get install -y \ + wget \ + git \ + python3 \ + python3-pip \ + python3-dev \ + build-essential \ + cmake \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* + +# 复制项目文件到容器 +COPY . /app + +# 安装 Python 库依赖 +RUN pip3 install -r requirements.txt + +# 设置工作目录 +WORKDIR /app + +# 开放端口 8080,用于 Web 服务 +EXPOSE 8080 + +# 运行你的应用程序 +ENTRYPOINT ["python3", "demo.py"] \ No newline at end of file diff --git a/AniTalker-kit/syncnet_python/LICENSE.md b/AniTalker-kit/syncnet_python/LICENSE.md new file mode 100644 index 00000000..de4a5458 --- /dev/null +++ b/AniTalker-kit/syncnet_python/LICENSE.md @@ -0,0 +1,19 @@ +Copyright (c) 2016-present Joon Son Chung. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/AniTalker-kit/syncnet_python/README.md b/AniTalker-kit/syncnet_python/README.md new file mode 100644 index 00000000..1dd61685 --- /dev/null +++ b/AniTalker-kit/syncnet_python/README.md @@ -0,0 +1,71 @@ +# SyncNet + +本项目基于[SyncNet](https://github.com/joonson/syncnet_python/tree/master) + +## Change +本项目在原项目的基础上,进行了下列修改: + +1. 增加了demo.py、Dockerfile,修改了calculate_scores_real_videos.sh,方便构建docker镜像,并在构建完成后使用docker命令直接获取评估结果。 +2. 修改了代码中部分无法兼容numpy较高版本导致的问题(e.g. int32) +3. 修改了requirements.txt,提供了必要的、相对较高版本的依赖。 + +## Quick Start +1. 安装docker,宿主机CUDA版本为11.7及以上 +2. 从Dockerhub拉取构建好的镜像 +``` +docker pull bellacora/syncnet-image:v4 +``` +3. 拉取镜像后使用docker命令运行(如果为本地运行加 --gpus all) +``` +docker run --rm --gpus all -v path:/app/videos --folderpath /app/videos +``` +其中path部分应当替换为宿主机上视频文件夹(注意是文件夹)的绝对路径,将会评估文件夹内所有视频,并按照文件夹内视频名称的字典顺序输出评估结果 + +4. 输出结果的最后X行(文件夹内有X个视频)为计算的LSE-D LSE-C值,按照文件夹内视频名称的字典顺序输出 + +## Dockerfile + +如果想要自己手动构建镜像,请按照下列步骤进行: +1. 
将项目代码拉取到本地 +``` +git clone https://github.com/RubyZh/talkingface-kit.git +cd talkingface-kit/AniTalker-kit/syncnet_python +``` +2. 在项目根目录下打开终端,运行 +``` +docker build -t . +``` +如果ubuntu镜像无法拉取,可尝试先单独拉取对应镜像。 +``` +docker pull nvidia/cuda:11.7.1-runtime-ubuntu22.04 +``` +该镜像可以尝试修改为对应的CUDA版本(同时修改Dockerfile中相关版本及对应的依赖) + +如果requirements.txt内的依赖在电脑上无法兼容,可尝试调整torch和torchvision的版本,以便与CUDA版本兼容;但scenedetect的版本不能低于0.6.0,其他依赖版本依照原项目中给出的,应当满足torch>=1.4.0,torchvision>=0.5.0,numpy>=1.18.1,scipy>=1.2.1 + +3. 运行成功后,使用docker命令运行,参考前述命令标准 + +## Local + +如果不通过docker,直接运行,请按照下列步骤进行: +1. 将项目代码拉取到本地 +``` +git clone https://github.com/RubyZh/talkingface-kit.git +cd talkingface-kit/AniTalker-kit/syncnet_python +``` +2. 安装ffmpeg + +3. 打开Anaconda Prompt,运行 +``` +conda create -n syncnet python=3.9.0 +conda activate syncnet +``` +4. 安装必要依赖 +``` +pip install -r requirements.txt +``` +5. 将视频放至文件夹下,运行代码 +``` +sh calculate_scores_real_videos.sh /path/to/video/data/root +``` +在calculate_scores_real_videos.sh文件内,ls 命令默认按字典顺序列出文件,因此,$yourfilenames 中的文件会按照字典顺序排列,输出结果按照字典顺序排列 \ No newline at end of file diff --git a/AniTalker-kit/syncnet_python/SyncNetInstance.py b/AniTalker-kit/syncnet_python/SyncNetInstance.py new file mode 100644 index 00000000..497d44fc --- /dev/null +++ b/AniTalker-kit/syncnet_python/SyncNetInstance.py @@ -0,0 +1,208 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +# Video 25 FPS, Audio 16000HZ + +import torch +import numpy +import time, pdb, argparse, subprocess, os, math, glob +import cv2 +import python_speech_features + +from scipy import signal +from scipy.io import wavfile +from SyncNetModel import * +from shutil import rmtree + + +# ==================== Get OFFSET ==================== + +def calc_pdist(feat1, feat2, vshift=10): + + win_size = vshift*2+1 + + feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) + + dists = [] + + for i in range(0,len(feat1)): + + dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) + 
+ return dists + +# ==================== MAIN DEF ==================== + +class SyncNetInstance(torch.nn.Module): + + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): + super(SyncNetInstance, self).__init__(); + + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + + def evaluate(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Convert files + # ========== ========== + + if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + + os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + + command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) + output = subprocess.call(command, shell=True, stdout=None) + + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) + output = subprocess.call(command, shell=True, stdout=None) + + # ========== ========== + # Load video + # ========== ========== + + images = [] + + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) + flist.sort() + + for fname in flist: + images.append(cv2.imread(fname)) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Load audio + # ========== ========== + + sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) + mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) + mfcc = numpy.stack([numpy.array(i) for i in mfcc]) + + cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) + cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + + # ========== ========== + # Check audio and video input length + # ========== ========== + + if (float(len(audio))/16000) != (float(len(images))/25) : + 
print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + + min_length = min(len(images),math.floor(len(audio)/640)) + + # ========== ========== + # Generate video and audio feats + # ========== ========== + + lastframe = min_length-5 + im_feat = [] + cc_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lip(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + cc_in = torch.cat(cc_batch,0) + cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_feat.append(cc_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + cc_feat = torch.cat(cc_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' 
% (time.time()-tS)) + + dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) + mdist = torch.mean(torch.stack(dists,1),1) + + minval, minidx = torch.min(mdist,0) + + offset = opt.vshift-minidx + conf = torch.median(mdist) - minval + + fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) + # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) + fconf = torch.median(mdist).numpy() - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) + print('Framewise conf: ') + print(fconfm) + print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + + dists_npy = numpy.array([ dist.numpy() for dist in dists ]) + return offset.numpy(), conf.numpy(), dists_npy + + def extract_feature(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Load video + # ========== ========== + cap = cv2.VideoCapture(videofile) + + frame_num = 1; + images = [] + while frame_num: + frame_num += 1 + ret, image = cap.read() + if ret == 0: + break + + images.append(image) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Generate video feats + # ========== ========== + + lastframe = len(images)-4 + im_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' 
def calc_pdist(feat1, feat2, vshift=10):
    """Frame-wise distances between two embedding sequences over a shift window.

    For every row i of `feat1`, computes the pairwise distance between that row
    (tiled) and rows i .. i+2*vshift of a zero-padded copy of `feat2`, i.e. the
    second sequence shifted by -vshift .. +vshift positions relative to the first.

    Returns a list with one tensor of shape (2*vshift+1,) per row of `feat1`.
    """
    window = 2 * vshift + 1
    # Zero-pad feat2 along the time axis so every shifted slice stays in bounds.
    padded = torch.nn.functional.pad(feat2, (0, 0, vshift, vshift))
    return [
        torch.nn.functional.pairwise_distance(
            feat1[[idx], :].repeat(window, 1),
            padded[idx:idx + window, :],
        )
        for idx in range(len(feat1))
    ]
    def evaluate(self, opt, videofile):
        """Compute the audio-visual sync offset for one video.

        Extracts frames and 16 kHz mono audio with ffmpeg into
        `opt.tmp_dir/opt.reference`, embeds 5-frame lip windows and 20-step
        MFCC windows with the two SyncNet streams, and searches the
        +/- opt.vshift frame range for the shift minimising the mean
        embedding distance.

        Returns (offset, confidence, min_distance) as numpy values.
        """

        self.__S__.eval();

        # ========== ==========
        # Convert files
        # ========== ==========

        if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
            rmtree(os.path.join(opt.tmp_dir,opt.reference))

        os.makedirs(os.path.join(opt.tmp_dir,opt.reference))

        # Dump every frame as a numbered JPEG.
        command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
        output = subprocess.call(command, shell=True, stdout=None)

        # Extract mono 16 kHz PCM audio.
        command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
        output = subprocess.call(command, shell=True, stdout=None)

        # ========== ==========
        # Load video
        # ========== ==========

        images = []

        flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
        flist.sort()

        for fname in flist:
            img_input = cv2.imread(fname)
            img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE
            images.append(img_input)

        # (T,H,W,C) frame stack -> (1, C, T, H, W) float tensor.
        im = numpy.stack(images,axis=3)
        im = numpy.expand_dims(im,axis=0)
        im = numpy.transpose(im,(0,3,4,1,2))

        imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())

        # ========== ==========
        # Load audio
        # ========== ==========

        sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))
        mfcc = zip(*python_speech_features.mfcc(audio,sample_rate))
        mfcc = numpy.stack([numpy.array(i) for i in mfcc])

        # Shape (1, 1, n_mfcc, steps). The *4 indexing below presumes 4 MFCC
        # steps per 25 fps video frame (10 ms hop) — TODO confirm against
        # python_speech_features defaults.
        cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0)
        cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float())

        # ========== ==========
        # Check audio and video input length
        # ========== ==========

        #if (float(len(audio))/16000) != (float(len(images))/25) :
        #    print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))

        # 640 samples = one 25 fps frame at 16 kHz (16000 / 25).
        min_length = min(len(images),math.floor(len(audio)/640))

        # ========== ==========
        # Generate video and audio feats
        # ========== ==========

        lastframe = min_length-5
        im_feat = []
        cc_feat = []

        tS = time.time()
        for i in range(0,lastframe,opt.batch_size):

            # 5-frame sliding windows for the lip stream.
            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
            im_in = torch.cat(im_batch,0)
            im_out = self.__S__.forward_lip(im_in.cuda());
            im_feat.append(im_out.data.cpu())

            # 20-step MFCC windows aligned with the 5-frame video windows.
            cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
            cc_in = torch.cat(cc_batch,0)
            cc_out = self.__S__.forward_aud(cc_in.cuda())
            cc_feat.append(cc_out.data.cpu())

        im_feat = torch.cat(im_feat,0)
        cc_feat = torch.cat(cc_feat,0)

        # ========== ==========
        # Compute offset
        # ========== ==========

        #print('Compute time %.3f sec.' % (time.time()-tS))

        # Mean distance per candidate shift; the minimising index gives the offset.
        dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
        mdist = torch.mean(torch.stack(dists,1),1)

        minval, minidx = torch.min(mdist,0)

        # Offset is the best shift's index measured from the centre of the
        # +/- vshift search window.
        offset = opt.vshift-minidx
        conf = torch.median(mdist) - minval

        # Per-frame confidence trace, median-filtered for smoothing.
        fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
        # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
        fconf = torch.median(mdist).numpy() - fdist
        fconfm = signal.medfilt(fconf,kernel_size=9)

        numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
        #print('Framewise conf: ')
        #print(fconfm)
        #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf))

        dists_npy = numpy.array([ dist.numpy() for dist in dists ])
        return offset.numpy(), conf.numpy(), minval.numpy()
class S(nn.Module):
    """SyncNet two-stream network.

    A 2-D CNN over MFCC audio patches and a 3-D CNN over 5-frame lip-crop
    stacks, each followed by a fully connected head mapping into a shared
    `num_layers_in_fc_layers`-dimensional embedding space, so that audio and
    lip embeddings can be compared by distance.
    """

    def __init__(self, num_layers_in_fc_layers = 1024):
        super(S, self).__init__();

        # Legacy size constants from the original release; the layer
        # definitions below are what the forward passes actually use.
        self.__nFeatures__ = 24;
        self.__nChs__ = 32;
        self.__midChs__ = 32;

        # Audio stream: 2-D CNN over (N, 1, n_mfcc, time) inputs.
        self.netcnnaud = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            # 1x1/stride-1 pool is an identity op — presumably kept so
            # checkpoint layer indices line up; do not remove.
            nn.MaxPool2d(kernel_size=(1,1), stride=(1,1)),

            nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)),
            nn.BatchNorm2d(192),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)),

            nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),

            nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)),

            # Final conv collapses the remaining (freq, time) extent to 1x1
            # for the expected input size, yielding a 512-dim vector.
            nn.Conv2d(256, 512, kernel_size=(5,4), padding=(0,0)),
            nn.BatchNorm2d(512),
            nn.ReLU(),
        );

        # FC head projecting audio CNN features to the joint embedding.
        self.netfcaud = nn.Sequential(
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, num_layers_in_fc_layers),
        );

        # FC head projecting lip CNN features to the joint embedding
        # (same shape as the audio head, but separate weights).
        self.netfclip = nn.Sequential(
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, num_layers_in_fc_layers),
        );

        # Video stream: 3-D CNN over (N, 3, 5, H, W) lip-crop stacks; the
        # first conv consumes all 5 temporal frames (kernel depth 5).
        self.netcnnlip = nn.Sequential(
            nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=0),
            nn.BatchNorm3d(96),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)),

            nn.Conv3d(96, 256, kernel_size=(1,5,5), stride=(1,2,2), padding=(0,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)),

            nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),

            nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),

            nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)),

            nn.Conv3d(256, 512, kernel_size=(1,6,6), padding=0),
            nn.BatchNorm3d(512),
            nn.ReLU(inplace=True),
        );

    def forward_aud(self, x):
        """Audio patch -> joint embedding (CNN, flatten, FC head)."""

        mid = self.netcnnaud(x); # N x ch x 24 x M
        mid = mid.view((mid.size()[0], -1)); # N x (ch x 24)
        out = self.netfcaud(mid);

        return out;

    def forward_lip(self, x):
        """Lip-frame stack -> joint embedding (CNN, flatten, FC head)."""

        mid = self.netcnnlip(x);
        mid = mid.view((mid.size()[0], -1)); # N x (ch x 24)
        out = self.netfclip(mid);

        return out;

    def forward_lipfeat(self, x):
        """Lip-frame stack -> flattened CNN features, skipping the FC head."""

        mid = self.netcnnlip(x);
        out = mid.view((mid.size()[0], -1)); # N x (ch x 24)

        return out;
#!/usr/bin/python
#-*- coding: utf-8 -*-
"""Score every *.mp4 under --data_root with SyncNet and print the average
audio-visual confidence and minimum distance over the whole set."""

import time, pdb, argparse, subprocess
import glob
import os
from tqdm import tqdm

from SyncNetInstance_calc_scores import *

# ==================== LOAD PARAMS ====================


parser = argparse.ArgumentParser(description = "SyncNet")

parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='')
# Plain int defaults instead of the strings '20'/'15': argparse only coerced
# those because string defaults are re-parsed through `type`.
parser.add_argument('--batch_size', type=int, default=20, help='')
parser.add_argument('--vshift', type=int, default=15, help='')
parser.add_argument('--data_root', type=str, required=True, help='')
parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='')
parser.add_argument('--reference', type=str, default="demo", help='')

opt = parser.parse_args()


# ==================== RUN EVALUATION ====================

s = SyncNetInstance()

s.loadParameters(opt.initial_model)
#print("Model %s loaded."%opt.initial_model);
path = os.path.join(opt.data_root, "*.mp4")

# Sort for a deterministic processing order (glob order is filesystem-dependent).
all_videos = sorted(glob.glob(path))

# Guard the empty case: the final averages below would otherwise divide by zero.
if not all_videos:
    raise SystemExit('No .mp4 files found under {}'.format(opt.data_root))

prog_bar = tqdm(range(len(all_videos)))
avg_confidence = 0.
avg_min_distance = 0.


for videofile_idx in prog_bar:
    videofile = all_videos[videofile_idx]
    # evaluate() returns (offset, confidence, min_distance) for one video.
    offset, confidence, min_distance = s.evaluate(opt, videofile=videofile)
    avg_confidence += confidence
    avg_min_distance += min_distance
    # Running averages over the videos processed so far.
    prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / (videofile_idx + 1), 3), round(avg_min_distance / (videofile_idx + 1), 3)))
    prog_bar.refresh()

print ('Average Confidence: {}'.format(avg_confidence/len(all_videos)))
print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos)))
+#print("Model %s loaded."%opt.initial_model); + +flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) +flist.sort() + +# ==================== GET OFFSETS ==================== + +dists = [] +for idx, fname in enumerate(flist): + offset, conf, dist = s.evaluate(opt,videofile=fname) + print (str(dist)+" "+str(conf)) + +# ==================== PRINT RESULTS TO FILE ==================== + +#with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: +# pickle.dump(dists, fil) diff --git a/AniTalker-kit/syncnet_python/calculate_scores_real_videos.sh b/AniTalker-kit/syncnet_python/calculate_scores_real_videos.sh new file mode 100644 index 00000000..5a921496 --- /dev/null +++ b/AniTalker-kit/syncnet_python/calculate_scores_real_videos.sh @@ -0,0 +1,7 @@ +yourfilenames=`ls $1` + +for eachfile in $yourfilenames +do + python3 run_pipeline.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir + python3 calculate_scores_real_videos.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir +done diff --git a/AniTalker-kit/syncnet_python/demo.py b/AniTalker-kit/syncnet_python/demo.py new file mode 100644 index 00000000..d3594d4b --- /dev/null +++ b/AniTalker-kit/syncnet_python/demo.py @@ -0,0 +1,10 @@ +import argparse +import os + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='EchoMimic') + parser.add_argument('--folderpath', type=str, required=True, help="Path to videos") + args = parser.parse_args() + + command = f"sh calculate_scores_real_videos.sh '{args.folderpath}'" + os.system(command) \ No newline at end of file diff --git a/AniTalker-kit/syncnet_python/demo_feature.py b/AniTalker-kit/syncnet_python/demo_feature.py new file mode 100644 index 00000000..e3bd290e --- /dev/null +++ b/AniTalker-kit/syncnet_python/demo_feature.py @@ -0,0 +1,32 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess + +from SyncNetInstance import * + +# ==================== 
#!/usr/bin/python
#-*- coding: utf-8 -*-
"""Extract SyncNet lip features from one video and save them with torch.save."""

import time, pdb, argparse, subprocess

from SyncNetInstance import *

# ==================== LOAD PARAMS ====================


parser = argparse.ArgumentParser(description = "SyncNet")

parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='')
# Plain int defaults instead of the strings '20'/'15': same parsed values, no
# reliance on argparse re-parsing string defaults through `type`.
parser.add_argument('--batch_size', type=int, default=20, help='')
parser.add_argument('--vshift', type=int, default=15, help='')
parser.add_argument('--videofile', type=str, default="data/example.avi", help='')
parser.add_argument('--tmp_dir', type=str, default="data", help='')
parser.add_argument('--save_as', type=str, default="data/features.pt", help='')

opt = parser.parse_args()


# ==================== RUN EVALUATION ====================

s = SyncNetInstance()

s.loadParameters(opt.initial_model)
print("Model %s loaded."%opt.initial_model)

# Frame-wise lip embeddings from the video stream only (no audio needed).
feats = s.extract_feature(opt, videofile=opt.videofile)

torch.save(feats, opt.save_as)
loaded."%opt.initial_model); + +s.evaluate(opt, videofile=opt.videofile) diff --git a/AniTalker-kit/syncnet_python/detectors/README.md b/AniTalker-kit/syncnet_python/detectors/README.md new file mode 100644 index 00000000..f5a8d4fe --- /dev/null +++ b/AniTalker-kit/syncnet_python/detectors/README.md @@ -0,0 +1,3 @@ +# Face detector + +This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`. diff --git a/AniTalker-kit/syncnet_python/detectors/__init__.py b/AniTalker-kit/syncnet_python/detectors/__init__.py new file mode 100644 index 00000000..059d49bf --- /dev/null +++ b/AniTalker-kit/syncnet_python/detectors/__init__.py @@ -0,0 +1 @@ +from .s3fd import S3FD \ No newline at end of file diff --git a/AniTalker-kit/syncnet_python/detectors/s3fd/__init__.py b/AniTalker-kit/syncnet_python/detectors/s3fd/__init__.py new file mode 100644 index 00000000..d7f35e05 --- /dev/null +++ b/AniTalker-kit/syncnet_python/detectors/s3fd/__init__.py @@ -0,0 +1,61 @@ +import time +import numpy as np +import cv2 +import torch +from torchvision import transforms +from .nets import S3FDNet +from .box_utils import nms_ + +PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth' +img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + + +class S3FD(): + + def __init__(self, device='cuda'): + + tstamp = time.time() + self.device = device + + print('[S3FD] loading with', self.device) + self.net = S3FDNet(device=self.device).to(self.device) + state_dict = torch.load(PATH_WEIGHT, map_location=self.device) + self.net.load_state_dict(state_dict) + self.net.eval() + print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + + def detect_faces(self, image, conf_th=0.8, scales=[1]): + + w, h = image.shape[1], image.shape[0] + + bboxes = np.empty(shape=(0, 5)) + + with torch.no_grad(): + for s in scales: + scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) + + scaled_img = 
np.swapaxes(scaled_img, 1, 2) + scaled_img = np.swapaxes(scaled_img, 1, 0) + scaled_img = scaled_img[[2, 1, 0], :, :] + scaled_img = scaled_img.astype('float32') + scaled_img -= img_mean + scaled_img = scaled_img[[2, 1, 0], :, :] + x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device) + y = self.net(x) + + detections = y.data + scale = torch.Tensor([w, h, w, h]) + + for i in range(detections.size(1)): + j = 0 + while detections[0, i, j, 0] > conf_th: + score = detections[0, i, j, 0] + pt = (detections[0, i, j, 1:] * scale).cpu().numpy() + bbox = (pt[0], pt[1], pt[2], pt[3], score) + bboxes = np.vstack((bboxes, bbox)) + j += 1 + + keep = nms_(bboxes, 0.1) + bboxes = bboxes[keep] + + return bboxes diff --git a/AniTalker-kit/syncnet_python/detectors/s3fd/box_utils.py b/AniTalker-kit/syncnet_python/detectors/s3fd/box_utils.py new file mode 100644 index 00000000..701a8e5d --- /dev/null +++ b/AniTalker-kit/syncnet_python/detectors/s3fd/box_utils.py @@ -0,0 +1,217 @@ +import numpy as np +from itertools import product as product +import torch +from torch.autograd import Function + + +def nms_(dets, thresh): + """ + Courtesy of Ross Girshick + [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(int(i)) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return np.array(keep).astype(np.int32) + + +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the 
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.
    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The Maximum number of box preds to consider.
    Return:
        A tuple (keep, count): the indices of the kept boxes with respect to
        num_priors, and how many leading entries of `keep` are valid.
    """

    # Pre-allocated index buffer; only keep[:count] is meaningful on return.
    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        return keep, 0
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)
    v, idx = scores.sort(0)  # sort in ascending order
    # I = I[v >= 0.01]
    idx = idx[-top_k:]  # indices of the top-k largest vals
    # Scratch tensors reused via torch.index_select(..., out=...) each pass
    # to avoid reallocating per iteration.
    xx1 = boxes.new()
    yy1 = boxes.new()
    xx2 = boxes.new()
    yy2 = boxes.new()
    w = boxes.new()
    h = boxes.new()

    # keep = torch.Tensor()
    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        # keep.append(i)
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view
        # load bboxes of next highest vals
        torch.index_select(x1, 0, idx, out=xx1)
        torch.index_select(y1, 0, idx, out=yy1)
        torch.index_select(x2, 0, idx, out=xx2)
        torch.index_select(y2, 0, idx, out=yy2)
        # Clamp to the kept box's corners: this yields the intersection
        # rectangle of box i with each remaining box.
        xx1 = torch.clamp(xx1, min=x1[i])
        yy1 = torch.clamp(yy1, min=y1[i])
        xx2 = torch.clamp(xx2, max=x2[i])
        yy2 = torch.clamp(yy2, max=y2[i])
        w.resize_as_(xx2)
        h.resize_as_(yy2)
        w = xx2 - xx1
        h = yy2 - yy1
        # check sizes of xx1 and xx2.. after each iteration
        # Negative extents mean no overlap — clamp to zero area.
        w = torch.clamp(w, min=0.0)
        h = torch.clamp(h, min=0.0)
        inter = w * h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # load remaining areas)
        union = (rem_areas - inter) + area[i]
        IoU = inter / union  # store result in iou
        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count
class PriorBox(object):
    """Generates S3FD prior (anchor) boxes in center-size form.

    One square prior per feature-map cell and pyramid level: center at the
    cell middle (relative coordinates), side length `min_sizes[k]` relative
    to the image size. `forward()` returns an (N, 4) FloatTensor of
    [cx, cy, w, h] rows, optionally clamped to [0, 1].
    """

    def __init__(self, input_size, feature_maps,
                 variance=[0.1, 0.2],
                 min_sizes=[16, 32, 64, 128, 256, 512],
                 steps=[4, 8, 16, 32, 64, 128],
                 clip=False):

        super(PriorBox, self).__init__()

        # input_size is (height, width).
        self.imh, self.imw = input_size[0], input_size[1]
        self.feature_maps = feature_maps

        self.variance = variance
        self.min_sizes = min_sizes
        self.steps = steps
        self.clip = clip

    def forward(self):
        coords = []
        for level, fmap in enumerate(self.feature_maps):
            rows, cols = fmap[0], fmap[1]
            # Number of stride-sized cells across the image at this level,
            # and the relative box size for this level — constant per level.
            cells_w = self.imw / self.steps[level]
            cells_h = self.imh / self.steps[level]
            box_w = self.min_sizes[level] / self.imw
            box_h = self.min_sizes[level] / self.imh
            for row, col in product(range(rows), range(cols)):
                # Center of cell (row, col) in relative image coordinates.
                coords.extend([(col + 0.5) / cells_w,
                               (row + 0.5) / cells_h,
                               box_w, box_h])

        priors = torch.FloatTensor(coords).view(-1, 4)

        if self.clip:
            priors.clamp_(max=1, min=0)

        return priors
    def forward(self, x):
        """Run S3FD detection on a batch of images.

        Collects feature maps from six VGG/extra stages, applies the loc/conf
        heads, builds matching prior boxes, and decodes everything through
        `self.detect`. Returns the Detect output tensor of per-class,
        score-sorted [score, x1, y1, x2, y2] detections.
        """
        size = x.size()[2:]
        sources = list()
        loc = list()
        conf = list()

        # VGG backbone slices; L2-normalised taps at conv3_3 / conv4_3 /
        # conv5_3 become the first three detection sources.
        for k in range(16):
            x = self.vgg[k](x)
        s = self.L2Norm3_3(x)
        sources.append(s)

        for k in range(16, 23):
            x = self.vgg[k](x)
        s = self.L2Norm4_3(x)
        sources.append(s)

        for k in range(23, 30):
            x = self.vgg[k](x)
        s = self.L2Norm5_3(x)
        sources.append(s)

        for k in range(30, len(self.vgg)):
            x = self.vgg[k](x)
        sources.append(x)

        # apply extra layers and cache source layer outputs
        # (every second extra layer contributes a source).
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # apply multibox head to source layers
        loc_x = self.loc[0](sources[0])
        conf_x = self.conf[0](sources[0])

        # First source has 4 conf channels; collapse channels 0..2 into one
        # via max — the S3FD "max-out" trick for the background label
        # (presumed from the paper; TODO confirm).
        max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True)
        conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1)

        loc.append(loc_x.permute(0, 2, 3, 1).contiguous())
        conf.append(conf_x.permute(0, 2, 3, 1).contiguous())

        for i in range(1, len(sources)):
            x = sources[i]
            conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous())
            loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous())

        # Record the (H, W) of each head output — PriorBox needs them to lay
        # out one prior per cell per level.
        features_maps = []
        for i in range(len(loc)):
            feat = []
            feat += [loc[i].size(1), loc[i].size(2)]
            features_maps += [feat]

        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        # Priors are rebuilt per forward call because the input size (and
        # hence the feature-map grid) can change between calls.
        with torch.no_grad():
            self.priorbox = PriorBox(size, features_maps)
            self.priors = self.priorbox.forward()

        output = self.detect.forward(
            loc.view(loc.size(0), -1, 4),
            self.softmax(conf.view(conf.size(0), -1, 2)),
            self.priors.type(type(x.data)).to(self.device)
        )

        return output
http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi + +# For the pre-processing pipeline +mkdir detectors/s3fd/weights +wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth \ No newline at end of file diff --git a/AniTalker-kit/syncnet_python/img/ex1.jpg b/AniTalker-kit/syncnet_python/img/ex1.jpg new file mode 100644 index 00000000..b20b57e1 Binary files /dev/null and b/AniTalker-kit/syncnet_python/img/ex1.jpg differ diff --git a/AniTalker-kit/syncnet_python/img/ex2.jpg b/AniTalker-kit/syncnet_python/img/ex2.jpg new file mode 100644 index 00000000..851402cc Binary files /dev/null and b/AniTalker-kit/syncnet_python/img/ex2.jpg differ diff --git a/AniTalker-kit/syncnet_python/requirements.txt b/AniTalker-kit/syncnet_python/requirements.txt new file mode 100644 index 00000000..a33b4aef --- /dev/null +++ b/AniTalker-kit/syncnet_python/requirements.txt @@ -0,0 +1,7 @@ +torch==2.0.1 +torchvision==0.15.2 +numpy==1.22.4 +scipy==1.13.1 +scenedetect==0.6.0 +opencv-contrib-python +python_speech_features diff --git a/AniTalker-kit/syncnet_python/run_pipeline.py b/AniTalker-kit/syncnet_python/run_pipeline.py new file mode 100644 index 00000000..f5fc22e0 --- /dev/null +++ b/AniTalker-kit/syncnet_python/run_pipeline.py @@ -0,0 +1,322 @@ +#!/usr/bin/python + +import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2 +import numpy as np +from shutil import rmtree + +import scenedetect +from scenedetect.video_manager import VideoManager +from scenedetect.scene_manager import SceneManager +from scenedetect.frame_timecode import FrameTimecode +from scenedetect.stats_manager import StatsManager +from scenedetect.detectors import ContentDetector + +from scipy.interpolate import interp1d +from scipy.io import wavfile +from scipy import signal + +from detectors import S3FD + +# ========== ========== ========== ========== +# # PARSE ARGS +# ========== ========== ========== ========== + 
def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union of two boxes.

    Boxes are (x1, y1, x2, y2) with x1 <= x2 and y1 <= y2.

    Returns:
        float in [0, 1]. Returns 0.0 when the union area is zero (both
        boxes degenerate), instead of raising ZeroDivisionError as the
        original did.
    """
    # Corners of the intersection rectangle.
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # Clamp to zero when the boxes do not overlap.
    interArea = max(0, xB - xA) * max(0, yB - yA)

    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])

    union = float(boxAArea + boxBArea - interArea)
    if union <= 0.0:
        # Both boxes have zero area and no intersection: define IOU as 0.
        return 0.0

    return interArea / union
def track_shot(opt,scenefaces):
    """Greedily link per-frame face detections into continuous face tracks.

    Args:
        opt: options namespace; reads num_failed_det (max frame gap between
            consecutive detections in a track), min_track (minimum track
            length in frames), min_face_size (minimum mean box side in px).
        scenefaces: list (one entry per frame) of lists of detections
            {'frame': int, 'bbox': [x1, y1, x2, y2], ...}. MUTATED in
            place: detections consumed by a track are removed.

    Returns:
        list of {'frame': np.ndarray of frame indices,
                 'bbox': (N, 4) np.ndarray of per-frame interpolated boxes}.
    """
    iouThres  = 0.5     # Minimum IOU between consecutive face detections
    tracks    = []

    # Each pass of the outer loop extracts at most one track, consuming its
    # detections from scenefaces; it ends when a pass finds nothing to start
    # a track with.
    while True:
        track = []
        for framefaces in scenefaces:
            for face in framefaces:
                if track == []:
                    # Seed a new track with the first remaining detection.
                    track.append(face)
                    framefaces.remove(face)
                elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det:
                    # Within the allowed gap: extend the track if the box
                    # overlaps the previous one strongly enough.
                    iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox'])
                    if iou > iouThres:
                        track.append(face)
                        framefaces.remove(face)
                        continue
                else:
                    # Gap too large: stop scanning this frame's faces.
                    break
        # NOTE(review): removing from framefaces while iterating it skips the
        # element following each removal — only matters when a frame holds
        # several faces; confirm this is the intended greedy behaviour.

        if track == []:
            break
        elif len(track) > opt.min_track:

            framenum = np.array([ f['frame'] for f in track ])
            bboxes = np.array([np.array(f['bbox']) for f in track])

            # Fill detection gaps by linearly interpolating each bbox
            # coordinate over the full frame range of the track.
            frame_i = np.arange(framenum[0],framenum[-1]+1)

            bboxes_i = []
            for ij in range(0,4):
                interpfn = interp1d(framenum, bboxes[:,ij])
                bboxes_i.append(interpfn(frame_i))
            bboxes_i = np.stack(bboxes_i, axis=1)

            # Keep the track only if the mean box width or height exceeds
            # the minimum face size.
            if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size:
                tracks.append({'frame':frame_i,'bbox':bboxes_i})

    return tracks
def crop_video(opt,track,cropfile):
    """Crop one face track out of the extracted frames and save it as an AVI
    with its matching audio segment.

    Args:
        opt: options namespace; reads frames_dir, tmp_dir, avi_dir,
            reference, frame_rate, crop_scale.
        track: {'frame': frame indices, 'bbox': (N, 4) boxes} from track_shot.
        cropfile: output path prefix; final video is '<cropfile>.avi'.

    Returns:
        {'track': the input track, 'proc_track': the smoothed centre/size
         detections used for cropping}.
    """
    flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
    flist.sort()

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    # Video-only intermediate ('<cropfile>t.avi'); audio is muxed in below.
    vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224))

    dets = {'x':[], 'y':[], 's':[]}

    for det in track['bbox']:

        dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2)  # half of the larger box side
        dets['y'].append((det[1]+det[3])/2) # crop center y
        dets['x'].append((det[0]+det[2])/2) # crop center x

    # Smooth detections with a median filter to suppress per-frame jitter.
    dets['s'] = signal.medfilt(dets['s'],kernel_size=13)
    dets['x'] = signal.medfilt(dets['x'],kernel_size=13)
    dets['y'] = signal.medfilt(dets['y'],kernel_size=13)

    for fidx, frame in enumerate(track['frame']):

        cs = opt.crop_scale

        bs = dets['s'][fidx]   # Detection box size
        bsi = int(bs*(1+2*cs))  # Pad videos by this amount

        image = cv2.imread(flist[frame])

        # NOTE: 'frame' is rebound here from the frame index to the padded
        # image (grey padding, value 110) so the crop can never run off the
        # image edge.
        frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110))
        my = dets['y'][fidx]+bsi  # BBox center Y in padded coordinates
        mx = dets['x'][fidx]+bsi  # BBox center X in padded coordinates

        face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))]

        vOut.write(cv2.resize(face,(224,224)))

    audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav')
    # Audio segment boundaries in seconds, derived from the track's frames.
    audiostart = (track['frame'][0])/opt.frame_rate
    audioend = (track['frame'][-1]+1)/opt.frame_rate

    vOut.release()

    # ========== CROP AUDIO FILE ==========

    command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp))
    output = subprocess.call(command, shell=True, stdout=None)

    if output != 0:
        # Drop into the debugger if ffmpeg fails.
        pdb.set_trace()

    # Read back the cropped audio; presumably a sanity check — the values
    # are not used further. TODO confirm.
    sample_rate, audio = wavfile.read(audiotmp)

    # ========== COMBINE AUDIO AND VIDEO FILES ==========

    command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile))
    output = subprocess.call(command, shell=True, stdout=None)

    if output != 0:
        pdb.set_trace()

    print('Written %s'%cropfile)

    # Remove the video-only intermediate.
    os.remove(cropfile+'t.avi')

    print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s'])))

    return {'track':track, 'proc_track':dets}
def inference_video(opt):
    """Run S3FD face detection on every extracted frame and pickle the result.

    Args:
        opt: options namespace; reads frames_dir, avi_dir, work_dir,
            reference, facedet_scale.

    Returns:
        list (one entry per frame) of lists of detections
        {'frame': frame index, 'bbox': [x1, y1, x2, y2], 'conf': score}.
        Also written to <work_dir>/<reference>/faces.pckl.
    """
    DET = S3FD(device='cuda')

    flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
    flist.sort()

    dets = []

    for fidx, fname in enumerate(flist):

        start_time = time.time()

        image = cv2.imread(fname)

        # S3FD expects RGB; OpenCV loads BGR.
        image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale])

        dets.append([]);
        for bbox in bboxes:
            # Last element of each detector row is the confidence score.
            dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]})

        elapsed_time = time.time() - start_time

        # Per-frame progress: detections found and processing rate.
        print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time)))

    savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl')

    with open(savepath, 'wb') as fil:
        pickle.dump(dets, fil)

    return dets

def scene_detect(opt):
    """Detect scene cuts in the converted video and pickle the scene list.

    Args:
        opt: options namespace; reads avi_dir, work_dir, reference.

    Returns:
        list of (start, end) FrameTimecode pairs; when no cut is found the
        whole video is returned as a single scene. Also written to
        <work_dir>/<reference>/scene.pckl.
    """
    video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')])
    stats_manager = StatsManager()
    scene_manager = SceneManager(stats_manager)
    # Add ContentDetector algorithm (constructor takes detector options like threshold).
    scene_manager.add_detector(ContentDetector())
    base_timecode = video_manager.get_base_timecode()

    # Downscale for faster detection (factor chosen automatically).
    video_manager.set_downscale_factor()

    video_manager.start()

    scene_manager.detect_scenes(frame_source=video_manager)

    scene_list = scene_manager.get_scene_list(base_timecode)

    savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl')

    if scene_list == []:
        # No cuts detected: treat the entire video as one scene.
        scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())]

    with open(savepath, 'wb') as fil:
        pickle.dump(scene_list, fil)

    print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list)))

    return scene_list
# ========== DELETE EXISTING DIRECTORIES ==========
# Wipe any output from a previous run of the same reference so every stage
# starts from empty directories.

if os.path.exists(os.path.join(opt.work_dir,opt.reference)):
    rmtree(os.path.join(opt.work_dir,opt.reference))

if os.path.exists(os.path.join(opt.crop_dir,opt.reference)):
    rmtree(os.path.join(opt.crop_dir,opt.reference))

if os.path.exists(os.path.join(opt.avi_dir,opt.reference)):
    rmtree(os.path.join(opt.avi_dir,opt.reference))

if os.path.exists(os.path.join(opt.frames_dir,opt.reference)):
    rmtree(os.path.join(opt.frames_dir,opt.reference))

if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
    rmtree(os.path.join(opt.tmp_dir,opt.reference))

# ========== MAKE NEW DIRECTORIES ==========

os.makedirs(os.path.join(opt.work_dir,opt.reference))
os.makedirs(os.path.join(opt.crop_dir,opt.reference))
os.makedirs(os.path.join(opt.avi_dir,opt.reference))
os.makedirs(os.path.join(opt.frames_dir,opt.reference))
os.makedirs(os.path.join(opt.tmp_dir,opt.reference))

# ========== CONVERT VIDEO AND EXTRACT FRAMES ==========
# NOTE(review): the return codes of these three ffmpeg calls are captured
# but never checked; a failed conversion surfaces only later.

# Re-encode the input to a fixed 25 fps AVI (-async 1 resyncs audio).
command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi')))
output = subprocess.call(command, shell=True, stdout=None)

# Dump every frame as a numbered JPEG for the face detector.
command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg')))
output = subprocess.call(command, shell=True, stdout=None)

# Extract mono 16 kHz PCM audio.
command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav')))
output = subprocess.call(command, shell=True, stdout=None)

# ========== FACE DETECTION ==========

faces = inference_video(opt)

# ========== SCENE DETECTION ==========

scene = scene_detect(opt)

# ========== FACE TRACKING ==========

alltracks = []
vidtracks = []

# Track faces independently within each detected scene; scenes shorter than
# min_track frames cannot contain a valid track and are skipped.
for shot in scene:

    if shot[1].frame_num - shot[0].frame_num >= opt.min_track :
        alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num]))

# ========== FACE TRACK CROP ==========
# Write each track as a numbered 224x224 AVI (00000.avi, 00001.avi, ...).

for ii, track in enumerate(alltracks):
    vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii)))

# ========== SAVE RESULTS ==========

savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl')

with open(savepath, 'wb') as fil:
    pickle.dump(vidtracks, fil)

# Temporary audio snippets are no longer needed.
rmtree(os.path.join(opt.tmp_dir,opt.reference))
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Run SyncNet on every cropped face track and pickle the per-track
audio/video distance matrices to <work_dir>/<reference>/activesd.pckl.

Expects the directory layout produced by run_pipeline.py.
"""

import time, pdb, argparse, subprocess, pickle, os, gzip, glob

from SyncNetInstance import *

# ==================== PARSE ARGUMENT ====================

# Fix: --batch_size / --vshift previously used STRING defaults ('20', '15')
# for type=int arguments. argparse only happens to accept that because it
# re-parses string defaults through `type`; proper int defaults are used here.
parser = argparse.ArgumentParser(description="SyncNet")
parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='Pretrained SyncNet weights')
parser.add_argument('--batch_size', type=int, default=20, help='Frames per forward pass')
parser.add_argument('--vshift', type=int, default=15, help='Max audio/video shift (in frames) to search')
parser.add_argument('--data_dir', type=str, default='data/work', help='Working directory root')
parser.add_argument('--videofile', type=str, default='', help='Input video file')
parser.add_argument('--reference', type=str, default='', help='Name of this run inside data_dir')
opt = parser.parse_args()

# Derived working sub-directories (same layout as run_pipeline.py).
setattr(opt, 'avi_dir', os.path.join(opt.data_dir, 'pyavi'))
setattr(opt, 'tmp_dir', os.path.join(opt.data_dir, 'pytmp'))
setattr(opt, 'work_dir', os.path.join(opt.data_dir, 'pywork'))
setattr(opt, 'crop_dir', os.path.join(opt.data_dir, 'pycrop'))


# ==================== LOAD MODEL AND FILE LIST ====================

s = SyncNetInstance()

s.loadParameters(opt.initial_model)
print("Model %s loaded." % opt.initial_model)

# Cropped face-track videos produced by run_pipeline.py (00000.avi, ...).
flist = glob.glob(os.path.join(opt.crop_dir, opt.reference, '0*.avi'))
flist.sort()

# ==================== GET OFFSETS ====================

dists = []
for idx, fname in enumerate(flist):
    offset, conf, dist = s.evaluate(opt, videofile=fname)
    dists.append(dist)

# ==================== PRINT RESULTS TO FILE ====================

with open(os.path.join(opt.work_dir, opt.reference, 'activesd.pckl'), 'wb') as fil:
    pickle.dump(dists, fil)
#!/usr/bin/python
#-*- coding: utf-8 -*-
# Visualise SyncNet results: draw each tracked face box onto the original
# frames, coloured by lip-sync confidence, then mux the audio back in.

import torch
import numpy
import time, pdb, argparse, subprocess, pickle, os, glob
import cv2

from scipy import signal

# ==================== PARSE ARGUMENT ====================

parser = argparse.ArgumentParser(description = "SyncNet");
parser.add_argument('--data_dir', type=str, default='data/work', help='');
parser.add_argument('--videofile', type=str, default='', help='');
parser.add_argument('--reference', type=str, default='', help='');
parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate');
opt = parser.parse_args();

# Derived working sub-directories (same layout as run_pipeline.py).
setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes'))

# ==================== LOAD FILES ====================

# Face tracks written by run_pipeline.py.
with open(os.path.join(opt.work_dir,opt.reference,'tracks.pckl'), 'rb') as fil:
    tracks = pickle.load(fil, encoding='latin1')

# Per-track SyncNet distances written by run_syncnet.py.
with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'rb') as fil:
    dists = pickle.load(fil, encoding='latin1')

flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
flist.sort()

# ==================== SMOOTH FACES ====================

# faces[i] collects, for frame i, every track covering that frame together
# with its smoothed position and confidence.
faces = [[] for i in range(len(flist))]

for tidx, track in enumerate(tracks):

    # dists[tidx] is presumably one distance vector per frame over candidate
    # audio/video shifts (confirm against SyncNetInstance.evaluate); the
    # shift minimising the mean distance is taken as the track's offset.
    mean_dists = numpy.mean(numpy.stack(dists[tidx],1),1)
    minidx = numpy.argmin(mean_dists,0)
    minval = mean_dists[minidx]

    # Per-frame distance at the best shift, padded at both ends with a
    # constant 10 — presumably "low confidence"; TODO confirm.
    fdist = numpy.stack([dist[minidx] for dist in dists[tidx]])
    fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=10)

    # Distance -> confidence (relative to the track's median distance),
    # median-filtered to suppress frame-to-frame jitter.
    fconf = numpy.median(mean_dists) - fdist
    fconfm = signal.medfilt(fconf,kernel_size=9)

    for fidx, frame in enumerate(track['track']['frame'].tolist()) :
        faces[frame].append({'track': tidx, 'conf':fconfm[fidx], 's':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]})

# ==================== ADD DETECTIONS TO VIDEO ====================

# Frame size is taken from the first extracted frame.
first_image = cv2.imread(flist[0])

fw = first_image.shape[1]
fh = first_image.shape[0]

fourcc = cv2.VideoWriter_fourcc(*'XVID')
vOut = cv2.VideoWriter(os.path.join(opt.avi_dir,opt.reference,'video_only.avi'), fourcc, opt.frame_rate, (fw,fh))

for fidx, fname in enumerate(flist):

    image = cv2.imread(fname)

    for face in faces[fidx]:

        # Map confidence into [0, 255]; the box colour fades from red
        # (low confidence) to green (high confidence).
        clr = max(min(face['conf']*25,255),0)

        cv2.rectangle(image,(int(face['x']-face['s']),int(face['y']-face['s'])),(int(face['x']+face['s']),int(face['y']+face['s'])),(0,clr,255-clr),3)
        cv2.putText(image,'Track %d, Conf %.3f'%(face['track'],face['conf']), (int(face['x']-face['s']),int(face['y']-face['s'])),cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,255,255),2)

    vOut.write(image)

    print('Frame %d'%fidx)

vOut.release()

# ========== COMBINE AUDIO AND VIDEO FILES ==========

command = ("ffmpeg -y -i %s -i %s -c:v copy -c:a copy %s" % (os.path.join(opt.avi_dir,opt.reference,'video_only.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'),os.path.join(opt.avi_dir,opt.reference,'video_out.avi'))) #-async 1
output = subprocess.call(command, shell=True, stdout=None)