Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
76 changes: 76 additions & 0 deletions Hallo2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Hallo2项目配置文档

### 小组成员

黄松毅 吴京桥 熊康慈

### 镜像文件下载和导入

百度云盘:https://pan.baidu.com/s/1D7w-AarTui4qsTPO_wiNfg
提取码:eigy

该云盘中有镜像压缩文件:hallo2.tar,先从云盘中下载完整docker镜像文件hallo2.tar,然后将其导入到服务器中,使用其加载对应镜像,命令为:`docker load -i hallo2.tar`,然后可以查看现有镜像:`docker images`,应该会出现一个名为hallo2,版本为v5的镜像。
接着需要基于hallo2:v5镜像构建容器,命令为:
```bash
docker run -it --rm \
--gpus all \
-v /path/to/your/input_image:/app/input.jpg \
-v /path/to/your/input_audio_text:/app/input.wav \
-v /path/to/your/output_dir:/app/output \
hallo2:v5
```

其中,我们需要指定--gpus为all,否则哪怕容器装有cuda driver和cuda都没办法调用到主机上的gpu。其中我们建议输入的图片格式为jpg格式,音频格式为wav格式,并将图片和音频导入到容器中时候名字都为input。

### 运行项目生成视频

在输入上述指令之后,我们就成功进入hallo2容器中。

该容器的工作目录为/app,因此我们需要先进入/app目录:`cd /app`,随后我们输入生成视频的指令:
```bash
python scripts/inference_long.py \
--config configs/inference/long.yaml \
--source_image ./input.jpg \
--driving_audio ./input.wav \
--pose_weight 1.0 \
--face_weight 1.0 \
--lip_weight 1.0 \
--face_expand_ratio 1.0 \
&& python scripts/video_sr.py \
--input_path output_long/debug/input/merge_video.mp4 \
--output_path output/ \
--bg_upsampler realesrgan --face_upsample -w 1 -s 4
```

即可以在容器中的/app/output下找到生成的merge_video.mp4。

注:运行7s的视频,需要运行20min左右。

### 对视频进行评估

之后我们在app目录下找到一个evaluation文件夹,进入该文件夹:`cd evaluation`,之后可以找到该目录下的evaluation.py文件,该文件用于评估指标:

![alt text](evaluation_help.png)

其中,original_video_path需要指出原视频路径,generated_video_path需要指出生成视频路径,output_dir需要指出输出数据的保存路径,另外几个参数为是否要计算该评估指标,值为1表示需要计算。

之后,我们使用之前就保存在该容器中的两个视频分别作为原始视频和生成视频来测试评估代码,输入指令如下:
```bash
python evaluation.py \
--original_video_path ./examples/merge_video.mp4 \
--generated_video_path ../output_long/debug/1/merge_video.mp4 \
--output_dir ./output --psnr 1 --fid 1 --lse 1
```

该指令计算psnr、fid和lse(LSE-C和LSE-D),并且输出在目前文件夹(/app/evaluation)的/output文件夹下的evaluation.txt。该txt文件结尾输出数据结构为:

原始视频路径:
PSNR:...
FID:...
...(其他指标)

注:如果不在容器环境下运行evaluation.py,需要将pytorch_fid的fid_score.py中的get_activations()函数中的dataloader中的num_workers设置为0

----------

注:**如有任何问题**,请联系2223915400@qq.com或者vx:hsy190613
43 changes: 43 additions & 0 deletions Hallo2/evaluation/FID/FID.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import torch
from pytorch_fid import fid_score
# import logging

# # 配置日志记录
# logging.basicConfig(
# filename='fid_score.log', # 日志文件名
# filemode='a', # 追加模式
# level=logging.INFO, # 日志记录级别
# format='%(asctime)s - %(levelname)s - %(message)s'
# )

def calculate_fid(real_images_folder='./original_frames', generated_images_folder='./generated_frames'):
    """Compute the Frechet Inception Distance (FID) between two frame folders.

    Args:
        real_images_folder: directory of frames extracted from the original video.
        generated_images_folder: directory of frames extracted from the generated video.

    Returns:
        The FID value as a float; lower means the two frame distributions
        are more similar.
    """
    batch_size = 16
    # Use the GPU when available; pytorch_fid falls back to CPU otherwise.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    dims = 2048  # default InceptionV3 feature dimension used by pytorch_fid

    fid_value = fid_score.calculate_fid_given_paths(
        [real_images_folder, generated_images_folder],
        batch_size=batch_size,
        device=device,
        dims=dims,
    )
    return fid_value


if __name__ == '__main__':
    calculate_fid()
Binary file not shown.
245 changes: 245 additions & 0 deletions Hallo2/evaluation/LSE/SyncNetInstance_calc_scores.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-
# Video 25 FPS, Audio 16000HZ

import torch
import numpy
import time, pdb, argparse, subprocess, os, math, glob
import cv2
import python_speech_features

from scipy import signal
from scipy.io import wavfile
from LSE.SyncNetModel import *
from shutil import rmtree
from moviepy.editor import VideoFileClip

# ==================== Get OFFSET ====================

def calc_pdist(feat1, feat2, vshift=10):
    """For each feat1 frame, distances to a +/- vshift window of feat2 frames.

    feat2 is zero-padded by vshift on both ends of the time axis so that
    edge frames still see a full window of 2*vshift+1 candidates.
    Returns a list (len == len(feat1)) of 1-D distance tensors.
    """
    window = 2 * vshift + 1

    padded = torch.nn.functional.pad(feat2, (0, 0, vshift, vshift))

    return [
        torch.nn.functional.pairwise_distance(
            feat1[[idx], :].repeat(window, 1),
            padded[idx:idx + window, :],
        )
        for idx in range(len(feat1))
    ]

# ==================== MAIN DEF ====================

class SyncNetInstance(torch.nn.Module):
    """SyncNet wrapper used to compute lip-sync scores (LSE-C / LSE-D).

    The wrapped two-stream model ``S`` embeds 5-frame lip windows and
    matching MFCC windows; sync quality is measured by distances between
    the two embedding sequences.
    NOTE(review): the model is moved to .cuda() unconditionally, so a
    CUDA-capable GPU is required.
    """

    def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
        # ``dropout`` is accepted for interface compatibility but unused here.
        super(SyncNetInstance, self).__init__()

        self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda()

    def evaluate(self, opt, videofile):
        """Return (offset, confidence, min_distance) for *videofile*.

        Args:
            opt: options object providing ``tmp_dir``, ``reference``,
                ``batch_size`` and ``vshift`` attributes.
            videofile: path to the input video (expected 25 fps video with
                16 kHz audio, per the module header).

        Returns:
            Tuple of numpy values: the audio-video offset in frames, the
            confidence (median distance minus minimum distance), and the
            minimum mean distance.
        """

        self.__S__.eval()

        # ========== ==========
        # Convert files
        # ========== ==========

        # Start from a clean temp directory for this reference clip.
        if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
            rmtree(os.path.join(opt.tmp_dir,opt.reference))

        os.makedirs(os.path.join(opt.tmp_dir,opt.reference))

        # ========== ==========
        # Save jpg & wav
        # ========== ==========

        # NOTE(review): the ffmpeg commands below were unreliable in this
        # environment, so extraction was reimplemented with moviepy + OpenCV.
        # command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
        # output = subprocess.call(command, shell=True, stdout=None)

        # command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
        # output = subprocess.call(command, shell=True, stdout=None)

        # Extract the audio track and save it as 16-bit PCM wav (moviepy).
        video = VideoFileClip(videofile)
        audio = video.audio
        audio_path = os.path.join(opt.tmp_dir,opt.reference, "audio.wav")
        audio.write_audiofile(audio_path, codec='pcm_s16le')  # save as .wav

        # Dump every video frame as a zero-padded numbered jpg with OpenCV.
        frame_dir = os.path.join(opt.tmp_dir,opt.reference)
        # if not os.path.exists(frame_dir):
        #     os.makedirs(frame_dir)

        # Open the video file.
        cap = cv2.VideoCapture(videofile)

        # Read frames until the stream is exhausted.
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # no more frames

            frame_path = os.path.join(frame_dir, f"{frame_count:06d}.jpg")
            cv2.imwrite(frame_path, frame)
            # print(f"Saved frame {frame_count:06d}.jpg")
            frame_count += 1

        # Release the capture resource.
        cap.release()

        # ========== ==========
        # Load video
        # ========== ==========

        images = []

        flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
        flist.sort()

        for fname in flist:
            img_input = cv2.imread(fname)
            img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE
            images.append(img_input)

        # Stack frames into (1, C, T, H, W) as expected by the lip stream.
        im = numpy.stack(images,axis=3)
        im = numpy.expand_dims(im,axis=0)
        im = numpy.transpose(im,(0,3,4,1,2))

        imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())

        # ========== ==========
        # Load audio
        # ========== ==========

        sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))
        mfcc = zip(*python_speech_features.mfcc(audio,sample_rate))
        mfcc = numpy.stack([numpy.array(i) for i in mfcc])

        # MFCCs shaped to (1, 1, n_coeff, T) for the audio stream.
        cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0)
        cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float())

        # ========== ==========
        # Check audio and video input length
        # ========== ==========

        #if (float(len(audio))/16000) != (float(len(images))/25) :
        #    print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))

        # 640 audio samples per video frame (16000 Hz / 25 fps).
        min_length = min(len(images),math.floor(len(audio)/640))

        # ========== ==========
        # Generate video and audio feats
        # ========== ==========

        lastframe = min_length-5
        im_feat = []
        cc_feat = []

        tS = time.time()
        for i in range(0,lastframe,opt.batch_size):

            # 5-frame sliding windows for the lip stream.
            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
            im_in = torch.cat(im_batch,0)
            im_out = self.__S__.forward_lip(im_in.cuda())
            im_feat.append(im_out.data.cpu())

            # 20 MFCC steps (4 per video frame) aligned with each window.
            cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
            cc_in = torch.cat(cc_batch,0)
            cc_out = self.__S__.forward_aud(cc_in.cuda())
            cc_feat.append(cc_out.data.cpu())

        im_feat = torch.cat(im_feat,0)
        cc_feat = torch.cat(cc_feat,0)

        # ========== ==========
        # Compute offset
        # ========== ==========

        #print('Compute time %.3f sec.' % (time.time()-tS))

        dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
        mdist = torch.mean(torch.stack(dists,1),1)

        minval, minidx = torch.min(mdist,0)

        # Best-alignment shift; confidence is median minus minimum distance.
        offset = opt.vshift-minidx
        conf = torch.median(mdist) - minval

        fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
        # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
        fconf = torch.median(mdist).numpy() - fdist
        fconfm = signal.medfilt(fconf,kernel_size=9)

        numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
        #print('Framewise conf: ')
        #print(fconfm)
        #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf))

        dists_npy = numpy.array([ dist.numpy() for dist in dists ])
        return offset.numpy(), conf.numpy(), minval.numpy()

    def extract_feature(self, opt, videofile):
        """Return lip-stream features for every 5-frame window of *videofile*.

        Args:
            opt: options object providing a ``batch_size`` attribute.
            videofile: path to the input video.

        Returns:
            A CPU tensor of per-window lip features.
        """

        self.__S__.eval()

        # ========== ==========
        # Load video
        # ========== ==========
        cap = cv2.VideoCapture(videofile)

        frame_num = 1
        images = []
        # frame_num only grows (always truthy), so the loop ends only when
        # cap.read() stops returning frames.
        while frame_num:
            frame_num += 1
            ret, image = cap.read()
            if ret == 0:
                break

            images.append(image)

        im = numpy.stack(images,axis=3)
        im = numpy.expand_dims(im,axis=0)
        im = numpy.transpose(im,(0,3,4,1,2))

        imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())

        # ========== ==========
        # Generate video feats
        # ========== ==========

        lastframe = len(images)-4
        im_feat = []

        tS = time.time()
        for i in range(0,lastframe,opt.batch_size):

            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
            im_in = torch.cat(im_batch,0)
            im_out = self.__S__.forward_lipfeat(im_in.cuda())
            im_feat.append(im_out.data.cpu())

        im_feat = torch.cat(im_feat,0)

        # ========== ==========
        # Compute offset
        # ========== ==========

        print('Compute time %.3f sec.' % (time.time()-tS))

        return im_feat


    def loadParameters(self, path):
        """Load SyncNet weights from *path* into the wrapped model.

        Weights are mapped to CPU storage first, then copied parameter by
        parameter into the model's state dict in place.
        """
        loaded_state = torch.load(path, map_location=lambda storage, loc: storage)

        self_state = self.__S__.state_dict()

        for name, param in loaded_state.items():

            self_state[name].copy_(param)
Loading