Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
76 changes: 76 additions & 0 deletions Hallo2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Hallo2项目配置文档

### 小组成员

黄松毅 吴京桥 熊康慈

### 镜像文件下载和导入

百度云盘:https://pan.baidu.com/s/1D7w-AarTui4qsTPO_wiNfg
提取码:eigy

该云盘中有镜像压缩文件:hallo2.tar,先从云盘中下载完整docker镜像文件hallo2.tar,然后将其导入到服务器中,使用其加载对应镜像,命令为:`docker load -i hallo2.tar`,然后可以查看现有镜像:`docker images`,应该会出现一个名为hallo2,版本为v5的镜像。
接着需要基于hallo2:v5镜像构建容器,命令为:
```bash
docker run -it --rm \
--gpus all \
-v /path/to/your/input_image:/app/input.jpg \
-v /path/to/your/input_audio_text:/app/input.wav \
-v /path/to/your/output_dir:/app/output \
hallo2:v5
```

其中,我们需要指定--gpus为all,否则哪怕容器装有cuda driver和cuda都没办法调用到主机上的gpu。其中我们建议输入的图片格式为jpg格式,音频格式为wav格式,并将图片和音频导入到容器中时候名字都为input。

### 运行项目生成视频

在输入上述指令之后,我们就成功进入hallo2容器中。

该容器的工作目录为/app,因此我们需要先进入/app目录:`cd /app`,随后我们输入生成视频的指令:
```bash
python scripts/inference_long.py \
--config configs/inference/long.yaml \
--source_image ./input.jpg \
--driving_audio ./input.wav \
--pose_weight 1.0 \
--face_weight 1.0 \
--lip_weight 1.0 \
--face_expand_ratio 1.0 \
&& python scripts/video_sr.py \
--input_path output_long/debug/input/merge_video.mp4 \
--output_path output/ \
--bg_upsampler realesrgan --face_upsample -w 1 -s 4
```

即可以在容器中的/app/output下找到生成的merge_video.mp4。

注:运行7s的视频,需要运行20min左右。

### 对视频进行评估

之后我们在app目录下找到一个evaluation文件夹,进入该文件夹:`cd evaluation`,之后可以找到该目录下的evaluation.py文件,该文件用于评估指标:

![alt text](evaluation_help.png)

其中,original_video_path需要指出原视频路径,generated_video_path需要指出生成视频路径,output_dir需要指出输出数据的保存路径,另外几个参数为是否要计算该评估指标,值为1表示需要计算。

之后,我们使用之前就保存在该容器中的两个视频分别作为原始视频和生成视频来测试评估代码,输入指令如下:
```bash
python evaluation.py \
--original_video_path ./examples/merge_video.mp4 \
--generated_video_path ../output_long/debug/1/merge_video.mp4 \
--output_dir ./output --psnr 1 --fid 1 --lse 1
```

该指令计算psnr、fid和lse(LSE-C和LSE-D),并且输出在目前文件夹(/app/evaluation)的/output文件夹下的evaluation.txt。该txt文件结尾输出数据结构为:

原始视频路径:
PSNR:...
FID:...
...(其他指标)

注:如果不在容器环境下运行evaluation.py,需要将pytorch_fid的fid_score.py中的get_activations()函数中的dataloader中的num_workers设置为0

----------

注:**如有任何问题**,请联系2223915400@qq.com或者vx:hsy190613
43 changes: 43 additions & 0 deletions Hallo2/evaluation/FID/FID.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import torch
from pytorch_fid import fid_score
# import logging

# # 配置日志记录
# logging.basicConfig(
# filename='fid_score.log', # 日志文件名
# filemode='a', # 追加模式
# level=logging.INFO, # 日志记录级别
# format='%(asctime)s - %(levelname)s - %(message)s'
# )

def calculate_fid(real_images_folder='./original_frames', generated_images_folder='./generated_frames'):
    """Compute the Frechet Inception Distance (FID) between two frame folders.

    Args:
        real_images_folder: directory of frames extracted from the original video.
        generated_images_folder: directory of frames extracted from the generated video.

    Returns:
        The FID value as a float; lower means the two frame distributions
        are more similar.
    """
    batch_size = 16
    # Use the GPU when available; pytorch_fid falls back to CPU otherwise.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    dims = 2048  # default InceptionV3 feature dimension used by pytorch_fid

    fid_value = fid_score.calculate_fid_given_paths(
        [real_images_folder, generated_images_folder],
        batch_size=batch_size,
        device=device,
        dims=dims,
    )
    return fid_value


if __name__ == '__main__':
    calculate_fid()
Binary file not shown.
245 changes: 245 additions & 0 deletions Hallo2/evaluation/LSE/SyncNetInstance_calc_scores.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
#!/usr/bin/python
#-*- coding: utf-8 -*-
# Video 25 FPS, Audio 16000HZ

import torch
import numpy
import time, pdb, argparse, subprocess, os, math, glob
import cv2
import python_speech_features

from scipy import signal
from scipy.io import wavfile
from LSE.SyncNetModel import *
from shutil import rmtree
from moviepy.editor import VideoFileClip

# ==================== Get OFFSET ====================

def calc_pdist(feat1, feat2, vshift=10):
    """For each feat1 frame, distances to a +/- vshift window of feat2 frames.

    feat2 is zero-padded by vshift on both ends of the time axis so that
    edge frames still see a full window of 2*vshift+1 candidates.
    Returns a list (len == len(feat1)) of 1-D distance tensors.
    """
    window = 2 * vshift + 1

    padded = torch.nn.functional.pad(feat2, (0, 0, vshift, vshift))

    return [
        torch.nn.functional.pairwise_distance(
            feat1[[idx], :].repeat(window, 1),
            padded[idx:idx + window, :],
        )
        for idx in range(len(feat1))
    ]

# ==================== MAIN DEF ====================

class SyncNetInstance(torch.nn.Module):
    """SyncNet wrapper used to compute lip-sync scores (LSE-C / LSE-D).

    The wrapped two-stream model ``S`` embeds 5-frame lip windows and
    matching MFCC windows; sync quality is measured by distances between
    the two embedding sequences.
    NOTE(review): the model is moved to .cuda() unconditionally, so a
    CUDA-capable GPU is required.
    """

    def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
        # ``dropout`` is accepted for interface compatibility but unused here.
        super(SyncNetInstance, self).__init__()

        self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda()

    def evaluate(self, opt, videofile):
        """Return (offset, confidence, min_distance) for *videofile*.

        Args:
            opt: options object providing ``tmp_dir``, ``reference``,
                ``batch_size`` and ``vshift`` attributes.
            videofile: path to the input video (expected 25 fps video with
                16 kHz audio, per the module header).

        Returns:
            Tuple of numpy values: the audio-video offset in frames, the
            confidence (median distance minus minimum distance), and the
            minimum mean distance.
        """

        self.__S__.eval()

        # ========== ==========
        # Convert files
        # ========== ==========

        # Start from a clean temp directory for this reference clip.
        if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
            rmtree(os.path.join(opt.tmp_dir,opt.reference))

        os.makedirs(os.path.join(opt.tmp_dir,opt.reference))

        # ========== ==========
        # Save jpg & wav
        # ========== ==========

        # NOTE(review): the ffmpeg commands below were unreliable in this
        # environment, so extraction was reimplemented with moviepy + OpenCV.
        # command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
        # output = subprocess.call(command, shell=True, stdout=None)

        # command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
        # output = subprocess.call(command, shell=True, stdout=None)

        # Extract the audio track and save it as 16-bit PCM wav (moviepy).
        video = VideoFileClip(videofile)
        audio = video.audio
        audio_path = os.path.join(opt.tmp_dir,opt.reference, "audio.wav")
        audio.write_audiofile(audio_path, codec='pcm_s16le')  # save as .wav

        # Dump every video frame as a zero-padded numbered jpg with OpenCV.
        frame_dir = os.path.join(opt.tmp_dir,opt.reference)
        # if not os.path.exists(frame_dir):
        #     os.makedirs(frame_dir)

        # Open the video file.
        cap = cv2.VideoCapture(videofile)

        # Read frames until the stream is exhausted.
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # no more frames

            frame_path = os.path.join(frame_dir, f"{frame_count:06d}.jpg")
            cv2.imwrite(frame_path, frame)
            # print(f"Saved frame {frame_count:06d}.jpg")
            frame_count += 1

        # Release the capture resource.
        cap.release()

        # ========== ==========
        # Load video
        # ========== ==========

        images = []

        flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
        flist.sort()

        for fname in flist:
            img_input = cv2.imread(fname)
            img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE
            images.append(img_input)

        # Stack frames into (1, C, T, H, W) as expected by the lip stream.
        im = numpy.stack(images,axis=3)
        im = numpy.expand_dims(im,axis=0)
        im = numpy.transpose(im,(0,3,4,1,2))

        imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())

        # ========== ==========
        # Load audio
        # ========== ==========

        sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))
        mfcc = zip(*python_speech_features.mfcc(audio,sample_rate))
        mfcc = numpy.stack([numpy.array(i) for i in mfcc])

        # MFCCs shaped to (1, 1, n_coeff, T) for the audio stream.
        cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0)
        cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float())

        # ========== ==========
        # Check audio and video input length
        # ========== ==========

        #if (float(len(audio))/16000) != (float(len(images))/25) :
        #    print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))

        # 640 audio samples per video frame (16000 Hz / 25 fps).
        min_length = min(len(images),math.floor(len(audio)/640))

        # ========== ==========
        # Generate video and audio feats
        # ========== ==========

        lastframe = min_length-5
        im_feat = []
        cc_feat = []

        tS = time.time()
        for i in range(0,lastframe,opt.batch_size):

            # 5-frame sliding windows for the lip stream.
            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
            im_in = torch.cat(im_batch,0)
            im_out = self.__S__.forward_lip(im_in.cuda())
            im_feat.append(im_out.data.cpu())

            # 20 MFCC steps (4 per video frame) aligned with each window.
            cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
            cc_in = torch.cat(cc_batch,0)
            cc_out = self.__S__.forward_aud(cc_in.cuda())
            cc_feat.append(cc_out.data.cpu())

        im_feat = torch.cat(im_feat,0)
        cc_feat = torch.cat(cc_feat,0)

        # ========== ==========
        # Compute offset
        # ========== ==========

        #print('Compute time %.3f sec.' % (time.time()-tS))

        dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
        mdist = torch.mean(torch.stack(dists,1),1)

        minval, minidx = torch.min(mdist,0)

        # Best-alignment shift; confidence is median minus minimum distance.
        offset = opt.vshift-minidx
        conf = torch.median(mdist) - minval

        fdist = numpy.stack([dist[minidx].numpy() for dist in dists])
        # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
        fconf = torch.median(mdist).numpy() - fdist
        fconfm = signal.medfilt(fconf,kernel_size=9)

        numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
        #print('Framewise conf: ')
        #print(fconfm)
        #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf))

        dists_npy = numpy.array([ dist.numpy() for dist in dists ])
        return offset.numpy(), conf.numpy(), minval.numpy()

    def extract_feature(self, opt, videofile):
        """Return lip-stream features for every 5-frame window of *videofile*.

        Args:
            opt: options object providing a ``batch_size`` attribute.
            videofile: path to the input video.

        Returns:
            A CPU tensor of per-window lip features.
        """

        self.__S__.eval()

        # ========== ==========
        # Load video
        # ========== ==========
        cap = cv2.VideoCapture(videofile)

        frame_num = 1
        images = []
        # frame_num only grows (always truthy), so the loop ends only when
        # cap.read() stops returning frames.
        while frame_num:
            frame_num += 1
            ret, image = cap.read()
            if ret == 0:
                break

            images.append(image)

        im = numpy.stack(images,axis=3)
        im = numpy.expand_dims(im,axis=0)
        im = numpy.transpose(im,(0,3,4,1,2))

        imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())

        # ========== ==========
        # Generate video feats
        # ========== ==========

        lastframe = len(images)-4
        im_feat = []

        tS = time.time()
        for i in range(0,lastframe,opt.batch_size):

            im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
            im_in = torch.cat(im_batch,0)
            im_out = self.__S__.forward_lipfeat(im_in.cuda())
            im_feat.append(im_out.data.cpu())

        im_feat = torch.cat(im_feat,0)

        # ========== ==========
        # Compute offset
        # ========== ==========

        print('Compute time %.3f sec.' % (time.time()-tS))

        return im_feat


    def loadParameters(self, path):
        """Load SyncNet weights from *path* into the wrapped model.

        Weights are mapped to CPU storage first, then copied parameter by
        parameter into the model's state dict in place.
        """
        loaded_state = torch.load(path, map_location=lambda storage, loc: storage)

        self_state = self.__S__.state_dict()

        for name, param in loaded_state.items():

            self_state[name].copy_(param)
Loading