diff --git a/syncnet_python-master/Audio2Head/Audio2Head/README.md b/syncnet_python-master/Audio2Head/Audio2Head/README.md new file mode 100755 index 00000000..05a48609 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/README.md @@ -0,0 +1,49 @@ +# Audio2Head: Audio-driven One-shot Talking-head Generation with Natural Head Motion (IJCAI 2021) + +#### [Paper](https://www.ijcai.org/proceedings/2021/0152.pdf) | [Demo](https://www.youtube.com/watch?v=xvcBJ29l8rA) + +#### Requirements + +- Python 3.6 , Pytorch >= 1.6 and ffmpeg + +- Other requirements are listed in the 'requirements.txt' + + + +#### Pretrained Checkpoint + +Please download the pretrained checkpoint from [google-drive](https://drive.google.com/file/d/1tvI43ZIrnx9Ti2TpFiEO4dK5DOwcECD7/view?usp=sharing) and put it within the folder (`/checkpoints`). + + + +#### Generate Demo Results + +``` +python inference.py --audio_path xxx.wav --img_path xxx.jpg +``` + +Note that the input images must keep the same height and width and the face should be appropriately cropped as in `/demo/img`. + + + +#### License and Citation + +``` +@InProceedings{wang2021audio2head, +author = {Wang, Suzhen and Li, Lincheng and Ding, Yu and Fan, Changjie and Yu, Xin}, +title = {Audio2Head: Audio-driven One-shot Talking-head Generation with Natural Head Motion}, +booktitle = {the 30th International Joint Conference on Artificial Intelligence (IJCAI-21)}, +year = {2021}, +} +``` + + + +#### Acknowledgement + +This codebase is based on [First Order Motion Model](https://github.com/AliaksandrSiarohin/first-order-model), thanks for their contribution. 
+ + + + + diff --git a/syncnet_python-master/Audio2Head/Audio2Head/config/parameters.yaml b/syncnet_python-master/Audio2Head/Audio2Head/config/parameters.yaml new file mode 100755 index 00000000..7d5a97fe --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/config/parameters.yaml @@ -0,0 +1,8 @@ +block_expansion: 32 +estimate_jacobian: true +max_features: 512 +num_blocks: 5 +num_kp: 10 +num_w: 2 +seq: true +seq_len: 64 \ No newline at end of file diff --git a/syncnet_python-master/Audio2Head/Audio2Head/config/vox-256.yaml b/syncnet_python-master/Audio2Head/Audio2Head/config/vox-256.yaml new file mode 100755 index 00000000..b8399646 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/config/vox-256.yaml @@ -0,0 +1,83 @@ +dataset_params: + root_dir: /root/ + frame_shape: [256, 256, 3] + id_sampling: True + pairs_list: data/vox256.csv + augmentation_params: + flip_param: + horizontal_flip: True + time_flip: True + jitter_param: + brightness: 0.1 + contrast: 0.1 + saturation: 0.1 + hue: 0.1 + + +model_params: + common_params: + num_kp: 10 + num_channels: 3 + estimate_jacobian: True + kp_detector_params: + temperature: 0.1 + block_expansion: 32 + max_features: 1024 + scale_factor: 0.25 + num_blocks: 5 + generator_params: + block_expansion: 64 + max_features: 512 + num_down_blocks: 2 + num_bottleneck_blocks: 6 + estimate_occlusion_map: True + dense_motion_params: + block_expansion: 64 + max_features: 1024 + num_blocks: 5 + scale_factor: 0.25 + discriminator_params: + scales: [1] + block_expansion: 32 + max_features: 512 + num_blocks: 4 + sn: True + +train_params: + num_epochs: 100 + num_repeats: 50 + epoch_milestones: [5, 20, 30] + lr_generator: 2.0e-4 + lr_discriminator: 2.0e-4 + lr_kp_detector: 2.0e-4 + batch_size: 36 + scales: [1, 0.5, 0.25, 0.125] + checkpoint_freq: 10 + transform_params: + sigma_affine: 0.05 + sigma_tps: 0.005 + points_tps: 5 + loss_weights: + generator_gan: 0 + discriminator_gan: 1 + feature_matching: [10, 10, 10, 10] + 
perceptual: [10, 10, 10, 10, 10] + equivariance_value: 10 + equivariance_jacobian: 10 + +reconstruction_params: + num_videos: 1000 + format: '.mp4' + +animate_params: + num_pairs: 50 + format: '.mp4' + normalization_params: + adapt_movement_scale: False + use_relative_movement: True + use_relative_jacobian: True + +visualizer_params: + kp_size: 5 + draw_border: True + colormap: 'gist_rainbow' diff --git a/syncnet_python-master/Audio2Head/Audio2Head/inference.py b/syncnet_python-master/Audio2Head/Audio2Head/inference.py new file mode 100755 index 00000000..771e7b21 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/inference.py @@ -0,0 +1,273 @@ +import argparse +import subprocess +import python_speech_features +from scipy.io import wavfile +from scipy.interpolate import interp1d +import numpy as np +import pyworld +import torch +from modules.audio2pose import get_pose_from_audio +from skimage import io, img_as_float32 +import cv2 +from modules.generator import OcclusionAwareGenerator +from modules.keypoint_detector import KPDetector +from modules.audio2kp import AudioModel3D +import yaml, os, imageio + +def draw_annotation_box(image, rotation_vector, translation_vector, color=(255, 255, 255), line_width=2): + """Draw a 3D box as annotation of pose""" + + camera_matrix = np.array( + [[233.333, 0, 128], + [0, 233.333, 128], + [0, 0, 1]], dtype="double") + + dist_coeefs = np.zeros((4, 1)) + + point_3d = [] + rear_size = 75 + rear_depth = 0 + point_3d.append((-rear_size, -rear_size, rear_depth)) + point_3d.append((-rear_size, rear_size, rear_depth)) + point_3d.append((rear_size, rear_size, rear_depth)) + point_3d.append((rear_size, -rear_size, rear_depth)) + point_3d.append((-rear_size, -rear_size, rear_depth)) + + front_size = 100 + front_depth = 100 + point_3d.append((-front_size, -front_size, front_depth)) + point_3d.append((-front_size, front_size, front_depth)) + point_3d.append((front_size, front_size, front_depth)) + point_3d.append((front_size, 
-front_size, front_depth)) + point_3d.append((-front_size, -front_size, front_depth)) + point_3d = np.array(point_3d, dtype=float).reshape(-1, 3) # 如果需要使用 NumPy 的 float64 类型,可以改成 np.float64 + + # Map to 2d image points + (point_2d, _) = cv2.projectPoints(point_3d, + rotation_vector, + translation_vector, + camera_matrix, + dist_coeefs) + point_2d = np.int32(point_2d.reshape(-1, 2)) + + # Draw all the lines + cv2.polylines(image, [point_2d], True, color, line_width, cv2.LINE_AA) + cv2.line(image, tuple(point_2d[1]), tuple( + point_2d[6]), color, line_width, cv2.LINE_AA) + cv2.line(image, tuple(point_2d[2]), tuple( + point_2d[7]), color, line_width, cv2.LINE_AA) + cv2.line(image, tuple(point_2d[3]), tuple( + point_2d[8]), color, line_width, cv2.LINE_AA) + +def inter_pitch(y, y_flag): + frame_num = y.shape[0] + i = 0 + last = -1 + while(i < frame_num): + if y_flag[i] == 0: + while True: + if y_flag[i] == 0: + if i == frame_num-1: + if last != -1: + y[last+1:] = y[last] + i += 1 + break + i += 1 + else: + break + if i >= frame_num: + break + elif last == -1: + y[:i] = y[i] + else: + inter_num = i - last + 1 + fy = np.array([y[last], y[i]]) + fx = np.linspace(0, 1, num=2) + f = interp1d(fx, fy) + fx_new = np.linspace(0, 1, inter_num) + fy_new = f(fx_new) + y[last+1:i] = fy_new[1:-1] + last = i + i += 1 + else: + last = i + i += 1 + return y + +def get_audio_feature_from_audio(audio_path, norm=True): + sample_rate, audio = wavfile.read(audio_path) + if len(audio.shape) == 2: + if np.min(audio[:, 0]) <= 0: + audio = audio[:, 1] + else: + audio = audio[:, 0] + if norm: + audio = audio - np.mean(audio) + audio = audio / np.max(np.abs(audio)) + a = python_speech_features.mfcc(audio, sample_rate) + b = python_speech_features.logfbank(audio, sample_rate) + c, _ = pyworld.harvest(audio, sample_rate, frame_period=10) + c_flag = (c == 0.0) ^ 1 + c = inter_pitch(c, c_flag) + c = np.expand_dims(c, axis=1) + c_flag = np.expand_dims(c_flag, axis=1) + frame_num = np.min([a.shape[0], 
b.shape[0], c.shape[0]]) + + cat = np.concatenate([a[:frame_num], b[:frame_num], c[:frame_num], c_flag[:frame_num]], axis=1) + return cat + +def audio2head(audio_path, img_path, model_path, save_path): + temp_audio = "./results/temp.wav" + # 使用 ffmpeg 将输入音频转换为指定格式 + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (audio_path, temp_audio)) + output = subprocess.call(command, shell=True, stdout=None) + + # 读取转换后的音频特征 + audio_feature = get_audio_feature_from_audio(temp_audio) + frames = len(audio_feature) // 4 + + # 读取并处理图片 + img = io.imread(img_path)[:, :, :3] # 读取图片,保留前三个通道(RGB) + img = cv2.resize(img, (256, 256)) # 调整图片大小 + + img = np.array(img_as_float32(img)) + img = img.transpose((2, 0, 1)) + img = torch.from_numpy(img).unsqueeze(0).cuda() + + # 获取参考姿态 + ref_pose_rot, ref_pose_trans = get_pose_from_audio(img, audio_feature, model_path) + torch.cuda.empty_cache() + + # 加载配置文件 + config_file = r"./config/vox-256.yaml" + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + kp_detector = KPDetector(**config['model_params']['kp_detector_params'], + **config['model_params']['common_params']) + generator = OcclusionAwareGenerator(**config['model_params']['generator_params'], + **config['model_params']['common_params']) + kp_detector = kp_detector.cuda() + generator = generator.cuda() + + # 加载参数 + opt = argparse.Namespace(**yaml.load(open("./config/parameters.yaml"), Loader=yaml.FullLoader)) + audio2kp = AudioModel3D(opt).cuda() + + # 加载预训练模型 + checkpoint = torch.load(model_path) + kp_detector.load_state_dict(checkpoint["kp_detector"]) + generator.load_state_dict(checkpoint["generator"]) + audio2kp.load_state_dict(checkpoint["audio2kp"]) + + generator.eval() + kp_detector.eval() + audio2kp.eval() + + # 准备音频和姿态数据 + audio_f = [] + poses = [] + pad = np.zeros((4, 41), dtype=np.float32) + for i in range(0, frames, opt.seq_len // 2): + temp_audio = [] + temp_pos = [] + for j in range(opt.seq_len): + if i + j < frames: 
+ temp_audio.append(audio_feature[(i+j)*4:(i+j)*4+4]) + trans = ref_pose_trans[i + j] + rot = ref_pose_rot[i + j] + else: + temp_audio.append(pad) + trans = ref_pose_trans[-1] + rot = ref_pose_rot[-1] + + pose = np.zeros([256, 256]) + draw_annotation_box(pose, np.array(rot), np.array(trans)) + temp_pos.append(pose) + audio_f.append(temp_audio) + poses.append(temp_pos) + + audio_f = torch.from_numpy(np.array(audio_f, dtype=np.float32)).unsqueeze(0) + poses = torch.from_numpy(np.array(poses, dtype=np.float32)).unsqueeze(0) + + bs = audio_f.shape[1] + predictions_gen = [] + total_frames = 0 + + for bs_idx in range(bs): + t = {} + + t["audio"] = audio_f[:, bs_idx].cuda() + t["pose"] = poses[:, bs_idx].cuda() + t["id_img"] = img + kp_gen_source = kp_detector(img) + + gen_kp = audio2kp(t) + if bs_idx == 0: + startid = 0 + end_id = opt.seq_len // 4 * 3 + else: + startid = opt.seq_len // 4 + end_id = opt.seq_len // 4 * 3 + + for frame_bs_idx in range(startid, end_id): + tt = {} + tt["value"] = gen_kp["value"][:, frame_bs_idx] + if opt.estimate_jacobian: + tt["jacobian"] = gen_kp["jacobian"][:, frame_bs_idx] + out_gen = generator(img, kp_source=kp_gen_source, kp_driving=tt) + out_gen["kp_source"] = kp_gen_source + out_gen["kp_driving"] = tt + del out_gen['sparse_deformed'] + del out_gen['occlusion_map'] + del out_gen['deformed'] + predictions_gen.append( + (np.transpose(out_gen['prediction'].data.cpu().numpy(), [0, 2, 3, 1])[0] * 255).astype(np.uint8)) + + total_frames += 1 + if total_frames >= frames: + break + if total_frames >= frames: + break + + log_dir = save_path + temp_dir = os.path.join(log_dir, "temp") + if not os.path.exists(temp_dir): + os.makedirs(temp_dir) + image_name = os.path.basename(img_path)[:-4] + "_" + os.path.basename(audio_path)[:-4] + ".mp4" + + video_path = os.path.join(temp_dir, image_name) + + # 生成视频文件 + imageio.mimsave(video_path, predictions_gen, format='FFMPEG', fps=25.0) + + # 将音频合并到视频中 + save_video = os.path.join(log_dir, image_name) + cmd = 
r'ffmpeg -y -i "%s" -i "%s" -vcodec copy "%s"' % (video_path, audio_path, save_video) + os.system(cmd) + os.remove(video_path) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="将音频和图片转换为视频") + + # 修改音频和图片路径,固定在input_wav和input_img文件夹下 + parser.add_argument("--audio_filename", default="intro.wav", help="音频文件名,位于 ./input_wav/ 目录下") + parser.add_argument("--img_filename", default="paint.jpg", help="图片文件名,位于 ./input_img/ 目录下") + # 修改 save_path 的默认值为上两级目录的 out_video 文件夹 + parser.add_argument("--save_path", default=os.path.join("..", "..", "out_video"), help="保存路径") + parser.add_argument("--model_path", default=r"/app/Audio2Head/Audio2Head/checkpoints/audio2head.pth.tar", help="预训练模型路径") + + parse = parser.parse_args() + + # 构建完整的音频和图片路径 + audio_path = os.path.join("/app/Audio2Head/Audio2Head/input_wav", parse.audio_filename) + img_path = os.path.join("/app/Audio2Head/Audio2Head/input_img", parse.img_filename) + + + # 检查文件是否存在 + if not os.path.isfile(audio_path): + raise FileNotFoundError(f"音频文件未找到: {audio_path}") + if not os.path.isfile(img_path): + raise FileNotFoundError(f"图片文件未找到: {img_path}") + + os.makedirs(parse.save_path, exist_ok=True) + audio2head(audio_path, img_path, parse.model_path, parse.save_path) diff --git a/syncnet_python-master/Audio2Head/Audio2Head/input_img/Obama1.png b/syncnet_python-master/Audio2Head/Audio2Head/input_img/Obama1.png new file mode 100755 index 00000000..a2cb63ee Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/input_img/Obama1.png differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/input_wav/Obama1.wav b/syncnet_python-master/Audio2Head/Audio2Head/input_wav/Obama1.wav new file mode 100755 index 00000000..0d635d30 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/input_wav/Obama1.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aec27cf98ccf6a010ca1ed9fcadfaf1ba8f30734e94213ea610ad403d14962ad +size 38478798 diff --git 
a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2kp.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2kp.cpython-310.pyc new file mode 100755 index 00000000..432f67d2 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2kp.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2pose.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2pose.cpython-310.pyc new file mode 100755 index 00000000..fa213b83 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2pose.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/dense_motion.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/dense_motion.cpython-310.pyc new file mode 100755 index 00000000..b075d905 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/dense_motion.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/generator.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/generator.cpython-310.pyc new file mode 100755 index 00000000..3a51ef4b Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/generator.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/keypoint_detector.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/keypoint_detector.cpython-310.pyc new file mode 100755 index 00000000..3490cef9 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/keypoint_detector.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/resnet.cpython-310.pyc 
b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/resnet.cpython-310.pyc new file mode 100755 index 00000000..5e6d2832 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/resnet.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/util.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/util.cpython-310.pyc new file mode 100755 index 00000000..ee3a692b Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/util.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2kp.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2kp.py new file mode 100755 index 00000000..c7a61c17 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2kp.py @@ -0,0 +1,82 @@ +from torch import nn +import torch +import torch.nn.functional as F +from modules.util import AntiAliasInterpolation2d +from modules.util import Hourglass3D + +from modules.util import gaussian2kp +from sync_batchnorm import SynchronizedBatchNorm2d as BatchNorm2d + + +class AudioModel3D(nn.Module): + def __init__(self,opt): + super(AudioModel3D,self).__init__() + self.opt = opt + self.seq_len = opt.seq_len + self.pad = 0 + + self.down_id = AntiAliasInterpolation2d(3,0.25) + self.down_pose = AntiAliasInterpolation2d(opt.seq_len,0.25) + + self.embedding = nn.Sequential(nn.ConvTranspose2d(1, 8, (29, 14), stride=(1, 1), padding=(0, 11)), + BatchNorm2d(8), + nn.ReLU(inplace=True), + nn.Conv2d(8, 2, (13, 13), stride=(1, 1), padding=(6, 6))) + + num_channels = 6 + self.predictor = Hourglass3D(opt.block_expansion, in_features=num_channels, + max_features=opt.max_features, num_blocks=opt.num_blocks) + + self.kp = nn.Conv3d(in_channels=self.predictor.out_filters, out_channels=opt.num_kp, kernel_size=(7, 7, 7), + padding=(3,0,0)) + if opt.estimate_jacobian: + self.num_jacobian_maps = 
opt.num_kp + self.jacobian = nn.Conv2d(in_channels=self.predictor.out_filters, + out_channels=4 * self.num_jacobian_maps, kernel_size=(7, 7), padding=(0,0)) + self.jacobian.weight.data.zero_() + self.jacobian.bias.data.copy_(torch.tensor([1, 0, 0, 1] * self.num_jacobian_maps, dtype=torch.float)) + else: + self.jacobian = None + + self.temperature = 0.1 + + + def forward(self, x): + bs,_,_,c_dim = x["audio"].shape + + audio_embedding = self.embedding(x["audio"].reshape(-1,1,4,c_dim)) + audio_embedding = F.interpolate(audio_embedding,scale_factor=2).reshape(bs,self.opt.seq_len,2,64,64).permute(0,2,1,3,4) + + id_feature = self.down_id(x["id_img"]) + pose_feature = self.down_pose(x["pose"]) + + embeddings = torch.cat([audio_embedding,id_feature.unsqueeze(2).repeat(1,1,self.opt.seq_len,1,1),pose_feature.unsqueeze(1)],dim=1) + + feature_map = self.predictor(embeddings) + feature_shape = feature_map.shape + prediction = self.kp(feature_map).permute(0,2,1,3,4) + prediction = prediction.reshape(-1,prediction.shape[2],prediction.shape[3],prediction.shape[4]) + final_shape = prediction.shape + heatmap = prediction.view(final_shape[0], final_shape[1], -1) + heatmap = F.softmax(heatmap / self.temperature, dim=2) + heatmap = heatmap.view(*final_shape) + + out = gaussian2kp(heatmap) + out["value"] = out["value"].reshape(-1,self.opt.seq_len,self.opt.num_kp,2) + if self.jacobian is not None: + jacobian_map = self.jacobian(feature_map.permute(0,2,1,3,4).reshape(-1, feature_shape[1],feature_shape[3],feature_shape[4])) + + jacobian_map = jacobian_map.reshape(final_shape[0], self.num_jacobian_maps, 4, final_shape[2], + final_shape[3]) + out["jacobian_map"] = jacobian_map + heatmap = heatmap.unsqueeze(2) + + jacobian = heatmap * jacobian_map + jacobian = jacobian.view(final_shape[0], final_shape[1], 4, -1) + jacobian = jacobian.sum(dim=-1) + jacobian = jacobian.view(jacobian.shape[0], jacobian.shape[1], 2, 2) + out['jacobian'] = jacobian.reshape(-1,self.seq_len,self.opt.num_kp,2,2) + + 
out["pred_fature"] = prediction + return out + diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2pose.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2pose.py new file mode 100755 index 00000000..1d09a17a --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2pose.py @@ -0,0 +1,61 @@ +import torch +import torch.nn as nn +from modules.util import MyResNet34 +import numpy as np + +class audio2poseLSTM(nn.Module): + def __init__(self): + super(audio2poseLSTM,self).__init__() + + self.em_audio = MyResNet34(256, 1) + self.em_img = MyResNet34(256, 3) + + self.lstm = nn.LSTM(512,256,num_layers=2,bias=True,batch_first=True) + self.output = nn.Linear(256,6) + + def forward(self,x): + img_em = self.em_img(x['img']) + result = [self.output(img_em).unsqueeze(1)] + bs,seqlen,_,_ = x["audio"].shape + zero_state = torch.zeros((2,bs,256),requires_grad=True).to(img_em.device) + cur_state = (zero_state,zero_state) + audio = x["audio"].reshape(-1, 1, 4, 41) + audio_em = self.em_audio(audio).reshape(bs, seqlen, 256) + for i in range(seqlen): + + img_em,cur_state = self.lstm(torch.cat((audio_em[:,i:i+1],img_em.unsqueeze(1)),dim=2),cur_state) + img_em = img_em.reshape(-1, 256) + result.append(self.output(img_em).unsqueeze(1)) + res = torch.cat(result,dim=1) + return res + +def get_pose_from_audio(img,audio,model_path): + num_frame = len(audio) // 4 + minv = np.array([-0.639, -0.501, -0.47, -102.6, -32.5, 184.6], dtype=np.float32) + maxv = np.array([0.411, 0.547, 0.433, 159.1, 116.5, 376.5], dtype=np.float32) + + + generator = audio2poseLSTM().cuda() + + ckpt_para = torch.load(model_path) + + generator.load_state_dict(ckpt_para["audio2pose"]) + generator.eval() + + audio_seq = [] + for i in range(num_frame): + audio_seq.append(audio[i*4:i*4+4]) + + audio = torch.from_numpy(np.array(audio_seq,dtype=np.float32)).unsqueeze(0).cuda() + + x = {} + x["img"] = img + x["audio"] = audio + poses = generator(x) + + print(poses.shape) + poses 
= poses.cpu().data.numpy()[0] + + poses = (poses+1)/2*(maxv-minv)+minv + rot,trans = poses[:,:3].copy(),poses[:,3:].copy() + return rot,trans \ No newline at end of file diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/dense_motion.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/dense_motion.py new file mode 100755 index 00000000..2e32b4c8 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/dense_motion.py @@ -0,0 +1,113 @@ +from torch import nn +import torch.nn.functional as F +import torch +from modules.util import Hourglass, AntiAliasInterpolation2d, make_coordinate_grid, kp2gaussian + +class DenseMotionNetwork(nn.Module): + """ + Module that predicting a dense motion from sparse motion representation given by kp_source and kp_driving + """ + + def __init__(self, block_expansion, num_blocks, max_features, num_kp, num_channels, estimate_occlusion_map=False, + scale_factor=1, kp_variance=0.01): + super(DenseMotionNetwork, self).__init__() + self.hourglass = Hourglass(block_expansion=block_expansion, in_features=(num_kp + 1) * (num_channels + 1), + max_features=max_features, num_blocks=num_blocks) + + self.mask = nn.Conv2d(self.hourglass.out_filters, num_kp + 1, kernel_size=(7, 7), padding=(3, 3)) + + if estimate_occlusion_map: + self.occlusion = nn.Conv2d(self.hourglass.out_filters, 1, kernel_size=(7, 7), padding=(3, 3)) + else: + self.occlusion = None + + self.num_kp = num_kp + self.scale_factor = scale_factor + self.kp_variance = kp_variance + + if self.scale_factor != 1: + self.down = AntiAliasInterpolation2d(num_channels, self.scale_factor) + + def create_heatmap_representations(self, source_image, kp_driving, kp_source): + """ + Eq 6. 
in the paper H_k(z) + """ + spatial_size = source_image.shape[2:] + gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=self.kp_variance) + gaussian_source = kp2gaussian(kp_source, spatial_size=spatial_size, kp_variance=self.kp_variance) + heatmap = gaussian_driving - gaussian_source + + #adding background feature + zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], spatial_size[1]).type(heatmap.type()) + heatmap = torch.cat([zeros, heatmap], dim=1) + heatmap = heatmap.unsqueeze(2) + return heatmap + + def create_sparse_motions(self, source_image, kp_driving, kp_source): + """ + Eq 4. in the paper T_{s<-d}(z) + """ + bs, _, h, w = source_image.shape + identity_grid = make_coordinate_grid((h, w), type=kp_source['value'].type()) + identity_grid = identity_grid.view(1, 1, h, w, 2) + coordinate_grid = identity_grid - kp_driving['value'].view(bs, self.num_kp, 1, 1, 2) + if 'jacobian' in kp_driving: + jacobian = torch.matmul(kp_source['jacobian'], torch.inverse(kp_driving['jacobian'])) + jacobian = jacobian.unsqueeze(-3).unsqueeze(-3) + jacobian = jacobian.repeat(1, 1, h, w, 1, 1) + coordinate_grid = torch.matmul(jacobian, coordinate_grid.unsqueeze(-1)) + coordinate_grid = coordinate_grid.squeeze(-1) + + driving_to_source = coordinate_grid + kp_source['value'].view(bs, self.num_kp, 1, 1, 2) + + #adding background feature + identity_grid = identity_grid.repeat(bs, 1, 1, 1, 1) + sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1) + return sparse_motions + + def create_deformed_source_image(self, source_image, sparse_motions): + """ + Eq 7. 
in the paper \hat{T}_{s<-d}(z) + """ + bs, _, h, w = source_image.shape + source_repeat = source_image.unsqueeze(1).unsqueeze(1).repeat(1, self.num_kp + 1, 1, 1, 1, 1) + source_repeat = source_repeat.view(bs * (self.num_kp + 1), -1, h, w) + sparse_motions = sparse_motions.view((bs * (self.num_kp + 1), h, w, -1)) + sparse_deformed = F.grid_sample(source_repeat, sparse_motions) + # sparse_deformed = F.grid_sample(source_repeat, sparse_motions,align_corners = False) + sparse_deformed = sparse_deformed.view((bs, self.num_kp + 1, -1, h, w)) + return sparse_deformed + + def forward(self, source_image, kp_driving, kp_source): + if self.scale_factor != 1: + source_image = self.down(source_image) + + bs, _, h, w = source_image.shape + + out_dict = dict() + heatmap_representation = self.create_heatmap_representations(source_image, kp_driving, kp_source)#bs*(numkp+1)*1*h*w + sparse_motion = self.create_sparse_motions(source_image, kp_driving, kp_source)#bs*(numkp+1)*h*w*2 + deformed_source = self.create_deformed_source_image(source_image, sparse_motion) + out_dict['sparse_deformed'] = deformed_source + + input = torch.cat([heatmap_representation, deformed_source], dim=2)#bs*num+1*4*w*h + input = input.view(bs, -1, h, w) + + prediction = self.hourglass(input) + + mask = self.mask(prediction) + mask = F.softmax(mask, dim=1) + out_dict['mask'] = mask + mask = mask.unsqueeze(2)#bs*numkp+1*1*h*w + sparse_motion = sparse_motion.permute(0, 1, 4, 2, 3) + deformation = (sparse_motion * mask).sum(dim=1)# bs,2,64,64 + deformation = deformation.permute(0, 2, 3, 1)#bs*h*w*2 + + out_dict['deformation'] = deformation + + # Sec. 
3.2 in the paper + if self.occlusion: + occlusion_map = torch.sigmoid(self.occlusion(prediction)) + out_dict['occlusion_map'] = occlusion_map + + return out_dict diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/generator.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/generator.py new file mode 100755 index 00000000..6999b39d --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/generator.py @@ -0,0 +1,99 @@ +import torch +from torch import nn +import torch.nn.functional as F +from modules.util import ResBlock2d, SameBlock2d, UpBlock2d, DownBlock2d +from modules.dense_motion import DenseMotionNetwork + + +class OcclusionAwareGenerator(nn.Module): + """ + Generator that given source image and and keypoints try to transform image according to movement trajectories + induced by keypoints. Generator follows Johnson architecture. + """ + + def __init__(self, num_channels, num_kp, block_expansion, max_features, num_down_blocks, + num_bottleneck_blocks, estimate_occlusion_map=False, dense_motion_params=None, estimate_jacobian=False): + super(OcclusionAwareGenerator, self).__init__() + + if dense_motion_params is not None: + self.dense_motion_network = DenseMotionNetwork(num_kp=num_kp, num_channels=num_channels, + estimate_occlusion_map=estimate_occlusion_map, + **dense_motion_params) + else: + self.dense_motion_network = None + + self.first = SameBlock2d(num_channels, block_expansion, kernel_size=(7, 7), padding=(3, 3)) + + down_blocks = [] + for i in range(num_down_blocks): + in_features = min(max_features, block_expansion * (2 ** i)) + out_features = min(max_features, block_expansion * (2 ** (i + 1))) + down_blocks.append(DownBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1))) + self.down_blocks = nn.ModuleList(down_blocks) + + up_blocks = [] + for i in range(num_down_blocks): + in_features = min(max_features, block_expansion * (2 ** (num_down_blocks - i))) + out_features = min(max_features, block_expansion 
* (2 ** (num_down_blocks - i - 1))) + up_blocks.append(UpBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1))) + self.up_blocks = nn.ModuleList(up_blocks) + + self.bottleneck = torch.nn.Sequential() + in_features = min(max_features, block_expansion * (2 ** num_down_blocks)) + for i in range(num_bottleneck_blocks): + self.bottleneck.add_module('r' + str(i), ResBlock2d(in_features, kernel_size=(3, 3), padding=(1, 1))) + + self.final = nn.Conv2d(block_expansion, num_channels, kernel_size=(7, 7), padding=(3, 3)) + self.estimate_occlusion_map = estimate_occlusion_map + self.num_channels = num_channels + + def deform_input(self, inp, deformation): + _, h_old, w_old, _ = deformation.shape + _, _, h, w = inp.shape + if h_old != h or w_old != w: + deformation = deformation.permute(0, 3, 1, 2) + deformation = F.interpolate(deformation, size=(h, w), mode='bilinear') + deformation = deformation.permute(0, 2, 3, 1) + return F.grid_sample(inp, deformation) + # return F.grid_sample(inp, deformation,align_corners = False) + + def forward(self, source_image, kp_driving, kp_source): + # Encoding (downsampling) part + out = self.first(source_image) + for i in range(len(self.down_blocks)): + out = self.down_blocks[i](out) + + # Transforming feature representation according to deformation and occlusion + output_dict = {} + if self.dense_motion_network is not None: + dense_motion = self.dense_motion_network(source_image=source_image, kp_driving=kp_driving, + kp_source=kp_source) + output_dict['mask'] = dense_motion['mask'] + output_dict['sparse_deformed'] = dense_motion['sparse_deformed'] + output_dict['deformation'] = dense_motion['deformation'] + + if 'occlusion_map' in dense_motion: + occlusion_map = dense_motion['occlusion_map'] + output_dict['occlusion_map'] = occlusion_map + else: + occlusion_map = None + deformation = dense_motion['deformation'] + out = self.deform_input(out, deformation) + + if occlusion_map is not None: + if out.shape[2] != 
occlusion_map.shape[2] or out.shape[3] != occlusion_map.shape[3]: + occlusion_map = F.interpolate(occlusion_map, size=out.shape[2:], mode='bilinear') + out = out * occlusion_map + + output_dict["deformed"] = self.deform_input(source_image, deformation) + + # Decoding part + out = self.bottleneck(out) + for i in range(len(self.up_blocks)): + out = self.up_blocks[i](out) + out = self.final(out) + out = F.sigmoid(out) + + output_dict["prediction"] = out + + return output_dict diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/keypoint_detector.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/keypoint_detector.py new file mode 100755 index 00000000..15ab08a3 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/keypoint_detector.py @@ -0,0 +1,77 @@ +from torch import nn +import torch +import torch.nn.functional as F +from modules.util import Hourglass, make_coordinate_grid, AntiAliasInterpolation2d + + +class KPDetector(nn.Module): + """ + Detecting a keypoints. Return keypoint position and jacobian near each keypoint. 
+ """ + + def __init__(self, block_expansion, num_kp, num_channels, max_features, + num_blocks, temperature, estimate_jacobian=False, scale_factor=1, + single_jacobian_map=False, pad=0): + super(KPDetector, self).__init__() + + self.predictor = Hourglass(block_expansion, in_features=num_channels, + max_features=max_features, num_blocks=num_blocks) + + self.kp = nn.Conv2d(in_channels=self.predictor.out_filters, out_channels=num_kp, kernel_size=(7, 7), + padding=pad) + + if estimate_jacobian: + self.num_jacobian_maps = 1 if single_jacobian_map else num_kp + self.jacobian = nn.Conv2d(in_channels=self.predictor.out_filters, + out_channels=4 * self.num_jacobian_maps, kernel_size=(7, 7), padding=pad) + self.jacobian.weight.data.zero_() + self.jacobian.bias.data.copy_(torch.tensor([1, 0, 0, 1] * self.num_jacobian_maps, dtype=torch.float)) + else: + self.jacobian = None + + self.temperature = temperature + self.scale_factor = scale_factor + if self.scale_factor != 1: + self.down = AntiAliasInterpolation2d(num_channels, self.scale_factor) + + def gaussian2kp(self, heatmap): + """ + Extract the mean and from a heatmap + """ + shape = heatmap.shape + heatmap = heatmap.unsqueeze(-1) + grid = make_coordinate_grid(shape[2:], heatmap.type()).unsqueeze_(0).unsqueeze_(0) + value = (heatmap * grid).sum(dim=(2, 3)) + kp = {'value': value} + + return kp + + def forward(self, x): + if self.scale_factor != 1: + x = self.down(x) + + feature_map = self.predictor(x) + prediction = self.kp(feature_map) + + final_shape = prediction.shape + heatmap = prediction.view(final_shape[0], final_shape[1], -1) + heatmap = F.softmax(heatmap / self.temperature, dim=2) + heatmap = heatmap.view(*final_shape) + + out = self.gaussian2kp(heatmap) + + if self.jacobian is not None: + jacobian_map = self.jacobian(feature_map) + + jacobian_map = jacobian_map.reshape(final_shape[0], self.num_jacobian_maps, 4, final_shape[2], + final_shape[3]) + out["jacobian_map"] = jacobian_map + heatmap = heatmap.unsqueeze(2) + 
+ jacobian = heatmap * jacobian_map + jacobian = jacobian.view(final_shape[0], final_shape[1], 4, -1) + jacobian = jacobian.sum(dim=-1) + jacobian = jacobian.view(jacobian.shape[0], jacobian.shape[1], 2, 2) + out['jacobian'] = jacobian + out["pred_fature"] = prediction + return out diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/resnet.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/resnet.py new file mode 100755 index 00000000..06e23cc6 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/resnet.py @@ -0,0 +1,203 @@ +import torch +import torch.nn as nn + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: 
+ identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + +class ResNet(nn.Module): + + def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None,input_channel = 3): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = 
groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(input_channel, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, + dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + + return x + +def _resnet(arch, block, layers, pretrained, progress, **kwargs): + model = ResNet(block, layers, **kwargs) + return model + +def resnet34(pretrained=False, progress=True, **kwargs): + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) \ No newline at end of file diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/util.py 
# modules/util.py — shared building blocks (hourglass encoder/decoder, 2D/3D
# res/up/down blocks, anti-aliased downsampling, keypoint/gaussian helpers).

from torch import nn

import torch.nn.functional as F
import torch

from sync_batchnorm import SynchronizedBatchNorm2d as BatchNorm2d
from sync_batchnorm import SynchronizedBatchNorm3d as BatchNorm3d
from modules.resnet import resnet34


def gaussian2kp(heatmap):
    """Extract the soft-argmax mean of each heatmap as a keypoint location.

    heatmap: assumed (batch, num_kp, h, w), normalized over the spatial dims.
    Returns {'value': (batch, num_kp, 2)} with coordinates in [-1, 1].
    """
    shape = heatmap.shape
    heatmap = heatmap.unsqueeze(-1)
    grid = make_coordinate_grid(shape[2:], heatmap.type()).unsqueeze_(0).unsqueeze_(0)
    value = (heatmap * grid).sum(dim=(2, 3))
    kp = {'value': value}

    return kp


def kp2gaussian(kp, spatial_size, kp_variance):
    """Transform keypoints into gaussian-like heatmap representations."""
    mean = kp['value']  # (bs, num_kp, 2)

    coordinate_grid = make_coordinate_grid(spatial_size, mean.type())  # (h, w, 2)
    number_of_leading_dimensions = len(mean.shape) - 1
    shape = (1,) * number_of_leading_dimensions + coordinate_grid.shape  # (1, 1, h, w, 2)
    coordinate_grid = coordinate_grid.view(*shape)
    repeats = mean.shape[:number_of_leading_dimensions] + (1, 1, 1)
    coordinate_grid = coordinate_grid.repeat(*repeats)  # (bs, num_kp, h, w, 2)

    # Broadcast the keypoint means against the grid.
    shape = mean.shape[:number_of_leading_dimensions] + (1, 1, 2)
    mean = mean.view(*shape)

    mean_sub = (coordinate_grid - mean)

    out = torch.exp(-0.5 * (mean_sub ** 2).sum(-1) / kp_variance)

    return out


def make_coordinate_grid(spatial_size, type):
    """Create a meshgrid [-1,1] x [-1,1] of the given spatial_size.

    `type` is a tensor type string (as returned by Tensor.type()); the name is
    kept for backward compatibility even though it shadows the builtin.
    """
    h, w = spatial_size
    x = torch.arange(w).type(type)
    y = torch.arange(h).type(type)

    x = (2 * (x / (w - 1)) - 1)
    y = (2 * (y / (h - 1)) - 1)

    yy = y.view(-1, 1).repeat(1, w)
    xx = x.view(1, -1).repeat(h, 1)

    meshed = torch.cat([xx.unsqueeze_(2), yy.unsqueeze_(2)], 2)

    return meshed


class ResBlock2d(nn.Module):
    """Pre-activation residual block; preserves spatial resolution."""

    def __init__(self, in_features, kernel_size, padding):
        super(ResBlock2d, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_features, out_channels=in_features,
                               kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv2d(in_channels=in_features, out_channels=in_features,
                               kernel_size=kernel_size, padding=padding)
        self.norm1 = BatchNorm2d(in_features, affine=True)
        self.norm2 = BatchNorm2d(in_features, affine=True)

    def forward(self, x):
        out = self.norm1(x)
        out = F.relu(out, inplace=True)
        out = self.conv1(out)
        out = self.norm2(out)
        out = F.relu(out, inplace=True)
        out = self.conv2(out)
        out += x
        return out


class ResBlock3d(nn.Module):
    """3D pre-activation residual block; preserves spatial resolution."""

    def __init__(self, in_features, kernel_size, padding):
        super(ResBlock3d, self).__init__()
        self.conv1 = nn.Conv3d(in_channels=in_features, out_channels=in_features,
                               kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv3d(in_channels=in_features, out_channels=in_features,
                               kernel_size=kernel_size, padding=padding)
        self.norm1 = BatchNorm3d(in_features, affine=True)
        self.norm2 = BatchNorm3d(in_features, affine=True)

    def forward(self, x):
        out = self.norm1(x)
        out = F.relu(out, inplace=True)
        out = self.conv1(out)
        out = self.norm2(out)
        out = F.relu(out, inplace=True)
        out = self.conv2(out)
        out += x
        return out


class UpBlock2d(nn.Module):
    """2x upsampling block for use in decoders."""

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super(UpBlock2d, self).__init__()

        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)
        self.norm = BatchNorm2d(out_features, affine=True)

    def forward(self, x):
        out = F.interpolate(x, scale_factor=2)
        del x  # free the pre-upsample tensor early
        out = self.conv(out)
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        return out


class UpBlock3d(nn.Module):
    """2x 3D upsampling block (conv + residual refinement) for decoders."""

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super(UpBlock3d, self).__init__()

        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)
        self.norm = BatchNorm3d(out_features, affine=True)
        self.res = ResBlock3d(out_features, kernel_size, padding)
        self.norm2 = BatchNorm3d(out_features, affine=True)

    def forward(self, x):
        out = F.interpolate(x, scale_factor=2)
        out = self.conv(out)
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        out = self.res(out)
        out = self.norm2(out)
        out = F.relu(out, inplace=True)
        return out


class DownBlock2d(nn.Module):
    """2x downsampling block for use in encoders."""

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super(DownBlock2d, self).__init__()
        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)
        self.norm = BatchNorm2d(out_features, affine=True)
        self.pool = nn.AvgPool2d(kernel_size=(2, 2))

    def forward(self, x):
        out = self.conv(x)
        del x  # free the input tensor early
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        out = self.pool(out)
        return out


class DownBlock3d(nn.Module):
    """2x 3D downsampling block (residual refinement + conv) for encoders."""

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super(DownBlock3d, self).__init__()

        self.res = ResBlock3d(in_features=in_features, kernel_size=kernel_size, padding=padding)
        self.norm_res = BatchNorm3d(in_features, affine=True)
        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)

        self.norm = BatchNorm3d(out_features, affine=True)
        self.pool = nn.AvgPool3d(kernel_size=(2, 2, 2))

    def forward(self, x):
        out = self.res(x)
        out = self.norm_res(out)
        out = F.relu(out, inplace=True)
        out = self.conv(out)
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        out = self.pool(out)
        return out


class SameBlock2d(nn.Module):
    """Simple conv block; preserves spatial resolution."""

    def __init__(self, in_features, out_features, groups=1, kernel_size=3, padding=1):
        super(SameBlock2d, self).__init__()
        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)
        self.norm = BatchNorm2d(out_features, affine=True)

    def forward(self, x):
        out = self.conv(x)
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        return out


class Encoder(nn.Module):
    """Hourglass encoder: returns the input plus every downsampled feature map."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Encoder, self).__init__()

        down_blocks = []
        for i in range(num_blocks):
            down_blocks.append(DownBlock2d(in_features if i == 0 else min(max_features, block_expansion * (2 ** i)),
                                           min(max_features, block_expansion * (2 ** (i + 1))),
                                           kernel_size=3, padding=1))
        self.down_blocks = nn.ModuleList(down_blocks)

    def forward(self, x):
        outs = [x]
        for down_block in self.down_blocks:
            outs.append(down_block(outs[-1]))
        return outs


class Encoder3D(nn.Module):
    """3D hourglass encoder: returns the input plus every downsampled feature map."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Encoder3D, self).__init__()

        down_blocks = []
        for i in range(num_blocks):
            down_blocks.append(DownBlock3d(in_features if i == 0 else min(max_features, block_expansion * (2 ** i)),
                                           min(max_features, block_expansion * (2 ** (i + 1))),
                                           kernel_size=3, padding=1))
        self.down_blocks = nn.ModuleList(down_blocks)

    def forward(self, x):
        outs = [x]
        for down_block in self.down_blocks:
            outs.append(down_block(outs[-1]))
        return outs


class Decoder(nn.Module):
    """Hourglass decoder: upsamples and concatenates the matching skip connection."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Decoder, self).__init__()

        up_blocks = []

        for i in range(num_blocks)[::-1]:
            # Deepest block takes a single feature map; the rest take the
            # concatenation of the previous output and a skip connection.
            in_filters = (1 if i == num_blocks - 1 else 2) * min(max_features, block_expansion * (2 ** (i + 1)))
            out_filters = min(max_features, block_expansion * (2 ** i))
            up_blocks.append(UpBlock2d(in_filters, out_filters, kernel_size=3, padding=1))

        self.up_blocks = nn.ModuleList(up_blocks)
        self.out_filters = block_expansion + in_features

    def forward(self, x):
        # `x` is the list of encoder outputs; consumed deepest-first.
        out = x.pop()
        for up_block in self.up_blocks:
            out = up_block(out)
            skip = x.pop()
            out = torch.cat([out, skip], dim=1)
        return out


class Decoder3D(nn.Module):
    """3D hourglass decoder with residual refinement on each skip connection."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Decoder3D, self).__init__()

        up_blocks = []
        res_blocks = []

        for i in range(num_blocks)[::-1]:
            in_filters = (1 if i == num_blocks - 1 else 2) * min(max_features, block_expansion * (2 ** (i + 1)))
            out_filters = min(max_features, block_expansion * (2 ** i))
            up_blocks.append(UpBlock3d(in_filters, out_filters, kernel_size=3, padding=1))
            # The shallowest skip is the raw input, hence in_features channels.
            if i > 0:
                res_blocks.append(nn.Sequential(ResBlock3d(out_filters, kernel_size=3, padding=1),
                                                BatchNorm3d(out_filters), nn.ReLU(inplace=True)))
            else:
                res_blocks.append(nn.Sequential(ResBlock3d(in_features, kernel_size=3, padding=1),
                                                BatchNorm3d(in_features), nn.ReLU(inplace=True)))
        self.res_blocks = nn.ModuleList(res_blocks)
        self.up_blocks = nn.ModuleList(up_blocks)
        self.out_filters = block_expansion + in_features

    def forward(self, x):
        out = x.pop()
        for up_block, res_bl in zip(self.up_blocks, self.res_blocks):
            out = up_block(out)
            skip = x.pop()
            out = torch.cat([out, res_bl(skip)], dim=1)
        return out


class Hourglass(nn.Module):
    """Hourglass architecture (encoder + decoder with skip connections)."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Hourglass, self).__init__()
        self.encoder = Encoder(block_expansion, in_features, num_blocks, max_features)
        self.decoder = Decoder(block_expansion, in_features, num_blocks, max_features)
        self.out_filters = self.decoder.out_filters

    def forward(self, x):
        return self.decoder(self.encoder(x))


class Hourglass3D(nn.Module):
    """3D hourglass architecture (encoder + decoder with skip connections)."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Hourglass3D, self).__init__()
        self.encoder = Encoder3D(block_expansion, in_features, num_blocks, max_features)
        self.decoder = Decoder3D(block_expansion, in_features, num_blocks, max_features)
        self.out_filters = self.decoder.out_filters

    def forward(self, x):
        return self.decoder(self.encoder(x))


class AntiAliasInterpolation2d(nn.Module):
    """Band-limited downsampling: gaussian blur followed by interpolation,
    for better preservation of the input signal."""

    def __init__(self, channels, scale):
        super(AntiAliasInterpolation2d, self).__init__()
        # Blur strength derived from the downscale factor.
        sigma = (1 / scale - 1) / 2
        kernel_size = 2 * round(sigma * 4) + 1
        self.ka = kernel_size // 2
        self.kb = self.ka - 1 if kernel_size % 2 == 0 else self.ka

        kernel_size = [kernel_size, kernel_size]
        sigma = [sigma, sigma]
        # The gaussian kernel is the product of the gaussian function of each dimension.
        kernel = 1
        meshgrids = torch.meshgrid(
            [
                torch.arange(size, dtype=torch.float32)
                for size in kernel_size
            ]
        )
        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
            mean = (size - 1) / 2
            kernel *= torch.exp(-(mgrid - mean) ** 2 / (2 * std ** 2))

        # Make sure the gaussian kernel sums to 1.
        kernel = kernel / torch.sum(kernel)
        # Reshape to a depthwise convolutional weight.
        kernel = kernel.view(1, 1, *kernel.size())
        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))

        self.register_buffer('weight', kernel)
        self.groups = channels
        self.scale = scale

    def forward(self, input):
        if self.scale == 1.0:
            return input

        out = F.pad(input, (self.ka, self.kb, self.ka, self.kb))
        out = F.conv2d(out, weight=self.weight, groups=self.groups)
        out = F.interpolate(out, scale_factor=(self.scale, self.scale))

        return out


class MyResNet34(nn.Module):
    """Thin wrapper binding resnet34 to the synchronized BatchNorm layer."""

    def __init__(self, embedding_dim, input_channel=3):
        super(MyResNet34, self).__init__()
        self.resnet = resnet34(norm_layer=BatchNorm2d, num_classes=embedding_dim,
                               input_channel=input_channel)

    def forward(self, x):
        return self.resnet(x)
#!/bin/bash
# Batch driver: for every .wav in WAV_DIR, find a same-named image in IMG_DIR
# and run Audio2Head inference on the pair. Failures are reported and skipped.

IMG_DIR="/app/Audio2Head/Audio2Head/input_img"
WAV_DIR="/app/Audio2Head/Audio2Head/input_wav"
OUT_DIR="/app/out_video"

# Ensure we run from the project directory.
cd /app/Audio2Head/Audio2Head || exit
# Create the output directory if it does not exist.
mkdir -p "$OUT_DIR"

# Supported image extensions, tried in order.
IMG_EXTENSIONS=("png" "jpg" "jpeg")

for audio_path in "$WAV_DIR"/*.wav; do
    # Unexpanded glob means there were no .wav files at all.
    if [ ! -e "$audio_path" ]; then
        echo "No .wav files found in $WAV_DIR"
        exit 1
    fi

    # Base name of the audio file without its extension.
    audio_filename=$(basename "$audio_path")
    base_name="${audio_filename%.*}"

    # Look for a matching image file.
    img_file=""
    for ext in "${IMG_EXTENSIONS[@]}"; do
        potential_img="$IMG_DIR/$base_name.$ext"
        if [ -f "$potential_img" ]; then
            img_file="$potential_img"
            break
        fi
    done

    # No matching image: warn and move on to the next audio file.
    if [ -z "$img_file" ]; then
        echo "Warning: No matching image file found for $audio_filename in $IMG_DIR"
        continue
    fi

    img_filename=$(basename "$img_file")

    echo "Processing: $audio_filename and $img_filename"

    # Run inference; on failure report and continue with the next pair.
    if ! python /app/Audio2Head/Audio2Head/inference.py --audio_filename "$audio_filename" --img_filename "$img_filename"; then
        echo "Error: Processing failed for $audio_filename and $img_filename"
        continue
    fi

    echo "Successfully processed $audio_filename and $img_filename"
done

echo "All matching files have been processed."
# -*- coding: utf-8 -*-
# File   : __init__.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.

# Public re-exports of the synchronized BatchNorm layers and the
# DataParallel replication helpers.
from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d
from .replicate import DataParallelWithCallback, patch_replication_callback
# -*- coding: utf-8 -*-
# File   : batchnorm.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.

import collections

import torch
import torch.nn.functional as F

from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast

from .comm import SyncMaster

__all__ = ['SynchronizedBatchNorm1d', 'SynchronizedBatchNorm2d', 'SynchronizedBatchNorm3d']


def _sum_ft(tensor):
    """Sum over the first and last dimension."""
    return tensor.sum(dim=0).sum(dim=-1)


def _unsqueeze_ft(tensor):
    """Add new dimensions at the front and the tail."""
    return tensor.unsqueeze(0).unsqueeze(-1)


_ChildMessage = collections.namedtuple('_ChildMessage', ['sum', 'ssum', 'sum_size'])
_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])


class _SynchronizedBatchNorm(_BatchNorm):
    """BatchNorm base that synchronizes statistics across DataParallel replicas.

    The master replica reduces sums from all slaves, computes the batch mean and
    inverse std, and broadcasts them back, so every device normalizes with the
    same statistics.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super(_SynchronizedBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)

        self._sync_master = SyncMaster(self._data_parallel_master)

        self._is_parallel = False
        self._parallel_id = None
        self._slave_pipe = None

    def forward(self, input):
        # If it is not parallel computation or is in evaluation mode, use PyTorch's implementation.
        if not (self._is_parallel and self.training):
            return F.batch_norm(
                input, self.running_mean, self.running_var, self.weight, self.bias,
                self.training, self.momentum, self.eps)

        # Resize the input to (B, C, -1).
        input_shape = input.size()
        input = input.view(input.size(0), self.num_features, -1)

        # Compute the sum and square-sum.
        sum_size = input.size(0) * input.size(2)
        input_sum = _sum_ft(input)
        input_ssum = _sum_ft(input ** 2)

        # Reduce-and-broadcast the statistics.
        if self._parallel_id == 0:
            mean, inv_std = self._sync_master.run_master(_ChildMessage(input_sum, input_ssum, sum_size))
        else:
            mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(input_sum, input_ssum, sum_size))

        # Compute the output.
        if self.affine:
            # MJY:: Fuse the multiplication for speed.
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std * self.weight) + _unsqueeze_ft(self.bias)
        else:
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std)

        # Reshape it.
        return output.view(input_shape)

    def __data_parallel_replicate__(self, ctx, copy_id):
        self._is_parallel = True
        self._parallel_id = copy_id

        # parallel_id == 0 means master device.
        if self._parallel_id == 0:
            ctx.sync_master = self._sync_master
        else:
            self._slave_pipe = ctx.sync_master.register_slave(copy_id)

    def _data_parallel_master(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast it."""

        # Always using the same "device order" makes the ReduceAdd operation faster.
        # Thanks to:: Tete Xiao (http://tetexiao.com/)
        intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]  # flatten
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

        return outputs

    def _compute_mean_std(self, sum_, ssum, size):
        """Compute the mean and standard-deviation with sum and square-sum. This method
        also maintains the moving average on the master device."""
        assert size > 1, 'BatchNorm computes unbiased standard-deviation, which requires size > 1.'
        mean = sum_ / size
        sumvar = ssum - sum_ * mean
        unbias_var = sumvar / (size - 1)
        bias_var = sumvar / size

        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data

        # Return biased variance clamped at eps, as inverse std.
        return mean, bias_var.clamp(self.eps) ** -0.5


class SynchronizedBatchNorm1d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 2d or 3d input (N, C) or
    (N, C, L).

    Unlike the built-in :class:`torch.nn.BatchNorm1d`, the mean and
    standard-deviation are reduced across all devices during training, so the
    statistics are computed over the whole mini-batch rather than per device.
    For one-GPU or CPU-only use this behaves exactly like the built-in layer.

    Args:
        num_features: C from an expected input of size (N, C) or (N, C, L)
        eps: value added to the denominator for numerical stability. Default: 1e-5
        momentum: value used for the running_mean/running_var update. Default: 0.1
        affine: when ``True``, this module has learnable affine parameters. Default: ``True``
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError('expected 2D or 3D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm1d, self)._check_input_dim(input)


class SynchronizedBatchNorm2d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 4d input (N, C, H, W).

    Unlike the built-in :class:`torch.nn.BatchNorm2d`, the mean and
    standard-deviation are reduced across all devices during training.
    For one-GPU or CPU-only use this behaves exactly like the built-in layer.

    Args:
        num_features: C from an expected input of size (N, C, H, W)
        eps: value added to the denominator for numerical stability. Default: 1e-5
        momentum: value used for the running_mean/running_var update. Default: 0.1
        affine: when ``True``, this module has learnable affine parameters. Default: ``True``
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError('expected 4D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm2d, self)._check_input_dim(input)


class SynchronizedBatchNorm3d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 5d input (N, C, D, H, W).

    Unlike the built-in :class:`torch.nn.BatchNorm3d`, the mean and
    standard-deviation are reduced across all devices during training.
    For one-GPU or CPU-only use this behaves exactly like the built-in layer.

    Args:
        num_features: C from an expected input of size (N, C, D, H, W)
        eps: value added to the denominator for numerical stability. Default: 1e-5
        momentum: value used for the running_mean/running_var update. Default: 0.1
        affine: when ``True``, this module has learnable affine parameters. Default: ``True``
    """

    # NOTE(review): this class is truncated at the chunk boundary in the source
    # diff; the check below is reconstructed from the canonical upstream
    # Synchronized-BatchNorm-PyTorch implementation — confirm against the
    # remainder of the patch.
    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError('expected 5D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm3d, self)._check_input_dim(input)
Default: ``True`` + + Shape: + - Input: :math:`(N, C, D, H, W)` + - Output: :math:`(N, C, D, H, W)` (same shape as input) + + Examples: + >>> # With Learnable Parameters + >>> m = SynchronizedBatchNorm3d(100) + >>> # Without Learnable Parameters + >>> m = SynchronizedBatchNorm3d(100, affine=False) + >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45, 10)) + >>> output = m(input) + """ + + def _check_input_dim(self, input): + if input.dim() != 5: + raise ValueError('expected 5D input (got {}D input)' + .format(input.dim())) + super(SynchronizedBatchNorm3d, self)._check_input_dim(input) diff --git a/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/comm.py b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/comm.py new file mode 100755 index 00000000..b66ec4ae --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/comm.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +# File : comm.py +# Author : Jiayuan Mao +# Email : maojiayuan@gmail.com +# Date : 27/01/2018 +# +# This file is part of Synchronized-BatchNorm-PyTorch. +# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch +# Distributed under MIT License. + +import queue +import collections +import threading + +__all__ = ['FutureResult', 'SlavePipe', 'SyncMaster'] + + +class FutureResult(object): + """A thread-safe future implementation. Used only as one-to-one pipe.""" + + def __init__(self): + self._result = None + self._lock = threading.Lock() + self._cond = threading.Condition(self._lock) + + def put(self, result): + with self._lock: + assert self._result is None, 'Previous result has\'t been fetched.' 
+ self._result = result + self._cond.notify() + + def get(self): + with self._lock: + if self._result is None: + self._cond.wait() + + res = self._result + self._result = None + return res + + +_MasterRegistry = collections.namedtuple('MasterRegistry', ['result']) +_SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result']) + + +class SlavePipe(_SlavePipeBase): + """Pipe for master-slave communication.""" + + def run_slave(self, msg): + self.queue.put((self.identifier, msg)) + ret = self.result.get() + self.queue.put(True) + return ret + + +class SyncMaster(object): + """An abstract `SyncMaster` object. + + - During the replication, as the data parallel will trigger an callback of each module, all slave devices should + call `register(id)` and obtain an `SlavePipe` to communicate with the master. + - During the forward pass, master device invokes `run_master`, all messages from slave devices will be collected, + and passed to a registered callback. + - After receiving the messages, the master device should gather the information and determine to message passed + back to each slave devices. + """ + + def __init__(self, master_callback): + """ + + Args: + master_callback: a callback to be invoked after having collected messages from slave devices. + """ + self._master_callback = master_callback + self._queue = queue.Queue() + self._registry = collections.OrderedDict() + self._activated = False + + def __getstate__(self): + return {'master_callback': self._master_callback} + + def __setstate__(self, state): + self.__init__(state['master_callback']) + + def register_slave(self, identifier): + """ + Register an slave device. + + Args: + identifier: an identifier, usually is the device id. + + Returns: a `SlavePipe` object which can be used to communicate with the master device. + + """ + if self._activated: + assert self._queue.empty(), 'Queue is not clean before next initialization.' 
+ self._activated = False + self._registry.clear() + future = FutureResult() + self._registry[identifier] = _MasterRegistry(future) + return SlavePipe(identifier, self._queue, future) + + def run_master(self, master_msg): + """ + Main entry for the master device in each forward pass. + The messages were first collected from each devices (including the master device), and then + an callback will be invoked to compute the message to be sent back to each devices + (including the master device). + + Args: + master_msg: the message that the master want to send to itself. This will be placed as the first + message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example. + + Returns: the message to be sent back to the master device. + + """ + self._activated = True + + intermediates = [(0, master_msg)] + for i in range(self.nr_slaves): + intermediates.append(self._queue.get()) + + results = self._master_callback(intermediates) + assert results[0][0] == 0, 'The first result should belongs to the master.' + + for i, res in results: + if i == 0: + continue + self._registry[i].result.put(res) + + for i in range(self.nr_slaves): + assert self._queue.get() is True + + return results[0][1] + + @property + def nr_slaves(self): + return len(self._registry) diff --git a/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/replicate.py b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/replicate.py new file mode 100755 index 00000000..9b97380d --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/replicate.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +# File : replicate.py +# Author : Jiayuan Mao +# Email : maojiayuan@gmail.com +# Date : 27/01/2018 +# +# This file is part of Synchronized-BatchNorm-PyTorch. +# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch +# Distributed under MIT License. 
+ +import functools + +from torch.nn.parallel.data_parallel import DataParallel + +__all__ = [ + 'CallbackContext', + 'execute_replication_callbacks', + 'DataParallelWithCallback', + 'patch_replication_callback' +] + + +class CallbackContext(object): + pass + + +def execute_replication_callbacks(modules): + """ + Execute an replication callback `__data_parallel_replicate__` on each module created by original replication. + + The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` + + Note that, as all modules are isomorphism, we assign each sub-module with a context + (shared among multiple copies of this module on different devices). + Through this context, different copies can share some information. + + We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback + of any slave copies. + """ + master_copy = modules[0] + nr_modules = len(list(master_copy.modules())) + ctxs = [CallbackContext() for _ in range(nr_modules)] + + for i, module in enumerate(modules): + for j, m in enumerate(module.modules()): + if hasattr(m, '__data_parallel_replicate__'): + m.__data_parallel_replicate__(ctxs[j], i) + + +class DataParallelWithCallback(DataParallel): + """ + Data Parallel with a replication callback. + + An replication callback `__data_parallel_replicate__` of each module will be invoked after being created by + original `replicate` function. + The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` + + Examples: + > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) + > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) + # sync_bn.__data_parallel_replicate__ will be invoked. 
+ """ + + def replicate(self, module, device_ids): + modules = super(DataParallelWithCallback, self).replicate(module, device_ids) + execute_replication_callbacks(modules) + return modules + + +def patch_replication_callback(data_parallel): + """ + Monkey-patch an existing `DataParallel` object. Add the replication callback. + Useful when you have customized `DataParallel` implementation. + + Examples: + > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) + > sync_bn = DataParallel(sync_bn, device_ids=[0, 1]) + > patch_replication_callback(sync_bn) + # this is equivalent to + > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) + > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) + """ + + assert isinstance(data_parallel, DataParallel) + + old_replicate = data_parallel.replicate + + @functools.wraps(old_replicate) + def new_replicate(module, device_ids): + modules = old_replicate(module, device_ids) + execute_replication_callbacks(modules) + return modules + + data_parallel.replicate = new_replicate diff --git a/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/unittest.py b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/unittest.py new file mode 100755 index 00000000..9716d035 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/unittest.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# File : unittest.py +# Author : Jiayuan Mao +# Email : maojiayuan@gmail.com +# Date : 27/01/2018 +# +# This file is part of Synchronized-BatchNorm-PyTorch. +# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch +# Distributed under MIT License. 
+ +import unittest + +import numpy as np +from torch.autograd import Variable + + +def as_numpy(v): + if isinstance(v, Variable): + v = v.data + return v.cpu().numpy() + + +class TorchTestCase(unittest.TestCase): + def assertTensorClose(self, a, b, atol=1e-3, rtol=1e-3): + npa, npb = as_numpy(a), as_numpy(b) + self.assertTrue( + np.allclose(npa, npb, atol=atol), + 'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max()) + ) diff --git a/syncnet_python-master/Dockerfile b/syncnet_python-master/Dockerfile new file mode 100755 index 00000000..6141f535 --- /dev/null +++ b/syncnet_python-master/Dockerfile @@ -0,0 +1,67 @@ +# 使用 CUDA 11.3.1 基础镜像 +FROM nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu20.04 + +# 设置非交互模式,避免交互式安装 +ENV DEBIAN_FRONTEND=noninteractive + +# 更新系统并添加支持 Python 3.9 的 PPA 源 +RUN apt-get update && apt-get install -y software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update + +# 安装 Python 3.9、开发头文件、构建工具和基础依赖 +RUN apt-get install -y \ + python3.9 \ + python3.9-dev \ + python3.9-distutils \ + build-essential \ + ffmpeg \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# 手动安装最新版本的 pip,确保正确安装到 Python 3.9 环境 +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 + +# 使用清华源配置 pip +RUN python3.9 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/ && \ + python3.9 -m pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn + +# 创建 python 的符号链接指向 python3.9 +RUN ln -s /usr/bin/python3.9 /usr/bin/python + +# 确认 pip 已正确安装 +RUN python3.9 -m pip --version + +# 设置工作目录 +WORKDIR /app + +# 复制只包含依赖的文件,利用缓存 +COPY requirements.txt /app/ + +# 升级 pip 并安装项目依赖(不包含 torch, torchvision, torchaudio) +RUN python3.9 -m pip install --upgrade pip && \ + python3.9 -m pip install --progress-bar=on -r requirements.txt --no-cache-dir && \ + python3.9 -m pip install cupy-cuda113 imageio[ffmpeg] + +# 升级 pip 并安装 PyTorch 及其依赖(针对 CUDA 11.3),显示下载进度 
+RUN python3.9 -m pip install --upgrade pip && \ + python3.9 -m pip install --progress-bar=on torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://mirrors.aliyun.com/pytorch-wheels/cu113/ + +# 复制剩余的项目文件 +COPY . /app + +# 打印 Python 和 pip 版本以验证安装 +RUN python3.9 --version +RUN python3.9 -m pip --version + +# 创建必要的文件夹 +RUN mkdir -p /app/ref_video \ + && mkdir -p /app/out_video \ + && mkdir -p /app/Audio2Head/Audio2Head/input_wav \ + && mkdir -p /app/Audio2Head/Audio2Head/input_img + +# 确保脚本有执行权限 +RUN chmod +x /app/Audio2Head/Audio2Head/run_inference.sh + +# 定义启动命令 +CMD ["bash", "-c", "/app/Audio2Head/Audio2Head/run_inference.sh && python /app/batch_psnr.py"] diff --git a/syncnet_python-master/FID.py b/syncnet_python-master/FID.py new file mode 100755 index 00000000..7a5a7f9a --- /dev/null +++ b/syncnet_python-master/FID.py @@ -0,0 +1,147 @@ +import cv2 +import numpy as np +import torch +from sklearn.metrics.pairwise import cosine_similarity +from scipy.linalg import sqrtm +from torchvision import models, transforms +from PIL import Image +import sys +import os +from datetime import datetime + +# 设置设备为GPU(如果可用) +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# 加载预训练的InceptionV3模型,并去掉最后的分类层 +inception_model = models.inception_v3(pretrained=True).to(device) +inception_model.fc = torch.nn.Identity() # 去掉最后的分类层 +inception_model.eval() + +# 定义图像预处理函数 +preprocess = transforms.Compose([ + transforms.Resize(299), # InceptionV3的输入大小 + transforms.CenterCrop(299), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +]) + +# 计算图像特征 +def get_features_batch(images, batch_size=32): + features = [] + num_images = len(images) + for i in range(0, num_images, batch_size): + batch_images = images[i:i+batch_size] + batch_pil = [Image.fromarray(img) for img in batch_images] + batch_tensor = torch.stack([preprocess(img) for img in batch_pil]).to(device) + with torch.no_grad(): + 
batch_features = inception_model(batch_tensor) + features.append(batch_features.cpu().numpy()) + return np.vstack(features) + +# 计算FID分数 +def calculate_fid(features1, features2): + mu1, sigma1 = features1.mean(axis=0), np.cov(features1, rowvar=False) + mu2, sigma2 = features2.mean(axis=0), np.cov(features2, rowvar=False) + + ssdiff = np.sum((mu1 - mu2)**2.0) + covmean, _ = sqrtm(sigma1.dot(sigma2), disp=False) + + if np.iscomplexobj(covmean): + covmean = covmean.real + + fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean) + return fid + +# 日志记录函数 +def log_fid(log_file_path, ref_video_path, out_video_path, frame_count, fid_score): + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + log_entry = f"{timestamp},\"{ref_video_path}\",\"{out_video_path}\",{frame_count},{fid_score}\n" + with open(log_file_path, "a") as log_file: + # 如果文件是空的,写入表头 + if os.path.getsize(log_file_path) == 0: + log_file.write("Timestamp,Reference_Video_Path,Output_Video_Path,Frame_Count,FID_Score\n") + log_file.write(log_entry) + +if __name__ == "__main__": + if len(sys.argv) < 3 or len(sys.argv) > 4: + print("Usage: python script_name.py []") + sys.exit(1) + + ref_video_path = sys.argv[1] + out_video_path = sys.argv[2] + + # 可选:获取日志文件路径 + if len(sys.argv) == 4: + log_file_path = sys.argv[3] + else: + log_file_path = "fid_log.txt" + + # 检查视频文件是否存在 + if not os.path.isfile(ref_video_path): + print(f"Error: Reference video file '{ref_video_path}' does not exist.") + sys.exit(1) + if not os.path.isfile(out_video_path): + print(f"Error: Output video file '{out_video_path}' does not exist.") + sys.exit(1) + + # 打开参考视频和输出视频文件 + ref_cap = cv2.VideoCapture(ref_video_path) + out_cap = cv2.VideoCapture(out_video_path) + + if not ref_cap.isOpened(): + print(f"Error: Could not open reference video '{ref_video_path}'.") + sys.exit(1) + if not out_cap.isOpened(): + print(f"Error: Could not open output video '{out_video_path}'.") + ref_cap.release() + sys.exit(1) + + # 获取视频属性 + ref_frame_width = 
int(ref_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + ref_frame_height = int(ref_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + ref_frames = [] + out_frames = [] + + print("Reading frames from videos...") + + while True: + ret_ref, ref_frame = ref_cap.read() + ret_out, out_frame = out_cap.read() + + if not ret_ref or not ret_out: + break + # 转换为RGB,因为Inception模型使用RGB图像 + ref_frames.append(cv2.cvtColor(ref_frame, cv2.COLOR_BGR2RGB)) + out_frames.append(cv2.cvtColor(out_frame, cv2.COLOR_BGR2RGB)) + + # 处理输出视频比参考视频少一帧的情况 + if len(out_frames) < len(ref_frames): + ref_frames = ref_frames[:len(out_frames)] + elif len(ref_frames) < len(out_frames): + out_frames = out_frames[:len(ref_frames)] + + frame_count = len(ref_frames) + print(f"Total frames to process: {frame_count}") + + # 释放视频捕获对象 + ref_cap.release() + out_cap.release() + + if frame_count == 0: + print("No frames to process.") + sys.exit(1) + + print("Extracting features from reference video frames...") + ref_features = get_features_batch(ref_frames) + + print("Extracting features from output video frames...") + out_features = get_features_batch(out_frames) + + print("Calculating FID score...") + fid_score = calculate_fid(ref_features, out_features) + print(f"FID score: {fid_score}") + + # 记录到日志文件 + log_fid(log_file_path, ref_video_path, out_video_path, frame_count, fid_score) + print(f"FID score logged to '{log_file_path}'.") diff --git a/syncnet_python-master/NIQE.py b/syncnet_python-master/NIQE.py new file mode 100755 index 00000000..5200b645 --- /dev/null +++ b/syncnet_python-master/NIQE.py @@ -0,0 +1,283 @@ +import cv2 +import numpy as np +import cupy as cp +import scipy.io +from os.path import dirname, join +from PIL import Image +import scipy.special +import math +import os +from skimage.transform import resize +import sys +import cupyx.scipy.ndimage as cupyx_ndimage # 导入cupyx的ndimage模块 + +gamma_range = np.arange(0.2, 10, 0.001) +a = scipy.special.gamma(2.0 / gamma_range) +a *= a +b = scipy.special.gamma(1.0 / 
gamma_range) +c = scipy.special.gamma(3.0 / gamma_range) +prec_gammas = a / (b * c) + +def aggd_features(imdata): + # 将imdata转移到GPU + imdata = cp.asarray(imdata) + # 展平imdata + imdata = imdata.flatten() + imdata2 = imdata * imdata + left_data = imdata2[imdata < 0] + right_data = imdata2[imdata >= 0] + + left_mean_sqrt = cp.sqrt(cp.average(left_data)) if left_data.size > 0 else 0 + right_mean_sqrt = cp.sqrt(cp.average(right_data)) if right_data.size > 0 else 0 + + gamma_hat = left_mean_sqrt / right_mean_sqrt if right_mean_sqrt != 0 else cp.inf + + imdata2_mean = cp.mean(imdata2) + r_hat = (cp.mean(cp.abs(imdata)) ** 2) / cp.mean(imdata2) if imdata2_mean != 0 else cp.inf + rhat_norm = r_hat * (((gamma_hat ** 3) + 1) * (gamma_hat + 1)) / ((gamma_hat ** 2) + 1) ** 2 + + pos = cp.argmin((cp.asarray(prec_gammas) - rhat_norm) ** 2).get() + alpha = gamma_range[pos] + + gam1 = scipy.special.gamma(1.0 / alpha) + gam2 = scipy.special.gamma(2.0 / alpha) + gam3 = scipy.special.gamma(3.0 / alpha) + + aggdratio = math.sqrt(gam1) / math.sqrt(gam3) + bl = aggdratio * left_mean_sqrt.get() + br = aggdratio * right_mean_sqrt.get() + + N = (br - bl) * (gam2 / gam1) + return (alpha, N, bl, br, left_mean_sqrt.get(), right_mean_sqrt.get()) + +def ggd_features(imdata): + nr_gam = 1 / prec_gammas + sigma_sq = cp.var(imdata) + E = cp.mean(cp.abs(imdata)) + rho = sigma_sq / E ** 2 + pos = cp.argmin(cp.abs(nr_gam - rho)).get() + return gamma_range[pos], sigma_sq.get() + +def paired_product(new_im): + # 将数据转移到CPU进行滚动操作 + new_im_cpu = new_im.get() + shift1 = np.roll(new_im_cpu.copy(), 1, axis=1) + shift2 = np.roll(new_im_cpu.copy(), 1, axis=0) + shift3 = np.roll(np.roll(new_im_cpu.copy(), 1, axis=0), 1, axis=1) + shift4 = np.roll(np.roll(new_im_cpu.copy(), 1, axis=0), -1, axis=1) + + H_img = shift1 * new_im_cpu + V_img = shift2 * new_im_cpu + D1_img = shift3 * new_im_cpu + D2_img = shift4 * new_im_cpu + + return cp.asarray(H_img), cp.asarray(V_img), cp.asarray(D1_img), cp.asarray(D2_img) + +def 
gen_gauss_window(lw, sigma): + sd = np.float32(sigma) + lw = int(lw) + weights = [0.0] * (2 * lw + 1) + weights[lw] = 1.0 + sum_weights = 1.0 + sd *= sd + for ii in range(1, lw + 1): + tmp = math.exp(-0.5 * float(ii * ii) / sd) + weights[lw + ii] = tmp + weights[lw - ii] = tmp + sum_weights += 2.0 * tmp + weights = [w / sum_weights for w in weights] + return cp.asarray(weights, dtype=cp.float32) + +def compute_image_mscn_transform(image, C=1, avg_window=None, extend_mode='constant'): + if avg_window is None: + avg_window = gen_gauss_window(3, 7.0 / 6.0) + assert len(image.shape) == 2 + h, w = image.shape + image = cp.asarray(image, dtype=cp.float32) + + # 使用cupyx.scipy.ndimage.correlate进行多维相关操作 + mu_image = cupyx_ndimage.correlate(image, avg_window[:, None], mode=extend_mode) + mu_image = cupyx_ndimage.correlate(mu_image, avg_window[None, :], mode=extend_mode) + + var_image = cupyx_ndimage.correlate(image ** 2, avg_window[:, None], mode=extend_mode) + var_image = cupyx_ndimage.correlate(var_image, avg_window[None, :], mode=extend_mode) + var_image = cp.sqrt(cp.abs(var_image - mu_image ** 2)) + return (image - mu_image) / (var_image + C), var_image, mu_image + +def _niqe_extract_subband_feats(mscncoefs): + alpha_m, N, bl, br, lsq, rsq = aggd_features(mscncoefs.copy()) + pps1, pps2, pps3, pps4 = paired_product(mscncoefs) + alpha1, N1, bl1, br1, lsq1, rsq1 = aggd_features(pps1) + alpha2, N2, bl2, br2, lsq2, rsq2 = aggd_features(pps2) + alpha3, N3, bl3, br3, lsq3, rsq3 = aggd_features(pps3) + alpha4, N4, bl4, br4, lsq4, rsq4 = aggd_features(pps4) + return cp.asnumpy(cp.array([alpha_m, (bl + br) / 2.0, + alpha1, N1, bl1, br1, # (V) + alpha2, N2, bl2, br2, # (H) + alpha3, N3, bl3, bl3, # (D1) + alpha4, N4, bl4, bl4, # (D2) + ])) + +def get_patches_train_features(img, patch_size, stride=8): + return _get_patches_generic(img, patch_size, 1, stride) + +def get_patches_test_features(img, patch_size, stride=8): + return _get_patches_generic(img, patch_size, 0, stride) + +def 
extract_on_patches(img, patch_size): + h, w = img.shape + patch_size = int(patch_size) + patches = [] + for j in range(0, h - patch_size + 1, patch_size): + for i in range(0, w - patch_size + 1, patch_size): + patch = img[j:j + patch_size, i:i + patch_size] + patches.append(patch) + + patches = cp.asarray(patches) + + patch_features = [] + for p in patches: + patch_features.append(_niqe_extract_subband_feats(p)) + patch_features = cp.asarray(patch_features) + return cp.asnumpy(patch_features) + +def _get_patches_generic(img, patch_size, is_train, stride): + h, w = img.shape + if h < patch_size or w < patch_size: + print("Input image is too small") + exit(0) + + # 确保补丁能够均匀划分 + hoffset = h % patch_size + woffset = w % patch_size + + if hoffset > 0: + img = img[:-hoffset, :] + if woffset > 0: + img = img[:, :-woffset] + + img = img.astype(cp.float32) + # 使用skimage在CPU上调整图像大小 + img_cpu = cp.asnumpy(img) + img2_cpu = resize(img_cpu, (int(img_cpu.shape[0] * 0.5), int(img_cpu.shape[1] * 0.5)), mode='constant', anti_aliasing=True) + img2 = cp.asarray(img2_cpu, dtype=cp.float32) + + mscn1, var, mu = compute_image_mscn_transform(img) + mscn1 = mscn1.astype(cp.float32) + + mscn2, _, _ = compute_image_mscn_transform(img2) + mscn2 = mscn2.astype(cp.float32) + + feats_lvl1 = extract_on_patches(mscn1, patch_size) + feats_lvl2 = extract_on_patches(mscn2, patch_size / 2) + + feats = np.hstack((feats_lvl1, feats_lvl2)) # feats_lvl3)) + return feats + +def niqe(inputImgData): + patch_size = 96 + module_path = dirname(__file__) + + # 加载预训练的NIQE参数 + params = scipy.io.loadmat(join(module_path, 'data', 'niqe_image_params.mat')) + pop_mu = cp.asarray(np.ravel(params["pop_mu"])) + pop_cov = cp.asarray(params["pop_cov"]) + + M, N = inputImgData.shape + + assert M > (patch_size * 2 + 1), "niqe called with small frame size, requires > 192x192 resolution video using current training parameters" + assert N > (patch_size * 2 + 1), "niqe called with small frame size, requires > 192x192 resolution 
video using current training parameters" + + # 将图像转移到GPU + inputImgData = cp.asarray(inputImgData, dtype=cp.float32) + + feats = cp.asarray(get_patches_test_features(inputImgData, patch_size)) + sample_mu = cp.mean(feats, axis=0) + sample_cov = cp.cov(feats.T) + + X = sample_mu - pop_mu + covmat = (pop_cov + sample_cov) / 2.0 + pinvmat = cp.linalg.pinv(covmat) + niqe_score = cp.sqrt(cp.dot(cp.dot(X, pinvmat), X)).get() + + return niqe_score + +def evaluate_video_with_niqe(video_path, log_file_path="niqe_log.txt"): + cap = cv2.VideoCapture(video_path) + + if not cap.isOpened(): + print("Error: Could not open video.") + return + + niqe_values = [] + NIQE_sum = 0 + frame_count = 0 + + # 设置设备为GPU(如果可用) + device = cp.cuda.Device() + device.use() + + # 打开日志文件以追加模式 + try: + log_file = open(log_file_path, "a") + except Exception as e: + print(f"Error: Could not open log file {log_file_path}: {e}") + cap.release() + return + + # 写入日志文件的头部(如果文件是空的) + if os.path.getsize(log_file_path) == 0: + log_file.write("Video_Path,Frame_Count,Average_NIQE\n") + + print("Processing frames and calculating NIQE...") + + while True: + ret, frame = cap.read() + if not ret: + break + + # 将帧从BGR转换为灰度图 + gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + + # 计算NIQE + try: + niqe_value = niqe(gray_frame) + except Exception as e: + print(f"Error during NIQE calculation on frame {frame_count + 1}: {e}") + continue # 跳过该帧并继续处理 + + NIQE_sum += niqe_value + frame_count += 1 + + # 可选:打印每帧的NIQE分数 + # print(f"Frame {frame_count}: NIQE score = {niqe_value}") + + cap.release() + + if frame_count > 0: + NIQE_mean = NIQE_sum / frame_count + print(f"Average NIQE over {frame_count} frames: {NIQE_mean}") + + # 记录到日志文件 + log_entry = f"\"{video_path}\",{frame_count},{NIQE_mean}\n" + log_file.write(log_entry) + else: + print("No frames were processed.") + + log_file.close() + +if __name__ == "__main__": + if len(sys.argv) < 2 or len(sys.argv) > 3: + print("Usage: python script_name.py []") + sys.exit(1) + 
video_path = sys.argv[1] + + # 可选:获取日志文件路径 + if len(sys.argv) == 3: + log_file_path = sys.argv[2] + else: + log_file_path = "niqe_log.txt" + + evaluate_video_with_niqe(video_path, log_file_path) + print(f"NIQE scores logged to '{log_file_path}'.") + diff --git a/syncnet_python-master/PSNR.py b/syncnet_python-master/PSNR.py new file mode 100755 index 00000000..3525069c --- /dev/null +++ b/syncnet_python-master/PSNR.py @@ -0,0 +1,118 @@ +import cv2 +import numpy as np +import cupy as cp +import sys +import os + +def compute_psnr_gpu(ref_gray, out_gray): + """ + 在GPU上计算PSNR。 + """ + # 将图像数据转换为float32类型并传输到GPU + ref_gpu = cp.asarray(ref_gray, dtype=cp.float32) + out_gpu = cp.asarray(out_gray, dtype=cp.float32) + + # 计算均方误差 (MSE) + mse = cp.mean((ref_gpu - out_gpu) ** 2) + + if mse == 0: + return float('inf') + + PIXEL_MAX = 255.0 + psnr_value = 20 * cp.log10(PIXEL_MAX / cp.sqrt(mse)) + + # 将结果从GPU传回CPU + return psnr_value.get() + +def evaluate_video_with_psnr(reference_video_path, output_video_path, log_file_path="psnr_log.txt"): + # 打开参考视频和输出视频文件 + ref_cap = cv2.VideoCapture(reference_video_path) + out_cap = cv2.VideoCapture(output_video_path) + + if not ref_cap.isOpened() or not out_cap.isOpened(): + print(f"Error: Could not open one of the video files.") + return + + # 获取视频属性 + ref_frame_width = int(ref_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + ref_frame_height = int(ref_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # 初始化变量 + frame_count = 0 + psnr_scores = [] + + # 打开日志文件以追加模式 + try: + log_file = open(log_file_path, "a") + except Exception as e: + print(f"Error: Could not open log file {log_file_path}: {e}") + ref_cap.release() + out_cap.release() + return + + # 写入日志文件的头部(如果文件是空的) + if os.path.getsize(log_file_path) == 0: + log_file.write("Reference_Video_Path,Output_Video_Path,Frame_Count,Average_PSNR\n") + + while True: + ret_ref, ref_frame = ref_cap.read() + ret_out, out_frame = out_cap.read() + + if not ret_ref: + # 参考视频已经读完,但输出视频可能还有剩余帧(不处理这些帧) + break + + if not 
ret_out: + break + + # 调整输出帧大小以匹配参考帧 + out_frame_resized = cv2.resize(out_frame, (ref_frame_width, ref_frame_height)) + + # 将BGR图像转换为灰度图像 + ref_gray = cv2.cvtColor(ref_frame, cv2.COLOR_BGR2GRAY) + out_gray = cv2.cvtColor(out_frame_resized, cv2.COLOR_BGR2GRAY) + + # 使用GPU计算PSNR + score = compute_psnr_gpu(ref_gray, out_gray) + + # 保存PSNR分数 + psnr_scores.append(score) + + frame_count += 1 + + # 可选:打印每帧的PSNR分数 + # print(f"Frame {frame_count}: PSNR score = {score}") + + # 计算并打印平均PSNR分数 + if psnr_scores: + average_score = np.mean(psnr_scores) + output_message = f"Average PSNR score over {frame_count} frames: {average_score}" + print(output_message) + + # 写入日志文件 + log_entry = f"\"{reference_video_path}\",\"{output_video_path}\",Average PSNR score over{frame_count},frames:{average_score}\n" + log_file.write(log_entry) + else: + print("No frames were processed.") + + # 关闭日志文件 + log_file.close() + + # 释放视频捕获对象 + ref_cap.release() + out_cap.release() + +if __name__ == "__main__": + if len(sys.argv) < 3 or len(sys.argv) > 4: + print("Usage: python script_name.py []") + sys.exit(1) + reference_video_path = sys.argv[1] + output_video_path = sys.argv[2] + + # 可选:获取日志文件路径 + if len(sys.argv) == 4: + log_file_path = sys.argv[3] + else: + log_file_path = "psnr_log.txt" + + evaluate_video_with_psnr(reference_video_path, output_video_path, log_file_path) diff --git a/syncnet_python-master/README.md b/syncnet_python-master/README.md new file mode 100755 index 00000000..4d607ba8 --- /dev/null +++ b/syncnet_python-master/README.md @@ -0,0 +1,82 @@ +# test-audio2head + +## 使用docker进行部署 +### 首先下载checkpoints + +首先请从以下网址下载[google-drive](https://drive.google.com/file/d/1tvI43ZIrnx9Ti2TpFiEO4dK5DOwcECD7/view?usp=sharing),当上以连接无法访问或者载入后无法运行时,请下载这个备用[checkpoint2](https://drive.google.com/drive/folders/1k-6im7e4EkPjQSXCO7jWEQwYSHCsCyJb?usp=sharing)。 + +下载完成后,请将下载好的checkpoints文件夹放到syncnet_python-master/Audio2Head/Audio2Head目录下 + +完成以上步骤后,请用以下命令构建docker容器 +``` bash +docker build -t syn . 
+``` + +构建完成后,你可以开始运行容器,但是请注意,请确保你准备好了用于的评估的原视频,用于模型生成视频的评估视频的第一帧图片和音频。同时还需要注意,你的评估视频,用于模型生成视频的第一帧图片和音频的文件名应该相同。(比如,你的一个评估视频是eric.mp4,那么你应该截取这个视频的第一帧图片用于模型生成,并命名为eric.png,同截取这个视频的音频,命名为eric.wav)然后将你的所有评估视频放一个文件夹(例如ref_video),所有图片放一个文件夹(例如input_img),所有音频放一个文件夹(例如input_wav)然后通过以下命令进行挂载和运行程序 +```bash +docker run --rm --gpus all \ +-v /path/to/your/input_img/:/app/Audio2Head/Audio2Head/input_img \ +-v /path/to/your/input_wav/:/app/Audio2Head/Audio2Head/input_wav \ +-v /path/to/your/ref_video/:/app/ref_video \ +syn +``` + +然后通过批处理程序,会自动生成调用模型生成视频然后进行评估 +## 使用conda环境部署 +如果你的系统没有Anconda环境,可以按以下操作 +### 1.首先安装conda环境 +打开终端,下载 Miniconda 安装包: +``` bash + +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +``` +运行安装脚本(注意最后添加环境变量时选择yes,全部输yes) +``` +bash Miniconda3-latest-Linux-x86_64.sh +``` +完成安装后,执行: +``` +source ~/.bashrc +``` + +### 2.创建conda环境并进入 + +``` +conda create -n syn python=3.10 && conda activate syn +``` + +### 3.进入到项目目录 +首先安装依赖 + +``` +# 确保你的电脑安装了cuda12以上的版本 +pip install -r requirements.txt +pip install imageio[ffmpeg] +pip install cupy-cuda12x +apt-get install ffmpeg +``` +然后将你的所有评估视频的第一帧图像放到 +``` +Audio2Head\Audio2Head\input_img +``` +将所有评估视频的音频放到 +``` +Audio2Head\Audio2Head\input_wav +``` +注意,input_wav里要求wav格式,同时input_img和input_wav对应的视频名字应该相同 + +然后将你的评估视频保存到 +``` +ref_video +``` + +然后运行run_inference.sh批处理文件 +``` +./run_inference.sh +``` +等上一个批处理脚本运行完后,运行 +``` +python batch_psnr.py +``` +开始评估 diff --git a/syncnet_python-master/SSIM.py b/syncnet_python-master/SSIM.py new file mode 100755 index 00000000..70474d6f --- /dev/null +++ b/syncnet_python-master/SSIM.py @@ -0,0 +1,66 @@ +import cv2 +import numpy as np +from skimage.metrics import structural_similarity as ssim +import sys + +def calculate_video_ssim(ref_video_path, out_video_path): + # 打开视频文件 + ref_cap = cv2.VideoCapture(ref_video_path) + out_cap = cv2.VideoCapture(out_video_path) + + # 获取视频帧的宽度和高度(假设两者相同) + ref_frame_width = int(ref_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + 
ref_frame_height = int(ref_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # 初始化SSIM列表 + ssim_scores = [] + + while True: + # 读取参考视频和输出视频的帧 + ret_ref, ref_frame = ref_cap.read() + ret_out, out_frame = out_cap.read() + + # 如果输出视频已经结束,则停止循环(即使参考视频还有帧) + if not ret_out: + break + + # 如果参考视频已经结束(理论上不应该发生,因为参考视频应该更长),则也停止循环 + if not ret_ref: + print("Warning: Reference video ended before output video, which is unexpected.") + break + + # 转换帧的颜色空间(如果需要,这里假设已经是RGB或灰度图,OpenCV默认读取为BGR) + # ref_frame = cv2.cvtColor(ref_frame, cv2.COLOR_BGR2RGB) # 如果需要RGB + # out_frame = cv2.cvtColor(out_frame, cv2.COLOR_BGR2RGB) # 如果需要RGB + out_frame_resized = cv2.resize(out_frame, (ref_frame_width, ref_frame_height)) + # 由于OpenCV读取的是BGR,而skimage.metrics.structural_similarity期望的是RGB或灰度图, + # 如果你的SSIM计算库需要RGB,请取消上面两行的注释,并注释掉下面的转换(这里我们假设使用灰度图) + ref_gray = cv2.cvtColor(ref_frame, cv2.COLOR_BGR2GRAY) + out_gray = cv2.cvtColor(out_frame_resized, cv2.COLOR_BGR2GRAY) + + # 计算SSIM + score, _ = ssim(ref_gray, out_gray, full=True) + ssim_scores.append(score) + + # 处理输出视频比参考视频少一帧的情况(实际上在这个循环中不需要额外处理,因为循环会在输出视频结束时停止) + + # 计算平均SSIM + average_ssim = np.mean(ssim_scores) + + # 释放视频捕获对象 + ref_cap.release() + out_cap.release() + + return average_ssim, ssim_scores + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python script_name.py ") + sys.exit(1) + ref_video_path = sys.argv[1] + out_video_path = sys.argv[2] + average_ssim, ssim_scores = calculate_video_ssim(ref_video_path, out_video_path) + print(f"Average SSIM: {average_ssim}") + # 如果需要查看每一帧的SSIM,可以打印ssim_scores列表 + # for i, score in enumerate(ssim_scores): + # print(f"Frame {i+1} SSIM: {score}") \ No newline at end of file diff --git a/syncnet_python-master/SyncNetInstance.py b/syncnet_python-master/SyncNetInstance.py new file mode 100755 index 00000000..497d44fc --- /dev/null +++ b/syncnet_python-master/SyncNetInstance.py @@ -0,0 +1,208 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +# Video 25 FPS, Audio 
16000HZ + +import torch +import numpy +import time, pdb, argparse, subprocess, os, math, glob +import cv2 +import python_speech_features + +from scipy import signal +from scipy.io import wavfile +from SyncNetModel import * +from shutil import rmtree + + +# ==================== Get OFFSET ==================== + +def calc_pdist(feat1, feat2, vshift=10): + + win_size = vshift*2+1 + + feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) + + dists = [] + + for i in range(0,len(feat1)): + + dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) + + return dists + +# ==================== MAIN DEF ==================== + +class SyncNetInstance(torch.nn.Module): + + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): + super(SyncNetInstance, self).__init__(); + + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + + def evaluate(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Convert files + # ========== ========== + + if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + + os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + + command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) + output = subprocess.call(command, shell=True, stdout=None) + + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) + output = subprocess.call(command, shell=True, stdout=None) + + # ========== ========== + # Load video + # ========== ========== + + images = [] + + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) + flist.sort() + + for fname in flist: + images.append(cv2.imread(fname)) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = 
torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Load audio + # ========== ========== + + sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) + mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) + mfcc = numpy.stack([numpy.array(i) for i in mfcc]) + + cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) + cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + + # ========== ========== + # Check audio and video input length + # ========== ========== + + if (float(len(audio))/16000) != (float(len(images))/25) : + print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + + min_length = min(len(images),math.floor(len(audio)/640)) + + # ========== ========== + # Generate video and audio feats + # ========== ========== + + lastframe = min_length-5 + im_feat = [] + cc_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lip(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + cc_in = torch.cat(cc_batch,0) + cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_feat.append(cc_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + cc_feat = torch.cat(cc_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' 
% (time.time()-tS)) + + dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) + mdist = torch.mean(torch.stack(dists,1),1) + + minval, minidx = torch.min(mdist,0) + + offset = opt.vshift-minidx + conf = torch.median(mdist) - minval + + fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) + # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) + fconf = torch.median(mdist).numpy() - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) + print('Framewise conf: ') + print(fconfm) + print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + + dists_npy = numpy.array([ dist.numpy() for dist in dists ]) + return offset.numpy(), conf.numpy(), dists_npy + + def extract_feature(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Load video + # ========== ========== + cap = cv2.VideoCapture(videofile) + + frame_num = 1; + images = [] + while frame_num: + frame_num += 1 + ret, image = cap.read() + if ret == 0: + break + + images.append(image) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Generate video feats + # ========== ========== + + lastframe = len(images)-4 + im_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' 
% (time.time()-tS)) + + return im_feat + + + def loadParameters(self, path): + loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + + self_state = self.__S__.state_dict(); + + for name, param in loaded_state.items(): + + self_state[name].copy_(param); diff --git a/syncnet_python-master/SyncNetInstance_calc_scores.py b/syncnet_python-master/SyncNetInstance_calc_scores.py new file mode 100755 index 00000000..64906e25 --- /dev/null +++ b/syncnet_python-master/SyncNetInstance_calc_scores.py @@ -0,0 +1,210 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +# Video 25 FPS, Audio 16000HZ + +import torch +import numpy +import time, pdb, argparse, subprocess, os, math, glob +import cv2 +import python_speech_features + +from scipy import signal +from scipy.io import wavfile +from SyncNetModel import * +from shutil import rmtree + + +# ==================== Get OFFSET ==================== + +def calc_pdist(feat1, feat2, vshift=10): + + win_size = vshift*2+1 + + feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) + + dists = [] + + for i in range(0,len(feat1)): + + dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) + + return dists + +# ==================== MAIN DEF ==================== + +class SyncNetInstance(torch.nn.Module): + + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): + super(SyncNetInstance, self).__init__(); + + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + + def evaluate(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Convert files + # ========== ========== + + if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + + os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + + command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) + output = subprocess.call(command, 
shell=True, stdout=None) + + command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) + output = subprocess.call(command, shell=True, stdout=None) + + # ========== ========== + # Load video + # ========== ========== + + images = [] + + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) + flist.sort() + + for fname in flist: + img_input = cv2.imread(fname) + img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE + images.append(img_input) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Load audio + # ========== ========== + + sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) + mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) + mfcc = numpy.stack([numpy.array(i) for i in mfcc]) + + cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) + cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + + # ========== ========== + # Check audio and video input length + # ========== ========== + + #if (float(len(audio))/16000) != (float(len(images))/25) : + # print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + + min_length = min(len(images),math.floor(len(audio)/640)) + + # ========== ========== + # Generate video and audio feats + # ========== ========== + + lastframe = min_length-5 + im_feat = [] + cc_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lip(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + cc_batch = [ 
cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + cc_in = torch.cat(cc_batch,0) + cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_feat.append(cc_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + cc_feat = torch.cat(cc_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + #print('Compute time %.3f sec.' % (time.time()-tS)) + + dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) + mdist = torch.mean(torch.stack(dists,1),1) + + minval, minidx = torch.min(mdist,0) + + offset = opt.vshift-minidx + conf = torch.median(mdist) - minval + + fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) + # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) + fconf = torch.median(mdist).numpy() - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) + #print('Framewise conf: ') + #print(fconfm) + #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + + dists_npy = numpy.array([ dist.numpy() for dist in dists ]) + return offset.numpy(), conf.numpy(), minval.numpy() + + def extract_feature(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Load video + # ========== ========== + cap = cv2.VideoCapture(videofile) + + frame_num = 1; + images = [] + while frame_num: + frame_num += 1 + ret, image = cap.read() + if ret == 0: + break + + images.append(image) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Generate video feats + # ========== ========== + + lastframe = len(images)-4 + im_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + 
im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' % (time.time()-tS)) + + return im_feat + + + def loadParameters(self, path): + loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + + self_state = self.__S__.state_dict(); + + for name, param in loaded_state.items(): + + self_state[name].copy_(param); diff --git a/syncnet_python-master/SyncNetModel.py b/syncnet_python-master/SyncNetModel.py new file mode 100755 index 00000000..12e87a9f --- /dev/null +++ b/syncnet_python-master/SyncNetModel.py @@ -0,0 +1,117 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*-conc + +import torch +import torch.nn as nn + +def save(model, filename): + with open(filename, "wb") as f: + torch.save(model, f); + print("%s saved."%filename); + +def load(filename): + net = torch.load(filename) + return net; + +class S(nn.Module): + def __init__(self, num_layers_in_fc_layers = 1024): + super(S, self).__init__(); + + self.__nFeatures__ = 24; + self.__nChs__ = 32; + self.__midChs__ = 32; + + self.netcnnaud = nn.Sequential( + nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)), + nn.BatchNorm2d(64), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(1,1), stride=(1,1)), + + nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)), + nn.BatchNorm2d(192), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)), + + nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(384), + nn.ReLU(inplace=True), + + nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + + nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), + + nn.Conv2d(256, 512, kernel_size=(5,4), padding=(0,0)), + nn.BatchNorm2d(512), + 
nn.ReLU(), + ); + + self.netfcaud = nn.Sequential( + nn.Linear(512, 512), + nn.BatchNorm1d(512), + nn.ReLU(), + nn.Linear(512, num_layers_in_fc_layers), + ); + + self.netfclip = nn.Sequential( + nn.Linear(512, 512), + nn.BatchNorm1d(512), + nn.ReLU(), + nn.Linear(512, num_layers_in_fc_layers), + ); + + self.netcnnlip = nn.Sequential( + nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=0), + nn.BatchNorm3d(96), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)), + + nn.Conv3d(96, 256, kernel_size=(1,5,5), stride=(1,2,2), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)), + + nn.Conv3d(256, 512, kernel_size=(1,6,6), padding=0), + nn.BatchNorm3d(512), + nn.ReLU(inplace=True), + ); + + def forward_aud(self, x): + + mid = self.netcnnaud(x); # N x ch x 24 x M + mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) + out = self.netfcaud(mid); + + return out; + + def forward_lip(self, x): + + mid = self.netcnnlip(x); + mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) + out = self.netfclip(mid); + + return out; + + def forward_lipfeat(self, x): + + mid = self.netcnnlip(x); + out = mid.view((mid.size()[0], -1)); # N x (ch x 24) + + return out; \ No newline at end of file diff --git a/syncnet_python-master/__pycache__/SyncNetInstance.cpython-39.pyc b/syncnet_python-master/__pycache__/SyncNetInstance.cpython-39.pyc new file mode 100755 index 00000000..dfe081fd Binary files /dev/null and b/syncnet_python-master/__pycache__/SyncNetInstance.cpython-39.pyc differ diff --git 
a/syncnet_python-master/__pycache__/SyncNetInstance_calc_scores.cpython-39.pyc b/syncnet_python-master/__pycache__/SyncNetInstance_calc_scores.cpython-39.pyc new file mode 100755 index 00000000..d670ff95 Binary files /dev/null and b/syncnet_python-master/__pycache__/SyncNetInstance_calc_scores.cpython-39.pyc differ diff --git a/syncnet_python-master/__pycache__/SyncNetModel.cpython-39.pyc b/syncnet_python-master/__pycache__/SyncNetModel.cpython-39.pyc new file mode 100755 index 00000000..1c8a3037 Binary files /dev/null and b/syncnet_python-master/__pycache__/SyncNetModel.cpython-39.pyc differ diff --git a/syncnet_python-master/all_scores.txt b/syncnet_python-master/all_scores.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/batch_process.log b/syncnet_python-master/batch_process.log new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/syncnet_python-master/batch_process.log @@ -0,0 +1,2 @@ + + diff --git a/syncnet_python-master/batch_psnr.log b/syncnet_python-master/batch_psnr.log new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/batch_psnr.py b/syncnet_python-master/batch_psnr.py new file mode 100755 index 00000000..181c3f38 --- /dev/null +++ b/syncnet_python-master/batch_psnr.py @@ -0,0 +1,145 @@ +import os +import subprocess +import logging +from multiprocessing import Pool, cpu_count + +def get_ref_video_files(directory): + return [f for f in os.listdir(directory) if f.lower().endswith('.mp4')] + +def setup_logging(log_file='batch_process.log'): + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s %(levelname)s: %(message)s', + handlers=[ + logging.FileHandler(log_file, mode='w', encoding='utf-8'), + logging.StreamHandler() + ] + ) + +def run_command(command, logger): + try: + result = subprocess.run( + command, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + logger.info(f"命令成功执行: {' '.join(command)}") + 
logger.info(f"输出:\n{result.stdout}") + if result.stderr: + logger.warning(f"错误输出:\n{result.stderr}") + except subprocess.CalledProcessError as e: + logger.error(f"命令执行失败: {' '.join(command)}") + logger.error(f"错误输出:\n{e.stderr}") + +def process_video_pair(args): + ref_path, out_path, logger = args + ref_filename = os.path.basename(ref_path) + out_filename = os.path.basename(out_path) + + logger.info(f"正在处理: {ref_filename} 与 {out_filename}") + + # 1. 运行 PSNR.py + psnr_command = ['python', 'PSNR.py', ref_path, out_path] + run_command(psnr_command, logger) + + # 2. 运行 NIQE.py + niqe_command = ['python', 'NIQE.py', out_path] + run_command(niqe_command, logger) + + # 3. 运行 SSIM.py + ssim_command = ['python', 'SSIM.py', ref_path, out_path] + run_command(ssim_command, logger) + + # 4. 运行 FID.py + fid_command = ['python', 'FID.py', ref_path, out_path] + run_command(fid_command, logger) + + # 5. 运行 run_pipeline.py + run_pipeline_command = [ + 'python', 'run_pipeline.py', + '--videofile', out_path, + '--reference', 'wav2lip', + '--data_dir', 'tmp_dir' + ] + run_command(run_pipeline_command, logger) + + # 6. 
运行 calculate_scores_real_videos.py 并将输出追加到 all_scores.txt + calculate_scores_command = [ + 'python', 'calculate_scores_real_videos.py', + '--videofile', out_path, + '--reference', 'wav2lip', + '--data_dir', 'tmp_dir' + ] + try: + result = subprocess.run( + calculate_scores_command, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + logger.info(f"命令成功执行: {' '.join(calculate_scores_command)}") + logger.info(f"输出:\n{result.stdout}") + if result.stderr: + logger.warning(f"错误输出:\n{result.stderr}") + + # 追加输出到 all_scores.txt + all_scores_path = os.path.join(os.getcwd(), 'all_scores.txt') + with open(all_scores_path, 'a') as f: + f.write(result.stdout) + except subprocess.CalledProcessError as e: + logger.error(f"命令执行失败: {' '.join(calculate_scores_command)}") + logger.error(f"错误输出:\n{e.stderr}") + +def main(ref_dir, out_dir): + setup_logging() + logger = logging.getLogger() + + if not os.path.isdir(ref_dir): + logger.error(f"参考视频目录不存在: {ref_dir}") + return + if not os.path.isdir(out_dir): + logger.error(f"输出视频目录不存在: {out_dir}") + return + + ref_files = get_ref_video_files(ref_dir) + + if not ref_files: + logger.error("参考视频目录下没有找到任何.mp4文件。") + return + + logger.info(f"找到 {len(ref_files)} 个参考文件。开始处理...") + + # 清空 all_scores.txt + all_scores_path = os.path.join(os.getcwd(), 'all_scores.txt') + with open(all_scores_path, 'w') as f: + f.write('') # 清空文件 + + # 准备任务列表 + tasks = [] + for ref_filename in sorted(ref_files): + ref_basename = os.path.splitext(ref_filename)[0] + out_filename = f"{ref_basename}_{ref_basename}.mp4" + ref_path = os.path.join(ref_dir, ref_filename) + out_path = os.path.join(out_dir, out_filename) + + if not os.path.isfile(out_path): + logger.warning(f"输出视频文件不存在: {out_filename}. 
跳过.") + continue + + tasks.append((ref_path, out_path, logger)) + + # 使用进程池并行处理 + with Pool(processes=cpu_count()) as pool: + pool.map(process_video_pair, tasks) + + logger.info("所有文件处理完成。") + +if __name__ == "__main__": + script_dir = os.path.dirname(os.path.abspath(__file__)) + ref_video_dir = os.path.join(script_dir, 'ref_video') + out_video_dir = os.path.join(script_dir, 'out_video') + + main(ref_video_dir, out_video_dir) diff --git a/syncnet_python-master/calculate_scores_LRS.py b/syncnet_python-master/calculate_scores_LRS.py new file mode 100755 index 00000000..eda02b8f --- /dev/null +++ b/syncnet_python-master/calculate_scores_LRS.py @@ -0,0 +1,53 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess +import glob +import os +from tqdm import tqdm + +from SyncNetInstance_calc_scores import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_root', type=str, required=True, help=''); +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); +parser.add_argument('--reference', type=str, default="demo", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +#print("Model %s loaded."%opt.initial_model); +path = os.path.join(opt.data_root, "*.mp4") + +all_videos = glob.glob(path) + +prog_bar = tqdm(range(len(all_videos))) +avg_confidence = 0. +avg_min_distance = 0. 
+ + +for videofile_idx in prog_bar: + videofile = all_videos[videofile_idx] + offset, confidence, min_distance = s.evaluate(opt, videofile=videofile) + avg_confidence += confidence + avg_min_distance += min_distance + prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / (videofile_idx + 1), 3), round(avg_min_distance / (videofile_idx + 1), 3))) + prog_bar.refresh() + +print ('Average Confidence: {}'.format(avg_confidence/len(all_videos))) +print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos))) + + + diff --git a/syncnet_python-master/calculate_scores_real_videos.py b/syncnet_python-master/calculate_scores_real_videos.py new file mode 100755 index 00000000..391526e0 --- /dev/null +++ b/syncnet_python-master/calculate_scores_real_videos.py @@ -0,0 +1,44 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess, pickle, os, gzip, glob + +from SyncNetInstance_calc_scores import * +# ==================== PARSE ARGUMENT ==================== + +parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--initial_model', type=str, default="/app/data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_dir', type=str, default='/app/data/work', help=''); +parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) + + +# ==================== LOAD MODEL AND FILE LIST ==================== +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +#print("Model %s 
loaded."%opt.initial_model); + +flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) +flist.sort() + +# ==================== GET OFFSETS ==================== + +dists = [] +for idx, fname in enumerate(flist): + offset, conf, dist = s.evaluate(opt,videofile=fname) + print (str(dist)+" "+str(conf)) + dists.append((dist, conf)) + +# ==================== PRINT RESULTS TO FILE ==================== + +with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: + pickle.dump(dists, fil) diff --git a/syncnet_python-master/calculate_scores_real_videos.sh b/syncnet_python-master/calculate_scores_real_videos.sh new file mode 100755 index 00000000..4a45cd56 --- /dev/null +++ b/syncnet_python-master/calculate_scores_real_videos.sh @@ -0,0 +1,8 @@ +rm all_scores.txt +yourfilenames=`ls $1` + +for eachfile in $yourfilenames +do + python run_pipeline.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir + python calculate_scores_real_videos.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir >> all_scores.txt +done diff --git a/syncnet_python-master/data/example.avi b/syncnet_python-master/data/example.avi new file mode 100755 index 00000000..68a47538 Binary files /dev/null and b/syncnet_python-master/data/example.avi differ diff --git a/syncnet_python-master/data/niqe_image_params.mat b/syncnet_python-master/data/niqe_image_params.mat new file mode 100755 index 00000000..53df0998 Binary files /dev/null and b/syncnet_python-master/data/niqe_image_params.mat differ diff --git a/syncnet_python-master/data/syncnet_v2.model b/syncnet_python-master/data/syncnet_v2.model new file mode 100755 index 00000000..230757f4 --- /dev/null +++ b/syncnet_python-master/data/syncnet_v2.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:961e8696f888fce4f3f3a6c3d5b3267cf5b343100b238e79b2659bff2c605442 +size 54573114 diff --git a/syncnet_python-master/data/work/pywork/faces.pckl 
b/syncnet_python-master/data/work/pywork/faces.pckl new file mode 100755 index 00000000..92c3c883 --- /dev/null +++ b/syncnet_python-master/data/work/pywork/faces.pckl @@ -0,0 +1 @@ +]. \ No newline at end of file diff --git a/syncnet_python-master/demo_feature.py b/syncnet_python-master/demo_feature.py new file mode 100755 index 00000000..e3bd290e --- /dev/null +++ b/syncnet_python-master/demo_feature.py @@ -0,0 +1,32 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess + +from SyncNetInstance import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); +parser.add_argument('--tmp_dir', type=str, default="data", help=''); +parser.add_argument('--save_as', type=str, default="data/features.pt", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +feats = s.extract_feature(opt, videofile=opt.videofile) + +torch.save(feats, opt.save_as) diff --git a/syncnet_python-master/demo_syncnet.py b/syncnet_python-master/demo_syncnet.py new file mode 100755 index 00000000..01c25a6f --- /dev/null +++ b/syncnet_python-master/demo_syncnet.py @@ -0,0 +1,30 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess + +from SyncNetInstance import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); 
+parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); +parser.add_argument('--reference', type=str, default="demo", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +s.evaluate(opt, videofile=opt.videofile) diff --git a/syncnet_python-master/detectors/README.md b/syncnet_python-master/detectors/README.md new file mode 100755 index 00000000..f5a8d4fe --- /dev/null +++ b/syncnet_python-master/detectors/README.md @@ -0,0 +1,3 @@ +# Face detector + +This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`. diff --git a/syncnet_python-master/detectors/__init__.py b/syncnet_python-master/detectors/__init__.py new file mode 100755 index 00000000..059d49bf --- /dev/null +++ b/syncnet_python-master/detectors/__init__.py @@ -0,0 +1 @@ +from .s3fd import S3FD \ No newline at end of file diff --git a/syncnet_python-master/detectors/__pycache__/__init__.cpython-39.pyc b/syncnet_python-master/detectors/__pycache__/__init__.cpython-39.pyc new file mode 100755 index 00000000..df49682d Binary files /dev/null and b/syncnet_python-master/detectors/__pycache__/__init__.cpython-39.pyc differ diff --git a/syncnet_python-master/detectors/s3fd/__init__.py b/syncnet_python-master/detectors/s3fd/__init__.py new file mode 100755 index 00000000..d7f35e05 --- /dev/null +++ b/syncnet_python-master/detectors/s3fd/__init__.py @@ -0,0 +1,61 @@ +import time +import numpy as np +import cv2 +import torch +from torchvision import transforms +from .nets import S3FDNet +from .box_utils import nms_ + +PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth' +img_mean 
= np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + + +class S3FD(): + + def __init__(self, device='cuda'): + + tstamp = time.time() + self.device = device + + print('[S3FD] loading with', self.device) + self.net = S3FDNet(device=self.device).to(self.device) + state_dict = torch.load(PATH_WEIGHT, map_location=self.device) + self.net.load_state_dict(state_dict) + self.net.eval() + print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + + def detect_faces(self, image, conf_th=0.8, scales=[1]): + + w, h = image.shape[1], image.shape[0] + + bboxes = np.empty(shape=(0, 5)) + + with torch.no_grad(): + for s in scales: + scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) + + scaled_img = np.swapaxes(scaled_img, 1, 2) + scaled_img = np.swapaxes(scaled_img, 1, 0) + scaled_img = scaled_img[[2, 1, 0], :, :] + scaled_img = scaled_img.astype('float32') + scaled_img -= img_mean + scaled_img = scaled_img[[2, 1, 0], :, :] + x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device) + y = self.net(x) + + detections = y.data + scale = torch.Tensor([w, h, w, h]) + + for i in range(detections.size(1)): + j = 0 + while detections[0, i, j, 0] > conf_th: + score = detections[0, i, j, 0] + pt = (detections[0, i, j, 1:] * scale).cpu().numpy() + bbox = (pt[0], pt[1], pt[2], pt[3], score) + bboxes = np.vstack((bboxes, bbox)) + j += 1 + + keep = nms_(bboxes, 0.1) + bboxes = bboxes[keep] + + return bboxes diff --git a/syncnet_python-master/detectors/s3fd/__pycache__/__init__.cpython-39.pyc b/syncnet_python-master/detectors/s3fd/__pycache__/__init__.cpython-39.pyc new file mode 100755 index 00000000..4e8df576 Binary files /dev/null and b/syncnet_python-master/detectors/s3fd/__pycache__/__init__.cpython-39.pyc differ diff --git a/syncnet_python-master/detectors/s3fd/__pycache__/box_utils.cpython-39.pyc b/syncnet_python-master/detectors/s3fd/__pycache__/box_utils.cpython-39.pyc new file mode 100755 index 
00000000..1938a8eb Binary files /dev/null and b/syncnet_python-master/detectors/s3fd/__pycache__/box_utils.cpython-39.pyc differ diff --git a/syncnet_python-master/detectors/s3fd/__pycache__/nets.cpython-39.pyc b/syncnet_python-master/detectors/s3fd/__pycache__/nets.cpython-39.pyc new file mode 100755 index 00000000..f83ad53f Binary files /dev/null and b/syncnet_python-master/detectors/s3fd/__pycache__/nets.cpython-39.pyc differ diff --git a/syncnet_python-master/detectors/s3fd/box_utils.py b/syncnet_python-master/detectors/s3fd/box_utils.py new file mode 100755 index 00000000..0779bcd5 --- /dev/null +++ b/syncnet_python-master/detectors/s3fd/box_utils.py @@ -0,0 +1,217 @@ +import numpy as np +from itertools import product as product +import torch +from torch.autograd import Function + + +def nms_(dets, thresh): + """ + Courtesy of Ross Girshick + [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(int(i)) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return np.array(keep).astype(np.int) + + +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat(( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. + """ + + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + 
h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count + + +class Detect(object): + + def __init__(self, num_classes=2, + top_k=750, nms_thresh=0.3, conf_thresh=0.05, + variance=[0.1, 0.2], nms_top_k=5000): + + self.num_classes = num_classes + self.top_k = top_k + self.nms_thresh = nms_thresh + self.conf_thresh = conf_thresh + self.variance = variance + self.nms_top_k = nms_top_k + + def forward(self, loc_data, conf_data, prior_data): + + num = loc_data.size(0) + num_priors = prior_data.size(0) + + conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) + batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) + batch_priors = batch_priors.contiguous().view(-1, 4) + + decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance) + decoded_boxes = decoded_boxes.view(num, num_priors, 4) + + output = torch.zeros(num, self.num_classes, self.top_k, 5) + + for i in range(num): + boxes = decoded_boxes[i].clone() + conf_scores = conf_preds[i].clone() + + for cl in range(1, self.num_classes): + c_mask = conf_scores[cl].gt(self.conf_thresh) + scores = conf_scores[cl][c_mask] + + if scores.dim() == 0: + continue + l_mask = c_mask.unsqueeze(1).expand_as(boxes) + boxes_ = boxes[l_mask].view(-1, 4) + ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k) + count = count if count < self.top_k else self.top_k + + output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) + + return output + + +class PriorBox(object): + + def __init__(self, input_size, feature_maps, + 
variance=[0.1, 0.2], + min_sizes=[16, 32, 64, 128, 256, 512], + steps=[4, 8, 16, 32, 64, 128], + clip=False): + + super(PriorBox, self).__init__() + + self.imh = input_size[0] + self.imw = input_size[1] + self.feature_maps = feature_maps + + self.variance = variance + self.min_sizes = min_sizes + self.steps = steps + self.clip = clip + + def forward(self): + mean = [] + for k, fmap in enumerate(self.feature_maps): + feath = fmap[0] + featw = fmap[1] + for i, j in product(range(feath), range(featw)): + f_kw = self.imw / self.steps[k] + f_kh = self.imh / self.steps[k] + + cx = (j + 0.5) / f_kw + cy = (i + 0.5) / f_kh + + s_kw = self.min_sizes[k] / self.imw + s_kh = self.min_sizes[k] / self.imh + + mean += [cx, cy, s_kw, s_kh] + + output = torch.FloatTensor(mean).view(-1, 4) + + if self.clip: + output.clamp_(max=1, min=0) + + return output diff --git a/syncnet_python-master/detectors/s3fd/nets.py b/syncnet_python-master/detectors/s3fd/nets.py new file mode 100755 index 00000000..85b5c82c --- /dev/null +++ b/syncnet_python-master/detectors/s3fd/nets.py @@ -0,0 +1,174 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from .box_utils import Detect, PriorBox + + +class L2Norm(nn.Module): + + def __init__(self, n_channels, scale): + super(L2Norm, self).__init__() + self.n_channels = n_channels + self.gamma = scale or None + self.eps = 1e-10 + self.weight = nn.Parameter(torch.Tensor(self.n_channels)) + self.reset_parameters() + + def reset_parameters(self): + init.constant_(self.weight, self.gamma) + + def forward(self, x): + norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps + x = torch.div(x, norm) + out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x + return out + + +class S3FDNet(nn.Module): + + def __init__(self, device='cuda'): + super(S3FDNet, self).__init__() + self.device = device + + self.vgg = nn.ModuleList([ + nn.Conv2d(3, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + 
nn.Conv2d(64, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(64, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(128, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2, ceil_mode=True), + + nn.Conv2d(256, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 1, 1), + nn.ReLU(inplace=True), + ]) + + self.L2Norm3_3 = L2Norm(256, 10) + self.L2Norm4_3 = L2Norm(512, 8) + self.L2Norm5_3 = L2Norm(512, 5) + + self.extras = nn.ModuleList([ + nn.Conv2d(1024, 256, 1, 1), + nn.Conv2d(256, 512, 3, 2, padding=1), + nn.Conv2d(512, 128, 1, 1), + nn.Conv2d(128, 256, 3, 2, padding=1), + ]) + + self.loc = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(1024, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(256, 4, 3, 1, padding=1), + ]) + + self.conf = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(1024, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(256, 2, 3, 1, padding=1), + ]) + + self.softmax = nn.Softmax(dim=-1) + self.detect = Detect() + + def forward(self, x): + size = x.size()[2:] + sources = list() + loc 
= list() + conf = list() + + for k in range(16): + x = self.vgg[k](x) + s = self.L2Norm3_3(x) + sources.append(s) + + for k in range(16, 23): + x = self.vgg[k](x) + s = self.L2Norm4_3(x) + sources.append(s) + + for k in range(23, 30): + x = self.vgg[k](x) + s = self.L2Norm5_3(x) + sources.append(s) + + for k in range(30, len(self.vgg)): + x = self.vgg[k](x) + sources.append(x) + + # apply extra layers and cache source layer outputs + for k, v in enumerate(self.extras): + x = F.relu(v(x), inplace=True) + if k % 2 == 1: + sources.append(x) + + # apply multibox head to source layers + loc_x = self.loc[0](sources[0]) + conf_x = self.conf[0](sources[0]) + + max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) + conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) + + loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) + conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) + + for i in range(1, len(sources)): + x = sources[i] + conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) + loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) + + features_maps = [] + for i in range(len(loc)): + feat = [] + feat += [loc[i].size(1), loc[i].size(2)] + features_maps += [feat] + + loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) + conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) + + with torch.no_grad(): + self.priorbox = PriorBox(size, features_maps) + self.priors = self.priorbox.forward() + + output = self.detect.forward( + loc.view(loc.size(0), -1, 4), + self.softmax(conf.view(conf.size(0), -1, 2)), + self.priors.type(type(x.data)).to(self.device) + ) + + return output diff --git a/syncnet_python-master/detectors/s3fd/weights/sfd_face.pth b/syncnet_python-master/detectors/s3fd/weights/sfd_face.pth new file mode 100755 index 00000000..7a577e1e --- /dev/null +++ b/syncnet_python-master/detectors/s3fd/weights/sfd_face.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d54a87c2b7543b64729c9a25eafd188da15fd3f6e02f0ecec76ae1b30d86c491 +size 89844381 diff --git a/syncnet_python-master/download_model.sh b/syncnet_python-master/download_model.sh new file mode 100755 index 00000000..3e3a9dc2 --- /dev/null +++ b/syncnet_python-master/download_model.sh @@ -0,0 +1,9 @@ +# SyncNet model + +mkdir data +wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model +wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi + +# For the pre-processing pipeline +mkdir detectors/s3fd/weights +wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth \ No newline at end of file diff --git a/syncnet_python-master/fid_log.txt b/syncnet_python-master/fid_log.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/img/ex1.jpg b/syncnet_python-master/img/ex1.jpg new file mode 100755 index 00000000..b20b57e1 Binary files /dev/null and b/syncnet_python-master/img/ex1.jpg differ diff --git a/syncnet_python-master/img/ex2.jpg b/syncnet_python-master/img/ex2.jpg new file mode 100755 index 00000000..851402cc Binary files /dev/null and b/syncnet_python-master/img/ex2.jpg differ diff --git a/syncnet_python-master/niqe_log.txt b/syncnet_python-master/niqe_log.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/out_video/Obama_Obama.mp4 b/syncnet_python-master/out_video/Obama_Obama.mp4 new file mode 100755 index 00000000..7febe228 --- /dev/null +++ b/syncnet_python-master/out_video/Obama_Obama.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dfc9a182c0b6bb497f5cb98d948c9fa24f0299f6161a12bd5c753ce15e7f44d +size 7483658 diff --git a/syncnet_python-master/psnr_log.txt b/syncnet_python-master/psnr_log.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/ref_video/Obama.mp4 
b/syncnet_python-master/ref_video/Obama.mp4 new file mode 100755 index 00000000..f14145dc --- /dev/null +++ b/syncnet_python-master/ref_video/Obama.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a01306b103de1ecdda339efa6f65aa34b90c5cc6d0f03d28beaddb6afba3dab +size 25431660 diff --git a/syncnet_python-master/requirements.txt b/syncnet_python-master/requirements.txt new file mode 100755 index 00000000..e2818374 --- /dev/null +++ b/syncnet_python-master/requirements.txt @@ -0,0 +1,13 @@ +numpy==1.23.0 +scenedetect==0.6.0 +scikit-image==0.24.0 +scikit-learn==1.6.0 +opencv-contrib-python +python_speech_features +pyworld +pyyaml +pytorch-lightning==1.9.0 # 确保与 torch 1.11.0 兼容 +imageio +opencv-python +scipy + diff --git a/syncnet_python-master/run_pipeline.py b/syncnet_python-master/run_pipeline.py new file mode 100755 index 00000000..f2589fb9 --- /dev/null +++ b/syncnet_python-master/run_pipeline.py @@ -0,0 +1,322 @@ +#!/usr/bin/python + +import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2 +import numpy as np +from shutil import rmtree + +import scenedetect +from scenedetect.video_manager import VideoManager +from scenedetect.scene_manager import SceneManager +from scenedetect.frame_timecode import FrameTimecode +from scenedetect.stats_manager import StatsManager +from scenedetect.detectors import ContentDetector + +from scipy.interpolate import interp1d +from scipy.io import wavfile +from scipy import signal + +from detectors import S3FD + +# ========== ========== ========== ========== +# # PARSE ARGS +# ========== ========== ========== ========== + +parser = argparse.ArgumentParser(description = "FaceTracker"); +parser.add_argument('--data_dir', type=str, default='/app/data/work', help='Output direcotry'); +parser.add_argument('--videofile', type=str, default='', help='Input video file'); +parser.add_argument('--reference', type=str, default='', help='Video reference'); +parser.add_argument('--facedet_scale', type=float, 
default=0.25, help='Scale factor for face detection'); +parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box'); +parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration'); +parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); +parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped'); +parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels'); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) +setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) + +# ========== ========== ========== ========== +# # IOU FUNCTION +# ========== ========== ========== ========== + +def bb_intersection_over_union(boxA, boxB): + + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + + interArea = max(0, xB - xA) * max(0, yB - yA) + + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) + + iou = interArea / float(boxAArea + boxBArea - interArea) + + return iou + +# ========== ========== ========== ========== +# # FACE TRACKING +# ========== ========== ========== ========== + +def track_shot(opt,scenefaces): + + iouThres = 0.5 # Minimum IOU between consecutive face detections + tracks = [] + + while True: + track = [] + for framefaces in scenefaces: + for face in framefaces: + if track == []: + track.append(face) + framefaces.remove(face) + elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det: + iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) + if iou > iouThres: + track.append(face) + framefaces.remove(face) + 
continue + else: + break + + if track == []: + break + elif len(track) > opt.min_track: + + framenum = np.array([ f['frame'] for f in track ]) + bboxes = np.array([np.array(f['bbox']) for f in track]) + + frame_i = np.arange(framenum[0],framenum[-1]+1) + + bboxes_i = [] + for ij in range(0,4): + interpfn = interp1d(framenum, bboxes[:,ij]) + bboxes_i.append(interpfn(frame_i)) + bboxes_i = np.stack(bboxes_i, axis=1) + + if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size: + tracks.append({'frame':frame_i,'bbox':bboxes_i}) + + return tracks + +# ========== ========== ========== ========== +# # VIDEO CROP AND SAVE +# ========== ========== ========== ========== + +def crop_video(opt,track,cropfile): + + flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) + flist.sort() + + fourcc = cv2.VideoWriter_fourcc(*'XVID') + vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224)) + + dets = {'x':[], 'y':[], 's':[]} + + for det in track['bbox']: + + dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x + dets['x'].append((det[0]+det[2])/2) # crop center y + + # Smooth detections + dets['s'] = signal.medfilt(dets['s'],kernel_size=13) + dets['x'] = signal.medfilt(dets['x'],kernel_size=13) + dets['y'] = signal.medfilt(dets['y'],kernel_size=13) + + for fidx, frame in enumerate(track['frame']): + + cs = opt.crop_scale + + bs = dets['s'][fidx] # Detection box size + bsi = int(bs*(1+2*cs)) # Pad videos by this amount + + image = cv2.imread(flist[frame]) + + frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110)) + my = dets['y'][fidx]+bsi # BBox center Y + mx = dets['x'][fidx]+bsi # BBox center X + + face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + + vOut.write(cv2.resize(face,(224,224))) + + audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav') + audiostart = 
(track['frame'][0])/opt.frame_rate + audioend = (track['frame'][-1]+1)/opt.frame_rate + + vOut.release() + + # ========== CROP AUDIO FILE ========== + + command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp)) + output = subprocess.call(command, shell=True, stdout=None) + + if output != 0: + pdb.set_trace() + + sample_rate, audio = wavfile.read(audiotmp) + + # ========== COMBINE AUDIO AND VIDEO FILES ========== + + command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile)) + output = subprocess.call(command, shell=True, stdout=None) + + if output != 0: + pdb.set_trace() + + print('Written %s'%cropfile) + + os.remove(cropfile+'t.avi') + + print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s']))) + + return {'track':track, 'proc_track':dets} + +# ========== ========== ========== ========== +# # FACE DETECTION +# ========== ========== ========== ========== + +def inference_video(opt): + + DET = S3FD(device='cuda') + + flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) + flist.sort() + + dets = [] + + for fidx, fname in enumerate(flist): + + start_time = time.time() + + image = cv2.imread(fname) + + image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale]) + + dets.append([]); + for bbox in bboxes: + dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) + + elapsed_time = time.time() - start_time + + print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time))) + + savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl') + + with open(savepath, 'wb') as fil: + pickle.dump(dets, fil) + + return dets + +# ========== ========== ========== ========== +# # SCENE DETECTION +# ========== ========== ========== ========== + +def 
scene_detect(opt): + + video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')]) + stats_manager = StatsManager() + scene_manager = SceneManager(stats_manager) + # Add ContentDetector algorithm (constructor takes detector options like threshold). + scene_manager.add_detector(ContentDetector()) + base_timecode = video_manager.get_base_timecode() + + video_manager.set_downscale_factor() + + video_manager.start() + + scene_manager.detect_scenes(frame_source=video_manager) + + scene_list = scene_manager.get_scene_list(base_timecode) + + savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl') + + if scene_list == []: + scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())] + + with open(savepath, 'wb') as fil: + pickle.dump(scene_list, fil) + + # print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list))) + + return scene_list + + +# ========== ========== ========== ========== +# # EXECUTE DEMO +# ========== ========== ========== ========== + +# ========== DELETE EXISTING DIRECTORIES ========== + +if os.path.exists(os.path.join(opt.work_dir,opt.reference)): + rmtree(os.path.join(opt.work_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.crop_dir,opt.reference)): + rmtree(os.path.join(opt.crop_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.avi_dir,opt.reference)): + rmtree(os.path.join(opt.avi_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.frames_dir,opt.reference)): + rmtree(os.path.join(opt.frames_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + +# ========== MAKE NEW DIRECTORIES ========== + +os.makedirs(os.path.join(opt.work_dir,opt.reference)) +os.makedirs(os.path.join(opt.crop_dir,opt.reference)) +os.makedirs(os.path.join(opt.avi_dir,opt.reference)) +os.makedirs(os.path.join(opt.frames_dir,opt.reference)) 
+os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + +# ========== CONVERT VIDEO AND EXTRACT FRAMES ========== + +command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi'))) +output = subprocess.call(command, shell=True, stdout=None) + +command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg'))) +output = subprocess.call(command, shell=True, stdout=None) + +command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'))) +output = subprocess.call(command, shell=True, stdout=None) + +# ========== FACE DETECTION ========== + +faces = inference_video(opt) + +# ========== SCENE DETECTION ========== + +scene = scene_detect(opt) + +# ========== FACE TRACKING ========== + +alltracks = [] +vidtracks = [] + +for shot in scene: + + if shot[1].frame_num - shot[0].frame_num >= opt.min_track : + alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num])) + +# ========== FACE TRACK CROP ========== + +for ii, track in enumerate(alltracks): + vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii))) + +# ========== SAVE RESULTS ========== + +savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl') + +with open(savepath, 'wb') as fil: + pickle.dump(vidtracks, fil) + +rmtree(os.path.join(opt.tmp_dir,opt.reference)) diff --git a/syncnet_python-master/run_syncnet.py b/syncnet_python-master/run_syncnet.py new file mode 100755 index 00000000..45099fd6 --- /dev/null +++ b/syncnet_python-master/run_syncnet.py @@ -0,0 +1,45 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess, pickle, os, gzip, glob + +from SyncNetInstance import * + +# ==================== PARSE ARGUMENT ==================== + 
+parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_dir', type=str, default='data/work', help=''); +parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) + + +# ==================== LOAD MODEL AND FILE LIST ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) +flist.sort() + +# ==================== GET OFFSETS ==================== + +dists = [] +for idx, fname in enumerate(flist): + offset, conf, dist = s.evaluate(opt,videofile=fname) + dists.append(dist) + +# ==================== PRINT RESULTS TO FILE ==================== + +with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: + pickle.dump(dists, fil) diff --git a/syncnet_python-master/run_visualise.py b/syncnet_python-master/run_visualise.py new file mode 100755 index 00000000..85d89253 --- /dev/null +++ b/syncnet_python-master/run_visualise.py @@ -0,0 +1,88 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import torch +import numpy +import time, pdb, argparse, subprocess, pickle, os, glob +import cv2 + +from scipy import signal + +# ==================== PARSE ARGUMENT ==================== + +parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--data_dir', type=str, default='data/work', help=''); 
+parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) +setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) + +# ==================== LOAD FILES ==================== + +with open(os.path.join(opt.work_dir,opt.reference,'tracks.pckl'), 'rb') as fil: + tracks = pickle.load(fil, encoding='latin1') + +with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'rb') as fil: + dists = pickle.load(fil, encoding='latin1') + +flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) +flist.sort() + +# ==================== SMOOTH FACES ==================== + +faces = [[] for i in range(len(flist))] + +for tidx, track in enumerate(tracks): + + mean_dists = numpy.mean(numpy.stack(dists[tidx],1),1) + minidx = numpy.argmin(mean_dists,0) + minval = mean_dists[minidx] + + fdist = numpy.stack([dist[minidx] for dist in dists[tidx]]) + fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=10) + + fconf = numpy.median(mean_dists) - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + for fidx, frame in enumerate(track['track']['frame'].tolist()) : + faces[frame].append({'track': tidx, 'conf':fconfm[fidx], 's':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) + +# ==================== ADD DETECTIONS TO VIDEO ==================== + +first_image = cv2.imread(flist[0]) + +fw = first_image.shape[1] +fh = first_image.shape[0] + +fourcc = cv2.VideoWriter_fourcc(*'XVID') +vOut = cv2.VideoWriter(os.path.join(opt.avi_dir,opt.reference,'video_only.avi'), fourcc, opt.frame_rate, 
(fw,fh)) + +for fidx, fname in enumerate(flist): + + image = cv2.imread(fname) + + for face in faces[fidx]: + + clr = max(min(face['conf']*25,255),0) + + cv2.rectangle(image,(int(face['x']-face['s']),int(face['y']-face['s'])),(int(face['x']+face['s']),int(face['y']+face['s'])),(0,clr,255-clr),3) + cv2.putText(image,'Track %d, Conf %.3f'%(face['track'],face['conf']), (int(face['x']-face['s']),int(face['y']-face['s'])),cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,255,255),2) + + vOut.write(image) + + print('Frame %d'%fidx) + +vOut.release() + +# ========== COMBINE AUDIO AND VIDEO FILES ========== + +command = ("ffmpeg -y -i %s -i %s -c:v copy -c:a copy %s" % (os.path.join(opt.avi_dir,opt.reference,'video_only.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'),os.path.join(opt.avi_dir,opt.reference,'video_out.avi'))) #-async 1 +output = subprocess.call(command, shell=True, stdout=None) + + diff --git a/syncnet_python-master/ssim_log.txt b/syncnet_python-master/ssim_log.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/tmp_dir/pyavi/wav2lip/audio.wav b/syncnet_python-master/tmp_dir/pyavi/wav2lip/audio.wav new file mode 100755 index 00000000..2d00aca4 --- /dev/null +++ b/syncnet_python-master/tmp_dir/pyavi/wav2lip/audio.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd7a5f093fc3f55a9bafdc161ada7923e1756288937eaada330a9ebc5ce0a828 +size 18270746 diff --git a/syncnet_python-master/tmp_dir/pyavi/wav2lip/video.avi b/syncnet_python-master/tmp_dir/pyavi/wav2lip/video.avi new file mode 100755 index 00000000..21dea003 Binary files /dev/null and b/syncnet_python-master/tmp_dir/pyavi/wav2lip/video.avi differ diff --git a/syncnet_python-master/tmp_dir/pycrop/wav2lip/00000.avi b/syncnet_python-master/tmp_dir/pycrop/wav2lip/00000.avi new file mode 100755 index 00000000..660f527c Binary files /dev/null and b/syncnet_python-master/tmp_dir/pycrop/wav2lip/00000.avi differ diff --git 
a/syncnet_python-master/tmp_dir/pyframes/wav2lip/Obama_Obama.mp4 b/syncnet_python-master/tmp_dir/pyframes/wav2lip/Obama_Obama.mp4 new file mode 100755 index 00000000..7febe228 --- /dev/null +++ b/syncnet_python-master/tmp_dir/pyframes/wav2lip/Obama_Obama.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dfc9a182c0b6bb497f5cb98d948c9fa24f0299f6161a12bd5c753ce15e7f44d +size 7483658 diff --git a/syncnet_python-master/tmp_dir/pytmp/wav2lip/Obama_Obama.mp4 b/syncnet_python-master/tmp_dir/pytmp/wav2lip/Obama_Obama.mp4 new file mode 100755 index 00000000..7febe228 --- /dev/null +++ b/syncnet_python-master/tmp_dir/pytmp/wav2lip/Obama_Obama.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dfc9a182c0b6bb497f5cb98d948c9fa24f0299f6161a12bd5c753ce15e7f44d +size 7483658 diff --git a/syncnet_python-master/tmp_dir/pywork/wav2lip/activesd.pckl b/syncnet_python-master/tmp_dir/pywork/wav2lip/activesd.pckl new file mode 100755 index 00000000..6eb466d5 Binary files /dev/null and b/syncnet_python-master/tmp_dir/pywork/wav2lip/activesd.pckl differ diff --git a/syncnet_python-master/tmp_dir/pywork/wav2lip/faces.pckl b/syncnet_python-master/tmp_dir/pywork/wav2lip/faces.pckl new file mode 100755 index 00000000..0a1e5f4b Binary files /dev/null and b/syncnet_python-master/tmp_dir/pywork/wav2lip/faces.pckl differ diff --git a/syncnet_python-master/tmp_dir/pywork/wav2lip/scene.pckl b/syncnet_python-master/tmp_dir/pywork/wav2lip/scene.pckl new file mode 100755 index 00000000..bedaf3d7 Binary files /dev/null and b/syncnet_python-master/tmp_dir/pywork/wav2lip/scene.pckl differ diff --git a/syncnet_python-master/tmp_dir/pywork/wav2lip/tracks.pckl b/syncnet_python-master/tmp_dir/pywork/wav2lip/tracks.pckl new file mode 100755 index 00000000..45374675 Binary files /dev/null and b/syncnet_python-master/tmp_dir/pywork/wav2lip/tracks.pckl differ