diff --git a/syncnet_python-master/Audio2Head/Audio2Head/README.md b/syncnet_python-master/Audio2Head/Audio2Head/README.md new file mode 100755 index 00000000..05a48609 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/README.md @@ -0,0 +1,49 @@ +# Audio2Head: Audio-driven One-shot Talking-head Generation with Natural Head Motion (IJCAI 2021) + +#### [Paper](https://www.ijcai.org/proceedings/2021/0152.pdf) | [Demo](https://www.youtube.com/watch?v=xvcBJ29l8rA) + +#### Requirements + +- Python 3.6 , Pytorch >= 1.6 and ffmpeg + +- Other requirements are listed in the 'requirements.txt' + + + +#### Pretrained Checkpoint + +Please download the pretrained checkpoint from [google-drive](https://drive.google.com/file/d/1tvI43ZIrnx9Ti2TpFiEO4dK5DOwcECD7/view?usp=sharing) and put it within the folder (`/checkpoints`). + + + +#### Generate Demo Results + +``` +python inference.py --audio_path xxx.wav --img_path xxx.jpg +``` + +Note that the input images must keep the same height and width and the face should be appropriately cropped as in `/demo/img`. + + + +#### License and Citation + +``` +@InProceedings{wang2021audio2head, +author = {Wang, Suzhen and Li, Lincheng and Ding, Yu and Fan, Changjie and Yu, Xin}, +title = {Audio2Head: Audio-driven One-shot Talking-head Generation with Natural Head Motion}, +booktitle = {the 30th International Joint Conference on Artificial Intelligence (IJCAI-21)}, +year = {2021}, +} +``` + + + +#### Acknowledgement + +This codebase is based on [First Order Motion Model](https://github.com/AliaksandrSiarohin/first-order-model), thanks for their contribution. 
+ + + + + diff --git a/syncnet_python-master/Audio2Head/Audio2Head/config/parameters.yaml b/syncnet_python-master/Audio2Head/Audio2Head/config/parameters.yaml new file mode 100755 index 00000000..7d5a97fe --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/config/parameters.yaml @@ -0,0 +1,8 @@ +block_expansion: 32 +estimate_jacobian: true +max_features: 512 +num_blocks: 5 +num_kp: 10 +num_w: 2 +seq: true +seq_len: 64 \ No newline at end of file diff --git a/syncnet_python-master/Audio2Head/Audio2Head/config/vox-256.yaml b/syncnet_python-master/Audio2Head/Audio2Head/config/vox-256.yaml new file mode 100755 index 00000000..b8399646 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/config/vox-256.yaml @@ -0,0 +1,83 @@ +dataset_params: + root_dir: /root/ + frame_shape: [256, 256, 3] + id_sampling: True + pairs_list: data/vox256.csv + augmentation_params: + flip_param: + horizontal_flip: True + time_flip: True + jitter_param: + brightness: 0.1 + contrast: 0.1 + saturation: 0.1 + hue: 0.1 + + +model_params: + common_params: + num_kp: 10 + num_channels: 3 + estimate_jacobian: True + kp_detector_params: + temperature: 0.1 + block_expansion: 32 + max_features: 1024 + scale_factor: 0.25 + num_blocks: 5 + generator_params: + block_expansion: 64 + max_features: 512 + num_down_blocks: 2 + num_bottleneck_blocks: 6 + estimate_occlusion_map: True + dense_motion_params: + block_expansion: 64 + max_features: 1024 + num_blocks: 5 + scale_factor: 0.25 + discriminator_params: + scales: [1] + block_expansion: 32 + max_features: 512 + num_blocks: 4 + sn: True + +train_params: + num_epochs: 100 + num_repeats: 50 + epoch_milestones: [5, 20, 30] + lr_generator: 2.0e-4 + lr_discriminator: 2.0e-4 + lr_kp_detector: 2.0e-4 + batch_size: 36 + scales: [1, 0.5, 0.25, 0.125] + checkpoint_freq: 10 + transform_params: + sigma_affine: 0.05 + sigma_tps: 0.005 + points_tps: 5 + loss_weights: + generator_gan: 0 + discriminator_gan: 1 + feature_matching: [10, 10, 10, 10] + 
perceptual: [10, 10, 10, 10, 10] + equivariance_value: 10 + equivariance_jacobian: 10 + +reconstruction_params: + num_videos: 1000 + format: '.mp4' + +animate_params: + num_pairs: 50 + format: '.mp4' + normalization_params: + adapt_movement_scale: False + use_relative_movement: True + use_relative_jacobian: True + +visualizer_params: + kp_size: 5 + draw_border: True + colormap: 'gist_rainbow' diff --git a/syncnet_python-master/Audio2Head/Audio2Head/inference.py b/syncnet_python-master/Audio2Head/Audio2Head/inference.py new file mode 100755 index 00000000..771e7b21 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/inference.py @@ -0,0 +1,273 @@ +import argparse +import subprocess +import python_speech_features +from scipy.io import wavfile +from scipy.interpolate import interp1d +import numpy as np +import pyworld +import torch +from modules.audio2pose import get_pose_from_audio +from skimage import io, img_as_float32 +import cv2 +from modules.generator import OcclusionAwareGenerator +from modules.keypoint_detector import KPDetector +from modules.audio2kp import AudioModel3D +import yaml, os, imageio + +def draw_annotation_box(image, rotation_vector, translation_vector, color=(255, 255, 255), line_width=2): + """Draw a 3D box as annotation of pose""" + + camera_matrix = np.array( + [[233.333, 0, 128], + [0, 233.333, 128], + [0, 0, 1]], dtype="double") + + dist_coeefs = np.zeros((4, 1)) + + point_3d = [] + rear_size = 75 + rear_depth = 0 + point_3d.append((-rear_size, -rear_size, rear_depth)) + point_3d.append((-rear_size, rear_size, rear_depth)) + point_3d.append((rear_size, rear_size, rear_depth)) + point_3d.append((rear_size, -rear_size, rear_depth)) + point_3d.append((-rear_size, -rear_size, rear_depth)) + + front_size = 100 + front_depth = 100 + point_3d.append((-front_size, -front_size, front_depth)) + point_3d.append((-front_size, front_size, front_depth)) + point_3d.append((front_size, front_size, front_depth)) + point_3d.append((front_size, 
-front_size, front_depth)) + point_3d.append((-front_size, -front_size, front_depth)) + point_3d = np.array(point_3d, dtype=float).reshape(-1, 3) # 如果需要使用 NumPy 的 float64 类型,可以改成 np.float64 + + # Map to 2d image points + (point_2d, _) = cv2.projectPoints(point_3d, + rotation_vector, + translation_vector, + camera_matrix, + dist_coeefs) + point_2d = np.int32(point_2d.reshape(-1, 2)) + + # Draw all the lines + cv2.polylines(image, [point_2d], True, color, line_width, cv2.LINE_AA) + cv2.line(image, tuple(point_2d[1]), tuple( + point_2d[6]), color, line_width, cv2.LINE_AA) + cv2.line(image, tuple(point_2d[2]), tuple( + point_2d[7]), color, line_width, cv2.LINE_AA) + cv2.line(image, tuple(point_2d[3]), tuple( + point_2d[8]), color, line_width, cv2.LINE_AA) + +def inter_pitch(y, y_flag): + frame_num = y.shape[0] + i = 0 + last = -1 + while(i < frame_num): + if y_flag[i] == 0: + while True: + if y_flag[i] == 0: + if i == frame_num-1: + if last != -1: + y[last+1:] = y[last] + i += 1 + break + i += 1 + else: + break + if i >= frame_num: + break + elif last == -1: + y[:i] = y[i] + else: + inter_num = i - last + 1 + fy = np.array([y[last], y[i]]) + fx = np.linspace(0, 1, num=2) + f = interp1d(fx, fy) + fx_new = np.linspace(0, 1, inter_num) + fy_new = f(fx_new) + y[last+1:i] = fy_new[1:-1] + last = i + i += 1 + else: + last = i + i += 1 + return y + +def get_audio_feature_from_audio(audio_path, norm=True): + sample_rate, audio = wavfile.read(audio_path) + if len(audio.shape) == 2: + if np.min(audio[:, 0]) <= 0: + audio = audio[:, 1] + else: + audio = audio[:, 0] + if norm: + audio = audio - np.mean(audio) + audio = audio / np.max(np.abs(audio)) + a = python_speech_features.mfcc(audio, sample_rate) + b = python_speech_features.logfbank(audio, sample_rate) + c, _ = pyworld.harvest(audio, sample_rate, frame_period=10) + c_flag = (c == 0.0) ^ 1 + c = inter_pitch(c, c_flag) + c = np.expand_dims(c, axis=1) + c_flag = np.expand_dims(c_flag, axis=1) + frame_num = np.min([a.shape[0], 
b.shape[0], c.shape[0]]) + + cat = np.concatenate([a[:frame_num], b[:frame_num], c[:frame_num], c_flag[:frame_num]], axis=1) + return cat + +def audio2head(audio_path, img_path, model_path, save_path): + temp_audio = "./results/temp.wav" + # 使用 ffmpeg 将输入音频转换为指定格式 + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (audio_path, temp_audio)) + output = subprocess.call(command, shell=True, stdout=None) + + # 读取转换后的音频特征 + audio_feature = get_audio_feature_from_audio(temp_audio) + frames = len(audio_feature) // 4 + + # 读取并处理图片 + img = io.imread(img_path)[:, :, :3] # 读取图片,保留前三个通道(RGB) + img = cv2.resize(img, (256, 256)) # 调整图片大小 + + img = np.array(img_as_float32(img)) + img = img.transpose((2, 0, 1)) + img = torch.from_numpy(img).unsqueeze(0).cuda() + + # 获取参考姿态 + ref_pose_rot, ref_pose_trans = get_pose_from_audio(img, audio_feature, model_path) + torch.cuda.empty_cache() + + # 加载配置文件 + config_file = r"./config/vox-256.yaml" + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + kp_detector = KPDetector(**config['model_params']['kp_detector_params'], + **config['model_params']['common_params']) + generator = OcclusionAwareGenerator(**config['model_params']['generator_params'], + **config['model_params']['common_params']) + kp_detector = kp_detector.cuda() + generator = generator.cuda() + + # 加载参数 + opt = argparse.Namespace(**yaml.load(open("./config/parameters.yaml"), Loader=yaml.FullLoader)) + audio2kp = AudioModel3D(opt).cuda() + + # 加载预训练模型 + checkpoint = torch.load(model_path) + kp_detector.load_state_dict(checkpoint["kp_detector"]) + generator.load_state_dict(checkpoint["generator"]) + audio2kp.load_state_dict(checkpoint["audio2kp"]) + + generator.eval() + kp_detector.eval() + audio2kp.eval() + + # 准备音频和姿态数据 + audio_f = [] + poses = [] + pad = np.zeros((4, 41), dtype=np.float32) + for i in range(0, frames, opt.seq_len // 2): + temp_audio = [] + temp_pos = [] + for j in range(opt.seq_len): + if i + j < frames: 
+ temp_audio.append(audio_feature[(i+j)*4:(i+j)*4+4]) + trans = ref_pose_trans[i + j] + rot = ref_pose_rot[i + j] + else: + temp_audio.append(pad) + trans = ref_pose_trans[-1] + rot = ref_pose_rot[-1] + + pose = np.zeros([256, 256]) + draw_annotation_box(pose, np.array(rot), np.array(trans)) + temp_pos.append(pose) + audio_f.append(temp_audio) + poses.append(temp_pos) + + audio_f = torch.from_numpy(np.array(audio_f, dtype=np.float32)).unsqueeze(0) + poses = torch.from_numpy(np.array(poses, dtype=np.float32)).unsqueeze(0) + + bs = audio_f.shape[1] + predictions_gen = [] + total_frames = 0 + + for bs_idx in range(bs): + t = {} + + t["audio"] = audio_f[:, bs_idx].cuda() + t["pose"] = poses[:, bs_idx].cuda() + t["id_img"] = img + kp_gen_source = kp_detector(img) + + gen_kp = audio2kp(t) + if bs_idx == 0: + startid = 0 + end_id = opt.seq_len // 4 * 3 + else: + startid = opt.seq_len // 4 + end_id = opt.seq_len // 4 * 3 + + for frame_bs_idx in range(startid, end_id): + tt = {} + tt["value"] = gen_kp["value"][:, frame_bs_idx] + if opt.estimate_jacobian: + tt["jacobian"] = gen_kp["jacobian"][:, frame_bs_idx] + out_gen = generator(img, kp_source=kp_gen_source, kp_driving=tt) + out_gen["kp_source"] = kp_gen_source + out_gen["kp_driving"] = tt + del out_gen['sparse_deformed'] + del out_gen['occlusion_map'] + del out_gen['deformed'] + predictions_gen.append( + (np.transpose(out_gen['prediction'].data.cpu().numpy(), [0, 2, 3, 1])[0] * 255).astype(np.uint8)) + + total_frames += 1 + if total_frames >= frames: + break + if total_frames >= frames: + break + + log_dir = save_path + temp_dir = os.path.join(log_dir, "temp") + if not os.path.exists(temp_dir): + os.makedirs(temp_dir) + image_name = os.path.basename(img_path)[:-4] + "_" + os.path.basename(audio_path)[:-4] + ".mp4" + + video_path = os.path.join(temp_dir, image_name) + + # 生成视频文件 + imageio.mimsave(video_path, predictions_gen, format='FFMPEG', fps=25.0) + + # 将音频合并到视频中 + save_video = os.path.join(log_dir, image_name) + cmd = 
r'ffmpeg -y -i "%s" -i "%s" -vcodec copy "%s"' % (video_path, audio_path, save_video) + os.system(cmd) + os.remove(video_path) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="将音频和图片转换为视频") + + # 修改音频和图片路径,固定在input_wav和input_img文件夹下 + parser.add_argument("--audio_filename", default="intro.wav", help="音频文件名,位于 ./input_wav/ 目录下") + parser.add_argument("--img_filename", default="paint.jpg", help="图片文件名,位于 ./input_img/ 目录下") + # 修改 save_path 的默认值为上两级目录的 out_video 文件夹 + parser.add_argument("--save_path", default=os.path.join("..", "..", "out_video"), help="保存路径") + parser.add_argument("--model_path", default=r"/app/Audio2Head/Audio2Head/checkpoints/audio2head.pth.tar", help="预训练模型路径") + + parse = parser.parse_args() + + # 构建完整的音频和图片路径 + audio_path = os.path.join("/app/Audio2Head/Audio2Head/input_wav", parse.audio_filename) + img_path = os.path.join("/app/Audio2Head/Audio2Head/input_img", parse.img_filename) + + + # 检查文件是否存在 + if not os.path.isfile(audio_path): + raise FileNotFoundError(f"音频文件未找到: {audio_path}") + if not os.path.isfile(img_path): + raise FileNotFoundError(f"图片文件未找到: {img_path}") + + os.makedirs(parse.save_path, exist_ok=True) + audio2head(audio_path, img_path, parse.model_path, parse.save_path) diff --git a/syncnet_python-master/Audio2Head/Audio2Head/input_img/Obama1.png b/syncnet_python-master/Audio2Head/Audio2Head/input_img/Obama1.png new file mode 100755 index 00000000..a2cb63ee Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/input_img/Obama1.png differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/input_wav/Obama1.wav b/syncnet_python-master/Audio2Head/Audio2Head/input_wav/Obama1.wav new file mode 100755 index 00000000..0d635d30 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/input_wav/Obama1.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aec27cf98ccf6a010ca1ed9fcadfaf1ba8f30734e94213ea610ad403d14962ad +size 38478798 diff --git 
a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2kp.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2kp.cpython-310.pyc new file mode 100755 index 00000000..432f67d2 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2kp.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2pose.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2pose.cpython-310.pyc new file mode 100755 index 00000000..fa213b83 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/audio2pose.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/dense_motion.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/dense_motion.cpython-310.pyc new file mode 100755 index 00000000..b075d905 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/dense_motion.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/generator.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/generator.cpython-310.pyc new file mode 100755 index 00000000..3a51ef4b Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/generator.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/keypoint_detector.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/keypoint_detector.cpython-310.pyc new file mode 100755 index 00000000..3490cef9 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/keypoint_detector.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/resnet.cpython-310.pyc 
b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/resnet.cpython-310.pyc new file mode 100755 index 00000000..5e6d2832 Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/resnet.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/util.cpython-310.pyc b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/util.cpython-310.pyc new file mode 100755 index 00000000..ee3a692b Binary files /dev/null and b/syncnet_python-master/Audio2Head/Audio2Head/modules/__pycache__/util.cpython-310.pyc differ diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2kp.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2kp.py new file mode 100755 index 00000000..c7a61c17 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2kp.py @@ -0,0 +1,82 @@ +from torch import nn +import torch +import torch.nn.functional as F +from modules.util import AntiAliasInterpolation2d +from modules.util import Hourglass3D + +from modules.util import gaussian2kp +from sync_batchnorm import SynchronizedBatchNorm2d as BatchNorm2d + + +class AudioModel3D(nn.Module): + def __init__(self,opt): + super(AudioModel3D,self).__init__() + self.opt = opt + self.seq_len = opt.seq_len + self.pad = 0 + + self.down_id = AntiAliasInterpolation2d(3,0.25) + self.down_pose = AntiAliasInterpolation2d(opt.seq_len,0.25) + + self.embedding = nn.Sequential(nn.ConvTranspose2d(1, 8, (29, 14), stride=(1, 1), padding=(0, 11)), + BatchNorm2d(8), + nn.ReLU(inplace=True), + nn.Conv2d(8, 2, (13, 13), stride=(1, 1), padding=(6, 6))) + + num_channels = 6 + self.predictor = Hourglass3D(opt.block_expansion, in_features=num_channels, + max_features=opt.max_features, num_blocks=opt.num_blocks) + + self.kp = nn.Conv3d(in_channels=self.predictor.out_filters, out_channels=opt.num_kp, kernel_size=(7, 7, 7), + padding=(3,0,0)) + if opt.estimate_jacobian: + self.num_jacobian_maps = 
opt.num_kp + self.jacobian = nn.Conv2d(in_channels=self.predictor.out_filters, + out_channels=4 * self.num_jacobian_maps, kernel_size=(7, 7), padding=(0,0)) + self.jacobian.weight.data.zero_() + self.jacobian.bias.data.copy_(torch.tensor([1, 0, 0, 1] * self.num_jacobian_maps, dtype=torch.float)) + else: + self.jacobian = None + + self.temperature = 0.1 + + + def forward(self, x): + bs,_,_,c_dim = x["audio"].shape + + audio_embedding = self.embedding(x["audio"].reshape(-1,1,4,c_dim)) + audio_embedding = F.interpolate(audio_embedding,scale_factor=2).reshape(bs,self.opt.seq_len,2,64,64).permute(0,2,1,3,4) + + id_feature = self.down_id(x["id_img"]) + pose_feature = self.down_pose(x["pose"]) + + embeddings = torch.cat([audio_embedding,id_feature.unsqueeze(2).repeat(1,1,self.opt.seq_len,1,1),pose_feature.unsqueeze(1)],dim=1) + + feature_map = self.predictor(embeddings) + feature_shape = feature_map.shape + prediction = self.kp(feature_map).permute(0,2,1,3,4) + prediction = prediction.reshape(-1,prediction.shape[2],prediction.shape[3],prediction.shape[4]) + final_shape = prediction.shape + heatmap = prediction.view(final_shape[0], final_shape[1], -1) + heatmap = F.softmax(heatmap / self.temperature, dim=2) + heatmap = heatmap.view(*final_shape) + + out = gaussian2kp(heatmap) + out["value"] = out["value"].reshape(-1,self.opt.seq_len,self.opt.num_kp,2) + if self.jacobian is not None: + jacobian_map = self.jacobian(feature_map.permute(0,2,1,3,4).reshape(-1, feature_shape[1],feature_shape[3],feature_shape[4])) + + jacobian_map = jacobian_map.reshape(final_shape[0], self.num_jacobian_maps, 4, final_shape[2], + final_shape[3]) + out["jacobian_map"] = jacobian_map + heatmap = heatmap.unsqueeze(2) + + jacobian = heatmap * jacobian_map + jacobian = jacobian.view(final_shape[0], final_shape[1], 4, -1) + jacobian = jacobian.sum(dim=-1) + jacobian = jacobian.view(jacobian.shape[0], jacobian.shape[1], 2, 2) + out['jacobian'] = jacobian.reshape(-1,self.seq_len,self.opt.num_kp,2,2) + + 
out["pred_fature"] = prediction + return out + diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2pose.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2pose.py new file mode 100755 index 00000000..1d09a17a --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/audio2pose.py @@ -0,0 +1,61 @@ +import torch +import torch.nn as nn +from modules.util import MyResNet34 +import numpy as np + +class audio2poseLSTM(nn.Module): + def __init__(self): + super(audio2poseLSTM,self).__init__() + + self.em_audio = MyResNet34(256, 1) + self.em_img = MyResNet34(256, 3) + + self.lstm = nn.LSTM(512,256,num_layers=2,bias=True,batch_first=True) + self.output = nn.Linear(256,6) + + def forward(self,x): + img_em = self.em_img(x['img']) + result = [self.output(img_em).unsqueeze(1)] + bs,seqlen,_,_ = x["audio"].shape + zero_state = torch.zeros((2,bs,256),requires_grad=True).to(img_em.device) + cur_state = (zero_state,zero_state) + audio = x["audio"].reshape(-1, 1, 4, 41) + audio_em = self.em_audio(audio).reshape(bs, seqlen, 256) + for i in range(seqlen): + + img_em,cur_state = self.lstm(torch.cat((audio_em[:,i:i+1],img_em.unsqueeze(1)),dim=2),cur_state) + img_em = img_em.reshape(-1, 256) + result.append(self.output(img_em).unsqueeze(1)) + res = torch.cat(result,dim=1) + return res + +def get_pose_from_audio(img,audio,model_path): + num_frame = len(audio) // 4 + minv = np.array([-0.639, -0.501, -0.47, -102.6, -32.5, 184.6], dtype=np.float32) + maxv = np.array([0.411, 0.547, 0.433, 159.1, 116.5, 376.5], dtype=np.float32) + + + generator = audio2poseLSTM().cuda() + + ckpt_para = torch.load(model_path) + + generator.load_state_dict(ckpt_para["audio2pose"]) + generator.eval() + + audio_seq = [] + for i in range(num_frame): + audio_seq.append(audio[i*4:i*4+4]) + + audio = torch.from_numpy(np.array(audio_seq,dtype=np.float32)).unsqueeze(0).cuda() + + x = {} + x["img"] = img + x["audio"] = audio + poses = generator(x) + + print(poses.shape) + poses 
= poses.cpu().data.numpy()[0] + + poses = (poses+1)/2*(maxv-minv)+minv + rot,trans = poses[:,:3].copy(),poses[:,3:].copy() + return rot,trans \ No newline at end of file diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/dense_motion.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/dense_motion.py new file mode 100755 index 00000000..2e32b4c8 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/dense_motion.py @@ -0,0 +1,113 @@ +from torch import nn +import torch.nn.functional as F +import torch +from modules.util import Hourglass, AntiAliasInterpolation2d, make_coordinate_grid, kp2gaussian + +class DenseMotionNetwork(nn.Module): + """ + Module that predicting a dense motion from sparse motion representation given by kp_source and kp_driving + """ + + def __init__(self, block_expansion, num_blocks, max_features, num_kp, num_channels, estimate_occlusion_map=False, + scale_factor=1, kp_variance=0.01): + super(DenseMotionNetwork, self).__init__() + self.hourglass = Hourglass(block_expansion=block_expansion, in_features=(num_kp + 1) * (num_channels + 1), + max_features=max_features, num_blocks=num_blocks) + + self.mask = nn.Conv2d(self.hourglass.out_filters, num_kp + 1, kernel_size=(7, 7), padding=(3, 3)) + + if estimate_occlusion_map: + self.occlusion = nn.Conv2d(self.hourglass.out_filters, 1, kernel_size=(7, 7), padding=(3, 3)) + else: + self.occlusion = None + + self.num_kp = num_kp + self.scale_factor = scale_factor + self.kp_variance = kp_variance + + if self.scale_factor != 1: + self.down = AntiAliasInterpolation2d(num_channels, self.scale_factor) + + def create_heatmap_representations(self, source_image, kp_driving, kp_source): + """ + Eq 6. 
in the paper H_k(z) + """ + spatial_size = source_image.shape[2:] + gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=self.kp_variance) + gaussian_source = kp2gaussian(kp_source, spatial_size=spatial_size, kp_variance=self.kp_variance) + heatmap = gaussian_driving - gaussian_source + + #adding background feature + zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], spatial_size[1]).type(heatmap.type()) + heatmap = torch.cat([zeros, heatmap], dim=1) + heatmap = heatmap.unsqueeze(2) + return heatmap + + def create_sparse_motions(self, source_image, kp_driving, kp_source): + """ + Eq 4. in the paper T_{s<-d}(z) + """ + bs, _, h, w = source_image.shape + identity_grid = make_coordinate_grid((h, w), type=kp_source['value'].type()) + identity_grid = identity_grid.view(1, 1, h, w, 2) + coordinate_grid = identity_grid - kp_driving['value'].view(bs, self.num_kp, 1, 1, 2) + if 'jacobian' in kp_driving: + jacobian = torch.matmul(kp_source['jacobian'], torch.inverse(kp_driving['jacobian'])) + jacobian = jacobian.unsqueeze(-3).unsqueeze(-3) + jacobian = jacobian.repeat(1, 1, h, w, 1, 1) + coordinate_grid = torch.matmul(jacobian, coordinate_grid.unsqueeze(-1)) + coordinate_grid = coordinate_grid.squeeze(-1) + + driving_to_source = coordinate_grid + kp_source['value'].view(bs, self.num_kp, 1, 1, 2) + + #adding background feature + identity_grid = identity_grid.repeat(bs, 1, 1, 1, 1) + sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1) + return sparse_motions + + def create_deformed_source_image(self, source_image, sparse_motions): + """ + Eq 7. 
in the paper \hat{T}_{s<-d}(z) + """ + bs, _, h, w = source_image.shape + source_repeat = source_image.unsqueeze(1).unsqueeze(1).repeat(1, self.num_kp + 1, 1, 1, 1, 1) + source_repeat = source_repeat.view(bs * (self.num_kp + 1), -1, h, w) + sparse_motions = sparse_motions.view((bs * (self.num_kp + 1), h, w, -1)) + sparse_deformed = F.grid_sample(source_repeat, sparse_motions) + # sparse_deformed = F.grid_sample(source_repeat, sparse_motions,align_corners = False) + sparse_deformed = sparse_deformed.view((bs, self.num_kp + 1, -1, h, w)) + return sparse_deformed + + def forward(self, source_image, kp_driving, kp_source): + if self.scale_factor != 1: + source_image = self.down(source_image) + + bs, _, h, w = source_image.shape + + out_dict = dict() + heatmap_representation = self.create_heatmap_representations(source_image, kp_driving, kp_source)#bs*(numkp+1)*1*h*w + sparse_motion = self.create_sparse_motions(source_image, kp_driving, kp_source)#bs*(numkp+1)*h*w*2 + deformed_source = self.create_deformed_source_image(source_image, sparse_motion) + out_dict['sparse_deformed'] = deformed_source + + input = torch.cat([heatmap_representation, deformed_source], dim=2)#bs*num+1*4*w*h + input = input.view(bs, -1, h, w) + + prediction = self.hourglass(input) + + mask = self.mask(prediction) + mask = F.softmax(mask, dim=1) + out_dict['mask'] = mask + mask = mask.unsqueeze(2)#bs*numkp+1*1*h*w + sparse_motion = sparse_motion.permute(0, 1, 4, 2, 3) + deformation = (sparse_motion * mask).sum(dim=1)# bs,2,64,64 + deformation = deformation.permute(0, 2, 3, 1)#bs*h*w*2 + + out_dict['deformation'] = deformation + + # Sec. 
3.2 in the paper + if self.occlusion: + occlusion_map = torch.sigmoid(self.occlusion(prediction)) + out_dict['occlusion_map'] = occlusion_map + + return out_dict diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/generator.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/generator.py new file mode 100755 index 00000000..6999b39d --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/generator.py @@ -0,0 +1,99 @@ +import torch +from torch import nn +import torch.nn.functional as F +from modules.util import ResBlock2d, SameBlock2d, UpBlock2d, DownBlock2d +from modules.dense_motion import DenseMotionNetwork + + +class OcclusionAwareGenerator(nn.Module): + """ + Generator that given source image and and keypoints try to transform image according to movement trajectories + induced by keypoints. Generator follows Johnson architecture. + """ + + def __init__(self, num_channels, num_kp, block_expansion, max_features, num_down_blocks, + num_bottleneck_blocks, estimate_occlusion_map=False, dense_motion_params=None, estimate_jacobian=False): + super(OcclusionAwareGenerator, self).__init__() + + if dense_motion_params is not None: + self.dense_motion_network = DenseMotionNetwork(num_kp=num_kp, num_channels=num_channels, + estimate_occlusion_map=estimate_occlusion_map, + **dense_motion_params) + else: + self.dense_motion_network = None + + self.first = SameBlock2d(num_channels, block_expansion, kernel_size=(7, 7), padding=(3, 3)) + + down_blocks = [] + for i in range(num_down_blocks): + in_features = min(max_features, block_expansion * (2 ** i)) + out_features = min(max_features, block_expansion * (2 ** (i + 1))) + down_blocks.append(DownBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1))) + self.down_blocks = nn.ModuleList(down_blocks) + + up_blocks = [] + for i in range(num_down_blocks): + in_features = min(max_features, block_expansion * (2 ** (num_down_blocks - i))) + out_features = min(max_features, block_expansion 
* (2 ** (num_down_blocks - i - 1))) + up_blocks.append(UpBlock2d(in_features, out_features, kernel_size=(3, 3), padding=(1, 1))) + self.up_blocks = nn.ModuleList(up_blocks) + + self.bottleneck = torch.nn.Sequential() + in_features = min(max_features, block_expansion * (2 ** num_down_blocks)) + for i in range(num_bottleneck_blocks): + self.bottleneck.add_module('r' + str(i), ResBlock2d(in_features, kernel_size=(3, 3), padding=(1, 1))) + + self.final = nn.Conv2d(block_expansion, num_channels, kernel_size=(7, 7), padding=(3, 3)) + self.estimate_occlusion_map = estimate_occlusion_map + self.num_channels = num_channels + + def deform_input(self, inp, deformation): + _, h_old, w_old, _ = deformation.shape + _, _, h, w = inp.shape + if h_old != h or w_old != w: + deformation = deformation.permute(0, 3, 1, 2) + deformation = F.interpolate(deformation, size=(h, w), mode='bilinear') + deformation = deformation.permute(0, 2, 3, 1) + return F.grid_sample(inp, deformation) + # return F.grid_sample(inp, deformation,align_corners = False) + + def forward(self, source_image, kp_driving, kp_source): + # Encoding (downsampling) part + out = self.first(source_image) + for i in range(len(self.down_blocks)): + out = self.down_blocks[i](out) + + # Transforming feature representation according to deformation and occlusion + output_dict = {} + if self.dense_motion_network is not None: + dense_motion = self.dense_motion_network(source_image=source_image, kp_driving=kp_driving, + kp_source=kp_source) + output_dict['mask'] = dense_motion['mask'] + output_dict['sparse_deformed'] = dense_motion['sparse_deformed'] + output_dict['deformation'] = dense_motion['deformation'] + + if 'occlusion_map' in dense_motion: + occlusion_map = dense_motion['occlusion_map'] + output_dict['occlusion_map'] = occlusion_map + else: + occlusion_map = None + deformation = dense_motion['deformation'] + out = self.deform_input(out, deformation) + + if occlusion_map is not None: + if out.shape[2] != 
occlusion_map.shape[2] or out.shape[3] != occlusion_map.shape[3]: + occlusion_map = F.interpolate(occlusion_map, size=out.shape[2:], mode='bilinear') + out = out * occlusion_map + + output_dict["deformed"] = self.deform_input(source_image, deformation) + + # Decoding part + out = self.bottleneck(out) + for i in range(len(self.up_blocks)): + out = self.up_blocks[i](out) + out = self.final(out) + out = F.sigmoid(out) + + output_dict["prediction"] = out + + return output_dict diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/keypoint_detector.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/keypoint_detector.py new file mode 100755 index 00000000..15ab08a3 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/keypoint_detector.py @@ -0,0 +1,77 @@ +from torch import nn +import torch +import torch.nn.functional as F +from modules.util import Hourglass, make_coordinate_grid, AntiAliasInterpolation2d + + +class KPDetector(nn.Module): + """ + Detecting a keypoints. Return keypoint position and jacobian near each keypoint. 
+ """ + + def __init__(self, block_expansion, num_kp, num_channels, max_features, + num_blocks, temperature, estimate_jacobian=False, scale_factor=1, + single_jacobian_map=False, pad=0): + super(KPDetector, self).__init__() + + self.predictor = Hourglass(block_expansion, in_features=num_channels, + max_features=max_features, num_blocks=num_blocks) + + self.kp = nn.Conv2d(in_channels=self.predictor.out_filters, out_channels=num_kp, kernel_size=(7, 7), + padding=pad) + + if estimate_jacobian: + self.num_jacobian_maps = 1 if single_jacobian_map else num_kp + self.jacobian = nn.Conv2d(in_channels=self.predictor.out_filters, + out_channels=4 * self.num_jacobian_maps, kernel_size=(7, 7), padding=pad) + self.jacobian.weight.data.zero_() + self.jacobian.bias.data.copy_(torch.tensor([1, 0, 0, 1] * self.num_jacobian_maps, dtype=torch.float)) + else: + self.jacobian = None + + self.temperature = temperature + self.scale_factor = scale_factor + if self.scale_factor != 1: + self.down = AntiAliasInterpolation2d(num_channels, self.scale_factor) + + def gaussian2kp(self, heatmap): + """ + Extract the mean and from a heatmap + """ + shape = heatmap.shape + heatmap = heatmap.unsqueeze(-1) + grid = make_coordinate_grid(shape[2:], heatmap.type()).unsqueeze_(0).unsqueeze_(0) + value = (heatmap * grid).sum(dim=(2, 3)) + kp = {'value': value} + + return kp + + def forward(self, x): + if self.scale_factor != 1: + x = self.down(x) + + feature_map = self.predictor(x) + prediction = self.kp(feature_map) + + final_shape = prediction.shape + heatmap = prediction.view(final_shape[0], final_shape[1], -1) + heatmap = F.softmax(heatmap / self.temperature, dim=2) + heatmap = heatmap.view(*final_shape) + + out = self.gaussian2kp(heatmap) + + if self.jacobian is not None: + jacobian_map = self.jacobian(feature_map) + + jacobian_map = jacobian_map.reshape(final_shape[0], self.num_jacobian_maps, 4, final_shape[2], + final_shape[3]) + out["jacobian_map"] = jacobian_map + heatmap = heatmap.unsqueeze(2) + 
+ jacobian = heatmap * jacobian_map + jacobian = jacobian.view(final_shape[0], final_shape[1], 4, -1) + jacobian = jacobian.sum(dim=-1) + jacobian = jacobian.view(jacobian.shape[0], jacobian.shape[1], 2, 2) + out['jacobian'] = jacobian + out["pred_fature"] = prediction + return out diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/resnet.py b/syncnet_python-master/Audio2Head/Audio2Head/modules/resnet.py new file mode 100755 index 00000000..06e23cc6 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/modules/resnet.py @@ -0,0 +1,203 @@ +import torch +import torch.nn as nn + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError('BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: 
+ identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, + base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + # Both self.conv2 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + +class ResNet(nn.Module): + + def __init__(self, block, layers, num_classes=1000, zero_init_residual=False, + groups=1, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None,input_channel = 3): + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + # each element in the tuple indicates if we should replace + # the 2x2 stride with a dilated convolution instead + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError("replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) + self.groups = 
groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d(input_channel, self.inplanes, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = norm_layer(self.inplanes) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2, + dilate=replace_stride_with_dilation[0]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2, + dilate=replace_stride_with_dilation[1]) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2, + dilate=replace_stride_with_dilation[2]) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + if zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + nn.init.constant_(m.bn3.weight, 0) + elif isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + norm_layer(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self.groups, + self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + + return x + +def _resnet(arch, block, layers, pretrained, progress, **kwargs): + model = ResNet(block, layers, **kwargs) + return model + +def resnet34(pretrained=False, progress=True, **kwargs): + r"""ResNet-34 model from + `"Deep Residual Learning for Image Recognition" `_ + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, + **kwargs) \ No newline at end of file diff --git a/syncnet_python-master/Audio2Head/Audio2Head/modules/util.py 
# modules/util.py — shared building blocks (hourglass encoder/decoder, 2D/3D
# res/up/down blocks, anti-aliased downsampling, keypoint/gaussian helpers).

from torch import nn

import torch.nn.functional as F
import torch

from sync_batchnorm import SynchronizedBatchNorm2d as BatchNorm2d
from sync_batchnorm import SynchronizedBatchNorm3d as BatchNorm3d
from modules.resnet import resnet34


def gaussian2kp(heatmap):
    """Extract the soft-argmax mean of each heatmap as a keypoint location.

    heatmap: assumed (batch, num_kp, h, w), normalized over the spatial dims.
    Returns {'value': (batch, num_kp, 2)} with coordinates in [-1, 1].
    """
    shape = heatmap.shape
    heatmap = heatmap.unsqueeze(-1)
    grid = make_coordinate_grid(shape[2:], heatmap.type()).unsqueeze_(0).unsqueeze_(0)
    value = (heatmap * grid).sum(dim=(2, 3))
    kp = {'value': value}

    return kp


def kp2gaussian(kp, spatial_size, kp_variance):
    """Transform keypoints into gaussian-like heatmap representations."""
    mean = kp['value']  # (bs, num_kp, 2)

    coordinate_grid = make_coordinate_grid(spatial_size, mean.type())  # (h, w, 2)
    number_of_leading_dimensions = len(mean.shape) - 1
    shape = (1,) * number_of_leading_dimensions + coordinate_grid.shape  # (1, 1, h, w, 2)
    coordinate_grid = coordinate_grid.view(*shape)
    repeats = mean.shape[:number_of_leading_dimensions] + (1, 1, 1)
    coordinate_grid = coordinate_grid.repeat(*repeats)  # (bs, num_kp, h, w, 2)

    # Broadcast the keypoint means against the grid.
    shape = mean.shape[:number_of_leading_dimensions] + (1, 1, 2)
    mean = mean.view(*shape)

    mean_sub = (coordinate_grid - mean)

    out = torch.exp(-0.5 * (mean_sub ** 2).sum(-1) / kp_variance)

    return out


def make_coordinate_grid(spatial_size, type):
    """Create a meshgrid [-1,1] x [-1,1] of the given spatial_size.

    `type` is a tensor type string (as returned by Tensor.type()); the name is
    kept for backward compatibility even though it shadows the builtin.
    """
    h, w = spatial_size
    x = torch.arange(w).type(type)
    y = torch.arange(h).type(type)

    x = (2 * (x / (w - 1)) - 1)
    y = (2 * (y / (h - 1)) - 1)

    yy = y.view(-1, 1).repeat(1, w)
    xx = x.view(1, -1).repeat(h, 1)

    meshed = torch.cat([xx.unsqueeze_(2), yy.unsqueeze_(2)], 2)

    return meshed


class ResBlock2d(nn.Module):
    """Pre-activation residual block; preserves spatial resolution."""

    def __init__(self, in_features, kernel_size, padding):
        super(ResBlock2d, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_features, out_channels=in_features,
                               kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv2d(in_channels=in_features, out_channels=in_features,
                               kernel_size=kernel_size, padding=padding)
        self.norm1 = BatchNorm2d(in_features, affine=True)
        self.norm2 = BatchNorm2d(in_features, affine=True)

    def forward(self, x):
        out = self.norm1(x)
        out = F.relu(out, inplace=True)
        out = self.conv1(out)
        out = self.norm2(out)
        out = F.relu(out, inplace=True)
        out = self.conv2(out)
        out += x
        return out


class ResBlock3d(nn.Module):
    """3D pre-activation residual block; preserves spatial resolution."""

    def __init__(self, in_features, kernel_size, padding):
        super(ResBlock3d, self).__init__()
        self.conv1 = nn.Conv3d(in_channels=in_features, out_channels=in_features,
                               kernel_size=kernel_size, padding=padding)
        self.conv2 = nn.Conv3d(in_channels=in_features, out_channels=in_features,
                               kernel_size=kernel_size, padding=padding)
        self.norm1 = BatchNorm3d(in_features, affine=True)
        self.norm2 = BatchNorm3d(in_features, affine=True)

    def forward(self, x):
        out = self.norm1(x)
        out = F.relu(out, inplace=True)
        out = self.conv1(out)
        out = self.norm2(out)
        out = F.relu(out, inplace=True)
        out = self.conv2(out)
        out += x
        return out


class UpBlock2d(nn.Module):
    """2x upsampling block for use in decoders."""

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super(UpBlock2d, self).__init__()

        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)
        self.norm = BatchNorm2d(out_features, affine=True)

    def forward(self, x):
        out = F.interpolate(x, scale_factor=2)
        del x  # free the pre-upsample tensor early
        out = self.conv(out)
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        return out


class UpBlock3d(nn.Module):
    """2x 3D upsampling block (conv + residual refinement) for decoders."""

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super(UpBlock3d, self).__init__()

        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)
        self.norm = BatchNorm3d(out_features, affine=True)
        self.res = ResBlock3d(out_features, kernel_size, padding)
        self.norm2 = BatchNorm3d(out_features, affine=True)

    def forward(self, x):
        out = F.interpolate(x, scale_factor=2)
        out = self.conv(out)
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        out = self.res(out)
        out = self.norm2(out)
        out = F.relu(out, inplace=True)
        return out


class DownBlock2d(nn.Module):
    """2x downsampling block for use in encoders."""

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super(DownBlock2d, self).__init__()
        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)
        self.norm = BatchNorm2d(out_features, affine=True)
        self.pool = nn.AvgPool2d(kernel_size=(2, 2))

    def forward(self, x):
        out = self.conv(x)
        del x  # free the input tensor early
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        out = self.pool(out)
        return out


class DownBlock3d(nn.Module):
    """2x 3D downsampling block (residual refinement + conv) for encoders."""

    def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
        super(DownBlock3d, self).__init__()

        self.res = ResBlock3d(in_features=in_features, kernel_size=kernel_size, padding=padding)
        self.norm_res = BatchNorm3d(in_features, affine=True)
        self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)

        self.norm = BatchNorm3d(out_features, affine=True)
        self.pool = nn.AvgPool3d(kernel_size=(2, 2, 2))

    def forward(self, x):
        out = self.res(x)
        out = self.norm_res(out)
        out = F.relu(out, inplace=True)
        out = self.conv(out)
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        out = self.pool(out)
        return out


class SameBlock2d(nn.Module):
    """Simple conv block; preserves spatial resolution."""

    def __init__(self, in_features, out_features, groups=1, kernel_size=3, padding=1):
        super(SameBlock2d, self).__init__()
        self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features,
                              kernel_size=kernel_size, padding=padding, groups=groups)
        self.norm = BatchNorm2d(out_features, affine=True)

    def forward(self, x):
        out = self.conv(x)
        out = self.norm(out)
        out = F.relu(out, inplace=True)
        return out


class Encoder(nn.Module):
    """Hourglass encoder: returns the input plus every downsampled feature map."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Encoder, self).__init__()

        down_blocks = []
        for i in range(num_blocks):
            down_blocks.append(DownBlock2d(in_features if i == 0 else min(max_features, block_expansion * (2 ** i)),
                                           min(max_features, block_expansion * (2 ** (i + 1))),
                                           kernel_size=3, padding=1))
        self.down_blocks = nn.ModuleList(down_blocks)

    def forward(self, x):
        outs = [x]
        for down_block in self.down_blocks:
            outs.append(down_block(outs[-1]))
        return outs


class Encoder3D(nn.Module):
    """3D hourglass encoder: returns the input plus every downsampled feature map."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Encoder3D, self).__init__()

        down_blocks = []
        for i in range(num_blocks):
            down_blocks.append(DownBlock3d(in_features if i == 0 else min(max_features, block_expansion * (2 ** i)),
                                           min(max_features, block_expansion * (2 ** (i + 1))),
                                           kernel_size=3, padding=1))
        self.down_blocks = nn.ModuleList(down_blocks)

    def forward(self, x):
        outs = [x]
        for down_block in self.down_blocks:
            outs.append(down_block(outs[-1]))
        return outs


class Decoder(nn.Module):
    """Hourglass decoder: upsamples and concatenates the matching skip connection."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Decoder, self).__init__()

        up_blocks = []

        for i in range(num_blocks)[::-1]:
            # Deepest block takes a single feature map; the rest take the
            # concatenation of the previous output and a skip connection.
            in_filters = (1 if i == num_blocks - 1 else 2) * min(max_features, block_expansion * (2 ** (i + 1)))
            out_filters = min(max_features, block_expansion * (2 ** i))
            up_blocks.append(UpBlock2d(in_filters, out_filters, kernel_size=3, padding=1))

        self.up_blocks = nn.ModuleList(up_blocks)
        self.out_filters = block_expansion + in_features

    def forward(self, x):
        # `x` is the list of encoder outputs; consumed deepest-first.
        out = x.pop()
        for up_block in self.up_blocks:
            out = up_block(out)
            skip = x.pop()
            out = torch.cat([out, skip], dim=1)
        return out


class Decoder3D(nn.Module):
    """3D hourglass decoder with residual refinement on each skip connection."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Decoder3D, self).__init__()

        up_blocks = []
        res_blocks = []

        for i in range(num_blocks)[::-1]:
            in_filters = (1 if i == num_blocks - 1 else 2) * min(max_features, block_expansion * (2 ** (i + 1)))
            out_filters = min(max_features, block_expansion * (2 ** i))
            up_blocks.append(UpBlock3d(in_filters, out_filters, kernel_size=3, padding=1))
            # The shallowest skip is the raw input, hence in_features channels.
            if i > 0:
                res_blocks.append(nn.Sequential(ResBlock3d(out_filters, kernel_size=3, padding=1),
                                                BatchNorm3d(out_filters), nn.ReLU(inplace=True)))
            else:
                res_blocks.append(nn.Sequential(ResBlock3d(in_features, kernel_size=3, padding=1),
                                                BatchNorm3d(in_features), nn.ReLU(inplace=True)))
        self.res_blocks = nn.ModuleList(res_blocks)
        self.up_blocks = nn.ModuleList(up_blocks)
        self.out_filters = block_expansion + in_features

    def forward(self, x):
        out = x.pop()
        for up_block, res_bl in zip(self.up_blocks, self.res_blocks):
            out = up_block(out)
            skip = x.pop()
            out = torch.cat([out, res_bl(skip)], dim=1)
        return out


class Hourglass(nn.Module):
    """Hourglass architecture (encoder + decoder with skip connections)."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Hourglass, self).__init__()
        self.encoder = Encoder(block_expansion, in_features, num_blocks, max_features)
        self.decoder = Decoder(block_expansion, in_features, num_blocks, max_features)
        self.out_filters = self.decoder.out_filters

    def forward(self, x):
        return self.decoder(self.encoder(x))


class Hourglass3D(nn.Module):
    """3D hourglass architecture (encoder + decoder with skip connections)."""

    def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
        super(Hourglass3D, self).__init__()
        self.encoder = Encoder3D(block_expansion, in_features, num_blocks, max_features)
        self.decoder = Decoder3D(block_expansion, in_features, num_blocks, max_features)
        self.out_filters = self.decoder.out_filters

    def forward(self, x):
        return self.decoder(self.encoder(x))


class AntiAliasInterpolation2d(nn.Module):
    """Band-limited downsampling: gaussian blur followed by interpolation,
    for better preservation of the input signal."""

    def __init__(self, channels, scale):
        super(AntiAliasInterpolation2d, self).__init__()
        # Blur strength derived from the downscale factor.
        sigma = (1 / scale - 1) / 2
        kernel_size = 2 * round(sigma * 4) + 1
        self.ka = kernel_size // 2
        self.kb = self.ka - 1 if kernel_size % 2 == 0 else self.ka

        kernel_size = [kernel_size, kernel_size]
        sigma = [sigma, sigma]
        # The gaussian kernel is the product of the gaussian function of each dimension.
        kernel = 1
        meshgrids = torch.meshgrid(
            [
                torch.arange(size, dtype=torch.float32)
                for size in kernel_size
            ]
        )
        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
            mean = (size - 1) / 2
            kernel *= torch.exp(-(mgrid - mean) ** 2 / (2 * std ** 2))

        # Make sure the gaussian kernel sums to 1.
        kernel = kernel / torch.sum(kernel)
        # Reshape to a depthwise convolutional weight.
        kernel = kernel.view(1, 1, *kernel.size())
        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))

        self.register_buffer('weight', kernel)
        self.groups = channels
        self.scale = scale

    def forward(self, input):
        if self.scale == 1.0:
            return input

        out = F.pad(input, (self.ka, self.kb, self.ka, self.kb))
        out = F.conv2d(out, weight=self.weight, groups=self.groups)
        out = F.interpolate(out, scale_factor=(self.scale, self.scale))

        return out


class MyResNet34(nn.Module):
    """Thin wrapper binding resnet34 to the synchronized BatchNorm layer."""

    def __init__(self, embedding_dim, input_channel=3):
        super(MyResNet34, self).__init__()
        self.resnet = resnet34(norm_layer=BatchNorm2d, num_classes=embedding_dim,
                               input_channel=input_channel)

    def forward(self, x):
        return self.resnet(x)
#!/bin/bash
# Batch driver: for every .wav in WAV_DIR, find a same-named image in IMG_DIR
# and run Audio2Head inference on the pair. Failures are reported and skipped.

IMG_DIR="/app/Audio2Head/Audio2Head/input_img"
WAV_DIR="/app/Audio2Head/Audio2Head/input_wav"
OUT_DIR="/app/out_video"

# Ensure we run from the project directory.
cd /app/Audio2Head/Audio2Head || exit
# Create the output directory if it does not exist.
mkdir -p "$OUT_DIR"

# Supported image extensions, tried in order.
IMG_EXTENSIONS=("png" "jpg" "jpeg")

for audio_path in "$WAV_DIR"/*.wav; do
    # Unexpanded glob means there were no .wav files at all.
    if [ ! -e "$audio_path" ]; then
        echo "No .wav files found in $WAV_DIR"
        exit 1
    fi

    # Base name of the audio file without its extension.
    audio_filename=$(basename "$audio_path")
    base_name="${audio_filename%.*}"

    # Look for a matching image file.
    img_file=""
    for ext in "${IMG_EXTENSIONS[@]}"; do
        potential_img="$IMG_DIR/$base_name.$ext"
        if [ -f "$potential_img" ]; then
            img_file="$potential_img"
            break
        fi
    done

    # No matching image: warn and move on to the next audio file.
    if [ -z "$img_file" ]; then
        echo "Warning: No matching image file found for $audio_filename in $IMG_DIR"
        continue
    fi

    img_filename=$(basename "$img_file")

    echo "Processing: $audio_filename and $img_filename"

    # Run inference; on failure report and continue with the next pair.
    if ! python /app/Audio2Head/Audio2Head/inference.py --audio_filename "$audio_filename" --img_filename "$img_filename"; then
        echo "Error: Processing failed for $audio_filename and $img_filename"
        continue
    fi

    echo "Successfully processed $audio_filename and $img_filename"
done

echo "All matching files have been processed."
# -*- coding: utf-8 -*-
# File   : __init__.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.

# Public re-exports of the synchronized BatchNorm layers and the
# DataParallel replication helpers.
from .batchnorm import SynchronizedBatchNorm1d, SynchronizedBatchNorm2d, SynchronizedBatchNorm3d
from .replicate import DataParallelWithCallback, patch_replication_callback
# -*- coding: utf-8 -*-
# File   : batchnorm.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 27/01/2018
#
# This file is part of Synchronized-BatchNorm-PyTorch.
# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
# Distributed under MIT License.

import collections

import torch
import torch.nn.functional as F

from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.parallel._functions import ReduceAddCoalesced, Broadcast

from .comm import SyncMaster

__all__ = ['SynchronizedBatchNorm1d', 'SynchronizedBatchNorm2d', 'SynchronizedBatchNorm3d']


def _sum_ft(tensor):
    """Sum over the first and last dimension."""
    return tensor.sum(dim=0).sum(dim=-1)


def _unsqueeze_ft(tensor):
    """Add new dimensions at the front and the tail."""
    return tensor.unsqueeze(0).unsqueeze(-1)


_ChildMessage = collections.namedtuple('_ChildMessage', ['sum', 'ssum', 'sum_size'])
_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])


class _SynchronizedBatchNorm(_BatchNorm):
    """BatchNorm base that synchronizes statistics across DataParallel replicas.

    The master replica reduces sums from all slaves, computes the batch mean and
    inverse std, and broadcasts them back, so every device normalizes with the
    same statistics.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super(_SynchronizedBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)

        self._sync_master = SyncMaster(self._data_parallel_master)

        self._is_parallel = False
        self._parallel_id = None
        self._slave_pipe = None

    def forward(self, input):
        # If it is not parallel computation or is in evaluation mode, use PyTorch's implementation.
        if not (self._is_parallel and self.training):
            return F.batch_norm(
                input, self.running_mean, self.running_var, self.weight, self.bias,
                self.training, self.momentum, self.eps)

        # Resize the input to (B, C, -1).
        input_shape = input.size()
        input = input.view(input.size(0), self.num_features, -1)

        # Compute the sum and square-sum.
        sum_size = input.size(0) * input.size(2)
        input_sum = _sum_ft(input)
        input_ssum = _sum_ft(input ** 2)

        # Reduce-and-broadcast the statistics.
        if self._parallel_id == 0:
            mean, inv_std = self._sync_master.run_master(_ChildMessage(input_sum, input_ssum, sum_size))
        else:
            mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(input_sum, input_ssum, sum_size))

        # Compute the output.
        if self.affine:
            # MJY:: Fuse the multiplication for speed.
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std * self.weight) + _unsqueeze_ft(self.bias)
        else:
            output = (input - _unsqueeze_ft(mean)) * _unsqueeze_ft(inv_std)

        # Reshape it.
        return output.view(input_shape)

    def __data_parallel_replicate__(self, ctx, copy_id):
        self._is_parallel = True
        self._parallel_id = copy_id

        # parallel_id == 0 means master device.
        if self._parallel_id == 0:
            ctx.sync_master = self._sync_master
        else:
            self._slave_pipe = ctx.sync_master.register_slave(copy_id)

    def _data_parallel_master(self, intermediates):
        """Reduce the sum and square-sum, compute the statistics, and broadcast it."""

        # Always using the same "device order" makes the ReduceAdd operation faster.
        # Thanks to:: Tete Xiao (http://tetexiao.com/)
        intermediates = sorted(intermediates, key=lambda i: i[1].sum.get_device())

        to_reduce = [i[1][:2] for i in intermediates]
        to_reduce = [j for i in to_reduce for j in i]  # flatten
        target_gpus = [i[1].sum.get_device() for i in intermediates]

        sum_size = sum([i[1].sum_size for i in intermediates])
        sum_, ssum = ReduceAddCoalesced.apply(target_gpus[0], 2, *to_reduce)
        mean, inv_std = self._compute_mean_std(sum_, ssum, sum_size)

        broadcasted = Broadcast.apply(target_gpus, mean, inv_std)

        outputs = []
        for i, rec in enumerate(intermediates):
            outputs.append((rec[0], _MasterMessage(*broadcasted[i * 2:i * 2 + 2])))

        return outputs

    def _compute_mean_std(self, sum_, ssum, size):
        """Compute the mean and standard-deviation with sum and square-sum. This method
        also maintains the moving average on the master device."""
        assert size > 1, 'BatchNorm computes unbiased standard-deviation, which requires size > 1.'
        mean = sum_ / size
        sumvar = ssum - sum_ * mean
        unbias_var = sumvar / (size - 1)
        bias_var = sumvar / size

        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data

        # Return biased variance clamped at eps, as inverse std.
        return mean, bias_var.clamp(self.eps) ** -0.5


class SynchronizedBatchNorm1d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 2d or 3d input (N, C) or
    (N, C, L).

    Unlike the built-in :class:`torch.nn.BatchNorm1d`, the mean and
    standard-deviation are reduced across all devices during training, so the
    statistics are computed over the whole mini-batch rather than per device.
    For one-GPU or CPU-only use this behaves exactly like the built-in layer.

    Args:
        num_features: C from an expected input of size (N, C) or (N, C, L)
        eps: value added to the denominator for numerical stability. Default: 1e-5
        momentum: value used for the running_mean/running_var update. Default: 0.1
        affine: when ``True``, this module has learnable affine parameters. Default: ``True``
    """

    def _check_input_dim(self, input):
        if input.dim() != 2 and input.dim() != 3:
            raise ValueError('expected 2D or 3D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm1d, self)._check_input_dim(input)


class SynchronizedBatchNorm2d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 4d input (N, C, H, W).

    Unlike the built-in :class:`torch.nn.BatchNorm2d`, the mean and
    standard-deviation are reduced across all devices during training.
    For one-GPU or CPU-only use this behaves exactly like the built-in layer.

    Args:
        num_features: C from an expected input of size (N, C, H, W)
        eps: value added to the denominator for numerical stability. Default: 1e-5
        momentum: value used for the running_mean/running_var update. Default: 0.1
        affine: when ``True``, this module has learnable affine parameters. Default: ``True``
    """

    def _check_input_dim(self, input):
        if input.dim() != 4:
            raise ValueError('expected 4D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm2d, self)._check_input_dim(input)


class SynchronizedBatchNorm3d(_SynchronizedBatchNorm):
    r"""Applies Synchronized Batch Normalization over a 5d input (N, C, D, H, W).

    Unlike the built-in :class:`torch.nn.BatchNorm3d`, the mean and
    standard-deviation are reduced across all devices during training.
    For one-GPU or CPU-only use this behaves exactly like the built-in layer.

    Args:
        num_features: C from an expected input of size (N, C, D, H, W)
        eps: value added to the denominator for numerical stability. Default: 1e-5
        momentum: value used for the running_mean/running_var update. Default: 0.1
        affine: when ``True``, this module has learnable affine parameters. Default: ``True``
    """

    # NOTE(review): this class is truncated at the chunk boundary in the source
    # diff; the check below is reconstructed from the canonical upstream
    # Synchronized-BatchNorm-PyTorch implementation — confirm against the
    # remainder of the patch.
    def _check_input_dim(self, input):
        if input.dim() != 5:
            raise ValueError('expected 5D input (got {}D input)'
                             .format(input.dim()))
        super(SynchronizedBatchNorm3d, self)._check_input_dim(input)
Default: ``True`` + + Shape: + - Input: :math:`(N, C, D, H, W)` + - Output: :math:`(N, C, D, H, W)` (same shape as input) + + Examples: + >>> # With Learnable Parameters + >>> m = SynchronizedBatchNorm3d(100) + >>> # Without Learnable Parameters + >>> m = SynchronizedBatchNorm3d(100, affine=False) + >>> input = torch.autograd.Variable(torch.randn(20, 100, 35, 45, 10)) + >>> output = m(input) + """ + + def _check_input_dim(self, input): + if input.dim() != 5: + raise ValueError('expected 5D input (got {}D input)' + .format(input.dim())) + super(SynchronizedBatchNorm3d, self)._check_input_dim(input) diff --git a/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/comm.py b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/comm.py new file mode 100755 index 00000000..b66ec4ae --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/comm.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +# File : comm.py +# Author : Jiayuan Mao +# Email : maojiayuan@gmail.com +# Date : 27/01/2018 +# +# This file is part of Synchronized-BatchNorm-PyTorch. +# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch +# Distributed under MIT License. + +import queue +import collections +import threading + +__all__ = ['FutureResult', 'SlavePipe', 'SyncMaster'] + + +class FutureResult(object): + """A thread-safe future implementation. Used only as one-to-one pipe.""" + + def __init__(self): + self._result = None + self._lock = threading.Lock() + self._cond = threading.Condition(self._lock) + + def put(self, result): + with self._lock: + assert self._result is None, 'Previous result has\'t been fetched.' 
+ self._result = result + self._cond.notify() + + def get(self): + with self._lock: + if self._result is None: + self._cond.wait() + + res = self._result + self._result = None + return res + + +_MasterRegistry = collections.namedtuple('MasterRegistry', ['result']) +_SlavePipeBase = collections.namedtuple('_SlavePipeBase', ['identifier', 'queue', 'result']) + + +class SlavePipe(_SlavePipeBase): + """Pipe for master-slave communication.""" + + def run_slave(self, msg): + self.queue.put((self.identifier, msg)) + ret = self.result.get() + self.queue.put(True) + return ret + + +class SyncMaster(object): + """An abstract `SyncMaster` object. + + - During the replication, as the data parallel will trigger an callback of each module, all slave devices should + call `register(id)` and obtain an `SlavePipe` to communicate with the master. + - During the forward pass, master device invokes `run_master`, all messages from slave devices will be collected, + and passed to a registered callback. + - After receiving the messages, the master device should gather the information and determine to message passed + back to each slave devices. + """ + + def __init__(self, master_callback): + """ + + Args: + master_callback: a callback to be invoked after having collected messages from slave devices. + """ + self._master_callback = master_callback + self._queue = queue.Queue() + self._registry = collections.OrderedDict() + self._activated = False + + def __getstate__(self): + return {'master_callback': self._master_callback} + + def __setstate__(self, state): + self.__init__(state['master_callback']) + + def register_slave(self, identifier): + """ + Register an slave device. + + Args: + identifier: an identifier, usually is the device id. + + Returns: a `SlavePipe` object which can be used to communicate with the master device. + + """ + if self._activated: + assert self._queue.empty(), 'Queue is not clean before next initialization.' 
+ self._activated = False + self._registry.clear() + future = FutureResult() + self._registry[identifier] = _MasterRegistry(future) + return SlavePipe(identifier, self._queue, future) + + def run_master(self, master_msg): + """ + Main entry for the master device in each forward pass. + The messages were first collected from each devices (including the master device), and then + an callback will be invoked to compute the message to be sent back to each devices + (including the master device). + + Args: + master_msg: the message that the master want to send to itself. This will be placed as the first + message when calling `master_callback`. For detailed usage, see `_SynchronizedBatchNorm` for an example. + + Returns: the message to be sent back to the master device. + + """ + self._activated = True + + intermediates = [(0, master_msg)] + for i in range(self.nr_slaves): + intermediates.append(self._queue.get()) + + results = self._master_callback(intermediates) + assert results[0][0] == 0, 'The first result should belongs to the master.' + + for i, res in results: + if i == 0: + continue + self._registry[i].result.put(res) + + for i in range(self.nr_slaves): + assert self._queue.get() is True + + return results[0][1] + + @property + def nr_slaves(self): + return len(self._registry) diff --git a/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/replicate.py b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/replicate.py new file mode 100755 index 00000000..9b97380d --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/replicate.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- +# File : replicate.py +# Author : Jiayuan Mao +# Email : maojiayuan@gmail.com +# Date : 27/01/2018 +# +# This file is part of Synchronized-BatchNorm-PyTorch. +# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch +# Distributed under MIT License. 
+ +import functools + +from torch.nn.parallel.data_parallel import DataParallel + +__all__ = [ + 'CallbackContext', + 'execute_replication_callbacks', + 'DataParallelWithCallback', + 'patch_replication_callback' +] + + +class CallbackContext(object): + pass + + +def execute_replication_callbacks(modules): + """ + Execute an replication callback `__data_parallel_replicate__` on each module created by original replication. + + The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` + + Note that, as all modules are isomorphism, we assign each sub-module with a context + (shared among multiple copies of this module on different devices). + Through this context, different copies can share some information. + + We guarantee that the callback on the master copy (the first copy) will be called ahead of calling the callback + of any slave copies. + """ + master_copy = modules[0] + nr_modules = len(list(master_copy.modules())) + ctxs = [CallbackContext() for _ in range(nr_modules)] + + for i, module in enumerate(modules): + for j, m in enumerate(module.modules()): + if hasattr(m, '__data_parallel_replicate__'): + m.__data_parallel_replicate__(ctxs[j], i) + + +class DataParallelWithCallback(DataParallel): + """ + Data Parallel with a replication callback. + + An replication callback `__data_parallel_replicate__` of each module will be invoked after being created by + original `replicate` function. + The callback will be invoked with arguments `__data_parallel_replicate__(ctx, copy_id)` + + Examples: + > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) + > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) + # sync_bn.__data_parallel_replicate__ will be invoked. 
+ """ + + def replicate(self, module, device_ids): + modules = super(DataParallelWithCallback, self).replicate(module, device_ids) + execute_replication_callbacks(modules) + return modules + + +def patch_replication_callback(data_parallel): + """ + Monkey-patch an existing `DataParallel` object. Add the replication callback. + Useful when you have customized `DataParallel` implementation. + + Examples: + > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) + > sync_bn = DataParallel(sync_bn, device_ids=[0, 1]) + > patch_replication_callback(sync_bn) + # this is equivalent to + > sync_bn = SynchronizedBatchNorm1d(10, eps=1e-5, affine=False) + > sync_bn = DataParallelWithCallback(sync_bn, device_ids=[0, 1]) + """ + + assert isinstance(data_parallel, DataParallel) + + old_replicate = data_parallel.replicate + + @functools.wraps(old_replicate) + def new_replicate(module, device_ids): + modules = old_replicate(module, device_ids) + execute_replication_callbacks(modules) + return modules + + data_parallel.replicate = new_replicate diff --git a/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/unittest.py b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/unittest.py new file mode 100755 index 00000000..9716d035 --- /dev/null +++ b/syncnet_python-master/Audio2Head/Audio2Head/sync_batchnorm/unittest.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# File : unittest.py +# Author : Jiayuan Mao +# Email : maojiayuan@gmail.com +# Date : 27/01/2018 +# +# This file is part of Synchronized-BatchNorm-PyTorch. +# https://github.com/vacancy/Synchronized-BatchNorm-PyTorch +# Distributed under MIT License. 
+ +import unittest + +import numpy as np +from torch.autograd import Variable + + +def as_numpy(v): + if isinstance(v, Variable): + v = v.data + return v.cpu().numpy() + + +class TorchTestCase(unittest.TestCase): + def assertTensorClose(self, a, b, atol=1e-3, rtol=1e-3): + npa, npb = as_numpy(a), as_numpy(b) + self.assertTrue( + np.allclose(npa, npb, atol=atol), + 'Tensor close check failed\n{}\n{}\nadiff={}, rdiff={}'.format(a, b, np.abs(npa - npb).max(), np.abs((npa - npb) / np.fmax(npa, 1e-5)).max()) + ) diff --git a/syncnet_python-master/Dockerfile b/syncnet_python-master/Dockerfile new file mode 100755 index 00000000..6141f535 --- /dev/null +++ b/syncnet_python-master/Dockerfile @@ -0,0 +1,67 @@ +# 使用 CUDA 11.3.1 基础镜像 +FROM nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu20.04 + +# 设置非交互模式,避免交互式安装 +ENV DEBIAN_FRONTEND=noninteractive + +# 更新系统并添加支持 Python 3.9 的 PPA 源 +RUN apt-get update && apt-get install -y software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update + +# 安装 Python 3.9、开发头文件、构建工具和基础依赖 +RUN apt-get install -y \ + python3.9 \ + python3.9-dev \ + python3.9-distutils \ + build-essential \ + ffmpeg \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# 手动安装最新版本的 pip,确保正确安装到 Python 3.9 环境 +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9 + +# 使用清华源配置 pip +RUN python3.9 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/ && \ + python3.9 -m pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn + +# 创建 python 的符号链接指向 python3.9 +RUN ln -s /usr/bin/python3.9 /usr/bin/python + +# 确认 pip 已正确安装 +RUN python3.9 -m pip --version + +# 设置工作目录 +WORKDIR /app + +# 复制只包含依赖的文件,利用缓存 +COPY requirements.txt /app/ + +# 升级 pip 并安装项目依赖(不包含 torch, torchvision, torchaudio) +RUN python3.9 -m pip install --upgrade pip && \ + python3.9 -m pip install --progress-bar=on -r requirements.txt --no-cache-dir && \ + python3.9 -m pip install cupy-cuda113 imageio[ffmpeg] + +# 升级 pip 并安装 PyTorch 及其依赖(针对 CUDA 11.3),显示下载进度 
+RUN python3.9 -m pip install --upgrade pip && \ + python3.9 -m pip install --progress-bar=on torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0+cu113 -f https://mirrors.aliyun.com/pytorch-wheels/cu113/ + +# 复制剩余的项目文件 +COPY . /app + +# 打印 Python 和 pip 版本以验证安装 +RUN python3.9 --version +RUN python3.9 -m pip --version + +# 创建必要的文件夹 +RUN mkdir -p /app/ref_video \ + && mkdir -p /app/out_video \ + && mkdir -p /app/Audio2Head/Audio2Head/input_wav \ + && mkdir -p /app/Audio2Head/Audio2Head/input_img + +# 确保脚本有执行权限 +RUN chmod +x /app/Audio2Head/Audio2Head/run_inference.sh + +# 定义启动命令 +CMD ["bash", "-c", "/app/Audio2Head/Audio2Head/run_inference.sh && python /app/batch_psnr.py"] diff --git a/syncnet_python-master/FID.py b/syncnet_python-master/FID.py new file mode 100755 index 00000000..7a5a7f9a --- /dev/null +++ b/syncnet_python-master/FID.py @@ -0,0 +1,147 @@ +import cv2 +import numpy as np +import torch +from sklearn.metrics.pairwise import cosine_similarity +from scipy.linalg import sqrtm +from torchvision import models, transforms +from PIL import Image +import sys +import os +from datetime import datetime + +# 设置设备为GPU(如果可用) +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +# 加载预训练的InceptionV3模型,并去掉最后的分类层 +inception_model = models.inception_v3(pretrained=True).to(device) +inception_model.fc = torch.nn.Identity() # 去掉最后的分类层 +inception_model.eval() + +# 定义图像预处理函数 +preprocess = transforms.Compose([ + transforms.Resize(299), # InceptionV3的输入大小 + transforms.CenterCrop(299), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +]) + +# 计算图像特征 +def get_features_batch(images, batch_size=32): + features = [] + num_images = len(images) + for i in range(0, num_images, batch_size): + batch_images = images[i:i+batch_size] + batch_pil = [Image.fromarray(img) for img in batch_images] + batch_tensor = torch.stack([preprocess(img) for img in batch_pil]).to(device) + with torch.no_grad(): + 
batch_features = inception_model(batch_tensor) + features.append(batch_features.cpu().numpy()) + return np.vstack(features) + +# 计算FID分数 +def calculate_fid(features1, features2): + mu1, sigma1 = features1.mean(axis=0), np.cov(features1, rowvar=False) + mu2, sigma2 = features2.mean(axis=0), np.cov(features2, rowvar=False) + + ssdiff = np.sum((mu1 - mu2)**2.0) + covmean, _ = sqrtm(sigma1.dot(sigma2), disp=False) + + if np.iscomplexobj(covmean): + covmean = covmean.real + + fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean) + return fid + +# 日志记录函数 +def log_fid(log_file_path, ref_video_path, out_video_path, frame_count, fid_score): + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + log_entry = f"{timestamp},\"{ref_video_path}\",\"{out_video_path}\",{frame_count},{fid_score}\n" + with open(log_file_path, "a") as log_file: + # 如果文件是空的,写入表头 + if os.path.getsize(log_file_path) == 0: + log_file.write("Timestamp,Reference_Video_Path,Output_Video_Path,Frame_Count,FID_Score\n") + log_file.write(log_entry) + +if __name__ == "__main__": + if len(sys.argv) < 3 or len(sys.argv) > 4: + print("Usage: python script_name.py []") + sys.exit(1) + + ref_video_path = sys.argv[1] + out_video_path = sys.argv[2] + + # 可选:获取日志文件路径 + if len(sys.argv) == 4: + log_file_path = sys.argv[3] + else: + log_file_path = "fid_log.txt" + + # 检查视频文件是否存在 + if not os.path.isfile(ref_video_path): + print(f"Error: Reference video file '{ref_video_path}' does not exist.") + sys.exit(1) + if not os.path.isfile(out_video_path): + print(f"Error: Output video file '{out_video_path}' does not exist.") + sys.exit(1) + + # 打开参考视频和输出视频文件 + ref_cap = cv2.VideoCapture(ref_video_path) + out_cap = cv2.VideoCapture(out_video_path) + + if not ref_cap.isOpened(): + print(f"Error: Could not open reference video '{ref_video_path}'.") + sys.exit(1) + if not out_cap.isOpened(): + print(f"Error: Could not open output video '{out_video_path}'.") + ref_cap.release() + sys.exit(1) + + # 获取视频属性 + ref_frame_width = 
int(ref_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + ref_frame_height = int(ref_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + ref_frames = [] + out_frames = [] + + print("Reading frames from videos...") + + while True: + ret_ref, ref_frame = ref_cap.read() + ret_out, out_frame = out_cap.read() + + if not ret_ref or not ret_out: + break + # 转换为RGB,因为Inception模型使用RGB图像 + ref_frames.append(cv2.cvtColor(ref_frame, cv2.COLOR_BGR2RGB)) + out_frames.append(cv2.cvtColor(out_frame, cv2.COLOR_BGR2RGB)) + + # 处理输出视频比参考视频少一帧的情况 + if len(out_frames) < len(ref_frames): + ref_frames = ref_frames[:len(out_frames)] + elif len(ref_frames) < len(out_frames): + out_frames = out_frames[:len(ref_frames)] + + frame_count = len(ref_frames) + print(f"Total frames to process: {frame_count}") + + # 释放视频捕获对象 + ref_cap.release() + out_cap.release() + + if frame_count == 0: + print("No frames to process.") + sys.exit(1) + + print("Extracting features from reference video frames...") + ref_features = get_features_batch(ref_frames) + + print("Extracting features from output video frames...") + out_features = get_features_batch(out_frames) + + print("Calculating FID score...") + fid_score = calculate_fid(ref_features, out_features) + print(f"FID score: {fid_score}") + + # 记录到日志文件 + log_fid(log_file_path, ref_video_path, out_video_path, frame_count, fid_score) + print(f"FID score logged to '{log_file_path}'.") diff --git a/syncnet_python-master/NIQE.py b/syncnet_python-master/NIQE.py new file mode 100755 index 00000000..5200b645 --- /dev/null +++ b/syncnet_python-master/NIQE.py @@ -0,0 +1,283 @@ +import cv2 +import numpy as np +import cupy as cp +import scipy.io +from os.path import dirname, join +from PIL import Image +import scipy.special +import math +import os +from skimage.transform import resize +import sys +import cupyx.scipy.ndimage as cupyx_ndimage # 导入cupyx的ndimage模块 + +gamma_range = np.arange(0.2, 10, 0.001) +a = scipy.special.gamma(2.0 / gamma_range) +a *= a +b = scipy.special.gamma(1.0 / 
gamma_range) +c = scipy.special.gamma(3.0 / gamma_range) +prec_gammas = a / (b * c) + +def aggd_features(imdata): + # 将imdata转移到GPU + imdata = cp.asarray(imdata) + # 展平imdata + imdata = imdata.flatten() + imdata2 = imdata * imdata + left_data = imdata2[imdata < 0] + right_data = imdata2[imdata >= 0] + + left_mean_sqrt = cp.sqrt(cp.average(left_data)) if left_data.size > 0 else 0 + right_mean_sqrt = cp.sqrt(cp.average(right_data)) if right_data.size > 0 else 0 + + gamma_hat = left_mean_sqrt / right_mean_sqrt if right_mean_sqrt != 0 else cp.inf + + imdata2_mean = cp.mean(imdata2) + r_hat = (cp.mean(cp.abs(imdata)) ** 2) / cp.mean(imdata2) if imdata2_mean != 0 else cp.inf + rhat_norm = r_hat * (((gamma_hat ** 3) + 1) * (gamma_hat + 1)) / ((gamma_hat ** 2) + 1) ** 2 + + pos = cp.argmin((cp.asarray(prec_gammas) - rhat_norm) ** 2).get() + alpha = gamma_range[pos] + + gam1 = scipy.special.gamma(1.0 / alpha) + gam2 = scipy.special.gamma(2.0 / alpha) + gam3 = scipy.special.gamma(3.0 / alpha) + + aggdratio = math.sqrt(gam1) / math.sqrt(gam3) + bl = aggdratio * left_mean_sqrt.get() + br = aggdratio * right_mean_sqrt.get() + + N = (br - bl) * (gam2 / gam1) + return (alpha, N, bl, br, left_mean_sqrt.get(), right_mean_sqrt.get()) + +def ggd_features(imdata): + nr_gam = 1 / prec_gammas + sigma_sq = cp.var(imdata) + E = cp.mean(cp.abs(imdata)) + rho = sigma_sq / E ** 2 + pos = cp.argmin(cp.abs(nr_gam - rho)).get() + return gamma_range[pos], sigma_sq.get() + +def paired_product(new_im): + # 将数据转移到CPU进行滚动操作 + new_im_cpu = new_im.get() + shift1 = np.roll(new_im_cpu.copy(), 1, axis=1) + shift2 = np.roll(new_im_cpu.copy(), 1, axis=0) + shift3 = np.roll(np.roll(new_im_cpu.copy(), 1, axis=0), 1, axis=1) + shift4 = np.roll(np.roll(new_im_cpu.copy(), 1, axis=0), -1, axis=1) + + H_img = shift1 * new_im_cpu + V_img = shift2 * new_im_cpu + D1_img = shift3 * new_im_cpu + D2_img = shift4 * new_im_cpu + + return cp.asarray(H_img), cp.asarray(V_img), cp.asarray(D1_img), cp.asarray(D2_img) + +def 
gen_gauss_window(lw, sigma): + sd = np.float32(sigma) + lw = int(lw) + weights = [0.0] * (2 * lw + 1) + weights[lw] = 1.0 + sum_weights = 1.0 + sd *= sd + for ii in range(1, lw + 1): + tmp = math.exp(-0.5 * float(ii * ii) / sd) + weights[lw + ii] = tmp + weights[lw - ii] = tmp + sum_weights += 2.0 * tmp + weights = [w / sum_weights for w in weights] + return cp.asarray(weights, dtype=cp.float32) + +def compute_image_mscn_transform(image, C=1, avg_window=None, extend_mode='constant'): + if avg_window is None: + avg_window = gen_gauss_window(3, 7.0 / 6.0) + assert len(image.shape) == 2 + h, w = image.shape + image = cp.asarray(image, dtype=cp.float32) + + # 使用cupyx.scipy.ndimage.correlate进行多维相关操作 + mu_image = cupyx_ndimage.correlate(image, avg_window[:, None], mode=extend_mode) + mu_image = cupyx_ndimage.correlate(mu_image, avg_window[None, :], mode=extend_mode) + + var_image = cupyx_ndimage.correlate(image ** 2, avg_window[:, None], mode=extend_mode) + var_image = cupyx_ndimage.correlate(var_image, avg_window[None, :], mode=extend_mode) + var_image = cp.sqrt(cp.abs(var_image - mu_image ** 2)) + return (image - mu_image) / (var_image + C), var_image, mu_image + +def _niqe_extract_subband_feats(mscncoefs): + alpha_m, N, bl, br, lsq, rsq = aggd_features(mscncoefs.copy()) + pps1, pps2, pps3, pps4 = paired_product(mscncoefs) + alpha1, N1, bl1, br1, lsq1, rsq1 = aggd_features(pps1) + alpha2, N2, bl2, br2, lsq2, rsq2 = aggd_features(pps2) + alpha3, N3, bl3, br3, lsq3, rsq3 = aggd_features(pps3) + alpha4, N4, bl4, br4, lsq4, rsq4 = aggd_features(pps4) + return cp.asnumpy(cp.array([alpha_m, (bl + br) / 2.0, + alpha1, N1, bl1, br1, # (V) + alpha2, N2, bl2, br2, # (H) + alpha3, N3, bl3, bl3, # (D1) + alpha4, N4, bl4, bl4, # (D2) + ])) + +def get_patches_train_features(img, patch_size, stride=8): + return _get_patches_generic(img, patch_size, 1, stride) + +def get_patches_test_features(img, patch_size, stride=8): + return _get_patches_generic(img, patch_size, 0, stride) + +def 
extract_on_patches(img, patch_size): + h, w = img.shape + patch_size = int(patch_size) + patches = [] + for j in range(0, h - patch_size + 1, patch_size): + for i in range(0, w - patch_size + 1, patch_size): + patch = img[j:j + patch_size, i:i + patch_size] + patches.append(patch) + + patches = cp.asarray(patches) + + patch_features = [] + for p in patches: + patch_features.append(_niqe_extract_subband_feats(p)) + patch_features = cp.asarray(patch_features) + return cp.asnumpy(patch_features) + +def _get_patches_generic(img, patch_size, is_train, stride): + h, w = img.shape + if h < patch_size or w < patch_size: + print("Input image is too small") + exit(0) + + # 确保补丁能够均匀划分 + hoffset = h % patch_size + woffset = w % patch_size + + if hoffset > 0: + img = img[:-hoffset, :] + if woffset > 0: + img = img[:, :-woffset] + + img = img.astype(cp.float32) + # 使用skimage在CPU上调整图像大小 + img_cpu = cp.asnumpy(img) + img2_cpu = resize(img_cpu, (int(img_cpu.shape[0] * 0.5), int(img_cpu.shape[1] * 0.5)), mode='constant', anti_aliasing=True) + img2 = cp.asarray(img2_cpu, dtype=cp.float32) + + mscn1, var, mu = compute_image_mscn_transform(img) + mscn1 = mscn1.astype(cp.float32) + + mscn2, _, _ = compute_image_mscn_transform(img2) + mscn2 = mscn2.astype(cp.float32) + + feats_lvl1 = extract_on_patches(mscn1, patch_size) + feats_lvl2 = extract_on_patches(mscn2, patch_size / 2) + + feats = np.hstack((feats_lvl1, feats_lvl2)) # feats_lvl3)) + return feats + +def niqe(inputImgData): + patch_size = 96 + module_path = dirname(__file__) + + # 加载预训练的NIQE参数 + params = scipy.io.loadmat(join(module_path, 'data', 'niqe_image_params.mat')) + pop_mu = cp.asarray(np.ravel(params["pop_mu"])) + pop_cov = cp.asarray(params["pop_cov"]) + + M, N = inputImgData.shape + + assert M > (patch_size * 2 + 1), "niqe called with small frame size, requires > 192x192 resolution video using current training parameters" + assert N > (patch_size * 2 + 1), "niqe called with small frame size, requires > 192x192 resolution 
video using current training parameters" + + # 将图像转移到GPU + inputImgData = cp.asarray(inputImgData, dtype=cp.float32) + + feats = cp.asarray(get_patches_test_features(inputImgData, patch_size)) + sample_mu = cp.mean(feats, axis=0) + sample_cov = cp.cov(feats.T) + + X = sample_mu - pop_mu + covmat = (pop_cov + sample_cov) / 2.0 + pinvmat = cp.linalg.pinv(covmat) + niqe_score = cp.sqrt(cp.dot(cp.dot(X, pinvmat), X)).get() + + return niqe_score + +def evaluate_video_with_niqe(video_path, log_file_path="niqe_log.txt"): + cap = cv2.VideoCapture(video_path) + + if not cap.isOpened(): + print("Error: Could not open video.") + return + + niqe_values = [] + NIQE_sum = 0 + frame_count = 0 + + # 设置设备为GPU(如果可用) + device = cp.cuda.Device() + device.use() + + # 打开日志文件以追加模式 + try: + log_file = open(log_file_path, "a") + except Exception as e: + print(f"Error: Could not open log file {log_file_path}: {e}") + cap.release() + return + + # 写入日志文件的头部(如果文件是空的) + if os.path.getsize(log_file_path) == 0: + log_file.write("Video_Path,Frame_Count,Average_NIQE\n") + + print("Processing frames and calculating NIQE...") + + while True: + ret, frame = cap.read() + if not ret: + break + + # 将帧从BGR转换为灰度图 + gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + + # 计算NIQE + try: + niqe_value = niqe(gray_frame) + except Exception as e: + print(f"Error during NIQE calculation on frame {frame_count + 1}: {e}") + continue # 跳过该帧并继续处理 + + NIQE_sum += niqe_value + frame_count += 1 + + # 可选:打印每帧的NIQE分数 + # print(f"Frame {frame_count}: NIQE score = {niqe_value}") + + cap.release() + + if frame_count > 0: + NIQE_mean = NIQE_sum / frame_count + print(f"Average NIQE over {frame_count} frames: {NIQE_mean}") + + # 记录到日志文件 + log_entry = f"\"{video_path}\",{frame_count},{NIQE_mean}\n" + log_file.write(log_entry) + else: + print("No frames were processed.") + + log_file.close() + +if __name__ == "__main__": + if len(sys.argv) < 2 or len(sys.argv) > 3: + print("Usage: python script_name.py []") + sys.exit(1) + 
video_path = sys.argv[1] + + # 可选:获取日志文件路径 + if len(sys.argv) == 3: + log_file_path = sys.argv[2] + else: + log_file_path = "niqe_log.txt" + + evaluate_video_with_niqe(video_path, log_file_path) + print(f"NIQE scores logged to '{log_file_path}'.") + diff --git a/syncnet_python-master/PSNR.py b/syncnet_python-master/PSNR.py new file mode 100755 index 00000000..3525069c --- /dev/null +++ b/syncnet_python-master/PSNR.py @@ -0,0 +1,118 @@ +import cv2 +import numpy as np +import cupy as cp +import sys +import os + +def compute_psnr_gpu(ref_gray, out_gray): + """ + 在GPU上计算PSNR。 + """ + # 将图像数据转换为float32类型并传输到GPU + ref_gpu = cp.asarray(ref_gray, dtype=cp.float32) + out_gpu = cp.asarray(out_gray, dtype=cp.float32) + + # 计算均方误差 (MSE) + mse = cp.mean((ref_gpu - out_gpu) ** 2) + + if mse == 0: + return float('inf') + + PIXEL_MAX = 255.0 + psnr_value = 20 * cp.log10(PIXEL_MAX / cp.sqrt(mse)) + + # 将结果从GPU传回CPU + return psnr_value.get() + +def evaluate_video_with_psnr(reference_video_path, output_video_path, log_file_path="psnr_log.txt"): + # 打开参考视频和输出视频文件 + ref_cap = cv2.VideoCapture(reference_video_path) + out_cap = cv2.VideoCapture(output_video_path) + + if not ref_cap.isOpened() or not out_cap.isOpened(): + print(f"Error: Could not open one of the video files.") + return + + # 获取视频属性 + ref_frame_width = int(ref_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + ref_frame_height = int(ref_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # 初始化变量 + frame_count = 0 + psnr_scores = [] + + # 打开日志文件以追加模式 + try: + log_file = open(log_file_path, "a") + except Exception as e: + print(f"Error: Could not open log file {log_file_path}: {e}") + ref_cap.release() + out_cap.release() + return + + # 写入日志文件的头部(如果文件是空的) + if os.path.getsize(log_file_path) == 0: + log_file.write("Reference_Video_Path,Output_Video_Path,Frame_Count,Average_PSNR\n") + + while True: + ret_ref, ref_frame = ref_cap.read() + ret_out, out_frame = out_cap.read() + + if not ret_ref: + # 参考视频已经读完,但输出视频可能还有剩余帧(不处理这些帧) + break + + if not 
ret_out: + break + + # 调整输出帧大小以匹配参考帧 + out_frame_resized = cv2.resize(out_frame, (ref_frame_width, ref_frame_height)) + + # 将BGR图像转换为灰度图像 + ref_gray = cv2.cvtColor(ref_frame, cv2.COLOR_BGR2GRAY) + out_gray = cv2.cvtColor(out_frame_resized, cv2.COLOR_BGR2GRAY) + + # 使用GPU计算PSNR + score = compute_psnr_gpu(ref_gray, out_gray) + + # 保存PSNR分数 + psnr_scores.append(score) + + frame_count += 1 + + # 可选:打印每帧的PSNR分数 + # print(f"Frame {frame_count}: PSNR score = {score}") + + # 计算并打印平均PSNR分数 + if psnr_scores: + average_score = np.mean(psnr_scores) + output_message = f"Average PSNR score over {frame_count} frames: {average_score}" + print(output_message) + + # 写入日志文件 + log_entry = f"\"{reference_video_path}\",\"{output_video_path}\",Average PSNR score over{frame_count},frames:{average_score}\n" + log_file.write(log_entry) + else: + print("No frames were processed.") + + # 关闭日志文件 + log_file.close() + + # 释放视频捕获对象 + ref_cap.release() + out_cap.release() + +if __name__ == "__main__": + if len(sys.argv) < 3 or len(sys.argv) > 4: + print("Usage: python script_name.py []") + sys.exit(1) + reference_video_path = sys.argv[1] + output_video_path = sys.argv[2] + + # 可选:获取日志文件路径 + if len(sys.argv) == 4: + log_file_path = sys.argv[3] + else: + log_file_path = "psnr_log.txt" + + evaluate_video_with_psnr(reference_video_path, output_video_path, log_file_path) diff --git a/syncnet_python-master/README.md b/syncnet_python-master/README.md new file mode 100755 index 00000000..4d607ba8 --- /dev/null +++ b/syncnet_python-master/README.md @@ -0,0 +1,82 @@ +# test-audio2head + +## 使用docker进行部署 +### 首先下载checkpoints + +首先请从以下网址下载[google-drive](https://drive.google.com/file/d/1tvI43ZIrnx9Ti2TpFiEO4dK5DOwcECD7/view?usp=sharing),当上以连接无法访问或者载入后无法运行时,请下载这个备用[checkpoint2](https://drive.google.com/drive/folders/1k-6im7e4EkPjQSXCO7jWEQwYSHCsCyJb?usp=sharing)。 + +下载完成后,请将下载好的checkpoints文件夹放到syncnet_python-master/Audio2Head/Audio2Head目录下 + +完成以上步骤后,请用以下命令构建docker容器 +``` bash +docker build -t syn . 
+``` + +构建完成后,你可以开始运行容器,但是请注意,请确保你准备好了用于的评估的原视频,用于模型生成视频的评估视频的第一帧图片和音频。同时还需要注意,你的评估视频,用于模型生成视频的第一帧图片和音频的文件名应该相同。(比如,你的一个评估视频是eric.mp4,那么你应该截取这个视频的第一帧图片用于模型生成,并命名为eric.png,同截取这个视频的音频,命名为eric.wav)然后将你的所有评估视频放一个文件夹(例如ref_video),所有图片放一个文件夹(例如input_img),所有音频放一个文件夹(例如input_wav)然后通过以下命令进行挂载和运行程序 +```bash +docker run --rm --gpus all \ +-v /path/to/your/input_img/:/app/Audio2Head/Audio2Head/input_img \ +-v /path/to/your/input_wav/:/app/Audio2Head/Audio2Head/input_wav \ +-v /path/to/your/ref_video/:/app/ref_video \ +syn +``` + +然后通过批处理程序,会自动生成调用模型生成视频然后进行评估 +## 使用conda环境部署 +如果你的系统没有Anconda环境,可以按以下操作 +### 1.首先安装conda环境 +打开终端,下载 Miniconda 安装包: +``` bash + +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + +``` +运行安装脚本(注意最后添加环境变量时选择yes,全部输yes) +``` +bash Miniconda3-latest-Linux-x86_64.sh +``` +完成安装后,执行: +``` +source ~/.bashrc +``` + +### 2.创建conda环境并进入 + +``` +conda create -n syn python=3.10 && conda activate syn +``` + +### 3.进入到项目目录 +首先安装依赖 + +``` +# 确保你的电脑安装了cuda12以上的版本 +pip install -r requirements.txt +pip install imageio[ffmpeg] +pip install cupy-cuda12x +apt-get install ffmpeg +``` +然后将你的所有评估视频的第一帧图像放到 +``` +Audio2Head\Audio2Head\input_img +``` +将所有评估视频的音频放到 +``` +Audio2Head\Audio2Head\input_wav +``` +注意,input_wav里要求wav格式,同时input_img和input_wav对应的视频名字应该相同 + +然后将你的评估视频保存到 +``` +ref_video +``` + +然后运行run_inference.sh批处理文件 +``` +./run_inference.sh +``` +等上一个批处理脚本运行完后,运行 +``` +python batch_psnr.py +``` +开始评估 diff --git a/syncnet_python-master/SSIM.py b/syncnet_python-master/SSIM.py new file mode 100755 index 00000000..70474d6f --- /dev/null +++ b/syncnet_python-master/SSIM.py @@ -0,0 +1,66 @@ +import cv2 +import numpy as np +from skimage.metrics import structural_similarity as ssim +import sys + +def calculate_video_ssim(ref_video_path, out_video_path): + # 打开视频文件 + ref_cap = cv2.VideoCapture(ref_video_path) + out_cap = cv2.VideoCapture(out_video_path) + + # 获取视频帧的宽度和高度(假设两者相同) + ref_frame_width = int(ref_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + 
ref_frame_height = int(ref_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + # 初始化SSIM列表 + ssim_scores = [] + + while True: + # 读取参考视频和输出视频的帧 + ret_ref, ref_frame = ref_cap.read() + ret_out, out_frame = out_cap.read() + + # 如果输出视频已经结束,则停止循环(即使参考视频还有帧) + if not ret_out: + break + + # 如果参考视频已经结束(理论上不应该发生,因为参考视频应该更长),则也停止循环 + if not ret_ref: + print("Warning: Reference video ended before output video, which is unexpected.") + break + + # 转换帧的颜色空间(如果需要,这里假设已经是RGB或灰度图,OpenCV默认读取为BGR) + # ref_frame = cv2.cvtColor(ref_frame, cv2.COLOR_BGR2RGB) # 如果需要RGB + # out_frame = cv2.cvtColor(out_frame, cv2.COLOR_BGR2RGB) # 如果需要RGB + out_frame_resized = cv2.resize(out_frame, (ref_frame_width, ref_frame_height)) + # 由于OpenCV读取的是BGR,而skimage.metrics.structural_similarity期望的是RGB或灰度图, + # 如果你的SSIM计算库需要RGB,请取消上面两行的注释,并注释掉下面的转换(这里我们假设使用灰度图) + ref_gray = cv2.cvtColor(ref_frame, cv2.COLOR_BGR2GRAY) + out_gray = cv2.cvtColor(out_frame_resized, cv2.COLOR_BGR2GRAY) + + # 计算SSIM + score, _ = ssim(ref_gray, out_gray, full=True) + ssim_scores.append(score) + + # 处理输出视频比参考视频少一帧的情况(实际上在这个循环中不需要额外处理,因为循环会在输出视频结束时停止) + + # 计算平均SSIM + average_ssim = np.mean(ssim_scores) + + # 释放视频捕获对象 + ref_cap.release() + out_cap.release() + + return average_ssim, ssim_scores + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python script_name.py ") + sys.exit(1) + ref_video_path = sys.argv[1] + out_video_path = sys.argv[2] + average_ssim, ssim_scores = calculate_video_ssim(ref_video_path, out_video_path) + print(f"Average SSIM: {average_ssim}") + # 如果需要查看每一帧的SSIM,可以打印ssim_scores列表 + # for i, score in enumerate(ssim_scores): + # print(f"Frame {i+1} SSIM: {score}") \ No newline at end of file diff --git a/syncnet_python-master/SyncNetInstance.py b/syncnet_python-master/SyncNetInstance.py new file mode 100755 index 00000000..497d44fc --- /dev/null +++ b/syncnet_python-master/SyncNetInstance.py @@ -0,0 +1,208 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +# Video 25 FPS, Audio 
16000HZ + +import torch +import numpy +import time, pdb, argparse, subprocess, os, math, glob +import cv2 +import python_speech_features + +from scipy import signal +from scipy.io import wavfile +from SyncNetModel import * +from shutil import rmtree + + +# ==================== Get OFFSET ==================== + +def calc_pdist(feat1, feat2, vshift=10): + + win_size = vshift*2+1 + + feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) + + dists = [] + + for i in range(0,len(feat1)): + + dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) + + return dists + +# ==================== MAIN DEF ==================== + +class SyncNetInstance(torch.nn.Module): + + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): + super(SyncNetInstance, self).__init__(); + + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + + def evaluate(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Convert files + # ========== ========== + + if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + + os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + + command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) + output = subprocess.call(command, shell=True, stdout=None) + + command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) + output = subprocess.call(command, shell=True, stdout=None) + + # ========== ========== + # Load video + # ========== ========== + + images = [] + + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) + flist.sort() + + for fname in flist: + images.append(cv2.imread(fname)) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = 
torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Load audio + # ========== ========== + + sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) + mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) + mfcc = numpy.stack([numpy.array(i) for i in mfcc]) + + cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) + cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + + # ========== ========== + # Check audio and video input length + # ========== ========== + + if (float(len(audio))/16000) != (float(len(images))/25) : + print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + + min_length = min(len(images),math.floor(len(audio)/640)) + + # ========== ========== + # Generate video and audio feats + # ========== ========== + + lastframe = min_length-5 + im_feat = [] + cc_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lip(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + cc_in = torch.cat(cc_batch,0) + cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_feat.append(cc_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + cc_feat = torch.cat(cc_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' 
% (time.time()-tS)) + + dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) + mdist = torch.mean(torch.stack(dists,1),1) + + minval, minidx = torch.min(mdist,0) + + offset = opt.vshift-minidx + conf = torch.median(mdist) - minval + + fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) + # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) + fconf = torch.median(mdist).numpy() - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) + print('Framewise conf: ') + print(fconfm) + print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + + dists_npy = numpy.array([ dist.numpy() for dist in dists ]) + return offset.numpy(), conf.numpy(), dists_npy + + def extract_feature(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Load video + # ========== ========== + cap = cv2.VideoCapture(videofile) + + frame_num = 1; + images = [] + while frame_num: + frame_num += 1 + ret, image = cap.read() + if ret == 0: + break + + images.append(image) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Generate video feats + # ========== ========== + + lastframe = len(images)-4 + im_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' 
% (time.time()-tS)) + + return im_feat + + + def loadParameters(self, path): + loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + + self_state = self.__S__.state_dict(); + + for name, param in loaded_state.items(): + + self_state[name].copy_(param); diff --git a/syncnet_python-master/SyncNetInstance_calc_scores.py b/syncnet_python-master/SyncNetInstance_calc_scores.py new file mode 100755 index 00000000..64906e25 --- /dev/null +++ b/syncnet_python-master/SyncNetInstance_calc_scores.py @@ -0,0 +1,210 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- +# Video 25 FPS, Audio 16000HZ + +import torch +import numpy +import time, pdb, argparse, subprocess, os, math, glob +import cv2 +import python_speech_features + +from scipy import signal +from scipy.io import wavfile +from SyncNetModel import * +from shutil import rmtree + + +# ==================== Get OFFSET ==================== + +def calc_pdist(feat1, feat2, vshift=10): + + win_size = vshift*2+1 + + feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift)) + + dists = [] + + for i in range(0,len(feat1)): + + dists.append(torch.nn.functional.pairwise_distance(feat1[[i],:].repeat(win_size, 1), feat2p[i:i+win_size,:])) + + return dists + +# ==================== MAIN DEF ==================== + +class SyncNetInstance(torch.nn.Module): + + def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024): + super(SyncNetInstance, self).__init__(); + + self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda(); + + def evaluate(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Convert files + # ========== ========== + + if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + + os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + + command = ("ffmpeg -loglevel error -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) + output = subprocess.call(command, 
shell=True, stdout=None) + + command = ("ffmpeg -loglevel error -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) + output = subprocess.call(command, shell=True, stdout=None) + + # ========== ========== + # Load video + # ========== ========== + + images = [] + + flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg')) + flist.sort() + + for fname in flist: + img_input = cv2.imread(fname) + img_input = cv2.resize(img_input, (224,224)) #HARD CODED, CHANGE BEFORE RELEASE + images.append(img_input) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Load audio + # ========== ========== + + sample_rate, audio = wavfile.read(os.path.join(opt.tmp_dir,opt.reference,'audio.wav')) + mfcc = zip(*python_speech_features.mfcc(audio,sample_rate)) + mfcc = numpy.stack([numpy.array(i) for i in mfcc]) + + cc = numpy.expand_dims(numpy.expand_dims(mfcc,axis=0),axis=0) + cct = torch.autograd.Variable(torch.from_numpy(cc.astype(float)).float()) + + # ========== ========== + # Check audio and video input length + # ========== ========== + + #if (float(len(audio))/16000) != (float(len(images))/25) : + # print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25)) + + min_length = min(len(images),math.floor(len(audio)/640)) + + # ========== ========== + # Generate video and audio feats + # ========== ========== + + lastframe = min_length-5 + im_feat = [] + cc_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + im_out = self.__S__.forward_lip(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + cc_batch = [ 
cct[:,:,:,vframe*4:vframe*4+20] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + cc_in = torch.cat(cc_batch,0) + cc_out = self.__S__.forward_aud(cc_in.cuda()) + cc_feat.append(cc_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + cc_feat = torch.cat(cc_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + #print('Compute time %.3f sec.' % (time.time()-tS)) + + dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift) + mdist = torch.mean(torch.stack(dists,1),1) + + minval, minidx = torch.min(mdist,0) + + offset = opt.vshift-minidx + conf = torch.median(mdist) - minval + + fdist = numpy.stack([dist[minidx].numpy() for dist in dists]) + # fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=15) + fconf = torch.median(mdist).numpy() - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format}) + #print('Framewise conf: ') + #print(fconfm) + #print('AV offset: \t%d \nMin dist: \t%.3f\nConfidence: \t%.3f' % (offset,minval,conf)) + + dists_npy = numpy.array([ dist.numpy() for dist in dists ]) + return offset.numpy(), conf.numpy(), minval.numpy() + + def extract_feature(self, opt, videofile): + + self.__S__.eval(); + + # ========== ========== + # Load video + # ========== ========== + cap = cv2.VideoCapture(videofile) + + frame_num = 1; + images = [] + while frame_num: + frame_num += 1 + ret, image = cap.read() + if ret == 0: + break + + images.append(image) + + im = numpy.stack(images,axis=3) + im = numpy.expand_dims(im,axis=0) + im = numpy.transpose(im,(0,3,4,1,2)) + + imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float()) + + # ========== ========== + # Generate video feats + # ========== ========== + + lastframe = len(images)-4 + im_feat = [] + + tS = time.time() + for i in range(0,lastframe,opt.batch_size): + + im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ] + im_in = torch.cat(im_batch,0) + 
im_out = self.__S__.forward_lipfeat(im_in.cuda()); + im_feat.append(im_out.data.cpu()) + + im_feat = torch.cat(im_feat,0) + + # ========== ========== + # Compute offset + # ========== ========== + + print('Compute time %.3f sec.' % (time.time()-tS)) + + return im_feat + + + def loadParameters(self, path): + loaded_state = torch.load(path, map_location=lambda storage, loc: storage); + + self_state = self.__S__.state_dict(); + + for name, param in loaded_state.items(): + + self_state[name].copy_(param); diff --git a/syncnet_python-master/SyncNetModel.py b/syncnet_python-master/SyncNetModel.py new file mode 100755 index 00000000..12e87a9f --- /dev/null +++ b/syncnet_python-master/SyncNetModel.py @@ -0,0 +1,117 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*-conc + +import torch +import torch.nn as nn + +def save(model, filename): + with open(filename, "wb") as f: + torch.save(model, f); + print("%s saved."%filename); + +def load(filename): + net = torch.load(filename) + return net; + +class S(nn.Module): + def __init__(self, num_layers_in_fc_layers = 1024): + super(S, self).__init__(); + + self.__nFeatures__ = 24; + self.__nChs__ = 32; + self.__midChs__ = 32; + + self.netcnnaud = nn.Sequential( + nn.Conv2d(1, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1)), + nn.BatchNorm2d(64), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(1,1), stride=(1,1)), + + nn.Conv2d(64, 192, kernel_size=(3,3), stride=(1,1), padding=(1,1)), + nn.BatchNorm2d(192), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(3,3), stride=(1,2)), + + nn.Conv2d(192, 384, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(384), + nn.ReLU(inplace=True), + + nn.Conv2d(384, 256, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + + nn.Conv2d(256, 256, kernel_size=(3,3), padding=(1,1)), + nn.BatchNorm2d(256), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=(3,3), stride=(2,2)), + + nn.Conv2d(256, 512, kernel_size=(5,4), padding=(0,0)), + nn.BatchNorm2d(512), + 
nn.ReLU(), + ); + + self.netfcaud = nn.Sequential( + nn.Linear(512, 512), + nn.BatchNorm1d(512), + nn.ReLU(), + nn.Linear(512, num_layers_in_fc_layers), + ); + + self.netfclip = nn.Sequential( + nn.Linear(512, 512), + nn.BatchNorm1d(512), + nn.ReLU(), + nn.Linear(512, num_layers_in_fc_layers), + ); + + self.netcnnlip = nn.Sequential( + nn.Conv3d(3, 96, kernel_size=(5,7,7), stride=(1,2,2), padding=0), + nn.BatchNorm3d(96), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)), + + nn.Conv3d(96, 256, kernel_size=(1,5,5), stride=(1,2,2), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2), padding=(0,1,1)), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + + nn.Conv3d(256, 256, kernel_size=(1,3,3), padding=(0,1,1)), + nn.BatchNorm3d(256), + nn.ReLU(inplace=True), + nn.MaxPool3d(kernel_size=(1,3,3), stride=(1,2,2)), + + nn.Conv3d(256, 512, kernel_size=(1,6,6), padding=0), + nn.BatchNorm3d(512), + nn.ReLU(inplace=True), + ); + + def forward_aud(self, x): + + mid = self.netcnnaud(x); # N x ch x 24 x M + mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) + out = self.netfcaud(mid); + + return out; + + def forward_lip(self, x): + + mid = self.netcnnlip(x); + mid = mid.view((mid.size()[0], -1)); # N x (ch x 24) + out = self.netfclip(mid); + + return out; + + def forward_lipfeat(self, x): + + mid = self.netcnnlip(x); + out = mid.view((mid.size()[0], -1)); # N x (ch x 24) + + return out; \ No newline at end of file diff --git a/syncnet_python-master/__pycache__/SyncNetInstance.cpython-39.pyc b/syncnet_python-master/__pycache__/SyncNetInstance.cpython-39.pyc new file mode 100755 index 00000000..dfe081fd Binary files /dev/null and b/syncnet_python-master/__pycache__/SyncNetInstance.cpython-39.pyc differ diff --git 
a/syncnet_python-master/__pycache__/SyncNetInstance_calc_scores.cpython-39.pyc b/syncnet_python-master/__pycache__/SyncNetInstance_calc_scores.cpython-39.pyc new file mode 100755 index 00000000..d670ff95 Binary files /dev/null and b/syncnet_python-master/__pycache__/SyncNetInstance_calc_scores.cpython-39.pyc differ diff --git a/syncnet_python-master/__pycache__/SyncNetModel.cpython-39.pyc b/syncnet_python-master/__pycache__/SyncNetModel.cpython-39.pyc new file mode 100755 index 00000000..1c8a3037 Binary files /dev/null and b/syncnet_python-master/__pycache__/SyncNetModel.cpython-39.pyc differ diff --git a/syncnet_python-master/all_scores.txt b/syncnet_python-master/all_scores.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/batch_process.log b/syncnet_python-master/batch_process.log new file mode 100755 index 00000000..139597f9 --- /dev/null +++ b/syncnet_python-master/batch_process.log @@ -0,0 +1,2 @@ + + diff --git a/syncnet_python-master/batch_psnr.log b/syncnet_python-master/batch_psnr.log new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/batch_psnr.py b/syncnet_python-master/batch_psnr.py new file mode 100755 index 00000000..181c3f38 --- /dev/null +++ b/syncnet_python-master/batch_psnr.py @@ -0,0 +1,145 @@ +import os +import subprocess +import logging +from multiprocessing import Pool, cpu_count + +def get_ref_video_files(directory): + return [f for f in os.listdir(directory) if f.lower().endswith('.mp4')] + +def setup_logging(log_file='batch_process.log'): + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s %(levelname)s: %(message)s', + handlers=[ + logging.FileHandler(log_file, mode='w', encoding='utf-8'), + logging.StreamHandler() + ] + ) + +def run_command(command, logger): + try: + result = subprocess.run( + command, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + logger.info(f"命令成功执行: {' '.join(command)}") + 
logger.info(f"输出:\n{result.stdout}") + if result.stderr: + logger.warning(f"错误输出:\n{result.stderr}") + except subprocess.CalledProcessError as e: + logger.error(f"命令执行失败: {' '.join(command)}") + logger.error(f"错误输出:\n{e.stderr}") + +def process_video_pair(args): + ref_path, out_path, logger = args + ref_filename = os.path.basename(ref_path) + out_filename = os.path.basename(out_path) + + logger.info(f"正在处理: {ref_filename} 与 {out_filename}") + + # 1. 运行 PSNR.py + psnr_command = ['python', 'PSNR.py', ref_path, out_path] + run_command(psnr_command, logger) + + # 2. 运行 NIQE.py + niqe_command = ['python', 'NIQE.py', out_path] + run_command(niqe_command, logger) + + # 3. 运行 SSIM.py + ssim_command = ['python', 'SSIM.py', ref_path, out_path] + run_command(ssim_command, logger) + + # 4. 运行 FID.py + fid_command = ['python', 'FID.py', ref_path, out_path] + run_command(fid_command, logger) + + # 5. 运行 run_pipeline.py + run_pipeline_command = [ + 'python', 'run_pipeline.py', + '--videofile', out_path, + '--reference', 'wav2lip', + '--data_dir', 'tmp_dir' + ] + run_command(run_pipeline_command, logger) + + # 6. 
运行 calculate_scores_real_videos.py 并将输出追加到 all_scores.txt + calculate_scores_command = [ + 'python', 'calculate_scores_real_videos.py', + '--videofile', out_path, + '--reference', 'wav2lip', + '--data_dir', 'tmp_dir' + ] + try: + result = subprocess.run( + calculate_scores_command, + check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + logger.info(f"命令成功执行: {' '.join(calculate_scores_command)}") + logger.info(f"输出:\n{result.stdout}") + if result.stderr: + logger.warning(f"错误输出:\n{result.stderr}") + + # 追加输出到 all_scores.txt + all_scores_path = os.path.join(os.getcwd(), 'all_scores.txt') + with open(all_scores_path, 'a') as f: + f.write(result.stdout) + except subprocess.CalledProcessError as e: + logger.error(f"命令执行失败: {' '.join(calculate_scores_command)}") + logger.error(f"错误输出:\n{e.stderr}") + +def main(ref_dir, out_dir): + setup_logging() + logger = logging.getLogger() + + if not os.path.isdir(ref_dir): + logger.error(f"参考视频目录不存在: {ref_dir}") + return + if not os.path.isdir(out_dir): + logger.error(f"输出视频目录不存在: {out_dir}") + return + + ref_files = get_ref_video_files(ref_dir) + + if not ref_files: + logger.error("参考视频目录下没有找到任何.mp4文件。") + return + + logger.info(f"找到 {len(ref_files)} 个参考文件。开始处理...") + + # 清空 all_scores.txt + all_scores_path = os.path.join(os.getcwd(), 'all_scores.txt') + with open(all_scores_path, 'w') as f: + f.write('') # 清空文件 + + # 准备任务列表 + tasks = [] + for ref_filename in sorted(ref_files): + ref_basename = os.path.splitext(ref_filename)[0] + out_filename = f"{ref_basename}_{ref_basename}.mp4" + ref_path = os.path.join(ref_dir, ref_filename) + out_path = os.path.join(out_dir, out_filename) + + if not os.path.isfile(out_path): + logger.warning(f"输出视频文件不存在: {out_filename}. 
跳过.") + continue + + tasks.append((ref_path, out_path, logger)) + + # 使用进程池并行处理 + with Pool(processes=cpu_count()) as pool: + pool.map(process_video_pair, tasks) + + logger.info("所有文件处理完成。") + +if __name__ == "__main__": + script_dir = os.path.dirname(os.path.abspath(__file__)) + ref_video_dir = os.path.join(script_dir, 'ref_video') + out_video_dir = os.path.join(script_dir, 'out_video') + + main(ref_video_dir, out_video_dir) diff --git a/syncnet_python-master/calculate_scores_LRS.py b/syncnet_python-master/calculate_scores_LRS.py new file mode 100755 index 00000000..eda02b8f --- /dev/null +++ b/syncnet_python-master/calculate_scores_LRS.py @@ -0,0 +1,53 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess +import glob +import os +from tqdm import tqdm + +from SyncNetInstance_calc_scores import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_root', type=str, required=True, help=''); +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); +parser.add_argument('--reference', type=str, default="demo", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +#print("Model %s loaded."%opt.initial_model); +path = os.path.join(opt.data_root, "*.mp4") + +all_videos = glob.glob(path) + +prog_bar = tqdm(range(len(all_videos))) +avg_confidence = 0. +avg_min_distance = 0. 
+ + +for videofile_idx in prog_bar: + videofile = all_videos[videofile_idx] + offset, confidence, min_distance = s.evaluate(opt, videofile=videofile) + avg_confidence += confidence + avg_min_distance += min_distance + prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / (videofile_idx + 1), 3), round(avg_min_distance / (videofile_idx + 1), 3))) + prog_bar.refresh() + +print ('Average Confidence: {}'.format(avg_confidence/len(all_videos))) +print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos))) + + + diff --git a/syncnet_python-master/calculate_scores_real_videos.py b/syncnet_python-master/calculate_scores_real_videos.py new file mode 100755 index 00000000..391526e0 --- /dev/null +++ b/syncnet_python-master/calculate_scores_real_videos.py @@ -0,0 +1,44 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess, pickle, os, gzip, glob + +from SyncNetInstance_calc_scores import * +# ==================== PARSE ARGUMENT ==================== + +parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--initial_model', type=str, default="/app/data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_dir', type=str, default='/app/data/work', help=''); +parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) + + +# ==================== LOAD MODEL AND FILE LIST ==================== +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +#print("Model %s 
loaded."%opt.initial_model); + +flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) +flist.sort() + +# ==================== GET OFFSETS ==================== + +dists = [] +for idx, fname in enumerate(flist): + offset, conf, dist = s.evaluate(opt,videofile=fname) + print (str(dist)+" "+str(conf)) + dists.append((dist, conf)) + +# ==================== PRINT RESULTS TO FILE ==================== + +with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: + pickle.dump(dists, fil) diff --git a/syncnet_python-master/calculate_scores_real_videos.sh b/syncnet_python-master/calculate_scores_real_videos.sh new file mode 100755 index 00000000..4a45cd56 --- /dev/null +++ b/syncnet_python-master/calculate_scores_real_videos.sh @@ -0,0 +1,8 @@ +rm all_scores.txt +yourfilenames=`ls $1` + +for eachfile in $yourfilenames +do + python run_pipeline.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir + python calculate_scores_real_videos.py --videofile $1/$eachfile --reference wav2lip --data_dir tmp_dir >> all_scores.txt +done diff --git a/syncnet_python-master/data/example.avi b/syncnet_python-master/data/example.avi new file mode 100755 index 00000000..68a47538 Binary files /dev/null and b/syncnet_python-master/data/example.avi differ diff --git a/syncnet_python-master/data/niqe_image_params.mat b/syncnet_python-master/data/niqe_image_params.mat new file mode 100755 index 00000000..53df0998 Binary files /dev/null and b/syncnet_python-master/data/niqe_image_params.mat differ diff --git a/syncnet_python-master/data/syncnet_v2.model b/syncnet_python-master/data/syncnet_v2.model new file mode 100755 index 00000000..230757f4 --- /dev/null +++ b/syncnet_python-master/data/syncnet_v2.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:961e8696f888fce4f3f3a6c3d5b3267cf5b343100b238e79b2659bff2c605442 +size 54573114 diff --git a/syncnet_python-master/data/work/pywork/faces.pckl 
b/syncnet_python-master/data/work/pywork/faces.pckl new file mode 100755 index 00000000..92c3c883 --- /dev/null +++ b/syncnet_python-master/data/work/pywork/faces.pckl @@ -0,0 +1 @@ +]. \ No newline at end of file diff --git a/syncnet_python-master/demo_feature.py b/syncnet_python-master/demo_feature.py new file mode 100755 index 00000000..e3bd290e --- /dev/null +++ b/syncnet_python-master/demo_feature.py @@ -0,0 +1,32 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess + +from SyncNetInstance import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); +parser.add_argument('--tmp_dir', type=str, default="data", help=''); +parser.add_argument('--save_as', type=str, default="data/features.pt", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +feats = s.extract_feature(opt, videofile=opt.videofile) + +torch.save(feats, opt.save_as) diff --git a/syncnet_python-master/demo_syncnet.py b/syncnet_python-master/demo_syncnet.py new file mode 100755 index 00000000..01c25a6f --- /dev/null +++ b/syncnet_python-master/demo_syncnet.py @@ -0,0 +1,30 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess + +from SyncNetInstance import * + +# ==================== LOAD PARAMS ==================== + + +parser = argparse.ArgumentParser(description = "SyncNet"); + +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); 
+parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--videofile', type=str, default="data/example.avi", help=''); +parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help=''); +parser.add_argument('--reference', type=str, default="demo", help=''); + +opt = parser.parse_args(); + + +# ==================== RUN EVALUATION ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +s.evaluate(opt, videofile=opt.videofile) diff --git a/syncnet_python-master/detectors/README.md b/syncnet_python-master/detectors/README.md new file mode 100755 index 00000000..f5a8d4fe --- /dev/null +++ b/syncnet_python-master/detectors/README.md @@ -0,0 +1,3 @@ +# Face detector + +This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`. diff --git a/syncnet_python-master/detectors/__init__.py b/syncnet_python-master/detectors/__init__.py new file mode 100755 index 00000000..059d49bf --- /dev/null +++ b/syncnet_python-master/detectors/__init__.py @@ -0,0 +1 @@ +from .s3fd import S3FD \ No newline at end of file diff --git a/syncnet_python-master/detectors/__pycache__/__init__.cpython-39.pyc b/syncnet_python-master/detectors/__pycache__/__init__.cpython-39.pyc new file mode 100755 index 00000000..df49682d Binary files /dev/null and b/syncnet_python-master/detectors/__pycache__/__init__.cpython-39.pyc differ diff --git a/syncnet_python-master/detectors/s3fd/__init__.py b/syncnet_python-master/detectors/s3fd/__init__.py new file mode 100755 index 00000000..d7f35e05 --- /dev/null +++ b/syncnet_python-master/detectors/s3fd/__init__.py @@ -0,0 +1,61 @@ +import time +import numpy as np +import cv2 +import torch +from torchvision import transforms +from .nets import S3FDNet +from .box_utils import nms_ + +PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth' +img_mean 
= np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + + +class S3FD(): + + def __init__(self, device='cuda'): + + tstamp = time.time() + self.device = device + + print('[S3FD] loading with', self.device) + self.net = S3FDNet(device=self.device).to(self.device) + state_dict = torch.load(PATH_WEIGHT, map_location=self.device) + self.net.load_state_dict(state_dict) + self.net.eval() + print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + + def detect_faces(self, image, conf_th=0.8, scales=[1]): + + w, h = image.shape[1], image.shape[0] + + bboxes = np.empty(shape=(0, 5)) + + with torch.no_grad(): + for s in scales: + scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) + + scaled_img = np.swapaxes(scaled_img, 1, 2) + scaled_img = np.swapaxes(scaled_img, 1, 0) + scaled_img = scaled_img[[2, 1, 0], :, :] + scaled_img = scaled_img.astype('float32') + scaled_img -= img_mean + scaled_img = scaled_img[[2, 1, 0], :, :] + x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device) + y = self.net(x) + + detections = y.data + scale = torch.Tensor([w, h, w, h]) + + for i in range(detections.size(1)): + j = 0 + while detections[0, i, j, 0] > conf_th: + score = detections[0, i, j, 0] + pt = (detections[0, i, j, 1:] * scale).cpu().numpy() + bbox = (pt[0], pt[1], pt[2], pt[3], score) + bboxes = np.vstack((bboxes, bbox)) + j += 1 + + keep = nms_(bboxes, 0.1) + bboxes = bboxes[keep] + + return bboxes diff --git a/syncnet_python-master/detectors/s3fd/__pycache__/__init__.cpython-39.pyc b/syncnet_python-master/detectors/s3fd/__pycache__/__init__.cpython-39.pyc new file mode 100755 index 00000000..4e8df576 Binary files /dev/null and b/syncnet_python-master/detectors/s3fd/__pycache__/__init__.cpython-39.pyc differ diff --git a/syncnet_python-master/detectors/s3fd/__pycache__/box_utils.cpython-39.pyc b/syncnet_python-master/detectors/s3fd/__pycache__/box_utils.cpython-39.pyc new file mode 100755 index 
00000000..1938a8eb Binary files /dev/null and b/syncnet_python-master/detectors/s3fd/__pycache__/box_utils.cpython-39.pyc differ diff --git a/syncnet_python-master/detectors/s3fd/__pycache__/nets.cpython-39.pyc b/syncnet_python-master/detectors/s3fd/__pycache__/nets.cpython-39.pyc new file mode 100755 index 00000000..f83ad53f Binary files /dev/null and b/syncnet_python-master/detectors/s3fd/__pycache__/nets.cpython-39.pyc differ diff --git a/syncnet_python-master/detectors/s3fd/box_utils.py b/syncnet_python-master/detectors/s3fd/box_utils.py new file mode 100755 index 00000000..0779bcd5 --- /dev/null +++ b/syncnet_python-master/detectors/s3fd/box_utils.py @@ -0,0 +1,217 @@ +import numpy as np +from itertools import product as product +import torch +from torch.autograd import Function + + +def nms_(dets, thresh): + """ + Courtesy of Ross Girshick + [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(int(i)) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return np.array(keep).astype(np.int) + + +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat(( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. + """ + + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + 
h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count + + +class Detect(object): + + def __init__(self, num_classes=2, + top_k=750, nms_thresh=0.3, conf_thresh=0.05, + variance=[0.1, 0.2], nms_top_k=5000): + + self.num_classes = num_classes + self.top_k = top_k + self.nms_thresh = nms_thresh + self.conf_thresh = conf_thresh + self.variance = variance + self.nms_top_k = nms_top_k + + def forward(self, loc_data, conf_data, prior_data): + + num = loc_data.size(0) + num_priors = prior_data.size(0) + + conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) + batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) + batch_priors = batch_priors.contiguous().view(-1, 4) + + decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance) + decoded_boxes = decoded_boxes.view(num, num_priors, 4) + + output = torch.zeros(num, self.num_classes, self.top_k, 5) + + for i in range(num): + boxes = decoded_boxes[i].clone() + conf_scores = conf_preds[i].clone() + + for cl in range(1, self.num_classes): + c_mask = conf_scores[cl].gt(self.conf_thresh) + scores = conf_scores[cl][c_mask] + + if scores.dim() == 0: + continue + l_mask = c_mask.unsqueeze(1).expand_as(boxes) + boxes_ = boxes[l_mask].view(-1, 4) + ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k) + count = count if count < self.top_k else self.top_k + + output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) + + return output + + +class PriorBox(object): + + def __init__(self, input_size, feature_maps, + 
variance=[0.1, 0.2], + min_sizes=[16, 32, 64, 128, 256, 512], + steps=[4, 8, 16, 32, 64, 128], + clip=False): + + super(PriorBox, self).__init__() + + self.imh = input_size[0] + self.imw = input_size[1] + self.feature_maps = feature_maps + + self.variance = variance + self.min_sizes = min_sizes + self.steps = steps + self.clip = clip + + def forward(self): + mean = [] + for k, fmap in enumerate(self.feature_maps): + feath = fmap[0] + featw = fmap[1] + for i, j in product(range(feath), range(featw)): + f_kw = self.imw / self.steps[k] + f_kh = self.imh / self.steps[k] + + cx = (j + 0.5) / f_kw + cy = (i + 0.5) / f_kh + + s_kw = self.min_sizes[k] / self.imw + s_kh = self.min_sizes[k] / self.imh + + mean += [cx, cy, s_kw, s_kh] + + output = torch.FloatTensor(mean).view(-1, 4) + + if self.clip: + output.clamp_(max=1, min=0) + + return output diff --git a/syncnet_python-master/detectors/s3fd/nets.py b/syncnet_python-master/detectors/s3fd/nets.py new file mode 100755 index 00000000..85b5c82c --- /dev/null +++ b/syncnet_python-master/detectors/s3fd/nets.py @@ -0,0 +1,174 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from .box_utils import Detect, PriorBox + + +class L2Norm(nn.Module): + + def __init__(self, n_channels, scale): + super(L2Norm, self).__init__() + self.n_channels = n_channels + self.gamma = scale or None + self.eps = 1e-10 + self.weight = nn.Parameter(torch.Tensor(self.n_channels)) + self.reset_parameters() + + def reset_parameters(self): + init.constant_(self.weight, self.gamma) + + def forward(self, x): + norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps + x = torch.div(x, norm) + out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x + return out + + +class S3FDNet(nn.Module): + + def __init__(self, device='cuda'): + super(S3FDNet, self).__init__() + self.device = device + + self.vgg = nn.ModuleList([ + nn.Conv2d(3, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + 
nn.Conv2d(64, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(64, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(128, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2, ceil_mode=True), + + nn.Conv2d(256, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 1, 1), + nn.ReLU(inplace=True), + ]) + + self.L2Norm3_3 = L2Norm(256, 10) + self.L2Norm4_3 = L2Norm(512, 8) + self.L2Norm5_3 = L2Norm(512, 5) + + self.extras = nn.ModuleList([ + nn.Conv2d(1024, 256, 1, 1), + nn.Conv2d(256, 512, 3, 2, padding=1), + nn.Conv2d(512, 128, 1, 1), + nn.Conv2d(128, 256, 3, 2, padding=1), + ]) + + self.loc = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(1024, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(256, 4, 3, 1, padding=1), + ]) + + self.conf = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(1024, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(256, 2, 3, 1, padding=1), + ]) + + self.softmax = nn.Softmax(dim=-1) + self.detect = Detect() + + def forward(self, x): + size = x.size()[2:] + sources = list() + loc 
= list() + conf = list() + + for k in range(16): + x = self.vgg[k](x) + s = self.L2Norm3_3(x) + sources.append(s) + + for k in range(16, 23): + x = self.vgg[k](x) + s = self.L2Norm4_3(x) + sources.append(s) + + for k in range(23, 30): + x = self.vgg[k](x) + s = self.L2Norm5_3(x) + sources.append(s) + + for k in range(30, len(self.vgg)): + x = self.vgg[k](x) + sources.append(x) + + # apply extra layers and cache source layer outputs + for k, v in enumerate(self.extras): + x = F.relu(v(x), inplace=True) + if k % 2 == 1: + sources.append(x) + + # apply multibox head to source layers + loc_x = self.loc[0](sources[0]) + conf_x = self.conf[0](sources[0]) + + max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) + conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) + + loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) + conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) + + for i in range(1, len(sources)): + x = sources[i] + conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) + loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) + + features_maps = [] + for i in range(len(loc)): + feat = [] + feat += [loc[i].size(1), loc[i].size(2)] + features_maps += [feat] + + loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) + conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) + + with torch.no_grad(): + self.priorbox = PriorBox(size, features_maps) + self.priors = self.priorbox.forward() + + output = self.detect.forward( + loc.view(loc.size(0), -1, 4), + self.softmax(conf.view(conf.size(0), -1, 2)), + self.priors.type(type(x.data)).to(self.device) + ) + + return output diff --git a/syncnet_python-master/detectors/s3fd/weights/sfd_face.pth b/syncnet_python-master/detectors/s3fd/weights/sfd_face.pth new file mode 100755 index 00000000..7a577e1e --- /dev/null +++ b/syncnet_python-master/detectors/s3fd/weights/sfd_face.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d54a87c2b7543b64729c9a25eafd188da15fd3f6e02f0ecec76ae1b30d86c491 +size 89844381 diff --git a/syncnet_python-master/download_model.sh b/syncnet_python-master/download_model.sh new file mode 100755 index 00000000..3e3a9dc2 --- /dev/null +++ b/syncnet_python-master/download_model.sh @@ -0,0 +1,9 @@ +# SyncNet model + +mkdir data +wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model +wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi + +# For the pre-processing pipeline +mkdir detectors/s3fd/weights +wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth \ No newline at end of file diff --git a/syncnet_python-master/fid_log.txt b/syncnet_python-master/fid_log.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/img/ex1.jpg b/syncnet_python-master/img/ex1.jpg new file mode 100755 index 00000000..b20b57e1 Binary files /dev/null and b/syncnet_python-master/img/ex1.jpg differ diff --git a/syncnet_python-master/img/ex2.jpg b/syncnet_python-master/img/ex2.jpg new file mode 100755 index 00000000..851402cc Binary files /dev/null and b/syncnet_python-master/img/ex2.jpg differ diff --git a/syncnet_python-master/niqe_log.txt b/syncnet_python-master/niqe_log.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/out_video/Obama_Obama.mp4 b/syncnet_python-master/out_video/Obama_Obama.mp4 new file mode 100755 index 00000000..7febe228 --- /dev/null +++ b/syncnet_python-master/out_video/Obama_Obama.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dfc9a182c0b6bb497f5cb98d948c9fa24f0299f6161a12bd5c753ce15e7f44d +size 7483658 diff --git a/syncnet_python-master/psnr_log.txt b/syncnet_python-master/psnr_log.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/ref_video/Obama.mp4 
b/syncnet_python-master/ref_video/Obama.mp4 new file mode 100755 index 00000000..f14145dc --- /dev/null +++ b/syncnet_python-master/ref_video/Obama.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a01306b103de1ecdda339efa6f65aa34b90c5cc6d0f03d28beaddb6afba3dab +size 25431660 diff --git a/syncnet_python-master/requirements.txt b/syncnet_python-master/requirements.txt new file mode 100755 index 00000000..e2818374 --- /dev/null +++ b/syncnet_python-master/requirements.txt @@ -0,0 +1,13 @@ +numpy==1.23.0 +scenedetect==0.6.0 +scikit-image==0.24.0 +scikit-learn==1.6.0 +opencv-contrib-python +python_speech_features +pyworld +pyyaml +pytorch-lightning==1.9.0 # 确保与 torch 1.11.0 兼容 +imageio +opencv-python +scipy + diff --git a/syncnet_python-master/run_pipeline.py b/syncnet_python-master/run_pipeline.py new file mode 100755 index 00000000..f2589fb9 --- /dev/null +++ b/syncnet_python-master/run_pipeline.py @@ -0,0 +1,322 @@ +#!/usr/bin/python + +import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2 +import numpy as np +from shutil import rmtree + +import scenedetect +from scenedetect.video_manager import VideoManager +from scenedetect.scene_manager import SceneManager +from scenedetect.frame_timecode import FrameTimecode +from scenedetect.stats_manager import StatsManager +from scenedetect.detectors import ContentDetector + +from scipy.interpolate import interp1d +from scipy.io import wavfile +from scipy import signal + +from detectors import S3FD + +# ========== ========== ========== ========== +# # PARSE ARGS +# ========== ========== ========== ========== + +parser = argparse.ArgumentParser(description = "FaceTracker"); +parser.add_argument('--data_dir', type=str, default='/app/data/work', help='Output direcotry'); +parser.add_argument('--videofile', type=str, default='', help='Input video file'); +parser.add_argument('--reference', type=str, default='', help='Video reference'); +parser.add_argument('--facedet_scale', type=float, 
default=0.25, help='Scale factor for face detection'); +parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box'); +parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration'); +parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); +parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped'); +parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels'); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) +setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) + +# ========== ========== ========== ========== +# # IOU FUNCTION +# ========== ========== ========== ========== + +def bb_intersection_over_union(boxA, boxB): + + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[2], boxB[2]) + yB = min(boxA[3], boxB[3]) + + interArea = max(0, xB - xA) * max(0, yB - yA) + + boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1]) + boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1]) + + iou = interArea / float(boxAArea + boxBArea - interArea) + + return iou + +# ========== ========== ========== ========== +# # FACE TRACKING +# ========== ========== ========== ========== + +def track_shot(opt,scenefaces): + + iouThres = 0.5 # Minimum IOU between consecutive face detections + tracks = [] + + while True: + track = [] + for framefaces in scenefaces: + for face in framefaces: + if track == []: + track.append(face) + framefaces.remove(face) + elif face['frame'] - track[-1]['frame'] <= opt.num_failed_det: + iou = bb_intersection_over_union(face['bbox'], track[-1]['bbox']) + if iou > iouThres: + track.append(face) + framefaces.remove(face) + 
continue + else: + break + + if track == []: + break + elif len(track) > opt.min_track: + + framenum = np.array([ f['frame'] for f in track ]) + bboxes = np.array([np.array(f['bbox']) for f in track]) + + frame_i = np.arange(framenum[0],framenum[-1]+1) + + bboxes_i = [] + for ij in range(0,4): + interpfn = interp1d(framenum, bboxes[:,ij]) + bboxes_i.append(interpfn(frame_i)) + bboxes_i = np.stack(bboxes_i, axis=1) + + if max(np.mean(bboxes_i[:,2]-bboxes_i[:,0]), np.mean(bboxes_i[:,3]-bboxes_i[:,1])) > opt.min_face_size: + tracks.append({'frame':frame_i,'bbox':bboxes_i}) + + return tracks + +# ========== ========== ========== ========== +# # VIDEO CROP AND SAVE +# ========== ========== ========== ========== + +def crop_video(opt,track,cropfile): + + flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) + flist.sort() + + fourcc = cv2.VideoWriter_fourcc(*'XVID') + vOut = cv2.VideoWriter(cropfile+'t.avi', fourcc, opt.frame_rate, (224,224)) + + dets = {'x':[], 'y':[], 's':[]} + + for det in track['bbox']: + + dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) + dets['y'].append((det[1]+det[3])/2) # crop center x + dets['x'].append((det[0]+det[2])/2) # crop center y + + # Smooth detections + dets['s'] = signal.medfilt(dets['s'],kernel_size=13) + dets['x'] = signal.medfilt(dets['x'],kernel_size=13) + dets['y'] = signal.medfilt(dets['y'],kernel_size=13) + + for fidx, frame in enumerate(track['frame']): + + cs = opt.crop_scale + + bs = dets['s'][fidx] # Detection box size + bsi = int(bs*(1+2*cs)) # Pad videos by this amount + + image = cv2.imread(flist[frame]) + + frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110)) + my = dets['y'][fidx]+bsi # BBox center Y + mx = dets['x'][fidx]+bsi # BBox center X + + face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))] + + vOut.write(cv2.resize(face,(224,224))) + + audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav') + audiostart = 
(track['frame'][0])/opt.frame_rate + audioend = (track['frame'][-1]+1)/opt.frame_rate + + vOut.release() + + # ========== CROP AUDIO FILE ========== + + command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp)) + output = subprocess.call(command, shell=True, stdout=None) + + if output != 0: + pdb.set_trace() + + sample_rate, audio = wavfile.read(audiotmp) + + # ========== COMBINE AUDIO AND VIDEO FILES ========== + + command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile)) + output = subprocess.call(command, shell=True, stdout=None) + + if output != 0: + pdb.set_trace() + + print('Written %s'%cropfile) + + os.remove(cropfile+'t.avi') + + print('Mean pos: x %.2f y %.2f s %.2f'%(np.mean(dets['x']),np.mean(dets['y']),np.mean(dets['s']))) + + return {'track':track, 'proc_track':dets} + +# ========== ========== ========== ========== +# # FACE DETECTION +# ========== ========== ========== ========== + +def inference_video(opt): + + DET = S3FD(device='cuda') + + flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) + flist.sort() + + dets = [] + + for fidx, fname in enumerate(flist): + + start_time = time.time() + + image = cv2.imread(fname) + + image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + bboxes = DET.detect_faces(image_np, conf_th=0.9, scales=[opt.facedet_scale]) + + dets.append([]); + for bbox in bboxes: + dets[-1].append({'frame':fidx, 'bbox':(bbox[:-1]).tolist(), 'conf':bbox[-1]}) + + elapsed_time = time.time() - start_time + + print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time))) + + savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl') + + with open(savepath, 'wb') as fil: + pickle.dump(dets, fil) + + return dets + +# ========== ========== ========== ========== +# # SCENE DETECTION +# ========== ========== ========== ========== + +def 
scene_detect(opt): + + video_manager = VideoManager([os.path.join(opt.avi_dir,opt.reference,'video.avi')]) + stats_manager = StatsManager() + scene_manager = SceneManager(stats_manager) + # Add ContentDetector algorithm (constructor takes detector options like threshold). + scene_manager.add_detector(ContentDetector()) + base_timecode = video_manager.get_base_timecode() + + video_manager.set_downscale_factor() + + video_manager.start() + + scene_manager.detect_scenes(frame_source=video_manager) + + scene_list = scene_manager.get_scene_list(base_timecode) + + savepath = os.path.join(opt.work_dir,opt.reference,'scene.pckl') + + if scene_list == []: + scene_list = [(video_manager.get_base_timecode(),video_manager.get_current_timecode())] + + with open(savepath, 'wb') as fil: + pickle.dump(scene_list, fil) + + # print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list))) + + return scene_list + + +# ========== ========== ========== ========== +# # EXECUTE DEMO +# ========== ========== ========== ========== + +# ========== DELETE EXISTING DIRECTORIES ========== + +if os.path.exists(os.path.join(opt.work_dir,opt.reference)): + rmtree(os.path.join(opt.work_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.crop_dir,opt.reference)): + rmtree(os.path.join(opt.crop_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.avi_dir,opt.reference)): + rmtree(os.path.join(opt.avi_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.frames_dir,opt.reference)): + rmtree(os.path.join(opt.frames_dir,opt.reference)) + +if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)): + rmtree(os.path.join(opt.tmp_dir,opt.reference)) + +# ========== MAKE NEW DIRECTORIES ========== + +os.makedirs(os.path.join(opt.work_dir,opt.reference)) +os.makedirs(os.path.join(opt.crop_dir,opt.reference)) +os.makedirs(os.path.join(opt.avi_dir,opt.reference)) +os.makedirs(os.path.join(opt.frames_dir,opt.reference)) 
+os.makedirs(os.path.join(opt.tmp_dir,opt.reference)) + +# ========== CONVERT VIDEO AND EXTRACT FRAMES ========== + +command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi'))) +output = subprocess.call(command, shell=True, stdout=None) + +command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg'))) +output = subprocess.call(command, shell=True, stdout=None) + +command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'))) +output = subprocess.call(command, shell=True, stdout=None) + +# ========== FACE DETECTION ========== + +faces = inference_video(opt) + +# ========== SCENE DETECTION ========== + +scene = scene_detect(opt) + +# ========== FACE TRACKING ========== + +alltracks = [] +vidtracks = [] + +for shot in scene: + + if shot[1].frame_num - shot[0].frame_num >= opt.min_track : + alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num])) + +# ========== FACE TRACK CROP ========== + +for ii, track in enumerate(alltracks): + vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii))) + +# ========== SAVE RESULTS ========== + +savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl') + +with open(savepath, 'wb') as fil: + pickle.dump(vidtracks, fil) + +rmtree(os.path.join(opt.tmp_dir,opt.reference)) diff --git a/syncnet_python-master/run_syncnet.py b/syncnet_python-master/run_syncnet.py new file mode 100755 index 00000000..45099fd6 --- /dev/null +++ b/syncnet_python-master/run_syncnet.py @@ -0,0 +1,45 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import time, pdb, argparse, subprocess, pickle, os, gzip, glob + +from SyncNetInstance import * + +# ==================== PARSE ARGUMENT ==================== + 
+parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help=''); +parser.add_argument('--batch_size', type=int, default='20', help=''); +parser.add_argument('--vshift', type=int, default='15', help=''); +parser.add_argument('--data_dir', type=str, default='data/work', help=''); +parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) + + +# ==================== LOAD MODEL AND FILE LIST ==================== + +s = SyncNetInstance(); + +s.loadParameters(opt.initial_model); +print("Model %s loaded."%opt.initial_model); + +flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi')) +flist.sort() + +# ==================== GET OFFSETS ==================== + +dists = [] +for idx, fname in enumerate(flist): + offset, conf, dist = s.evaluate(opt,videofile=fname) + dists.append(dist) + +# ==================== PRINT RESULTS TO FILE ==================== + +with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil: + pickle.dump(dists, fil) diff --git a/syncnet_python-master/run_visualise.py b/syncnet_python-master/run_visualise.py new file mode 100755 index 00000000..85d89253 --- /dev/null +++ b/syncnet_python-master/run_visualise.py @@ -0,0 +1,88 @@ +#!/usr/bin/python +#-*- coding: utf-8 -*- + +import torch +import numpy +import time, pdb, argparse, subprocess, pickle, os, glob +import cv2 + +from scipy import signal + +# ==================== PARSE ARGUMENT ==================== + +parser = argparse.ArgumentParser(description = "SyncNet"); +parser.add_argument('--data_dir', type=str, default='data/work', help=''); 
+parser.add_argument('--videofile', type=str, default='', help=''); +parser.add_argument('--reference', type=str, default='', help=''); +parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate'); +opt = parser.parse_args(); + +setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi')) +setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp')) +setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork')) +setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop')) +setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes')) + +# ==================== LOAD FILES ==================== + +with open(os.path.join(opt.work_dir,opt.reference,'tracks.pckl'), 'rb') as fil: + tracks = pickle.load(fil, encoding='latin1') + +with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'rb') as fil: + dists = pickle.load(fil, encoding='latin1') + +flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg')) +flist.sort() + +# ==================== SMOOTH FACES ==================== + +faces = [[] for i in range(len(flist))] + +for tidx, track in enumerate(tracks): + + mean_dists = numpy.mean(numpy.stack(dists[tidx],1),1) + minidx = numpy.argmin(mean_dists,0) + minval = mean_dists[minidx] + + fdist = numpy.stack([dist[minidx] for dist in dists[tidx]]) + fdist = numpy.pad(fdist, (3,3), 'constant', constant_values=10) + + fconf = numpy.median(mean_dists) - fdist + fconfm = signal.medfilt(fconf,kernel_size=9) + + for fidx, frame in enumerate(track['track']['frame'].tolist()) : + faces[frame].append({'track': tidx, 'conf':fconfm[fidx], 's':track['proc_track']['s'][fidx], 'x':track['proc_track']['x'][fidx], 'y':track['proc_track']['y'][fidx]}) + +# ==================== ADD DETECTIONS TO VIDEO ==================== + +first_image = cv2.imread(flist[0]) + +fw = first_image.shape[1] +fh = first_image.shape[0] + +fourcc = cv2.VideoWriter_fourcc(*'XVID') +vOut = cv2.VideoWriter(os.path.join(opt.avi_dir,opt.reference,'video_only.avi'), fourcc, opt.frame_rate, 
(fw,fh)) + +for fidx, fname in enumerate(flist): + + image = cv2.imread(fname) + + for face in faces[fidx]: + + clr = max(min(face['conf']*25,255),0) + + cv2.rectangle(image,(int(face['x']-face['s']),int(face['y']-face['s'])),(int(face['x']+face['s']),int(face['y']+face['s'])),(0,clr,255-clr),3) + cv2.putText(image,'Track %d, Conf %.3f'%(face['track'],face['conf']), (int(face['x']-face['s']),int(face['y']-face['s'])),cv2.FONT_HERSHEY_SIMPLEX,0.5,(255,255,255),2) + + vOut.write(image) + + print('Frame %d'%fidx) + +vOut.release() + +# ========== COMBINE AUDIO AND VIDEO FILES ========== + +command = ("ffmpeg -y -i %s -i %s -c:v copy -c:a copy %s" % (os.path.join(opt.avi_dir,opt.reference,'video_only.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav'),os.path.join(opt.avi_dir,opt.reference,'video_out.avi'))) #-async 1 +output = subprocess.call(command, shell=True, stdout=None) + + diff --git a/syncnet_python-master/ssim_log.txt b/syncnet_python-master/ssim_log.txt new file mode 100755 index 00000000..e69de29b diff --git a/syncnet_python-master/tmp_dir/pyavi/wav2lip/audio.wav b/syncnet_python-master/tmp_dir/pyavi/wav2lip/audio.wav new file mode 100755 index 00000000..2d00aca4 --- /dev/null +++ b/syncnet_python-master/tmp_dir/pyavi/wav2lip/audio.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd7a5f093fc3f55a9bafdc161ada7923e1756288937eaada330a9ebc5ce0a828 +size 18270746 diff --git a/syncnet_python-master/tmp_dir/pyavi/wav2lip/video.avi b/syncnet_python-master/tmp_dir/pyavi/wav2lip/video.avi new file mode 100755 index 00000000..21dea003 Binary files /dev/null and b/syncnet_python-master/tmp_dir/pyavi/wav2lip/video.avi differ diff --git a/syncnet_python-master/tmp_dir/pycrop/wav2lip/00000.avi b/syncnet_python-master/tmp_dir/pycrop/wav2lip/00000.avi new file mode 100755 index 00000000..660f527c Binary files /dev/null and b/syncnet_python-master/tmp_dir/pycrop/wav2lip/00000.avi differ diff --git 
a/syncnet_python-master/tmp_dir/pyframes/wav2lip/Obama_Obama.mp4 b/syncnet_python-master/tmp_dir/pyframes/wav2lip/Obama_Obama.mp4 new file mode 100755 index 00000000..7febe228 --- /dev/null +++ b/syncnet_python-master/tmp_dir/pyframes/wav2lip/Obama_Obama.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dfc9a182c0b6bb497f5cb98d948c9fa24f0299f6161a12bd5c753ce15e7f44d +size 7483658 diff --git a/syncnet_python-master/tmp_dir/pytmp/wav2lip/Obama_Obama.mp4 b/syncnet_python-master/tmp_dir/pytmp/wav2lip/Obama_Obama.mp4 new file mode 100755 index 00000000..7febe228 --- /dev/null +++ b/syncnet_python-master/tmp_dir/pytmp/wav2lip/Obama_Obama.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dfc9a182c0b6bb497f5cb98d948c9fa24f0299f6161a12bd5c753ce15e7f44d +size 7483658 diff --git a/syncnet_python-master/tmp_dir/pywork/wav2lip/activesd.pckl b/syncnet_python-master/tmp_dir/pywork/wav2lip/activesd.pckl new file mode 100755 index 00000000..6eb466d5 Binary files /dev/null and b/syncnet_python-master/tmp_dir/pywork/wav2lip/activesd.pckl differ diff --git a/syncnet_python-master/tmp_dir/pywork/wav2lip/faces.pckl b/syncnet_python-master/tmp_dir/pywork/wav2lip/faces.pckl new file mode 100755 index 00000000..0a1e5f4b Binary files /dev/null and b/syncnet_python-master/tmp_dir/pywork/wav2lip/faces.pckl differ diff --git a/syncnet_python-master/tmp_dir/pywork/wav2lip/scene.pckl b/syncnet_python-master/tmp_dir/pywork/wav2lip/scene.pckl new file mode 100755 index 00000000..bedaf3d7 Binary files /dev/null and b/syncnet_python-master/tmp_dir/pywork/wav2lip/scene.pckl differ diff --git a/syncnet_python-master/tmp_dir/pywork/wav2lip/tracks.pckl b/syncnet_python-master/tmp_dir/pywork/wav2lip/tracks.pckl new file mode 100755 index 00000000..45374675 Binary files /dev/null and b/syncnet_python-master/tmp_dir/pywork/wav2lip/tracks.pckl differ