49 changes: 49 additions & 0 deletions syncnet_python-master/Audio2Head/Audio2Head/README.md
# Audio2Head: Audio-driven One-shot Talking-head Generation with Natural Head Motion (IJCAI 2021)

#### [Paper](https://www.ijcai.org/proceedings/2021/0152.pdf) | [Demo](https://www.youtube.com/watch?v=xvcBJ29l8rA)

#### Requirements

- Python 3.6, PyTorch >= 1.6, and ffmpeg

- Other requirements are listed in `requirements.txt`



#### Pretrained Checkpoint

Please download the pretrained checkpoint from [google-drive](https://drive.google.com/file/d/1tvI43ZIrnx9Ti2TpFiEO4dK5DOwcECD7/view?usp=sharing) and place it in the `/checkpoints` folder.
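If you prefer to script the download, here is a minimal sketch using the `gdown` package (an assumption, not part of this repo; any Google Drive client works). The file ID is taken from the link above; the output filename `audio2head.pth.tar` is a hypothetical placeholder, so keep whatever name `inference.py` expects in your copy of the repo.

```
# Sketch: fetch the pretrained checkpoint into ./checkpoints (assumes `pip install gdown`).
import os
import gdown

os.makedirs("checkpoints", exist_ok=True)

# File ID copied from the Google Drive link above; output name is a placeholder.
gdown.download(
    id="1tvI43ZIrnx9Ti2TpFiEO4dK5DOwcECD7",
    output="checkpoints/audio2head.pth.tar",
    quiet=False,
)
```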



#### Generate Demo Results

```
python inference.py --audio_path xxx.wav --img_path xxx.jpg
```

Note that the input image must have the same height and width, and the face should be appropriately cropped, as in the examples in `/demo/img`.
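A minimal preprocessing sketch, assuming Pillow is installed: it center-crops the source image to a square and resizes it to 256x256 (the `frame_shape` used in `config/vox-256.yaml`), then invokes `inference.py` with the documented flags. The file names are placeholders.

```
# Sketch: make the input image square before running inference (assumes Pillow).
import subprocess
from PIL import Image

src = "face.jpg"            # placeholder input image
prepared = "face_square.jpg"

img = Image.open(src).convert("RGB")
side = min(img.size)        # center-crop to the largest square
left = (img.width - side) // 2
top = (img.height - side) // 2
img = img.crop((left, top, left + side, top + side)).resize((256, 256))
img.save(prepared)

# Run the demo script with the documented flags.
subprocess.run(
    ["python", "inference.py", "--audio_path", "audio.wav", "--img_path", prepared],
    check=True,
)
```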



#### License and Citation

```
@InProceedings{wang2021audio2head,
  author    = {Suzhen Wang and Lincheng Li and Yu Ding and Changjie Fan and Xin Yu},
  title     = {Audio2Head: Audio-driven One-shot Talking-head Generation with Natural Head Motion},
  booktitle = {the 30th International Joint Conference on Artificial Intelligence (IJCAI-21)},
  year      = {2021},
}
```



#### Acknowledgement

This codebase is built on [First Order Motion Model](https://github.com/AliaksandrSiarohin/first-order-model); thanks to the authors for their contribution.





8 changes: 8 additions & 0 deletions (file path not shown in the diff; presumably syncnet_python-master/Audio2Head/Audio2Head/config/parameters.yaml)
block_expansion: 32
estimate_jacobian: true
max_features: 512
num_blocks: 5
num_kp: 10
num_w: 2
seq: true
seq_len: 64
83 changes: 83 additions & 0 deletions syncnet_python-master/Audio2Head/Audio2Head/config/vox-256.yaml
dataset_params:
  root_dir: /root/
  frame_shape: [256, 256, 3]
  id_sampling: True
  pairs_list: data/vox256.csv
  augmentation_params:
    flip_param:
      horizontal_flip: True
      time_flip: True
    jitter_param:
      brightness: 0.1
      contrast: 0.1
      saturation: 0.1
      hue: 0.1

model_params:
  common_params:
    num_kp: 10
    num_channels: 3
    estimate_jacobian: True
  kp_detector_params:
    temperature: 0.1
    block_expansion: 32
    max_features: 1024
    scale_factor: 0.25
    num_blocks: 5
  generator_params:
    block_expansion: 64
    max_features: 512
    num_down_blocks: 2
    num_bottleneck_blocks: 6
    estimate_occlusion_map: True
    dense_motion_params:
      block_expansion: 64
      max_features: 1024
      num_blocks: 5
      scale_factor: 0.25
  discriminator_params:
    scales: [1]
    block_expansion: 32
    max_features: 512
    num_blocks: 4
    sn: True

train_params:
  num_epochs: 100
  num_repeats: 50
  epoch_milestones: [5, 20, 30]
  lr_generator: 2.0e-4
  lr_discriminator: 2.0e-4
  lr_kp_detector: 2.0e-4
  batch_size: 36
  scales: [1, 0.5, 0.25, 0.125]
  checkpoint_freq: 10
  transform_params:
    sigma_affine: 0.05
    sigma_tps: 0.005
    points_tps: 5
  loss_weights:
    generator_gan: 0
    discriminator_gan: 1
    feature_matching: [10, 10, 10, 10]
    perceptual: [10, 10, 10, 10, 10]
    equivariance_value: 10
    equivariance_jacobian: 10

reconstruction_params:
  num_videos: 1000
  format: '.mp4'

animate_params:
  num_pairs: 50
  format: '.mp4'
  normalization_params:
    adapt_movement_scale: False
    use_relative_movement: True
    use_relative_jacobian: True

visualizer_params:
  kp_size: 5
  draw_border: True
  colormap: 'gist_rainbow'
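These configs are plain YAML, so they can be inspected programmatically. A minimal sketch, assuming `pyyaml` is installed and the file lives at `config/vox-256.yaml` as shown above:

```
# Sketch: load the training config and read a few fields (assumes `pip install pyyaml`).
import yaml

with open("config/vox-256.yaml") as f:
    config = yaml.safe_load(f)

print(config["dataset_params"]["frame_shape"])            # [256, 256, 3]
print(config["model_params"]["common_params"]["num_kp"])  # 10
print(config["train_params"]["batch_size"])               # 36
```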