Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions HuaWeiExperiment/.idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions HuaWeiExperiment/.idea/HuaWeiExperiment.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions HuaWeiExperiment/.idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions HuaWeiExperiment/.idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions HuaWeiExperiment/ReadMe.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
具体细节如环境配置、数据处理、模型训练、评估等,请参见文件“语音识别-华为实验-wavenet.docx”,这里仅简单介绍我所做的工作。

我选择的项目是wavenet。

因为数据集太大,所以没有在此处放数据集,如果需要,请在 https://keithito.com/LJ-Speech-Dataset/ 处(2.6G)下载,并且依照文档进行预处理(预处理结束后需要约20多G空间)。

因为我的电脑过于拉胯,迫于硬件限制,我只能使用CPU进行训练,同时为了节约时间,我将训练的epoch次数设置为1(原先为2000),尽管如此,训练模型依旧花费了约4个半小时,并且因为epoch过小,所以结果很差,敬请见谅。

实验结果在\wavenet\saveAudio内,其中形如xxx_gen.wav为生成的音频,而xxx_ref.wav为参考的音频。
16 changes: 16 additions & 0 deletions HuaWeiExperiment/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# 这是一个示例 Python 脚本。

# 按 Shift+F10 执行或将其替换为您的代码。
# 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。


def print_hi(name):
    """Print a greeting for *name* (PyCharm's generated sample function)."""
    # Place a breakpoint on the line below to debug the script.
    print(f'Hi, {name}')


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')

# Visit https://www.jetbrains.com/help/pycharm/ for PyCharm help.
1 change: 1 addition & 0 deletions HuaWeiExperiment/train.log
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
E:\anaconda3\envs\HuaWeiExperiment\python.exe: can't open file 'E:\python\pythonProjects\HuaWeiExperiment\train.py': [Errno 2] No such file or directory
305 changes: 305 additions & 0 deletions HuaWeiExperiment/wavenet/README.md

Large diffs are not rendered by default.

Binary file added HuaWeiExperiment/wavenet/WaveNet.mindir
Binary file not shown.
173 changes: 173 additions & 0 deletions HuaWeiExperiment/wavenet/audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import librosa
import librosa.filters
import numpy as np
from hparams import hparams
from scipy.io import wavfile
from nnmnkwii import preprocessing as P


def low_cut_filter(x, fs, cutoff=70):
    """Remove frequency content below ``cutoff`` Hz with an FIR high-pass.

    Adapted from https://github.com/kan-bayashi/PytorchWaveNetVocoder

    Args:
        x (ndarray): Waveform sequence.
        fs (int): Sampling frequency.
        cutoff (float): Cutoff frequency of the low cut filter in Hz.

    Returns:
        ndarray: Low cut filtered waveform sequence.
    """
    from scipy.signal import firwin, lfilter

    # firwin expects the cutoff normalized to the Nyquist frequency.
    normalized_cutoff = cutoff / (fs // 2)

    # 255-tap FIR high-pass; pass_zero=False rejects DC and low frequencies.
    taps = firwin(255, normalized_cutoff, pass_zero=False)
    return lfilter(taps, 1, x)


def load_wav(path):
    """Load a wav file as a float waveform in [-1, 1] at hparams.sample_rate.

    Args:
        path (str): Path to the wav file.

    Returns:
        ndarray: Waveform resampled to ``hparams.sample_rate`` and clipped
        to [-1.0, 1.0].
    """
    sr, x = wavfile.read(path)
    signed_int16_max = 2**15
    # int16 PCM is rescaled to [-1, 1); other dtypes are passed through
    # (presumably already float — TODO confirm for 24/32-bit inputs).
    if x.dtype == np.int16:
        x = x.astype(np.float32) / signed_int16_max
    if sr != hparams.sample_rate:
        # Keyword arguments: librosa >= 0.10 made orig_sr/target_sr
        # keyword-only, so the old positional call raises TypeError there.
        x = librosa.resample(x, orig_sr=sr, target_sr=hparams.sample_rate)
    x = np.clip(x, -1.0, 1.0)
    return x


def save_wav(wav, path):
    """Write a float waveform to *path* as 16-bit PCM at hparams.sample_rate.

    The signal is peak-normalized to the int16 range; the 0.01 floor keeps
    near-silent signals from being amplified by a huge gain (and avoids
    division by zero).

    Args:
        wav (ndarray): Waveform, nominally in [-1, 1].
        path (str): Output wav file path.
    """
    # Scale a copy instead of mutating the caller's array in place
    # (the original `wav *= ...` clobbered the input argument).
    peak = max(0.01, np.max(np.abs(wav)))
    scaled = wav * (32767 / peak)
    wavfile.write(path, hparams.sample_rate, scaled.astype(np.int16))


def trim(quantized):
    """Strip leading and trailing silence from a quantized waveform."""
    bounds = start_and_end_indices(quantized, hparams.silence_threshold)
    return quantized[bounds[0]:bounds[1]]


def preemphasis(x, coef=0.85):
    """Apply pre-emphasis to waveform *x* with coefficient *coef*
    (delegates to ``nnmnkwii.preprocessing.preemphasis``)."""
    return P.preemphasis(x, coef)


def inv_preemphasis(x, coef=0.85):
    """Undo ``preemphasis`` (delegates to
    ``nnmnkwii.preprocessing.inv_preemphasis``); use the same *coef* as
    was used for pre-emphasis."""
    return P.inv_preemphasis(x, coef)


def adjust_time_resolution(quantized, mel):
    """Match frame-level mel features to sample-level time resolution.

    Each mel frame is repeated by the (integer) number of samples per
    frame; any length remainder is zero-padded, then leading and trailing
    silence is trimmed from both sequences together.

    Args:
        quantized (ndarray): (T,)
        mel (ndarray): (N, D)

    Returns:
        tuple: Tuple of (T,) and (T, D)
    """
    assert quantized.ndim == 1
    assert mel.ndim == 2

    # Upsample by repeating every frame.
    repeat = quantized.size // mel.shape[0]
    upsampled = np.repeat(mel, repeat, axis=0)

    # Zero-pad the tail so the lengths match exactly.
    remainder = quantized.size - upsampled.shape[0]
    if remainder != 0:
        assert remainder > 0
        upsampled = np.pad(upsampled, [(0, remainder), (0, 0)],
                           mode="constant", constant_values=0)

    # Trim shared leading/trailing silence.
    start, end = start_and_end_indices(quantized, hparams.silence_threshold)
    return quantized[start:end], upsampled[start:end, :]


def start_and_end_indices(quantized, silence_threshold=2):
    """Return (start, end) indices bounding the non-silent part of *quantized*.

    The signal is quantized with 127 as the zero level; samples within
    ``silence_threshold`` of 127 count as silence.

    Args:
        quantized (ndarray): Quantized waveform (zero level at 127).
        silence_threshold (int): Half-width of the amplitude band treated
            as silence.

    Returns:
        tuple: (start, end) indices of the first and last voiced samples.

    Raises:
        AssertionError: If the whole signal is silent.  NOTE(review):
            asserts are stripped under ``python -O``; consider raising
            ValueError instead.
    """
    # First index whose amplitude leaves the silence band.
    for start in range(quantized.size):
        if abs(quantized[start] - 127) > silence_threshold:
            break
    # Last such index, scanning backwards (stops at index 2 at the earliest).
    for end in range(quantized.size - 1, 1, -1):
        if abs(quantized[end] - 127) > silence_threshold:
            break

    # The loop variables deliberately leak out of the for-loops; these
    # asserts fire when no voiced sample was found (loops never hit break).
    assert abs(quantized[start] - 127) > silence_threshold
    assert abs(quantized[end] - 127) > silence_threshold

    return start, end


def logmelspectrogram(y, pad_mode="reflect"):
    """Compute a log10 mel-spectrogram.

    Same log-melspectrogram computation as espnet
    (https://github.com/espnet/espnet,
    ``espnet.transform.spectrogram.logmelspectrogram``).

    Args:
        y (ndarray): Waveform.
        pad_mode (str): Padding mode forwarded to the STFT.

    Returns:
        ndarray: Log10 mel-spectrogram, floored at 1e-10 before the log.
    """
    magnitudes = np.abs(_stft(y, pad_mode=pad_mode))
    mel = _linear_to_mel(magnitudes)
    # Floor avoids log(0).
    return np.log10(np.maximum(mel, 1e-10))


def get_hop_size():
    """Return the STFT hop size in samples.

    Falls back to deriving it from ``hparams.frame_shift_ms`` when
    ``hparams.hop_size`` is unset.
    """
    if hparams.hop_size is not None:
        return hparams.hop_size
    assert hparams.frame_shift_ms is not None
    return int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)


def get_win_length():
    """Return the STFT window length in samples.

    A negative ``hparams.win_length`` means "derive it from
    ``hparams.win_length_ms`` instead".
    """
    if hparams.win_length >= 0:
        return hparams.win_length
    assert hparams.win_length_ms > 0
    return int(hparams.win_length_ms / 1000 * hparams.sample_rate)


def _stft(y, pad_mode="constant"):
    """Short-time Fourier transform with parameters taken from hparams.

    Defaults to constant (zero) padding instead of librosa's reflection
    padding; callers can override via *pad_mode*.
    """
    return librosa.stft(
        y=y,
        n_fft=hparams.fft_size,
        hop_length=get_hop_size(),
        win_length=get_win_length(),
        window=hparams.window,
        pad_mode=pad_mode,
    )


def pad_lr(x, fsize, fshift):
    """Return (left, right) padding: none on the left, one frame on the right.

    *x* and *fshift* are accepted for interface compatibility but unused.
    """
    return 0, fsize

# Conversions:


# Lazily-built mel filterbank cache (populated on first _linear_to_mel call).
_mel_basis = None


def _linear_to_mel(spectrogram):
    """Project a linear-frequency spectrogram onto the mel filterbank.

    The filterbank is built once on first use and cached in the
    module-level ``_mel_basis``.
    """
    global _mel_basis
    _mel_basis = _build_mel_basis() if _mel_basis is None else _mel_basis
    return np.dot(_mel_basis, spectrogram)


def _build_mel_basis():
    """Build the mel filterbank matrix used by ``_linear_to_mel``.

    Uses keyword arguments because ``librosa.filters.mel`` made its
    parameters keyword-only in librosa 0.10 — the old positional call
    raises TypeError there.
    """
    if hparams.fmax is not None:
        # An fmax above Nyquist would produce degenerate filters.
        assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(sr=hparams.sample_rate, n_fft=hparams.fft_size,
                               fmin=hparams.fmin, fmax=hparams.fmax,
                               n_mels=hparams.num_mels)


def _amp_to_db(x):
    """Convert linear amplitude to decibels, floored at hparams.min_level_db."""
    # Linear-amplitude floor corresponding to min_level_db.
    floor = np.exp(hparams.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(floor, x))


def _db_to_amp(x):
return np.power(10.0, x * 0.05)


def _normalize(S):
    """Map a dB-scale spectrogram to [0, 1] relative to hparams.min_level_db."""
    scaled = (S - hparams.min_level_db) / -hparams.min_level_db
    return np.clip(scaled, 0, 1)


def _denormalize(S):
    """Map a [0, 1] spectrogram back to the dB scale (inverse of _normalize)."""
    clipped = np.clip(S, 0, 1)
    return clipped * -hparams.min_level_db + hparams.min_level_db
38 changes: 38 additions & 0 deletions HuaWeiExperiment/wavenet/compute-meanvar-stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# coding: utf-8
"""Compute mean-variance normalization stats.

usage: compute_meanvar_stats.py [options] <list_file> <out_path>

options:
-h, --help Show help message.
--verbose=<n> Verbosity [default: 0].
"""
from docopt import docopt
import sys
from tqdm import tqdm
import numpy as np
import json

from sklearn.preprocessing import StandardScaler
import joblib

if __name__ == "__main__":
    args = docopt(__doc__)
    list_file = args["<list_file>"]
    out_path = args["<out_path>"]
    verbose = int(args["--verbose"])

    # Incrementally fit normalization stats over every feature file listed.
    scaler = StandardScaler()
    with open(list_file) as f:
        lines = f.readlines()
    assert len(lines) > 0
    for path in tqdm(lines):
        features = np.load(path.strip())
        scaler.partial_fit(features)

    # Persist the fitted scaler for later mean-variance normalization.
    joblib.dump(scaler, out_path)

    if verbose > 0:
        print("mean:\n{}".format(scaler.mean_))
        print("var:\n{}".format(scaler.var_))

    sys.exit(0)
Loading