diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 72d488d..cd298a5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,8 +4,8 @@ repos: hooks: - id: black additional_dependencies: ['click==8.0.4'] -- repo: https://gitlab.com/pycqa/flake8 +- repo: https://github.com/PyCQA/flake8 rev: 3.8.4 hooks: - id: flake8 - args: ['--max-line-length=120', '--extend-ignore=E203'] + args: ['--max-line-length=120', '--extend-ignore=E203'] \ No newline at end of file diff --git a/kantts/bin/infer_sambert.py b/kantts/bin/infer_sambert.py index d0fa04d..ff6607d 100644 --- a/kantts/bin/infer_sambert.py +++ b/kantts/bin/infer_sambert.py @@ -23,8 +23,10 @@ ) -def denorm_f0(mel, f0_threshold=30, uv_threshold=0.6, norm_type='mean_std', f0_feature=None): - if norm_type == 'mean_std': +def denorm_f0( + mel, f0_threshold=30, uv_threshold=0.6, norm_type="mean_std", f0_feature=None +): + if norm_type == "mean_std": f0_mvn = f0_feature f0 = mel[:, -2] @@ -38,7 +40,7 @@ def denorm_f0(mel, f0_threshold=30, uv_threshold=0.6, norm_type='mean_std', f0_f mel[:, -2] = f0 mel[:, -1] = uv - else: # global + else: # global f0_global_max_min = f0_feature f0 = mel[:, -2] @@ -55,9 +57,9 @@ def denorm_f0(mel, f0_threshold=30, uv_threshold=0.6, norm_type='mean_std', f0_f return mel -def am_synthesis(symbol_seq, fsnet, ling_unit, device, se=None): - inputs_feat_lst = ling_unit.encode_symbol_sequence(symbol_seq) +def sync_preprocess(symbol_seq, ling_unit, device, se=None): + inputs_feat_lst = ling_unit.encode_symbol_sequence(symbol_seq) inputs_feat_index = 0 if ling_unit.using_byte(): inputs_byte_index = ( @@ -113,14 +115,20 @@ def am_synthesis(symbol_seq, fsnet, ling_unit, device, se=None): torch.zeros(1).to(device).long() + inputs_emo.size(1) - 1 ) # minus 1 for "~" + return inputs_ling, inputs_emo, inputs_spk, inputs_len + +# non-streaming inference +def am_synthesis(symbol_seq, fsnet, ling_unit, device, se=None): + inputs_ling, inputs_emo, inputs_spk, inputs_len = sync_preprocess( + symbol_seq, ling_unit, device, se + ) res = fsnet( inputs_ling[:, :-1, :], inputs_emo[:, :-1], inputs_spk, inputs_len, ) - x_band_width = res["x_band_width"] h_band_width = res["h_band_width"] # enc_slf_attn_lst = res["enc_slf_attn_lst"] @@ -153,7 +161,79 @@ def am_synthesis(symbol_seq, fsnet, ling_unit, device, se=None): ) -def am_infer(sentence, ckpt, output_dir, se_file=None, config=None): +# streaming inference +def am_chunk_synthesis( + symbol_seq, fsnet, ling_unit, device, se=None, mel_chunk_size=48 +): + inputs_ling, inputs_emo, inputs_spk, inputs_len = sync_preprocess( + symbol_seq, ling_unit, device, se + ) + complete_length = 0 + for chunk_id, res in enumerate( + fsnet.chunk_forward( + inputs_ling[:, :-1, :], + inputs_emo[:, :-1], + inputs_spk, + inputs_len, + mel_chunk_size=mel_chunk_size, + ) + ): + if chunk_id == 0: + x_band_width = res["x_band_width"] + h_band_width = res["h_band_width"] + LR_length_rounded = res["LR_length_rounded"] + log_duration_predictions = res["log_duration_predictions"] + pitch_predictions = res["pitch_predictions"] + energy_predictions = res["energy_predictions"] + valid_length = int(LR_length_rounded[0].item()) + duration_predictions = ( + (torch.exp(log_duration_predictions) - 1 + 0.5) + .long() + .squeeze() + .cpu() + .numpy() + ) + pitch_predictions = pitch_predictions.squeeze().cpu().numpy() + energy_predictions = energy_predictions.squeeze().cpu().numpy() + logging.info( + "x_band_width:{}, h_band_width: {}".format(x_band_width, h_band_width) + ) + else: + 
duration_predictions, pitch_predictions, energy_predictions = ( + None, + None, + None, + ) + + dec_output_chunk = res["dec_output_chunk"] + postnet_output_chunk = res["postnet_output_chunk"] + + if complete_length + dec_output_chunk.size(1) > valid_length: + useless_length = complete_length + dec_output_chunk.size(1) - valid_length + dec_output_chunk = dec_output_chunk[0, :-useless_length, :] + postnet_output_chunk = postnet_output_chunk[0, :-useless_length, :] + dec_output_chunk = dec_output_chunk.squeeze().cpu().numpy() + postnet_output_chunk = postnet_output_chunk.squeeze().cpu().numpy() + + yield ( + dec_output_chunk, + postnet_output_chunk, + duration_predictions, + pitch_predictions, + energy_predictions, + ) + complete_length += dec_output_chunk.shape[0] + + +def am_infer( + sentence, + ckpt, + output_dir, + se_file=None, + config=None, + inference_type="non-streaming", + mel_chunk_size=48, +): if not torch.cuda.is_available(): device = torch.device("cpu") else: @@ -174,36 +254,38 @@ def am_infer(sentence, ckpt, output_dir, se_file=None, config=None): ling_unit_size = ling_unit.get_unit_size() config["Model"]["KanTtsSAMBERT"]["params"].update(ling_unit_size) - se_enable = config["Model"]["KanTtsSAMBERT"]["params"].get("SE", False) + se_enable = config["Model"]["KanTtsSAMBERT"]["params"].get("SE", False) se = np.load(se_file) if se_enable else None # nsf - nsf_enable = config["Model"]["KanTtsSAMBERT"]["params"].get("NSF", False) + nsf_enable = config["Model"]["KanTtsSAMBERT"]["params"].get("NSF", False) if nsf_enable: - nsf_norm_type = config["Model"]["KanTtsSAMBERT"]["params"].get("nsf_norm_type", "mean_std") + nsf_norm_type = config["Model"]["KanTtsSAMBERT"]["params"].get( + "nsf_norm_type", "mean_std" + ) if nsf_norm_type == "mean_std": f0_mvn_file = os.path.join( os.path.dirname(os.path.dirname(ckpt)), "mvn.npy" ) - f0_feature = np.load(f0_mvn_file) - else: # global - nsf_f0_global_minimum = config["Model"]["KanTtsSAMBERT"]["params"].get("nsf_f0_global_minimum", 30.0) - nsf_f0_global_maximum = config["Model"]["KanTtsSAMBERT"]["params"].get("nsf_f0_global_maximum", 730.0) + f0_feature = np.load(f0_mvn_file) + else: # global + nsf_f0_global_minimum = config["Model"]["KanTtsSAMBERT"]["params"].get( + "nsf_f0_global_minimum", 30.0 + ) + nsf_f0_global_maximum = config["Model"]["KanTtsSAMBERT"]["params"].get( + "nsf_f0_global_maximum", 730.0 + ) f0_feature = [nsf_f0_global_maximum, nsf_f0_global_minimum] - model, _, _ = model_builder(config, device) - fsnet = model["KanTtsSAMBERT"] - logging.info("Loading checkpoint: {}".format(ckpt)) - state_dict = torch.load(ckpt) + state_dict = torch.load(ckpt, map_location=device) fsnet.load_state_dict(state_dict["model"], strict=False) results_dir = os.path.join(output_dir, "feat") os.makedirs(results_dir, exist_ok=True) fsnet.eval() - with open(sentence, encoding="utf-8") as f: for line in f: line = line.strip().split("\t") @@ -214,12 +296,46 @@ def am_infer(sentence, ckpt, output_dir, se_file=None, config=None): energy_path = "%s/%s_energy.txt" % (results_dir, line[0]) with torch.no_grad(): - mel, mel_post, dur, f0, energy = am_synthesis( - line[1], fsnet, ling_unit, device, se=se - ) - + if inference_type == "non-streaming": + mel, mel_post, dur, f0, energy = am_synthesis( + line[1], + fsnet, + ling_unit, + device, + se=se, + ) + else: + mel_post = None + for chunk_id, ( + mel_chunk, + mel_post_chunk, + dur_chunk, + f0_chunk, + energy_chunk, + ) in enumerate( + am_chunk_synthesis( + line[1], + fsnet, + ling_unit, + device, + se=se, + 
                            mel_chunk_size=mel_chunk_size,
+                        )
+                    ):
+                        if chunk_id == 0:
+                            dur, f0, energy = dur_chunk, f0_chunk, energy_chunk
+                        if mel_post is None:
+                            mel_post = mel_post_chunk
+                        else:
+                            mel_post = np.concatenate(
+                                [mel_post, mel_post_chunk], axis=0
+                            )
+
+            # FIXME:
             if nsf_enable:
-                mel_post = denorm_f0(mel_post, norm_type=nsf_norm_type, f0_feature=f0_feature)
+                mel_post = denorm_f0(
+                    mel_post, norm_type=nsf_norm_type, f0_feature=f0_feature
+                )
 
             np.save(mel_path, mel_post)
             np.savetxt(dur_path, dur)
@@ -233,7 +349,17 @@ def am_infer(sentence, ckpt, output_dir, se_file=None, config=None):
     parser.add_argument("--output_dir", type=str, required=True)
     parser.add_argument("--ckpt", type=str, required=True)
     parser.add_argument("--se_file", type=str, required=False)
+    parser.add_argument(
+        "--inference_type", type=str, required=False, default="non-streaming"
+    )
+    parser.add_argument("--mel_chunk_size", type=int, required=False, default=24)
     args = parser.parse_args()
-
-    am_infer(args.sentence, args.ckpt, args.output_dir, args.se_file)
+    am_infer(
+        args.sentence,
+        args.ckpt,
+        args.output_dir,
+        args.se_file,
+        inference_type=args.inference_type,
+        mel_chunk_size=args.mel_chunk_size,
+    )
\ No newline at end of file
diff --git a/kantts/models/sambert/fsmn.py b/kantts/models/sambert/fsmn.py
index be72d89..84d6f71 100644
--- a/kantts/models/sambert/fsmn.py
+++ b/kantts/models/sambert/fsmn.py
@@ -1,6 +1,7 @@
 """ FSMN Pytorch Version
 """
+import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -71,6 +72,27 @@ def forward(self, input, mask=None):
 
         return output
 
+    def chunk_forward(self, input, mask=None, left_cache=None, rp=None):
+        if mask is not None:
+            input = input.masked_fill(mask.unsqueeze(-1), 0)
+        # pad on the left for the first chunk; later chunks reuse the cache
+        if left_cache is None:
+            x = F.pad(input, (0, 0, self.lp, 0, 0, 0), mode="constant", value=0.0)
+        else:
+            x = torch.cat([left_cache, input], dim=1)
+        # update the left-context cache: the lp frames preceding the right pad
+        # (falls back to None when rp is not given)
+        new_left_cache = None if rp is None else x[:, -rp - self.lp : x.size(1) - rp]
+        x = F.pad(x, (0, 0, 0, self.rp, 0, 0), mode="constant", value=0.0)
+        output = (
+            self.conv_dw(x.contiguous().transpose(1, 2)).contiguous().transpose(1, 2)
+        )
+        output += input[:, : output.size(1)]
+        output = self.dropout(output)
+        if mask is not None:
+            output = output.masked_fill(mask.unsqueeze(-1), 0)
+        return output, new_left_cache
+
 
 class FsmnEncoderV2(nn.Module):
     def __init__(
@@ -122,3 +144,20 @@ def forward(self, input, mask=None):
             x = memory
 
         return x
+
+    def chunk_forward(self, input, mask=None, left_caches=None, right_pad_size=None):
+        x = F.dropout(input, self.dropout, self.training)
+        new_left_caches = []
+        for ffn, memory_block, left_cache in zip(
+            self.ffn_lst, self.memory_block_lst, left_caches
+        ):
+            context = ffn(x)
+            memory, left_cache = memory_block.chunk_forward(
+                context, mask, left_cache, right_pad_size
+            )
+            new_left_caches.append(left_cache)
+            memory = F.dropout(memory, self.dropout, self.training)
+            if memory.size(-1) == x.size(-1):
+                memory += x
+            x = memory
+        return x, new_left_caches
\ No newline at end of file
diff --git a/kantts/models/sambert/kantts_sambert.py b/kantts/models/sambert/kantts_sambert.py
index 91ce5b9..484995a 100644
--- a/kantts/models/sambert/kantts_sambert.py
+++ b/kantts/models/sambert/kantts_sambert.py
@@ -611,6 +611,36 @@ def forward(
 
         return dec_output, dec_pnca_attn_x_list, dec_pnca_attn_h_list
 
+    def chunk_forward(
+        self,
+        memory,
+        x_band_width,
+        h_band_width,
+        mask=None,
+        return_attns=False,
+    ):
+        # the streaming decoder is only used for inference
+        batch_size = memory.size(0)
+        go_frame = torch.zeros((batch_size, 1, self.d_mel)).to(memory.device)
+        self.mel_dec.reset_state()
+        input = go_frame
+        for step in range(memory.size(1)):
+            (
+                dec_output_step,
+                dec_pnca_attn_x_step,
+                dec_pnca_attn_h_step,
+            ) = self.mel_dec.infer(
+                step,
+                input,
+                memory,
+                x_band_width,
+                h_band_width,
+                mask=mask,
+                return_attns=return_attns,
+            )
+            input = dec_output_step[:, :, -self.d_mel :]
+            yield dec_output_step, dec_pnca_attn_x_step, dec_pnca_attn_h_step
+
 
 class PostNet(nn.Module):
     def __init__(self, config):
@@ -716,7 +746,9 @@ def __init__(self, config):
         self.text_encoder = TextFftEncoder(config)
         self.se_enable = config.get("SE", False)
         if not self.se_enable:
-            self.spk_tokenizer = nn.Embedding(config["speaker"], config["speaker_units"])
+            self.spk_tokenizer = nn.Embedding(
+                config["speaker"], config["speaker_units"]
+            )
         self.emo_tokenizer = nn.Embedding(config["emotion"], config["emotion_units"])
         self.variance_adaptor = VarianceAdaptor(config)
         self.mel_decoder = MelPNCADecoder(config)
@@ -859,7 +891,7 @@ def insert_fp(
         )
         return text_hid, inputs_emotion, inputs_speaker, inter_lengths
 
-    def forward(
+    def pre_forward(
         self,
         inputs_ling,
         inputs_emotion,
@@ -874,7 +906,6 @@
         fp_label=None,
     ):
         batch_size = inputs_ling.size(0)
-        is_training = mel_targets is not None
 
         input_masks = get_mask_from_lengths(input_lengths, max_len=inputs_ling.size(1))
@@ -925,7 +956,9 @@
             duration_targets[i, input_lengths[i]] = padding
 
         emo_hid = self.emo_tokenizer(inputs_emotion)
-        spk_hid = inputs_speaker if self.se_enable else self.spk_tokenizer(inputs_speaker)
+        spk_hid = (
+            inputs_speaker if self.se_enable else self.spk_tokenizer(inputs_speaker)
+        )
 
         inter_masks = get_mask_from_lengths(inter_lengths, max_len=text_hid.size(1))
@@ -991,6 +1024,59 @@
             + 0.5
         )
         h_band_width = x_band_width
+        res = {
+            "x_band_width": x_band_width,
+            "h_band_width": h_band_width,
+            "enc_slf_attn_lst": enc_sla_attn_lst,
+            "LR_length_rounded": LR_length_rounded,
+            "log_duration_predictions": log_duration_predictions,
+            "pitch_predictions": pitch_predictions,
+            "energy_predictions": energy_predictions,
+            "duration_targets": duration_targets,
+            "pitch_targets": pitch_targets,
+            "energy_targets": energy_targets,
+            "fp_predictions": FP_p,
+            "valid_inter_lengths": inter_lengths,
+            "LR_text_outputs": LR_text_outputs,
+            "LR_emo_outputs": LR_emo_outputs,
+            "LR_spk_outputs": LR_spk_outputs,
+        }
+        if self.MAS and mel_targets is not None:
+            res["attn_soft"] = attn_soft
+            res["attn_hard"] = attn_hard
+            res["attn_logprob"] = attn_logprob
+        return memory, lfr_masks, output_masks, res
+
+    def forward(
+        self,
+        inputs_ling,
+        inputs_emotion,
+        inputs_speaker,
+        input_lengths,
+        output_lengths=None,
+        mel_targets=None,
+        duration_targets=None,
+        pitch_targets=None,
+        energy_targets=None,
+        attn_priors=None,
+        fp_label=None,
+    ):
+        batch_size = inputs_ling.size(0)
+        memory, lfr_masks, output_masks, res = self.pre_forward(
+            inputs_ling,
+            inputs_emotion,
+            inputs_speaker,
+            input_lengths,
+            output_lengths=output_lengths,
+            mel_targets=mel_targets,
+            duration_targets=duration_targets,
+            pitch_targets=pitch_targets,
+            energy_targets=energy_targets,
+            attn_priors=attn_priors,
+            fp_label=fp_label,
+        )
+        x_band_width = res["x_band_width"]
+        h_band_width = res["h_band_width"]
 
         dec_outputs, pnca_x_attn_lst, pnca_h_attn_lst = self.mel_decoder(
             memory,
@@ -1013,35 +1099,132 @@
         if output_masks is not None:
             postnet_outputs = postnet_outputs.masked_fill(output_masks.unsqueeze(-1), 0)
 
-        res = {
-            "x_band_width": x_band_width,
- "h_band_width": h_band_width, - "enc_slf_attn_lst": enc_sla_attn_lst, - "pnca_x_attn_lst": pnca_x_attn_lst, - "pnca_h_attn_lst": pnca_h_attn_lst, - "dec_outputs": dec_outputs, - "postnet_outputs": postnet_outputs, - "LR_length_rounded": LR_length_rounded, - "log_duration_predictions": log_duration_predictions, - "pitch_predictions": pitch_predictions, - "energy_predictions": energy_predictions, - "duration_targets": duration_targets, - "pitch_targets": pitch_targets, - "energy_targets": energy_targets, - "fp_predictions": FP_p, - "valid_inter_lengths": inter_lengths, - } + res["pnca_x_attn_lst"] = pnca_x_attn_lst + res["pnca_h_attn_lst"] = pnca_h_attn_lst + res["dec_outputs"] = dec_outputs + res["postnet_outputs"] = postnet_outputs + return res - res["LR_text_outputs"] = LR_text_outputs - res["LR_emo_outputs"] = LR_emo_outputs - res["LR_spk_outputs"] = LR_spk_outputs + # Use only for inference + def chunk_forward( + self, + inputs_ling, + inputs_emotion, + inputs_speaker, + input_lengths, + output_lengths=None, + attn_priors=None, + fp_label=None, + mel_chunk_size=48, + ): + batch_size = inputs_ling.size(0) + memory, lfr_masks, output_masks, res = self.pre_forward( + inputs_ling, + inputs_emotion, + inputs_speaker, + input_lengths, + output_lengths=output_lengths, + attn_priors=attn_priors, + fp_label=fp_label, + ) + x_band_width = res["x_band_width"] + h_band_width = res["h_band_width"] + + # mel_decoder + complete_length = 0 + dec_outputs = torch.empty( + batch_size, + 0, + self.mel_decoder.d_mel, + dtype=memory.dtype, + device=memory.device, + ) + total_length = memory.size(1) * 3 - if self.MAS and is_training: - res["attn_soft"] = attn_soft - res["attn_hard"] = attn_hard - res["attn_logprob"] = attn_logprob + # initialize cache + h0 = torch.zeros( + [batch_size, 1, self.mel_postnet.lstm_units], device=memory.device + ) + c0 = torch.zeros( + [batch_size, 1, self.mel_postnet.lstm_units], device=memory.device + ) + left_memory_caches = [ + None for _ in range(len(self.mel_postnet.fsmn.memory_block_lst)) + ] - return res + # size of right side receptive filed: 12 + receptive_field_size = self.mel_postnet.fsmn.memory_block_lst[0].rp * len( + self.mel_postnet.fsmn.memory_block_lst + ) + for ( + dec_output_step, + dec_pnca_attn_x_step, + dec_pnca_attn_h_step, + ) in self.mel_decoder.chunk_forward( + memory, x_band_width, h_band_width, mask=lfr_masks, return_attns=True + ): + dec_output_step = dec_output_step.contiguous().view( + batch_size, -1, self.mel_decoder.d_mel + ) + if output_masks is not None: + dec_output_step = dec_output_step.masked_fill( + output_masks.unsqueeze(-1)[ + :, + dec_outputs.size(1) : dec_outputs.size(1) + + dec_output_step.size(1), + :, + ], + 0, # NOQA + ) + dec_outputs = torch.concat([dec_outputs, dec_output_step], dim=1) + # mel postnet + target_length = complete_length + mel_chunk_size + receptive_field_size + if ( + dec_outputs.size(1) >= target_length + or dec_outputs.size(1) == total_length + ): + + # Cache + dec_output_chunk = dec_outputs[:, complete_length:target_length] + ( + postnet_fsmn_output, + left_memory_caches, + ) = self.mel_postnet.fsmn.chunk_forward( + dec_output_chunk, + output_masks[:, complete_length:target_length], + left_memory_caches, + max(0, dec_output_chunk.size(1) - mel_chunk_size), + ) + postnet_fsmn_output = postnet_fsmn_output[:, :mel_chunk_size] + postnet_lstm_output, (h0, c0) = self.mel_postnet.lstm( + postnet_fsmn_output, (h0, c0) + ) + mel_residual_output = self.mel_postnet.fc(postnet_lstm_output) + mel_residual_output = ( + 
+                    mel_residual_output
+                    + dec_outputs[:, complete_length : complete_length + mel_chunk_size]
+                )
+                postnet_output = mel_residual_output
+                if output_masks is not None:
+                    postnet_output = postnet_output.masked_fill(
+                        output_masks[
+                            :, complete_length : complete_length + mel_chunk_size
+                        ].unsqueeze(-1), 0,
+                    )
+                res["postnet_output_chunk"] = postnet_output
+                res["dec_output_chunk"] = dec_outputs[
+                    :, complete_length : complete_length + mel_chunk_size
+                ]
+                """ TODO
+                res["pnca_x_attn_lst"] = pnca_x_attn_lst
+                res["pnca_h_attn_lst"] = pnca_h_attn_lst
+                """
+                yield res
+                complete_length += postnet_output.size(1)
+                # Only the first chunk carries the full prediction set;
+                # subsequent chunks carry only `postnet_output_chunk` and
+                # `dec_output_chunk` to avoid redundant information.
+                res = dict()
 
 
 class KanTtsTextsyBERT(nn.Module):
@@ -1065,4 +1248,4 @@ def forward(self, inputs_ling, input_lengths):
         res["logits"] = logits
         res["enc_slf_attn_lst"] = enc_sla_attn_lst
 
-        return res
+        return res
\ No newline at end of file
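
Reviewer note: a minimal smoke test for the new streaming path, as a sketch rather than part of the patch. The sentence file and checkpoint paths below are placeholders; `config` is resolved by `am_infer` the same way the existing CLI path resolves it, and any `inference_type` other than "non-streaming" selects the chunked generator.

    # sketch: exercise the streaming inference added above (hypothetical paths)
    from kantts.bin.infer_sambert import am_infer

    am_infer(
        "sentences.txt",        # one "<utt_id>\t<symbol_seq>" line per utterance
        "ckpt/checkpoint.pth",  # placeholder checkpoint path
        "out",                  # mel/dur/f0/energy files land in out/feat
        inference_type="streaming",
        mel_chunk_size=24,      # mel frames per yielded chunk (the CLI default)
    )

The streaming branch concatenates the yielded `mel_post_chunk`s on the fly and writes the same feature files as the non-streaming branch, so downstream vocoder scripts need no changes.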