import warnings
from typing import List, Optional, Tuple, Union

import torch
from torch import nn, Tensor
from torch.nn import functional as F


__all__ = [
    "Tacotron2",
]


def _get_linear_layer(in_dim: int, out_dim: int, bias: bool = True, w_init_gain: str = "linear") -> torch.nn.Linear:
    """Linear layer with xavier uniform initialization.

    Args:
        in_dim (int): Size of each input sample.
        out_dim (int): Size of each output sample.
        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias. (Default: ``True``)
        w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain``
            for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``)

    Returns:
        (torch.nn.Linear): The corresponding linear layer.
    """
    linear = torch.nn.Linear(in_dim, out_dim, bias=bias)
    torch.nn.init.xavier_uniform_(linear.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
    return linear


def _get_conv1d_layer(
    in_channels: int,
    out_channels: int,
    kernel_size: int = 1,
    stride: int = 1,
    padding: Optional[Union[str, int, Tuple[int]]] = None,
    dilation: int = 1,
    bias: bool = True,
    w_init_gain: str = "linear",
) -> torch.nn.Conv1d:
    """1D convolution with xavier uniform initialization.

    Args:
        in_channels (int): Number of channels in the input image.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int, optional): Size of the convolving kernel; must be odd
            when ``padding`` is not given. (Default: ``1``)
        stride (int, optional): Stride of the convolution. (Default: ``1``)
        padding (str, int or tuple, optional): Padding added to both sides of the input.
            (Default: ``dilation * (kernel_size - 1) / 2``)
        dilation (int, optional): Spacing between kernel elements. (Default: ``1``)
        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias. (Default: ``True``)
        w_init_gain (str, optional): Parameter passed to ``torch.nn.init.calculate_gain``
            for setting the gain parameter of ``xavier_uniform_``. (Default: ``linear``)

    Returns:
        (torch.nn.Conv1d): The corresponding Conv1D layer.
    """
    if padding is None:
        if kernel_size % 2 != 1:
            raise ValueError("kernel_size must be odd")
        padding = int(dilation * (kernel_size - 1) / 2)

    conv1d = torch.nn.Conv1d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
        bias=bias,
    )
    torch.nn.init.xavier_uniform_(conv1d.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
    return conv1d


def _get_mask_from_lengths(lengths: Tensor) -> Tensor:
    """Returns a binary mask based on ``lengths``. The ``i``-th row and ``j``-th column of the mask
    is ``True`` if ``j`` is greater than or equal to the ``i``-th element of ``lengths``,
    i.e. at the padded positions.

    Args:
        lengths (Tensor): The length of each element in the batch, with shape (n_batch, ).

    Returns:
        mask (Tensor): The binary mask, with shape (n_batch, max of ``lengths``).
    """
    max_len = torch.max(lengths).item()
    ids = torch.arange(0, max_len, device=lengths.device, dtype=lengths.dtype)
    mask = (ids < lengths.unsqueeze(1)).byte()
    mask = torch.le(mask, 0)
    return mask
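
# Illustrative example (not part of the original module): the returned mask is
# ``True`` exactly at the padded positions, which is the convention expected by
# ``Tensor.masked_fill`` in ``_Attention.forward`` below.
#
#   >>> _get_mask_from_lengths(torch.tensor([2, 3]))
#   tensor([[False, False,  True],
#           [False, False, False]])
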
class _LocationLayer(nn.Module):
    """Location layer used in the Attention model.

    Args:
        attention_n_filter (int): Number of filters for attention model.
        attention_kernel_size (int): Kernel size for attention model.
        attention_hidden_dim (int): Dimension of attention hidden representation.
    """

    def __init__(
        self,
        attention_n_filter: int,
        attention_kernel_size: int,
        attention_hidden_dim: int,
    ):
        super().__init__()
        padding = int((attention_kernel_size - 1) / 2)
        self.location_conv = _get_conv1d_layer(
            2,
            attention_n_filter,
            kernel_size=attention_kernel_size,
            padding=padding,
            bias=False,
            stride=1,
            dilation=1,
        )
        self.location_dense = _get_linear_layer(
            attention_n_filter, attention_hidden_dim, bias=False, w_init_gain="tanh"
        )

    def forward(self, attention_weights_cat: Tensor) -> Tensor:
        """Pass the attention weights through the location layer.

        Args:
            attention_weights_cat (Tensor): Cumulative and previous attention weights
                with shape (n_batch, 2, max of ``text_lengths``).

        Returns:
            processed_attention (Tensor): Processed attention weights
                with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
        """
        # (n_batch, attention_n_filter, max of ``text_lengths``)
        processed_attention = self.location_conv(attention_weights_cat)
        processed_attention = processed_attention.transpose(1, 2)
        # (n_batch, max of ``text_lengths``, ``attention_hidden_dim``)
        processed_attention = self.location_dense(processed_attention)
        return processed_attention
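
# Shape walk-through (illustrative, not part of the original module):
#
#   >>> layer = _LocationLayer(32, 31, 128)
#   >>> weights = torch.rand(16, 2, 100)  # previous and cumulative attention weights
#   >>> layer(weights).shape              # conv over time, then a dense projection
#   torch.Size([16, 100, 128])
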
class _Attention(nn.Module):
    """Location-sensitive attention model.

    Args:
        attention_rnn_dim (int): Number of hidden units for RNN.
        encoder_embedding_dim (int): Number of embedding dimensions in the Encoder.
        attention_hidden_dim (int): Dimension of attention hidden representation.
        attention_location_n_filter (int): Number of filters for Attention model.
        attention_location_kernel_size (int): Kernel size for Attention model.
    """

    def __init__(
        self,
        attention_rnn_dim: int,
        encoder_embedding_dim: int,
        attention_hidden_dim: int,
        attention_location_n_filter: int,
        attention_location_kernel_size: int,
    ) -> None:
        super().__init__()
        self.query_layer = _get_linear_layer(attention_rnn_dim, attention_hidden_dim, bias=False, w_init_gain="tanh")
        self.memory_layer = _get_linear_layer(
            encoder_embedding_dim, attention_hidden_dim, bias=False, w_init_gain="tanh"
        )
        self.v = _get_linear_layer(attention_hidden_dim, 1, bias=False)
        self.location_layer = _LocationLayer(
            attention_location_n_filter,
            attention_location_kernel_size,
            attention_hidden_dim,
        )
        self.score_mask_value = -float("inf")

    def _get_alignment_energies(self, query: Tensor, processed_memory: Tensor, attention_weights_cat: Tensor) -> Tensor:
        """Get the alignment vector.

        Args:
            query (Tensor): Decoder output with shape (n_batch, n_mels * n_frames_per_step).
            processed_memory (Tensor): Processed Encoder outputs
                with shape (n_batch, max of ``text_lengths``, attention_hidden_dim).
            attention_weights_cat (Tensor): Cumulative and previous attention weights
                with shape (n_batch, 2, max of ``text_lengths``).

        Returns:
            alignment (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
        """
        processed_query = self.query_layer(query.unsqueeze(1))
        processed_attention_weights = self.location_layer(attention_weights_cat)
        energies = self.v(torch.tanh(processed_query + processed_attention_weights + processed_memory))

        alignment = energies.squeeze(2)
        return alignment

    def forward(
        self,
        attention_hidden_state: Tensor,
        memory: Tensor,
        processed_memory: Tensor,
        attention_weights_cat: Tensor,
        mask: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """Pass the input through the Attention model.

        Args:
            attention_hidden_state (Tensor): Attention rnn last output with shape (n_batch, ``attention_rnn_dim``).
            memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
            processed_memory (Tensor): Processed Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
            attention_weights_cat (Tensor): Previous and cumulative attention weights
                with shape (n_batch, current_num_frames * 2, max of ``text_lengths``).
            mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames).

        Returns:
            attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
            attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
        """
        alignment = self._get_alignment_energies(attention_hidden_state, processed_memory, attention_weights_cat)

        alignment = alignment.masked_fill(mask, self.score_mask_value)

        attention_weights = F.softmax(alignment, dim=1)
        attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
        attention_context = attention_context.squeeze(1)

        return attention_context, attention_weights
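
# One attention step, sketched with Tacotron2's default sizes (illustrative,
# not part of the original module). ``memory`` stands in for encoder outputs.
#
#   >>> attention = _Attention(1024, 512, 128, 32, 31)
#   >>> memory = torch.rand(16, 100, 512)
#   >>> context, weights = attention(
#   ...     torch.rand(16, 1024),                    # attention LSTM hidden state
#   ...     memory,
#   ...     attention.memory_layer(memory),          # precomputed once per utterance
#   ...     torch.rand(16, 2, 100),                  # previous + cumulative weights
#   ...     torch.zeros(16, 100, dtype=torch.bool),  # no positions masked
#   ... )
#   >>> context.shape, weights.shape
#   (torch.Size([16, 512]), torch.Size([16, 100]))
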
class _Prenet(nn.Module):
    """Prenet Module. It consists of ``len(out_sizes)`` linear layers.

    Args:
        in_dim (int): The size of each input sample.
        out_sizes (list): The output dimension of each linear layer.
    """

    def __init__(self, in_dim: int, out_sizes: List[int]) -> None:
        super().__init__()
        in_sizes = [in_dim] + out_sizes[:-1]
        self.layers = nn.ModuleList(
            [_get_linear_layer(in_size, out_size, bias=False) for (in_size, out_size) in zip(in_sizes, out_sizes)]
        )

    def forward(self, x: Tensor) -> Tensor:
        """Pass the input through Prenet.

        Args:
            x (Tensor): The input sequence to Prenet with shape (n_batch, in_dim).

        Return:
            x (Tensor): Tensor with shape (n_batch, sizes[-1]).
        """
        for linear in self.layers:
            x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
        return x
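
# Design note: ``training=True`` above keeps prenet dropout active even in eval
# mode, a choice inherited from the Tacotron 2 paper that injects noise into the
# autoregressive loop at inference time. Illustrative check (not part of the
# original module); equality fails with overwhelming probability:
#
#   >>> prenet = _Prenet(80, [256, 256]).eval()
#   >>> x = torch.rand(16, 80)
#   >>> bool(torch.equal(prenet(x), prenet(x)))
#   False
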
class _Postnet(nn.Module):
    """Postnet Module.

    Args:
        n_mels (int): Number of mel bins.
        postnet_embedding_dim (int): Postnet embedding dimension.
        postnet_kernel_size (int): Postnet kernel size.
        postnet_n_convolution (int): Number of postnet convolutions.
    """

    def __init__(
        self,
        n_mels: int,
        postnet_embedding_dim: int,
        postnet_kernel_size: int,
        postnet_n_convolution: int,
    ):
        super().__init__()
        self.convolutions = nn.ModuleList()

        for i in range(postnet_n_convolution):
            in_channels = n_mels if i == 0 else postnet_embedding_dim
            out_channels = n_mels if i == (postnet_n_convolution - 1) else postnet_embedding_dim
            init_gain = "linear" if i == (postnet_n_convolution - 1) else "tanh"
            num_features = n_mels if i == (postnet_n_convolution - 1) else postnet_embedding_dim
            self.convolutions.append(
                nn.Sequential(
                    _get_conv1d_layer(
                        in_channels,
                        out_channels,
                        kernel_size=postnet_kernel_size,
                        stride=1,
                        padding=int((postnet_kernel_size - 1) / 2),
                        dilation=1,
                        w_init_gain=init_gain,
                    ),
                    nn.BatchNorm1d(num_features),
                )
            )

        self.n_convs = len(self.convolutions)

    def forward(self, x: Tensor) -> Tensor:
        """Pass the input through Postnet.

        Args:
            x (Tensor): The input sequence with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).

        Return:
            x (Tensor): Tensor with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
        """
        for i, conv in enumerate(self.convolutions):
            if i < self.n_convs - 1:
                x = F.dropout(torch.tanh(conv(x)), 0.5, training=self.training)
            else:
                # The last convolution has no tanh nonlinearity.
                x = F.dropout(conv(x), 0.5, training=self.training)

        return x
class _Encoder(nn.Module):
    """Encoder Module.

    Args:
        encoder_embedding_dim (int): Number of embedding dimensions in the encoder.
        encoder_n_convolution (int): Number of convolution layers in the encoder.
        encoder_kernel_size (int): The kernel size in the encoder.

    Examples
        >>> encoder = _Encoder(512, 3, 5)
        >>> input = torch.rand(10, 512, 30)
        >>> input_lengths = torch.full((10,), 30, dtype=torch.int64)
        >>> output = encoder(input, input_lengths)  # shape: (10, 30, 512)
    """

    def __init__(
        self,
        encoder_embedding_dim: int,
        encoder_n_convolution: int,
        encoder_kernel_size: int,
    ) -> None:
        super().__init__()

        self.convolutions = nn.ModuleList()
        for _ in range(encoder_n_convolution):
            conv_layer = nn.Sequential(
                _get_conv1d_layer(
                    encoder_embedding_dim,
                    encoder_embedding_dim,
                    kernel_size=encoder_kernel_size,
                    stride=1,
                    padding=int((encoder_kernel_size - 1) / 2),
                    dilation=1,
                    w_init_gain="relu",
                ),
                nn.BatchNorm1d(encoder_embedding_dim),
            )
            self.convolutions.append(conv_layer)

        self.lstm = nn.LSTM(
            encoder_embedding_dim,
            int(encoder_embedding_dim / 2),
            1,
            batch_first=True,
            bidirectional=True,
        )
        self.lstm.flatten_parameters()

    def forward(self, x: Tensor, input_lengths: Tensor) -> Tensor:
        """Pass the input through the Encoder.

        Args:
            x (Tensor): The input sequences with shape (n_batch, encoder_embedding_dim, n_seq).
            input_lengths (Tensor): The length of each input sequence with shape (n_batch, ).

        Return:
            x (Tensor): A tensor with shape (n_batch, n_seq, encoder_embedding_dim).
        """
        for conv in self.convolutions:
            x = F.dropout(F.relu(conv(x)), 0.5, training=self.training)

        x = x.transpose(1, 2)

        input_lengths = input_lengths.cpu()
        x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True)

        outputs, _ = self.lstm(x)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)

        return outputs
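
# Note: ``pack_padded_sequence`` is called with its default ``enforce_sorted=True``,
# so ``input_lengths`` must be in decreasing order, as in this sketch (illustrative,
# not part of the original module):
#
#   >>> encoder = _Encoder(512, 3, 5)
#   >>> x = torch.rand(2, 512, 30)
#   >>> encoder(x, torch.tensor([30, 20])).shape
#   torch.Size([2, 30, 512])
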
class _Decoder(nn.Module):
    """Decoder with Attention model.

    Args:
        n_mels (int): number of mel bins
        n_frames_per_step (int): number of frames processed per step, only 1 is supported
        encoder_embedding_dim (int): the number of embedding dimensions in the encoder.
        decoder_rnn_dim (int): number of units in decoder LSTM
        decoder_max_step (int): maximum number of output mel spectrograms
        decoder_dropout (float): dropout probability for decoder LSTM
        decoder_early_stopping (bool): stop decoding when all samples are finished
        attention_rnn_dim (int): number of units in attention LSTM
        attention_hidden_dim (int): dimension of attention hidden representation
        attention_location_n_filter (int): number of filters for attention model
        attention_location_kernel_size (int): kernel size for attention model
        attention_dropout (float): dropout probability for attention LSTM
        prenet_dim (int): number of ReLU units in prenet layers
        gate_threshold (float): probability threshold for stop token
    """

    def __init__(
        self,
        n_mels: int,
        n_frames_per_step: int,
        encoder_embedding_dim: int,
        decoder_rnn_dim: int,
        decoder_max_step: int,
        decoder_dropout: float,
        decoder_early_stopping: bool,
        attention_rnn_dim: int,
        attention_hidden_dim: int,
        attention_location_n_filter: int,
        attention_location_kernel_size: int,
        attention_dropout: float,
        prenet_dim: int,
        gate_threshold: float,
    ) -> None:
        super().__init__()
        self.n_mels = n_mels
        self.n_frames_per_step = n_frames_per_step
        self.encoder_embedding_dim = encoder_embedding_dim
        self.attention_rnn_dim = attention_rnn_dim
        self.decoder_rnn_dim = decoder_rnn_dim
        self.prenet_dim = prenet_dim
        self.decoder_max_step = decoder_max_step
        self.gate_threshold = gate_threshold
        self.attention_dropout = attention_dropout
        self.decoder_dropout = decoder_dropout
        self.decoder_early_stopping = decoder_early_stopping

        self.prenet = _Prenet(n_mels * n_frames_per_step, [prenet_dim, prenet_dim])

        self.attention_rnn = nn.LSTMCell(prenet_dim + encoder_embedding_dim, attention_rnn_dim)

        self.attention_layer = _Attention(
            attention_rnn_dim,
            encoder_embedding_dim,
            attention_hidden_dim,
            attention_location_n_filter,
            attention_location_kernel_size,
        )

        self.decoder_rnn = nn.LSTMCell(attention_rnn_dim + encoder_embedding_dim, decoder_rnn_dim, True)

        self.linear_projection = _get_linear_layer(decoder_rnn_dim + encoder_embedding_dim, n_mels * n_frames_per_step)

        self.gate_layer = _get_linear_layer(
            decoder_rnn_dim + encoder_embedding_dim, 1, bias=True, w_init_gain="sigmoid"
        )

    def _get_initial_frame(self, memory: Tensor) -> Tensor:
        """Gets all zeros frames to use as the first decoder input.

        Args:
            memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

        Returns:
            decoder_input (Tensor): All zeros frames with shape
                (n_batch, ``n_mels`` * ``n_frames_per_step``).
        """
        n_batch = memory.size(0)
        dtype = memory.dtype
        device = memory.device
        decoder_input = torch.zeros(n_batch, self.n_mels * self.n_frames_per_step, dtype=dtype, device=device)
        return decoder_input

    def _initialize_decoder_states(
        self, memory: Tensor
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
        """Initializes attention rnn states, decoder rnn states, attention
        weights, attention cumulative weights, attention context, stores memory
        and stores processed memory.

        Args:
            memory (Tensor): Encoder outputs with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

        Returns:
            attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
            attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
            attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
            processed_memory (Tensor): Processed encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
        """
        n_batch = memory.size(0)
        max_time = memory.size(1)
        dtype = memory.dtype
        device = memory.device

        attention_hidden = torch.zeros(n_batch, self.attention_rnn_dim, dtype=dtype, device=device)
        attention_cell = torch.zeros(n_batch, self.attention_rnn_dim, dtype=dtype, device=device)

        decoder_hidden = torch.zeros(n_batch, self.decoder_rnn_dim, dtype=dtype, device=device)
        decoder_cell = torch.zeros(n_batch, self.decoder_rnn_dim, dtype=dtype, device=device)

        attention_weights = torch.zeros(n_batch, max_time, dtype=dtype, device=device)
        attention_weights_cum = torch.zeros(n_batch, max_time, dtype=dtype, device=device)
        attention_context = torch.zeros(n_batch, self.encoder_embedding_dim, dtype=dtype, device=device)

        processed_memory = self.attention_layer.memory_layer(memory)

        return (
            attention_hidden,
            attention_cell,
            decoder_hidden,
            decoder_cell,
            attention_weights,
            attention_weights_cum,
            attention_context,
            processed_memory,
        )

    def _parse_decoder_inputs(self, decoder_inputs: Tensor) -> Tensor:
        """Prepares decoder inputs.

        Args:
            decoder_inputs (Tensor): Inputs used for teacher-forced training, i.e. mel-specs,
                with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).

        Returns:
            inputs (Tensor): Processed decoder inputs with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``).
        """
        # (n_batch, n_mels, T_out) -> (n_batch, T_out, n_mels)
        decoder_inputs = decoder_inputs.transpose(1, 2)
        decoder_inputs = decoder_inputs.view(
            decoder_inputs.size(0),
            int(decoder_inputs.size(1) / self.n_frames_per_step),
            -1,
        )
        # (n_batch, T_out, n_mels) -> (T_out, n_batch, n_mels)
        decoder_inputs = decoder_inputs.transpose(0, 1)
        return decoder_inputs

    def _parse_decoder_outputs(
        self, mel_specgram: Tensor, gate_outputs: Tensor, alignments: Tensor
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Prepares decoder outputs for output.

        Args:
            mel_specgram (Tensor): mel spectrogram with shape (max of ``mel_specgram_lengths``, n_batch, ``n_mels``)
            gate_outputs (Tensor): predicted stop token with shape (max of ``mel_specgram_lengths``, n_batch)
            alignments (Tensor): sequence of attention weights from the decoder
                with shape (max of ``mel_specgram_lengths``, n_batch, max of ``text_lengths``)

        Returns:
            mel_specgram (Tensor): mel spectrogram with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``)
            gate_outputs (Tensor): predicted stop token with shape (n_batch, max of ``mel_specgram_lengths``)
            alignments (Tensor): sequence of attention weights from the decoder
                with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``)
        """
        # (T_out, n_batch, max of text_lengths) -> (n_batch, T_out, max of text_lengths)
        alignments = alignments.transpose(0, 1).contiguous()
        # (T_out, n_batch) -> (n_batch, T_out)
        gate_outputs = gate_outputs.transpose(0, 1).contiguous()
        # (T_out, n_batch, n_mels) -> (n_batch, T_out, n_mels)
        mel_specgram = mel_specgram.transpose(0, 1).contiguous()
        # decouple frames per step
        shape = (mel_specgram.shape[0], -1, self.n_mels)
        mel_specgram = mel_specgram.view(*shape)
        # (n_batch, T_out, n_mels) -> (n_batch, n_mels, T_out)
        mel_specgram = mel_specgram.transpose(1, 2)

        return mel_specgram, gate_outputs, alignments

    def decode(
        self,
        decoder_input: Tensor,
        attention_hidden: Tensor,
        attention_cell: Tensor,
        decoder_hidden: Tensor,
        decoder_cell: Tensor,
        attention_weights: Tensor,
        attention_weights_cum: Tensor,
        attention_context: Tensor,
        memory: Tensor,
        processed_memory: Tensor,
        mask: Tensor,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]:
        """Decoder step using stored states, attention and memory.

        Args:
            decoder_input (Tensor): Output of the Prenet with shape (n_batch, ``prenet_dim``).
            attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
            attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
            attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
            memory (Tensor): Encoder output with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
            processed_memory (Tensor): Processed Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``attention_hidden_dim``).
            mask (Tensor): Binary mask for padded data with shape (n_batch, current_num_frames).

        Returns:
            decoder_output (Tensor): Predicted mel spectrogram for the current frame with shape (n_batch, ``n_mels``).
            gate_prediction (Tensor): Prediction of the stop token with shape (n_batch, ``1``).
            attention_hidden (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            attention_cell (Tensor): Hidden state of the attention LSTM with shape (n_batch, ``attention_rnn_dim``).
            decoder_hidden (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            decoder_cell (Tensor): Hidden state of the decoder LSTM with shape (n_batch, ``decoder_rnn_dim``).
            attention_weights (Tensor): Attention weights with shape (n_batch, max of ``text_lengths``).
            attention_weights_cum (Tensor): Cumulated attention weights with shape (n_batch, max of ``text_lengths``).
            attention_context (Tensor): Context vector with shape (n_batch, ``encoder_embedding_dim``).
        """
        cell_input = torch.cat((decoder_input, attention_context), -1)

        attention_hidden, attention_cell = self.attention_rnn(cell_input, (attention_hidden, attention_cell))
        attention_hidden = F.dropout(attention_hidden, self.attention_dropout, self.training)

        attention_weights_cat = torch.cat((attention_weights.unsqueeze(1), attention_weights_cum.unsqueeze(1)), dim=1)
        attention_context, attention_weights = self.attention_layer(
            attention_hidden, memory, processed_memory, attention_weights_cat, mask
        )

        attention_weights_cum += attention_weights
        decoder_input = torch.cat((attention_hidden, attention_context), -1)

        decoder_hidden, decoder_cell = self.decoder_rnn(decoder_input, (decoder_hidden, decoder_cell))
        decoder_hidden = F.dropout(decoder_hidden, self.decoder_dropout, self.training)

        decoder_hidden_attention_context = torch.cat((decoder_hidden, attention_context), dim=1)
        decoder_output = self.linear_projection(decoder_hidden_attention_context)

        gate_prediction = self.gate_layer(decoder_hidden_attention_context)
        return (
            decoder_output,
            gate_prediction,
            attention_hidden,
            attention_cell,
            decoder_hidden,
            decoder_cell,
            attention_weights,
            attention_weights_cum,
            attention_context,
        )

    def forward(
        self, memory: Tensor, mel_specgram_truth: Tensor, memory_lengths: Tensor
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Decoder forward pass for training.

        Args:
            memory (Tensor): Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
            mel_specgram_truth (Tensor): Decoder ground-truth mel-specs for teacher forcing
                with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
            memory_lengths (Tensor): Encoder output lengths for attention masking
                (the same as ``text_lengths``) with shape (n_batch, ).

        Returns:
            mel_specgram (Tensor): Predicted mel spectrogram
                with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
            gate_outputs (Tensor): Predicted stop token for each timestep
                with shape (n_batch, max of ``mel_specgram_lengths``).
            alignments (Tensor): Sequence of attention weights from the decoder
                with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``).
        """
        decoder_input = self._get_initial_frame(memory).unsqueeze(0)
        decoder_inputs = self._parse_decoder_inputs(mel_specgram_truth)
        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
        decoder_inputs = self.prenet(decoder_inputs)

        mask = _get_mask_from_lengths(memory_lengths)
        (
            attention_hidden,
            attention_cell,
            decoder_hidden,
            decoder_cell,
            attention_weights,
            attention_weights_cum,
            attention_context,
            processed_memory,
        ) = self._initialize_decoder_states(memory)

        mel_outputs, gate_outputs, alignments = [], [], []
        while len(mel_outputs) < decoder_inputs.size(0) - 1:
            decoder_input = decoder_inputs[len(mel_outputs)]
            (
                mel_output,
                gate_output,
                attention_hidden,
                attention_cell,
                decoder_hidden,
                decoder_cell,
                attention_weights,
                attention_weights_cum,
                attention_context,
            ) = self.decode(
                decoder_input,
                attention_hidden,
                attention_cell,
                decoder_hidden,
                decoder_cell,
                attention_weights,
                attention_weights_cum,
                attention_context,
                memory,
                processed_memory,
                mask,
            )

            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output.squeeze(1)]
            alignments += [attention_weights]

        mel_specgram, gate_outputs, alignments = self._parse_decoder_outputs(
            torch.stack(mel_outputs), torch.stack(gate_outputs), torch.stack(alignments)
        )

        return mel_specgram, gate_outputs, alignments

    def _get_go_frame(self, memory: Tensor) -> Tensor:
        """Gets all zeros frames to use as the first decoder input.

        Args:
            memory (Tensor): Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).

        Returns:
            decoder_input (Tensor): All zeros frames with shape
                (n_batch, ``n_mels`` * ``n_frames_per_step``).
        """
        n_batch = memory.size(0)
        dtype = memory.dtype
        device = memory.device
        decoder_input = torch.zeros(n_batch, self.n_mels * self.n_frames_per_step, dtype=dtype, device=device)
        return decoder_input

    @torch.jit.export
    def infer(self, memory: Tensor, memory_lengths: Tensor) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Decoder inference.

        Args:
            memory (Tensor): Encoder outputs
                with shape (n_batch, max of ``text_lengths``, ``encoder_embedding_dim``).
            memory_lengths (Tensor): Encoder output lengths for attention masking
                (the same as ``text_lengths``) with shape (n_batch, ).

        Returns:
            mel_specgram (Tensor): Predicted mel spectrogram
                with shape (n_batch, ``n_mels``, max of ``mel_specgram_lengths``).
            mel_specgram_lengths (Tensor): The length of the predicted mel spectrogram with shape (n_batch, ).
            gate_outputs (Tensor): Predicted stop token for each timestep
                with shape (n_batch, max of ``mel_specgram_lengths``).
            alignments (Tensor): Sequence of attention weights from the decoder
                with shape (n_batch, max of ``mel_specgram_lengths``, max of ``text_lengths``).
        """
        batch_size, device = memory.size(0), memory.device

        decoder_input = self._get_go_frame(memory)

        mask = _get_mask_from_lengths(memory_lengths)
        (
            attention_hidden,
            attention_cell,
            decoder_hidden,
            decoder_cell,
            attention_weights,
            attention_weights_cum,
            attention_context,
            processed_memory,
        ) = self._initialize_decoder_states(memory)

        mel_specgram_lengths = torch.zeros([batch_size], dtype=torch.int32, device=device)
        finished = torch.zeros([batch_size], dtype=torch.bool, device=device)
        mel_specgrams: List[Tensor] = []
        gate_outputs: List[Tensor] = []
        alignments: List[Tensor] = []
        for _ in range(self.decoder_max_step):
            decoder_input = self.prenet(decoder_input)
            (
                mel_specgram,
                gate_output,
                attention_hidden,
                attention_cell,
                decoder_hidden,
                decoder_cell,
                attention_weights,
                attention_weights_cum,
                attention_context,
            ) = self.decode(
                decoder_input,
                attention_hidden,
                attention_cell,
                decoder_hidden,
                decoder_cell,
                attention_weights,
                attention_weights_cum,
                attention_context,
                memory,
                processed_memory,
                mask,
            )

            mel_specgrams.append(mel_specgram.unsqueeze(0))
            gate_outputs.append(gate_output.transpose(0, 1))
            alignments.append(attention_weights.unsqueeze(0))
            mel_specgram_lengths[~finished] += 1

            finished |= torch.sigmoid(gate_output.squeeze(1)) > self.gate_threshold
            if self.decoder_early_stopping and torch.all(finished):
                break

            decoder_input = mel_specgram

        if len(mel_specgrams) == self.decoder_max_step:
            warnings.warn("Reached max decoder steps. The generated spectrogram might not cover the whole transcript.")

        mel_specgrams = torch.cat(mel_specgrams, dim=0)
        gate_outputs = torch.cat(gate_outputs, dim=0)
        alignments = torch.cat(alignments, dim=0)

        mel_specgram, gate_outputs, alignments = self._parse_decoder_outputs(mel_specgrams, gate_outputs, alignments)

        return mel_specgram, mel_specgram_lengths, gate_outputs, alignments
class Tacotron2(nn.Module):
    """Tacotron2 model from *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions*
    :cite:`shen2018natural` based on the implementation from
    `Nvidia Deep Learning Examples <https://github.com/NVIDIA/DeepLearningExamples/>`_.

    See Also:
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        mask_padding (bool, optional): Use mask padding (Default: ``False``).
        n_mels (int, optional): Number of mel bins (Default: ``80``).
        n_symbol (int, optional): Number of symbols for the input text (Default: ``148``).
        n_frames_per_step (int, optional): Number of frames processed per step, only 1 is supported (Default: ``1``).
        symbol_embedding_dim (int, optional): Input embedding dimension (Default: ``512``).
        encoder_n_convolution (int, optional): Number of encoder convolutions (Default: ``3``).
        encoder_kernel_size (int, optional): Encoder kernel size (Default: ``5``).
        encoder_embedding_dim (int, optional): Encoder embedding dimension (Default: ``512``).
        decoder_rnn_dim (int, optional): Number of units in decoder LSTM (Default: ``1024``).
        decoder_max_step (int, optional): Maximum number of output mel spectrograms (Default: ``2000``).
        decoder_dropout (float, optional): Dropout probability for decoder LSTM (Default: ``0.1``).
        decoder_early_stopping (bool, optional): Stop decoding once all samples are finished (Default: ``True``).
        attention_rnn_dim (int, optional): Number of units in attention LSTM (Default: ``1024``).
        attention_hidden_dim (int, optional): Dimension of attention hidden representation (Default: ``128``).
        attention_location_n_filter (int, optional): Number of filters for attention model (Default: ``32``).
        attention_location_kernel_size (int, optional): Kernel size for attention model (Default: ``31``).
        attention_dropout (float, optional): Dropout probability for attention LSTM (Default: ``0.1``).
        prenet_dim (int, optional): Number of ReLU units in prenet layers (Default: ``256``).
        postnet_n_convolution (int, optional): Number of postnet convolutions (Default: ``5``).
        postnet_kernel_size (int, optional): Postnet kernel size (Default: ``5``).
        postnet_embedding_dim (int, optional): Postnet embedding dimension (Default: ``512``).
        gate_threshold (float, optional): Probability threshold for stop token (Default: ``0.5``).
    """

    def __init__(
        self,
        mask_padding: bool = False,
        n_mels: int = 80,
        n_symbol: int = 148,
        n_frames_per_step: int = 1,
        symbol_embedding_dim: int = 512,
        encoder_embedding_dim: int = 512,
        encoder_n_convolution: int = 3,
        encoder_kernel_size: int = 5,
        decoder_rnn_dim: int = 1024,
        decoder_max_step: int = 2000,
        decoder_dropout: float = 0.1,
        decoder_early_stopping: bool = True,
        attention_rnn_dim: int = 1024,
        attention_hidden_dim: int = 128,
        attention_location_n_filter: int = 32,
        attention_location_kernel_size: int = 31,
        attention_dropout: float = 0.1,
        prenet_dim: int = 256,
        postnet_n_convolution: int = 5,
        postnet_kernel_size: int = 5,
        postnet_embedding_dim: int = 512,
        gate_threshold: float = 0.5,
    ) -> None:
        super().__init__()

        self.mask_padding = mask_padding
        self.n_mels = n_mels
        self.n_frames_per_step = n_frames_per_step
        self.embedding = nn.Embedding(n_symbol, symbol_embedding_dim)
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        self.encoder = _Encoder(encoder_embedding_dim, encoder_n_convolution, encoder_kernel_size)
        self.decoder = _Decoder(
            n_mels,
            n_frames_per_step,
            encoder_embedding_dim,
            decoder_rnn_dim,
            decoder_max_step,
            decoder_dropout,
            decoder_early_stopping,
            attention_rnn_dim,
            attention_hidden_dim,
            attention_location_n_filter,
            attention_location_kernel_size,
            attention_dropout,
            prenet_dim,
            gate_threshold,
        )
        self.postnet = _Postnet(n_mels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolution)

    def forward(
        self,
        tokens: Tensor,
        token_lengths: Tensor,
        mel_specgram: Tensor,
        mel_specgram_lengths: Tensor,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Pass the input through the Tacotron2 model. This is in teacher
        forcing mode, which is generally used for training.

        The input ``tokens`` should be padded with zeros to length max of ``token_lengths``.
        The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``.

        Args:
            tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of token_lengths)`.
            token_lengths (Tensor): The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
            mel_specgram (Tensor): The target mel spectrogram
                with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
            mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.

        Returns:
            [Tensor, Tensor, Tensor, Tensor]:
                Tensor
                    Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`.
                Tensor
                    Sequence of attention weights from the decoder with
                    shape `(n_batch, max of mel_specgram_lengths, max of token_lengths)`.
        """
        embedded_inputs = self.embedding(tokens).transpose(1, 2)

        encoder_outputs = self.encoder(embedded_inputs, token_lengths)
        mel_specgram, gate_outputs, alignments = self.decoder(
            encoder_outputs, mel_specgram, memory_lengths=token_lengths
        )

        mel_specgram_postnet = self.postnet(mel_specgram)
        mel_specgram_postnet = mel_specgram + mel_specgram_postnet

        if self.mask_padding:
            mask = _get_mask_from_lengths(mel_specgram_lengths)
            mask = mask.expand(self.n_mels, mask.size(0), mask.size(1))
            mask = mask.permute(1, 0, 2)

            mel_specgram.masked_fill_(mask, 0.0)
            mel_specgram_postnet.masked_fill_(mask, 0.0)
            gate_outputs.masked_fill_(mask[:, 0, :], 1e3)

        return mel_specgram, mel_specgram_postnet, gate_outputs, alignments

    @torch.jit.export
    def infer(self, tokens: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Tensor, Tensor]:
        """Using Tacotron2 for inference. The input is a batch of encoded
        sentences (``tokens``) and its corresponding lengths (``lengths``). The
        output is the generated mel spectrograms, its corresponding lengths, and
        the attention weights from the decoder.

        The input `tokens` should be padded with zeros to length max of ``lengths``.

        Args:
            tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of lengths)`.
            lengths (Tensor or None, optional):
                The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
                If ``None``, it is assumed that all the tokens are valid. Default: ``None``

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    The length of the predicted mel spectrogram with shape `(n_batch, )`.
                Tensor
                    Sequence of attention weights from the decoder with shape
                    `(n_batch, max of mel_specgram_lengths, max of lengths)`.
        """
        n_batch, max_length = tokens.shape
        if lengths is None:
            lengths = torch.tensor([max_length]).expand(n_batch).to(tokens.device, tokens.dtype)

        assert lengths is not None  # For TorchScript compiler

        embedded_inputs = self.embedding(tokens).transpose(1, 2)
        encoder_outputs = self.encoder(embedded_inputs, lengths)
        mel_specgram, mel_specgram_lengths, _, alignments = self.decoder.infer(encoder_outputs, lengths)

        mel_outputs_postnet = self.postnet(mel_specgram)
        mel_outputs_postnet = mel_specgram + mel_outputs_postnet

        alignments = alignments.unfold(1, n_batch, n_batch).transpose(0, 2)

        return mel_outputs_postnet, mel_specgram_lengths, alignments