from abc import ABC, abstractmethod
from typing import List, Optional, Tuple

import torch
from torchaudio.models import Emformer

__all__ = ["RNNT", "emformer_rnnt_base", "emformer_rnnt_model"]


class _TimeReduction(torch.nn.Module):
    """Coalesces frames along the time dimension into a
    smaller number of frames with higher feature dimensionality.

    Args:
        stride (int): number of frames to merge for each output frame.
    """

    def __init__(self, stride: int) -> None:
        super().__init__()
        self.stride = stride

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward pass.

        B: batch size;
        T: maximum input sequence length in batch;
        D: feature dimension of each input sequence frame.

        Args:
            input (torch.Tensor): input sequences, with shape `(B, T, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output sequences, with shape
                    `(B, T // stride, D * stride)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output sequences.
        """
        B, T, D = input.shape
        num_frames = T - (T % self.stride)
        input = input[:, :num_frames, :]
        lengths = lengths.div(self.stride, rounding_mode="trunc")
        T_max = num_frames // self.stride

        output = input.reshape(B, T_max, D * self.stride)
        output = output.contiguous()
        return output, lengths


class _CustomLSTM(torch.nn.Module):
    """Custom long-short-term memory (LSTM) block that applies layer normalization
    to internal nodes.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        layer_norm (bool, optional): if ``True``, enables layer normalization. (Default: ``False``)
        layer_norm_epsilon (float, optional): value of epsilon to use in
            layer normalization layers. (Default: 1e-5)
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        layer_norm: bool = False,
        layer_norm_epsilon: float = 1e-5,
    ) -> None:
        super().__init__()
        self.x2g = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=(not layer_norm))
        self.p2g = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=False)
        if layer_norm:
            self.c_norm = torch.nn.LayerNorm(hidden_dim, eps=layer_norm_epsilon)
            self.g_norm = torch.nn.LayerNorm(4 * hidden_dim, eps=layer_norm_epsilon)
        else:
            self.c_norm = torch.nn.Identity()
            self.g_norm = torch.nn.Identity()

        self.hidden_dim = hidden_dim

    def forward(
        self, input: torch.Tensor, state: Optional[List[torch.Tensor]]
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward pass.

        B: batch size;
        T: maximum sequence length in batch;
        D: feature dimension of each input sequence element.

        Args:
            input (torch.Tensor): with shape `(T, B, D)`.
            state (List[torch.Tensor] or None): list of tensors
                representing internal state generated in preceding invocation
                of ``forward``.

        Returns:
            (torch.Tensor, List[torch.Tensor]):
                torch.Tensor
                    output, with shape `(T, B, hidden_dim)`.
                List[torch.Tensor]
                    list of tensors representing internal state generated
                    in current invocation of ``forward``.
        """
        if state is None:
            B = input.size(1)
            h = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype)
            c = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype)
        else:
            h, c = state

        gated_input = self.x2g(input)
        outputs = []
        for gates in gated_input.unbind(0):
            gates = gates + self.p2g(h)
            gates = self.g_norm(gates)
            input_gate, forget_gate, cell_gate, output_gate = gates.chunk(4, 1)
            input_gate = input_gate.sigmoid()
            forget_gate = forget_gate.sigmoid()
            cell_gate = cell_gate.tanh()
            output_gate = output_gate.sigmoid()
            c = forget_gate * c + input_gate * cell_gate
            c = self.c_norm(c)
            h = output_gate * c.tanh()
            outputs.append(h)

        output = torch.stack(outputs, dim=0)
        state = [h, c]

        return output, state


class _Transcriber(ABC):
    @abstractmethod
    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        pass

    @abstractmethod
    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        pass


class _EmformerEncoder(torch.nn.Module, _Transcriber):
    """Emformer-based recurrent neural network transducer (RNN-T) encoder (transcription network).

    Args:
        input_dim (int): feature dimension of each input sequence element.
        output_dim (int): feature dimension of each output sequence element.
        segment_length (int): length of input segment expressed as number of frames.
        right_context_length (int): length of right context expressed as number of frames.
        time_reduction_input_dim (int): dimension to scale each element in input sequences to
            prior to applying time reduction block.
        time_reduction_stride (int): factor by which to reduce length of input sequence.
        transformer_num_heads (int): number of attention heads in each Emformer layer.
        transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        transformer_num_layers (int): number of Emformer layers to instantiate.
        transformer_left_context_length (int): length of left context considered by Emformer.
        transformer_dropout (float, optional): transformer dropout probability. (Default: 0.0)
        transformer_activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        transformer_max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        transformer_weight_init_scale_strategy (str, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        transformer_tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        segment_length: int,
        right_context_length: int,
        time_reduction_input_dim: int,
        time_reduction_stride: int,
        transformer_num_heads: int,
        transformer_ffn_dim: int,
        transformer_num_layers: int,
        transformer_left_context_length: int,
        transformer_dropout: float = 0.0,
        transformer_activation: str = "relu",
        transformer_max_memory_size: int = 0,
        transformer_weight_init_scale_strategy: str = "depthwise",
        transformer_tanh_on_mem: bool = False,
    ) -> None:
        super().__init__()
        self.input_linear = torch.nn.Linear(
            input_dim,
            time_reduction_input_dim,
            bias=False,
        )
        self.time_reduction = _TimeReduction(time_reduction_stride)
        transformer_input_dim = time_reduction_input_dim * time_reduction_stride
        self.transformer = Emformer(
            transformer_input_dim,
            transformer_num_heads,
            transformer_ffn_dim,
            transformer_num_layers,
            segment_length // time_reduction_stride,
            dropout=transformer_dropout,
            activation=transformer_activation,
            left_context_length=transformer_left_context_length,
            right_context_length=right_context_length // time_reduction_stride,
            max_memory_size=transformer_max_memory_size,
            weight_init_scale_strategy=transformer_weight_init_scale_strategy,
            tanh_on_mem=transformer_tanh_on_mem,
        )
        self.output_linear = torch.nn.Linear(transformer_input_dim, output_dim)
        self.layer_norm = torch.nn.LayerNorm(output_dim)

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward pass for training.

        B: batch size;
        T: maximum input sequence length in batch;
        D: feature dimension of each input sequence frame (input_dim).

        Args:
            input (torch.Tensor): input frame sequences right-padded with right context, with
                shape `(B, T + right context length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output frame sequences.
        """
        input_linear_out = self.input_linear(input)
        time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths)
        transformer_out, transformer_lengths = self.transformer(time_reduction_out, time_reduction_lengths)
        output_linear_out = self.output_linear(transformer_out)
        layer_norm_out = self.layer_norm(output_linear_out)
        return layer_norm_out, transformer_lengths

    @torch.jit.export
    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """Forward pass for inference.

        B: batch size;
        T: maximum input sequence segment length in batch;
        D: feature dimension of each input sequence frame (input_dim).

        Args:
            input (torch.Tensor): input frame sequence segments right-padded with right context, with
                shape `(B, T + right context length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``infer``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation
                    of ``infer``.
        """
        input_linear_out = self.input_linear(input)
        time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths)
        (
            transformer_out,
            transformer_lengths,
            transformer_states,
        ) = self.transformer.infer(time_reduction_out, time_reduction_lengths, states)
        output_linear_out = self.output_linear(transformer_out)
        layer_norm_out = self.layer_norm(output_linear_out)
        return layer_norm_out, transformer_lengths, transformer_states
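
# Illustrative streaming sketch for the encoder (added for exposition; not part of
# the original module). Each call to ``infer`` consumes one segment right-padded with
# right-context frames and threads the Emformer state through; ``encoder``, ``stream``,
# ``chunk``, and ``chunk_lengths`` are hypothetical:
#
#   >>> state = None
#   >>> for chunk, chunk_lengths in stream:  # chunk: `(B, segment + right context, input_dim)`
#   ...     output, output_lengths, state = encoder.infer(chunk, chunk_lengths, state)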


class _Predictor(torch.nn.Module):
    """Recurrent neural network transducer (RNN-T) prediction network.

    Args:
        num_symbols (int): size of target token lexicon.
        output_dim (int): feature dimension of each output sequence element.
        symbol_embedding_dim (int): dimension of each target token embedding.
        num_lstm_layers (int): number of LSTM layers to instantiate.
        lstm_hidden_dim (int): output dimension of each LSTM layer.
        lstm_layer_norm (bool, optional): if ``True``, enables layer normalization
            for LSTM layers. (Default: ``False``)
        lstm_layer_norm_epsilon (float, optional): value of epsilon to use in
            LSTM layer normalization layers. (Default: 1e-5)
        lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0)
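
    Example (illustrative, added for exposition; not part of the original docstring)::

        >>> predictor = _Predictor(
        ...     num_symbols=100,
        ...     output_dim=64,
        ...     symbol_embedding_dim=32,
        ...     num_lstm_layers=2,
        ...     lstm_hidden_dim=32,
        ... )
        >>> tokens = torch.randint(0, 100, (3, 7))  # hypothetical `(B, U)` target batch
        >>> output, lengths, state = predictor(tokens, torch.tensor([7, 7, 7]))
        >>> output.shape
        torch.Size([3, 7, 64])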

    """

    def __init__(
        self,
        num_symbols: int,
        output_dim: int,
        symbol_embedding_dim: int,
        num_lstm_layers: int,
        lstm_hidden_dim: int,
        lstm_layer_norm: bool = False,
        lstm_layer_norm_epsilon: float = 1e-5,
        lstm_dropout: float = 0.0,
    ) -> None:
        super().__init__()
        self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim)
        self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim)
        self.lstm_layers = torch.nn.ModuleList(
            [
                _CustomLSTM(
                    symbol_embedding_dim if idx == 0 else lstm_hidden_dim,
                    lstm_hidden_dim,
                    layer_norm=lstm_layer_norm,
                    layer_norm_epsilon=lstm_layer_norm_epsilon,
                )
                for idx in range(num_lstm_layers)
            ]
        )
        self.dropout = torch.nn.Dropout(p=lstm_dropout)
        self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim)
        self.output_layer_norm = torch.nn.LayerNorm(output_dim)

        self.lstm_dropout = lstm_dropout

    def forward(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """Forward pass.

        B: batch size;
        U: maximum sequence length in batch;
        D: feature dimension of each input sequence element.

        Args:
            input (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol, i.e. in range `[0, num_symbols)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``forward``. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output encoding sequences, with shape `(B, U, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output encoding sequences.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation of ``forward``.
        """
        input_tb = input.permute(1, 0)
        embedding_out = self.embedding(input_tb)
        input_layer_norm_out = self.input_layer_norm(embedding_out)

        lstm_out = input_layer_norm_out
        state_out: List[List[torch.Tensor]] = []
        for layer_idx, lstm in enumerate(self.lstm_layers):
            lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx])
            lstm_out = self.dropout(lstm_out)
            state_out.append(lstm_state_out)

        linear_out = self.linear(lstm_out)
        output_layer_norm_out = self.output_layer_norm(linear_out)
        return output_layer_norm_out.permute(1, 0, 2), lengths, state_out


class _Joiner(torch.nn.Module):
    """Recurrent neural network transducer (RNN-T) joint network.

    Args:
        input_dim (int): source and target input dimension.
        output_dim (int): output dimension.
        activation (str, optional): activation function to use in the joiner.
            Must be one of ("relu", "tanh"). (Default: "relu")
    """

    def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None:
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim, bias=True)
        if activation == "relu":
            self.activation = torch.nn.ReLU()
        elif activation == "tanh":
            self.activation = torch.nn.Tanh()
        else:
            raise ValueError(f"Unsupported activation {activation}")

    def forward(
        self,
        source_encodings: torch.Tensor,
        source_lengths: torch.Tensor,
        target_encodings: torch.Tensor,
        target_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Forward pass for training.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: dimension of each source and target sequence encoding.

        Args:
            source_encodings (torch.Tensor): source encoding sequences, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``source_encodings``.
            target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``target_encodings``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor):
                torch.Tensor
                    joint network output, with shape `(B, T, U, output_dim)`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
        """
        joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous()
        activation_out = self.activation(joint_encodings)
        output = self.linear(activation_out)
        return output, source_lengths, target_lengths


class RNNT(torch.nn.Module):
    """torchaudio.models.RNNT()

    Recurrent neural network transducer (RNN-T) model.

    Note:
        To build the model, please use one of the factory functions.

    See Also:
        :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pre-trained models.

    Args:
        transcriber (torch.nn.Module): transcription network.
        predictor (torch.nn.Module): prediction network.
        joiner (torch.nn.Module): joint network.
    """

    def __init__(self, transcriber: _Transcriber, predictor: _Predictor, joiner: _Joiner) -> None:
        super().__init__()
        self.transcriber = transcriber
        self.predictor = predictor
        self.joiner = joiner

    def forward(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        predictor_state: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """Forward pass for training.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: feature dimension of each source sequence element.

        Args:
            sources (torch.Tensor): source frame sequences right-padded with right context, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.
            targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``targets``.
            predictor_state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing prediction network internal state generated in preceding invocation
                of ``forward``. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    joint network output, with shape
                    `(B, max output source length, max output target length, output_dim (number of target symbols))`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing prediction network internal state generated in current invocation
                    of ``forward``.
        """
        source_encodings, source_lengths = self.transcriber(
            input=sources,
            lengths=source_lengths,
        )
        target_encodings, target_lengths, predictor_state = self.predictor(
            input=targets,
            lengths=target_lengths,
            state=predictor_state,
        )
        output, source_lengths, target_lengths = self.joiner(
            source_encodings=source_encodings,
            source_lengths=source_lengths,
            target_encodings=target_encodings,
            target_lengths=target_lengths,
        )
        return output, source_lengths, target_lengths, predictor_state

    @torch.jit.export
    def transcribe_streaming(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """Applies transcription network to sources in streaming mode.

        B: batch size;
        T: maximum source sequence segment length in batch;
        D: feature dimension of each source sequence frame.

        Args:
            sources (torch.Tensor): source frame sequence segments right-padded with right context, with
                shape `(B, T + right context length, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing transcription network internal state generated in preceding invocation
                of ``transcribe_streaming``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing transcription network internal state generated in current invocation
                    of ``transcribe_streaming``.
        """
        return self.transcriber.infer(sources, source_lengths, state)

    @torch.jit.export
    def transcribe(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Applies transcription network to sources in non-streaming mode.

        B: batch size;
        T: maximum source sequence length in batch;
        D: feature dimension of each source sequence frame.

        Args:
            sources (torch.Tensor): source frame sequences right-padded with right context, with
                shape `(B, T + right context length, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output frame sequences.
        """
        return self.transcriber(input=sources, lengths=source_lengths)

    @torch.jit.export
    def predict(
        self,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        """Applies prediction network to targets.

        B: batch size;
        U: maximum target sequence length in batch;
        D: feature dimension of each target sequence frame.

        Args:
            targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol, i.e. in range `[0, num_symbols)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``targets``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``predict``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with shape `(B, U, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation of ``predict``.
        """
        return self.predictor(input=targets, lengths=target_lengths, state=state)

    @torch.jit.export
    def join(
        self,
        source_encodings: torch.Tensor,
        source_lengths: torch.Tensor,
        target_encodings: torch.Tensor,
        target_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Applies joint network to source and target encodings.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: dimension of each source and target sequence encoding.

        Args:
            source_encodings (torch.Tensor): source encoding sequences, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``source_encodings``.
            target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``target_encodings``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor):
                torch.Tensor
                    joint network output, with shape `(B, T, U, output_dim)`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
        """
        output, source_lengths, target_lengths = self.joiner(
            source_encodings=source_encodings,
            source_lengths=source_lengths,
            target_encodings=target_encodings,
            target_lengths=target_lengths,
        )
        return output, source_lengths, target_lengths


def emformer_rnnt_model(
    input_dim: int,
    encoding_dim: int,
    num_symbols: int,
    segment_length: int,
    right_context_length: int,
    time_reduction_input_dim: int,
    time_reduction_stride: int,
    transformer_num_heads: int,
    transformer_ffn_dim: int,
    transformer_num_layers: int,
    transformer_left_context_length: int,
    transformer_dropout: float,
    transformer_activation: str,
    transformer_max_memory_size: int,
    transformer_weight_init_scale_strategy: str,
    transformer_tanh_on_mem: bool,
    symbol_embedding_dim: int,
    num_lstm_layers: int,
    lstm_layer_norm: bool,
    lstm_layer_norm_epsilon: float,
    lstm_dropout: float,
) -> RNNT:
    """Builds Emformer-based :class:`~torchaudio.models.RNNT`.

    Note:
        For non-streaming inference, the expectation is for `transcribe` to be called on input
        sequences right-concatenated with `right_context_length` frames.

        For streaming inference, the expectation is for `transcribe_streaming` to be called
        on input chunks comprising `segment_length` frames right-concatenated with `right_context_length`
        frames.
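
        An illustrative streaming loop (added for exposition; shapes assume the
        defaults used by :func:`emformer_rnnt_base`)::

            >>> model = emformer_rnnt_base(num_symbols=4097)
            >>> state = None
            >>> chunk = torch.rand(1, 16 + 4, 80)  # segment + right context frames
            >>> encodings, lengths, state = model.transcribe_streaming(chunk, torch.tensor([20]), state)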

    Args:
        input_dim (int): dimension of input sequence frames passed to transcription network.
        encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
            passed to joint network.
        num_symbols (int): cardinality of set of target tokens.
        segment_length (int): length of input segment expressed as number of frames.
        right_context_length (int): length of right context expressed as number of frames.
        time_reduction_input_dim (int): dimension to scale each element in input sequences to
            prior to applying time reduction block.
        time_reduction_stride (int): factor by which to reduce length of input sequence.
        transformer_num_heads (int): number of attention heads in each Emformer layer.
        transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        transformer_num_layers (int): number of Emformer layers to instantiate.
        transformer_left_context_length (int): length of left context considered by Emformer.
        transformer_dropout (float): Emformer dropout probability.
        transformer_activation (str): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu").
        transformer_max_memory_size (int): maximum number of memory elements to use.
        transformer_weight_init_scale_strategy (str): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``).
        transformer_tanh_on_mem (bool): if ``True``, applies tanh to memory elements.
        symbol_embedding_dim (int): dimension of each target token embedding.
        num_lstm_layers (int): number of LSTM layers to instantiate.
        lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
        lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
        lstm_dropout (float): LSTM dropout probability.

    Returns:
        RNNT:
            Emformer RNN-T model.
    """
    encoder = _EmformerEncoder(
        input_dim=input_dim,
        output_dim=encoding_dim,
        segment_length=segment_length,
        right_context_length=right_context_length,
        time_reduction_input_dim=time_reduction_input_dim,
        time_reduction_stride=time_reduction_stride,
        transformer_num_heads=transformer_num_heads,
        transformer_ffn_dim=transformer_ffn_dim,
        transformer_num_layers=transformer_num_layers,
        transformer_left_context_length=transformer_left_context_length,
        transformer_dropout=transformer_dropout,
        transformer_activation=transformer_activation,
        transformer_max_memory_size=transformer_max_memory_size,
        transformer_weight_init_scale_strategy=transformer_weight_init_scale_strategy,
        transformer_tanh_on_mem=transformer_tanh_on_mem,
    )
    predictor = _Predictor(
        num_symbols,
        encoding_dim,
        symbol_embedding_dim=symbol_embedding_dim,
        num_lstm_layers=num_lstm_layers,
        lstm_hidden_dim=symbol_embedding_dim,
        lstm_layer_norm=lstm_layer_norm,
        lstm_layer_norm_epsilon=lstm_layer_norm_epsilon,
        lstm_dropout=lstm_dropout,
    )
    joiner = _Joiner(encoding_dim, num_symbols)
    return RNNT(encoder, predictor, joiner)


def emformer_rnnt_base(num_symbols: int) -> RNNT:
    """Builds basic version of Emformer-based :class:`~torchaudio.models.RNNT`.

    Args:
        num_symbols (int): The size of target token lexicon.

    Returns:
        RNNT:
            Emformer RNN-T model.
    """
    return emformer_rnnt_model(
        input_dim=80,
        encoding_dim=1024,
        num_symbols=num_symbols,
        segment_length=16,
        right_context_length=4,
        time_reduction_input_dim=128,
        time_reduction_stride=4,
        transformer_num_heads=8,
        transformer_ffn_dim=2048,
        transformer_num_layers=20,
        transformer_left_context_length=30,
        transformer_dropout=0.1,
        transformer_activation="gelu",
        transformer_max_memory_size=0,
        transformer_weight_init_scale_strategy="depthwise",
        transformer_tanh_on_mem=True,
        symbol_embedding_dim=512,
        num_lstm_layers=3,
        lstm_layer_norm=True,
        lstm_layer_norm_epsilon=1e-3,
        lstm_dropout=0.3,
    )