
    ujh                        d dl Z d dlmZmZmZ d dlZdgZdej                  dej                  fdZ	 ddej                  dej                  d	ej                  dej                  d
ej                  deej                     deej                     fdZ	de
dej                  j                  fdZdee
   dedeee      fdZdee   dee   dedej$                  dej                  f
dZ G d dej                  j                        Z G d dej                  j                        Z G d dej                  j                        Z G d de      Zy)    N)ListOptionalTupleEmformerlengthsreturnc                    | j                   d   }t        t        j                  |       j	                               }t        j
                  || j                  | j                        j                  ||      | j                  d      k\  }|S )Nr   )devicedtype   )
shapeinttorchmaxitemaranger
   r   expand	unsqueeze)r   
batch_size
max_lengthpadding_masks       X/var/www/html/dev/engine/venv/lib/python3.12/site-packages/torchaudio/models/emformer.py_lengths_to_padding_maskr   
   sq    q!JUYYw',,./J<<
7>>W^^J			1	L     	utteranceright_contextsummarymemsleft_context_keyc                    |j                  d      | j                  d      z   |j                  d      z   }|j                  d      }|dk(  rd }|S |t        j                  |      j                         z
  |j                  d      z
  }	||j                  d      nd}
||j                  d      z   |	z   |
z   }t	        |      }|S )Nr   r   )r   )sizer   r   r   r   )r   r   r   r   r   r   TBr   right_context_blocks_lengthleft_context_blocks_lengthklengthss               r   _gen_padding_maskr'      s     	1	q 11GLLOCA1AAv 	 '(%))G*<*@*@*B&BW\\RS_&T#AQA]%5%:%:1%=cd"TYYq\),GGJdd/Ar   
activationc                     | dk(  rt         j                  j                         S | dk(  rt         j                  j                         S | dk(  rt         j                  j	                         S t        d|        )NrelugelusiluzUnsupported activation )r   nnReLUGELUSiLU
ValueError)r(   s    r   _get_activation_moduler2   '   s]    Vxx}}	v	xx}}	v	xx}}2:,?@@r   weight_init_scale_strategy
num_layersc                 H   | t        |      D cg c]  }d  c}S | dk(  r2t        |      D cg c]  }dt        j                  |dz         z   c}S | dk(  r/t        |      D cg c]  }dt        j                  d      z   c}S t        d|        c c}w c c}w c c}w )N	depthwiseg      ?r   constant   z-Unsupported weight_init_scale_strategy value )rangemathsqrtr1   )r3   r4   _	layer_idxs       r   _get_weight_init_gainsr>   2   s    !)#J/000	#{	2@Ej@QR9dii	A..RR	#z	149*4EFydiil"FFHIcHdeff 1RFs   	B"B%B
col_widthscol_masknum_rowsr
   c           	         t        |       t        |      k7  rt        d      t        | |      D cg c]7  \  }}|rt        j                  |||      nt        j
                  |||      9 }}}t        j                  |d      S c c}}w )Nz0Length of col_widths must match that of col_maskr
   r   dim)lenr1   zipr   oneszeroscat)r?   r@   rA   r
   	col_widthis_ones_col
mask_blocks          r   _gen_attention_mask_blockrN   =   s     :#h-'KLL '**h&?	 #I{  	

8Yv6[[9V<	=J  99ZQ''s   <Bc                   v    e Zd ZdZ	 	 	 	 ddedededee   dedef fdZd	e	j                  d
e	j                  dee	j                  e	j                  f   fdZde	j                  de	j                  dee	j                     de	j                  fdZ	 	 dde	j                  de	j                  de	j                  de	j                  d
e	j                  de	j                  dee	j                     dee	j                     dee	j                  e	j                  e	j                  e	j                  f   fdZde	j                  de	j                  de	j                  de	j                  d
e	j                  de	j                  dee	j                  e	j                  f   fdZe	j                   j"                  de	j                  de	j                  de	j                  de	j                  d
e	j                  de	j                  de	j                  dee	j                  e	j                  e	j                  e	j                  f   fd       Z xZS )_EmformerAttentiona_  Emformer layer attention module.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
    	input_dim	num_headsdropoutweight_init_gaintanh_on_memnegative_infc                    t         |           ||z  dk7  rt        d| d| d      || _        || _        || _        || _        || _        | j                  | j                  z  dz  | _        t        j                  j                  |d|z  d      | _        t        j                  j                  ||d      | _        t        j                  j                  ||d      | _        |rt        j                  j                  j!                  | j                  j"                  |	       t        j                  j                  j!                  | j                  j"                  |	       y y )
Nr   zinput_dim (z") is not a multiple of num_heads (z).g      r8   T)bias)gain)super__init__r1   rQ   rR   rS   rU   rV   scalingr   r-   Linearemb_to_key_valueemb_to_queryout_projinitxavier_uniform_weight)selfrQ   rR   rS   rT   rU   rV   	__class__s          r   r[   z_EmformerAttention.__init__Y   s%    	y A%{9+5WXaWbbdeff""&($..8TA %	1y=t T!HHOOIytOL	94HHHMM))$*?*?*F*FM])^HHMM))$*;*;*B*BIY)Z r   inputr   r   c                     |j                   \  }}}|j                  d      dz   }|d ||z
   }t        j                  ||g      }| j	                  |      j                  dd      \  }}	||	fS )Nr   r   r8   chunksrE   )r   r!   r   rJ   r^   chunk)
rd   rf   r   r"   r<   summary_lengthright_ctx_utterance_blockmems_right_ctx_utterance_blockkeyvalues
             r   _gen_key_valuez!_EmformerAttention._gen_key_valuew   s|    ++1a1)$)*>A,>$?!).D:S3T)U&**+IJPPXY_`Pa
UEzr   attention_weightsattention_maskr   c                 
   |j                         }|j                  |j                  d      | j                        }|j	                  d      }|j	                  d      | j
                  z  }||j                  || j
                  |d      }|j                  |j                  d      j                  d      j                  t        j                        | j                        }|j                  || j
                  z  |d      }t        j                  j                  j                  |d      j                  |      }t        j                  j                  j                  |t        | j                        | j                        S )Nr   r   r8   rD   )ptraining)floatmasked_fillr   rV   r!   rR   viewtor   boolr-   
functionalsoftmaxtype_asrS   rv   )rd   rq   rr   r   attention_weights_floatr"   r#   attention_probss           r   _gen_attention_probsz'_EmformerAttention._gen_attention_probs   sG    #4"9"9";"9"E"EnF^F^_`Facgctct"u""1%""1%7#&=&B&B1dnnVWY[&\#&=&I&I&&q)33A699%**EtGXGX'# '>&B&B1t~~CUWXZ\&]#((--556MSU5V^^_pqxx""**?eDLL>Q\`\i\i*jjr   r   r   r   r   r   left_context_valc	           	      d   |j                  d      }	|j                  d      |j                  d      z   |j                  d      z   }
| j                  t        j                  |||g            }| j	                  t        j                  |||g            j                  dd      \  }}|||
t        j                  |      j                         z
  |j                  d      z
  }t        j                  |d |j                  d      |z    |||j                  d      |z   d  g      }t        j                  |d |j                  d      |z    |||j                  d      |z   d  g      }|||fD cg c]W  }|j                         j                  d|	| j                  z  | j                  | j                  z        j                  dd      Y c}\  }}}t        j                  || j                  z  |j                  dd            }t        ||||||      }| j!                  |||      }t        j                  ||      }|j"                  |	| j                  z  |
| j                  | j                  z  fk7  rt%        d      |j                  dd      j                         j                  |
|	| j                        }| j'                  |      }|j                  d      }|d |
|z
   }||
|z
  d  }| j(                  rt        j*                  |      }nt        j,                  |dd	      }||||fS c c}w )
Nr   r   r8   rh   rt   z+Computed attention has incorrect dimensionsi
   )minr   )r!   r_   r   rJ   r^   rj   r   r   
contiguousry   rR   rQ   	transposebmmr\   r'   r   r   AssertionErrorr`   rU   tanhclamp)rd   r   r   r   r   r   rr   r   r   r#   r"   queryrn   ro   r$   tensorreshaped_queryreshaped_keyreshaped_valuerq   r   r   	attentionoutput_right_context_memsrk   output_right_contextoutput_memss                              r   _forward_implz _EmformerAttention._forward_impl   s    NN1q!INN1$55QG !!%))]Iw,O"PQ **599dM95U+VW]]eflm]n
U',<,H*+eii.@.D.D.F*FVW*X'))D$))A,)DDE$		!'BBDEC IIFDIIaL+FFG$$))A,)DDFGE !#u-8
 $$RT^^);T^^t~~=]^hhijlmn8
4n "IInt||&C\E[E[\]_`Eab )M7GUY[kl 334E~Wcd IIo~>	??NNdnn,
 

 !!NOO''1-88:??1dnnU	 %)MM)$<! a89M1~;MN/N0B0DE**[1K++ksCK#[#u<<C8
s   0AL-c                 F    | j                  ||||||      \  }}}	}	||dd fS )ac  Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        Nrt   )r   )
rd   r   r   r   r   r   rr   outputr   r<   s
             r   forwardz_EmformerAttention.forward   s=    D %)$6$6y'=Zacgiw$x!Q{3B'''r   c           
         |j                  d      |j                  d      z   |j                  d      z   }|j                  d      |j                  d      z   |j                  d      z   |j                  d      z   }	t        j                  ||	      j                  t        j                  |j
                        }
d|
dd|j                  d      f<   | j                  ||||||
||      \  }}}}||||j                  d      |j                  d      z   d ||j                  d      |j                  d      z   d fS )a  Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        S: number of summary elements;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            left_context_key (torch.Tensor): left context attention key computed from preceding invocation.
            left_context_val (torch.Tensor): left context attention value computed from preceding invocation.

        Returns:
            (Tensor, Tensor, Tensor, and Tensor):
                Tensor
                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
                Tensor
                    attention key computed for left context and utterance.
                Tensor
                    attention value computed for left context and utterance.
        r   r   r
   Trt   N)r   r   )r!   r   rI   rz   r{   r
   r   )rd   r   r   r   r   r   r   r   	query_dimkey_dimrr   r   r   rn   ro   s                  r   inferz_EmformerAttention.infer   sA   R "&&q)INN1,==QO	$$Q')..*;;diilJM]MbMbcdMeeY8;;%**U^UeUe;f-1r>TYYq\>)**.*<*<-- += 	+
'S% 		!}11!4467$))A,!3!3A!6689	
 	
r   )        NF    ח)NN)__name__
__module____qualname____doc__r   rw   r   r{   r[   r   Tensorr   rp   r   r   r   jitexportr   __classcell__re   s   @r   rP   rP   L   s   
  ,0!"[[ [ 	[
 #5/[ [ [<ELL  u||]b]i]iOiIj k <<k k u||,	k
 
k6 4837G=<<G= G= ||	G=
 G= llG= G= #5<<0G= #5<<0G= 
u||U\\5<<E	FG=R#(<<#( #( ||	#(
 #( ll#( #( 
u||U\\)	*#(J YY;
<<;
 ;
 ||	;

 ;
 ll;
  ,,;
  ,,;
 
u||U\\5<<E	F;
 ;
r   rP   c                       e Zd ZdZ	 	 	 	 	 	 	 d%dededededededed	ed
ee   dedef fdZ	dedee
j                     dee
j                     fdZdee
j                     dee
j                  e
j                  e
j                  f   fdZde
j                  de
j                  dede
j                  dee
j                     dee
j                     fdZde
j                  de
j                  de
j                  de
j                  fdZde
j                  de
j                  dee
j                  e
j                  f   fdZde
j                  de
j                  de
j                  dee
j                  e
j                  f   fdZde
j                  de
j                  de
j                  de
j                  d ee
j                     dee
j                  e
j                  f   fd!Zde
j                  de
j                  de
j                  de
j                  deee
j                        dee
j                  e
j                  ee
j                     f   fd"Zde
j                  de
j                  de
j                  de
j                  d e
j                  dee
j                  e
j                  e
j                  f   fd#Ze
j0                  j2                  de
j                  de
j                  de
j                  deee
j                        de
j                  dee
j                  e
j                  ee
j                     e
j                  f   fd$       Z xZS )&_EmformerLayera$  Emformer layer that constitutes Emformer.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads.
        ffn_dim: (int): hidden layer dimension of feedforward network.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in feedforward network.
            Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_gain (float or None, optional): scale factor to apply when initializing
            attention module parameters. (Default: ``None``)
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
    rQ   rR   ffn_dimsegment_lengthrS   r(   left_context_lengthmax_memory_sizerT   rU   rV   c           
      N   t         |           t        ||||	|
|      | _        t        j
                  j                  |      | _        t        j
                  j                  ||d      | _	        t        |      }t        j
                  j                  t        j
                  j                  |      t        j
                  j                  ||      |t        j
                  j                  |      t        j
                  j                  ||      t        j
                  j                  |            | _        t        j
                  j                  |      | _        t        j
                  j                  |      | _        || _        || _        || _        || _        |dkD  | _        y )N)rQ   rR   rS   rT   rU   rV   Tkernel_sizestride	ceil_moder   )rZ   r[   rP   r   r   r-   DropoutrS   	AvgPool1d	memory_opr2   
Sequential	LayerNormr]   pos_fflayer_norm_inputlayer_norm_outputr   r   r   rQ   use_mem)rd   rQ   rR   r   r   rS   r(   r   r   rT   rU   rV   activation_modulere   s                r   r[   z_EmformerLayer.__init__R  s=    	+-#%
 xx''0++~im+n2:>hh))HHy)HHOOIw/HHW%HHOOGY/HHW%
 !& 2 29 =!&!3!3I!>#6 ,."&*r   r   r
   r   c                 j   t        j                  | j                  || j                  |      }t        j                  | j                  || j                  |      }t        j                  | j                  || j                  |      }t        j                  d|t         j
                  |      }||||gS )NrC   r   r   )r   rI   r   rQ   r   int32)rd   r   r
   empty_memoryr   r   past_lengths          r   _init_statez_EmformerLayer._init_state  s    {{4#7#7T^^\bc ;;t'?'?T^^djk ;;t'?'?T^^djkkk!Zu{{6R.0@+NNr   statec                 T   |d   d   d   j                         }t        | j                  |      }t        | j                  t	        j
                  || j                  z              }|d   | j                  |z
  d  }|d   | j                  |z
  d  }|d   | j                  |z
  d  }|||fS )N   r   r   r8   )r   r   r   r   r:   ceilr   )rd   r   r   past_left_context_lengthpast_mem_lengthpre_memslc_keylc_vals           r   _unpack_statez_EmformerLayer._unpack_state  s    Ahqk!n))+#&t'?'?#M d22DIIkDL_L_>_4`a8D00?BDEq$225MMOPq$225MMOP''r   next_knext_vupdate_lengthr   c                 b   t        j                  |d   |g      }t        j                  |d   |g      }t        j                  |d   |g      | j                   d  |d<   ||j                  d   | j                  z
  d  |d<   ||j                  d   | j                  z
  d  |d<   |d   |z   |d<   |S )Nr   r8   r   r   )r   rJ   r   r   r   )rd   r   r   r   r   r   new_knew_vs           r   _pack_statez_EmformerLayer._pack_state  s     		58V,-		58V,-99eAh-.0D0D/D/FGaQ$*B*BBDEaQ$*B*BBDEa8m+ar   	rc_outputr   r   c                     | j                  |      t        j                  ||g      z   }| j                  |      |z   }| j	                  |      }|S N)rS   r   rJ   r   r   )rd   r   r   r   results        r   _process_attention_outputz(_EmformerLayer._process_attention_output  sM     i(599mY5O+PPV$v-''/r   c                     | j                  t        j                  ||g            }||j                  d      d  |d |j                  d       fS Nr   )r   r   rJ   r!   )rd   r   r   r   s       r   _apply_pre_attention_layer_normz._EmformerLayer._apply_pre_attention_layer_norm  sY      00M9;U1VW]//2454}11!45
 	
r   c                 x    | j                  |||      }||j                  d      d  |d |j                  d       fS r   )r   r!   )rd   r   r   r   s       r   _apply_post_attention_ffnz(_EmformerLayer._apply_post_attention_ffn  sJ     229iW	++A.019=T}?Q?QRS?T3UUUr   r   rr   c                 L   |t        d      | j                  r4| j                  |j                  ddd            j                  ddd      }n:t	        j
                  d      j                  |j                  |j                        }| j                  ||||||      \  }}||fS )Nz;attention_mask must be not None when for_inference is Falser   r8   r   r   )r   r   r   r   r   rr   )
r1   r   r   permuter   emptyrz   r   r
   r   )	rd   r   r   r   r   rr   r   r   next_ms	            r   _apply_attention_forwardz'_EmformerLayer._apply_attention_forward  s     !Z[[<<nnY%6%6q!Q%?@HHAqQGkk!n''iooiFVFV'WG NN') + 
	6 &  r   c           	      &   |,| j                  |j                  d      |j                        }| j                  |      \  }}}| j                  r9| j                  |j                  ddd            j                  ddd      }	|	d d }	n:t        j                  d      j                  |j                  |j                        }	| j                  j                  ||||	|||      \  }
}}}| j                  |||j                  d      ||      }|
||fS )Nr   rC   r8   r   r   )r   r   r   r   r   r   r   )r   r!   r
   r   r   r   r   r   r   rz   r   r   r   r   )rd   r   r   r   r   r   r   r   r   r   r   r   r   r   s                 r   _apply_attention_inferz%_EmformerLayer._apply_attention_infer  s    =$$Y^^A%6y?O?O$PE#'#5#5e#< &&<<nnY%6%6q!Q%?@HHAqQGbqkGkk!n''iooiFVFV'WG,0NN,@,@'## -A -
)	666   1BD%P&%''r   c                     | j                  ||      \  }}| j                  |||||      \  }}	| j                  |||      \  }
}|
||	fS )a1  Forward pass for training.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
            attention_mask (torch.Tensor): attention mask for underlying attention module.

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        )r   r   r   )rd   r   r   r   r   rr   layer_norm_utterancelayer_norm_right_contextr   r   output_utterancer   s               r   r   z_EmformerLayer.forward  sr    H 00MJ	
 $!%!>!> $"
	; 261O1OPY[dfs1t..!5{BBr   c                     | j                  ||      \  }}| j                  |||||      \  }}	}
| j                  |||      \  }}|||
|	fS )a2  Forward pass for inference.

        B: batch size;
        D: feature dimension of each frame;
        T: number of utterance frames;
        R: number of right context frames;
        M: number of memory elements.

        Args:
            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``utterance``.
            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
            state (List[torch.Tensor] or None): list of tensors representing layer internal state
                generated in preceding invocation of ``infer``.
            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.

        Returns:
            (Tensor, Tensor, List[torch.Tensor], Tensor):
                Tensor
                    encoded utterance frames, with shape `(T, B, D)`.
                Tensor
                    updated right context frames, with shape `(R, B, D)`.
                List[Tensor]
                    list of tensors representing layer internal state
                    generated in current invocation of ``infer``.
                Tensor
                    updated memory elements, with shape `(M, B, D)`.
        )r   r   r   )rd   r   r   r   r   r   r   r   r   r   output_stater   r   s                r   r   z_EmformerLayer.infer  ss    R 00MJ	
 $/3/J/J '+CT50
,	; 261O1OPY[dfs1t..!5|[PPr   )r   r*   r   r   NFr   )r   r   r   r   r   rw   strr   r{   r[   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @r   r   r   ?  s   0  #$ ,0!",+,+ ,+ 	,+
 ,+ ,+ ,+ !,+ ,+ #5/,+ ,+ ,+\Oc O8ELL3I OdSXS_S_N` O(4#5 (%ell\a\h\h@h:i (  	
 ll ELL! 
ell	 	<<	 <<	 ||		
 
	

6;ll
	u||U\\)	*
VV27,,VOT||V	u||U\\)	*V!<<! ! ||	!
 ll! !.! 
u||U\\)	*!2(<<( ( ||	(
 ll( U\\*+( 
u||U\\4+==	>(8-C<<-C -C ||	-C
 ll-C -C 
u||U\\5<<7	8-C^ YY-Q<<-Q -Q ||	-Q
 U\\*+-Q ll-Q 
u||U\\4+=u||K	L-Q -Qr   r   c                   L    e Zd Z	 	 	 ddej                  j
                  dedededef
 fdZdej                  dej                  fd	Z	d
edede
e   fdZdej                  dej                  fdZdej                  dej                  deej                  ej                  f   fdZej                  j                   	 ddej                  dej                  dee
e
ej                           deej                  ej                  e
e
ej                        f   fd       Z xZS )_EmformerImplemformer_layersr   r   right_context_lengthr   c                     t         |           |dkD  | _        t        j                  j                  ||d      | _        || _        || _        || _	        || _
        || _        y )Nr   Tr   )rZ   r[   r   r   r-   r   r   r   r   r   r   r   )rd   r   r   r   r   r   re   s         r   r[   z_EmformerImpl.__init__P  sj     	&*++&! , 

  /#6 $8!,.r   rf   r   c                 ~   |j                   d   }t        j                  || j                  z
  | j                  z        }g }t        |dz
        D ]7  }|dz   | j                  z  }|| j                  z   }|j                  |||        9 |j                  ||| j                  z
  d         t        j                  |      S Nr   r   )	r   r:   r   r   r   r9   appendr   rJ   )rd   rf   r"   num_segsright_context_blocksseg_idxstartends           r   _gen_right_contextz _EmformerImpl._gen_right_contextf  s    KKN99a$";";;t?R?RRS!X\* 	:Gq[D$7$77E$333C ''eC(89	: 	##E!d.G.G*G*I$JKyy-..r   r   utterance_lengthc           
         t        j                  || j                  z        }| j                  }| j                  }||z  }||z   }t        || j                  z  |z
  d      }t        |dz   | j                  z  |      }	| j                  |z  }
| j                  r:t        || j                  z
  d      }|dz
  }|||z
  ||z
  |||
|z
  ||	|z
  ||	z
  g	}|S |||
|z
  ||	|z
  ||	z
  g}|S r   )	r:   r   r   r   r   r   r   r   r   )rd   r   r   r   rclcrc_startrc_end	seg_startseg_end	rc_lengthm_start
mem_lengthr?   s                 r   _gen_attention_mask_col_widthsz,_EmformerImpl._gen_attention_mask_col_widthsq  s(   99-0C0CCD&&%%R<B$"5"55:A>	w{d&9&99;KL--8	<<'D$8$88!<G!AJ'!W$F")# 7*
J*  F")# 7*J r   c                    |j                  d      }t        j                  || j                  z        }g }g }g }| j                  r<d}t        |      D cg c]  }|dv  }	}t        |      D cg c]  }|dv  }
}|||g}n"d}t        |      D cg c]  }|dv  }	}d }
||g}t        |      D ]  }| j                  ||      }t        ||	| j                  |j                        }|j                  |       t        ||	t        | j                  ||| j                  z  z
        |j                        }|j                  |       |
t        ||
d|j                        }|j                  |        dt        j                  |D cg c]  }t        j                  |       c}      z
  j                  t        j                        }|S c c}w c c}w c c}w c c}w )Nr   	   )r         )r  r	     )r   r  r   )r!   r:   r   r   r   r9   r  rN   r   r
   r   r   r   rJ   rz   r{   )rd   rf   r   r   rc_mask
query_masksummary_masknum_colsidxrc_q_cols_masks_cols_maskmasks_to_concatr   r?   rc_mask_blockquery_mask_blocksummary_mask_blockmaskrr   s                      r   _gen_attention_maskz!_EmformerImpl._gen_attention_mask  s    ::a=99-0C0CCD
<<H:?/J3cY.JNJ49(ODS3&=DKD&
LAOH7<XGcVmGNGK&
3OX 	8G<<WFVWJ5ND,E,Eu||M NN=)8''$w1D1D'DD   ./&%>z;XY[`[g[g%h"##$67+	8. eii_(UT4(UVVZZ[`[e[efG KD
 H6 )Vs   G
/GGG
r   c                    |j                  ddd      }| j                  |      }|d|j                  d      | j                  z
   }| j	                  |      }| j
                  r6| j                  |j                  ddd            j                  ddd      dd n9t        j                  d      j                  |j                  |j                        }|}| j                  D ]  } ||||||      \  }}} |j                  ddd      |fS )aG  Forward pass for training and non-streaming inference.

        B: batch size;
        T: max number of input frames in batch;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, T + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid utterance frames for i-th batch element in ``input``.

        Returns:
            (Tensor, Tensor):
                Tensor
                    output frames, with shape `(B, T, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        r   r   r8   Nrt   r   )r   r   r!   r   r  r   r   r   r   rz   r   r
   r   )	rd   rf   r   r   r   rr   r   r   layers	            r   r   z_EmformerImpl.forward  s   * aA&//6EEJJqMD,E,EEF	11)< || NN9,,Q156>>q!QGLQ""U\\"J 	
 )) 	fE*/PTVd*e'FM4	f~~aA&//r   statesc                 J   |j                  d      | j                  | j                  z   k7  r8t        d| j                  | j                  z    d|j                  d       d      |j	                  ddd      }|j                  d      | j                  z
  }||d }|d| }t        j                  || j                  z
  d      }| j                  r3| j                  |j	                  ddd            j	                  ddd      n9t        j                  d      j                  |j                  |j                  	      }|}	g }
t        | j                        D ]7  \  }}|j                  |	|||dn||   |      \  }	}}}|
j!                  |       9 |	j	                  ddd      ||
fS )
a  Forward pass for streaming inference.

        B: batch size;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): utterance frames right-padded with right context frames, with
                shape `(B, segment_length + right_context_length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            states (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation of ``infer``. (Default: ``None``)

        Returns:
            (Tensor, Tensor, List[List[Tensor]]):
                Tensor
                    output frames, with shape `(B, segment_length, D)`.
                Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
                List[List[Tensor]]
                    output states; list of lists of tensors representing internal state
                    generated in current invocation of ``infer``.
        r   zIPer configured segment_length and right_context_length, expected size of z# for dimension 1 of input, but got .r   r8   N)r   r   )r!   r   r   r1   r   r   r   r   r   r   rz   r   r
   	enumerater   r   r   )rd   rf   r   r  right_context_start_idxr   r   output_lengthsr   r   output_statesr=   r  r   s                 r   r   z_EmformerImpl.infer  s   > ::a=D//$2K2KKK&&*&9&9D<U<U&U%V W"ZZ]O1. 
 aA&"'**Q-$2K2K"K567223	Wt/H/H%HaP || NN9,,Q156>>q!QGQ""U\\"J 	
 24 )$*>*> ? 	/Iu8=F9,=95FM<   .	/ ~~aA&EEr   )r   r   r   r   )r   r   r   r   r-   
ModuleListr   r[   r   r   r   r  r  r   r   r   r   r   r   r   r   s   @r   r   r   O  sj   
 $%$% /,,/ / !	/
 "/ /,	/ 	/ 	/"c "S "UYZ]U^ "H. .%,, .`!0U\\ !0ELL !0U5<<Y^YeYeKeEf !0F YY
 6:	:F||:F :F d5<<012	:F
 
u||U\\4U\\0B+CC	D:F :Fr   r   c                   p     e Zd ZdZ	 	 	 	 	 	 	 	 ddededededededed	ed
ededee   dedef fdZ	 xZ
S )r   a_  Emformer architecture introduced in
    *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
    :cite:`shi2021emformer`.

    See Also:
        * :func:`~torchaudio.models.emformer_rnnt_model`,
          :func:`~torchaudio.models.emformer_rnnt_base`: factory functions.
        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipelines with pretrained model.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Emformer layer.
        ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        num_layers (int): number of Emformer layers to instantiate.
        segment_length (int): length of each input segment.
        dropout (float, optional): dropout probability. (Default: 0.0)
        activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        left_context_length (int, optional): length of left context. (Default: 0)
        right_context_length (int, optional): length of right context. (Default: 0)
        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)

    Examples:
        >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1)
        >>> input = torch.rand(128, 400, 512)  # batch, num_frames, feature_dim
        >>> lengths = torch.randint(1, 200, (128,))  # batch
        >>> output, lengths = emformer(input, lengths)
        >>> input = torch.rand(128, 5, 512)
        >>> lengths = torch.ones(128) * 5
        >>> output, lengths, states = emformer.infer(input, lengths, None)
    rQ   rR   r   r4   r   rS   r(   r   r   r   r3   rU   rV   c                     t        ||      }t        j                  j                  t	        |      D cg c]  }t        ||||||||
||   ||       c}      }t        |   ||||	|
       y c c}w )N)rS   r(   r   r   rT   rU   rV   )r   r   r   )r>   r   r-   r!  r9   r   rZ   r[   )rd   rQ   rR   r   r4   r   rS   r(   r   r   r   r3   rU   rV   weight_init_gainsr=   r   re   s                    r   r[   zEmformer.__init__K  s      33MzZ((-- "'z!2  "#)(;$3%6y%A +!-
$ 	 3!5+ 	 	
#s    A0)r   r*   r   r   r   r6   Fr   )r   r   r   r   r   rw   r   r   r{   r[   r   r   s   @r   r   r   &  s    "V  #$$% 4?!")
)
 )
 	)

 )
 )
 )
 )
 !)
 ")
 )
 %-SM)
 )
 )
 )
r   r   )r:   typingr   r   r   r   __all__r   r   r'   r   r-   Moduler2   r   rw   r>   r{   r
   rN   rP   r   r   r    r   r   <module>r)     s    ( (  ,ell u||  04||<< \\ \\	
 ,, u||, ell(As Auxx Agx} gRU gZ^_ghm_nZo g(S	(%)$Z(;>(HM(
\\(p
 p
fMQUXX__ MQ`TFEHHOO TFnN
} N
r   