
    ujhT'                     R   d dl mZmZ d dlZdgZdej
                  dej
                  fdZ G d dej                  j                        Z	 G d	 d
ej                  j                        Z
 G d dej                  j                        Z G d dej                  j                        Zy)    )OptionalTupleN	Conformerlengthsreturnc                    | j                   d   }t        t        j                  |       j	                               }t        j
                  || j                  | j                        j                  ||      | j                  d      k\  }|S )Nr   )devicedtype   )
shapeinttorchmaxitemaranger	   r
   expand	unsqueeze)r   
batch_size
max_lengthpadding_masks       Y/var/www/html/dev/engine/venv/lib/python3.12/site-packages/torchaudio/models/conformer.py_lengths_to_padding_maskr   	   sq    q!JUYYw',,./J<<
7>>W^^J			1	L     c                        e Zd ZdZ	 	 	 ddededededededd	f fd
Zdej                  dej                  fdZ
 xZS )_ConvolutionModulea  Conformer convolution module.

    Args:
        input_dim (int): input dimension.
        num_channels (int): number of depthwise convolution layer input channels.
        depthwise_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        bias (bool, optional): indicates whether to add bias term to each convolution layer. (Default: ``False``)
        use_group_norm (bool, optional): use GroupNorm rather than BatchNorm. (Default: ``False``)
    	input_dimnum_channelsdepthwise_kernel_sizedropoutbiasuse_group_normr   Nc                    t         |           |dz
  dz  dk7  rt        d      t        j                  j                  |      | _        t        j                  j                  t        j                  j                  |d|z  ddd|      t        j                  j                  d      t        j                  j                  |||d|dz
  dz  ||      |r!t        j                  j                  d|      nt        j                  j                  |      t        j                  j                         t        j                  j                  ||ddd|	      t        j                  j                  |            | _        y )
Nr      r   z<depthwise_kernel_size must be odd to achieve 'SAME' padding.)stridepaddingr    )dim)r$   r%   groupsr    )
num_groupsr   )kernel_sizer$   r%   r    )super__init__
ValueErrorr   nn	LayerNorm
layer_norm
SequentialConv1dGLU	GroupNormBatchNorm1dSiLUDropout
sequential)selfr   r   r   r   r    r!   	__class__s          r   r+   z_ConvolutionModule.__init__   sF    	!A%*a/[\\((,,Y7((--HHOOL    HHLLQLHHOO%.2q8#    HH!,G%%l3HHMMOHHOO   HHW%? 
r   inputc                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      S )z
        Args:
            input (torch.Tensor): with shape `(B, T, D)`.

        Returns:
            torch.Tensor: output, with shape `(B, T, D)`.
        r   r#   )r/   	transposer7   )r8   r:   xs      r   forwardz_ConvolutionModule.forwardM   sA     OOE"KK1OOA{{1a  r           FF)__name__
__module____qualname____doc__r   floatboolr+   r   Tensorr>   __classcell__r9   s   @r   r   r      sx    	  $-
-
 -
  #	-

 -
 -
 -
 
-
^!U\\ !ell !r   r   c            	       n     e Zd ZdZd
dedededdf fdZdej                  dej                  fd	Z	 xZ
S )_FeedForwardModulezPositionwise feed forward layer.

    Args:
        input_dim (int): input dimension.
        hidden_dim (int): hidden dimension.
        dropout (float, optional): dropout probability. (Default: 0.0)
    r   
hidden_dimr   r   Nc                    t         |           t        j                  j	                  t        j                  j                  |      t        j                  j                  ||d      t        j                  j                         t        j                  j                  |      t        j                  j                  ||d      t        j                  j                  |            | _	        y )NT)r    )
r*   r+   r   r-   r0   r.   Linearr5   r6   r7   )r8   r   rL   r   r9   s       r   r+   z_FeedForwardModule.__init__d   s    ((--HHy)HHOOIzO=HHMMOHHW%HHOOJ	O=HHW%
r   r:   c                 $    | j                  |      S )z
        Args:
            input (torch.Tensor): with shape `(*, D)`.

        Returns:
            torch.Tensor: output, with shape `(*, D)`.
        )r7   )r8   r:   s     r   r>   z_FeedForwardModule.forwardo   s     u%%r   )r@   )rA   rB   rC   rD   r   rE   r+   r   rG   r>   rH   rI   s   @r   rK   rK   [   sE    	
# 	
3 	
 	
QU 	
&U\\ &ell &r   rK   c                        e Zd ZdZ	 	 	 ddededededededed	d
f fdZdej                  d	ej                  fdZ
dej                  deej                     d	ej                  fdZ xZS )ConformerLayera  Conformer layer that constitutes Conformer.

    Args:
        input_dim (int): input dimension.
        ffn_dim (int): hidden layer dimension of feedforward network.
        num_attention_heads (int): number of attention heads.
        depthwise_conv_kernel_size (int): kernel size of depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)
    r   ffn_dimnum_attention_headsdepthwise_conv_kernel_sizer   r!   convolution_firstr   Nc                    t         |           t        |||      | _        t        j
                  j                  |      | _        t        j
                  j                  |||      | _	        t        j
                  j                  |      | _        t        ||||d|      | _        t        |||      | _        t        j
                  j                  |      | _        || _        y )N)r   T)r   r   r   r   r    r!   )r*   r+   rK   ffn1r   r-   r.   self_attn_layer_normMultiheadAttention	self_attnr6   self_attn_dropoutr   conv_moduleffn2final_layer_normrU   )	r8   r   rR   rS   rT   r   r!   rU   r9   s	           r   r+   zConformerLayer.__init__   s     	&y'7K	$)HH$6$6y$A!44Y@S]d4e!&!1!1'!:-""<)
 'y'7K	 % 2 29 =!2r   r:   c                 ~    |}|j                  dd      }| j                  |      }|j                  dd      }||z   }|S )Nr   r   )r<   r\   )r8   r:   residuals      r   _apply_convolutionz!ConformerLayer._apply_convolution   sF    1%  '1%5 r   key_padding_maskc                    |}| j                  |      }|dz  |z   }| j                  r| j                  |      }|}| j                  |      }| j	                  ||||d      \  }}| j                  |      }||z   }| j                  s| j                  |      }|}| j                  |      }|dz  |z   }| j                  |      }|S )a
  
        Args:
            input (torch.Tensor): input, with shape `(T, B, D)`.
            key_padding_mask (torch.Tensor or None): key padding mask to use in self attention layer.

        Returns:
            torch.Tensor: output, with shape `(T, B, D)`.
        g      ?F)querykeyvaluerb   need_weights)rW   rU   ra   rX   rZ   r[   r]   r^   )r8   r:   rb   r`   r=   _s         r   r>   zConformerLayer.forward   s     IIeGh!!''*A%%a(~~-  
1 ""1%L%%''*AIIaLGh!!!$r   r?   )rA   rB   rC   rD   r   rE   rF   r+   r   rG   ra   r   r>   rH   rI   s   @r   rQ   rQ   z   s    ( $"'33 3 !	3
 %(3 3 3  3 
3>  $U\\ $Xell=S $X]XdXd $r   rQ   c                        e Zd ZdZ	 	 	 ddededededededed	ef fd
Zdej                  dej                  de
ej                  ej                  f   fdZ xZS )r   a(  Conformer architecture introduced in
    *Conformer: Convolution-augmented Transformer for Speech Recognition*
    :cite:`gulati2020conformer`.

    Args:
        input_dim (int): input dimension.
        num_heads (int): number of attention heads in each Conformer layer.
        ffn_dim (int): hidden layer dimension of feedforward networks.
        num_layers (int): number of Conformer layers to instantiate.
        depthwise_conv_kernel_size (int): kernel size of each Conformer layer's depthwise convolution layer.
        dropout (float, optional): dropout probability. (Default: 0.0)
        use_group_norm (bool, optional): use ``GroupNorm`` rather than ``BatchNorm1d``
            in the convolution module. (Default: ``False``)
        convolution_first (bool, optional): apply the convolution module ahead of
            the attention module. (Default: ``False``)

    Examples:
        >>> conformer = Conformer(
        >>>     input_dim=80,
        >>>     num_heads=4,
        >>>     ffn_dim=128,
        >>>     num_layers=4,
        >>>     depthwise_conv_kernel_size=31,
        >>> )
        >>> lengths = torch.randint(1, 400, (10,))  # (batch,)
        >>> input = torch.rand(10, int(lengths.max()), input_dim)  # (batch, num_frames, input_dim)
        >>> output = conformer(input, lengths)
    r   	num_headsrR   
num_layersrT   r   r!   rU   c	                     t         
|           t        j                  j	                  t        |      D 	cg c]  }	t        |||||||       c}	      | _        y c c}	w )N)r   r!   rU   )r*   r+   r   r-   
ModuleListrangerQ   conformer_layers)r8   r   rj   rR   rk   rT   r   r!   rU   rh   r9   s             r   r+   zConformer.__init__   se     	 % 3 3 z*  .##1&7!
s   Ar:   r   r   c                     t        |      }|j                  dd      }| j                  D ]  } |||      } |j                  dd      |fS )aX  
        Args:
            input (torch.Tensor): with shape `(B, T, input_dim)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor)
                torch.Tensor
                    output frames, with shape `(B, T, input_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output frames.
        r   r   )r   r<   ro   )r8   r:   r   encoder_padding_maskr=   layers         r   r>   zConformer.forward  sW      8@OOAq!** 	/Ea-.A	/{{1a '))r   r?   )rA   rB   rC   rD   r   rE   rF   r+   r   rG   r   r>   rH   rI   s   @r   r   r      s    H $"'

 
 	

 
 %(
 
 
  
8*U\\ *ELL *U5<<Y^YeYeKeEf *r   )typingr   r   r   __all__rG   r   r-   Moduler   rK   rQ   r    r   r   <module>rw      s    "  -ell u|| F! F!R& &>ZUXX__ ZzN* N*r   