
    |hz                     
   d Z ddlZddlmZmZ ddlZddlmZ ddlmc m	Z
 ddlmZmZ ddlmZ ddlmZmZmZ dZ G d	 d
ej*                        Z G d de      Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Z G d dej*                        Zy)zTransformer modules.    N)ListOptional)	constant_xavier_uniform_   )Conv)_get_clonesinverse_sigmoid#multi_scale_deformable_attn_pytorch)
TransformerEncoderLayerTransformerLayerTransformerBlockMLPBlockLayerNorm2dAIFIDeformableTransformerDecoder!DeformableTransformerDecoderLayerMSDeformAttnMLPc                       e Zd ZdZddd ej
                         dfdededed	ed
ej                  de	f fdZ
eddej                  deej                     dej                  fd       Z	 	 	 ddej                  deej                     deej                     deej                     dej                  f
dZ	 	 	 ddej                  deej                     deej                     deej                     dej                  f
dZ	 	 	 ddej                  deej                     deej                     deej                     dej                  f
dZ xZS )r   a  
    A single layer of the transformer encoder.

    This class implements a standard transformer encoder layer with multi-head attention and feedforward network,
    supporting both pre-normalization and post-normalization configurations.

    Attributes:
        ma (nn.MultiheadAttention): Multi-head attention module.
        fc1 (nn.Linear): First linear layer in the feedforward network.
        fc2 (nn.Linear): Second linear layer in the feedforward network.
        norm1 (nn.LayerNorm): Layer normalization after attention.
        norm2 (nn.LayerNorm): Layer normalization after feedforward network.
        dropout (nn.Dropout): Dropout layer for the feedforward network.
        dropout1 (nn.Dropout): Dropout layer after attention.
        dropout2 (nn.Dropout): Dropout layer after feedforward network.
        act (nn.Module): Activation function.
        normalize_before (bool): Whether to apply normalization before attention and feedforward.
                  Fc1cm	num_headsdropoutactnormalize_beforec                    t         |           ddlm} |st	        d      t        j                  |||d      | _        t        j                  ||      | _	        t        j                  ||      | _
        t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        t        j                  |      | _        || _        || _        y)a  
        Initialize the TransformerEncoderLayer with specified parameters.

        Args:
            c1 (int): Input dimension.
            cm (int): Hidden dimension in the feedforward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            normalize_before (bool): Whether to apply normalization before attention and feedforward.
           )	TORCH_1_9z]TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).T)r   batch_firstN)super__init__utils.torch_utilsr"   ModuleNotFoundErrornnMultiheadAttentionmaLinearfc1fc2	LayerNormnorm1norm2Dropoutr   dropout1dropout2r   r   )	selfr   r   r   r   r   r   r"   	__class__s	           a/var/www/html/test/engine/venv/lib/python3.12/site-packages/ultralytics/nn/modules/transformer.pyr%   z TransformerEncoderLayer.__init__1   s    ( 	2%o  ''IwTXY99R$99R$\\"%
\\"%
zz'*

7+

7+ 0    tensorposreturnc                     || S | |z   S )z2Add position embeddings to the tensor if provided. r8   r9   s     r6   with_pos_embedz&TransformerEncoderLayer.with_pos_embedZ        v6&3,6r7   srcsrc_masksrc_key_padding_maskc           	      l   | j                  ||      x}}| j                  |||||      d   }|| j                  |      z   }| j                  |      }| j	                  | j                  | j                  | j                  |                        }|| j                  |      z   }| j                  |      S )a  
        Perform forward pass with post-normalization.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after attention and feedforward.
        value	attn_maskkey_padding_maskr   )
r>   r*   r2   r/   r-   r   r   r,   r3   r0   )r4   r@   rA   rB   r9   qksrc2s           r6   forward_postz$TransformerEncoderLayer.forward_post_   s    & ##C--Awwq!3(MawbcdeDMM$''jjoxxTXXdhhsm%<=>DMM$''zz#r7   c           	      l   | j                  |      }| j                  ||      x}}| j                  |||||      d   }|| j                  |      z   }| j	                  |      }| j                  | j                  | j                  | j                  |                        }|| j                  |      z   S )a  
        Perform forward pass with pre-normalization.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after attention and feedforward.
        rD   r   )
r/   r>   r*   r2   r0   r-   r   r   r,   r3   )r4   r@   rA   rB   r9   rJ   rH   rI   s           r6   forward_prez#TransformerEncoderLayer.forward_prez   s    & zz###D#..Awwq!48NbwcdefDMM$''zz#xxTXXdhhtn%=>?T]]4(((r7   c                 j    | j                   r| j                  ||||      S | j                  ||||      S )a  
        Forward propagate the input through the encoder module.

        Args:
            src (torch.Tensor): Input tensor.
            src_mask (torch.Tensor, optional): Mask for the src sequence.
            src_key_padding_mask (torch.Tensor, optional): Mask for the src keys per batch.
            pos (torch.Tensor, optional): Positional encoding.

        Returns:
            (torch.Tensor): Output tensor after transformer encoder layer.
        )r   rM   rK   )r4   r@   rA   rB   r9   s        r6   forwardzTransformerEncoderLayer.forward   s=    &   ##C3GMM  h0DcJJr7   NNNN)__name__
__module____qualname____doc__r(   GELUintfloatModuleboolr%   staticmethodtorchTensorr   r>   rK   rM   rO   __classcell__r5   s   @r6   r   r      s   ,  !&'1'1 '1 	'1
 '1 YY'1 '1R 7u|| 7(5<<2H 7TYT`T` 7 7 ,07;&*\\ 5<<( 'u||4	
 ell# 
< ,07;&*)\\) 5<<() 'u||4	)
 ell#) 
)< ,07;&*K\\K 5<<(K 'u||4	K
 ell#K 
Kr7   r   c                        e Zd ZdZddd ej
                         dfdededed	ed
ej                  de	f fdZ
dej                  dej                  f fdZe	 ddededededej                  f
d       Z xZS )r   z
    AIFI transformer layer for 2D data with positional embeddings.

    This class extends TransformerEncoderLayer to work with 2D feature maps by adding 2D sine-cosine positional
    embeddings and handling the spatial dimensions appropriately.
    r   r   r   Fr   r   r   r   r   r   c                 .    t         |   ||||||       y)a  
        Initialize the AIFI instance with specified parameters.

        Args:
            c1 (int): Input dimension.
            cm (int): Hidden dimension in the feedforward network.
            num_heads (int): Number of attention heads.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            normalize_before (bool): Whether to apply normalization before attention and feedforward.
        N)r$   r%   )r4   r   r   r   r   r   r   r5   s          r6   r%   zAIFI.__init__   s    ( 	RGS:JKr7   xr:   c                 d   |j                   dd \  }}}| j                  |||      }t        |   |j	                  d      j                  ddd      |j                  |j                  |j                              }|j                  ddd      j                  d|||g      j                         S )z
        Forward pass for the AIFI transformer layer.

        Args:
            x (torch.Tensor): Input tensor with shape [B, C, H, W].

        Returns:
            (torch.Tensor): Output tensor with shape [B, C, H, W].
        r   N   r   )devicedtype)r9   )shape"build_2d_sincos_position_embeddingr$   rO   flattenpermutetore   rf   view
contiguous)r4   rb   chw	pos_embedr5   s         r6   rO   zAIFI.forward   s     ''!"+1a;;Aq!D	GOAIIaL00Aq9y||STS[S[cdcjcj|?kOlyyAq!&&Aq!}5@@BBr7   rq   rp   	embed_dimtemperaturec                    |dz  dk(  sJ d       t        j                  | t         j                        }t        j                  |t         j                        }t        j                  ||d      \  }}|dz  }t        j                  |t         j                        |z  }d||z  z  }|j	                         d   |d	   z  }|j	                         d   |d	   z  }	t        j
                  t        j                  |      t        j                  |      t        j                  |	      t        j                  |	      gd
      d	   S )a  
        Build 2D sine-cosine position embedding.

        Args:
            w (int): Width of the feature map.
            h (int): Height of the feature map.
            embed_dim (int): Embedding dimension.
            temperature (float): Temperature for the sine/cosine functions.

        Returns:
            (torch.Tensor): Position embedding with shape [1, embed_dim, h*w].
           r   zHEmbed dimension must be divisible by 4 for 2D sin-cos position embeddingrf   ij)indexingg      ?.NNr   )r\   arangefloat32meshgridrj   catsincos)
rq   rp   rs   rt   grid_wgrid_hpos_dimomegaout_wout_hs
             r6   ri   z'AIFI.build_2d_sincos_position_embedding   s     1}!m#mm!au}}5au}}5Fq.WEMM:WD{E)* +eDk9 +eDk9yy%))E*EIIe,<eii>NPUPYPYZ_P`acdefjkkr7   )   g     @)rR   rS   rT   rU   r(   rV   rW   rX   rY   rZ   r%   r\   r]   rO   r[   ri   r^   r_   s   @r6   r   r      s      !&LL L 	L
 L YYL L,C C%,, C  CJlll#&l;@l	l lr7   r   c                   d     e Zd ZdZdedef fdZdej                  dej                  fdZ xZ	S )r   zeTransformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance).ro   r   c                 |   t         |           t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||      | _        t        j                  ||d      | _	        t        j                  ||d      | _
        y)z
        Initialize a self-attention mechanism using linear transformations and multi-head attention.

        Args:
            c (int): Input and output channel dimension.
            num_heads (int): Number of attention heads.
        F)bias)rs   r   N)r$   r%   r(   r+   rH   rI   vr)   r*   r,   r-   )r4   ro   r   r5   s      r6   r%   zTransformerLayer.__init__   s     	1ae,1ae,1ae,''!yI99Q.99Q.r7   rb   r:   c                     | j                  | j                  |      | j                  |      | j                  |            d   |z   }| j	                  | j                  |            |z   S )z
        Apply a transformer block to the input x and return the output.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after transformer layer.
        r   )r*   rH   rI   r   r-   r,   r4   rb   s     r6   rO   zTransformerLayer.forward  sT     GGDFF1Itvvay$&&)4Q7!;xx$q((r7   
rR   rS   rT   rU   rW   r%   r\   r]   rO   r^   r_   s   @r6   r   r      s4    o/# /# / ) )%,, )r7   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z xZ	S )
r   a  
    Vision Transformer block based on https://arxiv.org/abs/2010.11929.

    This class implements a complete transformer block with optional convolution layer for channel adjustment,
    learnable position embedding, and multiple transformer layers.

    Attributes:
        conv (Conv, optional): Convolution layer if input and output channels differ.
        linear (nn.Linear): Learnable position embedding.
        tr (nn.Sequential): Sequential container of transformer layers.
        c2 (int): Output channel dimension.
    r   c2r   
num_layersc                     t         |           d| _        |k7  rt        |      | _        t	        j
                        | _        t	        j                  fdt        |      D         | _	        | _
        y)aU  
        Initialize a Transformer module with position embedding and specified number of heads and layers.

        Args:
            c1 (int): Input channel dimension.
            c2 (int): Output channel dimension.
            num_heads (int): Number of attention heads.
            num_layers (int): Number of transformer layers.
        Nc              3   6   K   | ]  }t                y wrP   )r   ).0_r   r   s     r6   	<genexpr>z,TransformerBlock.__init__.<locals>.<genexpr>7  s     !]a"22y"A!]s   )r$   r%   convr   r(   r+   linear
Sequentialrangetrr   )r4   r   r   r   r   r5   s     `` r6   r%   zTransformerBlock.__init__(  s`     		8RDIiiB'--!]5Q[K\!]^r7   rb   r:   c                 B   | j                   | j                  |      }|j                  \  }}}}|j                  d      j                  ddd      }| j	                  || j                  |      z         j                  ddd      j                  || j                  ||      S )z
        Forward propagate the input through the transformer block.

        Args:
            x (torch.Tensor): Input tensor with shape [b, c1, w, h].

        Returns:
            (torch.Tensor): Output tensor with shape [b, c2, w, h].
        rd   r   r   )r   rh   rj   rk   r   r   reshaper   )r4   rb   br   rq   rp   ps          r6   rO   zTransformerBlock.forward:  s     99 		!AWW
1aIIaL  Aq)wwq4;;q>)*221a;CCAtwwPQSTUUr7   r   r_   s   @r6   r   r     sG    3 C C S $V V%,, Vr7   r   c                   |     e Zd ZdZej
                  fdedef fdZdej                  dej                  fdZ
 xZS )r   z+A single block of a multi-layer perceptron.embedding_dimmlp_dimc                     t         |           t        j                  ||      | _        t        j                  ||      | _         |       | _        y)a   
        Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function.

        Args:
            embedding_dim (int): Input and output dimension.
            mlp_dim (int): Hidden dimension.
            act (nn.Module): Activation function.
        N)r$   r%   r(   r+   lin1lin2r   )r4   r   r   r   r5   s       r6   r%   zMLPBlock.__init__N  s=     	IImW5	IIg}5	5r7   rb   r:   c                 `    | j                  | j                  | j                  |                  S )z
        Forward pass for the MLPBlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after MLP block.
        )r   r   r   r   s     r6   rO   zMLPBlock.forward\  s$     yy$))A,/00r7   )rR   rS   rT   rU   r(   rV   rW   r%   r\   r]   rO   r^   r_   s   @r6   r   r   K  s;    5=?WW c C 
1 
1%,, 
1r7   r   c                        e Zd ZdZej
                  dfdededededef
 fdZd	e	j                  d
e	j                  fdZ xZS )r   a  
    A simple multi-layer perceptron (also called FFN).

    This class implements a configurable MLP with multiple linear layers, activation functions, and optional
    sigmoid output activation.

    Attributes:
        num_layers (int): Number of layers in the MLP.
        layers (nn.ModuleList): List of linear layers.
        sigmoid (bool): Whether to apply sigmoid to the output.
        act (nn.Module): Activation function.
    F	input_dim
hidden_dim
output_dimr   sigmoidc                     t         |           || _        |g|dz
  z  }t        j                  d t        |g|z   ||gz         D              | _        || _         |       | _        y)a  
        Initialize the MLP with specified input, hidden, output dimensions and number of layers.

        Args:
            input_dim (int): Input dimension.
            hidden_dim (int): Hidden dimension.
            output_dim (int): Output dimension.
            num_layers (int): Number of layers.
            act (nn.Module): Activation function.
            sigmoid (bool): Whether to apply sigmoid to the output.
        r   c              3   N   K   | ]  \  }}t        j                  ||        y wrP   )r(   r+   )r   nrI   s      r6   r   zMLP.__init__.<locals>.<genexpr>  s     #g1BIIaO#gs   #%N)	r$   r%   r   r(   
ModuleListziplayersr   r   )	r4   r   r   r   r   r   r   rp   r5   s	           r6   r%   zMLP.__init__w  se     	$LJN+mm#gYKRSOUVZdYeUe@f#gg5r7   rb   r:   c                    t        | j                        D ]J  \  }}|| j                  dz
  k  r+ t        | dt	        j
                                ||            n ||      }L t        | dd      r|j                         S |S )z
        Forward pass for the entire MLP.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after MLP.
        r   r   r   F)	enumerater   r   getattrr(   ReLUr   )r4   rb   ilayers       r6   rO   zMLP.forward  sx     "$++. 	cHAu=>STAT=T/eRWWY/a9Z_`aZbA	c%dIu=qyy{D1Dr7   )rR   rS   rT   rU   r(   r   rW   rZ   r%   r\   r]   rO   r^   r_   s   @r6   r   r   i  sc     VXU\U\ns*-;>LOgk*E E%,, Er7   r   c                   f     e Zd ZdZddedef fdZdej                  dej                  fdZ	 xZ
S )	r   au  
    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.

    This class implements layer normalization for 2D feature maps, normalizing across the channel dimension
    while preserving spatial dimensions.

    Attributes:
        weight (nn.Parameter): Learnable scale parameter.
        bias (nn.Parameter): Learnable bias parameter.
        eps (float): Small constant for numerical stability.

    References:
        https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
        https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py
    num_channelsepsc                     t         |           t        j                  t	        j
                  |            | _        t        j                  t	        j                  |            | _        || _	        y)z
        Initialize LayerNorm2d with the given parameters.

        Args:
            num_channels (int): Number of channels in the input.
            eps (float): Small constant for numerical stability.
        N)
r$   r%   r(   	Parameterr\   onesweightzerosr   r   )r4   r   r   r5   s      r6   r%   zLayerNorm2d.__init__  sI     	ll5::l#;<LL\!:;	r7   rb   r:   c                    |j                  dd      }||z
  j                  d      j                  dd      }||z
  t        j                  || j                  z         z  }| j
                  ddddf   |z  | j                  ddddf   z   S )z
        Perform forward pass for 2D layer normalization.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Normalized output tensor.
        r   Tkeepdimrd   N)meanpowr\   sqrtr   r   r   )r4   rb   uss       r6   rO   zLayerNorm2d.forward  s     FF1dF#UKKN40UejjTXX..{{1dD=)A-		!T4-0HHHr7   )gư>)rR   rS   rT   rU   rW   rX   r%   r\   r]   rO   r^   r_   s   @r6   r   r     s9     S u I I%,, Ir7   r   c                        e Zd ZdZddedededef fdZd Z	 ddej                  d	ej                  d
ej                  de	de
ej                     dej                  fdZ xZS )r   a  
    Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.

    This module implements multiscale deformable attention that can attend to features at multiple scales
    with learnable sampling locations and attention weights.

    Attributes:
        im2col_step (int): Step size for im2col operations.
        d_model (int): Model dimension.
        n_levels (int): Number of feature levels.
        n_heads (int): Number of attention heads.
        n_points (int): Number of sampling points per attention head per feature level.
        sampling_offsets (nn.Linear): Linear layer for generating sampling offsets.
        attention_weights (nn.Linear): Linear layer for generating attention weights.
        value_proj (nn.Linear): Linear layer for projecting values.
        output_proj (nn.Linear): Linear layer for projecting output.

    References:
        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
    d_modeln_levelsn_headsn_pointsc                    t         |           ||z  dk7  rt        d| d|       ||z  }||z  |k(  sJ d       d| _        || _        || _        || _        || _        t        j                  |||z  |z  dz        | _
        t        j                  |||z  |z        | _        t        j                  ||      | _        t        j                  ||      | _        | j                          y)aG  
        Initialize MSDeformAttn with the given parameters.

        Args:
            d_model (int): Model dimension.
            n_levels (int): Number of feature levels.
            n_heads (int): Number of attention heads.
            n_points (int): Number of sampling points per attention head per feature level.
        r   z.d_model must be divisible by n_heads, but got z and z(`d_model` must be divisible by `n_heads`@   rd   N)r$   r%   
ValueErrorim2col_stepr   r   r   r   r(   r+   sampling_offsetsattention_weights
value_projoutput_proj_reset_parameters)r4   r   r   r   r   _d_per_headr5   s         r6   r%   zMSDeformAttn.__init__  s     	W!MgYV[\c[deff(W$/[1[[/   "		'7X3E3PST3T U!#7Gh4F4Q!R))GW599Wg6 r7   c                 H   t        | j                  j                  j                  d       t	        j
                  | j                  t        j                        dt        j                  z  | j                  z  z  }t	        j                  |j                         |j                         gd      }||j                         j                  dd      d   z  j                  | j                  ddd	      j!                  d| j"                  | j$                  d      }t'        | j$                        D ]  }|d
d
d
d
|d
d
fxx   |dz   z  cc<    t	        j(                         5  t+        j,                  |j                  d            | j                  _        d
d
d
       t        | j0                  j                  j                  d       t        | j0                  j.                  j                  d       t3        | j4                  j                  j                         t        | j4                  j.                  j                  d       t3        | j6                  j                  j                         t        | j6                  j.                  j                  d       y
# 1 sw Y   xY w)zReset module parameters.r   rw   g       @rg   Tr   r   r   rd   N)r   r   r   datar\   r{   r   r|   mathpistackr   r   absmaxrm   repeatr   r   r   no_gradr(   r   r   r   r   r   r   )r4   thetas	grid_initr   s       r6   r   zMSDeformAttn._reset_parameters  s   $''..33S9dll%--@C$''MTXT`T`D`aKKvzz| <bA	,,R,>qAAT$,,1a(VAt}}dmmQ7 	
 t}}% 	+AaAqj!QU*!	+]]_ 	J)+innR6H)ID!!&	J$((//44c:$((--22C8..334$//&&++S1((//445$""'',,c2	J 	Js    4JJ!query
refer_bboxrE   value_shapes
value_maskr:   c           	         |j                   dd \  }}|j                   d   }t        d |D              |k(  sJ | j                  |      }||j                  |d   t	        d            }|j                  ||| j                  | j                  | j                  z        }| j                  |      j                  ||| j                  | j                  | j                  d      }	| j                  |      j                  ||| j                  | j                  | j                  z        }
t        j                  |
d      j                  ||| j                  | j                  | j                        }
|j                   d   }|dk(  rdt        j                  ||j                   |j"                        j%                  d      }|	|ddddddddf   z  }|ddddddddddf   |z   }nQ|d	k(  r=|	| j                  z  |ddddddddddf   z  d
z  }|ddddddddddf   |z   }nt'        d| d      t)        ||||
      }| j+                  |      S )a  
        Perform forward pass for multiscale deformable attention.

        Args:
            query (torch.Tensor): Query tensor with shape [bs, query_length, C].
            refer_bbox (torch.Tensor): Reference bounding boxes with shape [bs, query_length, n_levels, 2],
                range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area.
            value (torch.Tensor): Value tensor with shape [bs, value_length, C].
            value_shapes (list): List with shape [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})].
            value_mask (torch.Tensor, optional): Mask tensor with shape [bs, value_length], True for non-padding
                elements, False for padding elements.

        Returns:
            (torch.Tensor): Output tensor with shape [bs, Length_{query}, C].

        References:
            https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
        Nrd   r   c              3   2   K   | ]  }|d    |d   z    yw)r   r   Nr<   )r   r   s     r6   r   z'MSDeformAttn.forward.<locals>.<genexpr>/  s     511Q4!A$;5s   rz   r   rg   )rf   re   rv   g      ?z5Last dim of reference_points must be 2 or 4, but got .)rh   sumr   masked_fillrX   rm   r   r   r   r   r   r   Fsoftmaxr\   	as_tensorrf   re   flipr   r   r   )r4   r   r   rE   r   r   bslen_qlen_vr   r   
num_pointsoffset_normalizeraddsampling_locationsoutputs                   r6   rO   zMSDeformAttn.forward  sH   4 KKO	EA555>>>&!%%j&;U1XFE

2udllDLLDLL4PQ007<<RVZVcVceierertuv 2259>>r5$,,X\XeXehlhuhuXuvII&7<AA"eT\\[_[h[hjnjwjwx%%b)
? %EKKX]XdXd e j jkm n"%6tT4DRS7S%TTC!+Aq$4,B!Cc!I1_"T]]2Z1dAtUVUW@W5XX[^^C!+Aq$4!,C!Ds!JTU_T``abcc4ULJ\^op''r7   )r   rv   r   rv   rP   )rR   rS   rT   rU   rW   r%   r   r\   r]   r   r   rO   r^   r_   s   @r6   r   r     s    *! !S !s !Z] !>36 .21(||1( LL1( ||	1(
 1( U\\*1( 
1(r7   r   c                       e Zd ZdZdddd ej
                         ddfdeded	ed
edej                  dedef fdZ	e
dej                  deej                     dej                  fd       Zdej                  dej                  fdZ	 	 	 ddej                  dej                  dej                  dedeej                     deej                     deej                     dej                  fdZ xZS )r   a;  
    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.

    This class implements a single decoder layer with self-attention, cross-attention using multiscale deformable
    attention, and a feedforward network.

    Attributes:
        self_attn (nn.MultiheadAttention): Self-attention module.
        dropout1 (nn.Dropout): Dropout after self-attention.
        norm1 (nn.LayerNorm): Layer normalization after self-attention.
        cross_attn (MSDeformAttn): Cross-attention module.
        dropout2 (nn.Dropout): Dropout after cross-attention.
        norm2 (nn.LayerNorm): Layer normalization after cross-attention.
        linear1 (nn.Linear): First linear layer in the feedforward network.
        act (nn.Module): Activation function.
        dropout3 (nn.Dropout): Dropout in the feedforward network.
        linear2 (nn.Linear): Second linear layer in the feedforward network.
        dropout4 (nn.Dropout): Dropout after the feedforward network.
        norm3 (nn.LayerNorm): Layer normalization after the feedforward network.

    References:
        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
        https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
    r   r   i   r   rv   r   r   d_ffnr   r   r   r   c                 h   t         |           t        j                  |||      | _        t        j
                  |      | _        t        j                  |      | _        t        ||||      | _
        t        j
                  |      | _        t        j                  |      | _        t        j                  ||      | _        || _        t        j
                  |      | _        t        j                  ||      | _        t        j
                  |      | _        t        j                  |      | _        y)a  
        Initialize the DeformableTransformerDecoderLayer with the given parameters.

        Args:
            d_model (int): Model dimension.
            n_heads (int): Number of attention heads.
            d_ffn (int): Dimension of the feedforward network.
            dropout (float): Dropout probability.
            act (nn.Module): Activation function.
            n_levels (int): Number of feature levels.
            n_points (int): Number of sampling points.
        )r   N)r$   r%   r(   r)   	self_attnr1   r2   r.   r/   r   
cross_attnr3   r0   r+   linear1r   dropout3linear2dropout4norm3)	r4   r   r   r   r   r   r   r   r5   s	           r6   r%   z*DeformableTransformerDecoderLayer.__init__a  s    , 	 ..wQ

7+\\'*
 'w'8L

7+\\'*
 yy%0

7+yy0

7+\\'*
r7   r8   r9   r:   c                     || S | |z   S )z;Add positional embeddings to the input tensor, if provided.r<   r=   s     r6   r>   z0DeformableTransformerDecoderLayer.with_pos_embed  r?   r7   tgtc           	          | j                  | j                  | j                  | j                  |                        }|| j	                  |      z   }| j                  |      S )z
        Perform forward pass through the Feed-Forward Network part of the layer.

        Args:
            tgt (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after FFN.
        )r   r   r   r   r   r   )r4   r  tgt2s      r6   forward_ffnz-DeformableTransformerDecoderLayer.forward_ffn  sN     ||DMM$((4<<3D*EFGDMM$''zz#r7   embedr   featsshapespadding_maskrF   	query_posc                    | j                  ||      x}}	| j                  |j                  dd      |	j                  dd      |j                  dd      |      d   j                  dd      }
|| j                  |
      z   }| j	                  |      }| j                  | j                  ||      |j                  d      |||      }
|| j                  |
      z   }| j                  |      }| j                  |      S )aH  
        Perform the forward pass through the entire decoder layer.

        Args:
            embed (torch.Tensor): Input embeddings.
            refer_bbox (torch.Tensor): Reference bounding boxes.
            feats (torch.Tensor): Feature maps.
            shapes (list): Feature shapes.
            padding_mask (torch.Tensor, optional): Padding mask.
            attn_mask (torch.Tensor, optional): Attention mask.
            query_pos (torch.Tensor, optional): Query position embeddings.

        Returns:
            (torch.Tensor): Output tensor after decoder layer.
        r   r   )rF   rd   )
r>   r   	transposer2   r/   r   	unsqueezer3   r0   r  )r4   r  r   r  r  r	  rF   r
  rH   rI   r  s              r6   rO   z)DeformableTransformerDecoderLayer.forward  s    4 ##E955AnnQ[[A.Aq0A5??STVWCXdmnn

)Aq/ 	 c**

5! ooy1:3G3G3JESY[g
 c**

5! &&r7   rQ   )rR   rS   rT   rU   r(   r   rW   rX   rY   r%   r[   r\   r]   r   r>   r  r   rO   r^   r_   s   @r6   r   r   G  sS   6  (+(+ (+ 	(+
 (+ YY(+ (+ (+T 7u|| 7(5<<2H 7U\\ 7 7u||  ( 04,0,0)'||)' LL)' ||	)'
 )' u||,)' ELL))' ELL))' 
)'r7   r   c                   (    e Zd ZdZddedej                  dedef fdZ	 	 ddej                  dej                  d	ej                  d
e
dej                  dej                  dej                  deej                     deej                     fdZ xZS )r   av  
    Deformable Transformer Decoder based on PaddleDetection implementation.

    This class implements a complete deformable transformer decoder with multiple decoder layers and prediction
    heads for bounding box regression and classification.

    Attributes:
        layers (nn.ModuleList): List of decoder layers.
        num_layers (int): Number of decoder layers.
        hidden_dim (int): Hidden dimension.
        eval_idx (int): Index of the layer to use during evaluation.

    References:
        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    r   decoder_layerr   eval_idxc                     t         |           t        ||      | _        || _        || _        |dk\  r|| _        y||z   | _        y)aU  
        Initialize the DeformableTransformerDecoder with the given parameters.

        Args:
            hidden_dim (int): Hidden dimension.
            decoder_layer (nn.Module): Decoder layer module.
            num_layers (int): Number of decoder layers.
            eval_idx (int): Index of the layer to use during evaluation.
        r   N)r$   r%   r	   r   r   r   r  )r4   r   r  r   r  r5   s        r6   r%   z%DeformableTransformerDecoder.__init__  sD     	!-<$$$,MzH7Lr7   r  r   r  r  	bbox_head
score_headpos_mlprF   r	  c
                    |}
g }g }d}|j                         }t        | j                        D ]  \  }} ||
||||	| ||            }
 ||   |
      }t        j                   |t	        |      z         }| j
                  rb|j                   ||   |
             |dk(  r|j                  |       nm|j                  t        j                   |t	        |      z                n<|| j                  k(  r-|j                   ||   |
             |j                  |        n#|}| j
                  r|j                         n|} t        j                  |      t        j                  |      fS )a  
        Perform the forward pass through the entire decoder.

        Args:
            embed (torch.Tensor): Decoder embeddings.
            refer_bbox (torch.Tensor): Reference bounding boxes.
            feats (torch.Tensor): Image features.
            shapes (list): Feature shapes.
            bbox_head (nn.Module): Bounding box prediction head.
            score_head (nn.Module): Score prediction head.
            pos_mlp (nn.Module): Position MLP.
            attn_mask (torch.Tensor, optional): Attention mask.
            padding_mask (torch.Tensor, optional): Padding mask.

        Returns:
            dec_bboxes (torch.Tensor): Decoded bounding boxes.
            dec_cls (torch.Tensor): Decoded classification scores.
        Nr   )
r   r   r   r\   r
   trainingappendr  detachr   )r4   r  r   r  r  r  r  r  rF   r	  r   
dec_bboxesdec_clslast_refined_bboxr   r   bboxrefined_bboxs                     r6   rO   z$DeformableTransformerDecoder.forward  sN   < 
 '')
!$++. 	RHAu6:uflIW^_iWjkF9Q<'D ==
0K)KLL}}}z!}V456%%l3%%emmD?K\;]4]&^_dmm#}z!}V45!!,/ ,26--,,.\J%	R( {{:&G(<<<r7   )rg   )NN)rR   rS   rT   rU   rW   r(   rY   r%   r\   r]   r   r   rO   r^   r_   s   @r6   r   r     s     M3 Mryy Mc M]` M2 -1/37=||7= LL7= ||	7=
 7= 997= II7= 7= ELL)7= u||,7=r7   r   ) rU   r   typingr   r   r\   torch.nnr(   torch.nn.functional
functionalr   torch.nn.initr   r   r   r   utilsr	   r
   r   __all__rY   r   r   r   r   r   r   r   r   r   r   r<   r7   r6   <module>r%     s      !     4  T TMKbii MK`Il" IlX)ryy )B.Vryy .Vb1ryy 1</E")) /Ed+I")) +I\{(299 {(|@'		 @'FX=299 X=r7   