
    hh                    R   d Z ddlmZmZmZ ddlZddlmZ ddlmc m	Z
 ddlmZ ddlmZmZmZmZmZmZ ddlmZ dZ G d	 d
ej.                        Z G d dej.                        Z G d dej.                        Z G d dej.                        Z G d dej.                        Z G d dej.                        Z G d dej.                        Z G d dej.                        Z G d dej.                        Z  G d dej.                        Z! G d de!      Z" G d d ej.                        Z# G d! d"e!      Z$ G d# d$e!      Z% G d% d&ej.                        Z& G d' d(ej.                        Z' G d) d*ej.                        Z( G d+ d,ej.                        Z) G d- d.ej.                        Z* G d/ d0ej.                        Z+ G d1 d2ej.                        Z, G d3 d4ej.                        Z- G d5 d6ej.                        Z. G d7 d8ej.                        Z/ G d9 d:e'      Z0 G d; d<e!      Z1 G d= d>ej.                        Z2 G d? d@e2      Z3 G dA dBej.                        Z4 G dC dDej.                        Z5 G dE dFej.                        Z6 G dG dHej.                        Z7 G dI dJej.                        Z8 G dK dLej.                        Z9 G dM dNe       Z: G dO dPe!      Z; G dQ dRej                  j.                        Z< G dS dTej.                        Z= G dU dVe       Z> G dW dXej.                        Z? G dY dZej.                        Z@ G d[ d\ej.                        ZA G d] d^ej.                        ZB G d_ d`e       ZC G da dbej.                        ZD G dc ddej.                        ZE G de dfej.                        ZF G dg dhej.                        ZG G di djej.                        ZH G dk dlej.                        ZI G dm dnej.                        ZJ G do dpej.                        ZKy)qzBlock modules.    )ListOptionalTupleN)fuse_conv_and_bn   )ConvDWConv	GhostConv	LightConvRepConvautopad)TransformerBlock)'DFLHGBlockHGStemSPPSPPFC1C2C3C2fC2fAttnImagePoolingAttnContrastiveHeadBNContrastiveHeadC3xC3TRC3GhostGhostBottleneck
BottleneckBottleneckCSPProtoRepC3ResNetLayerRepNCSPELAN4ELAN1ADownAConvSPPELANCBFuseCBLinearC3k2C2fPSAC2PSARepVGGDWCIBC2fCIB	AttentionPSASCDownTorchVisionc                   b     e Zd ZdZddef fdZdej                  dej                  fdZ xZ	S )r   z
    Integral module of Distribution Focal Loss (DFL).

    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
    c1c                 d   t         |           t        j                  |ddd      j	                  d      | _        t        j                  |t        j                        }t        j                  |j                  d|dd            | j
                  j                  j                  dd || _        y)z
        Initialize a convolutional layer with a given number of input channels.

        Args:
            c1 (int): Number of input channels.
        r   Fbias)dtypeN)super__init__nnConv2drequires_grad_convtorcharangefloat	Parameterviewweightdatar7   )selfr7   x	__class__s      Z/var/www/html/dev/engine/venv/lib/python3.12/site-packages/ultralytics/nn/modules/block.pyr=   zDFL.__init__A   s~     	IIb!QU3BB5I	LL5;;/#%<<q"a0C#D		a     rJ   returnc                     |j                   \  }}}| j                  |j                  |d| j                  |      j	                  dd      j                  d            j                  |d|      S )zCApply the DFL module to input tensor and return transformed output.      r   )shaperA   rF   r7   	transposesoftmax)rI   rJ   b_as        rL   forwardzDFL.forwardN   s]    ''1ayy1dggq1;;AqAII!LMRRSTVWYZ[[rM   )   
__name__
__module____qualname____doc__intr=   rB   TensorrX   __classcell__rK   s   @rL   r   r   :   s2    3 \ \%,, \rM   r   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ xZ	S )
r"   zBUltralytics YOLO models mask Proto module for segmentation models.r7   c_c2c                     t         |           t        ||d      | _        t	        j
                  ||dddd      | _        t        ||d      | _        t        ||      | _        y)a  
        Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.

        Args:
            c1 (int): Input channels.
            c_ (int): Intermediate channels.
            c2 (int): Output channels (number of protos).
           )krQ   r   Tr9   N)	r<   r=   r   cv1r>   ConvTranspose2dupsamplecv2cv3)rI   r7   rd   re   rK   s       rL   r=   zProto.__init__X   sY     	B!$**2r1aFB!$B<rM   rJ   rN   c           	      ~    | j                  | j                  | j                  | j                  |                        S )zEPerform a forward pass through layers using an upsampled input image.)rm   rl   rk   ri   rI   rJ   s     rL   rX   zProto.forwardg   s+    xxtxx{!;<==rM   )       rZ   rb   s   @rL   r"   r"   U   s;    L 3  C  3  > >%,, >rM   r"   c                   h     e Zd ZdZdededef fdZdej                  dej                  fdZ xZ	S )	r   z
    StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    r7   cmre   c           	         t         |           t        ||ddt        j                               | _        t        ||dz  dddt        j                               | _        t        |dz  |dddt        j                               | _        t        |dz  |ddt        j                               | _        t        ||ddt        j                               | _	        t        j                  dddd      | _        y)	z
        Initialize the StemBlock of PPHGNetV2.

        Args:
            c1 (int): Input channels.
            cm (int): Middle channels.
            c2 (int): Output channels.
        rg   rQ   actr   r   T)kernel_sizestridepadding	ceil_modeN)r<   r=   r   r>   ReLUstem1stem2astem2bstem3stem4	MaxPool2dpool)rI   r7   rs   re   rK   s       rL   r=   zHGStem.__init__s   s     	"b!QBGGI6
2rQw1aRWWY?27B1aRWWY?"q&"a	:
"b!QBGGI6
LLQq!tT	rM   rJ   rN   c                 d   | j                  |      }t        j                  |g d      }| j                  |      }t        j                  |g d      }| j	                  |      }| j                  |      }t        j                  ||gd      }| j                  |      }| j                  |      }|S )+Forward pass of a PPHGNetV2 backbone layer.)r   r   r   r   r   dim)
r|   Fpadr}   r~   r   rB   catr   r   )rI   rJ   x2x1s       rL   rX   zHGStem.forward   s    JJqMEE!\"[[^UU2|$[[_YYq\IIr2hA&JJqMJJqMrM   rZ   rb   s   @rL   r   r   l   sA    U3 UC US U" %,, rM   r   c                        e Zd ZdZdddd ej
                         fdedededed	ed
ededej                  f fdZ	de
j                  de
j                  fdZ xZS )r   z
    HG_Block of PPHGNetV2 with 2 convolutions and LightConv.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    rg      Fr7   rs   re   rh   n	lightconvshortcutrv   c	                 0  	 t         
|           |rt        nt        	t	        j
                  	fdt        |      D              | _        t        |z  z   |dz  dd      | _        t        |dz  |dd      | _	        |xr |k(  | _
        y)a  
        Initialize HGBlock with specified parameters.

        Args:
            c1 (int): Input channels.
            cm (int): Middle channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            n (int): Number of LightConv or Conv blocks.
            lightconv (bool): Whether to use LightConv.
            shortcut (bool): Whether to use shortcut connection.
            act (nn.Module): Activation function.
        c              3   D   K   | ]  } |d k(  rn        yw)r   rh   rv   N ).0irv   blockr7   rs   rh   s     rL   	<genexpr>z#HGBlock.__init__.<locals>.<genexpr>   s(     _QRu16Rr2LL_    rQ   r   ru   N)r<   r=   r   r   r>   
ModuleListrangemscecadd)rI   r7   rs   re   rh   r   r   r   rv   r   rK   s    `` `   `@rL   r=   zHGBlock.__init__   s~    0 	&	D_V[\]V^__rAF{B!GQs;rQwAqc2(brM   rJ   rN   c                     |gj                  fd| j                  D               | j                  | j                  t	        j
                  d                  | j                  r|z   S S )r   c              3   4   K   | ]  } |d            ywNr   r   r   ys     rL   r   z"HGBlock.forward.<locals>.<genexpr>        *a1R5*   r   )extendr   r   r   rB   r   r   rI   rJ   r   s     @rL   rX   zHGBlock.forward   sV    C	*466**GGDGGEIIaO,-q1u'a'rM   )r[   r\   r]   r^   r>   r{   r_   boolModuler=   rB   r`   rX   ra   rb   s   @rL   r   r      s      )) ) 	)
 ) ) ) ) YY)>( (%,, (rM   r   c            	       t     e Zd ZdZd
dededeedf   f fdZdej                  dej                  fd	Z	 xZ
S )r   zDSpatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.r7   re   rh   .c                 "   t         |           |dz  }t        ||dd      | _        t        |t	        |      dz   z  |dd      | _        t        j                  |D cg c]  }t        j                  |d|dz         c}      | _	        yc c}w )z
        Initialize the SPP layer with input/output channels and pooling kernel sizes.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (tuple): Kernel sizes for max pooling.
        rQ   r   rw   rx   ry   N)
r<   r=   r   ri   lenrl   r>   r   r   r   )rI   r7   re   rh   rd   rJ   rK   s         rL   r=   zSPP.__init__   s|     	1WB1%c!fqj)2q!4_`aZ[1aSTf Uabas   "BrJ   rN   c                     | j                  |      }| j                  t        j                  |g| j                  D cg c]
  } ||       c}z   d            S c c}w )zBForward pass of the SPP layer, performing spatial pyramid pooling.r   )ri   rl   rB   r   r   )rI   rJ   r   s      rL   rX   zSPP.forward   sF    HHQKxx		1#tvv(>!1(>">BCC(>s   A))   	      )r[   r\   r]   r^   r_   r   r=   rB   r`   rX   ra   rb   s   @rL   r   r      sJ    Nc3 cC cE#s(O cD D%,, DrM   r   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ xZ	S )
r   zGSpatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.r7   re   rh   c                     t         |           |dz  }t        ||dd      | _        t        |dz  |dd      | _        t        j                  |d|dz        | _        y)a'  
        Initialize the SPPF layer with given input/output channels and kernel size.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.

        Notes:
            This module is equivalent to SPP(k=(5, 9, 13)).
        rQ   r   rP   r   N)r<   r=   r   ri   rl   r>   r   r   )rI   r7   re   rh   rd   rK   s        rL   r=   zSPPF.__init__   sY     	1WB1%QAq)!AqAvFrM   rJ   rN   c                       j                  |      gj                   fdt        d      D                j                  t	        j
                  d            S )zRApply sequential pooling operations to input and return concatenated feature maps.c              3   F   K   | ]  }j                  d            ywr   r   )r   rV   rI   r   s     rL   r   zSPPF.forward.<locals>.<genexpr>   s     11"1s   !rg   r   )ri   r   r   rl   rB   r   r   s   ` @rL   rX   zSPPF.forward   sA    XXa[M	1a11xx		!Q((rM   r   rZ   rb   s   @rL   r   r      s?    QG3 GC GC G$) )%,, )rM   r   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ xZ	S )
r   z"CSP Bottleneck with 1 convolution.r7   re   r   c                     t         |           t        |dd      | _        t	        j
                  fdt        |      D         | _        y)z
        Initialize the CSP Bottleneck with 1 convolution.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of convolutions.
        r   c              3   8   K   | ]  }t        d         yw)rg   N)r   )r   rV   re   s     rL   r   zC1.__init__.<locals>.<genexpr>  s      CQb"a Cs   N)r<   r=   r   ri   r>   
Sequentialr   r   )rI   r7   re   r   rK   s     ` rL   r=   zC1.__init__   s<     	B1% C%( CDrM   rJ   rN   c                 L    | j                  |      }| j                  |      |z   S )z:Apply convolution and residual connection to input tensor.)ri   r   r   s      rL   rX   z
C1.forward  s!    HHQKvvay1}rM   r   rZ   rb   s   @rL   r   r      s?    ,E3 EC EC E %,, rM   r   c                   v     e Zd ZdZddedededededef fdZd	ej                  d
ej                  fdZ
 xZS )r   z#CSP Bottleneck with 2 convolutions.r7   re   r   r   gec                 "    t                    t        ||z         _        t	        |d j                  z  dd       _        t	        d j                  z  |d       _        t        j                   fdt        |      D          _
        y)ah  
        Initialize a CSP Bottleneck with 2 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rQ   r   c           	   3   h   K   | ])  }t        j                  j                  d d       + yw)rg   rg   r         ?rh   r   Nr    cr   rV   r   rI   r   s     rL   r   zC2.__init__.<locals>.<genexpr>  s.      vhiDFFDFFHaK[_b!c!c v   /2Nr<   r=   r_   r   r   ri   rl   r>   r   r   r   rI   r7   re   r   r   r   r   rK   s   `   `` rL   r=   zC2.__init__  sn     	R!VAJ1-DFF
B* vmrstmu vwrM   rJ   rN   c                     | j                  |      j                  dd      \  }}| j                  t        j                  | j                  |      |fd            S )z<Forward pass through the CSP bottleneck with 2 convolutions.rQ   r   )ri   chunkrl   rB   r   r   rI   rJ   rW   rU   s       rL   rX   z
C2.forward   sF    xx{  A&1xx		466!9a.!455rM   r   Tr         ?r[   r\   r]   r^   r_   r   rD   r=   rB   r`   rX   ra   rb   s   @rL   r   r   
  sX    -x3 xC xC xt xs x[` x&6 6%,, 6rM   r   c                        e Zd ZdZddedededededef fdZd	ej                  d
ej                  fdZ
d	ej                  d
ej                  fdZ xZS )r   <Faster Implementation of CSP Bottleneck with 2 convolutions.r7   re   r   r   r   r   c                 .    t                    t        ||z         _        t	        |d j                  z  dd       _        t	        d|z    j                  z  |d       _        t        j                   fdt        |      D               _
        y)ah  
        Initialize a CSP bottleneck with 2 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rQ   r   c           	   3   h   K   | ])  }t        j                  j                  d d       + ywr   r   r   s     rL   r   zC2f.__init__.<locals>.<genexpr>9  .     tfgz$&&$&&(AIY]`aatr   N)r<   r=   r_   r   r   ri   rl   r>   r   r   r   r   s   `   `` rL   r=   zC2f.__init__)  ss     	R!VAJ1-Q$&&("a0tkpqrksttrM   rJ   rN   c                     t        | j                  |      j                  dd            j                  fd| j                  D               | j                  t        j                  d            S )zForward pass through C2f layer.rQ   r   c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   zC2f.forward.<locals>.<genexpr>>  r   r   )listri   r   r   r   rl   rB   r   r   s     @rL   rX   zC2f.forward;  sQ    !""1a()	*466**xx		!Q((rM   c                    | j                  |      j                  | j                  | j                  fd      d   d   gj                  fd| j                  D               | j                  t        j                  d            S ).Forward pass using split() instead of chunk().r   r   c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   z$C2f.forward_split.<locals>.<genexpr>E  r   r   )ri   splitr   r   r   rl   rB   r   r   s     @rL   forward_splitzC2f.forward_splitA  sj    HHQKtvvtvv.2qT1Q4L	*466**xx		!Q((rM   r   Fr   r   r[   r\   r]   r^   r_   r   rD   r=   rB   r`   rX   r   ra   rb   s   @rL   r   r   &  st    Fu3 uC uC ut uPS u\a u$) )%,, ))u|| ) )rM   r   c                   v     e Zd ZdZddedededededef fdZd	ej                  d
ej                  fdZ
 xZS )r   z#CSP Bottleneck with 3 convolutions.r7   re   r   r   r   r   c                    t         |           t        ||z        t        |dd      | _        t        |dd      | _        t        dz  |d      | _        t        j                  fdt        |      D         | _
        y)aj  
        Initialize the CSP Bottleneck with 3 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   rQ   c           	   3   @   K   | ]  }t        d d        yw)))r   r   r   r   r   Nr    r   rV   rd   r   r   s     rL   r   zC3.__init__.<locals>.<genexpr>]  s&      n`aBHaCSWZ![![ n   N)r<   r=   r_   r   ri   rl   rm   r>   r   r   r   	rI   r7   re   r   r   r   r   rd   rK   s	       `` @rL   r=   zC3.__init__L  sr     	a[B1%B1%BA& nejklem norM   rJ   rN   c           	          | j                  t        j                  | j                  | j	                  |            | j                  |      fd            S )z<Forward pass through the CSP bottleneck with 3 convolutions.r   )rm   rB   r   r   ri   rl   ro   s     rL   rX   z
C3.forward_  s:    xx		466$((1+#6"DaHIIrM   r   r   rb   s   @rL   r   r   I  s[    -p3 pC pC pt ps p[` p&J J%,, JrM   r   c                   >     e Zd ZdZd	dedededededef fdZ xZS )
r   z"C3 module with cross-convolutions.r7   re   r   r   r   r   c                      t            ||||       t        ||z         _        t	        j
                   fdt        |      D          _        y)ae  
        Initialize C3 module with cross-convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c           	   3   h   K   | ])  }t        j                  j                  d d       + yw)))r   rg   rg   r   r   r   N)r    rd   r   s     rL   r   zC3x.__init__.<locals>.<genexpr>u  s.      vhiDGGTWWhM]ab!c!c vr   N)r<   r=   r_   rd   r>   r   r   r   r   s   `   `` rL   r=   zC3x.__init__g  sH     	RHa3b1f+ vmrstmu vwrM   r   	r[   r\   r]   r^   r_   r   rD   r=   ra   rb   s   @rL   r   r   d  sC    ,x3 xC xC xt xs x[` x xrM   r   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z	 xZ
S )r#   zRep C3.r7   re   r   r   c           	      h   t         |           t        ||z        }t        ||dd      | _        t        ||dd      | _        t        j                  t        |      D cg c]  }t        ||       c} | _
        ||k7  rt        ||dd      | _        yt        j                         | _        yc c}w )z
        Initialize CSP Bottleneck with a single convolution.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of RepConv blocks.
            e (float): Expansion ratio.
        r   N)r<   r=   r_   r   ri   rl   r>   r   r   r   r   Identityrm   )rI   r7   re   r   r   rd   rV   rK   s          rL   r=   zRepC3.__init__{  s     	a[B1%B1%%( CQR CD)+r4B1%r{{} !Ds    B/rJ   rN   c                     | j                  | j                  | j                  |            | j                  |      z         S )zForward pass of RepC3 module.)rm   r   ri   rl   ro   s     rL   rX   zRepC3.forward  s/    xxtxx{+dhhqk9::rM   )rg   r   r[   r\   r]   r^   r_   rD   r=   rB   r`   rX   ra   rb   s   @rL   r#   r#   x  sG    E3 EC EC E E"; ;%,, ;rM   r#   c                   >     e Zd ZdZd	dedededededef fdZ xZS )
r   z"C3 module with TransformerBlock().r7   re   r   r   r   r   c                 p    t         |   ||||||       t        ||z        }t        ||d|      | _        y)ad  
        Initialize C3 module with TransformerBlock.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Transformer blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rP   N)r<   r=   r_   r   r   r   s	           rL   r=   zC3TR.__init__  s;     	RHa3a[!"b!Q/rM   r   r   rb   s   @rL   r   r     s;    ,03 0C 0C 0t 0s 0[` 0 0rM   r   c                   >     e Zd ZdZd	dedededededef fdZ xZS )
r   z!C3 module with GhostBottleneck().r7   re   r   r   r   r   c                     t         |   ||||||       t        ||z        t        j                  fdt        |      D         | _        y)ah  
        Initialize C3 module with GhostBottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Ghost bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3   6   K   | ]  }t                y w)N)r   )r   rV   rd   s     rL   r   z#C3Ghost.__init__.<locals>.<genexpr>  s      KQR!8 Ks   Nr<   r=   r_   r>   r   r   r   r   s	          @rL   r=   zC3Ghost.__init__  sC     	RHa3a[ K%( KLrM   r   r   rb   s   @rL   r   r     sC    +M3 MC MC Mt Ms M[` M MrM   r   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZ	S )r   zGGhost Bottleneck https://github.com/huawei-noah/Efficient-AI-Backbones.r7   re   rh   sc                    t         |           |dz  }t        j                  t	        ||dd      |dk(  rt        ||||d      nt        j                         t	        ||ddd            | _        |dk(  r8t        j                  t        ||||d      t        ||ddd            | _	        yt        j                         | _	        y)z
        Initialize Ghost Bottleneck module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
        rQ   r   Fru   N)
r<   r=   r>   r   r
   r	   r   rA   r   r   )rI   r7   re   rh   r   rd   rK   s         rL   r=   zGhostBottleneck.__init__  s     	1WMMb"a#/0AvF2r1aU+2;;=b"a.
	 ^_bc]cBMM&RA594B1RW;XY 	ikititiv 	rM   rJ   rN   c                 H    | j                  |      | j                  |      z   S )z8Apply skip connection and concatenation to input tensor.)rA   r   ro   s     rL   rX   zGhostBottleneck.forward  s    yy|dmmA...rM   r   rZ   rb   s   @rL   r   r     sB    Q
3 
C 
C 
 
*/ /%,, /rM   r   c                        e Zd ZdZ	 ddededededeeef   def fdZd	e	j                  d
e	j                  fdZ xZS )r    zStandard bottleneck.r7   re   r   r   rh   r   c                     t         |           t        ||z        }t        |||d   d      | _        t        |||d   d|      | _        |xr ||k(  | _        y)ac  
        Initialize a standard bottleneck module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            g (int): Groups for convolutions.
            k (tuple): Kernel sizes for convolutions.
            e (float): Expansion ratio.
        r   r   r   N)r<   r=   r_   r   ri   rl   r   	rI   r7   re   r   r   rh   r   rd   rK   s	           rL   r=   zBottleneck.__init__  s[     	a[B!a(B!a1-(brM   rJ   rN   c                     | j                   r#|| j                  | j                  |            z   S | j                  | j                  |            S )z3Apply bottleneck with optional shortcut connection.)r   rl   ri   ro   s     rL   rX   zBottleneck.forward  s:    ,0HHq488DHHQK((O$((488A;:OOrM   Tr   r   r   )r[   r\   r]   r^   r_   r   r   rD   r=   rB   r`   rX   ra   rb   s   @rL   r    r      si     lo)))*.):=)FKCQTHo)ch)(P P%,, PrM   r    c                   v     e Zd ZdZddedededededef fdZd	ej                  d
ej                  fdZ
 xZS )r!   zGCSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.r7   re   r   r   r   r   c                    t         |           t        ||z        t        |dd      | _        t        j                  |ddd      | _        t        j                  ddd      | _        t        dz  |dd      | _	        t        j                  dz        | _        t        j                         | _        t        j                  fdt        |      D         | _        y)aR  
        Initialize CSP Bottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   Fr9   rQ   c              3   >   K   | ]  }t        d         ywr   r   Nr   r   s     rL   r   z)BottleneckCSP.__init__.<locals>.<genexpr>	  s!      ZABHa3!G!G Z   N)r<   r=   r_   r   ri   r>   r?   rl   rm   cv4BatchNorm2dbnSiLUrv   r   r   r   r   s	       `` @rL   r=   zBottleneckCSP.__init__  s     	a[B1%99RQ699RQ6BAq)..R(779 ZQVWXQY Z[rM   rJ   rN   c           
         | j                  | j                  | j                  |                  }| j                  |      }| j	                  | j                  | j                  t        j                  ||fd                        S )z)Apply CSP bottleneck with 3 convolutions.r   )	rm   r   ri   rl   r  rv   r  rB   r   )rI   rJ   y1y2s       rL   rX   zBottleneckCSP.forward  s^    XXdffTXXa[)*XXa[xxB8Q)?!@ABBrM   r   r   rb   s   @rL   r!   r!     s[    Q\3 \C \C \t \s \[` \,C C%,, CrM   r!   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZ	S )ResNetBlockz.ResNet block with standard convolution layers.r7   re   r   r   c           	      B   t         |           ||z  }t        ||ddd      | _        t        ||d|dd      | _        t        ||dd      | _        |dk7  s||k7  r)t        j                  t        ||d|d            | _	        yt        j                         | _	        y)	z
        Initialize ResNet block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            s (int): Stride.
            e (int): Expansion ratio.
        r   Trh   r   rv   rg   rh   r   prv   Fr   N)
r<   r=   r   ri   rl   rm   r>   r   r   r   )rI   r7   re   r   r   c3rK   s         rL   r=   zResNetBlock.__init__  s     	VB!qd3B!qA48B!/LMQRFVX\^V^d2rQ!&GHdfdododqrM   rJ   rN   c           	          t        j                  | j                  | j                  | j	                  |                  | j                  |      z         S )z&Forward pass through the ResNet block.)r   relurm   rl   ri   r   ro   s     rL   rX   zResNetBlock.forward&  s9    vvdhhtxx45a8HHIIrM   )r   rP   rZ   rb   s   @rL   r  r    sJ    8r3 rC rC r r"J J%,, JrM   r  c                   v     e Zd ZdZddedededededef fdZd	ej                  d
ej                  fdZ	 xZ
S )r$   z)ResNet layer with multiple ResNet blocks.r7   re   r   is_firstr   r   c                    t         	|           || _        | j                  rAt        j                  t        ||dddd      t        j                  ddd            | _        y	t        ||||      g}|j                  t        |dz
        D cg c]  }t        ||z  |d|       c}       t        j                  | | _        y	c c}w )
a5  
        Initialize ResNet layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            s (int): Stride.
            is_first (bool): Whether this is the first layer.
            n (int): Number of ResNet blocks.
            e (int): Expansion ratio.
           rQ   rg   Tr  r   r   r
  N)r<   r=   r  r>   r   r   r   layerr  r   r   )
rI   r7   re   r   r  r   r   blocksrV   rK   s
            rL   r=   zResNetLayer.__init__.  s     	 ==RqA5r||PQZ[ef7gDJ ""b!q12FMME!a%LQq;q2vr1:QR/DJ Rs   CrJ   rN   c                 $    | j                  |      S )z&Forward pass through the ResNet layer.)r   ro   s     rL   rX   zResNetLayer.forwardF  s    zz!}rM   )r   Fr   rP   r[   r\   r]   r^   r_   r   r=   rB   r`   rX   ra   rb   s   @rL   r$   r$   +  sR    303 0C 0C 0t 0PS 0\_ 00 %,, rM   r$   c                        e Zd ZdZddedededededef fdZd	ej                  d
ej                  dej                  fdZ	 xZ
S )MaxSigmoidAttnBlockzMax Sigmoid attention block.r7   re   nhr   gcscalec                    t         |           || _        ||z  | _        ||k7  rt	        ||dd      nd| _        t        j                  ||      | _        t        j                  t        j                  |            | _        t	        ||ddd      | _        |r1t        j                  t        j                  d|dd            | _        yd| _        y)aH  
        Initialize MaxSigmoidAttnBlock.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            nh (int): Number of heads.
            ec (int): Embedding channels.
            gc (int): Guide channels.
            scale (bool): Whether to use learnable scale parameter.
        r   Fr   Nrg   r  r   )r<   r=   r&  hcr   r   r>   LinearglrE   rB   zerosr:   	proj_convonesr(  )rI   r7   re   r&  r   r'  r(  rK   s          rL   r=   zMaxSigmoidAttnBlock.__init__N  s     	(24($r2.))B#LLR1	b"QE:>CR\\%**QAq"9:

rM   rJ   guiderN   c                    |j                   \  }}}}| j                  |      }|j                  ||j                   d   | j                  | j                        }| j
                  | j                  |      n|}|j                  || j                  | j                  ||      }t        j                  d||      }|j                  d      d   }|| j                  dz  z  }|| j                  dddddf   z   }|j                         | j                  z  }| j                  |      }|j                  || j                  d||      }||j                  d      z  }|j                  |d||      S )	z
        Forward pass of MaxSigmoidAttnBlock.

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor.

        Returns:
            (torch.Tensor): Output tensor after attention.
        r   Nzbmchw,bnmc->bmhwnr   r   r   r   rQ   )rR   r,  rF   r&  r*  r   rB   einsummaxr:   sigmoidr(  r.  	unsqueeze)	rI   rJ   r0  bsrV   hwembedaws	            rL   rX   zMaxSigmoidAttnBlock.forwardc  s4    ggAq!

2u{{1~tww@"gg1
q

2twwA6\\-ue<VVV^A477C< $))D!T4/00ZZ\DJJ&NN1FF2twwAq)Qvvb"a##rM   )r         Fr#  rb   s   @rL   r%  r%  K  sc    &M3 MC MS M# M M[_ M*$ $ell $u|| $rM   r%  c                        e Zd ZdZ	 	 	 	 	 	 	 ddededededededed	ed
ef fdZdej                  dej                  dej                  fdZ
dej                  dej                  dej                  fdZ xZS )r   z*C2f module with an additional attn module.r7   re   r   r   r&  r'  r   r   r   c
                     t         
           t        ||	z         _        t	        |d j                  z  dd       _        t	        d|z    j                  z  |d       _        t        j                   fdt        |      D               _
        t         j                   j                  |||       _        y)a  
        Initialize C2f module with attention mechanism.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            ec (int): Embedding channels for attention.
            nh (int): Number of heads for attention.
            gc (int): Guide channels for attention.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rQ   r   rg   c           	   3   h   K   | ])  }t        j                  j                  d d       + ywr   r   r   s     rL   r   z#C2fAttn.__init__.<locals>.<genexpr>  r   r   )r'  r   r&  N)r<   r=   r_   r   r   ri   rl   r>   r   r   r   r%  attn)rI   r7   re   r   r   r&  r'  r   r   r   rK   s   `      `` rL   r=   zC2fAttn.__init__  s    4 	R!VAJ1-Q$&&("a0tkpqrkstt'2"L	rM   rJ   r0  rN   c                 2   t        | j                  |      j                  dd            j                  fd| j                  D               j                  | j                  d   |             | j                  t        j                  d            S )a  
        Forward pass through C2f layer with attention.

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor for attention.

        Returns:
            (torch.Tensor): Output tensor after processing.
        rQ   r   c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   z"C2fAttn.forward.<locals>.<genexpr>  r   r   r   )
r   ri   r   r   r   appendr@  rl   rB   r   rI   rJ   r0  r   s      @rL   rX   zC2fAttn.forward  sn     !""1a()	*466**	1R5%()xx		!Q((rM   c                 ^   t        | j                  |      j                  | j                  | j                  fd            j	                  fd| j
                  D               j                  | j                  d   |             | j                  t        j                  d            S )a  
        Forward pass using split() instead of chunk().

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor for attention.

        Returns:
            (torch.Tensor): Output tensor after processing.
        r   c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   z(C2fAttn.forward_split.<locals>.<genexpr>  r   r   r   )r   ri   r   r   r   r   rC  r@  rl   rB   r   rD  s      @rL   r   zC2fAttn.forward_split  s{     !""DFFDFF#3Q78	*466**	1R5%()xx		!Q((rM   )r   r;  r   r<  Fr   r   r   rb   s   @rL   r   r     s    4 MM M 	M
 M M M M M MB) )ell )u|| ) )u|| )ELL )U\\ )rM   r   c                        e Zd ZdZ	 ddedeedf   dedededef fd	Zd
ee	j                     de	j                  de	j                  fdZ xZS )r   zKImagePoolingAttn: Enhance the text embeddings with image-aware information.r   ch.ctr&  rh   r(  c           
         t         
|           t        |      }t        j                  t        j
                  |      t        j                  ||            | _        t        j                  t        j
                  |      t        j                  ||            | _        t        j                  t        j
                  |      t        j                  ||            | _	        t        j                  ||      | _
        |r+t        j                  t        j                  dg      d      nd| _        t        j                  |D cg c]  }t        j                   ||d       c}      | _        t        j                  t%        |      D 	cg c]  }	t        j&                  ||f       c}	      | _        || _        || _        || _        ||z  | _        || _        yc c}w c c}	w )a  
        Initialize ImagePoolingAttn module.

        Args:
            ec (int): Embedding channels.
            ch (tuple): Channel dimensions for feature maps.
            ct (int): Channel dimension for text embeddings.
            nh (int): Number of attention heads.
            k (int): Kernel size for pooling.
            scale (bool): Whether to use learnable scale parameter.
        g        Trequires_gradr   r   )rw   N)r<   r=   r   r>   r   	LayerNormr+  querykeyvalueprojrE   rB   tensorr(  r   r?   projectionsr   AdaptiveMaxPool2dim_poolsr   r&  nfr*  rh   )rI   r   rH  rI  r&  rh   r(  rV  in_channelsrV   rK   s             rL   r=   zImagePoolingAttn.__init__  sJ    	W]]2<<#3RYYr25FG
==b!1299R3DE]]2<<#3RYYr25FG
IIb"%	NSR\\%,,u"5TJY\
==gi)jXc"))KQR*S)jkUSUY&Wr';';QF'C&WX( *k&Ws   G
G rJ   textrN   c           
         |d   j                   d   }t        |      | j                  k(  sJ | j                  dz  }t	        || j
                  | j                        D cg c]%  \  }}} | ||            j                  |d|      ' c}}}}t        j                  |d      j                  dd      }| j                  |      }| j                  |      }| j                  |      }	|j                  |d| j                  | j                         }|j                  |d| j                  | j                         }|	j                  |d| j                  | j                         }	t        j"                  d||      }
|
| j                   dz  z  }
t%        j&                  |
d      }
t        j"                  d|
|	      }| j)                  |j                  |d| j*                              }|| j,                  z  |z   S c c}}}w )	z
        Forward pass of ImagePoolingAttn.

        Args:
            x (List[torch.Tensor]): List of input feature maps.
            text (torch.Tensor): Text embeddings.

        Returns:
            (torch.Tensor): Enhanced text embeddings.
        r   rQ   r   r   r   zbnmc,bkmc->bmnkr   zbmnk,bkmc->bnmc)rR   r   rV  rh   ziprS  rU  rF   rB   r   rS   rN  rO  rP  reshaper&  r*  r2  r   rT   rQ  r   r(  )rI   rJ   rX  r6  num_patchesrQ  r   qrh   vr:  s              rL   rX   zImagePoolingAttn.forward  s    qTZZ]1v   ffaiLOPQSWScSceiererLstt!T4T$q']B4tIIaR **1a0JJtHHQKJJqM IIb"dggtww/IIb"dggtww/IIb"dggtww/\\+Q2477C< YYrr"LL*B2IIaiiB014::~$$# us   !*G7)rp   r   r<     rg   F)r[   r\   r]   r^   r_   r   r   r=   r   rB   r`   rX   ra   rb   s   @rL   r   r     su    U ns!&sCx;>JMVYfj<%ell+ %5<< %ELL %rM   r   c                   r     e Zd ZdZ fdZdej                  dej                  dej                  fdZ xZS )r   zZImplements contrastive learning head for region-text similarity in vision-language models.c                     t         |           t        j                  t	        j
                  dg            | _        t        j                  t	        j                  g       t	        j
                  d      j                         z        | _	        y)zBInitialize ContrastiveHead with region-text similarity parameters.      $g$I$I,@N)
r<   r=   r>   rE   rB   rR  r:   r/  loglogit_scale)rI   rK   s    rL   r=   zContrastiveHead.__init__  sY    LLug!67	<<

2h9O9S9S9U(UVrM   rJ   r8  rN   c                     t        j                  |dd      }t        j                  |dd      }t        j                  d||      }|| j                  j                         z  | j                  z   S )z
        Forward function of contrastive learning.

        Args:
            x (torch.Tensor): Image features.
            w (torch.Tensor): Text features.

        Returns:
            (torch.Tensor): Similarity scores.
        r   rQ   r   r  r   bchw,bkc->bkhw)r   	normalizerB   r2  rd  expr:   rI   rJ   r8  s      rL   rX   zContrastiveHead.forward  s^     KKqA&KKrQ'LL)1a04##''))DII55rM   )	r[   r\   r]   r^   r=   rB   r`   rX   ra   rb   s   @rL   r   r   	  s2    dW6 6%,, 65<< 6rM   r   c                        e Zd ZdZdef fdZd Zdej                  dej                  dej                  fdZ	dej                  dej                  dej                  fd	Z
 xZS )
r   z
    Batch Norm Contrastive Head using batch norm instead of l2-normalization.

    Args:
        embed_dims (int): Embed dimensions of text and image features.
    
embed_dimsc                    t         |           t        j                  |      | _        t        j
                  t        j                  dg            | _        t        j
                  dt        j                  g       z        | _
        y)z
        Initialize BNContrastiveHead.

        Args:
            embed_dims (int): Embedding dimensions for features.
        rb  g      N)r<   r=   r>   r  normrE   rB   rR  r:   r/  rd  )rI   rl  rK   s     rL   r=   zBNContrastiveHead.__init__,  sY     	NN:.	LLug!67	<<uzz"~(=>rM   c                 2    | ` | `| `| j                  | _        y)zCFuse the batch normalization layer in the BNContrastiveHead module.N)rn  r:   rd  forward_fuserX   )rI   s    rL   fusezBNContrastiveHead.fuse:  s    II((rM   rJ   r8  rN   c                     |S )zPasses input out unchanged.r   rj  s      rL   rp  zBNContrastiveHead.forward_fuseA  s    rM   c                     | j                  |      }t        j                  |dd      }t        j                  d||      }|| j
                  j                         z  | j                  z   S )z
        Forward function of contrastive learning with batch normalization.

        Args:
            x (torch.Tensor): Image features.
            w (torch.Tensor): Text features.

        Returns:
            (torch.Tensor): Similarity scores.
        r   rQ   rf  rg  )rn  r   rh  rB   r2  rd  ri  r:   rj  s      rL   rX   zBNContrastiveHead.forwardE  sY     IIaLKKrQ'LL)1a04##''))DII55rM   )r[   r\   r]   r^   r_   r=   rq  rB   r`   rp  rX   ra   rb   s   @rL   r   r   $  se    ?3 ?)ell u||  6 6%,, 65<< 6rM   r   c                   J     e Zd ZdZ	 d	dededededeeef   def fdZ xZ	S )
RepBottleneckzRep bottleneck.r7   re   r   r   rh   r   c                 v    t         |   ||||||       t        ||z        }t        |||d   d      | _        y)aT  
        Initialize RepBottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            g (int): Groups for convolutions.
            k (tuple): Kernel sizes for convolutions.
            e (float): Expansion ratio.
        r   r   N)r<   r=   r_   r   ri   r  s	           rL   r=   zRepBottleneck.__init__Z  s?     	R1a3a[2r1Q4+rM   r  )
r[   r\   r]   r^   r_   r   r   rD   r=   ra   rb   s   @rL   ru  ru  W  sP     lo,,,*.,:=,FKCQTHo,ch, ,rM   ru  c                   >     e Zd ZdZd	dedededededef fdZ xZS )
RepCSPzXRepeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction.r7   re   r   r   r   r   c                     t         |   ||||       t        ||z        t        j                  fdt        |      D         | _        y)aS  
        Initialize RepCSP layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of RepBottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3   >   K   | ]  }t        d         ywr	  )ru  r   s     rL   r   z"RepCSP.__init__.<locals>.<genexpr>~  s!      ]qr2xc!J!J ]r  Nr   r   s	       `` @rL   r=   zRepCSP.__init__p  sF     	RHa3a[ ]TYZ[T\ ]^rM   r   r   rb   s   @rL   rx  rx  m  sC    b_3 _C _C _t _s _[` _ _rM   rx  c                        e Zd ZdZddededededef
 fdZdej                  d	ej                  fd
Zdej                  d	ej                  fdZ	 xZ
S )r%   z	CSP-ELAN.r7   re   r  c4r   c           	      \   t         |           |dz  | _        t        ||dd      | _        t        j                  t        |dz  ||      t        ||dd            | _        t        j                  t        |||      t        ||dd            | _	        t        |d|z  z   |dd      | _
        y)a  
        Initialize CSP-ELAN layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            c4 (int): Intermediate channels for RepCSP.
            n (int): Number of RepCSP blocks.
        rQ   r   rg   N)r<   r=   r   r   ri   r>   r   rx  rl   rm   r  )rI   r7   re   r  r|  r   rK   s         rL   r=   zRepNCSPELAN4.__init__  s     	qB1%==aQ!7b"a9KL==B!2DRA4FGa"fr1a0rM   rJ   rN   c                    t        | j                  |      j                  dd            j                  fd| j                  | j
                  fD               | j                  t        j                  d            S )z(Forward pass through RepNCSPELAN4 layer.rQ   r   c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   z'RepNCSPELAN4.forward.<locals>.<genexpr>  s     :!AbE(:r   )	r   ri   r   r   rl   rm   r  rB   r   r   s     @rL   rX   zRepNCSPELAN4.forward  sZ    !""1a()	:dhh%9::xx		!Q((rM   c                 .   t        | j                  |      j                  | j                  | j                  fd            j	                  fd| j
                  | j                  fD               | j                  t        j                  d            S )r   r   c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   z-RepNCSPELAN4.forward_split.<locals>.<genexpr>  s     8a1R58r   )
r   ri   r   r   r   rl   rm   r  rB   r   r   s     @rL   r   zRepNCSPELAN4.forward_split  sg    !""DFFDFF#3Q78	8DHHdhh#788xx		!Q((rM   r   )r[   r\   r]   r^   r_   r=   rB   r`   rX   r   ra   rb   s   @rL   r%   r%     sd    13 1C 1S 1c 1c 1$) )%,, ))u|| ) )rM   r%   c                   4     e Zd ZdZdedededef fdZ xZS )r&   z!ELAN1 module with 4 convolutions.r7   re   r  r|  c                     t         |   ||||       |dz  | _        t        ||dd      | _        t        |dz  |dd      | _        t        ||dd      | _        t        |d|z  z   |dd      | _        y)z
        Initialize ELAN1 layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            c4 (int): Intermediate channels for convolutions.
        rQ   r   rg   N)r<   r=   r   r   ri   rl   rm   r  )rI   r7   re   r  r|  rK   s        rL   r=   zELAN1.__init__  sw     	RR(qB1%aQ*B1%a"fr1a0rM   )r[   r\   r]   r^   r_   r=   ra   rb   s   @rL   r&   r&     s,    +13 1C 1S 1c 1 1rM   r&   c                   d     e Zd ZdZdedef fdZdej                  dej                  fdZ xZ	S )r(   zAConv.r7   re   c                 J    t         |           t        ||ddd      | _        y)z
        Initialize AConv module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
        rg   rQ   r   N)r<   r=   r   ri   rI   r7   re   rK   s      rL   r=   zAConv.__init__  s$     	B1a(rM   rJ   rN   c                     t         j                  j                  j                  |ddddd      }| j	                  |      S )z!Forward pass through AConv layer.rQ   r   r   FT)rB   r>   
functional
avg_pool2dri   ro   s     rL   rX   zAConv.forward  s4    HH**1aAudCxx{rM   rZ   rb   s   @rL   r(   r(     s4    	)3 	)C 	) %,, rM   r(   c                   d     e Zd ZdZdedef fdZdej                  dej                  fdZ xZ	S )r'   zADown.r7   re   c                     t         |           |dz  | _        t        |dz  | j                  ddd      | _        t        |dz  | j                  ddd      | _        y)z
        Initialize ADown module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
        rQ   rg   r   r   N)r<   r=   r   r   ri   rl   r  s      rL   r=   zADown.__init__  sS     	qaAq1aAq1rM   rJ   rN   c                 T   t         j                  j                  j                  |ddddd      }|j	                  dd      \  }}| j                  |      }t         j                  j                  j                  |ddd      }| j                  |      }t        j                  ||fd      S )z!Forward pass through ADown layer.rQ   r   r   FTrg   )	rB   r>   r  r  r   ri   
max_pool2drl   r   )rI   rJ   r   r   s       rL   rX   zADown.forward  s    HH**1aAudCABXXb\XX  ++B1a8XXb\yy"b1%%rM   rZ   rb   s   @rL   r'   r'     s4    23 2C 2& &%,, &rM   r'   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZ	S )r)   z	SPP-ELAN.r7   re   r  rh   c                 B   t         |           || _        t        ||dd      | _        t        j                  |d|dz        | _        t        j                  |d|dz        | _        t        j                  |d|dz        | _	        t        d|z  |dd      | _
        y)z
        Initialize SPP-ELAN block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            k (int): Kernel size for max pooling.
        r   rQ   r   rP   N)r<   r=   r   r   ri   r>   r   rl   rm   r  cv5)rI   r7   re   r  rh   rK   s        rL   r=   zSPPELAN.__init__  s     	B1%<<AaaH<<AaaH<<AaaHBAq)rM   rJ   rN   c                     | j                  |      gj                  fd| j                  | j                  | j                  fD               | j                  t        j                  d            S )z#Forward pass through SPPELAN layer.c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   z"SPPELAN.forward.<locals>.<genexpr>  s     Ba1R5Br   r   )ri   r   rl   rm   r  r  rB   r   r   s     @rL   rX   zSPPELAN.forward  sP    XXa[M	BDHHdhh#ABBxx		!Q((rM   r   rZ   rb   s   @rL   r)   r)     sB    *3 *C *S *S *$) )%,, )rM   r)   c                        e Zd ZdZddedee   dededee   def fdZd	ej                  d
eej                     fdZ
 xZS )r+   z	CBLinear.r7   c2srh   r   r  r   c           
          t         |           || _        t        j                  |t        |      ||t        ||      |d      | _        y)a  
        Initialize CBLinear module.

        Args:
            c1 (int): Input channels.
            c2s (List[int]): List of output channel sizes.
            k (int): Kernel size.
            s (int): Stride.
            p (int | None): Padding.
            g (int): Groups.
        T)groupsr:   N)r<   r=   r  r>   r?   sumr   rA   )rI   r7   r  rh   r   r  r   rK   s          rL   r=   zCBLinear.__init__  s>     	IIb#c(Aq'!Q-PTU	rM   rJ   rN   c                 Z    | j                  |      j                  | j                  d      S )z$Forward pass through CBLinear layer.r   r   )rA   r   r  ro   s     rL   rX   zCBLinear.forward  s$    yy|!!$((!22rM   )r   r   Nr   )r[   r\   r]   r^   r_   r   r   r=   rB   r`   rX   ra   rb   s   @rL   r+   r+     sf    V3 VT#Y V3 Vs V8TW= Vdg V 3 3$u||*< 3rM   r+   c                   l     e Zd ZdZdee   f fdZdeej                     dej                  fdZ	 xZ
S )r*   zCBFuse.idxc                 0    t         |           || _        y)zv
        Initialize CBFuse module.

        Args:
            idx (List[int]): Indices for feature selection.
        N)r<   r=   r  )rI   r  rK   s     rL   r=   zCBFuse.__init__  s     	rM   xsrN   c           	         |d   j                   dd }t        |dd       D cg c]-  \  }}t        j                  || j                  |      |d      / }}}t        j                  t        j                  ||dd z         d      S c c}}w )z
        Forward pass through CBFuse layer.

        Args:
            xs (List[torch.Tensor]): List of input tensors.

        Returns:
            (torch.Tensor): Fused output tensor.
        r   rQ   Nnearest)sizemoder   r   )rR   	enumerater   interpolater  rB   r  stack)rI   r  target_sizer   rJ   ress         rL   rX   zCBFuse.forward'  s     fll12&[deghkikel[mnSWSTVWq}}Qtxx{^+INnnyyS2bc7]3;; os   2B	r[   r\   r]   r^   r   r_   r=   rB   r`   rX   ra   rb   s   @rL   r*   r*     s6    DI <$u||, < <rM   r*   c                   v     e Zd ZdZddedededededef fdZd	ej                  d
ej                  fdZ
 xZS )C3fr   r7   re   r   r   r   r   c                    t         |           t        ||z        t        |dd      | _        t        |dd      | _        t        d|z   z  |d      | _        t        j                  fdt        |      D              | _
        y)an  
        Initialize CSP bottleneck layer with two convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   rQ   c           	   3   @   K   | ]  }t        d d        ywr   r   r   s     rL   r   zC3f.__init__.<locals>.<genexpr>J  s&     l^_z"b(AAQUXYYlr   N)r<   r=   r_   r   ri   rl   rm   r>   r   r   r   r   s	       `` @rL   r=   zC3f.__init__9  sv     	a[B1%B1%Q"b!,lchijckllrM   rJ   rN   c                     | j                  |      | j                  |      gj                  fd| j                  D               | j	                  t        j                  d            S )zForward pass through C3f layer.c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   zC3f.forward.<locals>.<genexpr>O  r   r   r   )rl   ri   r   r   rm   rB   r   r   s     @rL   rX   zC3f.forwardL  sL    XXa[$((1+&	*466**xx		!Q((rM   r   r   rb   s   @rL   r  r  6  sY    Fm3 mC mC mt mPS m\a m&) )%,, )rM   r  c                   D     e Zd ZdZ	 d
dededededededef fd	Z xZS )r,   r   r7   re   r   c3kr   r   r   c                      t            ||||       t        j                   fdt	        |      D               _        y)aw  
        Initialize C3k2 module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of blocks.
            c3k (bool): Whether to use C3k blocks.
            e (float): Expansion ratio.
            g (int): Groups for convolutions.
            shortcut (bool): Whether to use shortcut connections.
        c              3      K   | ]K  }r#t        j                  j                  d       n!t        j                  j                         M yw)rQ   N)C3kr   r    )r   rV   r  r   rI   r   s     rL   r   z C3k2.__init__.<locals>.<genexpr>f  sG      
hi3C8Q/JtvvtvvW_ab<cc
s   AANr<   r=   r>   r   r   r   )	rI   r7   re   r   r  r   r   r   rK   s	   `   ` ``rL   r=   zC3k2.__init__V  s?     	RHa3 
mrstmu
 
rM   )r   Fr   r   Tr   rb   s   @rL   r,   r,   S  sO    F mq


#&
15
BG
RU
ei
 
rM   r,   c                   B     e Zd ZdZd
dededededededef fd	Z xZS )r  zhC3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks.r7   re   r   r   r   r   rh   c                     t         	|   ||||       t        ||z        t        j                  fdt        |      D         | _        y)ap  
        Initialize C3k module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
            k (int): Kernel size.
        c           	   3   D   K   | ]  }t        fd         yw)r   r   Nr   )r   rV   rd   r   rh   r   s     rL   r   zC3k.__init__.<locals>.<genexpr>~  s(      dVWBHaAq6S!Q!Q dr   Nr   )
rI   r7   re   r   r   r   r   rh   rd   rK   s
       `` `@rL   r=   zC3k.__init__n  sF     	RHa3a[ d[`ab[c derM   )r   Tr   r   rg   r   rb   s   @rL   r  r  k  sL    rf3 fC fC ft fs f[` fkn f frM   r  c                        e Zd ZdZdeddf fdZdej                  dej                  fdZdej                  dej                  fdZ	 ej                         d	        Z xZS )
r/   zfRepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture.edrN   Nc           	          t         |           t        ||ddd|d      | _        t        ||ddd|d      | _        || _        t        j                         | _        y)zm
        Initialize RepVGGDW module.

        Args:
            ed (int): Input and output channels.
        r  r   rg   Fr   rv   N)	r<   r=   r   rA   conv1r   r>   r  rv   )rI   r  rK   s     rL   r=   zRepVGGDW.__init__  sT     	RAqBE:	"b!QRU;
779rM   rJ   c                 f    | j                  | j                  |      | j                  |      z         S )z
        Perform a forward pass of the RepVGGDW block.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        )rv   rA   r  ro   s     rL   rX   zRepVGGDW.forward  s(     xx		!tzz!}455rM   c                 B    | j                  | j                  |            S )a  
        Perform a forward pass of the RepVGGDW block without fusing the convolutions.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        )rv   rA   ro   s     rL   rp  zRepVGGDW.forward_fuse  s     xx		!%%rM   c                 F   t        | j                  j                  | j                  j                        }t        | j                  j                  | j                  j                        }|j                  }|j
                  }|j                  }|j
                  }t        j                  j                  j                  |g d      }||z   }||z   }|j                  j                  j                  |       |j
                  j                  j                  |       || _        | `y)z
        Fuse the convolutional layers in the RepVGGDW block.

        This method fuses the convolutional layers and updates the weights and biases accordingly.
        )rQ   rQ   rQ   rQ   N)r   rA   r  r  rG   r:   rB   r>   r  r   rH   copy_)	rI   rA   r  conv_wconv_bconv1_wconv1_bfinal_conv_wfinal_conv_bs	            rL   rq  zRepVGGDW.fuse  s      				= $**--@,,**((%%))'<@''|,		\*	JrM   )r[   r\   r]   r^   r_   r=   rB   r`   rX   rp  no_gradrq  ra   rb   s   @rL   r/   r/     sk    p3 4 
6 
6%,, 
6
&ell 
&u|| 
& U]]_ rM   r/   c                   r     e Zd ZdZddededededef
 fdZdej                  d	ej                  fd
Z
 xZS )r0   a  
    Conditional Identity Block (CIB) module.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
        e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
        lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
    r7   re   r   r   lkc                 N   t         |           t        ||z        }t        j                  t        ||d|      t        |d|z  d      |rt        d|z        nt        d|z  d|z  dd|z        t        d|z  |d      t        ||d|            | _        |xr ||k(  | _        y)a!  
        Initialize the CIB module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            e (float): Expansion ratio.
            lk (bool): Whether to use RepVGGDW.
        rg   r  rQ   r   N)	r<   r=   r_   r>   r   r   r/   ri   r   )rI   r7   re   r   r   r  rd   rK   s          rL   r=   zCIB.__init__  s     	a[==Rb!QVQ "HQVQVQVQ!b&(IRQRb!
 (brM   rJ   rN   c                 d    | j                   r|| j                  |      z   S | j                  |      S )z
        Forward pass of the CIB module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor.
        )r   ri   ro   s     rL   rX   zCIB.forward  s)     #'((q488A;;;rM   )Tr   Fr   rb   s   @rL   r0   r0     sL    	)3 )C )4 )5 )TX ).
< 
<%,, 
<rM   r0   c                   D     e Zd ZdZ	 d
dededededededef fd	Z xZS )r1   aQ  
    C2fCIB class represents a convolutional block with C2f and CIB modules.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        n (int, optional): Number of CIB modules to stack. Defaults to 1.
        shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
        lk (bool, optional): Whether to use local key connection. Defaults to False.
        g (int, optional): Number of groups for grouped convolution. Defaults to 1.
        e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
    r7   re   r   r   r  r   r   c                      t            |||||       t        j                   fdt	        |      D               _        y)a  
        Initialize C2fCIB module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of CIB modules.
            shortcut (bool): Whether to use shortcut connection.
            lk (bool): Whether to use local key connection.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3   f   K   | ](  }t        j                  j                  d        * yw)r   )r   r  N)r0   r   )r   rV   r  rI   r   s     rL   r   z"C2fCIB.__init__.<locals>.<genexpr>  s)     ]qs4664668srJJ]s   .1Nr  )	rI   r7   re   r   r   r  r   r   rK   s	   `   ``  rL   r=   zC2fCIB.__init__  s9     	RHa3]TYZ[T\]]rM   )r   FFr   r   r   rb   s   @rL   r1   r1     sZ     nq^^^#&^6:^HL^Y\^ej^ ^rM   r1   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
r2   a  
    Attention module that performs self-attention on the input tensor.

    Args:
        dim (int): The input tensor dimension.
        num_heads (int): The number of attention heads.
        attn_ratio (float): The ratio of the attention key dimension to the head dimension.

    Attributes:
        num_heads (int): The number of attention heads.
        head_dim (int): The dimension of each attention head.
        key_dim (int): The dimension of the attention key.
        scale (float): The scaling factor for the attention scores.
        qkv (Conv): Convolutional layer for computing the query, key, and value.
        proj (Conv): Convolutional layer for projecting the attended values.
        pe (Conv): Convolutional layer for positional encoding.
    r   	num_heads
attn_ratioc                 P   t         |           || _        ||z  | _        t	        | j                  |z        | _        | j
                  dz  | _        | j
                  |z  }||dz  z   }t        ||dd      | _        t        ||dd      | _	        t        ||dd|d      | _
        y)	z
        Initialize multi-head attention module.

        Args:
            dim (int): Input dimension.
            num_heads (int): Number of attention heads.
            attn_ratio (float): Attention ratio for key dimension.
              rQ   r   Fru   rg   r  N)r<   r=   r  head_dimr_   key_dimr(  r   qkvrQ  pe)rI   r   r  r  nh_kdr7  rK   s         rL   r=   zAttention.__init__(  s     	"y(4==:56\\4'
y(%!)OQu-c1%0	sCA%8rM   rJ   rN   c           	      P   |j                   \  }}}}||z  }| j                  |      }|j                  || j                  | j                  dz  | j
                  z   |      j                  | j                  | j                  | j
                  gd      \  }}	}
|j                  dd      |	z  | j                  z  }|j                  d      }|
|j                  dd      z  j                  ||||      | j                  |
j                  ||||            z   }| j                  |      }|S )z
        Forward pass of the Attention module.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            (torch.Tensor): The output tensor after self-attention.
        rQ   r   r   )rR   r  rF   r  r  r  r   rS   r(  rT   r  r[  rQ  )rI   rJ   BCHWNr  r]  rh   r^  r@  s               rL   rX   zAttention.forward<  s    WW
1aEhhqk((1dnndllQ.>.NPQRXX\\4<<7Q Y 
1a B#a'4::5|||#B''--aAq9DGGAIIaQRTUWXDY<ZZIIaLrM   )r_  r   r   rb   s   @rL   r2   r2     s=    $9C 9C 9 9( %,, rM   r2   c                   r     e Zd ZdZddededededdf
 fdZd	ej                  dej                  fd
Z
 xZS )PSABlockaK  
    PSABlock class implementing a Position-Sensitive Attention block for neural networks.

    This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
    with optional shortcut connections.

    Attributes:
        attn (Attention): Multi-head attention module.
        ffn (nn.Sequential): Feed-forward neural network module.
        add (bool): Flag indicating whether to add shortcut connections.

    Methods:
        forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.

    Examples:
        Create a PSABlock and perform a forward pass
        >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
        >>> input_tensor = torch.randn(1, 128, 32, 32)
        >>> output_tensor = psablock(input_tensor)
    r   r  r  r   rN   Nc           	          t         |           t        |||      | _        t	        j
                  t        ||dz  d      t        |dz  |dd            | _        || _        y)a&  
        Initialize the PSABlock.

        Args:
            c (int): Input and output channels.
            attn_ratio (float): Attention ratio for key dimension.
            num_heads (int): Number of attention heads.
            shortcut (bool): Whether to use shortcut connections.
        r  r  rQ   r   Fru   N)	r<   r=   r2   r@  r>   r   r   ffnr   )rI   r   r  r  r   rK   s        rL   r=   zPSABlock.__init__j  sU     	aJ)L	==aQ!2DQ1%4PQrM   rJ   c                     | j                   r|| j                  |      z   n| j                  |      }| j                   r|| j                  |      z   }|S | j                  |      }|S )z
        Execute a forward pass through PSABlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after attention and feed-forward processing.
        )r   r@  r  ro   s     rL   rX   zPSABlock.forwardz  sV     !%A		!diil#xxAO .2XXa[rM   )r   rP   T)r[   r\   r]   r^   r_   rD   r   r=   rB   r`   rX   ra   rb   s   @rL   r  r  T  sM    *# 5 3 VZ fj   %,, rM   r  c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
r3   a  
    PSA class for implementing Position-Sensitive Attention in neural networks.

    This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
    input tensors, enhancing feature extraction and processing capabilities.

    Attributes:
        c (int): Number of hidden channels after applying the initial convolution.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        attn (Attention): Attention module for position-sensitive attention.
        ffn (nn.Sequential): Feed-forward network for further processing.

    Methods:
        forward: Applies position-sensitive attention and feed-forward network to the input tensor.

    Examples:
        Create a PSA module and apply it to an input tensor
        >>> psa = PSA(c1=128, c2=128, e=0.5)
        >>> input_tensor = torch.randn(1, 128, 64, 64)
        >>> output_tensor = psa.forward(input_tensor)
    r7   re   r   c           	         t         |           ||k(  sJ t        ||z        | _        t	        |d| j                  z  dd      | _        t	        d| j                  z  |d      | _        t        | j                  d| j                  dz        | _        t        j                  t	        | j                  | j                  dz  d      t	        | j                  dz  | j                  dd            | _        y)	z
        Initialize PSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            e (float): Expansion ratio.
        rQ   r   r   @   r  Fru   N)r<   r=   r_   r   r   ri   rl   r2   r@  r>   r   r  )rI   r7   re   r   rK   s       rL   r=   zPSA.__init__  s     	RxxR!VAJ1-DFF
B*dff"M	==dffdffqj!!<d466A:tvvWX^c>derM   rJ   rN   c                    | j                  |      j                  | j                  | j                  fd      \  }}|| j                  |      z   }|| j	                  |      z   }| j                  t        j                  ||fd            S )z
        Execute forward pass in PSA module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after attention and feed-forward processing.
        r   r   )ri   r   r   r@  r  rl   rB   r   r   s       rL   rX   zPSA.forward  ss     xx{  $&&$&&!1q 91		!Oxx		1a&!,--rM   )r   r   rb   s   @rL   r3   r3     sA    .f3 fC fE f$. .%,, .rM   r3   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z	 xZ
S )r.   aL  
    C2PSA module with attention mechanism for enhanced feature extraction and processing.

    This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
    capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.

    Methods:
        forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.

    Notes:
        This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.

    Examples:
        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
        >>> input_tensor = torch.randn(1, 256, 64, 64)
        >>> output_tensor = c2psa(input_tensor)
    r7   re   r   r   c                 (    t                    ||k(  sJ t        ||z         _        t	        |d j                  z  dd       _        t	        d j                  z  |d       _        t        j                   fdt        |      D          _
        y)z
        Initialize C2PSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of PSABlock modules.
            e (float): Expansion ratio.
        rQ   r   c              3   h   K   | ])  }t        j                  d j                  dz         + ywr   r  r  Nr  r   r   rV   rI   s     rL   r   z!C2PSA.__init__.<locals>.<genexpr>  s+      l^_$&&SDFFVXL!Y!Y lr   Nr   rI   r7   re   r   r   rK   s   `    rL   r=   zC2PSA.__init__  sy     	RxxR!VAJ1-DFF
B* lchijck lmrM   rJ   rN   c                     | j                  |      j                  | j                  | j                  fd      \  }}| j                  |      }| j	                  t        j                  ||fd            S )z
        Process the input tensor through a series of PSA blocks.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after processing.
        r   r   )ri   r   r   r   rl   rB   r   r   s       rL   rX   zC2PSA.forward  s]     xx{  $&&$&&!1q 91FF1Ixx		1a&!,--rM   r   r   r   rb   s   @rL   r.   r.     sI    0n3 nC nC n n$. .%,, .rM   r.   c            	       6     e Zd ZdZddedededef fdZ xZS )r-   a  
    C2fPSA module with enhanced feature extraction using PSA blocks.

    This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature extraction.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c.
        m (nn.ModuleList): List of PSA blocks for feature extraction.

    Methods:
        forward: Performs a forward pass through the C2fPSA module.
        forward_split: Performs a forward pass using split() instead of chunk().

    Examples:
        >>> import torch
        >>> from ultralytics.models.common import C2fPSA
        >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> output = model(x)
        >>> print(output.shape)
    r7   re   r   r   c                      ||k(  sJ t            ||||       t        j                   fdt	        |      D               _        y)z
        Initialize C2fPSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of PSABlock modules.
            e (float): Expansion ratio.
        )r   r   c              3   h   K   | ])  }t        j                  d j                  dz         + ywr  r  r  s     rL   r   z"C2fPSA.__init__.<locals>.<genexpr>"  s+     j\]x3$&&TV,WWjr   Nr  r  s   `    rL   r=   zC2fPSA.__init__  sC     RxxR1*jafghaijjrM   r  )r[   r\   r]   r^   r_   rD   r=   ra   rb   s   @rL   r-   r-     s4    0k3 kC kC k k krM   r-   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z xZ	S )
r4   a<  
    SCDown module for downsampling with separable convolutions.

    This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
    efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.

    Attributes:
        cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
        cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.

    Methods:
        forward: Applies the SCDown module to the input tensor.

    Examples:
        >>> import torch
        >>> from ultralytics import SCDown
        >>> model = SCDown(c1=64, c2=128, k=3, s=2)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> y = model(x)
        >>> print(y.shape)
        torch.Size([1, 128, 64, 64])
    r7   re   rh   r   c                 t    t         |           t        ||dd      | _        t        |||||d      | _        y)z
        Initialize SCDown module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
        r   F)rh   r   r   rv   N)r<   r=   r   ri   rl   )rI   r7   re   rh   r   rK   s        rL   r=   zSCDown.__init__=  s8     	B1%B!qBE:rM   rJ   rN   c                 B    | j                  | j                  |            S )z
        Apply convolution and downsampling to the input tensor.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Downsampled output tensor.
        )rl   ri   ro   s     rL   rX   zSCDown.forwardK  s     xx$$rM   rZ   rb   s   @rL   r4   r4   %  sD    .;3 ;C ;C ;C ;
% 
%%,, 
%rM   r4   c                   t     e Zd ZdZ	 ddededededef
 fdZdej                  d	ej                  fd
Z
 xZS )r5   aZ  
    TorchVision module to allow loading any torchvision model.

    This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and customize the model by truncating or unwrapping layers.

    Attributes:
        m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.

    Args:
        model (str): Name of the torchvision model to load.
        weights (str, optional): Pre-trained weights to load. Default is "DEFAULT".
        unwrap (bool, optional): If True, unwraps the model to a sequential containing all but the last `truncate` layers. Default is True.
        truncate (int, optional): Number of layers to truncate from the end if `unwrap` is True. Default is 2.
        split (bool, optional): Returns output from intermediate child modules as list. Default is False.
    modelweightsunwraptruncater   c                    ddl }t        | 	          t        |j                  d      r#|j                  j                  ||      | _        n. |j                  j                  |   t        |            | _        |rt        | j                  j                               }t        |d   t        j                        r#g t        |d   j                               |dd }t        j                  |r|d|  n| | _        || _        yd| _        t        j                         x| j                  _        | j                  _        y)an  
        Load the model and weights from torchvision.

        Args:
            model (str): Name of the torchvision model to load.
            weights (str): Pre-trained weights to load.
            unwrap (bool): Whether to unwrap the model.
            truncate (int): Number of layers to truncate.
            split (bool): Whether to split the output.
        r   N	get_model)r  )
pretrainedr   F)torchvisionr<   r=   hasattrmodelsr  r   __dict__r   r   children
isinstancer>   r   r   r   headheads)	rI   r  r  r  r  r   r  layersrK   s	           rL   r=   zTorchVision.__init__i  s     	;%%{3 ''11%1IDF7[''0074=QDF$&&//+,F&)R]]3C4q	 2 2 45Cqr
C]]8VJhY%7QDFDJDJ)+6DFFK$&&,rM   rJ   rN   c                     | j                   r)|gj                  fd| j                  D               S | j                  |      S )z
        Forward pass through the model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor | List[torch.Tensor]): Output tensor or list of tensors.
        c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   z&TorchVision.forward.<locals>.<genexpr>  s     .!QquX.r   )r   r   r   r   s     @rL   rX   zTorchVision.forward  sD     ::AHH.tvv..  q	ArM   )DEFAULTTrQ   F)r[   r\   r]   r^   strr   r_   r=   rB   r`   rX   ra   rb   s   @rL   r5   r5   X  sW    " kp77#&7<@7SV7cg7< %,, rM   r5   c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ xZ	S )
AAttna  
    Area-attention module for YOLO models, providing efficient attention mechanisms.

    This module implements an area-based attention mechanism that processes input features in a spatially-aware manner,
    making it particularly effective for object detection tasks.

    Attributes:
        area (int): Number of areas the feature map is divided.
        num_heads (int): Number of heads into which the attention mechanism is divided.
        head_dim (int): Dimension of each attention head.
        qkv (Conv): Convolution layer for computing query, key and value tensors.
        proj (Conv): Projection convolution layer.
        pe (Conv): Position encoding convolution layer.

    Methods:
        forward: Applies area-attention to input tensor.

    Examples:
        >>> attn = AAttn(dim=256, num_heads=8, area=4)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = attn(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    r   r  areac           	          t         |           || _        || _        ||z  x| _        }|| j                  z  }t        ||dz  dd      | _        t        ||dd      | _        t        ||ddd|d      | _        y)a'  
        Initialize an Area-attention module for YOLO models.

        Args:
            dim (int): Number of hidden channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            area (int): Number of areas the feature map is divided.
        rg   r   Fru   r  r  N)	r<   r=   r  r  r  r   r  rQ  r  )rI   r   r  r  r  all_head_dimrK   s         rL   r=   zAAttn.__init__  s~     		"#&)#33$..0\A-qe<sA59	|S!QSeDrM   rJ   rN   c                    |j                   \  }}}}||z  }| j                  |      j                  d      j                  dd      }| j                  dkD  r@|j                  || j                  z  || j                  z  |dz        }|j                   \  }}}|j                  ||| j                  | j                  dz        j                  dddd      j                  | j                  | j                  | j                  gd      \  }	}
}|	j                  dd      |
z  | j                  dz  z  }|j                  d      }||j                  dd      z  }|j                  dddd      }|j                  dddd      }| j                  dkD  rj|j                  || j                  z  || j                  z  |      }|j                  || j                  z  || j                  z  |      }|j                   \  }}}|j                  ||||      j                  dddd      j                         }|j                  ||||      j                  dddd      j                         }|| j                  |      z   }| j                  |      S )	z
        Process the input tensor through the area-attention.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after area-attention.
        rQ   r   rg   r   r   r  r   r  )rR   r  flattenrS   r  r[  rF   r  r  permuter   rT   
contiguousr  rQ  )rI   rJ   r  r  r  r  r  r  rV   r]  rh   r^  r@  s                rL   rX   zAAttn.forward  s)    WW
1aEhhqk!!!$..q!499q=++a$))mQ$))^QUCCiiGAq!HHQ4>>4==1+<=WQ1a UDMM4==$--@aUH 	1a
 B#a'DMM4,?@|||#r2&&IIaAq!IIaAq!99q=		!tyy.!dii-;A		!tyy.!dii-;AggGAq!IIaAq!))!Q15@@BIIaAq!))!Q15@@B
Nyy|rM   r   rZ   rb   s   @rL   r  r    sA    2EC EC Es E(% %%,, %rM   r  c            	            e Zd ZdZddedededef fdZdej                  fdZ	d	e
j                  d
e
j                  fdZ xZS )ABlocka  
    Area-attention block module for efficient feature extraction in YOLO models.

    This module implements an area-attention mechanism combined with a feed-forward network for processing feature maps.
    It uses a novel area-based attention approach that is more efficient than traditional self-attention while
    maintaining effectiveness.

    Attributes:
        attn (AAttn): Area-attention module for processing spatial features.
        mlp (nn.Sequential): Multi-layer perceptron for feature transformation.

    Methods:
        _init_weights: Initializes module weights using truncated normal distribution.
        forward: Applies area-attention and feed-forward processing to input tensor.

    Examples:
        >>> block = ABlock(dim=256, num_heads=8, mlp_ratio=1.2, area=1)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = block(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    r   r  	mlp_ratior  c           	          t         |           t        |||      | _        t	        ||z        }t        j                  t        ||d      t        ||dd            | _        | j                  | j                         y)ae  
        Initialize an Area-attention block module.

        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
            area (int): Number of areas the feature map is divided.
        )r  r  r   Fru   N)r<   r=   r  r@  r_   r>   r   r   mlpapply_init_weights)rI   r   r  r  r  mlp_hidden_dimrK   s         rL   r=   zABlock.__init__  si     	#>	S9_-==c>1!=tNTWYZ`e?fg

4%%&rM   r   c                     t        |t        j                        rct        j                  j	                  |j
                  d       |j                  +t        j                  j                  |j                  d       yyy)z
        Initialize weights using a truncated normal distribution.

        Args:
            m (nn.Module): Module to initialize.
        g{Gz?)stdNr   )r  r>   r?   inittrunc_normal_rG   r:   	constant_)rI   r   s     rL   r  zABlock._init_weights  sY     a#GG!!!((!5vv!!!!&&!, " $rM   rJ   rN   c                 R    || j                  |      z   }|| j                  |      z   S )z
        Forward pass through ABlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after area-attention and feed-forward processing.
        )r@  r  ro   s     rL   rX   zABlock.forward%  s(     		!488A;rM   )g333333?r   )r[   r\   r]   r^   r_   rD   r=   r>   r   r  rB   r`   rX   ra   rb   s   @rL   r  r    sT    .'C 'C 'E 's '$
-ryy 
- %,, rM   r  c                        e Zd ZdZ	 	 	 	 	 	 	 	 ddededededededed	ed
edef fdZdej                  dej                  fdZ
 xZS )A2C2fa  
    Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.

    This module extends the C2f architecture by incorporating area-attention and ABlock layers for improved feature
    processing. It supports both area-attention and standard convolution modes.

    Attributes:
        cv1 (Conv): Initial 1x1 convolution layer that reduces input channels to hidden channels.
        cv2 (Conv): Final 1x1 convolution layer that processes concatenated features.
        gamma (nn.Parameter | None): Learnable parameter for residual scaling when using area attention.
        m (nn.ModuleList): List of either ABlock or C3k modules for feature processing.

    Methods:
        forward: Processes input through area-attention or standard convolution pathway.

    Examples:
        >>> m = A2C2f(512, 512, n=1, a2=True, area=1)
        >>> x = torch.randn(1, 512, 32, 32)
        >>> output = m(x)
        >>> print(output.shape)
        torch.Size([1, 512, 32, 32])
    r7   re   r   a2r  residualr  r   r   r   c                   	
 t         |           t        ||z        dz  dk(  sJ d       t        |dd      | _        t        d|z   z  |d      | _        r/|r-t        j                  dt        j                  |      z  d      nd| _
        t        j                  	
fd	t        |      D              | _        y)
a  
        Initialize Area-Attention C2f module.

        Args:
            c1 (int): Number of input channels.
            c2 (int): Number of output channels.
            n (int): Number of ABlock or C3k modules to stack.
            a2 (bool): Whether to use area attention blocks. If False, uses C3k blocks instead.
            area (int): Number of areas the feature map is divided.
            residual (bool): Whether to use residual connections with learnable gamma parameter.
            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
            e (float): Channel expansion ratio for hidden channels.
            g (int): Number of groups for grouped convolutions.
            shortcut (bool): Whether to use shortcut connections in C3k blocks.
        rq   r   z(Dimension of ABlock be a multiple of 32.r   g{Gz?TrK  Nc              3      K   | ];  }r&t        j                  fd t        d      D         nt        d       = yw)c              3   @   K   | ]  }t        d z          yw)rq   N)r  )r   rV   r  rd   r  s     rL   r   z+A2C2f.__init__.<locals>.<genexpr>.<genexpr>p  s      TaF2rRxDATr   rQ   N)r>   r   r   r  )r   rV   r%  r  rd   r   r  r   s     rL   r   z!A2C2f.__init__.<locals>.<genexpr>o  sI      
   MMT5QR8TURQ!,-
s   AA)r<   r=   r_   r   ri   rl   r>   rE   rB   r/  gammar   r   r   )rI   r7   re   r   r%  r  r&  r  r   r   r   rd   rK   s       `` ` ``@rL   r=   zA2C2f.__init__K  s    8 	a[Bw!|GGG|B1%Q"b!,PRW_R\\$B"7tLei
 
 
 1X	
 
rM   rJ   rN   c                 D   | j                  |      gj                  fd| j                  D               | j                  t	        j
                  d            | j                  7|| j                  j                  dt        | j                        dd      z  z   S S )z
        Forward pass through A2C2f layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after processing.
        c              3   4   K   | ]  } |d            ywr   r   r   s     rL   r   z A2C2f.forward.<locals>.<genexpr>  r   r   r   r   )	ri   r   r   rl   rB   r   r*  rF   r   r   s     @rL   rX   zA2C2f.forwardv  s|     XXa[M	*466**HHUYYq!_%::!tzzr3tzz?AqAAEEErM   )r   Tr   Fg       @r   r   Tr   rb   s   @rL   r$  r$  3  s    6 )
)
 )
 	)

 )
 )
 )
 )
 )
 )
 )
V %,, rM   r$  c            	       n     e Zd ZdZd
dedededdf fdZdej                  dej                  fd	Z xZ	S )	SwiGLUFFNz@SwiGLU Feed-Forward Network for transformer-based architectures.r'  r   r   rN   Nc                     t         |           t        j                  |||z        | _        t        j                  ||z  dz  |      | _        y)z
        Initialize SwiGLU FFN with input dimension, output dimension, and expansion factor.

        Args:
            gc (int): Guide channels.
            ec (int): Embedding channels.
            e (int): Expansion factor.
        rQ   N)r<   r=   r>   r+  w12w3)rI   r'  r   r   rK   s       rL   r=   zSwiGLUFFN.__init__  s@     	99RR())AFaK,rM   rJ   c                     | j                  |      }|j                  dd      \  }}t        j                  |      |z  }| j	                  |      S )z.Apply SwiGLU transformation to input features.rQ   r   r   )r0  r   r   silur1  )rI   rJ   x12r   r   hiddens         rL   rX   zSwiGLUFFN.forward  sD    hhqk1"%BbwwvrM   )rP   rZ   rb   s   @rL   r.  r.    sB    J-3 -C -C - - %,, rM   r.  c                   x     e Zd ZdZdej
                  ddf fdZdej                  dej                  fdZ	 xZ
S )Residualz7Residual connection wrapper for neural network modules.r   rN   Nc                 $   t         |           || _        t        j                  j                  | j                  j                  j                         t        j                  j                  | j                  j                  j                         y)z
        Initialize residual module with the wrapped module.

        Args:
            m (nn.Module): Module to wrap with residual connection.
        N)	r<   r=   r   r>   r  zeros_r1  r:   rG   )rI   r   rK   s     rL   r=   zResidual.__init__  sS     	
tvvyy~~& 	tvvyy''(rM   rJ   c                 *    || j                  |      z   S )z,Apply residual connection to input features.r   ro   s     rL   rX   zResidual.forward  s    466!9}rM   )r[   r\   r]   r^   r>   r   r=   rB   r`   rX   ra   rb   s   @rL   r7  r7    s8    A)")) ) ) %,, rM   r7  c                        e Zd ZdZdee   dedef fdZdeej                     dej                  dej                  fd	Z	 xZ
S )
SAVPEzESpatial-Aware Visual Prompt Embedding module for feature enhancement.rH  r  r9  c           	         t         |           t        j                  fdt	        |      D              | _        t        j                  fdt	        |      D              | _        d| _        t        j                  dz  |d      | _	        t        j                  dz  | j                  dd      | _
        t        j                  d| j                  dd      | _        t        j                  t        d| j                  z  | j                  d      t        j                  | j                  | j                  dd            | _        y)	a  
        Initialize SAVPE module with channels, intermediate channels, and embedding dimension.

        Args:
            ch (List[int]): List of input channel dimensions.
            c3 (int): Intermediate channels.
            embed (int): Embedding dimension.
        c           	   3      K   | ]c  \  }}t        j                  t        |d       t        d       |dv rt        j                  |dz        nt        j                                e yw)rg      r   rQ   rQ   scale_factorNr>   r   r   Upsampler   r   r   rJ   r  s      rL   r   z!SAVPE.__init__.<locals>.<genexpr>  sa      !
 1 MMQARQTUY_T_!a%1Pegepeper!
s   A)A,c              3      K   | ]W  \  }}t        j                  t        |d       |dv rt        j                  |dz        nt        j                                Y yw)r   r?  rQ   r@  NrB  rD  s      rL   r   z!SAVPE.__init__.<locals>.<genexpr>  sP      !
1 MM$q"a.QRX["++1q5*I^`^i^i^kl!
s   AA rY   rg   r   )ry   rQ   N)r<   r=   r>   r   r  ri   rl   r   r?   rm   r  r  r   r   cv6)rI   rH  r  r9  rK   s     ` rL   r=   zSAVPE.__init__  s     	== !
 ""	!
 
 == !
!"!
 

 99QVUA.99QVTVVQ:99Q15==a$&&j$&&!!<biiPTPVPVXYcd>efrM   rJ   vprN   c                    t        |      D cg c]  \  }} | j                  |   |       }}}| j                  t        j                  |d            }t        |      D cg c]  \  }} | j
                  |   |       }}}| j                  t        j                  |d            }|j                  \  }}}}	|j                  d   }
|j                  ||d      }|j                  |d| j                  ||	      j                  d|
ddd      j                  ||
z  | j                  ||	      }|j                  ||
d||	      j                  ||
z  d||	      }| j                  t        j                  || j                  |      fd            }|j                  ||
| j                  d      }|j                  ||
dd      }||z  t        j                  |      t        j                  |j                         j"                  z  z   }t%        j&                  |dt        j(                        j+                  |j                         }|j-                  dd      |j                  || j                  || j                  z  d      j-                  dd      z  }t%        j.                  |j-                  dd      j                  ||
d      dd      S c c}}w c c}}w )	zJProcess input features and visual prompts to generate enhanced embeddings.r   r   r   )r   r;   r  rQ   rf  )r  rl   r  rB   r   ri   rm   rR   rF   r[  r   expandrF  r  logical_notfinfor;   minr   rT   rD   torS   rh  )rI   rJ   rG  r   xir   r  r  r  r  Qscore
aggregateds                rL   rX   zSAVPE.forward  sO   *3A,7B[TXXa[_77HHUYYqa()*3A,7B[TXXa[_77HHUYYqa()WW
1aHHQKFF1aIIaDFFAq)00QBCKKAPQESWSYSY[\^_`ZZ1aA&..q1uaA>HHUYY488B<0a89IIaDFFB'ZZ1a$B**2.QWW1E1I1III		%Ru{{;>>u{{K__R,qyyDFFAKQS/T/^/^_ace/ff
{{://B7??1bIrUVWW5 8 8s   K%Kr  rb   s   @rL   r<  r<    sW    Og49 g# gc g8Xell+ X X%,, XrM   r<  )Lr^   typingr   r   r   rB   torch.nnr>   torch.nn.functionalr  r   ultralytics.utils.torch_utilsr   rA   r   r	   r
   r   r   r   transformerr   __all__r   r   r"   r   r   r   r   r   r   r   r   r   r#   r   r   r   r    r!   r  r$   r%  r   r   r   r   ru  rx  r%   r&   r(   r'   r)   r+   r*   r  r,   r  r/   r0   r1   r2   r  r3   r.   r-   r4   r5   r  r  r$  r.  r7  r<  r   rM   rL   <module>rY     s    ( (     : F F )(V\")) \6>BII >.#RYY #L+(bii +(\D")) D0)299 )8 ,6 68 )"))  )FJ J6x" x(;BII ;202 0(Mb M(/bii /:P P8CBII C@J")) J2")) @3$")) 3$lB)bii B)J@%ryy @%F6bii 6606		 06f,J ,,_R _()299 )D1L 1*BII (&BII &4)bii )83ryy 30<RYY <8)")) ):
3 
0f" f,@uxx @F-<")) -<`^S ^B<		 <~2ryy 2j7.")) 7.t7.BII 7.t%kS %kP0%RYY 0%f>")) >BSBII SlARYY AHRBII Rj		 0ryy ,;XBII ;XrM   