from typing import List, Optional, Tuple, Type

import torch
from torch import nn

from ultralytics.nn.modules import MLP, LayerNorm2d


class MaskDecoder(nn.Module):
    """
    Decoder module for generating masks and their associated quality scores using a transformer architecture.

    This class predicts masks given image and prompt embeddings, utilizing a transformer to process the inputs and
    generate mask predictions along with their quality scores.

    Attributes:
        transformer_dim (int): Channel dimension for the transformer module.
        transformer (nn.Module): Transformer module used for mask prediction.
        num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
        iou_token (nn.Embedding): Embedding for the IoU token.
        num_mask_tokens (int): Number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for the mask tokens.
        output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
        output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
        iou_prediction_head (nn.Module): MLP for predicting mask quality.

    Methods:
        forward: Predict masks given image and prompt embeddings.
        predict_masks: Internal method for mask prediction.

    Examples:
        >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
        >>> masks, iou_pred = decoder(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, multimask_output=True
        ... )
        >>> print(f"Predicted masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
    """

    def __init__(
        self,
        transformer_dim: int,
        transformer: nn.Module,
        num_multimask_outputs: int = 3,
        activation: Type[nn.Module] = nn.GELU,
        iou_head_depth: int = 3,
        iou_head_hidden_dim: int = 256,
    ) -> None:
        """
        Initialize the MaskDecoder module for generating masks and their associated quality scores.

        Args:
            transformer_dim (int): Channel dimension for the transformer module.
            transformer (nn.Module): Transformer module used for mask prediction.
            num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
            activation (Type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.

        Examples:
            >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer)
            >>> print(decoder)
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer

        self.num_multimask_outputs = num_multimask_outputs

        self.iou_token = nn.Embedding(1, transformer_dim)
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )
        self.output_hypernetworks_mlps = nn.ModuleList(
            [MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)]
        )

        self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth)

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder.
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings.
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes.
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs.
            multimask_output (bool): Whether to return multiple masks or a single mask.

        Returns:
            masks (torch.Tensor): Batched predicted masks.
            iou_pred (torch.Tensor): Batched predictions of mask quality.

        Examples:
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
            >>> image_emb = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_emb = torch.rand(1, 2, 256)
            >>> dense_emb = torch.rand(1, 256, 64, 64)
            >>> masks, iou_pred = decoder(image_emb, image_pe, sparse_emb, dense_emb, multimask_output=True)
            >>> print(f"Masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
        """
        masks, iou_pred = self.predict_masks(
            image_embeddings=image_embeddings,
            image_pe=image_pe,
            sparse_prompt_embeddings=sparse_prompt_embeddings,
            dense_prompt_embeddings=dense_prompt_embeddings,
        )

        # Select the correct mask or masks for output
        mask_slice = slice(1, None) if multimask_output else slice(0, 1)
        masks = masks[:, mask_slice, :, :]
        iou_pred = iou_pred[:, mask_slice]

        return masks, iou_pred

    def predict_masks(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Predict masks and quality scores using image and prompt embeddings via transformer architecture."""
        # Concatenate output tokens
        output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
        output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.shape[0], -1, -1)
        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

        # Expand per-image data in batch direction to be per-mask
        src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
        src = src + dense_prompt_embeddings
        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
        b, c, h, w = src.shape

        # Run the transformer
        hs, src = self.transformer(src, pos_src, tokens)
        iou_token_out = hs[:, 0, :]
        mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]

        # Upscale mask embeddings and predict masks using the mask tokens
        src = src.transpose(1, 2).view(b, c, h, w)
        upscaled_embedding = self.output_upscaling(src)
        hyper_in_list = [
            self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
        ]
        hyper_in = torch.stack(hyper_in_list, dim=1)
        b, c, h, w = upscaled_embedding.shape
        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)

        # Generate mask quality predictions
        iou_pred = self.iou_prediction_head(iou_token_out)

        return masks, iou_pred
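

# NOTE: Both decoders delegate attention to an injected two-way transformer with the call
# signature `hs, src = self.transformer(src, pos_src, tokens)`, where `src` is the
# (B, C, H, W) image embedding summed with the dense prompts, `pos_src` its positional
# encoding, and `tokens` the concatenated output and sparse prompt tokens. The transformer
# is expected to return the attended tokens and the updated image features flattened to
# (B, H*W, C). A minimal stand-in satisfying this contract is sketched at the bottom of
# this file.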


class SAM2MaskDecoder(nn.Module):
    """
    Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.

    This class extends the functionality of the MaskDecoder, incorporating additional features such as
    high-resolution feature processing, dynamic multimask output, and object score prediction.

    Attributes:
        transformer_dim (int): Channel dimension of the transformer.
        transformer (nn.Module): Transformer used to predict masks.
        num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
        iou_token (nn.Embedding): Embedding for IOU token.
        num_mask_tokens (int): Total number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for mask tokens.
        pred_obj_scores (bool): Whether to predict object scores.
        obj_score_token (nn.Embedding): Embedding for object score token.
        use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
        output_upscaling (nn.Sequential): Upscaling layers for output.
        use_high_res_features (bool): Whether to use high-resolution features.
        conv_s0 (nn.Conv2d): Convolutional layer for high-resolution features (s0).
        conv_s1 (nn.Conv2d): Convolutional layer for high-resolution features (s1).
        output_hypernetworks_mlps (nn.ModuleList): List of MLPs for output hypernetworks.
        iou_prediction_head (MLP): MLP for IOU prediction.
        pred_obj_score_head (nn.Linear | MLP): Linear layer or MLP for object score prediction.
        dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
        dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.

    Methods:
        forward: Predict masks given image and prompt embeddings.
        predict_masks: Predict instance segmentation masks from image and prompt embeddings.
        _get_stability_scores: Compute mask stability scores based on IoU between thresholds.
        _dynamic_multimask_via_stability: Dynamically select the most stable mask output.

    Examples:
        >>> image_embeddings = torch.rand(1, 256, 64, 64)
        >>> image_pe = torch.rand(1, 256, 64, 64)
        >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
        >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
        >>> decoder = SAM2MaskDecoder(256, transformer)
        >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
        ... )
    """

    def __init__(
        self,
        transformer_dim: int,
        transformer: nn.Module,
        num_multimask_outputs: int = 3,
        activation: Type[nn.Module] = nn.GELU,
        iou_head_depth: int = 3,
        iou_head_hidden_dim: int = 256,
        use_high_res_features: bool = False,
        iou_prediction_use_sigmoid=False,
        dynamic_multimask_via_stability=False,
        dynamic_multimask_stability_delta=0.05,
        dynamic_multimask_stability_thresh=0.98,
        pred_obj_scores: bool = False,
        pred_obj_scores_mlp: bool = False,
        use_multimask_token_for_obj_ptr: bool = False,
    ) -> None:
        """
        Initialize the SAM2MaskDecoder module for predicting instance segmentation masks.

        This decoder extends the functionality of MaskDecoder, incorporating additional features such as
        high-resolution feature processing, dynamic multimask output, and object score prediction.

        Args:
            transformer_dim (int): Channel dimension of the transformer.
            transformer (nn.Module): Transformer used to predict masks.
            num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
            activation (Type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
            use_high_res_features (bool): Whether to use high-resolution features.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid for IOU prediction.
            dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
            dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
            dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.
            pred_obj_scores (bool): Whether to predict object scores.
            pred_obj_scores_mlp (bool): Whether to use MLP for object score prediction.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.

        Examples:
            >>> transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8), num_layers=6)
            >>> decoder = SAM2MaskDecoder(transformer_dim=256, transformer=transformer)
            >>> print(decoder)
        """
        super().__init__()
        self.transformer_dim = transformer_dim
        self.transformer = transformer

        self.num_multimask_outputs = num_multimask_outputs

        self.iou_token = nn.Embedding(1, transformer_dim)
        self.num_mask_tokens = num_multimask_outputs + 1
        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)

        self.pred_obj_scores = pred_obj_scores
        if self.pred_obj_scores:
            self.obj_score_token = nn.Embedding(1, transformer_dim)
        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr

        self.output_upscaling = nn.Sequential(
            nn.ConvTranspose2d(transformer_dim, transformer_dim // 4, kernel_size=2, stride=2),
            LayerNorm2d(transformer_dim // 4),
            activation(),
            nn.ConvTranspose2d(transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2),
            activation(),
        )
        self.use_high_res_features = use_high_res_features
        if use_high_res_features:
            self.conv_s0 = nn.Conv2d(transformer_dim, transformer_dim // 8, kernel_size=1, stride=1)
            self.conv_s1 = nn.Conv2d(transformer_dim, transformer_dim // 4, kernel_size=1, stride=1)

        self.output_hypernetworks_mlps = nn.ModuleList(
            [MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) for _ in range(self.num_mask_tokens)]
        )

        self.iou_prediction_head = MLP(
            transformer_dim,
            iou_head_hidden_dim,
            self.num_mask_tokens,
            iou_head_depth,
            sigmoid=iou_prediction_use_sigmoid,
        )
        if self.pred_obj_scores:
            self.pred_obj_score_head = nn.Linear(transformer_dim, 1)
            if pred_obj_scores_mlp:
                self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3)

        # When outputting a single mask, we can dynamically fall back to the best multimask output token
        # if the single mask output token gives a low stability score (see `_dynamic_multimask_via_stability`)
        self.dynamic_multimask_via_stability = dynamic_multimask_via_stability
        self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta
        self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh

    def forward(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        multimask_output: bool,
        repeat_image: bool,
        high_res_features: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings (B, C, H, W).
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes with shape (B, N, C).
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs with shape (B, C, H, W).
            multimask_output (bool): Whether to return multiple masks or a single mask.
            repeat_image (bool): Flag to repeat the image embeddings.
            high_res_features (List[torch.Tensor] | None, optional): Optional high-resolution features.

        Returns:
            masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
            iou_pred (torch.Tensor): Batched predictions of mask quality with shape (B, N).
            sam_tokens_out (torch.Tensor): Batched SAM token for mask output with shape (B, N, C).
            object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).

        Examples:
            >>> image_embeddings = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
            >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
            >>> decoder = SAM2MaskDecoder(256, transformer)
            >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
            ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
            ... )
        """
        masks, iou_pred, mask_tokens_out, object_score_logits = self.predict_masks(
            image_embeddings=image_embeddings,
            image_pe=image_pe,
            sparse_prompt_embeddings=sparse_prompt_embeddings,
            dense_prompt_embeddings=dense_prompt_embeddings,
            repeat_image=repeat_image,
            high_res_features=high_res_features,
        )

        # Select the correct mask or masks for output
        if multimask_output:
            masks = masks[:, 1:, :, :]
            iou_pred = iou_pred[:, 1:]
        elif self.dynamic_multimask_via_stability and not self.training:
            masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred)
        else:
            masks = masks[:, 0:1, :, :]
            iou_pred = iou_pred[:, 0:1]

        if multimask_output and self.use_multimask_token_for_obj_ptr:
            sam_tokens_out = mask_tokens_out[:, 1:]  # [b, 3, c] shape
        else:
            # Take the mask output token. Here we *always* use the token for single mask output.
            # At test time, even if we track after 1-click (and using multimask_output=True),
            # we still take the single mask token here. The rationale is that we always track
            # after multiple clicks during training, so the past tokens seen during training
            # are always the single mask token (and we'll let it be the object-memory token).
            sam_tokens_out = mask_tokens_out[:, 0:1]  # [b, 1, c] shape

        return masks, iou_pred, sam_tokens_out, object_score_logits

    def predict_masks(
        self,
        image_embeddings: torch.Tensor,
        image_pe: torch.Tensor,
        sparse_prompt_embeddings: torch.Tensor,
        dense_prompt_embeddings: torch.Tensor,
        repeat_image: bool,
        high_res_features: Optional[List[torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Predict instance segmentation masks from image and prompt embeddings using a transformer."""
        # Concatenate output tokens
        s = 0
        if self.pred_obj_scores:
            output_tokens = torch.cat(
                [self.obj_score_token.weight, self.iou_token.weight, self.mask_tokens.weight], dim=0
            )
            s = 1
        else:
            output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
        output_tokens = output_tokens.unsqueeze(0).expand(sparse_prompt_embeddings.size(0), -1, -1)
        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)

        # Expand per-image data in batch direction to be per-mask
        if repeat_image:
            src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0)
        else:
            assert image_embeddings.shape[0] == tokens.shape[0]
            src = image_embeddings
        src = src + dense_prompt_embeddings
        assert image_pe.size(0) == 1, "image_pe should have size 1 in batch dim (from `get_dense_pe()`)"
        pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
        b, c, h, w = src.shape

        # Run the transformer
        hs, src = self.transformer(src, pos_src, tokens)
        iou_token_out = hs[:, s, :]
        mask_tokens_out = hs[:, s + 1 : (s + 1 + self.num_mask_tokens), :]

        # Upscale mask embeddings and predict masks using the mask tokens
        src = src.transpose(1, 2).view(b, c, h, w)
        if not self.use_high_res_features:
            upscaled_embedding = self.output_upscaling(src)
        else:
            dc1, ln1, act1, dc2, act2 = self.output_upscaling
            feat_s0, feat_s1 = high_res_features
            upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
            upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)

        hyper_in_list = [
            self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
        ]
        hyper_in = torch.stack(hyper_in_list, dim=1)
        b, c, h, w = upscaled_embedding.shape
        masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)

        # Generate mask quality predictions
        iou_pred = self.iou_prediction_head(iou_token_out)
        if self.pred_obj_scores:
            assert s == 1
            object_score_logits = self.pred_obj_score_head(hs[:, 0, :])
        else:
            # Obj scores logits - default to 10.0, i.e. assuming the object is present, sigmoid(10)=1
            object_score_logits = 10.0 * iou_pred.new_ones(iou_pred.shape[0], 1)

        return masks, iou_pred, mask_tokens_out, object_score_logits

    def _get_stability_scores(self, mask_logits):
        """Compute mask stability scores based on IoU between upper and lower thresholds."""
        mask_logits = mask_logits.flatten(-2)
        stability_delta = self.dynamic_multimask_stability_delta
        area_i = torch.sum(mask_logits > stability_delta, dim=-1).float()
        area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float()
        return torch.where(area_u > 0, area_i / area_u, 1.0)

    def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores):
        """
        Dynamically select the most stable mask output based on stability scores and IoU predictions.

        This method is used when outputting a single mask. If the stability score from the current single-mask
        output (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs
        (based on output tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask
        for both clicking and tracking scenarios.

        Args:
            all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is
                batch size, N is number of masks (typically 4), and H, W are mask dimensions.
            all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).

        Returns:
            mask_logits_out (torch.Tensor): Selected mask logits, shape (B, 1, H, W).
            iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).

        Examples:
            >>> decoder = SAM2MaskDecoder(...)
            >>> all_mask_logits = torch.rand(2, 4, 256, 256)  # 2 images, 4 masks each
            >>> all_iou_scores = torch.rand(2, 4)
            >>> mask_logits, iou_scores = decoder._dynamic_multimask_via_stability(all_mask_logits, all_iou_scores)
            >>> print(mask_logits.shape, iou_scores.shape)
            torch.Size([2, 1, 256, 256]) torch.Size([2, 1])
        """
        # The best mask from multimask output tokens (1~3)
        multimask_logits = all_mask_logits[:, 1:, :, :]
        multimask_iou_scores = all_iou_scores[:, 1:]
        best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1)
        batch_inds = torch.arange(multimask_iou_scores.size(0), device=all_iou_scores.device)
        best_multimask_logits = multimask_logits[batch_inds, best_scores_inds]
        best_multimask_logits = best_multimask_logits.unsqueeze(1)
        best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds]
        best_multimask_iou_scores = best_multimask_iou_scores.unsqueeze(1)

        # The mask from singlemask output token 0 and its stability score
        singlemask_logits = all_mask_logits[:, 0:1, :, :]
        singlemask_iou_scores = all_iou_scores[:, 0:1]
        stability_scores = self._get_stability_scores(singlemask_logits)
        is_stable = stability_scores >= self.dynamic_multimask_stability_thresh

        # Dynamically fall back to best multimask output upon low stability scores
        mask_logits_out = torch.where(
            is_stable[..., None, None].expand_as(singlemask_logits),
            singlemask_logits,
            best_multimask_logits,
        )
        iou_scores_out = torch.where(
            is_stable.expand_as(singlemask_iou_scores),
            singlemask_iou_scores,
            best_multimask_iou_scores,
        )
        return mask_logits_out, iou_scores_out
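

# --------------------------------------------------------------------------------------
# Illustrative smoke test (an addition to this module, not part of the upstream API):
# a minimal sketch of how the decoders are driven, assuming a stand-in transformer that
# satisfies the `hs, src = transformer(src, pos_src, tokens)` contract noted above.
# `_DummyTwoWayTransformer` is a hypothetical placeholder used only to exercise tensor
# shapes; real use wires in an actual two-way transformer module.
if __name__ == "__main__":

    class _DummyTwoWayTransformer(nn.Module):
        """Shape-preserving stand-in: tokens pass through, image features are flattened."""

        def forward(self, src: torch.Tensor, pos_src: torch.Tensor, tokens: torch.Tensor):
            # Return (attended tokens, image features flattened to (B, H*W, C))
            return tokens, src.flatten(2).permute(0, 2, 1)

    image_embeddings = torch.rand(1, 256, 64, 64)
    image_pe = torch.rand(1, 256, 64, 64)
    sparse_prompt_embeddings = torch.rand(1, 2, 256)
    dense_prompt_embeddings = torch.rand(1, 256, 64, 64)

    decoder = MaskDecoder(transformer_dim=256, transformer=_DummyTwoWayTransformer())
    masks, iou_pred = decoder(
        image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, multimask_output=True
    )
    print(f"MaskDecoder: masks {tuple(masks.shape)}, iou_pred {tuple(iou_pred.shape)}")

    sam2_decoder = SAM2MaskDecoder(transformer_dim=256, transformer=_DummyTwoWayTransformer())
    masks, iou_pred, sam_tokens_out, obj_score_logits = sam2_decoder(
        image_embeddings,
        image_pe,
        sparse_prompt_embeddings,
        dense_prompt_embeddings,
        multimask_output=True,
        repeat_image=True,
    )
    print(
        f"SAM2MaskDecoder: masks {tuple(masks.shape)}, iou_pred {tuple(iou_pred.shape)}, "
        f"sam_tokens {tuple(sam_tokens_out.shape)}, obj_scores {tuple(obj_score_logits.shape)}"
    )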