
from typing import List

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.init import trunc_normal_

from ultralytics.nn.modules import MLP
from ultralytics.utils import LOGGER

from .blocks import SAM2TwoWayTransformer
from .decoders import MaskDecoder, SAM2MaskDecoder
from .encoders import ImageEncoderViT, PromptEncoder
from .utils import get_1d_sine_pe, select_closest_cond_frames

# Large negative score assigned to missing objects so they never win against real predictions
NO_OBJ_SCORE = -1024.0


class SAMModel(nn.Module):
    """
    Segment Anything Model (SAM) for object segmentation tasks.

    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images
    and input prompts.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
        prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
        mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
        pixel_mean (torch.Tensor): Mean values for normalizing pixels in the input image.
        pixel_std (torch.Tensor): Standard deviation values for normalizing pixels in the input image.

    Methods:
        set_imgsz: Set image size to make model compatible with different image sizes.

    Examples:
        >>> image_encoder = ImageEncoderViT(...)
        >>> prompt_encoder = PromptEncoder(...)
        >>> mask_decoder = MaskDecoder(...)
        >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
        >>> # Further usage depends on SAMPredictor class
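        >>> # Illustrative sketch, not part of the original example: the input size can be
        >>> # changed after construction (the prompt-encoder grid is recomputed as size // 16)
        >>> sam_model.set_imgsz((1024, 1024))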

    Notes:
        All forward() operations are implemented in the SAMPredictor class.
    """

    mask_threshold: float = 0.0

    def __init__(
        self,
        image_encoder: ImageEncoderViT,
        prompt_encoder: PromptEncoder,
        mask_decoder: MaskDecoder,
        pixel_mean: List[float] = (123.675, 116.28, 103.53),
        pixel_std: List[float] = (58.395, 57.12, 57.375),
    ) -> None:
        """
        Initialize the SAMModel class to predict object masks from an image and input prompts.

        Args:
            image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
            prompt_encoder (PromptEncoder): Encodes various types of input prompts.
            mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
            pixel_mean (List[float]): Mean values for normalizing pixels in the input image.
            pixel_std (List[float]): Standard deviation values for normalizing pixels in the input image.

        Examples:
            >>> image_encoder = ImageEncoderViT(...)
            >>> prompt_encoder = PromptEncoder(...)
            >>> mask_decoder = MaskDecoder(...)
            >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
            >>> # Further usage depends on SAMPredictor class
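            >>> # Illustrative sketch, not part of the original example: the registered buffers
            >>> # support mean/std normalization of a 0-255 RGB image tensor
            >>> x = torch.rand(1, 3, 1024, 1024) * 255
            >>> x_norm = (x - sam_model.pixel_mean) / sam_model.pixel_std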

        Notes:
            All forward() operations moved to SAMPredictor.
        """
        super().__init__()
        self.image_encoder = image_encoder
        self.prompt_encoder = prompt_encoder
        self.mask_decoder = mask_decoder
        self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

    def set_imgsz(self, imgsz):
        """Set image size to make model compatible with different image sizes."""
        if hasattr(self.image_encoder, "set_imgsz"):
            self.image_encoder.set_imgsz(imgsz)
        self.prompt_encoder.input_image_size = imgsz
        self.prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # 16 is the fixed ViT patch stride
        self.image_encoder.img_size = imgsz[0]


class SAM2Model(torch.nn.Module):
    """
    SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.

    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms
    for temporal consistency and efficient tracking of objects across frames.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Visual encoder for extracting image features.
        memory_attention (nn.Module): Module for attending to memory features.
        memory_encoder (nn.Module): Encoder for generating memory representations.
        num_maskmem (int): Number of accessible memory frames.
        image_size (int): Size of input images.
        backbone_stride (int): Stride of the backbone network output.
        sam_prompt_embed_dim (int): Dimension of SAM prompt embeddings.
        sam_image_embedding_size (int): Size of SAM image embeddings.
        sam_prompt_encoder (PromptEncoder): Encoder for processing input prompts.
        sam_mask_decoder (SAM2MaskDecoder): Decoder for generating object masks.
        obj_ptr_proj (nn.Module): Projection layer for object pointers.
        obj_ptr_tpos_proj (nn.Module): Projection for temporal positional encoding in object pointers.
        hidden_dim (int): Hidden dimension of the model.
        mem_dim (int): Memory dimension for encoding features.
        use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
        use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
        max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder cross-attention.
        add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers.
        proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
            encoding in object pointers.
        use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in temporal positional encoding.
        only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past during
            evaluation.
        pred_obj_scores (bool): Whether to predict if there is an object in the frame.
        pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
        fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
        soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
        use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
        no_obj_embed_spatial (torch.Tensor | None): No-object embedding for spatial frames.
        max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
        directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
            first frame.
        multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial
            conditioning frames.
        multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
        multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
        multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
        use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
        iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
        memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
        non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
            memory encoder during evaluation.
        sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
        sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
        binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
            with clicks during evaluation.
        use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
            prompt encoder and mask decoder on frames with mask input.

    Methods:
        forward_image: Process image batch through encoder to extract multi-level features.
        track_step: Perform a single tracking step, updating object masks and memory features.
        set_binarize: Set binarize for VideoPredictor.
        set_imgsz: Set image size to make model compatible with different image sizes.

    Examples:
        >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
        >>> image_batch = torch.rand(1, 3, 512, 512)
        >>> features = model.forward_image(image_batch)
        >>> track_results = model.track_step(0, True, features, None, None, None, {})
    """

    mask_threshold: float = 0.0

    def __init__(
        self,
        image_encoder,
        memory_attention,
        memory_encoder,
        num_maskmem=7,
        image_size=512,
        backbone_stride=16,
        sigmoid_scale_for_mem_enc=1.0,
        sigmoid_bias_for_mem_enc=0.0,
        binarize_mask_from_pts_for_mem_enc=False,
        use_mask_input_as_output_without_sam=False,
        max_cond_frames_in_attn=-1,
        directly_add_no_mem_embed=False,
        use_high_res_features_in_sam=False,
        multimask_output_in_sam=False,
        multimask_min_pt_num=1,
        multimask_max_pt_num=1,
        multimask_output_for_tracking=False,
        use_multimask_token_for_obj_ptr: bool = False,
        iou_prediction_use_sigmoid=False,
        memory_temporal_stride_for_eval=1,
        non_overlap_masks_for_mem_enc=False,
        use_obj_ptrs_in_encoder=False,
        max_obj_ptrs_in_encoder=16,
        add_tpos_enc_to_obj_ptrs=True,
        proj_tpos_enc_in_obj_ptrs=False,
        use_signed_tpos_enc_to_obj_ptrs=False,
        only_obj_ptrs_in_the_past_for_eval=False,
        pred_obj_scores: bool = False,
        pred_obj_scores_mlp: bool = False,
        fixed_no_obj_ptr: bool = False,
        soft_no_obj_ptr: bool = False,
        use_mlp_for_obj_ptr_proj: bool = False,
        no_obj_embed_spatial: bool = False,
        sam_mask_decoder_extra_args=None,
        compile_image_encoder: bool = False,
    ):
        """
        Initialize the SAM2Model for video object segmentation with memory-based tracking.

        Args:
            image_encoder (nn.Module): Visual encoder for extracting image features.
            memory_attention (nn.Module): Module for attending to memory features.
            memory_encoder (nn.Module): Encoder for generating memory representations.
            num_maskmem (int): Number of accessible memory frames.
            image_size (int): Size of input images.
            backbone_stride (int): Stride of the image backbone output.
            sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
            sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames
                with clicks during evaluation.
            use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
                prompt encoder and mask decoder on frames with mask input.
            max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
            directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the
                first frame.
            use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
            multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial
                conditioning frames.
            multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
            multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
            multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
            memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in
                memory encoder during evaluation.
            use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
            max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
                cross-attention.
            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in
                the encoder.
            proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                encoding in object pointers.
            use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in the temporal positional encoding
                in the object pointers.
            only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
                during evaluation.
            pred_obj_scores (bool): Whether to predict if there is an object in the frame.
            pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
            fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
            soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
            use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
            no_obj_embed_spatial (bool): Whether add no obj embedding to spatial frames.
            sam_mask_decoder_extra_args (dict | None): Extra arguments for constructing the SAM mask decoder.
            compile_image_encoder (bool): Whether to compile the image encoder for faster inference.

        Examples:
            >>> image_encoder = ImageEncoderViT(...)
            >>> memory_attention = SAM2TwoWayTransformer(...)
            >>> memory_encoder = nn.Sequential(...)
            >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
            >>> image_batch = torch.rand(1, 3, 512, 512)
            >>> features = model.forward_image(image_batch)
            >>> track_results = model.track_step(0, True, features, None, None, None, {})
        """
        super().__init__()

        # Part 1: the image backbone
        self.image_encoder = image_encoder
        # Use level 0, 1, 2 for the high-res setting, or just level 2 for the default setting
        self.use_high_res_features_in_sam = use_high_res_features_in_sam
        self.num_feature_levels = 3 if use_high_res_features_in_sam else 1
        self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder
        self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder
        if use_obj_ptrs_in_encoder:
            # A conv layer to downsample the mask prompt to stride 4 (the same stride as low-res SAM mask logits)
            # and to change its scale from 0~1 to SAM logit scale, so that it can be fed into the SAM mask
            # decoder to generate a pointer
            self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4)
        self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs
        if proj_tpos_enc_in_obj_ptrs:
            assert add_tpos_enc_to_obj_ptrs  # these options need to be used together
        self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs
        self.use_signed_tpos_enc_to_obj_ptrs = use_signed_tpos_enc_to_obj_ptrs
        self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval

        # Part 2: memory attention to condition the current frame's visual features with memories from past frames
        self.memory_attention = memory_attention
        self.hidden_dim = memory_attention.d_model

        # Part 3: memory encoder for the previous frame's outputs
        self.memory_encoder = memory_encoder
        self.mem_dim = self.hidden_dim
        if hasattr(self.memory_encoder, "out_proj") and hasattr(self.memory_encoder.out_proj, "weight"):
            # if there is compression of memories along the channel dim
            self.mem_dim = self.memory_encoder.out_proj.weight.shape[0]
        self.num_maskmem = num_maskmem  # number of memories accessible
        # Temporal encoding of the memories
        self.maskmem_tpos_enc = torch.nn.Parameter(torch.zeros(num_maskmem, 1, 1, self.mem_dim))
        trunc_normal_(self.maskmem_tpos_enc, std=0.02)
        # A single token to indicate no memory embedding from previous frames
        self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
        self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
        trunc_normal_(self.no_mem_embed, std=0.02)
        trunc_normal_(self.no_mem_pos_enc, std=0.02)
        self.directly_add_no_mem_embed = directly_add_no_mem_embed
        # Apply sigmoid to the output raw mask logits (to turn them from range (-inf, +inf) to range (0, 1))
        # before feeding them into the memory encoder
        self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc
        self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc
        self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc
        self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc
        self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval
        # On frames with mask input, whether to directly output the input mask without using a SAM prompt
        # encoder + mask decoder
        self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam
        self.multimask_output_in_sam = multimask_output_in_sam
        self.multimask_min_pt_num = multimask_min_pt_num
        self.multimask_max_pt_num = multimask_max_pt_num
        self.multimask_output_for_tracking = multimask_output_for_tracking
        self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr
        self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid

        # Part 4: SAM-style prompt encoder (for both point and mask input) and SAM-style mask decoder
        # for the final mask output
        self.image_size = image_size
        self.backbone_stride = backbone_stride
        self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args
        self.pred_obj_scores = pred_obj_scores
        self.pred_obj_scores_mlp = pred_obj_scores_mlp
        self.fixed_no_obj_ptr = fixed_no_obj_ptr
        self.soft_no_obj_ptr = soft_no_obj_ptr
        if self.fixed_no_obj_ptr:
            assert self.pred_obj_scores
            assert self.use_obj_ptrs_in_encoder
        if self.pred_obj_scores and self.use_obj_ptrs_in_encoder:
            self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim))
            trunc_normal_(self.no_obj_ptr, std=0.02)
        self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj
        self.no_obj_embed_spatial = None
        if no_obj_embed_spatial:
            self.no_obj_embed_spatial = torch.nn.Parameter(torch.zeros(1, self.mem_dim))
            trunc_normal_(self.no_obj_embed_spatial, std=0.02)

        self._build_sam_heads()
        self.max_cond_frames_in_attn = max_cond_frames_in_attn

        # Model compilation
        if compile_image_encoder:
            # Compile the forward function (not the full module) to allow loading checkpoints
            LOGGER.info("Image encoder compilation is enabled. First forward pass will be slow.")
            self.image_encoder.forward = torch.compile(
                self.image_encoder.forward,
                mode="max-autotune",
                fullgraph=True,
                dynamic=False,
            )

    @property
    def device(self):
        """Return the device on which the model's parameters are stored."""
        return next(self.parameters()).device

    def forward(self, *args, **kwargs):
        """Process image and prompt inputs to generate object masks and scores in video sequences."""
        raise NotImplementedError(
            "Please use the corresponding methods in SAM2VideoPredictor for inference."
            "See notebooks/video_predictor_example.ipynb for an example."
        )

    def _build_sam_heads(self):
        """Build SAM-style prompt encoder and mask decoder for image segmentation tasks."""
        self.sam_prompt_embed_dim = self.hidden_dim
        self.sam_image_embedding_size = self.image_size // self.backbone_stride

        # Build PromptEncoder and MaskDecoder from SAM (hyperparameters like `mask_in_chans=16` follow SAM)
        self.sam_prompt_encoder = PromptEncoder(
            embed_dim=self.sam_prompt_embed_dim,
            image_embedding_size=(self.sam_image_embedding_size, self.sam_image_embedding_size),
            input_image_size=(self.image_size, self.image_size),
            mask_in_chans=16,
        )
        self.sam_mask_decoder = SAM2MaskDecoder(
            num_multimask_outputs=3,
            transformer=SAM2TwoWayTransformer(
                depth=2,
                embedding_dim=self.sam_prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=self.sam_prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
            use_high_res_features=self.use_high_res_features_in_sam,
            iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid,
            pred_obj_scores=self.pred_obj_scores,
            pred_obj_scores_mlp=self.pred_obj_scores_mlp,
            use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr,
            **(self.sam_mask_decoder_extra_args or {}),
        )
        if self.use_obj_ptrs_in_encoder:
            # A linear projection on SAM output tokens to turn them into object pointers
            self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim)
            if self.use_mlp_for_obj_ptr_proj:
                self.obj_ptr_proj = MLP(self.hidden_dim, self.hidden_dim, self.hidden_dim, 3)
        else:
            self.obj_ptr_proj = torch.nn.Identity()
        if self.proj_tpos_enc_in_obj_ptrs:
            # A linear projection on temporal positional encoding in object pointers to avoid potential
            # interference with spatial positional encoding
            self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim)
        else:
            self.obj_ptr_tpos_proj = torch.nn.Identity()

    def _forward_sam_heads(
        self,
        backbone_features,
        point_inputs=None,
        mask_inputs=None,
        high_res_features=None,
        multimask_output=False,
    ):
        """
  
        Forward pass through SAM prompt encoders and mask heads.

        This method processes image features and optional point/mask inputs to generate object masks and scores.

        Args:
            backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
            point_inputs (Dict[str, torch.Tensor] | None): Dictionary containing point prompts.
                'point_coords': Tensor of shape (B, P, 2) with float32 dtype, containing absolute
                    pixel-unit coordinates in (x, y) format for P input points.
                'point_labels': Tensor of shape (B, P) with int32 dtype, where 1 means positive clicks,
                    0 means negative clicks, and -1 means padding.
            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the
                same spatial size as the image.
            high_res_features (List[torch.Tensor] | None): List of two feature maps with shapes
                (B, C, 4*H, 4*W) and (B, C, 2*H, 2*W) respectively, used as high-resolution feature maps
                for SAM decoder.
            multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False,
                output only 1 mask and its IoU estimate.

        Returns:
            low_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
            high_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
            ious (torch.Tensor): Tensor of shape (B, M) with estimated IoU for each output mask.
            low_res_masks (torch.Tensor): Tensor of shape (B, 1, H*4, W*4) with the best low-resolution mask.
            high_res_masks (torch.Tensor): Tensor of shape (B, 1, H*16, W*16) with the best high-resolution mask.
            obj_ptr (torch.Tensor): Tensor of shape (B, C) with object pointer vector for the output mask.
            object_score_logits (torch.Tensor): Tensor of shape (B) with object score logits.

        Examples:
            >>> backbone_features = torch.rand(1, 256, 32, 32)
            >>> point_inputs = {"point_coords": torch.rand(1, 2, 2), "point_labels": torch.tensor([[1, 0]])}
            >>> mask_inputs = torch.rand(1, 1, 512, 512)
            >>> results = model._forward_sam_heads(backbone_features, point_inputs, mask_inputs)
            >>> (
            ...     low_res_multimasks,
            ...     high_res_multimasks,
            ...     ious,
            ...     low_res_masks,
            ...     high_res_masks,
            ...     obj_ptr,
            ...     object_score_logits,
            ... ) = results
        """
        B = backbone_features.size(0)
        device = backbone_features.device
        assert backbone_features.size(1) == self.sam_prompt_embed_dim
        assert backbone_features.size(2) == self.sam_image_embedding_size
        assert backbone_features.size(3) == self.sam_image_embedding_size

        # a) Handle point prompts
        if point_inputs is not None:
            sam_point_coords = point_inputs["point_coords"]
            sam_point_labels = point_inputs["point_labels"]
            assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B
        else:
            # If no points are provided, pad with an empty point (with label -1)
            sam_point_coords = torch.zeros(B, 1, 2, device=device)
            sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)

        # b) Handle mask prompts
        if mask_inputs is not None:
            # If mask_inputs is provided, downsize it into a low-res mask input if needed
            # and feed it as a dense mask prompt into the SAM mask encoder
            assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1)
            if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size:
                sam_mask_prompt = F.interpolate(
                    mask_inputs.float(),
                    size=self.sam_prompt_encoder.mask_input_size,
                    align_corners=False,
                    mode="bilinear",
                    antialias=True,  # use antialias for downsampling
                )
            else:
                sam_mask_prompt = mask_inputs
        else:
            # Otherwise, simply feed None (and SAM's prompt encoder will add a learned `no_mask_embed`
            # to indicate no mask input in this case)
            sam_mask_prompt = None

        sparse_embeddings, dense_embeddings = self.sam_prompt_encoder(
            points=(sam_point_coords, sam_point_labels), boxes=None, masks=sam_mask_prompt
        )
        low_res_multimasks, ious, sam_output_tokens, object_score_logits = self.sam_mask_decoder(
            image_embeddings=backbone_features,
            image_pe=self.sam_prompt_encoder.get_dense_pe(),
            sparse_prompt_embeddings=sparse_embeddings,
            dense_prompt_embeddings=dense_embeddings,
            multimask_output=multimask_output,
            repeat_image=False,  # the image is already batched
            high_res_features=high_res_features,
        )
        if self.pred_obj_scores:
            is_obj_appearing = object_score_logits > 0
            # The mask used for spatial memories is a *hard* choice between obj and no obj,
            # consistent with the actual mask prediction
            low_res_multimasks = torch.where(is_obj_appearing[:, None, None], low_res_multimasks, NO_OBJ_SCORE)

        # Convert masks from possibly bfloat16 (or float16) to float32
        # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
        low_res_multimasks = low_res_multimasks.float()
        high_res_multimasks = F.interpolate(
            low_res_multimasks,
            size=(self.image_size, self.image_size),
            mode="bilinear",
            align_corners=False,
        )

        sam_output_token = sam_output_tokens[:, 0]
        if multimask_output:
            # Take the best mask prediction (with the highest IoU estimation)
            best_iou_inds = torch.argmax(ious, dim=-1)
            batch_inds = torch.arange(B, device=device)
            low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
            high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
            if sam_output_tokens.size(1) > 1:
                sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
        else:
            low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks

        # Extract object pointer from the SAM output token (with occlusion handling)
        obj_ptr = self.obj_ptr_proj(sam_output_token)
        if self.pred_obj_scores:
            # Allow a *soft* no-obj pointer, unlike for masks
            if self.soft_no_obj_ptr:
                lambda_is_obj_appearing = object_score_logits.sigmoid()
            else:
                lambda_is_obj_appearing = is_obj_appearing.float()

            if self.fixed_no_obj_ptr:
                obj_ptr = lambda_is_obj_appearing * obj_ptr
            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

        return (
            low_res_multimasks,
            high_res_multimasks,
            ious,
            low_res_masks,
            high_res_masks,
            obj_ptr,
            object_score_logits,
        )

    def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
        """Process mask inputs directly as output, bypassing SAM encoder/decoder."""
        # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid)
        out_scale, out_bias = 20.0, -10.0  # sigmoid(-10.0)=4.5398e-05
        mask_inputs_float = mask_inputs.float()
        high_res_masks = mask_inputs_float * out_scale + out_bias
        low_res_masks = F.interpolate(
            high_res_masks,
            size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4),
            align_corners=False,
            mode="bilinear",
            antialias=True,  # use antialias for downsampling
        )
        # A dummy IoU prediction of all 1's under mask input
        ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float()
        if not self.use_obj_ptrs_in_encoder:
            # All zeros as a dummy object pointer (of shape [B, C])
            obj_ptr = torch.zeros(mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device)
        else:
            # Produce an object pointer using the SAM decoder from the mask input
            _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads(
                backbone_features=backbone_features,
                mask_inputs=self.mask_downsample(mask_inputs_float),
                high_res_features=high_res_features,
            )
        # In this method, mask_input is treated as output, e.g. used directly to create spatial memory;
        # below, the same design axiom is followed to use mask_input to decide whether the object appears,
        # instead of relying on the object scores from the SAM decoder
        is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1)
        is_obj_appearing = is_obj_appearing[..., None]
        lambda_is_obj_appearing = is_obj_appearing.float()
        object_score_logits = out_scale * lambda_is_obj_appearing + out_bias
        if self.pred_obj_scores:
            if self.fixed_no_obj_ptr:
                obj_ptr = lambda_is_obj_appearing * obj_ptr
            obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr

        return (
            low_res_masks,
            high_res_masks,
            ious,
            low_res_masks,
            high_res_masks,
            obj_ptr,
            object_score_logits,
        )

    def forward_image(self, img_batch: torch.Tensor):
        """Process image batch through encoder to extract multi-level features for SAM model."""
        backbone_out = self.image_encoder(img_batch)
        if self.use_high_res_features_in_sam:
            # Precompute projected level 0 and level 1 features in the SAM decoder
            # to avoid running them again on every SAM click
            backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0(backbone_out["backbone_fpn"][0])
            backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1(backbone_out["backbone_fpn"][1])
        return backbone_out

    def _prepare_backbone_features(self, backbone_out):
        """Prepare and flatten visual features from the image backbone output for further processing."""
        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]

        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
        # Flatten NxCxHxW to HWxNxC
        vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
        vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]

        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
|d   \  }}|d   j                  }| j                  dk(  r(|d   j	                  ddd      j                  |	|
||      S d}|rdnd}|sg g }}t        |d         dkD  sJ |d   }t        ||| j                        \  }}|j                         D cg c]  }d|f }}| j                  rdn| j                  }t        d| j                        D ]  }| j                  |z
  }|dk(  r|r||z   n||z
  }n1|s|dz
  |z  |z  }||dz
  |z  z
  }n|dz    |z   |z  }||dz
  |z  z   }|d   j                  |d      }||j                  |d      }|j                  ||f        |D ]  \  }}|	|d   j                  |d	
      }|j                  |j!                  d      j	                  ddd             |d   d   j                  |      }|j!                  d      j	                  ddd      }|| j"                  | j                  |z
  dz
     z   }|j                  |        | j$                  r4t'        || j(                        }| j                  s=| j*                  r1|j-                         D ci c]  \  }}|r||k\  r	n||k  r|| } }}n|} | j-                         D cg c],  \  }}| j.                  r||z
  |z  nt1        ||z
        |d   f. }!}}t        d|      D ]Z  }"|r||"z   n||"z
  }|dk  s|||k\  r n@|d   j                  ||j                  |d            }|E|!j                  |"|d   f       \ |!rt3        |! \  }#}$t5        j6                  |$d      }%| j8                  r|dz
  }&| j:                  r|
n| j<                  }'t5        j>                  |#|      }(tA        |(|&z  |'      }(| jC                  |(      }(|(jE                  d      jG                  d|	| j<                        }(n&|%jI                  t        |#      |	| j<                        }(| j<                  |
k  ro|%jK                  d|	|
| j<                  z  | j<                        }%|%j	                  dddd      j!                  dd      }%|(jM                  |
| j<                  z  d      }(|j                  |%       |j                  |(       |%jN                  d   }nd}n| jP                  r9|d   | jR                  z   })|)j	                  ddd      j                  |	|
||      })|)S | jR                  jG                  d|	| j<                        g}| jT                  jG                  d|	| j<                        g}t5        jV                  |d      }*t5        jV                  |d      }+| jY                  |||*|+|      })|)j	                  ddd      j                  |	|
||      })|)S c c}w c c}}w c c}}w )zePrepare memory-conditioned features by fusing current frame's visual features with previous memories.r   r   r   r   cond_frame_outputsnon_cond_frame_outputsNmaskmem_featuresT)r|   non_blockingmaskmem_pos_encr   r   r   rD   )currcurr_posmemory
memory_posnum_obj_ptr_tokens)-r   rZ   r|   r^   r   r!   r   r   ru   valuestrainingri   rangegetappendtor   ra   rP   minrQ   rW   itemsrV   abszipr   stackrT   rU   r\   tensorr   r   r   expand	new_zerosreshaperepeat_interleaver]   rd   rb   rc   catrX   ),r"   	frame_idxis_init_cond_framecurrent_vision_featscurrent_vision_pos_embedsr   output_dict
num_framestrack_in_reverser   CHWr|   r   tpos_sign_multo_cat_memoryto_cat_memory_pos_embedcond_outputsselected_cond_outputsunselected_cond_outputsoutt_pos_and_prevsrt_post_relprev_frame_idxprevfeatsmaskmem_encrQ   tptr_cond_outputspos_and_ptrst_diffpos_list	ptrs_listobj_ptrs
t_diff_maxtpos_dimobj_pospix_feat_with_memr   memory_pos_embeds,                                               r$   $_prepare_memory_conditioned_featuresz.SAM2Model._prepare_memory_conditioned_features  s    !$))!,OO"~1%b)00 q '+33Aq!<AA!Q1MM.A!572M {#789A===&';<L=W<)E)E>:!#: 4I3O3O3QRC3xROR
 ]](L(LAq$"2"23 5((50A::JY%6PY\aPaN) (11}&:a%?N%3uqyAo%EN *3Q'71'<%=%AN%3uqyAo%EN!":;??PTU; 255ndKC&&s|4-50  / <t< /0336PT3U$$U]]1%5%=%=aA%FG"#45b9<<F<K)11!4<<Q1E)D,A,A$BRBRUZBZ]^B^,__'..{;< ++*-j$:V:V*W' }})P)P '<&A&A&C("As.>ANAN 3($ ( (=$ #3"8"8":  3  $CC ']m;!$Y]!3I    $A'>? FF.>	F*IPVDVA1u!7AO%&>?CCAG^GbGbcdfjGklC$++VS^,DEF  *-|*<'Hi${{9!<H 44%<q%@
(,(F(F1DLL"',,x"G"0:1E8"T"&"8"8"A")"3"3A"6"="=b!T\\"R"*"4"4S]At||"T||a'#+#3#3B1;Ldll#[#+#3#3Aq!Q#?#G#G1#M")";";A<MST";"U!((2+227;)1):&)*& --$8$<t?P?P$P!$5$=$=aA$F$K$KAqRSUV$W!(( "..55aDLLIJM'+':':'A'A!Q'U&V# =a0 99%<!D 11%.'1 2 
 .55aA>CCAq!QO  A Sd( s   W?!X1X
c                    |d   j                  d      }| j                  }|d   \  }}	|d   j                  ddd      j                  ||||	      }
| j                  r| j
                  s| j                  |      }| j                  xr |}|r | j
                  s|dkD  j                         }nt        j                  |      }| j                  dk7  r|| j                  z  }| j                  dk7  r|| j                  z   }| j                  |
|d      }|d	   }|d
   }| j                  E|dkD  j                         }|d|d   z
   | j                  d   j                  |j                    z  z  }||fS )zXEncode frame features and masks into a new memory representation for video segmentation.r   r   r   r         ?r   T)skip_mask_sigmoidvision_featuresr   ).NN)r   rZ   r   r!   rh   r   "_apply_non_overlapping_constraintsrg   r5   r   r   re   rf   r[   rA   r  r]   )r"   r  r   pred_masks_high_resr   is_mask_from_ptsr   r  r  r  pix_featbinarizemask_for_memmaskmem_outr   r   r   s                    r$   _encode_new_memoryzSAM2Model._encode_new_memory/  s    !$))!,OO"~1'+33Aq!<AA!Q1M--dmm #'"I"IJ]"^::O?ODMM/!3::<L !==)<=L))S0'$*H*HHL((C/'$*G*GGL))(LTX)Y&'89%&67 $$0 3a 7>>@%5o%F!F K$JcJcKf&,,K. !. .  00r%   c           
      j   ||d}t        |      dkD  rft        |dd |dd       D cg c]H  \  }} |j                  ddd      j                  |j	                  d      |j	                  d      g| J }}}nd}|W| j
                  rK|d   j                  ddd      } |j                  d| j                  g|d    }| j                  |||      }nT| j                  |||dd |dd |dd ||	|
      }|||J |}| j                  ||      }| j                  |||||      }||||fS c c}}w )	hPerform a single tracking step, updating object masks and memory features based on current frame inputs.)r   r   r   Nr   r   r   )r  r  r  r  r   r  r  r  )r   r   r   r   r   )r   r  r   r!   r   rj   rZ   r   r0  _use_multimaskr   )r"   r  r  r  r  r   r   r   r  r  r  prev_sam_mask_logitscurrent_outr0   sr   r8  sam_outputsr   s                      r$   _track_stepzSAM2Model._track_step[  s    (4KP#$q(   4Sb 9:cr?K!Aq (		!Q"''q	166!9AqA! !
 !%"t'P'P ,B/771a@H$x}}RJ:b>JH228=NP[\K @@##5%9"#%>*CBC*H%bc?'%!1 A 	H $/#/K4GGG2#223E|T11"*)'"3!1 2 K K):HDDO!s   AD/c                     |r7| j                   dkD  r(|}| j                  |||||du      \  }	}
|	|d<   |
|d<   yd|d<   d|d<   y)z^Run memory encoder on predicted mask to encode it into a new memory feature for future frames.r   N)r  r   r6  r   r7  r   r   )r^   r<  )r"   r  r   r   run_mem_encoderr   r   rA  high_res_masks_for_mem_encr   r   s              r$   _encode_memory_in_outputz"SAM2Model._encode_memory_in_output  sx     t//!3)7&040G0G%9%$>$7".d": 1H 1-o /?K*+-<K)*.2K*+-1K)*r%   c                     | j                  |||||||||	|
|      \  }}}}|\  }}}}}}}||d<   ||d<   ||d<   | j                  s||d<   | j                  |||||||       |S )r>  
pred_masksr6  r   r   )rD  r   rH  )r"   r  r  r  r  r   r   r   r  r  r  rF  r@  rA  rC  r   r   r   r   r   s                       r$   
track_stepzSAM2Model.track_step  s    , *.)9)9 % *
&[!Q P[L1a9L$1L!-;)*!(I}} 2EK-. 	%% 	
 r%   c                     |dn|d   j                  d      }| j                  xr6 |xs | j                  xr$ | j                  |cxk  xr | j                  k  S c S )zaDetermine whether to use multiple mask outputs in the SAM head based on configuration and inputs.r   r   r   )r   rk   rn   rl   rm   )r"   r  r   num_ptss       r$   r?  zSAM2Model._use_multimask  sk    #+!n1M1R1RST1U(( T#It'I'IT**gR9R9RR	
 S	
r%   c                     | j                  d      }|dk(  r| S | j                  }t        j                  | dd      }t        j                  ||      dddddf   }||k(  }t        j
                  || t        j                  | d            } | S )	z\Apply non-overlapping constraints to masks, keeping the highest scoring object per location.r   r   T)r   keepdimr   Nr   )max)r   r|   r   r   r   r   clamp)rJ  
batch_sizer|   max_obj_indsbatch_obj_indskeeps         r$   r5  z,SAM2Model._apply_non_overlapping_constraints  s      __Q'
?""||JAtDj@D$PTATU~- [[z5;;zu3UV
r%   c                     || _         y)z Set binarize for VideoPredictor.N)rg   )r"   r9  s     r$   set_binarizezSAM2Model.set_binarize  s
    2:/r%   c                     |d   | _         || j                  _        |D cg c]  }|dz  	 c}| j                  _        yc c}w )r'   r   r)   N)rp   r   r+   r,   r.   s      r$   r(   zSAM2Model.set_imgsz	  s;    (380IN7OAR7O47Os   ?)    i   r)   r2  r   FFr   FFFr   r   FFFr   FFr)   TFFFFFFFFFNF)NNNF)F)FTN)r1   r2   r3   r4   r   r5   r6   boolr   propertyr|   rx   rt   r   r   r   r    r   r   r0  r<  rD  rH  rK  r?  staticmethodr5  rW  r(   r7   r8   s   @r$   r:   r:   g   s   DL  NE "%!$+0-2 ""'%* %&+05#(()&+ % "!%"'(-+0 %$)!& %).%*$(&+I}& *.'}: ;}< "=}> ?}@ A}B #'C}D #E}H  $I}~ . .
-9d W
r-
^u|| I0 b!H*1X9Ev2J  !'8t
  ";Pr%   r:   )typingr   r   torch.nn.functionalr   