
    '}hT                        d dl Z d dlmZ d dlZd dlmZ ddlmZ  eej                  d      s] ed      ej                  j                  d<    ed      ej                  j                  d<    ed	      ej                  j                  d	<   d d
lmZmZmZ d Zd Z G d dej                  j                        Z G d d      Z	 ddZy)    N)Optional)_pytree   )_dummy_type_CudaStreamBase
_CUDAGraph_graph_pool_handle_cuda_isCurrentStreamCapturing)r
   r   r	   c                      t               S )zReturn True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r
        P/var/www/html/test/engine/venv/lib/python3.12/site-packages/torch/cuda/graphs.pyis_current_stream_capturingr      s    
 *++r   c                      t               S )zReturn an opaque token representing the id of a graph memory pool.

    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )r	   r   r   r   graph_pool_handler       s     r   c                   j     e Zd ZdZ fdZd
 fd	Z fdZ fdZ fdZ fdZ	 fdZ
 fd	Z xZS )	CUDAGraphzrWrapper around a CUDA graph.

    .. warning::
        This API is in beta and may change in future releases.
    c                 "    t         |   |       S N)super__new__)cls	__class__s    r   r   zCUDAGraph.__new__3   s    ws##r   c                 (    t         |   ||       y)a  Begin capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )poolcapture_error_modeN)r   capture_begin)selfr   r   r   s      r   r   zCUDAGraph.capture_begin6   s    " 	4<NOr   c                 "    t         |           y)aG  End CUDA graph capture on the current stream.

        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r   capture_endr   r   s    r   r    zCUDAGraph.capture_endI   s     	r   c                 "    t         |           y)z,Replay the CUDA work captured by this graph.N)r   replayr!   s    r   r#   zCUDAGraph.replayT   s    r   c                 "    t         |           y)z1Delete the graph currently held by this instance.N)r   resetr!   s    r   r%   zCUDAGraph.resetX   s    r   c                      t         |          S )zReturn an opaque token representing the id of this graph's memory pool.

        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r   r   r!   s    r   r   zCUDAGraph.pool\   s     w|~r   c                      t         |          S )z/Enable debugging mode for CUDAGraph.debug_dump.)r   enable_debug_moder!   s    r   r(   zCUDAGraph.enable_debug_moded   s    w(**r   c                 "    t         |   |      S )z
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r   
debug_dump)r   
debug_pathr   s     r   r*   zCUDAGraph.debug_dumph   s     w!*--r   )Nglobal)__name__
__module____qualname____doc__r   r   r    r#   r%   r   r(   r*   __classcell__)r   s   @r   r   r   ,   s6    $P&	+. .r   r   c                   F    e Zd ZU dZdZed   ed<   	 	 	 d	defdZd Z	d Z
y)
grapha  Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    Nztorch.cuda.Streamdefault_capture_streamr   c                 t   | j                   j                  -t        j                  j	                         | j                   _        |dn|f| _        ||n| j                   j                  | _        | j                  J t        j                  j                  | j                        | _        || _	        || _
        y )Nr   )r   r4   torchcudaStreamr   capture_streamstream
stream_ctx
cuda_graphr   )r   r<   r   r:   r   s        r   __init__zgraph.__init__   s     >>00849JJ4E4E4GDNN1,BTG	(Fdnn.S.S 	 ""...**++D,?,?@$"4r   c                 8   t         j                  j                          t        j                          t         j                  j                          | j                  j                           | j                  j                  | j                  d| j                  i y )Nr   )r6   r7   synchronizegccollectempty_cacher;   	__enter__r<   r   r   r   )r   s    r   rC   zgraph.__enter__   sf    

 




  	!!#%%%YY	
+/+B+B	
r   c                 r    | j                   j                          | j                  j                  |||       y r   )r<   r    r;   __exit__)r   exc_type	exc_value	tracebacks       r   rE   zgraph.__exit__   s(    ##%  9i@r   )NNr,   )r-   r.   r/   r0   r4   r   __annotations__strr=   rC   rE   r   r   r   r3   r3   s   s@    : =AH%89@
 "*5
  5,
Ar   r3   c                    t        j                         rt        j                         rt        d      d}t	        | t
              sd}| f} |f}g }t        | |      D ]  \  }}t	        |t         j                  j                        rvt        |j                        dk(  r0t        |j                        dk(  rt        |j                        dk(  sJ d       t        d |j                         D              sJ d       t        j                   | }	|j#                  t        |	             t        d |	D              rJ d	        |D cg c]  }t        |       }
}| D cg c]A  }t	        |t         j                  j                        rt        |j%                               nd
C }}t'        t        |             D cg c]  }||   ||   z    }}t'        t        |             D cg c]   }t         j(                  j+                         " }}t'        t        |             D cg c]   }t         j(                  j+                         " }}|
t-               n|}t         j(                  j/                          t         j(                  j1                  t         j(                  j3                               5  t        | ||      D ]  \  }}}t'        |      D ]n  }t        j4                   ||       }t         j6                  j9                  t        d |D              t        d |D              t        d |D              d|      }p ~~ 	 ddd       t         j(                  j/                          g }g }t        | ||      D ]x  \  }}}t         j(                  j;                  ||      5   || }ddd       t        j<                        \  }}|j#                  t        |             |j#                  |       z g }g }t        t?        |      t?        |      t?        |      t?        |            D ]  \  }}}}t        d |D              } t         j(                  j;                  ||      5  t         j6                  j9                  t        d |D              t        d |D              t        d | D              d|      }ddd       g }!d}"|D ]9  }#|#j@                  r|!j#                  |"          |"dz  }")|!j#                  d       ; t        |!      }!|j#                  |        |j#                  |!       
 |jC                          |jC                          d }$g }%tE        |       D ]  \  }} |$||   ||   ||   |
|   ||   ||   ||   ||   ||   	      }&t	        |t         j                  j                        r9d }' |'||jF                  |&|jH                        |_$        |%j#                  |       |%j#                  |&        |r|%d   S t        |%      S c c}w c c}w c c}w c c}w c c}w # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)a  Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.
        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FTr   zModules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c              3   8   K   | ]  }|j                   d u   yw)FNrequires_grad.0bs     r   	<genexpr>z)make_graphed_callables.<locals>.<genexpr>  s     EAq%/Es   zIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c              3   P   K   | ]  }t        |t        j                           y wr   )
isinstancer6   Tensor)rP   args     r   rR   z)make_graphed_callables.<locals>.<genexpr>!  s     HS:c5<<0Hs   $&zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.r   Nc              3   :   K   | ]  }|j                   s|  y wr   rM   rP   os     r   rR   z)make_graphed_callables.<locals>.<genexpr>B  s     !H!!H   c              3   :   K   | ]  }|j                   s|  y wr   rM   rP   is     r   rR   z)make_graphed_callables.<locals>.<genexpr>C  s      TqAOO TrZ   c              3   `   K   | ]&  }|j                   st        j                  |       ( y wr   rN   r6   
empty_likerX   s     r   rR   z)make_graphed_callables.<locals>.<genexpr>D  s&      '01((+'s   ..)outputsinputsgrad_outputsonly_inputsallow_unused)r   c              3   b   K   | ]'  }|j                   rt        j                  |      nd  ) y wr   r_   rX   s     r   rR   z)make_graphed_callables.<locals>.<genexpr>g  s+      $
AB1??EQ<$
s   -/c              3   :   K   | ]  }|j                   s|  y wr   rM   rX   s     r   rR   z)make_graphed_callables.<locals>.<genexpr>m  s     KA1??aKrZ   c              3   :   K   | ]  }|j                   s|  y wr   rM   r\   s     r   rR   z)make_graphed_callables.<locals>.<genexpr>n  s     P1QPrZ   c              3   &   K   | ]	  }||  y wr   r   rX   s     r   rR   z)make_graphed_callables.<locals>.<genexpr>o  s     "SQ]1"Ss      c	           	      ~    
  G  fddt         j                  j                        

fd}	|	S )Nc                       e Zd Zefd       Zeej                  j                  j                   fd              Z	y)Omake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphedc                 
   t              D ]A  }|   j                         ||   j                         k7  s+|   j                  ||          C j                          t	        t
              sJ t        d D              S )Nc              3   <   K   | ]  }|j                           y wr   detachrX   s     r   rR   zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>  s     @AQXXZ@s   )rangedata_ptrcopy_r#   rT   tuple)ctxrb   r]   	fwd_graphlen_user_argsstatic_input_surfacestatic_outputss      r   forwardzWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward  s     }- AA+A.779VAY=O=O=QQ,Q/55fQi@A   "!.%888@@@@r   c                 2   t        |      t              k(  sJ t        |      D ];  \  }}|	|j                         |j                         k7  s+|j                  |       = j	                          t        t              sJ t        d D              S )Nc              3   D   K   | ]  }||j                         n|  y wr   rp   rO   s     r   rR   zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>  s$      ;<!-AHHJQ6s    )lenziprs   rt   r#   rT   ru   )rv   gradsggrad	bwd_graphstatic_grad_inputsstatic_grad_outputss       r   backwardzXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward  s     5zS)<%===="#6> *GAt} ::<4==?:GGDM*   " ""4e<<< @R  r   N)
r-   r.   r/   staticmethodr{   r6   autogradfunctiononce_differentiabler   )r   rw   rx   r   r   ry   rz   s   r   Graphedrm     sC    A A ^^$$88 9 r   r   c                      t        j                  |  } j                  t        |      z    }t        j                  |      S r   )r   arg_tree_leavesapplyru   tree_unflatten)	user_argsflatten_user_argsoutr   module_paramsoutput_unflatten_specs      r   functionalizedzVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalized  sE     !( 7 7 C'--%(9":]"JLC))#/DEEr   )r6   r   Function)rw   r   r   rx   r   ry   rz   r   r   r   r   s   ````````` @r   make_graphed_autograd_functionz>make_graphed_callables.<locals>.make_graphed_autograd_function  s-    	 	enn-- 	:	F r   c                       fd}|S )Nc                  6    j                   k(  r |  S  |  S r   )training)r   funcgraph_training_stategraphedorig_fwds    r   new_fwdzEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwd  s(     }}(<<&	22'33r   r   )r   r   r   r   r   s   ```` r   make_graphed_forwardz4make_graphed_callables.<locals>.make_graphed_forward  s    4 r   )%r6   is_autocast_enabledis_autocast_cache_enabledRuntimeErrorrT   ru   r   nnModuler~   _backward_hooks_forward_hooks_forward_pre_hooksallbuffersr   r   append
parametersrr   r7   r   r   r?   r:   r8   tree_leavesr   r   r3   tree_flattenreversedrN   reverse	enumerater   r{   )(	callablessample_argsnum_warmup_itersallow_unused_inputr   just_one_callableflatten_sample_argscargsflatten_argper_callable_len_user_argsper_callable_module_paramsr]   "per_callable_static_input_surfaces_
fwd_graphs
bwd_graphsmempoolr   ry   ra   grad_inputsper_callable_static_outputs"per_callable_output_unflatten_specrw   flatten_outputsspec per_callable_static_grad_outputsper_callable_static_grad_inputsrz   r   r   r   r   grad_idxrV   r   retr   r   s(                                           r   make_graphed_callablesr      sC   J   "u'F'F'Hm
 	
 i' L	"ny+. 
4a)A%%&!+(()Q.,,-2
]3 EEE -E
 --t4""5#56HKHH 	
Z	
H#
0 9L!L#d)!L!L " ",Auxx!?allnRG" " s9~&* 	A!;A!>>*& *
 38I2GHQ%**&&(HJH27I2GHQ%**&&(HJH%)\!tG
 
JJ			5::,,.	/ %03{$F1
 	%,D$, +, 
!--dDk:#nn11!!HW!HH  T,@ TT!& '5<' " !%!3 2 
 	%%  
JJ #%)+&!$YZ!H 8dIZZig6 	"DkG	" !( 4 4W =#**5+AB*11$78 (*$&(#JM34,-+,	K #CFni $ $
FT$
 
 ZZig6 	..--KKKP(<PP""S.A"SS / . K	  ' 	0C  "))+h*?@A"))$/	0 ##56(//0CD'../ABG#CL %,,.#++-0f CY'  40qMqM&q)&q).q1.q1'*,Q/+A.

 dEHHOO,	 0dmmWdll[DLJJtJJw; > 1v:o "M"*
 IH% %4	" 	",	 	sE   XAX#=X('%X-$%X22BX7'Y,AY7YY	Y	)   FN)r@   typingr   r6   torch.utilsr   _utilsr   hasattr_C__dict__torch._Cr
   r   r	   r   r   r   r3   r   r   r   r   <module>r      s    	     uxx*+&1,&?EHHl#.9:N.OEHH*+:E(;EHH67 , D.## D.NFA FAV PTbr   