
    '}hX              #          d dl Z d dl mZ ddlmZmZmZmZmZmZm	Z	m
Z
mZmZmZ d dlmZmZ ddgZ G d de      Zd	d
e de de dz   e_        	 	 	 	 	 d!dee   dee   dee   dee   dee   dee   dedee   dededededededededef"dZdee   dee   dee   dee   dee   dee   dedededededededededef dZdee   dee   dee   dee   dee   dee   dedededededededededef d Zy)"    N)Tensor   )	Optimizer_use_grad_for_differentiable
_get_value_dispatch_sqrt_stack_if_compiling_get_scalar_dtype_default_to_fused_or_foreach_view_as_real_capturable_doc_differentiable_doc_foreach_doc)ListOptionalNAdamnadamc            
       h     e Zd Z	 	 ddddddedee   dedef fdZ fd	Zd
 Zedd       Z	 xZ
S )r   FN)foreach
capturabledifferentiabledecoupled_weight_decayr   r   r   c                j   d|k  st        d|       d|k  st        d|       d|d   cxk  rdk  sn t        d|d          d|d   cxk  rdk  sn t        d|d          d|k  st        d	|       d|k  st        d
|       t        ||||||||	|
	      }t        |   ||       y )N        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: r   z#Invalid beta parameter at index 1: zInvalid weight_decay value: zInvalid momentum_decay value: )	lrbetasepsweight_decaymomentum_decayr   r   r   r   )
ValueErrordictsuper__init__)selfparamsr   r   r   r   r    r   r   r   r   defaults	__class__s               P/var/www/html/test/engine/venv/lib/python3.12/site-packages/torch/optim/nadam.pyr$   zNAdam.__init__   s     by6rd;<<cz6se<==eAh$$B58*MNNeAh$$B58*MNNl";L>JKKn$=n=MNOO2U%1./E 'J~_ 	*    c                    t         |   |       | j                  D ]h  }|j                  dd        |j                  dd       |j                  dd       |j                  dd       |d   D ]  }| j                  j                  |g       }t        |      dk7  s/t        j                  |d         s_t        |d         }|d   r*t        j                  |t               |j                  	      nt        j                  |t               
      |d<   t        j                  |d         r|d   }|d   r*t        j                  |t               |j                  	      nt        j                  |t               
      |d<    k y )Nr   r   Fr   r   r&   r   stepdtypedevicer.   
mu_product)r#   __setstate__param_groups
setdefaultstategetlentorch	is_tensorfloattensorr
   r/   )r%   r5   grouppp_statestep_valmu_prod_valr(   s          r)   r2   zNAdam.__setstate__!   sb   U#&& 	CEY-\51-u55u=8_ 
C**..B/w<1$ ??76?;#(#9.3L.A ,1<<HYH[dedldl+mGL||T\dudwGx   ??7<+@A&-l&;49,4G 27kQbQdmnmumu1vMR\\Zem~  nA  NB  -
C	Cr*   c                    d}|d   D ]  }	|	j                   |t        j                  |	      z  }|j                  |	       |	j                   j                  rt        d      |j                  |	j                          | j                  |	   }
t        |
      dk(  r|d   r*t        j                  dt               |	j                        nt        j                  dt               	      |
d
<   |d   r*t        j                  dt               |	j                        nt        j                  dt               	      |
d<   t        j                  |	t        j                        |
d<   t        j                  |	t        j                        |
d<   |j                  |
d          |j                  |
d          |j                  |
d          |j                  |
d
           |S )NFr&   z'NAdam does not support sparse gradientsr   r    r-   r   r0   r,   r   r1   )memory_formatexp_avg
exp_avg_sq)gradr8   
is_complexappend	is_sparseRuntimeErrorr5   r7   zerosr
   r/   r;   ones
zeros_likepreserve_format)r%   r<   params_with_gradgradsexp_avgsexp_avg_sqsmu_productsstate_stepshas_complexr=   r5   s              r)   _init_groupzNAdam._init_group5   s   x 	2Avv!u//22 ''*66##&'PQQQVV$

1u:? !. B.?.A!((S49LLL]L_4` &M !. 

2->-@R49LLL]L_4` ,'
 (-'7'7I^I^'_E)$*/*:*:1ELaLa*bE,'i 01""5#67""5#67""5=1=	2> r*   c                 n   | j                          d}|$t        j                         5   |       }ddd       | j                  D ]a  }g }g }g }g }g }g }	|d   \  }
}| j	                  |||||||	      }t        ||||||	|
||d   |d   |d   |d   |d   |d   |d	   |d
   |       c |S # 1 sw Y   {xY w)zPerforms a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        Nr   r   r   r    r   r   r   r   r   )beta1beta2r   r   r    r   r   r   r   r   rU   ) _cuda_graph_capture_health_checkr8   enable_gradr3   rV   r   )r%   closurelossr<   rO   rP   rQ   rR   rS   rT   rX   rY   rU   s                r)   r,   z
NAdam.stepX   s    	--/""$ !y! && 	+E!EHKKK >LE5**52BE8U`bmoz{K"4[$^4!&'7!8El)./G)H	*"<0!&'7!8)!+	+: A! !s   B++B4)gMb`?)g?g+?g:0yE>r   gMbp?FN)__name__
__module____qualname__boolr   r$   r2   rV   r   r,   __classcell__)r(   s   @r)   r   r   
   sa    @DUZ+.2u(-+NR+%d^+@D+ "&+,C(!F "+ "+r*   a  Implements NAdam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma_t \text{ (lr)}, \: \beta_1,\beta_2 \text{ (betas)},
                \: \theta_0 \text{ (params)}, \: f(\theta) \text{ (objective)}                   \\
            &\hspace{13mm} \: \lambda \text{ (weight decay)}, \:\psi \text{ (momentum decay)}    \\
            &\hspace{13mm} \: \textit{decoupled\_weight\_decay}                                  \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0 \leftarrow 0 \text{ ( second moment)}                                 \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \theta_t \leftarrow \theta_{t-1}                                       \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm}\textbf{if} \: \textit{decoupled\_weight\_decay}                       \\
            &\hspace{15mm} \theta_t \leftarrow \theta_{t-1} - \gamma \lambda \theta_{t-1}                    \\
            &\hspace{10mm}\textbf{else}                                                          \\
            &\hspace{15mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm} \mu_t \leftarrow \beta_1 \big(1 - \frac{1}{2}  0.96^{t \psi} \big)     \\
            &\hspace{5mm} \mu_{t+1} \leftarrow \beta_1 \big(1 - \frac{1}{2} 0.96^{(t+1)\psi}\big)\\
            &\hspace{5mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{5mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{5mm}\widehat{m_t} \leftarrow \mu_{t+1} m_t/(1-\prod_{i=1}^{t+1}\mu_i)\\[-1.ex]
            & \hspace{11mm} + (1-\mu_t) g_t /(1-\prod_{i=1}^{t} \mu_{i})                         \\
            &\hspace{5mm}\widehat{v_t} \leftarrow   v_t/\big(1-\beta_2^t \big)                   \\
            &\hspace{5mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}/
                \big(\sqrt{\widehat{v_t}} + \epsilon \big)                                       \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Incorporating Nesterov Momentum into Adam`_.
    a  
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        momentum_decay (float, optional): momentum momentum_decay (default: 4e-3)
        decoupled_weight_decay (bool, optional): whether to use decoupled weight
            decay as in AdamW to obtain NAdamW (default: False)
        z	
        z

    .. _Incorporating Nesterov Momentum into Adam:
        https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101

    r&   rP   rQ   rR   rS   rT   r   r   r   r   rU   rX   rY   r   r   r    r   c                ~   t        d |D              st        d      t        d |D              st        d      |t        | |	d      \  }}|r)t        j                  j                         rt        d      |r%t        j                  j                         st        }nt        } || ||||||||||||||	|
	       y)
zpFunctional API that performs NAdam algorithm computation.

    See :class:`~torch.optim.NAdam` for details.
    c              3   P   K   | ]  }t        |t        j                           y wr^   
isinstancer8   r   .0ts     r)   	<genexpr>znadam.<locals>.<genexpr>        @qz!U\\*@   $&zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsc              3   P   K   | ]  }t        |t        j                           y wr^   rf   rh   s     r)   rk   znadam.<locals>.<genexpr>   rl   rm   zPAPI has changed, `mu_products` argument must contain a list of singleton tensorsNF)	use_fusedz6torch.jit.script not supported with foreach optimizers)
rX   rY   r   r   r    r   r   r   r   rU   )allrJ   r   r8   jitis_scripting_multi_tensor_nadam_single_tensor_nadam)r&   rP   rQ   rR   rS   rT   r   r   r   r   rU   rX   rY   r   r   r    r   _funcs                      r)   r   r      s    4 @K@@mnn@K@@mnn1&.TYZ
7599))+STTuyy--/"#					"& 6& "r*   c       
   
      F   t        |       D ]  \  }}||   }||   }||   }||   }||   }t        j                  |      rTt        j                  |      }t        j                  |      }t        j                  |      }t        j                  |      }t        j                  j                         sQ|rO|j                  r|j                  r|j                  s+|j                  r|j                  r|j                  sJ d       |dz  }|r|}nt        |      }d||z  z
  }|	dk7  r-|r|j                  d||	z  z
         n|j                  ||	      }|ddd||
z  z  z  z
  z  }|ddd|dz   |
z  z  z  z
  z  }||z  }|j                  |d|z
         |j                  |      j                  ||d|z
         |j                  |      j                         }|s|r]|j                  |      }||z  }|| d|z
  z  d|z
  z  z  }|| |z  d|z
  z  z  }|j                  ||       |j                  ||       ,t        |      |z  }|j!                  |       |j                  ||| d|z
  z  dt        |      z
  z         |j                  ||| |z  d|z
  z          y )	NzUIf capturable=True, params, mu_products, and state_steps must be CUDA or XLA tensors.r   r   alphar         ?Q?)value)	enumerater8   rG   view_as_real_utilsis_compilingis_cudais_xlar   mul_addlerp_addcmul_divsqrtaddcdiv_add_)r&   rP   rQ   rR   rS   rT   rX   rY   r   r   r    r   r   r   r   rU   iparamrF   rD   rE   r1   step_tr,   bias_correction2mumu_nextdenommu_product_nexts                                r)   rt   rt      s   $ f% >[5Qx1+ ^
 ^
QE"&&u-E%%d+D((1G++J7J ||((*z:#5#5&..ell_i_p_pu{  vC  vCgfg D 	!Df%Du},1%

1rL001xx\x: b3$4.+@"ABBC2t^0K'L MMN 	b
 	dAI&''d!e)'D/0557ZIIcNE )72OB3"r'?b:o>?D"w"2F!GHGNN4'NN7E*(4w>OJJsONN4sb2g"zR\G]B]/^N`NN7E2#-BDX1YNZ}>[r*   c       
         
   t        |       dk(  ry |rJ d       t        j                  j                         s&|r$t	        d t        | ||      D              sJ d       t        j                  | |||||g      }|j                         D ]  \  \  }}}}}}}|rt        ||||       |d   j                  r.t        j                  |t        j                  dd      d       nt        j                  |d	       |	dk7  r7|rt        j                  |d	||	z  z
         nt        j                  |||	      }t        j                  ||d	|z
         t        j                  ||       t        j                   |||d	|z
         t        j"                  |      }|r4t        j$                  ||
      }t        j&                  d
|      }t        j                  |d       t        j                  |d       t        j                  ||       t        j                  ||
       t        j&                  d
|      }t        j                  |d       t        j                  |d       t        j                  ||       ~t        j&                  ||      }t        j(                  |d       t        j*                  |       t        j,                  |       nx|D cg c]  }t/        d	|t1        |      z  z
         }}|D cg c]  }|ddd
t1        |      |
z  z  z  z
  z   }}|D cg c]  }|ddd
t1        |      d	z   |
z  z  z  z
  z  ! }}t        j                  ||       t        j2                  ||       t        j                  ||       ~|rt        j(                  |d       t        j                  ||       t        j4                  |d      }t        j*                  |       t        j2                  ||       |}~t        j$                  ||      }t        j                  ||       t        j(                  |d       t        j2                  ||       |} ~t        j$                  ||      }!t        j                   |!| |       t        j6                  ||!|       t9        t        ||      D "#cg c]  \  }"}#|d|#z
  z  dt1        |"      z
  z  dz  ! c}#}"      }t9        t        ||      D "$cg c]  \  }"}$||$z  dt1        |"      |$z  z
  z  dz  ! c}$}"      } t        j6                  ||||       t        j6                  ||||         y c c}w c c}w c c}w c c}#}"w c c}$}"w )Nr   z#_foreach ops don't support autogradc              3   t   K   | ]0  \  }}}|j                   xr |j                   xr |j                    2 y wr^   )r   )ri   r=   mpr,   s       r)   rk   z&_multi_tensor_nadam.<locals>.<genexpr>k  s9      M"q"d 99<<< Ms   68zNIf capturable=True, params, mu_products, and state_steps must be CUDA tensors.r   cpu)r/   rx   r   r{   g      rz   )r7   r8   r   r   rp   zipr   "_group_tensors_by_device_and_dtypevaluesr   is_cpu_foreach_add_r;   _foreach_mul__foreach_add_foreach_lerp__foreach_addcmul__foreach_sqrt_foreach_mul_foreach_pow_foreach_sub__foreach_neg__foreach_sqrt_r   r   _foreach_div__foreach_sub_foreach_addcdiv_r	   )%r&   rP   rQ   rR   rS   rT   rX   rY   r   r   r    r   r   r   r   rU   grouped_tensorsgrouped_paramsgrouped_gradsgrouped_exp_avgsgrouped_exp_avg_sqsgrouped_mu_productsgrouped_state_stepsru   exp_avg_sq_sqrtexponentmusmu_nextsbias_correction_sqrtr,   r   step_size_gradsstep_size_expavg	numeratorr1   r   r   s%                                        r)   rs   rs   R  sQ   $ 6{aDDD <<$$&: M&)&+{&KM M 	]\	] M
  BBFES[]hju  xC  DD  EOO^OeOeOgli 	L 
H.-)9	13F .-9IK^_ q!(( 3U\\#e5T\_` 3Q71%##NA\8I4IJ % 2 2=.Xd e 	-}a%iH/7 3]MSTW\S\]--.AB))*=~NH$$T84CT*S)U+ .9))$9H$/#.%0 #(#5#5e=P#Q  4c: 45  !56^q#rVZN1u
4@P7P3P$Q#r #rbuvZ^5BD1AN1R(S!TTUvCv$79  cTz$7G!7K~6]-^&_!_` 9H 9 	/5O-ABOS1 !S)R(&&':C@E&U+!O &&':HEE"- s+%0' **?MJI##I/?AQR ##NIO1ILM`beIf3h7Ez2 57"r'Nb:V`KaFa4bfh3h 3h iO2ORSfhpOq4s8K
G 68'\R*U_J`cjJjEj5koq4q 4s  t ##NM?Tcd##N4DoWghYlin $sv9Z3h4ss   !U7!U$U/$U/$U$)FNFFF)r8   r   	optimizerr   r   r   r   r	   r
   r   r   r   r   r   typingr   r   __all__r   __doc__rb   r:   r   rt   rs   rB   r*   r)   <module>r      s    \ \ \ \ "G
zI zx#F	 
 		 		 G9H */$("!&#:"$v, :"f:"L:" F|:" F|	:"
 F|:" #':" D>:" :" :" :" :" :"  !:"" #:"$  %:"& ':"zP[f P[ $VP[#'<P[ '+6lP[ '+6l	P[
 '+6lP[ !&P[ !&P[ #P[ (-P[ */P[ $P[ 26P[ &*P[ *.P[  '+!P[fKiV Ki#F|Ki"&v,Ki &*&\Ki &*&\	Ki
 &*&\Ki  %Ki  %Ki "Ki ',Ki ).Ki #Ki 15Ki %)Ki )-Ki  &*!Kir*   