
import torch
from torch import Tensor

from .optimizer import (
    Optimizer,
    _use_grad_for_differentiable,
    _default_to_fused_or_foreach,
    _differentiable_doc,
    _foreach_doc,
    _maximize_doc,
    _view_as_real,
)
from typing import List, Optional

__all__ = ["Adadelta", "adadelta"]


class Adadelta(Optimizer):
    def __init__(
        self,
        params,
        lr=1.0,
        rho=0.9,
        eps=1e-6,
        weight_decay=0,
        foreach: Optional[bool] = None,
        *,
        maximize: bool = False,
        differentiable: bool = False,
    ):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= rho <= 1.0:
            raise ValueError(f"Invalid rho value: {rho}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

        defaults = dict(
            lr=lr,
            rho=rho,
            eps=eps,
            weight_decay=weight_decay,
            maximize=maximize,
            foreach=foreach,
            differentiable=differentiable,
        )
        super().__init__(params, defaults)

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault("foreach", None)
            group.setdefault("maximize", False)
            group.setdefault("differentiable", False)

    def _init_group(self, group, params_with_grad, grads, square_avgs, acc_deltas):
        has_complex = False
        for p in group["params"]:
            if p.grad is None:
                continue
            has_complex |= torch.is_complex(p)
            params_with_grad.append(p)
            if p.grad.is_sparse:
                raise RuntimeError("Adadelta does not support sparse gradients")
            grads.append(p.grad)

            state = self.state[p]

            # Lazy state initialization
            if len(state) == 0:
                state["step"] = 0
                state["square_avg"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format
                )
                state["acc_delta"] = torch.zeros_like(
                    p, memory_format=torch.preserve_format
                )

            square_avgs.append(state["square_avg"])
            acc_deltas.append(state["acc_delta"])

            state["step"] += 1

        return has_complex

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            square_avgs = []
            acc_deltas = []
            lr, rho, eps, weight_decay, foreach, maximize, differentiable = (
                group["lr"],
                group["rho"],
                group["eps"],
                group["weight_decay"],
                group["foreach"],
                group["maximize"],
                group["differentiable"],
            )

            has_complex = self._init_group(
                group, params_with_grad, grads, square_avgs, acc_deltas
            )

            adadelta(
                params_with_grad,
                grads,
                square_avgs,
                acc_deltas,
                lr=lr,
                rho=rho,
                eps=eps,
                weight_decay=weight_decay,
                foreach=foreach,
                maximize=maximize,
                differentiable=differentiable,
                has_complex=has_complex,
            )

        return loss


Adadelta.__doc__ = r"""Implements Adadelta algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)},
                \: f(\theta) \text{ (objective)}, \: \rho \text{ (decay)},
                \: \lambda \text{ (weight decay)}                                                \\
            &\textbf{initialize} :  v_0  \leftarrow 0 \: \text{ (square avg)},
                \: u_0 \leftarrow 0 \: \text{ (accumulate variables)}                     \\[-1ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm}if \: \lambda \neq 0                                                    \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda  \theta_{t-1}                            \\
            &\hspace{5mm} v_t      \leftarrow v_{t-1} \rho + g^2_t (1 - \rho)                    \\
            &\hspace{5mm}\Delta x_t    \leftarrow   \frac{\sqrt{u_{t-1} +
                \epsilon }}{ \sqrt{v_t + \epsilon}  }g_t \hspace{21mm}                           \\
            &\hspace{5mm} u_t  \leftarrow   u_{t-1}  \rho +
                 \Delta x^2_t  (1 - \rho)                                                        \\
            &\hspace{5mm}\theta_t      \leftarrow   \theta_{t-1} - \gamma  \Delta x_t            \\
            &\rule{110mm}{0.4pt}                                                          \\[-1ex]
            &\bf{return} \:  \theta_t                                                     \\[-1ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `ADADELTA: An Adaptive Learning Rate Method`_.
    """ + fr"""
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        rho (float, optional): coefficient used for computing a running average
            of squared gradients (default: 0.9). A higher value of `rho` will
            result in a slower average, which can be helpful for preventing
            oscillations in the learning process.
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-6).
        lr (float, optional): coefficient that scales delta before it is
            applied to the parameters (default: 1.0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        {_foreach_doc}
        {_maximize_doc}
        {_differentiable_doc}
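
    Example (an illustrative sketch; ``model``, ``input``, ``target`` and
    ``loss_fn`` are assumed to be defined by the caller)::

        >>> # xdoctest: +SKIP
        >>> optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0, rho=0.9)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()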

    .. _ADADELTA\: An Adaptive Learning Rate Method:
        https://arxiv.org/abs/1212.5701

    """


def adadelta(
    params: List[Tensor],
    grads: List[Tensor],
    square_avgs: List[Tensor],
    acc_deltas: List[Tensor],
    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
    foreach: Optional[bool] = None,
    differentiable: bool = False,
    has_complex: bool = False,
    *,
    lr: float,
    rho: float,
    eps: float,
    weight_decay: float,
    maximize: bool,
):
    r"""Functional API that performs Adadelta algorithm computation.

    See :class:`~torch.optim.Adadelta` for details.
    """

    # We still respect when the user inputs False for foreach.
    if foreach is None:
        _, foreach = _default_to_fused_or_foreach(
            params, differentiable, use_fused=False
        )

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")

    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adadelta
    else:
        func = _single_tensor_adadelta

    func(
        params,
        grads,
        square_avgs,
        acc_deltas,
        lr=lr,
        rho=rho,
        eps=eps,
        weight_decay=weight_decay,
        maximize=maximize,
        differentiable=differentiable,
        has_complex=has_complex,
    )


def _single_tensor_adadelta(
    params: List[Tensor],
    grads: List[Tensor],
    square_avgs: List[Tensor],
    acc_deltas: List[Tensor],
    *,
    lr: float,
    rho: float,
    eps: float,
    weight_decay: float,
    maximize: bool,
    differentiable: bool,
    has_complex: bool,
):
    for param, grad, square_avg, acc_delta in zip(
        params, grads, square_avgs, acc_deltas
    ):
        grad = grad if not maximize else -grad

        if weight_decay != 0:
            grad = grad.add(param, alpha=weight_decay)

        if torch.is_complex(param):
            square_avg = torch.view_as_real(square_avg)
            acc_delta = torch.view_as_real(acc_delta)
            grad = torch.view_as_real(grad)

        square_avg.mul_(rho).addcmul_(grad, grad, value=1 - rho)
        std = square_avg.add(eps).sqrt_()
        delta = acc_delta.add(eps).sqrt_()
        if differentiable:
            delta = delta.clone()
        delta.div_(std).mul_(grad)
        acc_delta.mul_(rho).addcmul_(delta, delta, value=1 - rho)

        if torch.is_complex(param):
            delta = torch.view_as_complex(delta)
        param.add_(delta, alpha=-lr)


def _multi_tensor_adadelta(
    params: List[Tensor],
    grads: List[Tensor],
    square_avgs: List[Tensor],
    acc_deltas: List[Tensor],
    *,
    lr: float,
    rho: float,
    eps: float,
    weight_decay: float,
    maximize: bool,
    differentiable: bool,
    has_complex: bool,
):
    assert not differentiable, "_foreach ops don't support autograd"

    if len(params) == 0:
        return

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, square_avgs, acc_deltas]
    )
    for (
        device_params,
        device_grads,
        device_square_avgs,
        device_acc_deltas,
    ), _ in grouped_tensors.values():
        if has_complex:
            _view_as_real(
                device_params, device_grads, device_square_avgs, device_acc_deltas
            )

        if maximize:
            device_grads = torch._foreach_neg(device_grads)

        if weight_decay != 0:
            # Re-use the intermediate memory (device_grads) already allocated for maximize
            if maximize:
                torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
            else:
                device_grads = torch._foreach_add(
                    device_grads, device_params, alpha=weight_decay
                )

        torch._foreach_mul_(device_square_avgs, rho)
        torch._foreach_addcmul_(
            device_square_avgs, device_grads, device_grads, value=1 - rho
        )

        std = torch._foreach_add(device_square_avgs, eps)
        torch._foreach_sqrt_(std)

        deltas = torch._foreach_add(device_acc_deltas, eps)
        torch._foreach_sqrt_(deltas)
        torch._foreach_div_(deltas, std)
        torch._foreach_mul_(deltas, device_grads)

        torch._foreach_add_(device_params, deltas, alpha=-lr)

        torch._foreach_mul_(device_acc_deltas, rho)
        torch._foreach_addcmul_(device_acc_deltas, deltas, deltas, value=1 - rho)
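

# ------------------------------------------------------------------------
# Illustrative usage (a comment-only sketch, not part of the module API;
# `model` and the data below are hypothetical). `foreach=True` routes the
# update through _multi_tensor_adadelta above, `foreach=False` forces
# _single_tensor_adadelta, and the default `foreach=None` lets
# _default_to_fused_or_foreach pick the implementation:
#
#     import torch
#
#     model = torch.nn.Linear(10, 1)
#     optimizer = torch.optim.Adadelta(
#         model.parameters(), lr=1.0, rho=0.9, eps=1e-6, foreach=True
#     )
#     for _ in range(3):
#         loss = model(torch.randn(8, 10)).pow(2).mean()
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
# ------------------------------------------------------------------------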