
"""Base class of optimizer."""

import abc
import platform
import re

import tensorflow.compat.v2 as tf
from absl import logging

from tf_keras.src import backend
from tf_keras.src import initializers
from tf_keras.src.dtensor import utils as dtensor_utils
from tf_keras.src.optimizers import utils as optimizer_utils
from tf_keras.src.optimizers.schedules import learning_rate_schedule
from tf_keras.src.utils import tf_utils

# isort: off
from tensorflow.python.util.tf_export import keras_export
from tensorflow.tools.docs import doc_controls


class _BaseOptimizer(tf.__internal__.tracking.AutoTrackable):
    """Optimizer base class, which only supports non-distribute use case."""

    def __init__(
        self,
        name,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        jit_compile=True,
        **kwargs,
    ):
        self.name = name
        self.weight_decay = weight_decay
        self.clipnorm = clipnorm
        self.global_clipnorm = global_clipnorm
        self.clipvalue = clipvalue
        self.use_ema = use_ema
        # Only use XLA compilation when it is supported and a GPU is present.
        if (
            jit_compile
            and tf_utils.can_jit_compile()
            and tf.config.list_physical_devices("GPU")
        ):
            self.jit_compile = True
        else:
            self.jit_compile = False

        if platform.system() == "Darwin" and platform.processor() == "arm":
            logging.warning(
                "At this time, the v2.11+ optimizer "
                f"`tf.keras.optimizers.{self.__class__.__name__}` runs slowly "
                "on M1/M2 Macs, please use the legacy TF-Keras optimizer "
                "instead, located at "
                f"`tf.keras.optimizers.legacy.{self.__class__.__name__}`."
            )

        if use_ema:
            # Verify the arguments related to EMA.
            if ema_momentum > 1 or ema_momentum < 0:
                raise ValueError(
                    "`ema_momentum` must be in the range [0, 1]. "
                    f"Received: ema_momentum={ema_momentum}"
                )
            if ema_overwrite_frequency and (
                not isinstance(ema_overwrite_frequency, int)
                or ema_overwrite_frequency < 1
            ):
                raise ValueError(
                    "`ema_overwrite_frequency` must be an integer > 1 or None. "
                    "Received: ema_overwrite_frequency="
                    f"{ema_overwrite_frequency}"
                )
        self.ema_momentum = ema_momentum
        self.ema_overwrite_frequency = ema_overwrite_frequency

        if self.clipnorm is not None and self.global_clipnorm is not None:
            raise ValueError(
                "At most one of `clipnorm` and `global_clipnorm` can "
                f"be set. Received: clipnorm={self.clipnorm}, "
                f"global_clipnorm={self.global_clipnorm}."
            )

        self._variables = []
        # A dict mapping a model ShardedVariable id to a builder object that
        # accumulates the corresponding optimizer variable shards. See
        # `add_variable_from_reference`.
        self._sharded_variable_builders = self._no_dependency({})
        self._create_iteration_variable()
        self._process_kwargs(kwargs)

    def _create_iteration_variable(self):
        """Create the iterations counter variable."""
        with tf.init_scope():
            # Lift the variable creation to init scope to avoid environment
            # issues.
            self._iterations = tf.Variable(
                0, name="iteration", dtype=tf.int64, trainable=False
            )
        self._variables.append(self._iterations)

    def _process_kwargs(self, kwargs):
        # Remove the `is_legacy_optimizer` arg, which is only used for
        # serialization.
        kwargs.pop("is_legacy_optimizer", None)
        lr = kwargs.pop("lr", None)
        if lr:
            logging.warning(
                "`lr` is deprecated in TF-Keras optimizer, please use "
                "`learning_rate` or use the legacy optimizer, e.g.,"
                f"tf.keras.optimizers.legacy.{self.__class__.__name__}."
            )
        legacy_kwargs = {
            "decay",
            "gradient_aggregator",
            "gradient_transformers",
        }
        for k in kwargs:
            if k in legacy_kwargs:
                raise TypeError(
                    f"{k} is deprecated in the new TF-Keras optimizer, please "
                    "check the docstring for valid arguments, or use the "
                    "legacy optimizer, e.g., "
                    f"tf.keras.optimizers.legacy.{self.__class__.__name__}."
                )
            else:
                raise TypeError(
                    f"{k} is not a valid argument, kwargs should be empty "
                    " for `optimizer_experimental.Optimizer`."
                )

    def _create_or_restore_slot_variable(self, **kwargs):
        raise ValueError(
            "You are trying to restore a checkpoint from a legacy TF-Keras "
            "optimizer into a v2.11+ Optimizer, which can cause errors. "
            "Please update the optimizer referenced in your code to be an "
            "instance of `tf.keras.optimizers.legacy.Optimizer`, e.g.: "
            f"`tf.keras.optimizers.legacy.{self.__class__.__name__}`."
        )

    def _var_key(self, variable):
        """Get a unique identifier of the given variable."""
        return variable._unique_id

    def _deduplicate_sparse_grad(self, grads):
        """Deduplicate sparse gradient.

        For sparse gradients, i.e., gradient is of type `tf.IndexedSlices`,
        it is possible that `gradient.indices` has duplicated indices.
        This function adds up values for the duplicated indices, and returns
        a `tf.IndexedSlices` with indices of unique values.
        """
        processed_grads = []
        for grad in grads:
            if isinstance(grad, tf.IndexedSlices):
                values = grad.values
                indices = grad.indices
                unique_indices, new_index_positions = tf.unique(indices)
                summed_values = tf.math.unsorted_segment_sum(
                    values, new_index_positions, tf.shape(unique_indices)[0]
                )
                processed_grads.append(
                    tf.IndexedSlices(
                        summed_values, unique_indices, grad.dense_shape
                    )
                )
            else:
                processed_grads.append(grad)

        return processed_grads

    @abc.abstractmethod
    def update_step(self, gradient, variable):
        """Function to update variable value based on given gradients.

        This method must be implemented in customized optimizers.

        Args:
          gradient: backpropagated gradient of the given variable.
          variable: variable whose value needs to be updated.

        Returns:
          An `Operation` that applies the specified gradients.
        """
        raise NotImplementedError

    @tf.function(jit_compile=True)
    def _update_step_xla(self, gradient, variable, key):
        """A wrapper of `update_step` to enable XLA acceleration.

        Due to `tf.function` tracing mechanism, for (gradient, variable) pairs
        of the same shape and dtype, the execution graph always invokes the
        first pair it has seen. Thus, we need a `key` argument to make each
        (gradient, variable) pair unique. In addition, XLA cannot understand
        string input, so the key is an integer.

        Args:
          gradient: backpropagated gradient of the given variable.
          variable: variable whose value needs to be updated.
          key (int): a unique key that identifies the variable.

        Returns:
          An `Operation` that applies the specified gradients.
        """
        return self._update_step(gradient, variable)

    def _update_step(self, gradient, variable):
        if getattr(variable, "_unique_id", None) is None:
            # Variable has no `_unique_id`, so it cannot be looked up; skip it.
            return
        if self._var_key(variable) not in self._index_dict:
            raise KeyError(
                f"The optimizer cannot recognize variable {variable.name}. "
                "This usually means you are trying to call the optimizer to "
                "update different parts of the model separately. Please call "
                "`optimizer.build(variables)` with the full list of trainable "
                "variables before the training loop or use legacy optimizer "
                f"`tf.keras.optimizers.legacy.{self.__class__.__name__}."
            )
        self.update_step(gradient, variable)

    def compute_gradients(self, loss, var_list, tape=None):
        """Compute gradients of loss on trainable variables.

        Args:
          loss: `Tensor` or callable. If a callable, `loss` should take no
            arguments and return the value to minimize.
          var_list: list or tuple of `Variable` objects to update to minimize
            `loss`, or a callable returning the list or tuple of `Variable`
            objects. Use callable when the variable list would otherwise be
            incomplete before `minimize` since the variables are created at the
            first time `loss` is called.
          tape: (Optional) `tf.GradientTape`. If `loss` is provided as a
            `Tensor`, the tape that computed the `loss` must be provided.

        Returns:
          A list of (gradient, variable) pairs. Variable is always present, but
          gradient can be `None`.
        """
        if not callable(loss) and tape is None:
            raise ValueError(
                "`tape` is required when a `Tensor` loss is passed. "
                f"Received: loss={loss}, tape={tape}."
            )
        if tape is None:
            tape = tf.GradientTape()
        if callable(loss):
            with tape:
                if not callable(var_list):
                    tape.watch(var_list)
                loss = loss()
                if callable(var_list):
                    var_list = var_list()

        grads = tape.gradient(loss, var_list)
        return list(zip(grads, var_list))

    def _clip_gradients(self, grads):
        clipped_grads = []
        if self.clipnorm and self.clipnorm > 0:
            for g in grads:
                if g is None:
                    clipped_grads.append(g)
                else:
                    clipped_grads.append(tf.clip_by_norm(g, self.clipnorm))
            return clipped_grads

        if self.global_clipnorm and self.global_clipnorm > 0:
            return tf.clip_by_global_norm(grads, self.global_clipnorm)[0]

        if self.clipvalue and self.clipvalue > 0:
            for g in grads:
                if g is None:
                    clipped_grads.append(g)
                else:
                    clipped_grads.append(
                        tf.clip_by_value(
                            g,
                            clip_value_min=-self.clipvalue,
                            clip_value_max=self.clipvalue,
                        )
                    )
            return clipped_grads

        return grads

    @property
    def iterations(self):
        """The number of training steps this `optimizer` has run.

        By default, iterations would be incremented by one every time
        `apply_gradients()` is called.
        """
        return self._iterations

    @iterations.setter
    def iterations(self, variable):
        if getattr(self, "_built", False):
            raise RuntimeError(
                "Cannot set `iterations` to a new Variable after "
                "the Optimizer weights have been created. Here it is "
                f"attempting to set `iterations` to {variable}."
                "Usually this means you are trying to set `iterations`"
                " after calling `apply_gradients()`. Please set "
                "`iterations` before calling `apply_gradients()`."
            )
        self._iterations = variable

    @property
    def learning_rate(self):
        if not hasattr(self, "_learning_rate") or self._learning_rate is None:
            raise ValueError(
                "Missing learning rate, please set self.learning_rate at"
                " optimizer creation time."
            )
        lr = self._learning_rate
        if isinstance(lr, learning_rate_schedule.LearningRateSchedule):
            # If the optimizer was given a LearningRateSchedule, return the
            # value tracked in `self._current_learning_rate`, which is updated
            # at each call to `apply_gradients`.
            return self._current_learning_rate
        return lr

    @learning_rate.setter
    def learning_rate(self, learning_rate):
        if isinstance(
            learning_rate, learning_rate_schedule.LearningRateSchedule
        ):
            self._learning_rate = learning_rate
        else:
            if isinstance(
                self._learning_rate,
                learning_rate_schedule.LearningRateSchedule,
            ):
                raise TypeError(
                    "This optimizer was created with a `LearningRateSchedule`"
                    " object as its `learning_rate` constructor argument, "
                    "hence its learning rate is not settable. If you need the"
                    " learning rate to be settable, you should instantiate "
                    "the optimizer with a float `learning_rate` argument."
                )
            self._learning_rate.assign(learning_rate)

    @property
    @doc_controls.do_not_generate_docs
    def lr(self):
        """Alias of `learning_rate()`.

        `lr()` is heavily called in workflows using `optimizer_v2.OptimizerV2`,
        so we keep it for backward compatibility.
        """
        return self.learning_rate

    @lr.setter
    def lr(self, learning_rate):
        self.learning_rate = learning_rate

    def _build_learning_rate(self, learning_rate):
        with tf.init_scope():
            if isinstance(
                learning_rate, learning_rate_schedule.LearningRateSchedule
            ):
                # Create a variable to hold the current learning rate.
                current_learning_rate = tf.convert_to_tensor(
                    learning_rate(self.iterations)
                )
                self._current_learning_rate = tf.Variable(
                    current_learning_rate,
                    name="current_learning_rate",
                    dtype=current_learning_rate.dtype,
                    trainable=False,
                )
                return learning_rate

            return tf.Variable(
                learning_rate,
                name="learning_rate",
                dtype=backend.floatx(),
                trainable=False,
            )

    def build(self, var_list):
        """Initialize the optimizer's variables, such as momentum variables.

        This function has to be implemented by subclass optimizers, and subclass
        optimizers need to call `super().build(var_list)`.

        Args:
          var_list: List of model variables to build optimizers on. For example,
            SGD optimizer with momentum will store one momentum variable
            corresponding to each model variable.
        """
        if getattr(self, "_built", False):
            return
        self._build_index_dict(var_list)
        if self.use_ema:
            self._model_variables_moving_average = []
            for var in var_list:
                # Make a copy of the model variables; the copies store the
                # moving average of the model variables.
                self._model_variables_moving_average.append(
                    self.add_variable_from_reference(
                        var, "average", initial_value=var
                    )
                )

    def _build_index_dict(self, var_list):
        """Build variable to index dictionary.

        Build a dictionary that maps variable to the index of it in the given
        var_list.

        Args:
          var_list: List of variables to build index dict on.

        Returns:
          None
        """
        self._index_dict = {}
        for i, var in enumerate(var_list):
            var_key = self._var_key(var)
            self._index_dict[var_key] = i

    def add_variable(self, shape, dtype=None, initializer="zeros", name=None):
        """Create an optimizer variable.

        Args:
          shape: A list of integers, a tuple of integers, or a 1-D Tensor of
            type int32. Defaults to scalar if unspecified.
          dtype: The DType of the optimizer variable to be created. Defaults to
            `tf.keras.backend.floatx` if unspecified.
          initializer: string or callable. Initializer instance.
          name: The name of the optimizer variable to be created.

        Returns:
          An optimizer variable, in the format of tf.Variable.
        """
        if isinstance(initializer, str):
            initializer = initializers.get(initializer)
        if dtype is None:
            dtype = backend.floatx()
        if shape is None:
            shape = []
        variable = tf.Variable(
            initial_value=initializer(shape, dtype), name=name, trainable=False
        )
        self._variables.append(variable)
        return variable

    def add_variable_from_reference(
        self, model_variable, variable_name, shape=None, initial_value=None
    ):
        """Create an optimizer variable from model variable.

        Create an optimizer variable based on the information of model variable.
        For example, in SGD optimizer momentum, for each model variable, a
        corresponding momentum variable is created of the same shape and dtype.

        Args:
          model_variable: tf.Variable. The corresponding model variable to the
            optimizer variable to be created.
          variable_name: String. The name prefix of the optimizer variable to be
            created. The created variable's name will follow the pattern
            `{variable_name}/{model_variable.name}`, e.g., `momentum/dense_1`.
          shape: List or Tuple, defaults to None. The shape of the optimizer
            variable to be created. If None, the created variable will have the
            same shape as `model_variable`.
          initial_value: A Tensor, or Python object convertible to a Tensor,
            defaults to None. The initial value of the optimizer variable, if
            None, the initial value will be default to 0.

        Returns:
          An optimizer variable.
        """
        if hasattr(model_variable, "true_dtype"):
            dtype = model_variable.true_dtype
        else:
            dtype = model_variable.dtype
        if initial_value is None:
            if shape is None:
                if model_variable.shape.rank is None:
                    # When the rank is None, we cannot get a concrete
                    # `model_variable.shape`, so use dynamic shape.
                    initial_value = tf.zeros_like(model_variable, dtype=dtype)
                else:
                    initial_value = tf.zeros(model_variable.shape, dtype=dtype)
            else:
                initial_value = tf.zeros(shape, dtype=dtype)
        variable = tf.Variable(
            initial_value=initial_value,
            name=f"{variable_name}/{model_variable._shared_name}",
            dtype=dtype,
            trainable=False,
        )
        # If `model_variable` is a shard of a `ShardedVariable`, accumulate the
        # corresponding optimizer variables into a `ShardedVariable` as well,
        # so checkpoints are robust to different partitionings.
        if hasattr(model_variable, "_sharded_container"):
            sharded_variable = model_variable._sharded_container()
            # Get or create the builder object for this sharded variable.
            sv_builder = self._sharded_variable_builders.setdefault(
                (sharded_variable._unique_id, variable_name),
                _ShardedVariableBuilder(len(sharded_variable.variables)),
            )
            sv_builder.add_shard(variable)
            if sv_builder.has_all_shards():
                self._variables.append(sv_builder.build())
        else:
            self._variables.append(variable)
        return variable

    def _trackable_children(self, save_type="checkpoint", **kwargs):
        """Override in order to coalesce and track `ShardedVariable`s.

        If an optimizer variable's corresponding model variable is a shard of a
        larger `ShardedVariable`, then we track the optimizer variable in
        `self._variables` as a `ShardedVariable` via the logic in
        `add_variable_from_reference`. However, most optimizer implementations
        additionally keep their variables as attributes, which will be tracked
        via `AutoTrackable` functionality and not accumulated into
        `ShardedVariable`s.

        So, to enable restoration of these attributes in possibly different
        sharding configurations, we should save them as `ShardedVariable`s.
        Here, any optimizer attributes that are variable shards of a larger
        `ShardedVariable` are replaced by the `ShardedVariable` itself,
        which was created in `add_variable_from_reference`.

        All non-sharded variables are kept as-is. If none of the model variables
        are sharded, this reduces to `AutoTrackable._trackable_children()`.
        """
        if not hasattr(self, "_coalesced_children"):
            self._coalesced_children = self._no_dependency({})
        children = super()._trackable_children(save_type, **kwargs)
        for key, val in children.items():
            if key not in (
                "_variables",
                "_index_dict",
                "_learning_rate",
                "_iterations",
            ):
                new_val = val
                if isinstance(val, list):
                    sv_vals = []
                    for var in val:
                        if hasattr(var, "_sharded_container"):
                            sv = var._sharded_container()
                            # Use unique ids to check existence; `in` would
                            # attempt element-wise variable value comparison.
                            if not any(
                                sv._unique_id == other_sv._unique_id
                                for other_sv in sv_vals
                            ):
                                sv_vals.append(sv)
                        else:
                            sv_vals.append(var)
                    new_val = tf.__internal__.tracking.wrap(sv_vals)
                self._coalesced_children[key] = new_val
            else:
                self._coalesced_children[key] = val
        return self._coalesced_children

    def minimize(self, loss, var_list, tape=None):
        """Minimize `loss` by updating `var_list`.

        This method simply computes gradient using `tf.GradientTape` and calls
        `apply_gradients()`. If you want to process the gradient before applying
        then call `tf.GradientTape` and `apply_gradients()` explicitly instead
        of using this function.

        Args:
          loss: `Tensor` or callable. If a callable, `loss` should take no
            arguments and return the value to minimize.
          var_list: list or tuple of `Variable` objects to update to minimize
            `loss`, or a callable returning the list or tuple of `Variable`
            objects. Use callable when the variable list would otherwise be
            incomplete before `minimize` since the variables are created at the
            first time `loss` is called.
          tape: (Optional) `tf.GradientTape`.

        Returns:
          None
        """
        grads_and_vars = self.compute_gradients(loss, var_list, tape)
        self.apply_gradients(grads_and_vars)

    def _compute_current_learning_rate(self):
        if isinstance(
            self._learning_rate, learning_rate_schedule.LearningRateSchedule
        ):
            # Compute the current learning rate at the beginning of the
            # variable update.
            if hasattr(self, "_current_learning_rate"):
                self._current_learning_rate.assign(
                    self._learning_rate(self.iterations)
                )
            else:
                current_learning_rate = tf.convert_to_tensor(
                    self._learning_rate(self.iterations)
                )
                self._current_learning_rate = tf.Variable(
                    current_learning_rate,
                    name="current_learning_rate",
                    dtype=current_learning_rate.dtype,
                    trainable=False,
                )

    def exclude_from_weight_decay(self, var_list=None, var_names=None):
        """Exclude variables from weight decay.

        This method must be called before the optimizer's `build` method is
        called. You can set specific variables to exclude, or set a list of
        strings as the anchor words, if any of which appear in a variable's
        name, then the variable is excluded.

        Args:
            var_list: A list of `tf.Variable`s to exclude from weight decay.
            var_names: A list of strings. If any string in `var_names` appear
                in the model variable's name, then this model variable is
                excluded from weight decay. For example, `var_names=['bias']`
                excludes all bias variables from weight decay.
        """
        if hasattr(self, "_built") and self._built:
            raise ValueError(
                "`exclude_from_weight_decay()` can only be configured before "
                "the optimizer is built."
            )

        if var_list:
            self._exclude_from_weight_decay = [
                self._var_key(variable) for variable in var_list
            ]
        else:
            self._exclude_from_weight_decay = []
        self._exclude_from_weight_decay_names = var_names or []

    def _use_weight_decay(self, variable):
        exclude_from_weight_decay = getattr(
            self, "_exclude_from_weight_decay", []
        )
        exclude_from_weight_decay_names = getattr(
            self, "_exclude_from_weight_decay_names", []
        )
        variable_id = self._var_key(variable)
        for exclude_id in exclude_from_weight_decay:
            if variable_id == exclude_id:
                return False
        for name in exclude_from_weight_decay_names:
            if re.search(name, variable.name) is not None:
                return False
        return True

    def apply_gradients(self, grads_and_vars, name=None):
        """Apply gradients to variables.

        Args:
          grads_and_vars: List of `(gradient, variable)` pairs.
          name: string, defaults to None. The name of the namescope to
            use when creating variables. If None, `self.name` will be used.

        Returns:
          A `tf.Variable`, representing the current iteration.

        Raises:
          TypeError: If `grads_and_vars` is malformed.
        """
        self._compute_current_learning_rate()
        grads_and_vars = list(grads_and_vars)
        if len(grads_and_vars) == 0:
            # It is possible that the grad is empty. In this case,
            # `apply_gradients` is a no-op.
            return self._iterations
        grads, trainable_variables = zip(*grads_and_vars)
        scope_name = name or self.name or "optimizer"
        with tf.name_scope(scope_name):
            with tf.init_scope():
                # Lift variable creation to init scope to avoid environment
                # issues.
                self.build(trainable_variables)
            grads_and_vars = optimizer_utils.filter_empty_gradients(
                grads_and_vars
            )
            if len(list(grads_and_vars)) == 0:
                # Check again after filtering gradients.
                return self._iterations

            grads, trainable_variables = zip(*grads_and_vars)

            grads = self._clip_gradients(grads)
            grads = self._deduplicate_sparse_grad(grads)
            self._apply_weight_decay(trainable_variables)
            grads_and_vars = list(zip(grads, trainable_variables))
            iteration = self._internal_apply_gradients(grads_and_vars)

            # Apply variable constraints after applying gradients.
            for variable in trainable_variables:
                if variable.constraint is not None:
                    variable.assign(variable.constraint(variable))
            return iteration

    def _apply_weight_decay(self, variables):
        if self.weight_decay is None:
            return
        for variable in variables:
            if self._use_weight_decay(variable):
                lr = tf.cast(self.learning_rate, variable.dtype)
                wd = tf.cast(self.weight_decay, variable.dtype)
                variable.assign_sub(variable * wd * lr)

    def _internal_apply_gradients(self, grads_and_vars):
        """Helper function of apply gradients.

        This is required for separating out distributed training logic.

        Args:
          grads_and_vars: List of (gradient, variable) pairs.
        """
        if self.jit_compile:
            for grad, var in grads_and_vars:
                self._update_step_xla(grad, var, id(self._var_key(var)))
        else:
            for grad, var in grads_and_vars:
                self._update_step(grad, var)
        return self.iterations.assign_add(1)

    def _update_model_variables_moving_average(self, var_list):
        """Update the stored moving average using the latest value."""
        if self.use_ema:
            for var in var_list:
                average = self._model_variables_moving_average[
                    self._index_dict[self._var_key(var)]
                ]
                average.assign(
                    self.ema_momentum * average
                    + (1 - self.ema_momentum) * var
                )

    def _overwrite_model_variables_with_average_value(self, var_list):
        """Overwrite model variables with its moving average."""
        for var in var_list:
            average = self._model_variables_moving_average[
                self._index_dict[self._var_key(var)]
            ]
            var.assign(average)

    def finalize_variable_values(self, var_list):
        """Set the final value of model's trainable variables.

        Sometimes there are some extra steps before ending the variable updates,
        such as overriding the model variables with its average value.

        Args:
          var_list: list of model variables.
        """
        if self.use_ema:
            # If the optimizer uses EMA, then when finalizing, we replace the
            # model variable value with its moving average stored inside the
            # optimizer.
            self._overwrite_model_variables_with_average_value(var_list)

    def _serialize_hyperparameter(self, hyperparameter):
        """Serialize a hyperparameter that can be a numeric or callable."""
        if isinstance(
            hyperparameter, learning_rate_schedule.LearningRateSchedule
        ):
            return learning_rate_schedule.serialize(hyperparameter)
        if isinstance(hyperparameter, tf.Variable):
            return hyperparameter.numpy()
        if callable(hyperparameter):
            return hyperparameter()
        return hyperparameter

    def get_config(self):
        """Returns the config of the optimizer.

        An optimizer config is a Python dictionary (serializable)
        containing the configuration of an optimizer.
        The same optimizer can be reinstantiated later
        (without any saved state) from this configuration.

        Subclass optimizer should override this method to include other
        hyperparameters.

        Returns:
            Python dictionary.
        """
        config = {
            "name": self.name,
            "weight_decay": self.weight_decay,
            "clipnorm": self.clipnorm,
            "global_clipnorm": self.global_clipnorm,
            "clipvalue": self.clipvalue,
            "use_ema": self.use_ema,
            "ema_momentum": self.ema_momentum,
            "ema_overwrite_frequency": self.ema_overwrite_frequency,
            "jit_compile": self.jit_compile,
            "is_legacy_optimizer": False,
        }
        return config

    @classmethod
    def from_config(cls, config, custom_objects=None):
        """Creates an optimizer from its config.

        This method is the reverse of `get_config`, capable of instantiating the
        same optimizer from the config dictionary.

        Args:
            config: A Python dictionary, typically the output of get_config.
            custom_objects: A Python dictionary mapping names to additional
              user-defined Python objects needed to recreate this optimizer.

        Returns:
            An optimizer instance.
        """
        if "learning_rate" in config:
            if isinstance(config["learning_rate"], dict):
                config["learning_rate"] = learning_rate_schedule.deserialize(
                    config["learning_rate"], custom_objects=custom_objects
                )
        return cls(**config)

    @property
    def variables(self):
        """Returns variables of this optimizer."""
        return CallableList(self._variables)

    def set_weights(self, weights):
        """Set the weights of the optimizer.

        Args:
            weights: a list of `tf.Variable`s or numpy arrays, the target values
                of optimizer variables. It should have the same order as
                `self._variables`.
        """
        if not getattr(self, "_built", False):
            raise ValueError(
                "You are calling `set_weights()` on an optimizer that has not "
                "yet been built. Please call "
                "`optimizer.build(trainable_variables)` to create the "
                "optimizer weights before calling `set_weights()`."
            )
        for variable, weight in zip(self._variables, weights):
            if variable.shape != weight.shape:
                raise ValueError(
                    f"Optimizer variable {self._var_key(variable)} has shape "
                    f"{str(variable.shape)} not compatible with provided "
                    f"weight shape {str(weight.shape)}."
                )
            variable.assign(weight)

    def save_own_variables(self, store):
        """Get the state of this optimizer object."""
        for i, variable in enumerate(self.variables):
            store[str(i)] = variable.numpy()

    def load_own_variables(self, store):
        """Set the state of this optimizer object."""
        if len(store.keys()) != len(self.variables):
            msg = (
                f"Skipping variable loading for optimizer '{self.name}', "
                f"because it has {len(self.variables)} variables whereas "
                f"the saved optimizer has {len(store.keys())} variables. "
            )
            if len(self.variables) == 0:
                msg += (
                    "This is likely because the optimizer has not been "
                    "called/built yet."
                )
            logging.warning(msg)
            return
        for i, variable in enumerate(self.variables):
            variable.assign(store[str(i)])


base_optimizer_keyword_args = """name: String. The name to use
          for momentum accumulator weights created by
          the optimizer.
      weight_decay: Float, defaults to None. If set, weight decay is applied.
      clipnorm: Float. If set, the gradient of each weight is individually
          clipped so that its norm is no higher than this value.
      clipvalue: Float. If set, the gradient of each weight is clipped to be no
          higher than this value.
      global_clipnorm: Float. If set, the gradient of all weights is clipped so
          that their global norm is no higher than this value.
      use_ema: Boolean, defaults to False. If True, exponential moving average
          (EMA) is applied. EMA consists of computing an exponential moving
          average of the weights of the model (as the weight values change after
          each training batch), and periodically overwriting the weights with
          their moving average.
      ema_momentum: Float, defaults to 0.99. Only used if `use_ema=True`.
          This is the momentum to use when computing
          the EMA of the model's weights:
          `new_average = ema_momentum * old_average + (1 - ema_momentum) *
          current_variable_value`.
      ema_overwrite_frequency: Int or None, defaults to None. Only used if
          `use_ema=True`. Every `ema_overwrite_frequency` steps of iterations,
          we overwrite the model variable by its moving average.
          If None, the optimizer
          does not overwrite model variables in the middle of training, and you
          need to explicitly overwrite the variables at the end of training
          by calling `optimizer.finalize_variable_values()`
          (which updates the model
          variables in-place). When using the built-in `fit()` training loop,
          this happens automatically after the last epoch,
          and you don't need to do anything.
      jit_compile: Boolean, defaults to True.
          If True, the optimizer will use XLA
          compilation. If no GPU device is found, this flag will be ignored.
      mesh: optional `tf.experimental.dtensor.Mesh` instance. When provided,
          the optimizer will be run in DTensor mode, e.g. state
          tracking variable will be a DVariable, and aggregation/reduction will
          happen in the global DTensor context.
      **kwargs: keyword arguments only used for backward compatibility."""


@keras_export(
    "keras.optimizers.Optimizer",
    "keras.optimizers.experimental.Optimizer",
    v1=[],
)
class Optimizer(_BaseOptimizer):
    """Abstract optimizer base class.

    optimizer, please subclass this class instead of _BaseOptimizer.

    Args:
      {{base_optimizer_keyword_args}}

    ### Usage

    ```python
    # Create an optimizer with the desired parameters.
    opt = keras.optimizers.SGD(learning_rate=0.1)
    var1, var2 = tf.Variable(1.0), tf.Variable(2.0)
    # `loss` is a callable that takes no argument and returns the value
    # to minimize.
    loss = lambda: 3 * var1 * var1 + 2 * var2 * var2
    # Call minimize to update the list of variables.
    opt.minimize(loss, var_list=[var1, var2])
    ```

    ### Processing gradients before applying them

    Calling `minimize()` takes care of both computing the gradients and
    applying them to the variables. If you want to process the gradients
    before applying them you can instead use the optimizer in three steps:

    1.  Compute the gradients with `tf.GradientTape`.
    2.  Process the gradients as you wish.
    3.  Apply the processed gradients with `apply_gradients()`.

    Example:

    ```python
    # Create an optimizer.
    opt = tf.keras.optimizers.experimental.SGD(learning_rate=0.1)
    var1, var2 = tf.Variable(1.0), tf.Variable(2.0)

    # Compute the gradients for a list of variables.
    with tf.GradientTape() as tape:
      loss = 3 * var1 * var1 + 2 * var2 * var2
    grads = tape.gradient(loss, [var1, var2])

    # Process the gradients.
    grads[0] = grads[0] + 1

    # Ask the optimizer to apply the gradients on variables.
    opt.apply_gradients(zip(grads, [var1, var2]))
    ```

    ### Dynamic learning rate

    Dynamic learning rate can be achieved by setting learning rate as a built-in
    or customized `tf.keras.optimizers.schedules.LearningRateSchedule`.

    Example:

    >>> var = tf.Variable(np.random.random(size=(1,)))
    >>> learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
    ...   initial_learning_rate=.01, decay_steps=20, decay_rate=.1)
    >>> opt = tf.keras.optimizers.experimental.SGD(learning_rate=learning_rate)
    >>> loss = lambda: 3 * var
    >>> opt.minimize(loss, var_list=[var])

    ### Gradients clipping

    Users can clip the gradients before applying to variables by setting
    `clipnorm`, `clipvalue` and `global_clipnorm`. Notice that `clipnorm` and
    `global_clipnorm` can only have one being set.

    Example:

    >>> opt = tf.keras.optimizers.experimental.SGD(learning_rate=1, clipvalue=1)
    >>> var1, var2 = tf.Variable(2.0), tf.Variable(2.0)
    >>> with tf.GradientTape() as tape:
    ...   loss = 2 * var1 + 2 * var2
    >>> grads = tape.gradient(loss, [var1, var2])
    >>> print([grads[0].numpy(), grads[1].numpy()])
    [2.0, 2.0]
    >>> opt.apply_gradients(zip(grads, [var1, var2]))
    >>> # Without clipping, we should get [0, 0], but as gradients are clipped
    >>> # to have max value 1, we get [1.0, 1.0].
    >>> print([var1.numpy(), var2.numpy()])
    [1.0, 1.0]

    ### Using weight decay.

    Weight decay in certain scenarios can boost the model's performance. Keras
    has built-in support for weight decay in all optimizers. Users can apply
    weight decay by setting `weight_decay` argument.

    >>> opt = tf.keras.optimizers.experimental.SGD(1, weight_decay=0.004)
    >>> grads, var1, var2 = tf.zeros(()), tf.Variable(2.0), tf.Variable(2.0)
    >>> # You can exclude variables from weight decay, in this case we
    >>> # exclude `var2`.
    >>> opt.exclude_from_weight_decay(var_list=[var2])
    >>> opt.apply_gradients(zip([grads, grads], [var1, var2]))
    >>> print([var1.numpy(), var2.numpy()])
    [1.992, 2.0]


    ### Using exponential moving average.

    Empirically it has been found that using the exponential moving average
    (EMA) of the trained parameters of a deep network achieves a better
    performance than using its trained parameters directly. TF-Keras optimizers
    allows users to compute this moving average and overwrite the model
    variables at desired time.

    Example:

    ```python
    # Create an SGD optimizer with EMA on. `ema_momentum` controls the decay
    # rate of the moving average. `ema_momentum=1` means no decay and the stored
    # moving average is always model variable's initial value before training.
    # Reversely, `ema_momentum=0` is equivalent to not using EMA.
    # `ema_overwrite_frequency=3` means every 3 iterations, we overwrite the
    # trainable variables with their moving average values.
    opt = tf.keras.optimizers.experimental.SGD(
        learning_rate=1,
        use_ema=True,
        ema_momentum=0.5,
        ema_overwrite_frequency=3)
    var1, var2 = tf.Variable(2.0), tf.Variable(2.0)
    with tf.GradientTape() as tape:
      loss = var1 + var2
    grads = tape.gradient(loss, [var1, var2])
    # First iteration: [var1, var2] = [1.0, 1.0]
    opt.apply_gradients(zip(grads, [var1, var2]))
    print([var1, var2])

    # Second iteration: [var1, var2] = [0.0, 0.0]
    opt.apply_gradients(zip(grads, [var1, var2]))
    print([var1, var2])

    # Third iteration, without EMA, we should see [var1, var2] = [-1.0, -1.0],
    # but overwriting results in [var1, var2] = [-0.125, -0.125]. The full
    # calculation for the moving average of var1 is:
    # var1=2*0.5**3+1*(1-0.5)*0.5**2+0*(1-0.5)*0.5**1+(-1)*(1-0.5)=-0.125.
    opt.apply_gradients(zip(grads, [var1, var2]))
    print([var1, var2])

    ```
    When optimizer is constructed with `use_ema=True`, in custom training loop,
    users can explicitly call `finalize_variable_values()` to overwrite
    trainable variables with their EMA values. `finalize_variable_values()` is
    by default called at the end of `model.fit()`.

    ### Use with `tf.distribute.Strategy`

    This optimizer class is `tf.distribute.Strategy` aware, which means it
    automatically sums gradients across all replicas. To aggregate gradients
    yourself, call `apply_gradients` with `skip_gradients_aggregation` set to
    True.  This is useful if you need to process aggregated gradients.

    ```python
    # This example is not runnable, it consists of dummy code for simple
    # tutorial.
    strategy = tf.distribute.experimental.TPUStrategy()

    with strategy.scope():
      opt = tf.keras.optimizers.experimental.SGD()
      model = magic_function_that_returns_model()
      gradients = magic_function_that_returns_gradients()
      # Custom logic to aggregate gradients.
      gradients = strategy.reduce("SUM", gradients, axis=None)
      opt.apply_gradients(zip(gradients, model.trainable_variables),
          skip_gradients_aggregation=True)
    ```

    ### Creating a custom optimizer

    If you intend to create your own optimization algorithm, please inherit from
    this class and override the following methods:

      - `build`: Create your optimizer-related variables, such as `momentums` in
        SGD optimizer.
      - `update_step`: Implement your optimizer's updating logic.
      - `get_config`: serialization of the optimizer, include all hyper
        parameters.

    Your optimizer would automatically be compatible with tensorflow distributed
    training if you subclass `optimizer_experimental.Optimizer`.

    """

    def __init__(
        self,
        name,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        jit_compile=True,
        **kwargs,
    ):
        """Create a new Optimizer."""
        mesh = kwargs.pop("mesh", None)
        self._mesh = mesh
        super().__init__(
            name,
            weight_decay,
            clipnorm,
            clipvalue,
            global_clipnorm,
            use_ema,
            ema_momentum,
            ema_overwrite_frequency,
            jit_compile,
            **kwargs,
        )
        self._distribution_strategy = tf.distribute.get_strategy()
        self._run_with_dtensor = dtensor_utils.running_with_dtensor_strategy()

    def add_variable_from_reference(
        self, model_variable, variable_name, shape=None, initial_value=None
    ):
        if self._mesh:
            if initial_value is None:
                # Use tf.zeros_like which will propagate the layout information
                # from the model weights, if any.
                initial_value = tf.zeros_like(model_variable)
            elif isinstance(initial_value, tf.Tensor):
                initial_value = tf.experimental.dtensor.copy_to_mesh(
                    initial_value,
                    tf.experimental.dtensor.Layout.replicated(
                        self._mesh, rank=initial_value.shape.rank
                    ),
                )
            if hasattr(model_variable, "true_dtype"):
                dtype = model_variable.true_dtype
            else:
                dtype = model_variable.dtype
            variable = tf.experimental.dtensor.DVariable(
                initial_value=initial_value,
                name=f"{variable_name}/{model_variable._shared_name}",
                dtype=dtype,
                trainable=False,
            )
            self._variables.append(variable)
            return variable
        else:
            strategy = tf.distribute.get_strategy()
            with strategy.extended.colocate_vars_with(model_variable):
                return super().add_variable_from_reference(
                    model_variable, variable_name, shape, initial_value
                )

    def _create_iteration_variable(self):
        if self._mesh:
            init_val = tf.constant(0, dtype=tf.int64)
            init_val = tf.experimental.dtensor.copy_to_mesh(
                init_val,
                tf.experimental.dtensor.Layout.replicated(self._mesh, rank=0),
            )
            with tf.init_scope():
                # Lift the variable creation to init scope to avoid environment
                # issues.
                self._iterations = tf.experimental.dtensor.DVariable(
                    init_val, name="iteration"
                )
            self._variables.append(self._iterations)
        else:
            super()._create_iteration_variable()

    def _var_key(self, variable):
        """Get a unique identifier of the given variable."""
        # Get the distributed variable if it is available to replicate the
        # behavior for distributed variables and their handles.
        if hasattr(variable, "_distributed_container"):
            variable = variable._distributed_container()
        elif (
            tf_utils.is_extension_type(variable)
            and hasattr(variable, "handle")
            and hasattr(variable.handle, "_distributed_container")
        ):
            # For ResourceVariables, the _distributed_container attribute
            # is added to their handle and not the variable itself.
            variable = variable.handle._distributed_container()
        return super()._var_key(variable)

    def aggregate_gradients(self, grads_and_vars):
        """Aggregate gradients on all devices.

        By default, we will perform reduce_sum of gradients across devices.
        Users can implement their own aggregation logic by overriding this
        method.

        Args:
          grads_and_vars: List of (gradient, variable) pairs.

        Returns:
          List of (gradient, variable) pairs.
        """
        if self._mesh or self._run_with_dtensor:
            logging.warning(
                "Calling aggregate_gradients is unnecessary when the model "
                "is used with DTensor, which includes aggregation of "
                "replicated gradients as part of backward pass."
            )
            return grads_and_vars
        else:
            return optimizer_utils.all_reduce_sum_gradients(grads_and_vars)

    def apply_gradients(
        self,
        grads_and_vars,
        name=None,
        skip_gradients_aggregation=False,
        **kwargs,
    ):
        """Apply gradients to variables.

        Args:
          grads_and_vars: List of `(gradient, variable)` pairs.
          name: string, defaults to None. The name of the namescope to
            use when creating variables. If None, `self.name` will be used.
          skip_gradients_aggregation: If true, gradients aggregation will not be
            performed inside optimizer. Usually this arg is set to True when you
            write custom code aggregating gradients outside the optimizer.
          **kwargs: keyword arguments only used for backward compatibility.

        Returns:
          A `tf.Variable`, representing the current iteration.

        Raises:
          TypeError: If `grads_and_vars` is malformed.
          RuntimeError: If called in a cross-replica context.
        """
        if self._mesh or self._run_with_dtensor:
            # Skip any usage of strategy logic for DTensor.
            return super().apply_gradients(grads_and_vars, name=name)

        # `experimental_aggregate_gradients` is an arg in the v2 optimizer's
        # `apply_gradients`; it is the reverse of `skip_gradients_aggregation`
        # and is read from kwargs for backward compatibility.
        experimental_aggregate_gradients = kwargs.pop(
            "experimental_aggregate_gradients", True
        )
        if not skip_gradients_aggregation and experimental_aggregate_gradients:
            grads_and_vars = self.aggregate_gradients(grads_and_vars)
        return super().apply_gradients(grads_and_vars, name=name)

    def _apply_weight_decay(self, variables):
        if self.weight_decay is None:
            return

        def distributed_apply_weight_decay(distribution, variables, **kwargs):
            def weight_decay_fn(variable):
                if self._use_weight_decay(variable):
                    lr = tf.cast(self.learning_rate, variable.dtype)
                    wd = tf.cast(self.weight_decay, variable.dtype)
                    variable.assign_sub(variable * wd * lr)

            for variable in variables:
                distribution.extended.update(
                    variable, weight_decay_fn, group=False
                )

        tf.__internal__.distribute.interim.maybe_merge_call(
            distributed_apply_weight_decay,
            self._distribution_strategy,
            variables,
        )

    def _internal_apply_gradients(self, grads_and_vars):
        if self._mesh or self._run_with_dtensor:
            # Skip any usage of strategy logic for DTensor.
            return super()._internal_apply_gradients(grads_and_vars)

        return tf.__internal__.distribute.interim.maybe_merge_call(
            self._distributed_apply_gradients_fn,
            self._distribution_strategy,
            grads_and_vars,
        )

    def _overwrite_model_variables_with_average_value(self, var_list):
        """Overwrite model variables with their moving average values.

        This function overwrites variables on each device.
        Args:
          var_list: list of model variables.
        """
        if self._mesh or self._run_with_dtensor:
            # Skip any usage of strategy logic for DTensor.
            return super()._overwrite_model_variables_with_average_value(
                var_list
            )

        strategy = self._distribution_strategy
        # Override model variables with the stored average value on all
        # devices.
        for var in var_list:
            average = self._model_variables_moving_average[
                self._index_dict[self._var_key(var)]
            ]
            strategy.extended.update(
                var, lambda a, b: a.assign(b), args=(average,)
            )

    def _build_learning_rate(self, learning_rate):
        if not self._mesh:
            return super()._build_learning_rate(learning_rate)

        # For DTensor, create a DVariable replicated on the mesh.
        variable_creation = tf.experimental.dtensor.DVariable
        init_value_convert_fn = lambda x: tf.experimental.dtensor.copy_to_mesh(
            x, tf.experimental.dtensor.Layout.replicated(self._mesh, rank=0)
        )
        if isinstance(
            learning_rate, learning_rate_schedule.LearningRateSchedule
        ):
            current_learning_rate = init_value_convert_fn(
                tf.convert_to_tensor(learning_rate(self.iterations))
            )
            self._current_learning_rate = variable_creation(
                current_learning_rate,
                name="current_learning_rate",
                dtype=tf.float32,
            )
            return learning_rate

        return variable_creation(
            init_value_convert_fn(tf.cast(learning_rate, tf.float32)),
            name="learning_rate",
            dtype=backend.floatx(),
            trainable=False,
        )

    def _update_model_variables_moving_average(self, var_list):
        """Update the stored moving average using the latest value."""
        if self.use_ema:

            def update_average(average, var):
                average.assign(
                    self.ema_momentum * average
                    + (1 - self.ema_momentum) * var
                )

            for var in var_list:
                average = self._model_variables_moving_average[
                    self._index_dict[self._var_key(var)]
                ]
                self._distribution_strategy.extended.update(
                    average, update_average, args=(var,), group=False
                )

    def _distributed_apply_gradients_fn(
        self, distribution, grads_and_vars, **kwargs
    ):
        """`apply_gradients` using a `DistributionStrategy`."""

        def apply_grad_to_update_var(var, grad):
            if self.jit_compile:
                return self._update_step_xla(grad, var, id(self._var_key(var)))
            else:
                return self._update_step(grad, var)

        for grad, var in grads_and_vars:
            distribution.extended.update(
                var, apply_grad_to_update_var, args=(grad,), group=False
            )

        if self.use_ema:
            _, var_list = zip(*grads_and_vars)
            self._update_model_variables_moving_average(var_list)
            if self.ema_overwrite_frequency:
                # Only when `ema_overwrite_frequency` is not None do we
                # overwrite the model variables.
                should_overwrite_model_vars = (
                    self.iterations + 1
                ) % self.ema_overwrite_frequency == 0
                tf.cond(
                    tf.cast(should_overwrite_model_vars, tf.bool),
                    true_fn=lambda: self._overwrite_model_variables_with_average_value(  # noqa: E501
                        var_list
                    ),
                    false_fn=lambda: None,
                )
        return self.iterations.assign_add(1)


class RestoredOptimizer(Optimizer):
    def __init__(self):
        super().__init__("RestoredOptimizer")

    def get_config(self):
        raise NotImplementedError(
            "Restoring functional Optimizers from SavedModels is not currently"
            " supported. Please file a feature request if this limitation "
            "bothers you."
        )


class CallableList(list):
    """Temporary shim to support both `opt.variables()` and `opt.variables`."""

    def __call__(self):
        return self


class _ShardedVariableBuilder:
    """Accumulate variable shards into a `ShardedVariable`."""

    def __init__(self, num_shards):
        self.shards = [None] * num_shards

    def add_shard(self, shard):
        # Get the shard index from the variable name, e.g. `.../part_3:0`.
        shard_idx = int(shard.name.split("part_")[-1].split(":")[0])
        if self.shards[shard_idx] is None:
            self.shards[shard_idx] = shard
        else:
            raise ValueError(
                "Cannot add duplicate optimizer variable from "
                f"shard variable {shard.name}"
            )

    def has_all_shards(self):
        return all([shard is not None for shard in self.shards])

    def build(self):
        return tf.__internal__.distribute.ShardedVariable(self.shards)


# Register the Optimizer revived type so it can be restored from SavedModels.
# If a type with the same name was already registered (e.g. by another Keras
# installation in the same environment), fall back to a TF-Keras-specific name.
try:
    tf.__internal__.saved_model.load.register_revived_type(
        "experimentalOptimizer",
        lambda obj: isinstance(obj, Optimizer),
        versions=[
            tf.__internal__.saved_model.load.VersionedTypeRegistration(
                object_factory=lambda proto: RestoredOptimizer(),
                version=2,
                min_producer_version=1,
                min_consumer_version=1,
            )
        ],
    )
except AssertionError:
    tf.__internal__.saved_model.load.register_revived_type(
        "tf_keras_experimentalOptimizer",
        lambda obj: isinstance(obj, Optimizer),
        versions=[
            tf.__internal__.saved_model.load.VersionedTypeRegistration(
                object_factory=lambda proto: RestoredOptimizer(),
                version=2,
                min_producer_version=1,
                min_consumer_version=1,
            )
        ],
    )

Optimizer.__doc__ = Optimizer.__doc__.replace(
    "{{base_optimizer_keyword_args}}", base_optimizer_keyword_args
)