
    hhM6                         d Z ddlZddlZddlZddlmZ ddlmZ d Z	 ed       G d dej                               Z ed	      dd
       Z ed      	 	 	 	 	 	 dd       Zy)ai  Utilities for preprocessing sequence data.

Deprecated: `tf.keras.preprocessing.sequence` APIs are not recommended for new
code. Prefer `tf.keras.utils.timeseries_dataset_from_array` and
the `tf.data` APIs which provide a much more flexible mechanisms for dealing
with sequences. See the [tf.data guide](https://www.tensorflow.org/guide/data)
for more details.
    N)
data_utils)keras_exportc                     g g }}t        ||      D ]6  \  }}t        |      | k  s|j                  |       |j                  |       8 ||fS )aC  Removes sequences that exceed the maximum length.

    Args:
        maxlen: Int, maximum length of the output sequences.
        seq: List of lists, where each sublist is a sequence.
        label: List where each element is an integer.

    Returns:
        new_seq, new_label: shortened lists for `seq` and `label`.
    )ziplenappend)maxlenseqlabelnew_seq	new_labelxys          a/var/www/html/dev/engine/venv/lib/python3.12/site-packages/tf_keras/src/preprocessing/sequence.py_remove_long_seqr   $   sX     RYGC  1q6F?NN1Q  I    z0keras.preprocessing.sequence.TimeseriesGeneratorc                   >    e Zd ZdZ	 	 	 	 	 	 	 ddZd Zd Zd Zd Zy)	TimeseriesGeneratora$  Utility class for generating batches of temporal data.

    Deprecated: `tf.keras.preprocessing.sequence.TimeseriesGenerator` does not
    operate on tensors and is not recommended for new code. Prefer using a
    `tf.data.Dataset` which provides a more efficient and flexible mechanism for
    batching, shuffling, and windowing input. See the
    [tf.data guide](https://www.tensorflow.org/guide/data) for more details.

    This class takes in a sequence of data-points gathered at
    equal intervals, along with time series parameters such as
    stride, length of history, etc., to produce batches for
    training/validation.

    Arguments:
        data: Indexable generator (such as list or Numpy array)
            containing consecutive data points (timesteps).
            The data should be at 2D, and axis 0 is expected
            to be the time dimension.
        targets: Targets corresponding to timesteps in `data`.
            It should have same length as `data`.
        length: Length of the output sequences (in number of timesteps).
        sampling_rate: Period between successive individual timesteps
            within sequences. For rate `r`, timesteps
            `data[i]`, `data[i-r]`, ... `data[i - length]`
            are used for create a sample sequence.
        stride: Period between successive output sequences.
            For stride `s`, consecutive output samples would
            be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
        start_index: Data points earlier than `start_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation.
        end_index: Data points later than `end_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation.
        shuffle: Whether to shuffle output samples,
            or instead draw them in chronological order.
        reverse: Boolean: if `true`, timesteps in each output sample will be
            in reverse chronological order.
        batch_size: Number of timeseries samples in each batch
            (except maybe the last one).

    Returns:
        A [Sequence](
        https://www.tensorflow.org/api_docs/python/tf/tf_keras/utils/Sequence)
        instance.

    Examples:
        ```python
        from tf_keras.src.preprocessing.sequence import TimeseriesGenerator
        import numpy as np
        data = np.array([[i] for i in range(50)])
        targets = np.array([[i] for i in range(50)])
        data_gen = TimeseriesGenerator(data, targets,
                                       length=10, sampling_rate=2,
                                       batch_size=2)
        assert len(data_gen) == 20
        batch_0 = data_gen[0]
        x, y = batch_0
        assert np.array_equal(x,
                              np.array([[[0], [2], [4], [6], [8]],
                                        [[1], [3], [5], [7], [9]]]))
        assert np.array_equal(y,
                              np.array([[10], [11]]))
        ```
    Nc                    t        |      t        |      k7  r)t        ddt        |       z   dt        |       z         || _        || _        || _        || _        || _        ||z   | _        |t        |      dz
  }|| _        || _	        |	| _
        |
| _        | j                  | j                  kD  r$t        d| j                  | j                  fz        y )NzData and targets have to bez  of same length. Data length is z while target length is    zz`start_index+length=%i > end_index=%i` is disallowed, as no part of the sequence would be left to be used as current step.)r   
ValueErrordatatargetslengthsampling_ratestridestart_index	end_indexshufflereverse
batch_size)selfr   r   r   r   r   r   r   r   r    r!   s              r   __init__zTimeseriesGenerator.__init__{   s     t9G$-4SYK@A,S\N;<  	*&/D	AI"$dnn,< ##T^^45  -r   c                     | j                   | j                  z
  | j                  | j                  z  z   | j                  | j                  z  z  S )N)r   r   r!   r   )r"   s    r   __len__zTimeseriesGenerator.__len__   s@    NNT---$++0MMoo+- 	-r   c           	         | j                   rDt        j                  j                  | j                  | j
                  dz   | j                        }n{| j                  | j                  | j                  z  |z  z   }t        j                  |t        || j                  | j                  z  z   | j
                  dz         | j                        }t        j                  |D cg c]+  }| j                  || j                  z
  || j                     - c}      }t        j                  |D cg c]  }| j                  |    c}      }| j                  r|d d d d ddf   |fS ||fS c c}w c c}w )Nr   )size.)r   nprandomrandintr   r   r!   r   arangeminarrayr   r   r   r   r    )r"   indexrowsirowsamplesr   s          r   __getitem__zTimeseriesGenerator.__getitem__   s>   <<99$$  $..1"44?? % D   4??T[[#@5#HHA99A$++55t~~7IJD ((   		#+cD4F4FFG
 ((>#DLL->?<<1ddC<('11
 ?s   0E%)E*c                    | j                   }t        | j                         j                  t        j                  k(  r| j                   j                         }	 t        j                  |      }| j                  }t        | j                        j                  t        j                  k(  r| j                  j                         }	 t        j                  |      }||| j                  | j                  | j                  | j                  | j                  | j                  | j                   | j"                  d
S # t        $ r}t        d|      |d}~ww xY w# t        $ r}t        d|      |d}~ww xY w)zReturns the TimeseriesGenerator configuration as Python dictionary.

        Returns:
            A Python dictionary with the TimeseriesGenerator configuration.
        zData not JSON Serializable:NzTargets not JSON Serializable:)
r   r   r   r   r   r   r   r   r    r!   )r   type
__module__r)   __name__tolistjsondumps	TypeErrorr   r   r   r   r   r   r   r    r!   )r"   r   	json_dataer   json_targetss         r   
get_configzTimeseriesGenerator.get_config   s.    yy		?%%499##%D	H

4(I ,,((BKK7ll))+G	N::g.L
 #kk!//kk++||||//
 	
  	H94@aG	H  	N<gFAM	Ns0   D6 E 6	E?EE	E/E**E/c                     | j                         }| j                  j                  |d}t        j                  |fi |S )a  Returns a JSON string containing the generator's configuration.

        Args:
            **kwargs: Additional keyword arguments to be passed
                to `json.dumps()`.

        Returns:
            A JSON string containing the tokenizer configuration.
        )
class_nameconfig)r@   	__class__r8   r:   r;   )r"   kwargsrC   timeseries_generator_configs       r   to_jsonzTimeseriesGenerator.to_json   s?     "..11'
# zz5@@@r   )r   r   r   NFF   )	r8   r7   __qualname____doc__r#   r%   r4   r@   rG    r   r   r   r   7   s<    @N 'R-
 2!
FAr   r   z0keras.preprocessing.sequence.make_sampling_tablec                     d}t        j                  |       }d|d<   |t        j                  |      |z   z  dz   dd|z  z  z
  }||z  }t        j                  d|t        j                  |      z        S )a2  Generates a word rank-based probabilistic sampling table.

    Used for generating the `sampling_table` argument for `skipgrams`.
    `sampling_table[i]` is the probability of sampling
    the word i-th most common word in a dataset
    (more common words should be sampled less frequently, for balance).

    The sampling probabilities are generated according
    to the sampling distribution used in word2vec:

    ```
    p(word) = (min(1, sqrt(word_frequency / sampling_factor) /
        (word_frequency / sampling_factor)))
    ```

    We assume that the word frequencies follow Zipf's law (s=1) to derive
    a numerical approximation of frequency(rank):

    `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
    where `gamma` is the Euler-Mascheroni constant.

    Args:
        size: Int, number of possible words to sample.
        sampling_factor: The sampling factor in the word2vec formula.

    Returns:
        A 1D Numpy array of length `size` where the ith entry
        is the probability that a word of rank i should be sampled.
    gX9v?r   r   g      ?      ?g      (@)r)   r,   logminimumsqrt)r'   sampling_factorgammarankinv_fqfs         r   make_sampling_tablerV      sp    > E99T?DDGRVVD\E)*S03$+3FFF& A::c1rwwqz>**r   z&keras.preprocessing.sequence.skipgramsc                    g }g }	t        |       D ]  \  }
}|s	|||   t        j                         k  r&t        d|
|z
        }t        t	        |       |
|z   dz         }t        ||      D ]J  }||
k7  s	| |   }|s|j                  ||g       |r|	j                  ddg       :|	j                  d       L  |dkD  rt        t	        |	      |z        }|D cg c]  }|d   	 }}t        j                  |       |t        |      D 
cg c]5  }
||
t	        |      z     t        j                  dt        |dz
              g7 c}
z  }|r|	ddgg|z  z  }	n	|	dg|z  z  }	|ru|t        j                  dt        d            }t        j                  |       t        j                  |       t        j                  |       t        j                  |	       ||	fS c c}w c c}
w )as  Generates skipgram word pairs.

    This function transforms a sequence of word indexes (list of integers)
    into tuples of words of the form:

    - (word, word in the same window), with label 1 (positive samples).
    - (word, random word from the vocabulary), with label 0 (negative samples).

    Read more about Skipgram in this gnomic paper by Mikolov et al.:
    [Efficient Estimation of Word Representations in
    Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)

    Args:
        sequence: A word sequence (sentence), encoded as a list
            of word indices (integers). If using a `sampling_table`,
            word indices are expected to match the rank
            of the words in a reference dataset (e.g. 10 would encode
            the 10-th most frequently occurring token).
            Note that index 0 is expected to be a non-word and will be skipped.
        vocabulary_size: Int, maximum possible word index + 1
        window_size: Int, size of sampling windows (technically half-window).
            The window of a word `w_i` will be
            `[i - window_size, i + window_size+1]`.
        negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
            1 for same number as positive samples.
        shuffle: Whether to shuffle the word couples before returning them.
        categorical: bool. if False, labels will be
            integers (eg. `[0, 1, 1 .. ]`),
            if `True`, labels will be categorical, e.g.
            `[[1,0],[0,1],[0,1] .. ]`.
        sampling_table: 1D array of size `vocabulary_size` where the entry i
            encodes the probability to sample a word of rank i.
        seed: Random seed.

    Returns:
        couples, labels: where `couples` are int pairs and
            `labels` are either 0 or 1.

    Note:
        By convention, index 0 in the vocabulary is
        a non-word and will be skipped.
    r   r   g    cA)	enumerater*   maxr-   r   ranger   intr   r+   seed)sequencevocabulary_sizewindow_sizenegative_samplesr   categoricalsampling_tabler\   coupleslabelsr1   wiwindow_start
window_endjwjnum_negative_samplescwordss                      r   	skipgramsrm     s   j GF8$ %2%b!FMMO31a+o.XK!(;<
|Z0 		%AAva[Bx(MM1a&)MM!$		%%( !"3v;1A#AB&'!1''u/0
 1s5z>"FNN1c/A:M6N$OP
 	
 1vh!555Fqc000F<>>!SY/DDwDvF?) (
s   G:G")gh㈵>)   rM   TFNN)rJ   r:   r*   numpyr)   tf_keras.src.utilsr    tensorflow.python.util.tf_exportr   r   Sequencer   rV   rm   rK   r   r   <module>rs      s       ) :& @A|A*-- |A B|A~ @A$+ B$+N 67 	` 8`r   