
    |h$                         d dl  d dlmZ d dlmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlZd dlZ G d d	e	j                        Zy)
    )*)audio)Path)UnionList)nn)perf_counterNc                        e Zd Zddeeej                  f   deeef   f fdZdej                  fdZ
edefd       Zddej                  fd	Zd
eej                     fdZ xZS )VoiceEncoderdeviceweights_fpathc                 Z   t         |           t        j                  t        t
        t        d      | _        t        j                  t
        t              | _
        t        j                         | _        |6t        j                  t        j                  j!                         rdnd      }n%t#        |t$              rt        j                  |      }|| _        |7t'        t(              j+                         j,                  j/                  d      }nt'        |      }|j1                         st3        d|z        t5               }t        j6                  |d      }| j9                  |d	   d
       | j;                  |       |r&t=        d|j>                  t5               |z
  fz         yy)a6  
        If None, defaults to cuda if it is available on your machine, otherwise the model will
        run on cpu. Outputs are always returned on the cpu, as numpy arrays.
        :param weights_fpath: path to "<CUSTOM_MODEL>.pt" file path.
        If None, defaults to built-in "pretrained.pt" model
        T)batch_firstNcudacpuzpretrained.ptz7Couldn't find the voice encoder pretrained model at %s.)map_locationmodel_stateF)strictz5Loaded the voice encoder model on %s in %.2f seconds.) super__init__r   LSTMmel_n_channelsmodel_hidden_sizemodel_num_layerslstmLinearmodel_embedding_sizelinearReLUrelutorchr   r   is_available
isinstancestrr   __file__resolveparentjoinpathexists	Exceptiontimerloadload_state_dicttoprinttype)selfr   verboser   start
checkpoint	__class__s         X/var/www/html/test/engine/venv/lib/python3.12/site-packages/resemblyzer/voice_encoder.pyr   zVoiceEncoder.__init__   sN    	 GGN,=?O]ab	ii 13GHGGI	 >\\EJJ,C,C,E&5QF$\\&)F   N224;;DD_UM /M##%U)* + +ZZEB
Z6uEI;;%01 2     melsc                     | j                  |      \  }\  }}| j                  | j                  |d               }|t        j                  |dd      z  S )a  
        Computes the embeddings of a batch of utterance spectrograms.

        :param mels: a batch of mel spectrograms of same duration as a float32 tensor of shape
        (batch_size, n_frames, n_channels)
        :return: the embeddings as a float 32 tensor of shape (batch_size, embedding_size).
        Embeddings are positive and L2-normed, thus they lay in the range [0, 1].
           T)dimkeepdim)r   r    r   r!   norm)r1   r8   _hidden
embeds_raws        r6   forwardzVoiceEncoder.forward3   sL     4;FAYYt{{6":67
EJJzq$GGGr7   	n_samplesc                    d|cxk  rdk  sJ  J t        t        t        z  dz        }t        t        j                  | dz   |z              }t        t        j
                  t        |z  |z              }d|k  sJ d       |t        k  sJ dt        |t        z  z  z         g g }}t        d|t        z
  |z   dz         }t        d||      D ]S  }	t        j                  |	|	t        z   g      }
|
|z  }|j                  t        |
        |j                  t        |        U |d   }| |j                  z
  |j                  |j                  z
  z  }||k  rt        |      dkD  r
|dd }|dd }||fS )a"  
        Computes where to split an utterance waveform and its corresponding mel spectrogram to
        obtain partial utterances of <partials_n_frames> each. Both the waveform and the
        mel spectrogram slices are returned, so as to make each partial utterance waveform
        correspond to its spectrogram.

        The returned ranges may be indexing further than the length of the waveform. It is
        recommended that you pad the waveform with zeros up to wav_slices[-1].stop.

        :param n_samples: the number of samples in the waveform
        :param rate: how many partial utterances should occur per second. Partial utterances must
        cover the span of the entire utterance, thus the rate should not be lower than the inverse
        of the duration of a partial utterance. By default, partial utterances are 1.6s long and
        the minimum rate is thus 0.625.
        :param min_coverage: when reaching the last partial utterance, it may or may not have
        enough frames. If at least <min_pad_coverage> of <partials_n_frames> are present,
        then the last partial utterance will be considered by zero-padding the audio. Otherwise,
        it will be discarded. If there aren't enough frames for one partial utterance,
        this parameter is ignored so that the function always returns at least one slice.
        :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
        respectively the waveform and the mel spectrogram with these slices to obtain the partial
        utterances.
        r   r;   i  zThe rate is too highz-The rate is too low, it should be %f at leastr:   N)intsampling_ratemel_window_stepnpceilroundpartials_n_framesmaxrangearrayappendslicer3   stoplen)rC   ratemin_coveragesamples_per_framen_frames
frame_step
wav_slices
mel_slicesstepsi	mel_range	wav_rangelast_wav_rangecoverages                 r6   compute_partial_slicesz#VoiceEncoder.compute_partial_slicesB   s   2 <$1$$$$$  !@4!GIrww	A1BBCD=4#7;L"LMN
:~555~.. 	F0_/2CCD1F 	F. "$RJ
Ax"33j@1DEq%, 	1A!Q):%:!;<I!$55IeY/0eY/0		1 $B 4 449L9L~OcOc9cdl"s:':#CRJ#CRJ:%%r7   wavc                    | j                  t        |      ||      \  }}|d   j                  }|t        |      k\  r%t        j                  |d|t        |      z
  fd      }t        j                  |      }t        j                  |D 	cg c]  }	||	   	 c}	      }
t        j                         5  t        j                  |
      j                  | j                        }
 | |
      j                         j                         }ddd       t        j                  d      }|t        j                   j#                  |d      z  }|r|||fS |S c c}	w # 1 sw Y   QxY w)a~  
        Computes an embedding for a single utterance. The utterance is divided in partial
        utterances and an embedding is computed for each. The complete utterance embedding is the
        L2-normed average embedding of the partial utterances.

        TODO: independent batched version of this function

        :param wav: a preprocessed utterance waveform as a numpy array of float32
        :param return_partials: if True, the partial embeddings will also be returned along with
        the wav slices corresponding to each partial utterance.
        :param rate: how many partial utterances should occur per second. Partial utterances must
        cover the span of the entire utterance, thus the rate should not be lower than the inverse
        of the duration of a partial utterance. By default, partial utterances are 1.6s long and
        the minimum rate is thus 0.625.
        :param min_coverage: when reaching the last partial utterance, it may or may not have
        enough frames. If at least <min_pad_coverage> of <partials_n_frames> are present,
        then the last partial utterance will be considered by zero-padding the audio. Otherwise,
        it will be discarded. If there aren't enough frames for one partial utterance,
        this parameter is ignored so that the function always returns at least one slice.
        :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
        <return_partials> is True, the partial utterances as a numpy array of float32 of shape
        (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
        returned.
        r:   r   constantNaxis   )r`   rR   rQ   rH   padr   wav_to_mel_spectrogramrN   r!   no_grad
from_numpyr.   r   r   numpymeanlinalgr>   )r1   ra   return_partialsrS   rT   rX   rY   max_wave_lengthmelsr8   partial_embeds	raw_embedembeds                 r6   embed_utterancezVoiceEncoder.embed_utterancew   s&   6 "&!<!<SXt\!Z
J$R.--c#h&&&q/CH"<=zJC **3/xx4AQ45]]_ 	6##D),,T[[9D!$Z^^-335N	6
 GGN3	BIINN9a88.*44 5	6 	6s   
E1AEEwavsc           
          t        j                  |D cg c]  } | j                  |fddi| c}d      }|t         j                  j	                  |d      z  S c c}w )au  
        Compute the embedding of a collection of wavs (presumably from the same speaker) by
        averaging their embedding and L2-normalizing it.

        :param wavs: list of wavs a numpy arrays of float32.
        :param kwargs: extra arguments to embed_utterance()
        :return: the embedding as a numpy array of float32 of shape (model_embedding_size,).
        rn   Fr   rd   rf   )rH   rl   ru   rm   r>   )r1   rv   kwargsra   rs   s        r6   embed_speakerzVoiceEncoder.embed_speaker   sb     GG(,.!$ 2T11#WuWPVW .457	299>>)Q777.s   A)NTN)Fg?g      ?)__name__
__module____qualname__r   r$   r!   r   r   r   FloatTensorrB   staticmethodrE   r`   rH   ndarrayru   r   ry   __classcell__)r5   s   @r6   r   r      s    %2uS%,,%67 %2[`aegjaj[k %2NHE-- H 2&# 2& 2&h-2:: -^8$rzz"2 8r7   r   )resemblyzer.hparamsresemblyzerr   pathlibr   typingr   r   r!   r   timer	   r+   rk   rH   Moduler    r7   r6   <module>r      s/    !     &  f8299 f8r7   