import json
import math
from abc import ABC, abstractmethod
from dataclasses import dataclass
from functools import partial
from typing import Callable, List, Tuple

import torch
import torchaudio
from torchaudio._internal import module_utils
from torchaudio.models import emformer_rnnt_base, RNNT, RNNTBeamSearch


__all__ = []

# Full-scale level of a 16-bit signal in decibels; the resulting linear gain
# equals ``torch.iinfo(torch.int16).max ** 2`` and is applied to waveforms
# before the piecewise-linear log below.
_decibel = 2 * 20 * math.log10(torch.iinfo(torch.int16).max)
_gain = pow(10, 0.05 * _decibel)


def _piecewise_linear_log(x):
    # Logarithmic above ``e``, linear below; the two pieces agree at ``x = e``,
    # where both evaluate to 1. Note: modifies ``x`` in place.
    x[x > math.e] = torch.log(x[x > math.e])
    x[x <= math.e] = x[x <= math.e] / math.e
    return x


class _FunctionalModule(torch.nn.Module):
    def __init__(self, functional):
        super().__init__()
        self.functional = functional

    def forward(self, input):
        return self.functional(input)


class _GlobalStatsNormalization(torch.nn.Module):
    def __init__(self, global_stats_path):
        super().__init__()

        with open(global_stats_path) as f:
            blob = json.loads(f.read())

        self.register_buffer("mean", torch.tensor(blob["mean"]))
        self.register_buffer("invstddev", torch.tensor(blob["invstddev"]))

    def forward(self, input):
        return (input - self.mean) * self.invstddev


class _FeatureExtractor(ABC):
    @abstractmethod
    def __call__(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Generates features and length output from the given input tensor.

        Args:
            input (torch.Tensor): input tensor.

        Returns:
            (torch.Tensor, torch.Tensor):
            torch.Tensor:
                Features, with shape `(length, *)`.
            torch.Tensor:
                Length, with shape `(1,)`.
        """


class _TokenProcessor(ABC):
    @abstractmethod
    def __call__(self, tokens: List[int], **kwargs) -> str:
        """Decodes given list of tokens to text sequence.

        Args:
            tokens (List[int]): list of tokens to decode.

        Returns:
            str:
                Decoded text sequence.
        """


class _ModuleFeatureExtractor(torch.nn.Module, _FeatureExtractor):
    """``torch.nn.Module``-based feature extraction pipeline.

    Args:
        pipeline (torch.nn.Module): module that implements feature extraction logic.
    """

    def __init__(self, pipeline: torch.nn.Module) -> None:
        super().__init__()
        self.pipeline = pipeline

    def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Generates features and length output from the given input tensor.

        Args:
            input (torch.Tensor): input tensor.

        Returns:
            (torch.Tensor, torch.Tensor):
            torch.Tensor:
                Features, with shape `(length, *)`.
            torch.Tensor:
                Length, with shape `(1,)`.
        """
        features = self.pipeline(input)
        length = torch.tensor([features.shape[0]])
        return features, length
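# Illustration (not part of the torchaudio API): the helpers above implement the
# feature scaling used by this pipeline. ``_GlobalStatsNormalization`` expects a
# JSON file with per-feature "mean" and "invstddev" lists — e.g., for a
# hypothetical two-feature setup:
#
#     {"mean": [-1.0, 0.5], "invstddev": [0.8, 1.2]}
#
# and ``_piecewise_linear_log`` is continuous at ``x = math.e``, where both
# pieces evaluate to 1.0:
#
#     >>> x = torch.tensor([math.e / 2, math.e, math.e * 2])
#     >>> _piecewise_linear_log(x.clone())
#     tensor([0.5000, 1.0000, 1.6931])

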
class _SentencePieceTokenProcessor(_TokenProcessor):
    """SentencePiece-model-based token processor.

    Args:
        sp_model_path (str): path to SentencePiece model.
    """

    def __init__(self, sp_model_path: str) -> None:
        if not module_utils.is_module_available("sentencepiece"):
            raise RuntimeError("SentencePiece is not available. Please install it.")

        import sentencepiece as spm

        self.sp_model = spm.SentencePieceProcessor(model_file=sp_model_path)
        self.post_process_remove_list = {
            self.sp_model.unk_id(),
            self.sp_model.eos_id(),
            self.sp_model.pad_id(),
        }

    def __call__(self, tokens: List[int], lstrip: bool = True) -> str:
        """Decodes given list of tokens to text sequence.

        Args:
            tokens (List[int]): list of tokens to decode.
            lstrip (bool, optional): if ``True``, returns text sequence with leading whitespace
                removed. (Default: ``True``).

        Returns:
            str:
                Decoded text sequence.
        """
        filtered_hypo_tokens = [
            token_index for token_index in tokens[1:] if token_index not in self.post_process_remove_list
        ]
        output_string = "".join(self.sp_model.id_to_piece(filtered_hypo_tokens)).replace("\u2581", " ")

        if lstrip:
            return output_string.lstrip()
        else:
            return output_string


@dataclass
class RNNTBundle:
    """Dataclass that bundles components for performing automatic speech recognition (ASR, speech-to-text)
    inference with an RNN-T model.

    More specifically, the class provides methods that produce the featurization pipeline,
    decoder wrapping the specified RNN-T model, and output token post-processor that together
    constitute a complete end-to-end ASR inference pipeline that produces a text sequence
    given a raw waveform.

    It can support non-streaming (full-context) inference as well as streaming inference.

    Users should not directly instantiate objects of this class; rather, users should use the
    instances (representing pre-trained models) that exist within the module,
    e.g. :data:`torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH`.

    Example
        >>> import torchaudio
        >>> from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH
        >>> import torch
        >>>
        >>> # Non-streaming inference.
        >>> # Build feature extractor, decoder with RNN-T model, and token processor.
        >>> feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_feature_extractor()
        100%|███████████████████████████████| 3.81k/3.81k [00:00<00:00, 4.22MB/s]
        >>> decoder = EMFORMER_RNNT_BASE_LIBRISPEECH.get_decoder()
        Downloading: "https://download.pytorch.org/torchaudio/models/emformer_rnnt_base_librispeech.pt"
        100%|███████████████████████████████| 293M/293M [00:07<00:00, 42.1MB/s]
        >>> token_processor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_token_processor()
        100%|███████████████████████████████| 295k/295k [00:00<00:00, 25.4MB/s]
        >>>
        >>> # Instantiate LibriSpeech dataset; retrieve waveform for first sample.
        >>> dataset = torchaudio.datasets.LIBRISPEECH("/home/librispeech", url="test-clean")
        >>> waveform = next(iter(dataset))[0].squeeze()
        >>>
        >>> with torch.no_grad():
        >>>     # Produce mel-scale spectrogram features.
        >>>     features, length = feature_extractor(waveform)
        >>>
        >>>     # Generate top-10 hypotheses.
        >>>     hypotheses = decoder(features, length, 10)
        >>>
        >>> # For top hypothesis, convert predicted tokens to text.
        >>> text = token_processor(hypotheses[0][0])
        >>> print(text)
        he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to [...]
        >>>
        >>>
        >>> # Streaming inference.
        >>> hop_length = EMFORMER_RNNT_BASE_LIBRISPEECH.hop_length
        >>> num_samples_segment = EMFORMER_RNNT_BASE_LIBRISPEECH.segment_length * hop_length
        >>> num_samples_segment_right_context = (
        >>>     num_samples_segment + EMFORMER_RNNT_BASE_LIBRISPEECH.right_context_length * hop_length
        >>> )
        >>>
        >>> # Build streaming inference feature extractor.
        >>> streaming_feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_streaming_feature_extractor()
        >>>
        >>> # Process same waveform as before, this time sequentially across overlapping segments
        >>> # to simulate streaming inference. Note the usage of ``streaming_feature_extractor`` and ``decoder.infer``.
        >>> state, hypothesis = None, None
        >>> for idx in range(0, len(waveform), num_samples_segment):
        >>>     segment = waveform[idx: idx + num_samples_segment_right_context]
        >>>     segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment)))
        >>>     with torch.no_grad():
        >>>         features, length = streaming_feature_extractor(segment)
        >>>         hypotheses, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
        >>>     hypothesis = hypotheses[0]
        >>>     transcript = token_processor(hypothesis[0])
        >>>     if transcript:
        >>>         print(transcript, end=" ", flush=True)
        he hoped there would be stew for dinner turn ips and car rots and bru 'd oes and fat mut ton pieces to [...]
    """

    class FeatureExtractor(_FeatureExtractor):
        """Interface of the feature extraction part of RNN-T pipeline"""

    class TokenProcessor(_TokenProcessor):
        """Interface of the token processor part of RNN-T pipeline"""

    _rnnt_path: str
    _rnnt_factory_func: Callable[[], RNNT]
    _global_stats_path: str
    _sp_model_path: str
    _right_padding: int
    _blank: int
    _sample_rate: int
    _n_fft: int
    _n_mels: int
    _hop_length: int
    _segment_length: int
    _right_context_length: int

    def _get_model(self) -> RNNT:
        model = self._rnnt_factory_func()
        path = torchaudio.utils.download_asset(self._rnnt_path)
        state_dict = torch.load(path)
        model.load_state_dict(state_dict)
        model.eval()
        return model

    @property
    def sample_rate(self) -> int:
        """Sample rate (in cycles per second) of input waveforms.

        :type: int
        """
        return self._sample_rate

    @property
    def n_fft(self) -> int:
        """Size of FFT window to use.

        :type: int
        """
        return self._n_fft

    @property
    def n_mels(self) -> int:
        """Number of mel spectrogram features to extract from input waveforms.

        :type: int
        """
        return self._n_mels

    @property
    def hop_length(self) -> int:
        """Number of samples between successive frames in input expected by model.

        :type: int
        """
        return self._hop_length

    @property
    def segment_length(self) -> int:
        """Number of frames in segment in input expected by model.

        :type: int
        """
        return self._segment_length

    @property
    def right_context_length(self) -> int:
        """Number of frames in right contextual block in input expected by model.

        :type: int
        """
        return self._right_context_length

    def get_decoder(self) -> RNNTBeamSearch:
        """Constructs RNN-T decoder.

        Returns:
            RNNTBeamSearch
        """
        model = self._get_model()
        return RNNTBeamSearch(model, self._blank)

    def get_feature_extractor(self) -> FeatureExtractor:
        """Constructs feature extractor for non-streaming (full-context) ASR.

        Returns:
            FeatureExtractor
        """
        local_path = torchaudio.utils.download_asset(self._global_stats_path)
        return _ModuleFeatureExtractor(
            torch.nn.Sequential(
                torchaudio.transforms.MelSpectrogram(
                    sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length
                ),
                _FunctionalModule(lambda x: x.transpose(1, 0)),
                _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)),
                _GlobalStatsNormalization(local_path),
                _FunctionalModule(lambda x: torch.nn.functional.pad(x, (0, 0, 0, self._right_padding))),
            )
        )

    def get_streaming_feature_extractor(self) -> FeatureExtractor:
        """Constructs feature extractor for streaming (simultaneous) ASR.

        Returns:
            FeatureExtractor
        """
        local_path = torchaudio.utils.download_asset(self._global_stats_path)
        return _ModuleFeatureExtractor(
            torch.nn.Sequential(
                torchaudio.transforms.MelSpectrogram(
                    sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length
                ),
                _FunctionalModule(lambda x: x.transpose(1, 0)),
                _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)),
                _GlobalStatsNormalization(local_path),
            )
        )

    def get_token_processor(self) -> TokenProcessor:
        """Constructs token processor.

        Returns:
            TokenProcessor
        """
        local_path = torchaudio.utils.download_asset(self._sp_model_path)
        return _SentencePieceTokenProcessor(local_path)


EMFORMER_RNNT_BASE_LIBRISPEECH = RNNTBundle(
    _rnnt_path="models/emformer_rnnt_base_librispeech.pt",
    _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=4097),
    _global_stats_path="pipeline-assets/global_stats_rnnt_librispeech.json",
    _sp_model_path="pipeline-assets/spm_bpe_4096_librispeech.model",
    _right_padding=4,
    _blank=4096,
    _sample_rate=16000,
    _n_fft=400,
    _n_mels=80,
    _hop_length=160,
    _segment_length=16,
    _right_context_length=4,
)
EMFORMER_RNNT_BASE_LIBRISPEECH.__doc__ = """ASR pipeline based on Emformer-RNNT,
pretrained on *LibriSpeech* dataset :cite:`7178964`,
capable of performing both streaming and non-streaming inference.

The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base`
and utilizes weights trained on LibriSpeech using training script ``train.py``
`here <https://github.com/pytorch/audio/tree/main/examples/asr/emformer_rnnt>`__ with default arguments.

Please refer to :py:class:`RNNTBundle` for usage instructions.
"""
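# Minimal end-to-end sketch (illustration only, not part of the module): wiring the
# three bundle components together on a dummy waveform. Running it downloads the
# model, global-stats, and SentencePiece assets on first use; a real waveform
# sampled at ``EMFORMER_RNNT_BASE_LIBRISPEECH.sample_rate`` (16 kHz) is needed for
# a meaningful transcript.
#
#     >>> bundle = EMFORMER_RNNT_BASE_LIBRISPEECH
#     >>> feature_extractor = bundle.get_feature_extractor()
#     >>> decoder = bundle.get_decoder()
#     >>> token_processor = bundle.get_token_processor()
#     >>> waveform = torch.zeros(bundle.sample_rate)  # one second of silence
#     >>> with torch.no_grad():
#     ...     features, length = feature_extractor(waveform)
#     ...     hypotheses = decoder(features, length, 10)
#     >>> transcript = token_processor(hypotheses[0][0])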