
    ujh'2                        d dl mZmZmZmZmZ d dlZd dlmZ ddgZ	eee
   ej                  eeej                        ef   Zde_        dedee
   fd	Zdedej                  fd
Zdedeeej                        fdZdedefdZdedefdZdee   deeej                        fdZdeeej                        de
dej,                  deeej                        fdZdedefdZdee   dej                  de
deej                  ej                  ej                  f   fdZdedee   ddfdZ G d dej6                  j8                        Zy)    )CallableDictListOptionalTupleN)RNNT
HypothesisRNNTBeamSearchzHypothesis generated by RNN-T beam search decoder,
    represented as tuple of (tokens, prediction network output, prediction network state, score).
    hyporeturnc                     | d   S Nr    r   s    \/var/www/html/dev/engine/venv/lib/python3.12/site-packages/torchaudio/models/rnnt_decoder.py_get_hypo_tokensr          7N    c                     | d   S N   r   r   s    r   _get_hypo_predictor_outr      r   r   c                     | d   S )N   r   r   s    r   _get_hypo_stater      r   r   c                     | d   S )N   r   r   s    r   _get_hypo_scorer      r   r   c                     t        | d         S r   )strr   s    r   _get_hypo_keyr!       s    tAw<r   hyposc                 V   g }t        t        t        | d                     D ]~  }g }t        t        t        | d         |               D ]C  }|j                  t	        j
                  | D cg c]  }t        |      |   |    c}             E |j                  |        |S c c}w r   )rangelenr   appendtorchcat)r"   statesibatched_state_componentsjr   s         r   _batch_stater-   $   s    ')F3uQx012 079 s?584Q789 	hA$++EII_d6eW[t7LQ7OPQ7R6e,fg	h./	0
 M 7fs   ,B&r)   idxdevicec                     t        j                  |g|      }| D cg c]"  }|D cg c]  }|j                  d|       c}$ c}}S c c}w c c}}w )Nr/   r   )r'   tensorindex_select)r)   r.   r/   
idx_tensorstate_tuplestates         r   _slice_stater7   .   sC    seF3J\bc[KH5U:.HccHcs   	AA	 A	Ac                 H    t        |       t        t        |             dz   z  S r   )r   r%   r   r   s    r   _default_hypo_sort_keyr9   3   s"    4 C(8(>$?!$CDDr   next_token_probs
beam_widthc                 R   t        j                  | D cg c]  }t        |       c}      j                  d      }||d d d df   z   }|j	                  d      j                  |      \  }}|j                  |j                  d   d      }||j                  d   z  }	|||	fS c c}w )Nr   trunc)rounding_mode)r'   r2   r   	unsqueezereshapetopkdivshape)
r"   r:   r;   hhypo_scoresnonblank_scoresnonblank_nbest_scoresnonblank_nbest_idxnonblank_nbest_hypo_idxnonblank_nbest_tokens
             r   _compute_updated_scoresrL   7   s    
 ,,EBq 2BCMMaPK!$4QV$<<O0?0G0G0K0P0PQ[0\--044_5J5J15M]d4e-0E0Ea0HH "9;OOO  Cs   B$	hypo_listc                 b    t        |      D ]!  \  }}t        |       t        |      k(  s||=  y  y N)	enumerater!   )r   rM   r*   elems       r   _remove_hyporR   D   s6    Y' 4-"55!r   c                   6    e Zd ZdZ	 	 	 d#dedededeee	gef      deddf fd	Z
d
ej                  dee	   fdZdej                  dee	   d
ej                  dej                  fdZdee	   dee	   dej                  deee	f   dee	   f
dZdee	   dee	   dej                  deded
ej                  dee	   fdZdee	   dee   dee   ded
ej                  dee	   fdZdej                  deee	      dedee	   fdZdej                  dej                  dedee	   fdZej0                  j2                  	 	 d$dej                  dej                  ded eeeej                           d!eee	      deee	   eeej                        f   fd"       Z xZS )%r
   a)  Beam search decoder for RNN-T model.

    See Also:
        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pretrained model.

    Args:
        model (RNNT): RNN-T model to use.
        blank (int): index of blank token in vocabulary.
        temperature (float, optional): temperature to apply to joint network output.
            Larger values yield more uniform samples. (Default: 1.0)
        hypo_sort_key (Callable[[Hypothesis], float] or None, optional): callable that computes a score
            for a given hypothesis to rank hypotheses by. If ``None``, defaults to callable that returns
            hypothesis score normalized by token sequence length. (Default: None)
        step_max_tokens (int, optional): maximum number of tokens to emit per input time step. (Default: 100)
    Nmodelblanktemperaturehypo_sort_keystep_max_tokensr   c                     t         |           || _        || _        || _        |t
        | _        || _        y || _        || _        y rO   )super__init__rT   rU   rV   r9   rW   rX   )selfrT   rU   rV   rW   rX   	__class__s         r   r[   zRNNTBeamSearch.__init__\   sP     	

& !7D  / "/D.r   r/   c                     | j                   }d }t        j                  dg|      }| j                  j	                  t        j                  |gg|      ||      \  }}}|g|d   j                         |df}|gS )Nr   r1   r   g        )rU   r'   r2   rT   predictdetach)	r\   r/   tokenr6   
one_tensorpred_out_
pred_state	init_hypos	            r   _init_b_hyposzRNNTBeamSearch._init_b_hyposp   s    

\\1#f5
"&**"4"4U\\E7)TZ5[]gin"o!ZGQK 	
	 {r   enc_outr"   c                    t        j                  dg|      }t        j                  |D cg c]  }t        |       c}d      }| j                  j                  |||t        j                  dgt        |      z  |            \  }}}t         j                  j                  j                  || j                  z  d      }|d d ddf   S c c}w )Nr   r1   r   )dimr   )r'   r2   stackr   rT   joinr%   nn
functionallog_softmaxrV   )	r\   rh   r"   r/   rb   rE   predictor_out
joined_outrd   s	            r   _gen_next_token_probsz$RNNTBeamSearch._gen_next_token_probs~   s     \\1#f5
$OA%<Q%?$OUVW::??LL!s5z)&9	

Aq XX((44Z$BRBR5RXY4Z
!Q'"" %Ps   Cb_hyposa_hyposr:   key_to_b_hypoc                 j   t        t        |            D ]  }||   }t        |      ||df   z   }t        |      |v rQ|t        |         }t	        ||       t        t        j                  t        |            j                  |            }	nt        |      }	t        |      t        |      t        |      |	f}|j                  |       ||t        |      <    t        j                  |D 
cg c]  }
t        |
       c}
      j                         \  }}|D cg c]  }||   	 c}S c c}
w c c}w )Nr=   )r$   r%   r   r!   rR   floatr'   r2   	logaddexpr   r   r   r&   sort)r\   rs   rt   r:   ru   r*   h_aappend_blank_scoreh_bscorer   rd   
sorted_idxr.   s                 r   _gen_b_hyposzRNNTBeamSearch._gen_b_hypos   s    s7|$ 	4A!*C!0!58HB8O!OS!]2#M#$67S'*ell?3+?@JJK]^_01 %',$	C NN303M-,-!	4" %Pod&;%PQVVX:(2333 &Q3s   .D+D0tr;   c                    t        |||      \  }}}	t        |      |k  rt        d       }
nt        ||          }
g }g }g }t	        |      D ]f  }t        ||         }||
kD  st        ||         }|j                  ||          |j                  t        |	|                |j                  |       h |r| j                  |||||      }|S g }|S )Ninf)rL   r%   rw   r   r$   intr&   _gen_new_hypos)r\   rt   rs   r:   r   r;   r/   rH   rJ   rK   b_nbest_score
base_hypos
new_tokens
new_scoresr*   r}   
a_hypo_idx	new_hyposs                     r   _gen_a_hyposzRNNTBeamSearch._gen_a_hypos   s    $G-=zJ		
!#  w<*$"5\MM+GZK,@AM')
 "
"$
z" 	)A/23E}$ !8!;<
!!'*"56!!#&:1&=">?!!%(	) ++J
JPQSYZI  +-Ir   r   tokensscoresc           
         t        j                  |D cg c]  }|g c}|      }t        |      }| j                  j	                  |t        j                  dgt        |      z  |      |      \  }	}
}g }t        |      D ]K  \  }}t        |      ||   gz   }|j                  ||	|   j                         t        |||      ||   f       M |S c c}w )Nr1   r   )r'   r2   r-   rT   r_   r%   rP   r   r&   r`   r7   )r\   r   r   r   r   r/   ra   
tgt_tokensr)   rc   rd   pred_statesr   r*   rz   r   s                   r   r   zRNNTBeamSearch._gen_new_hypos   s     \\"?uE7"?O
j)#'::#5#5LL!s:.v>$
 ![
 ')	
+ 	rFAs)#.&)<Jj(1+*<*<*>[Z[]c@dflmnfopq	r  #@s   
Cr   c           	         |j                   d   }|j                  }g }|| j                  |      n|}t        |      D ]  }|}t        j
                  j                  t        t           g       }i }	d}
|rs| j                  |d d ||dz   f   ||      }|j                         }| j                  ||||	      }|
| j                  k(  rn | j                  ||||||      }|r|
dz  }
|rst	        j                  |D cg c]  }| j                  |       c}      j!                  |      \  }}|D cg c]  }||   	 }} |S c c}w c c}w )Nr   r   )rD   r/   rg   r$   r'   jitannotater   r	   rr   cpur   rX   r   r2   rW   rB   )r\   rh   r   r;   n_time_stepsr/   rt   rs   r   ru   symbols_current_tr:   hyprd   r~   r.   s                   r   _searchzRNNTBeamSearch._search   sm    }}Q'$&04$$$V,$|$ 	;AGii((j)92>G35M !#'#=#=gaQQRUl>SU\^d#e #3#7#7#9 ++GW>NP]^$(<(<<++$ %*%# & "LLW)Uc$*<*<S*A)UV[[\fgMAz/9:ws|:G:5	;8  *V:s   :E
/Einputlengthc                    |j                         dk7  r0|j                         dk(  r|j                  d   dk(  st        d      |j                         dk(  r|j                  d      }|j                  dk7  r|j                  dk7  rt        d      |j                         dk(  r|j                  d      }| j                  j                  ||      \  }}| j                  |d	|      S )
a  Performs beam search for the given input sequence.

        T: number of frames;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
            length (torch.Tensor): number of valid frames in input
                sequence, with shape () or (1,).
            beam_width (int): beam size to use during search.

        Returns:
            List[Hypothesis]: top-``beam_width`` hypotheses found by beam search.
        r   r   r   r   *input must be of shape (T, D) or (1, T, D)r   r   "length must be of shape () or (1,)N)rj   rD   
ValueErrorr@   rT   
transcriber   )r\   r   r   r;   rh   rd   s         r   forwardzRNNTBeamSearch.forward  s     99;!UYY[A%5%++a.A:MIJJ99;!OOA&E<<2&,,$"6ABB::<1%%a(FZZ**5&9
||GT:66r   r6   
hypothesisc                    |j                         dk7  r0|j                         dk(  r|j                  d   dk(  st        d      |j                         dk(  r|j                  d      }|j                  dk7  r|j                  dk7  rt        d      |j                         dk(  r|j                  d      }| j                  j                  |||      \  }}}| j                  |||      |fS )	a  Performs beam search for the given input sequence in streaming mode.

        T: number of frames;
        D: feature dimension of each frame.

        Args:
            input (torch.Tensor): sequence of input frames, with shape (T, D) or (1, T, D).
            length (torch.Tensor): number of valid frames in input
                sequence, with shape () or (1,).
            beam_width (int): beam size to use during search.
            state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing transcription network internal state generated in preceding
                invocation. (Default: ``None``)
            hypothesis (List[Hypothesis] or None): hypotheses from preceding invocation to seed
                search with. (Default: ``None``)

        Returns:
            (List[Hypothesis], List[List[torch.Tensor]]):
                List[Hypothesis]
                    top-``beam_width`` hypotheses found by beam search.
                List[List[torch.Tensor]]
                    list of lists of tensors representing transcription network
                    internal state generated in current invocation.
        r   r   r   r   r   r   r   r   )rj   rD   r   r@   rT   transcribe_streamingr   )r\   r   r   r;   r6   r   rh   rd   s           r   inferzRNNTBeamSearch.infer'  s    B 99;!UYY[A%5%++a.A:MIJJ99;!OOA&E<<2&,,$"6ABB::<1%%a(F JJ;;E65QE||GZ<eCCr   )g      ?Nd   )NN)__name__
__module____qualname____doc__r   r   rw   r   r   r	   r[   r'   r/   r   rg   Tensorrr   r   r    r   r   r   r   r   r   exportr   r   __classcell__)r]   s   @r   r
   r
   K   s   ( !AE"// / 	/
  *u)< =>/ / 
/(ELL T*5E #||#,0,<#FKll#	#4j!4 j!4  ,,	4
 CO,4 
j	46$j!$ j!$  ,,	$
 $ $ $ 
j	$L$ S	 U	
   
j	*'' tJ'(' 	'
 
j	'R7U\\ 75<< 7S 7UYZdUe 78 YY 5915+D||+D +D 	+D
 T%,,/01+D T*-.+D 
tJd5<<&8!99	:+D +Dr   )typingr   r   r   r   r   r'   torchaudio.modelsr   __all__r   r   rw   r	   r   r   r   r   r   r    r!   r-   r/   r7   r9   rL   rR   rm   Moduler
   r   r   r   <module>r      s   8 8  " )
* 49ellDell1C,DeKL

 
: $s) *  * d5<<.@)A *  
 s Z( T$u||2D-E dd5<<01 d dU\\ dVZ[_`e`l`l[mVn d
E E E
P

Pll
P 
P 5<<u||34	
Pz d:.> 4 HDUXX__ HDr   