
    ujh0                        U d dl Z d dlmZmZmZ d dlZd dlmZ d dlmc m	Z
 dedefdZd ed      fZeeef   ed<    G d	 d
ej                        Z G d dej                        Z G d dej                        Z G d dej                        Z G d dej                        Z G d dej                        Zdedededej0                  j                  fdZ	 d"dededededededededee   defd Zdefd!Zy)#    N)ListOptionalTuplexreturnc                 J    dddt        j                  d| z  dz         z   z  z   S )a  The metric defined by ITU-T P.862 is often called 'PESQ score', which is defined
    for narrow-band signals and has a value range of [-0.5, 4.5] exactly. Here, we use the metric
    defined by ITU-T P.862.2, commonly known as 'wide-band PESQ' and will be referred to as "PESQ score".

    Args:
        x (float): Narrow-band PESQ score.

    Returns:
        (float): Wide-band PESQ score.
    g+?g@   g;pΈgׁsF@)mathexp)r   s    _/var/www/html/dev/engine/venv/lib/python3.12/site-packages/torchaudio/models/squim/objective.pytransform_wb_pesq_ranger   	   s+     Ma$((7Q;3G*H&HIII          ?g      @	PESQRangec                   l     e Zd Zddeeef   ddf fdZdej                  dej                  fdZ xZ	S )RangeSigmoid	val_ranger   Nc                     t         t        |           t        |t              rt        |      dk(  sJ || _        t        j                         | _	        y )N   )
superr   __init__
isinstancetuplelenr   nnSigmoidsigmoid)selfr   	__class__s     r   r   zRangeSigmoid.__init__    s?    lD*,)U+I!0CCC.7*,**,r   r   c                     | j                  |      | j                  d   | j                  d   z
  z  | j                  d   z   }|S )Nr	   r   )r   r   r   r   outs      r   forwardzRangeSigmoid.forward&   s?    ll1o!2T^^A5F!FG$..YZJ[[
r   ))        r   )
__name__
__module____qualname__r   floatr   torchTensorr#   __classcell__r   s   @r   r   r      s:    7%u"5 7t 7 %,, r   r   c                   j     e Zd ZdZd	dededdf fdZdej                  dej                  fdZ xZ	S )
EncoderzEncoder module that transform 1D waveform to 2D representations.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module. (Default: 512)
        win_len (int, optional): kernel size in the Conv1D layer. (Default: 32)
    feat_dimwin_lenr   Nc                 n    t         t        |           t        j                  d|||dz  d      | _        y )Nr	   r   F)stridebias)r   r.   r   r   Conv1dconv1d)r   r/   r0   r   s      r   r   zEncoder.__init__3   s-    gt%'ii8WW\PUVr   r   c                 r    |j                  d      }t        j                  | j                  |            }|S )a  Apply waveforms to convolutional layer and ReLU layer.

        Args:
            x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.

        Returns:
            (torch,Tensor): Feature Tensor with dimensions `(batch, channel, frame)`.
        r	   dim)	unsqueezeFrelur5   r!   s      r   r#   zEncoder.forward8   s0     kkak ffT[[%&
r   )i       )
r%   r&   r'   __doc__intr   r)   r*   r#   r+   r,   s   @r   r.   r.   +   sA    W WS W$ W
 %,, r   r.   c                   n     e Zd Zd
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )	SingleRNNrnn_type
input_sizehidden_sizedropoutr   Nc                     t         t        |           || _        || _        || _         t        t        |      ||d|dd      | _        t        j                  |dz  |      | _
        y )Nr	   T)rD   batch_firstbidirectionalr   )r   r@   r   rA   rB   rC   getattrr   rnnLinearproj)r   rA   rB   rC   rD   r   s        r   r   zSingleRNN.__init__G   se    i') $&&;gb(&;'
 IIkAoz:	r   r   c                 P    | j                  |      \  }}| j                  |      }|S N)rI   rK   )r   r   r"   _s       r   r#   zSingleRNN.forwardY   s%    !Qiin
r   )r$   )r%   r&   r'   strr>   r(   r   r)   r*   r#   r+   r,   s   @r   r@   r@   F   sH    ; ;# ;C ;RW ;bf ;$ %,, r   r@   c                   L    e Zd ZdZ	 	 	 	 	 	 	 ddededededededed	d
f fdZdej                  d	e	ej                  ef   fdZ
dej                  d	e	ej                  ef   fdZdej                  ded	ej                  fdZdej                  d	ej                  fdZ xZS )DPRNNa  *Dual-path recurrent neural networks (DPRNN)* :cite:`luo2020dual`.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module. (Default: 64)
        hidden_dim (int, optional): Hidden dimension in the RNN layer of DPRNN. (Default: 128)
        num_blocks (int, optional): Number of DPRNN layers. (Default: 6)
        rnn_type (str, optional): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"]. (Default: "LSTM")
        d_model (int, optional): The number of expected features in the input. (Default: 256)
        chunk_size (int, optional): Chunk size of input for DPRNN. (Default: 100)
        chunk_stride (int, optional): Stride of chunk input for DPRNN. (Default: 50)
    r/   
hidden_dim
num_blocksrA   d_model
chunk_sizechunk_strider   Nc                 $   t         t        |           || _        t	        j
                  g       | _        t	        j
                  g       | _        t	        j
                  g       | _        t	        j
                  g       | _	        t        |      D ]  }| j                  j                  t        |||             | j                  j                  t        |||             | j                  j                  t	        j                  d|d             | j                  j                  t	        j                  d|d              t	        j                  t	        j                  ||d      t	        j                                | _        || _        || _        y )Nr	   g:0yE>)eps)r   rQ   r   rS   r   
ModuleListrow_rnncol_rnnrow_normcol_normrangeappendr@   	GroupNorm
SequentialConv2dPReLUconvrU   rV   )
r   r/   rR   rS   rA   rT   rU   rV   rN   r   s
            r   r   zDPRNN.__init__m   s    	eT#%$}}R(}}R(b)b)z" 	FALL	(Hj IJLL	(Hj IJMM  at!DEMM  at!DE		F
 MMIIh+HHJ
	 %(r   r   c                     |j                   d   }| j                  | j                  || j                  z  z   | j                  z  z
  }t        j                  || j                  || j                  z   g      }||fS )N)shaperU   rV   r:   pad)r   r   seq_lenrestr"   s        r   	pad_chunkzDPRNN.pad_chunk   sm    ''"+$"3"3g6O"OSWSbSb!bbeeA))4$2C2C+CDEDyr   c                    | j                  |      \  }}|j                  \  }}}|d d d d d | j                   f   j                         j	                  ||d| j
                        }|d d d d | j                  d f   j                         j	                  ||d| j
                        }t        j                  ||gd      }|j	                  ||d| j
                        j                  dd      j                         }||fS )Nrf      r7   r   )	rk   rg   rV   
contiguousviewrU   r)   cat	transpose)	r   r   r"   rj   
batch_sizer/   ri   	segments1	segments2s	            r   chunkingzDPRNN.chunking   s    NN1%	T(+		%
Hg12!2!2 2223>>@EEjRZ\^`d`o`op	1d//112==?DDZQY[]_c_n_no	iiI.A6hhz8RAKKAqQ\\^Dyr   rj   c                 :   |j                   \  }}}}|j                  dd      j                         j                  ||d| j                  dz        }|d d d d d d d | j                  f   j                         j                  ||d      d d d d | j
                  d f   }|d d d d d d | j                  d f   j                         j                  ||d      d d d d d | j
                   f   }||z   }|dkD  r|d d d d d | f   }|j                         }|S )Nr   rm   rf   r   )rg   rq   rn   ro   rU   rV   )	r   r   rj   rr   r8   rN   r"   out1out2s	            r   mergingzDPRNN.merging   s    !
CAkk!Q**,11*c2tYZGZ[1a-doo--.99;@@SRTUVWYZ\`\m\m\oVop1aDOO--.99;@@SRTUVWYZ\p_c_p_p^p\pVpqTk!8aFdUFl#Cnn
r   c                    | j                  |      \  }}|j                  \  }}}}|}t        | j                  | j                  | j
                  | j                        D ]"  \  }}	}
}|j                  dddd      j                         j                  ||z  |d      j                         } ||      }|j                  |||d      j                  dddd      j                         } |	|      }||z   }|j                  dddd      j                         j                  ||z  |d      j                         } |
|      }|j                  |||d      j                  dddd      j                         } ||      }||z   }% | j                  |      }| j                  ||      }|j                  dd      j                         }|S )Nr   rm   r   r	   rf   )ru   rg   ziprZ   r\   r[   r]   permutern   ro   rd   ry   rq   )r   r   rj   rr   rN   dim1dim2r"   rZ   r\   r[   r]   row_inrow_outcol_incol_outs                   r   r#   zDPRNN.forward   s   --"4$%GG!
AtT47dmmUYUaUacgcpcp4q 	 0GXw[[Aq!,779>>zD?PRVXZ[ffhFfoGll:tT2>FFq!QPQR]]_Gw'G-C[[Aq!,779>>zD?PRVXZ[ffhFfoGll:tT2>FFq!QPQR]]_Gw'G-C	  iinll3%mmAq!,,.
r   )@         LSTM   d   2   )r%   r&   r'   r=   r>   rO   r   r)   r*   r   rk   ru   ry   r#   r+   r,   s   @r   rQ   rQ   `   s    
 )) ) 	)
 ) ) ) ) 
)<5<< E%,,2C,D 	%,, 	5s1B+C 		 	S 	U\\ 	 %,, r   rQ   c                   b     e Zd Zddeddf fdZdej                  dej                  fdZ xZS )AutoPoolpool_dimr   Nc                     t         t        |           || _        t	        j
                  |      | _        | j                  dt	        j                  t        j                  d                   y )Nr7   alphar	   )r   r   r   r   r   Softmaxsoftmaxregister_parameter	Parameterr)   ones)r   r   r   s     r   r   zAutoPool.__init__   sH    h&(%*,***Bejjm)DEr   r   c                     | j                  t        j                  || j                              }t        j                  t        j                  ||      | j
                        }|S )Nr7   )r   r)   mulr   sumr   )r   r   weightr"   s       r   r#   zAutoPool.forward   sC    eii4::67ii		!V,$--@
r   )r	   )	r%   r&   r'   r>   r   r)   r*   r#   r+   r,   s   @r   r   r      s4    F FT F %,, r   r   c                        e Zd ZdZdej
                  dej
                  dej                  f fdZdej                  de
ej                     fdZ xZS )	SquimObjectivea  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **objective** metric scores
    for speech enhancement (e.g., STOI, PESQ, and SI-SDR).

    Args:
        encoder (torch.nn.Module): Encoder module to transform 1D waveform to 2D feature representation.
        dprnn (torch.nn.Module): DPRNN module to model sequential feature.
        branches (torch.nn.ModuleList): Transformer branches in which each branch estimate one objective metirc score.
    encoderdprnnbranchesc                 T    t         t        |           || _        || _        || _        y rM   )r   r   r   r   r   r   )r   r   r   r   r   s       r   r   zSquimObjective.__init__   s'     	nd,.
 r   r   r   c                 V   |j                   dk7  rt        d|j                    d      |t        j                  |dz  dd      dz  dz  z  }| j	                  |      }| j                  |      }g }| j                  D ])  }|j                   ||      j                  d	             + |S )
z
        Args:
            x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`.

        Returns:
            List(torch.Tensor): List of score Tenosrs. Each Tensor is with dimension `(batch,)`.
        r   z/The input must be a 2D Tensor. Found dimension .r	   T)r8   keepdimg      ?   r7   )	ndim
ValueErrorr)   meanr   r   r   r_   squeeze)r   r   r"   scoresbranchs        r   r#   zSquimObjective.forward   s     66Q;NqvvhVWXYYAqDa6#=BCll1ojjomm 	6FMM&+--!-45	6r   )r%   r&   r'   r=   r   ModulerY   r   r)   r*   r   r#   r+   r,   s   @r   r   r      sU    	!	! yy	! --		! $u||*< r   r   rT   nheadmetricc                    t        j                  | || dz  dd      }t               }|dk(  r[t        j                  t        j                  | |       t        j
                         t        j                  | d      t                     }n|dk(  rat        j                  t        j                  | |       t        j
                         t        j                  | d      t        t                    }nQt        j                  t        j                  | |       t        j
                         t        j                  | d            }t        j                  |||      S )	al  Create branch module after DPRNN model for predicting metric score.

    Args:
        d_model (int): The number of expected features in the input.
        nhead (int): Number of heads in the multi-head attention model.
        metric (str): The metric name to predict.

    Returns:
        (nn.Module): Returned module to predict corresponding metric score.
       r$   T)rD   rF   stoir	   pesq)r   )r   TransformerEncoderLayerr   ra   rJ   rc   r   r   )rT   r   r   layer1layer2layer3s         r   _create_branchr      s     ''!S^bcFZFIIgw'HHJIIgq!N	
 
6	IIgw'HHJIIgq!9-	
 %'MM"))GW2Mrxxz[][d[delno[p$q==00r   r/   r0   rR   rS   rA   rU   rV   c	           	          ||dz  }t        | |      }	t        | ||||||      }
t        j                  t	        ||d      t	        ||d      t	        ||d      g      }t        |	|
|      S )a  Build a custome :class:`torchaudio.prototype.models.SquimObjective` model.

    Args:
        feat_dim (int, optional): The feature dimension after Encoder module.
        win_len (int): Kernel size in the Encoder module.
        d_model (int): The number of expected features in the input.
        nhead (int): Number of heads in the multi-head attention model.
        hidden_dim (int): Hidden dimension in the RNN layer of DPRNN.
        num_blocks (int): Number of DPRNN layers.
        rnn_type (str): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"].
        chunk_size (int): Chunk size of input for DPRNN.
        chunk_stride (int or None, optional): Stride of chunk input for DPRNN.
    r   r   r   sisdr)r.   rQ   r   rY   r   r   )r/   r0   rT   r   rR   rS   rA   rU   rV   r   r   r   s               r   squim_objective_modelr     s~    0 !Qh(G(J
HgzS_`E}}7E627E627E73	
H '5(33r   c            
      (    t        dddddddd      S )zWBuild :class:`torchaudio.prototype.models.SquimObjective` model with default arguments.r   r   r   r   r   G   )r/   r0   rT   r   rR   rS   rA   rU   )r    r   r   squim_objective_baser   ;  s'     	 	r   rM   )r
   typingr   r   r   r)   torch.nnr   torch.nn.functional
functionalr:   r(   r   r   __annotations__r   r   r.   r@   rQ   r   r   r>   rO   modulesr   r   r   r   r   r   <module>r      sa    ( (    Ju J J  C 	"	5 	299 	bii 6		 4]BII ]@
ryy 
%RYY %P1C 1 1S 1RZZ=N=N 1R #'#4#4#4 #4 	#4
 #4 #4 #4 #4 3-#4 #4Ln r   