
from typing import Tuple

import torch
import torch.nn as nn
import torchaudio


class AttPool(nn.Module):
    """Attention-Pooling module that estimates the attention score.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    """

    def __init__(self, input_dim: int, att_dim: int):
        super(AttPool, self).__init__()

        self.linear1 = nn.Linear(input_dim, 1)
        self.linear2 = nn.Linear(input_dim, att_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply attention and pooling.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Attention score with dimensions `(batch, att_dim)`.
        """
        # Estimate a scalar attention weight per frame, normalize over time,
        # pool the features with it, then project to the attention dimension.
        att = self.linear1(x)
        att = att.transpose(2, 1)
        att = nn.functional.softmax(att, dim=2)
        x = torch.matmul(att, x).squeeze(1)
        x = self.linear2(x)
        return x
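

# Example (shapes are illustrative only): AttPool collapses the time axis with
# learned softmax attention weights and projects the pooled feature to `att_dim`,
# e.g.
#
#     pool = AttPool(input_dim=16, att_dim=5)
#     score = pool(torch.randn(2, 100, 16))  # score.shape == (2, 5)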


class Predictor(nn.Module):
    """Prediction module that applies pooling and attention, then predicts subjective metric scores.

    Args:
        input_dim (int): Input feature dimension.
        att_dim (int): Attention Tensor dimension.
    """

    def __init__(self, input_dim: int, att_dim: int):
        super(Predictor, self).__init__()
        self.att_pool_layer = AttPool(input_dim, att_dim)
        self.att_dim = att_dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict subjective evaluation metric score.

        Args:
            x (torch.Tensor): Input Tensor with dimensions `(batch, time, feature_dim)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        """
        # Pool over time, turn the pooled scores into a distribution over
        # `att_dim` bins, and return its expectation over an evenly spaced grid.
        x = self.att_pool_layer(x)
        x = nn.functional.softmax(x, dim=1)
        B = torch.linspace(0, 4, self.att_dim, device=x.device)
        x = (x * B).sum(dim=1)
        return x
                  dej
                  dej
                  f fdZdej                  dej                  de	ej                  ej                  f   fd	Z
dej                  dej                  fd
Z xZS )SquimSubjectiveaP  Speech Quality and Intelligibility Measures (SQUIM) model that predicts **subjective** metric scores
    for speech enhancement (e.g., Mean Opinion Score (MOS)). The model is adopted from *NORESQA-MOS*
    :cite:`manocha2022speech` which predicts MOS scores given the input speech and a non-matching reference.

    Args:
        ssl_model (torch.nn.Module): The self-supervised learning model for feature extraction.
        projector (torch.nn.Module): Projection layer that projects SSL feature to a lower dimension.
        predictor (torch.nn.Module): Predict the subjective scores.
    	ssl_model	projector	predictorc                 T    t         t        |           || _        || _        || _        y r/   )r
   r9   r   r:   r;   r<   )r   r:   r;   r<   r   s       r   r   zSquimSubjective.__init__P   s%    ot-/"""r   waveform	referencer   c                     |j                   d   }|j                   d   }||k  r6||z  dz   }t        j                  t        |      D cg c]  }| c}d      }||ddd|f   fS c c}w )a  Cut or pad the reference Tensor to make it aligned with waveform Tensor.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor, torch.Tensor): The aligned waveform and reference Tensors
                with same dimensions `(batch, time)`.
        r	   r   N)shaper   catrange)r   r>   r?   
T_waveformT_referencenum_padding_s          r   _align_shapeszSquimSubjective._align_shapesV   su     ^^B'
oob)#$3a7K		eK6H"I9"IqQI1kzk>222 #Js   	A'c                 J   | j                  ||      \  }}| j                  | j                  j                  |      d   d         }| j                  | j                  j                  |      d   d         }t	        j
                  ||fd      }| j                  |      }d|z
  S )a  Predict subjective evaluation metric score.

        Args:
            waveform (torch.Tensor): Input waveform for evaluation. Tensor with dimensions `(batch, time)`.
            reference (torch.Tensor): Non-matching clean reference. Tensor with dimensions `(batch, time_ref)`.

        Returns:
            (torch.Tensor): Subjective metric score. Tensor with dimensions `(batch,)`.
        r   rA   r   r      )rI   r;   r:   extract_featuresr   rC   r<   )r   r>   r?   concat
score_diffs        r   r"   zSquimSubjective.forwardh   s     #009E)>>$.."A"A("KA"Nr"RSNN4>>#B#B9#Ma#PQS#TU	Ix0a8^^F+
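

# Example (sample counts are illustrative only): `_align_shapes` tiles a shorter
# reference until it covers the waveform and then trims it, so a 16000-sample
# waveform paired with a 10000-sample reference gives num_padding = 2, a
# 20000-sample tiled reference, and finally a 16000-sample aligned reference.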
:~r   )r$   r%   r&   r'   r   Moduler   r   r)   r   rI   r"   r*   r+   s   @r   r9   r9   E   s    #")) #		 #bii #3ell 3u|| 3PUV[VbVbdidpdpVpPq 3$  r   r9   ssl_typefeat_dimproj_dimr   r   c                      t        t        j                  |              }t        j                  ||      }t        |dz  |      }t        |||      S )a  Build a custome :class:`torchaudio.prototype.models.SquimSubjective` model.

    Args:
        ssl_type (str): Type of self-supervised learning (SSL) models.
            Must be one of ["wav2vec2_base", "wav2vec2_large"].
        feat_dim (int): Feature dimension of the SSL feature representation.
        proj_dim (int): Output dimension of projection layer.
        att_dim (int): Dimension of attention scores.
    r   )getattr
torchaudiomodelsr   r   r-   r9   )rP   rQ   rR   r   r:   r;   r<   s          r   squim_subjective_modelrW   z   sJ     5
))846I		(H-I(Q,0I9i;;r   c                       t        dddd      S )zXBuild :class:`torchaudio.prototype.models.SquimSubjective` model with default arguments.wav2vec2_basei       rK   )rP   rQ   rR   r   )rW    r   r   squim_subjective_baser\      s    ! 	 r   )typingr   r   torch.nnr   rU   rO   r   r-   r9   strr(   rW   r\   r[   r   r   <module>r`      s       bii @		 :2bii 2j<<< < 	<
 <* r   