
    ujhV<                     $   d dl Z d dlmZmZmZ d dlZd dlmc mZ	 d dlmZm
Z
 g dZ G d dej                        Z G d dej                        Z G d	 d
ej                        Z G d dej                        Z G d dej                        Zy)    N)ListOptionalTuple)nnTensor)ResBlock	MelResNet	Stretch2dUpsampleNetworkWaveRNNc                   >     e Zd ZdZddeddf fdZdedefdZ xZS )	r   af  ResNet block based on *Efficient Neural Audio Synthesis* :cite:`kalchbrenner2018efficient`.

    Args:
        n_freq: the number of bins in a spectrogram. (Default: ``128``)

    Examples
        >>> resblock = ResBlock()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = resblock(input)  # shape: (10, 128, 512)
    n_freqreturnNc                 .   t         |           t        j                  t        j                  ||dd      t        j
                  |      t        j                  d      t        j                  ||dd      t        j
                  |            | _        y )N   Fin_channelsout_channelskernel_sizebiasTinplace)super__init__r   
SequentialConv1dBatchNorm1dReLUresblock_model)selfr   	__class__s     W/var/www/html/dev/engine/venv/lib/python3.12/site-packages/torchaudio/models/wavernn.pyr   zResBlock.__init__   si     mmII&v1SXYNN6"GGD!II&v1SXYNN6"
    specgramc                 *    | j                  |      |z   S )zPass the input through the ResBlock layer.
        Args:
            specgram (Tensor): the input sequence to the ResBlock layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_freq, n_time)
        )r   r    r$   s     r"   forwardzResBlock.forward(   s     ""8,x77r#   )   	__name__
__module____qualname____doc__intr   r   r'   __classcell__r!   s   @r"   r   r      s.    		
s 	
T 	
	8 	86 	8r#   r   c                   P     e Zd ZdZ	 ddedededededdf fd	Zd
edefdZ xZS )r	   a  MelResNet layer uses a stack of ResBlocks on spectrogram.

    Args:
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> melresnet = MelResNet()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = melresnet(input)  # shape: (10, 128, 508)
    n_res_blockr   n_hiddenn_outputr   r   Nc                 T   t         |           t        |      D cg c]  }t        |       }}t	        j
                  t	        j                  |||d      t	        j                  |      t	        j                  d      g|t	        j                  ||d       | _	        y c c}w )NFr   Tr   r   )r   r   r   )
r   r   ranger   r   r   r   r   r   melresnet_model)	r    r2   r   r3   r4   r   _	ResBlocksr!   s	           r"   r   zMelResNet.__init__D   s     	16{1CDAXh'D	D!}}II&x[_deNN8$GGD! 
 	 

 II(qQ 
 Es   B%r$   c                 $    | j                  |      S )zPass the input through the MelResNet layer.
        Args:
            specgram (Tensor): the input sequence to the MelResNet layer (n_batch, n_freq, n_time).

        Return:
            Tensor shape: (n_batch, n_output, n_time - kernel_size + 1)
        )r7   r&   s     r"   r'   zMelResNet.forwardS   s     ##H--r#   
   r(   r(   r(      r)   r0   s   @r"   r	   r	   4   sW      vw

-0
BE
WZ
or
	
	. 	.6 	.r#   r	   c                   @     e Zd ZdZdededdf fdZdedefdZ xZS )	r
   a  Upscale the frequency and time dimensions of a spectrogram.

    Args:
        time_scale: the scale factor in time dimension
        freq_scale: the scale factor in frequency dimension

    Examples
        >>> stretch2d = Stretch2d(time_scale=10, freq_scale=5)

        >>> input = torch.rand(10, 100, 512)  # a random spectrogram
        >>> output = stretch2d(input)  # shape: (10, 500, 5120)
    
time_scale
freq_scaler   Nc                 >    t         |           || _        || _        y N)r   r   r@   r?   )r    r?   r@   r!   s      r"   r   zStretch2d.__init__m   s    $$r#   r$   c                 n    |j                  | j                  d      j                  | j                  d      S )zPass the input through the Stretch2d layer.

        Args:
            specgram (Tensor): the input sequence to the Stretch2d layer (..., n_freq, n_time).

        Return:
            Tensor shape: (..., n_freq * freq_scale, n_time * time_scale)
        )repeat_interleaver@   r?   r&   s     r"   r'   zStretch2d.forwards   s0     ))$//2>PPQUQ`Q`bdeer#   r)   r0   s   @r"   r
   r
   _   s8    %3 %C %D %
f 
f6 
fr#   r
   c                   l     e Zd ZdZ	 	 	 	 	 ddee   dedededededd	f fd
Zdedeeef   fdZ	 xZ
S )r   a  Upscale the dimensions of a spectrogram.

    Args:
        upsample_scales: the list of upsample scales.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)

    Examples
        >>> upsamplenetwork = UpsampleNetwork(upsample_scales=[4, 4, 16])
        >>> input = torch.rand(10, 128, 10)  # a random spectrogram
        >>> output = upsamplenetwork(input)  # shape: (10, 128, 1536), (10, 128, 1536)
    upsample_scalesr2   r   r3   r4   r   r   Nc                    t         |           d}|D ]  }||z  }	 || _        |dz
  dz  |z  | _        t	        |||||      | _        t        |d      | _        g }	|D ]  }
t        |
d      }t        j                  ddd|
dz  dz   fd|
fd      }t        j                  j                  j                  |j                  d|
dz  dz   z         |	j                  |       |	j                  |        t        j                  |	 | _        y )Nr      r   F)r   r   r   paddingr         ?)r   r   total_scaleindentr	   resnetr
   resnet_stretchr   Conv2dtorchinit	constant_weightappendr   upsample_layers)r    rH   r2   r   r3   r4   r   rM   upsample_scale	up_layersscalestretchconvr!   s                r"   r   zUpsampleNetwork.__init__   s    	- 	*N>)K	* +"Q1,{:VXxU'Q7	$ 	#Eq)G99AAuqy1};MXY[`WahmD HHMM##DKK	A1FGW%T"	#  "}}i8r#   r$   c                 6   | j                  |      j                  d      }| j                  |      }|j                  d      }|j                  d      }| j	                  |      }|j                  d      dddd| j
                  | j
                   f   }||fS )a  Pass the input through the UpsampleNetwork layer.

        Args:
            specgram (Tensor): the input sequence to the UpsampleNetwork layer (n_batch, n_freq, n_time)

        Return:
            Tensor shape: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale),
                          (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
        where total_scale is the product of all elements in upsample_scales.
        r   N)rO   	unsqueezerP   squeezerW   rN   )r    r$   resnet_outputupsampling_outputs       r"   r'   zUpsampleNetwork.forward   s     H-77:++M:%--a0%%a( 00:-55a8At{{dkk\?Y9YZ -//r#   r;   )r*   r+   r,   r-   r   r.   r   r   r   r'   r/   r0   s   @r"   r   r      s    & 9c9 9 	9
 9 9 9 
9>0 05+@ 0r#   r   c                        e Zd ZdZ	 	 	 	 	 	 	 ddee   dedededededed	ed
ededdf fdZdededefdZe	j                  j                  ddedee   deeee   f   fd       Z xZS )r   aW  WaveRNN model from *Efficient Neural Audio Synthesis* :cite:`wavernn`
    based on the implementation from `fatchord/WaveRNN <https://github.com/fatchord/WaveRNN>`_.

    The original implementation was introduced in *Efficient Neural Audio Synthesis*
    :cite:`kalchbrenner2018efficient`. The input channels of waveform and spectrogram have to be 1.
    The product of `upsample_scales` must equal `hop_length`.

    See Also:
        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wavernn>`__
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        upsample_scales: the list of upsample scales.
        n_classes: the number of output classes.
        hop_length: the number of samples between the starts of consecutive frames.
        n_res_block: the number of ResBlock in stack. (Default: ``10``)
        n_rnn: the dimension of RNN layer. (Default: ``512``)
        n_fc: the dimension of fully connected layer. (Default: ``512``)
        kernel_size: the number of kernel size in the first Conv1d layer. (Default: ``5``)
        n_freq: the number of bins in a spectrogram. (Default: ``128``)
        n_hidden: the number of hidden dimensions of resblock. (Default: ``128``)
        n_output: the number of output dimensions of melresnet. (Default: ``128``)

    Example
        >>> wavernn = WaveRNN(upsample_scales=[5,5,8], n_classes=512, hop_length=200)
        >>> waveform, sample_rate = torchaudio.load(file)
        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
        >>> specgram = MelSpectrogram(sample_rate)(waveform)  # shape: (n_batch, n_channel, n_freq, n_time)
        >>> output = wavernn(waveform, specgram)
        >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, n_classes)
    rH   	n_classes
hop_lengthr2   n_rnnn_fcr   r   r3   r4   r   Nc                    t         |           || _        |dz  r|dz
  n|dz  | _        || _        |
dz  | _        || _        || _        t        t        j                  | j                              | _        d}|D ]  }||z  }	 || j                  k7  rt        d| d|       t        ||||	|
|      | _        t        j                   || j
                  z   dz   |      | _        t        j$                  ||d      | _        t        j$                  || j
                  z   |d      | _        t        j*                  d      | _        t        j*                  d      | _        t        j                   || j
                  z   |      | _        t        j                   || j
                  z   |      | _        t        j                   || j                        | _        y )	NrJ   r      z/Expected: total_scale == hop_length, but found z != T)batch_firstr   )r   r   r   _padre   n_auxrd   rc   r.   mathlog2n_bits
ValueErrorr   upsampler   LinearfcGRUrnn1rnn2r   relu1relu2fc1fc2fc3)r    rH   rc   rd   r2   re   rf   r   r   r3   r4   rM   rX   r!   s                r"   r   zWaveRNN.__init__   s    	&(3a[1_[QN	
]
$"tyy89- 	*N>)K	*$//)N{m[_`j_klmm'fhX`bmn))FTZZ/!3U;FF5%T:	FF54::-u$G	WWT*
WWT*
99UTZZ/699TDJJ.599T4>>2r#   waveformr$   c                 n   |j                  d      dk7  rt        d      |j                  d      dk7  rt        d      |j                  d      |j                  d      }}|j                  d      }t        j                  d|| j
                  |j                  |j                        }t        j                  d|| j
                  |j                  |j                        }| j                  |      \  }}|j                  dd      }|j                  dd      }t        d      D cg c]  }| j                  |z   }}|dddd|d   |d   f   }	|dddd|d   |d   f   }
|dddd|d   |d	   f   }|dddd|d	   |d
   f   }t        j                  |j                  d      ||	gd      }| j                  |      }|}| j                  ||      \  }}||z   }|}t        j                  ||
gd      }| j!                  ||      \  }}||z   }t        j                  ||gd      }| j#                  |      }| j%                  |      }t        j                  ||gd      }| j'                  |      }| j)                  |      }| j+                  |      }|j                  d      S c c}w )a  Pass the input through the WaveRNN model.

        Args:
            waveform: the input waveform to the WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
            specgram: the input spectrogram to the WaveRNN layer (n_batch, 1, n_freq, n_time)

        Return:
            Tensor: shape (n_batch, 1, (n_time - kernel_size + 1) * hop_length, n_classes)
        r   z*Require the input channel of waveform is 1z*Require the input channel of specgram is 1r   )dtypedevicerJ   r=   N   rh   rE   dim)sizero   r_   rR   zerosre   r}   r~   rp   	transposer6   rk   catr^   rr   rt   ru   rx   rv   ry   rw   rz   )r    r{   r$   
batch_sizeh1h2auxiaux_idxa1a2a3a4xresr8   s                   r"   r'   zWaveRNN.forward  s    ==q IJJ==q IJJ%--a0(2B2B12E(]]1%
[[J

(..QYQ`Q`a[[J

(..QYQ`Q`a h/#%%a+mmAq!+084a4::>44AwqzGAJ../AwqzGAJ../AwqzGAJ../AwqzGAJ../IIx))"-x<"EGGAJyyB1GIIq"g2&yyB1GIIq"g2&HHQKJJqMIIq"g2&HHQKJJqMHHQK {{1~7 5s   'J2lengthsc           	         |j                   }|j                  }t        j                  j                  j                  || j                  | j                  f      }| j                  |      \  }}||| j                  j                  z  }g }|j                         \  }}}	t        j                  d|| j                  f||      }
t        j                  d|| j                  f||      }t        j                  |df||      }t        d      D cg c]-  }|dd| j                  |z  | j                  |dz   z  ddf   / }}t        |	      D ]  }|dddd|f   }|D cg c]  }|dddd|f    c}\  }}}}t        j                  |||gd      }| j                  |      }| j!                  |j#                  d      |
      \  }}
||
d   z   }t        j                  ||gd      }| j%                  |j#                  d      |      \  }}||d   z   }t        j                  ||gd      }t'        j(                  | j+                  |            }t        j                  ||gd      }t'        j(                  | j-                  |            }| j/                  |      }t'        j0                  |d      }t        j2                  |d      j5                         }d|z  d| j6                  z  dz
  z  dz
  }|j9                  |        t        j:                  |      j=                  ddd      |fS c c}w c c}w )	a  Inference method of WaveRNN.

        This function currently only supports multinomial sampling, which assumes the
        network is trained on cross entropy loss.

        Args:
            specgram (Tensor):
                Batch of spectrograms. Shape: `(n_batch, n_freq, n_time)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch, )`.
                When the ``specgram`` contains spectrograms with different durations,
                by providing ``lengths`` argument, the model will compute
                the corresponding valid output lengths.
                If ``None``, it is assumed that all the audio in ``waveforms``
                have valid length. Default: ``None``.

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor
                The inferred waveform of size `(n_batch, 1, n_time)`.
                1 stands for a single channel.
            Tensor or None
                If ``lengths`` argument was provided, a Tensor of shape `(batch, )`
                is returned.
                It indicates the valid length in time axis of the output Tensor.
        Nr   )r~   r}   rh   r   r   rJ   rL   )r~   r}   rR   r   
functionalpadrj   rp   rM   r   r   re   r6   rk   r   rr   rt   r^   ru   Frelurx   ry   rz   softmaxmultinomialfloatrn   rV   stackpermute)r    r$   r   r~   r}   r   outputb_sizer8   seq_lenr   r   r   r   	aux_splitm_taa1_ta2_ta3_ta4_tinplogits	posteriors                           r"   inferzWaveRNN.inferK  s   < 88&&**8dii5KLh/# 9 99G!%]]_7[[!VTZZ0uM[[!VTZZ0uMKKF%@OTUVxX!SDJJNTZZ1q5-AA1DEX	Xw 	A1a7#C:C%DQa1aj%D"D$d		1c4.a0A
AIIakk!nb1EArBqE	A))QI1-CIIcmmA.3EArBqE	A		1d)+Atxx{#A		1d)+Atxx{#AXXa[F		&a0I!!)Q/557AADKK#-.4AMM!;	> {{6"**1a3W<<C Y &Es   2L6"L;)r<      r   r=   r(   r(   r(   rB   )r*   r+   r,   r-   r   r.   r   r   r'   rR   jitexportr   r   r   r/   r0   s   @r"   r   r      s    J (3c(3 (3 	(3
 (3 (3 (3 (3 (3 (3 (3 
(3T7 7& 7V 7r YYM=f M=x/? M=5QWYabhYiQiKj M= M=r#   r   )rl   typingr   r   r   rR   torch.nn.functionalr   r   r   r   __all__Moduler   r	   r
   r   r    r#   r"   <module>r      s{     ( (     8ryy  8F(.		 (.Vf		 fBD0bii D0NR=bii R=r#   