
    ujhb                     (   d dl Z d dlZd dlmZmZmZmZ d dlZd dlmZ d dl	m
Z  G d dej                  j                        Z G d dej                  j                        Z G d	 d
ej                  j                        Z G d dej                  j                        Z G d dej                  j                        Z G d dej                  j                        Z G d dej                        Z G d dej                        Zdej*                  dededej*                  fdZd Zd'dej*                  dedededej*                  f
dZd(d ej*                  ded!ededej*                  f
d"Zd#ee   defd$Zd#ee   defd%Zd#ee   defd&Zy))    N)AnyDictListOptional)nn)
functionalc            	            e Zd ZdZddedededef fdZede	j                  fd       Zd	e	j                  de	j                  fd
Z xZS )_ScaledEmbeddingaF  Make continuous embeddings and boost learning rate

    Args:
        num_embeddings (int): number of embeddings
        embedding_dim (int): embedding dimensions
        scale (float, optional): amount to scale learning rate (Default: 10.0)
        smooth (bool, optional): choose to apply smoothing (Default: ``False``)
    num_embeddingsembedding_dimscalesmoothc                    t         |           t        j                  ||      | _        |rt        j                  | j                  j                  j                  d      }|t        j                  d|dz         j                         d d d f   z  }|| j                  j                  j                  d d  | j                  j                  xj                  |z  c_        || _        y )Nr   dim   )super__init__r   	Embedding	embeddingtorchcumsumweightdataarangesqrtr   )selfr   r   r   r   r   	__class__s         X/var/www/html/dev/engine/venv/lib/python3.12/site-packages/torchaudio/models/_hdemucs.pyr   z_ScaledEmbedding.__init__-   s    nmD\\$.."7"7"<"<!DFell1nq.@AFFHDQQF,2DNN!!&&q)""e+"
    returnc                 H    | j                   j                  | j                  z  S N)r   r   r   )r   s    r   r   z_ScaledEmbedding.weight8   s    ~~$$tzz11r    xc                 B    | j                  |      | j                  z  }|S )zForward pass for embedding with scale.
        Args:
            x (torch.Tensor): input tensor of shape `(num_embeddings)`

        Returns:
            (Tensor):
                Embedding output of shape `(num_embeddings, embedding_dim)`
        )r   r   )r   r$   outs      r   forwardz_ScaledEmbedding.forward<   s      nnQ$**,
r    )g      $@F)__name__
__module____qualname____doc__intfloatboolr   propertyr   Tensorr   r'   __classcell__r   s   @r   r
   r
   #   sd    	s 	3 	u 	]a 	 2 2 2
 
%,, 
r    r
   c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 ddededededededed	ed
edeeee	f      def fdZ
ddej                  deej                     dej                  fdZ xZS )
_HEncLayerat  Encoder layer. This used both by the time and the frequency branch.
    Args:
        chin (int): number of input channels.
        chout (int): number of output channels.
        kernel_size (int, optional): Kernel size for encoder (Default: 8)
        stride (int, optional): Stride for encoder layer (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 4)
        empty (bool, optional): used to make a layer with just the first conv. this is used
            before merging the time and freq. branches. (Default: ``False``)
        freq (bool, optional): boolean for whether conv layer is for frequency domain (Default: ``True``)
        norm_type (string, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        context (int, optional): context size for the 1x1 conv. (Default: 0)
        dconv_kw (Dict[str, Any] or None, optional): dictionary of kwargs for the DConv class. (Default: ``None``)
        pad (bool, optional): true to pad the input. Padding is done so that the output size is
            always the input size / stride. (Default: ``True``)
    chinchoutkernel_sizestridenorm_groupsemptyfreq	norm_typecontextdconv_kwpadc                 \   t         |           |
i }
d }|dk(  rfd}|r|dz  nd}t        j                  }|| _        || _        || _        || _        || _        |r|dg}|dg}|dg}t        j                  } ||||||      | _
         ||      | _        | j                  rLt        j                         | _        t        j                         | _        t        j                         | _        y  ||d|z  dd|	z  z   d|	      | _         |d|z        | _        t!        |fi |
| _        y )Nc                 *    t        j                         S r#   r   Identityds    r   <lambda>z%_HEncLayer.__init__.<locals>.<lambda>m       BKKM r    
group_normc                 0    t        j                  |       S r#   r   	GroupNormrE   r9   s    r   rF   z%_HEncLayer.__init__.<locals>.<lambda>o       [! < r       r   r      )r   r   r   Conv1dr;   r7   r8   r:   r?   Conv2dconvnorm1rC   rewritenorm2dconv_DConv)r   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   norm_fnpad_valklassr   s        `         r   r   z_HEncLayer.__init__\   s    	H)$<G&)+"q			&
&*Ka[FlGIIE${FGD	U^
::;;=DLDJDJ E	1q7{?AwODL U+DJ22DJr    r$   injectr!   c                    | j                   s7|j                         dk(  r$|j                  \  }}}}|j                  |d|      }| j                   sS|j                  d   }|| j                  z  dk(  s2t        j                  |d| j                  || j                  z  z
  f      }| j                  |      }| j                  r|S |a|j                  d   |j                  d   k7  rt        d      |j                         dk(  r|j                         dk(  r|dddddf   }||z   }t        j                  | j                  |            }| j                   rn|j                  \  }}}}|j                  dddd      j                  d||      }| j                  |      }|j                  ||||      j                  dddd      }n| j                  |      }| j                  | j!                  |            }	t        j"                  |	d	      }	|	S )
a]  Forward pass for encoding layer.

        Size depends on whether frequency or time

        Args:
            x (torch.Tensor): tensor input of shape `(B, C, F, T)` for frequency and shape
                `(B, C, T)` for time
            inject (torch.Tensor, optional): on last layer, combine frequency and time branches through inject param,
                same shape as x (default: ``None``)

        Returns:
            Tensor
                output tensor after encoder layer of shape `(B, C, F / stride, T)` for frequency
                    and shape `(B, C, ceil(T / stride))` for time
        rN   r   NzInjection shapes do not align   rO   r   r   )r;   r   shapeviewr8   Fr?   rR   r:   
ValueErrorgelurS   permutereshaperV   rU   rT   glu)
r   r$   r[   BCFrTleyzs
             r   r'   z_HEncLayer.forward   s   " yyQUUW\''KAq"aq"a AyyB#q(EE!aT[[0@!ABCIIaL::H||B1772;. !@AAzz|q QUUW\1d
+F
AFF4::a=!99''KAq"a		!Q1%--b!Q7A

1Aq"a#++Aq!Q7A

1AJJt||A'EE!Or    )	   rN   rN   FTrH   r   NTr#   r(   r)   r*   r+   r,   r.   strr   r   r   r   r   r0   r'   r1   r2   s   @r   r4   r4   I   s    * %-1*3*3 *3 	*3
 *3 *3 *3 *3 *3 *3 4S>**3 *3X, ,x/E ,QVQ]Q] ,r    r4   c                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 ddededededededed	ed
ededeeee	f      def fdZ
dej                  deej                     fdZ xZS )
_HDecLayera  Decoder layer. This used both by the time and the frequency branches.
    Args:
        chin (int): number of input channels.
        chout (int): number of output channels.
        last (bool, optional): whether current layer is final layer (Default: ``False``)
        kernel_size (int, optional): Kernel size for encoder (Default: 8)
        stride (int): Stride for encoder layer (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 1)
        empty (bool, optional): used to make a layer with just the first conv. this is used
            before merging the time and freq. branches. (Default: ``False``)
        freq (bool, optional): boolean for whether conv layer is for frequency (Default: ``True``)
        norm_type (str, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        context (int, optional): context size for the 1x1 conv. (Default: 1)
        dconv_kw (Dict[str, Any] or None, optional): dictionary of kwargs for the DConv class. (Default: ``None``)
        pad (bool, optional): true to pad the input. Padding is done so that the output size is
            always the input size / stride. (Default: ``True``)
    r5   r6   lastr7   r8   r9   r:   r;   r<   r=   r>   r?   c                    t         |           |i }d }|	dk(  rfd}|r||z
  dz  dk7  rt        d      ||z
  dz  }nd}|| _        || _        || _        || _        || _        || _        || _	        t        j                  }t        j                  }|r(|dg}|dg}t        j                  }t        j                  } |||||      | _         ||      | _        | j                  r3t        j"                         | _        t        j"                         | _        y  ||d|z  dd|
z  z   d|
      | _         |d|z        | _        y )Nc                 *    t        j                         S r#   rB   rD   s    r   rF   z%_HDecLayer.__init__.<locals>.<lambda>   rG   r    rH   c                 0    t        j                  |       S r#   rJ   rL   s    r   rF   z%_HDecLayer.__init__.<locals>.<lambda>   rM   r    rO   r   z#Kernel size and stride do not alignr   )r   r   rb   r?   rs   r;   r5   r:   r8   r7   r   rP   ConvTranspose1drQ   ConvTranspose2dconv_trrU   rC   rT   rS   )r   r5   r6   rs   r7   r8   r9   r:   r;   r<   r=   r>   r?   rX   rZ   klass_trr   s         `         r   r   z_HDecLayer.__init__   s@    	H)$<Gf$)Q. !FGG'A-CC			
&		%%&*Ka[FIIE))He[&AU^
::;;=DLDJ q4xQ[!WMDL T*DJr    r$   skipc                    | j                   rA|j                         dk(  r.|j                  \  }}}|j                  || j                  d|      }| j
                  s;||z   }t        j                  | j                  | j                  |            d      }n|}|t        d      | j                  | j                  |            }| j                   r.| j                  r_|d| j                  | j                   ddf   }n=|d| j                  | j                  |z   f   }|j                  d   |k7  rt        d      | j                  st        j                  |      }||fS )	a,  Forward pass for decoding layer.

        Size depends on whether frequency or time

        Args:
            x (torch.Tensor): tensor input of shape `(B, C, F, T)` for frequency and shape
                `(B, C, T)` for time
            skip (torch.Tensor, optional): on first layer, separate frequency and time branches using param
                (default: ``None``)
            length (int): Size of tensor for output

        Returns:
            (Tensor, Tensor):
                Tensor
                    output tensor after decoder layer of shape `(B, C, F * stride, T)` for frequency domain except last
                        frequency layer shape is `(B, C, kernel_size, T)`. Shape is `(B, C, stride * T)`
                        for time domain.
                Tensor
                    contains the output just before final transposed convolution, which is used when the
                        freq. and time branch separate. Otherwise, does not matter. Shape is
                        `(B, C, F, T)` for frequency and `(B, C, T)` for time.
        r^   r]   r   r   Nz%Skip must be none when empty is true..z'Last index of z must be equal to length)r;   r   r_   r`   r5   r:   ra   rf   rS   rT   rb   rU   ry   r?   rs   rc   )	r   r$   r{   lengthrg   rh   rj   rl   rm   s	            r   r'   z_HDecLayer.forward   s$   . 99AggGAq!q$))R+AzzDAdjja1q9AA !HIIJJt||A'99xxc488txxi/23#txx$((V"3334Awwr{f$ !JKKyyq	A!tr    )
Frn   rN   r   FTrH   r   NTro   r2   s   @r   rr   rr      s    , %-10+0+ 0+ 	0+
 0+ 0+ 0+ 0+ 0+ 0+ 0+ 4S>*0+ 0+d. .Xell-C .r    rr   c            +           e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d$dee   dedededededed	ed
ededededededededededededef* fdZ	d Z
d%dZd&dej                  dedededef
dZd  Zd! Zd"ej                  fd#Z xZS )'HDemucsa#
  Hybrid Demucs model from
    *Hybrid Spectrogram and Waveform Source Separation* :cite:`defossez2021hybrid`.

    See Also:
        * :class:`torchaudio.pipelines.SourceSeparationBundle`: Source separation pipeline with pre-trained models.

    Args:
        sources (List[str]): list of source names. List can contain the following source
            options: [``"bass"``, ``"drums"``, ``"other"``, ``"mixture"``, ``"vocals"``].
        audio_channels (int, optional): input/output audio channels. (Default: 2)
        channels (int, optional): initial number of hidden channels. (Default: 48)
        growth (int, optional): increase the number of hidden channels by this factor at each layer. (Default: 2)
        nfft (int, optional): number of fft bins. Note that changing this requires careful computation of
            various shape parameters and will not work out of the box for hybrid models. (Default: 4096)
        depth (int, optional): number of layers in encoder and decoder (Default: 6)
        freq_emb (float, optional): add frequency embedding after the first frequency layer if > 0,
            the actual value controls the weight of the embedding. (Default: 0.2)
        emb_scale (int, optional): equivalent to scaling the embedding learning rate (Default: 10)
        emb_smooth (bool, optional): initialize the embedding with a smooth one (with respect to frequencies).
            (Default: ``True``)
        kernel_size (int, optional): kernel_size for encoder and decoder layers. (Default: 8)
        time_stride (int, optional): stride for the final time layer, after the merge. (Default: 2)
        stride (int, optional): stride for encoder and decoder layers. (Default: 4)
        context (int, optional): context for 1x1 conv in the decoder. (Default: 4)
        context_enc (int, optional): context for 1x1 conv in the encoder. (Default: 0)
        norm_starts (int, optional): layer at which group norm starts being used.
            decoder layers are numbered in reverse order. (Default: 4)
        norm_groups (int, optional): number of groups for group norm. (Default: 4)
        dconv_depth (int, optional): depth of residual DConv branch. (Default: 2)
        dconv_comp (int, optional): compression of DConv branch. (Default: 4)
        dconv_attn (int, optional): adds attention layers in DConv branch starting at this layer. (Default: 4)
        dconv_lstm (int, optional): adds a LSTM layer in DConv branch starting at this layer. (Default: 4)
        dconv_init (float, optional): initial scale for the DConv branch LayerScale. (Default: 1e-4)
    sourcesaudio_channelschannelsgrowthnfftdepthfreq_emb	emb_scale
emb_smoothr7   time_strider8   r=   context_encnorm_startsr9   dconv_depth
dconv_comp
dconv_attn
dconv_lstm
dconv_initc                    t         +|           || _        || _        || _        || _        |
| _        || _        || _        || _	        | j                  dz  | _
        d | _        t        j                         | _        t        j                         | _        t        j                         | _        t        j                         | _        |}|dz  }|}|}| j                  dz  }t%        | j                        D ]  }||k\  }||k\  }||k\  rdnd}|dkD  }|} |
}!|s|dk7  rt'        d      |dz  }!|} d}"d}#|r||
k  r|}!d}"d}#|!| ||"|||||||d	d
}$t)        |$      }%d|%d<   |
|%d<   ||%d<   d|%d<   t)        |$      }&|#rt+        ||      }|}t-        ||fd|i|$}'|r>|#du r|dk(  r
d|%d<   d|%d<   t-        ||f||#d|%}(| j                   j/                  |(       | j                  j/                  |'       |dk(  r'| j                  t1        | j
                        z  }|dz  }t3        ||f|dk(  |d|&})|r0t3        ||f|#|dk(  |d|%}*| j"                  j5                  d|*       | j                  j5                  d|)       |}|}t7        ||z        }t7        ||z        }|r||
k  rd}n||z  }|dk(  s|st9        |||	|      | _        || _         t=        |        y )NrN   rO   rH   noner   z$When freq is false, freqs must be 1.TF)lstmattnr   compressinit)r7   r8   r;   r?   r<   r9   r>   r   r;   r7   r8   r?   r=      )r=   r:   )rs   r=   )r:   rs   r=   )r   r   )r   r   r   r   r   r   r7   r=   r8   r   
hop_lengthr   r   
ModuleListfreq_encoderfreq_decodertime_encodertime_decoderrangerb   dictmaxr4   appendlenrr   insertr,   r
   freq_emb_scale_rescale_module),r   r   r   r   r   r   r   r   r   r   r7   r   r8   r=   r   r   r9   r   r   r   r   r   r5   chin_zr6   chout_zfreqsindexr   r   r<   r;   strikerr?   	last_freqkwkwtkw_decenctencdectdecr   s,                                              r   r   zHDemucs.__init__Q  sP   0 	
	,& ))q.MMOMMOMMOMMO		Q4::& K	/EJ&DJ&D(-(<&I19DDCA:$%KLL!Ao"CI, 	  #&*  ( *&B r(CCK!,C"CMCJ"XFeW-VWHkHRHC$$%CM)*C&!$[{)[WZ[!!((.$$S)z**S->>WfY5A:wYRXYC!%hYUaZY`hdgh!!((D1$$Q,DF'E&7*+GK'Ef$Ezh 0zYb c&.#WK	/Z 	r    c                    | j                   }| j                  }|}||dz  k7  rt        d      t        t	        j
                  |j                  d   |z              }|dz  dz  }| j                  |||||z  z   |j                  d   z
  d      }t        |||      dd dd d f   }|j                  d   |dz   k7  rt        d	      |ddd|z   f   }|S )
NrN   zHop length must be nfft // 4r]   rO   r^   reflect)mode.zESpectrogram's last dimension must be 4 + input size divided by stride)	r   r   rb   r,   mathceilr_   _pad1d_spectro)r   r$   hlr   x0rk   r?   rm   s           r   _speczHDemucs._spec  s    __yy ?;<<1772;+,-AgkKK3b2g ;)KLQb!#ssA+.772;"q& deec1q2v:or    c                    | j                   }t        j                  |g d      }t        j                  |ddg      }|dz  dz  }|t        t	        j
                  ||z              z  d|z  z   }t        |||      }|d|||z   f   }|S )N)r   r   r   r   rO   r^   )r}   .)r   ra   r?   r,   r   r   	_ispectro)r   rm   r}   r   r?   rk   r$   s          r   _ispeczHDemucs._ispec  s    __EE!\"EE!aVAgk#dii,--C7aB'c3v%%&r    r$   padding_leftpadding_rightr   valuec                     |j                   d   }|dk(  r/t        ||      }||k  rt        j                  |d||z
  dz   f      }t        j                  |||f||      S )zWrapper around F.pad, in order for reflect padding when num_frames is shorter than max_pad.
        Add extra zero padding around in order for padding to not break.r]   r   r   r   )r_   r   ra   r?   )r   r$   r   r   r   r   r}   max_pads           r   r   zHDemucs._pad1d  sf     9,6G EE!a6!1A!567uuQ}5tUCCr    c                     |j                   \  }}}}t        j                  |      j                  ddddd      }|j	                  ||dz  ||      }|S )Nr   r   rN   rO   r^   )r_   r   view_as_realrd   re   )r   rm   rg   rh   ri   rj   ms          r   
_magnitudezHDemucs._magnitude  sS    gg1b!q!))!Q1a8IIaQA&r    c                     |j                   \  }}}}}|j                  ||dd||      j                  dddddd      }t        j                  |j                               }|S )Nr]   rO   r   r   rN      r^   )r_   r`   rd   r   view_as_complex
contiguous)r   r   rg   Srh   ri   rj   r&   s           r   _maskzHDemucs._mask  s^    1aQffQ2q"a(00Aq!QB##CNN$45
r    inputc                    |j                   dk7  rt        d|j                         |j                  d   | j                  k7  rt        d|j                  d    d      |}|j                  d   }| j	                  |      }| j                  |      }|}|j                  \  }}}}	|j                  dd	      }
|j                  dd	      }||
z
  d
|z   z  }|}|j                  dd	      }|j                  dd	      }||z
  d
|z   z  }g }g }g }g }t        | j                        D ]7  \  }}|j                  |j                  d          d}|t        | j                        k  rU|j                  |j                  d          | j                  |   } ||      }|j                  s|j                  |       n|} |||      }|dk(  r| j                  yt        j                   |j                  d   |j"                        }| j                  |      j%                         ddddddf   j'                  |      }|| j(                  |z  z   }|j                  |       : t        j*                  |      }t        j*                  |      }t        | j,                        D ]  \  }}|j/                  d      } ||||j/                  d            \  }}| j0                  t        | j2                        z
  }||k\  s[| j2                  ||z
     }|j/                  d      }|j                  rD|j                  d   dk7  rt        d|j                         |dddddf   } ||d|      \  }}|j/                  d      } ||||      \  }} t        |      dk7  rt5        d      t        |      dk7  rt5        d      t        |      dk7  rt5        d      t        | j6                        } |j9                  || d||	      }||dddf   z  |
dddf   z   }| j;                  |      }!| j=                  |!|      }|j9                  || d|      }||dddf   z  |dddf   z   }||z   }|S )a  HDemucs forward call

        Args:
            input (torch.Tensor): input mixed tensor of shape `(batch_size, channel, num_frames)`

        Returns:
            Tensor
                output tensor split into sources of shape `(batch_size, num_sources, channel, num_frames)`
        r^   zDExpected 3D tensor with dimensions (batch, channel, frames). Found: r   zZThe channel dimension of input Tensor must match `audio_channels` of HDemucs model. Found:.r]   )r   rO   r^   T)r   keepdimgh㈵>)r   rO   Nr   )devicerO   z0If tdec empty is True, pre shape does not match zsaved is not emptyzlengths_t is not emptyzsaved_t is not empty)ndimrb   r_   r   r   r   meanstd	enumerater   r   r   r   r:   r   r   r   r   t	expand_asr   
zeros_liker   popr   r   AssertionErrorr   r`   r   r   )"r   r   r$   r}   rm   magrg   rh   Fqrj   r   r   xtmeantstdtsavedsaved_tlengths	lengths_tidxencoder[   r   frsembdecoder{   preoffsetr   length_t_r   zouts"                                     r   r'   zHDemucs.forward  s\    ::?cdidodocpqrr;;q>T000Q(+ 
 JJuooa gg1b! vv)Tv2ee	4e0X$*% FD1vv&$v/5jTD[)!	$T%6%67 	KCNN1772;'FS**++  ".((-"XzzNN2&  Fq&!AaxDMM5 ll1772;qxx@mmC(**,T1a-=>HHK++c11LLO/	2 Qa  %T%6%67 	5KC99R=DAtW[[_5FAs ZZ#d&7&7"88Ff}((v6$==,::yy|q((+[\_\e\e[f)ghhaAg,C dH5EB";;r?D T84EB!	5$ u:? !566y>Q !9::w<1 !788FF1aR#AtGtAtG},zz!}KKf%WWQ2v&$q$w-%4.0Fr    )rO   0   rO         g?
   Trn   rO   rN   r   r   rN   rN   rO   rN   rN   rN   -C6?r#   )zerog        )r(   r)   r*   r+   r   rp   r,   r-   r.   r   r   r   r   r0   r   r   r   r'   r1   r2   s   @r   r   r   -  s   !L   -~c~ ~ 	~
 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~  !~" #~$ %~& '~( )~* +~, -~@0D DC D DSV Dhm DoU\\ or    r   c                   f     e Zd ZdZ	 	 	 	 	 	 	 	 	 ddededededededed	ed
edef fdZd Z	 xZ
S )rW   a  
    New residual branches in each encoder layer.
    This alternates dilated convolutions, potentially with LSTMs and attention.
    Also before entering each residual branch, dimension is projected on a smaller subspace,
    e.g. of dim `channels // compress`.

    Args:
        channels (int): input/output channels for residual branch.
        compress (float, optional): amount of channel compression inside the branch. (default: 4)
        depth (int, optional): number of layers in the residual branch. Each layer has its own
            projection, and potentially LSTM and attention.(default: 2)
        init (float, optional): initial scale for LayerNorm. (default: 1e-4)
        norm_type (bool, optional): Norm type, either ``group_norm `` or ``none`` (Default: ``group_norm``)
        attn (bool, optional): use LocalAttention. (Default: ``False``)
        heads (int, optional): number of heads for the LocalAttention.  (default: 4)
        ndecay (int, optional): number of decay controls in the LocalAttention. (default: 4)
        lstm (bool, optional): use LSTM. (Default: ``False``)
        kernel_size (int, optional): kernel size for the (dilated) convolutions. (default: 3)
    r   r   r   r   r<   r   headsndecayr   r7   c                 &   t         |           |
dz  dk(  rt        d      || _        || _        t        |      | _        |dkD  }d }|dk(  rd }t        ||z        }t        j                  }t        j                  g       | _        t        | j                        D ]  }|rt        d|      nd}||
dz  z  }t        j                  |||
||       ||       |       t        j                  |d|z  d       |d|z        t        j                  d      t!        ||      g}|r|j#                  d	t%        |||
             |	r|j#                  d	t'        |dd             t        j(                  | }| j                  j+                  |        y )NrO   r   z(Kernel size should not be divisible by 2c                 *    t        j                         S r#   rB   rD   s    r   rF   z!_DConv.__init__.<locals>.<lambda>  rG   r    rH   c                 .    t        j                  d|       S )Nr   rJ   rD   s    r   rF   z!_DConv.__init__.<locals>.<lambda>  s    Q 2 r    r   )dilationpaddingr^   )r   r   T)layersr{   )r   r   rb   r   r   absr   r,   r   GELUr   r   r   powrP   GLU_LayerScaler   _LocalState_BLSTM
Sequentialr   )r   r   r   r   r   r<   r   r   r   r   r7   dilaterX   hiddenactrE   r   r   modslayerr   s                       r   r   z_DConv.__init__  so    	?aGHH  Z
 *$2GX()ggmmB'tzz" 	&A$*s1ayH+"23G		(FK(T[\		&!h,2H%q	Hd+D A{6vNOAvfQTBCMM4(EKKu%#	&r    c                 >    | j                   D ]  }| ||      z   } |S )zDConv forward call

        Args:
            x (torch.Tensor): input tensor for convolution

        Returns:
            Tensor
                Output after being run through layers.
        )r   )r   r$   r  s      r   r'   z_DConv.forward  s)     [[ 	EE!HA	r    )	rN   rO   r   rH   FrN   rN   Fr^   )r(   r)   r*   r+   r,   r-   rp   r.   r   r'   r1   r2   s   @r   rW   rW   }  s    . %1&1& 1& 	1&
 1& 1& 1& 1& 1& 1& 1&fr    rW   c                   f     e Zd ZdZddedef fdZdej                  dej                  fdZ	 xZ
S )	r  ae  
    BiLSTM with same hidden units as input dim.
    If `max_steps` is not None, input will be splitting in overlapping
    chunks and the LSTM applied separately on each chunk.
    Args:
        dim (int): dimensions at LSTM layer.
        layers (int, optional): number of LSTM layers. (default: 1)
        skip (bool, optional): (default: ``False``)
    r   r{   c                     t         |           d| _        t        j                  d|||      | _        t        j                  d|z  |      | _        || _        y )N   T)bidirectional
num_layershidden_size
input_sizerO   )	r   r   	max_stepsr   LSTMr   Linearlinearr{   )r   r   r   r{   r   s       r   r   z_BLSTM.__init__  sI    GG$6s_bc	iiC-	r    r$   r!   c           	      B   |j                   \  }}}|}d}d}d}d}	| j                  c|| j                  kD  rT| j                  }|dz  }t        |||      }
|
j                   d   }	d}|
j                  dddd      j	                  d||      }|j                  ddd      }| j                  |      d   }| j                  |      }|j                  ddd      }|rg }|j	                  |d||      }
|dz  }t        |	      D ]m  }|dk(  r |j                  |
dd|ddd| f          (||	dz
  k(  r|j                  |
dd|dd|df          O|j                  |
dd|dd|| f          o t        j                  |d      }|d	d|f   }|}| j                  r||z   }|S )
a  BLSTM forward call

        Args:
            x (torch.Tensor): input tensor for BLSTM shape is `(batch_size, dim, time_steps)`

        Returns:
            Tensor
                Output after being run through bidirectional LSTM. Shape is `(batch_size, dim, time_steps)`
        Fr   NrO   Tr   r^   r]   .)r_   r  _unfoldrd   re   r   r  r   r   r   catr{   )r   r$   rg   rh   rj   rl   framedwidthr8   nframesframesr&   limitks                 r   r'   z_BLSTM.forward  s    ''1a>>%!dnn*<NNEaZFQv.Fll1oGFq!Q*222q%@AIIaAIIaLOKKNIIaACYYq"a/FaKE7^ >6JJvaAww&678'A+%JJvaAuvo67JJvaAueV|&;<=> ))C$Cc2A2g,CA99AAr    )r   F)r(   r)   r*   r+   r,   r.   r   r   r0   r'   r1   r2   s   @r   r  r    s6    C 4 . .%,, .r    r  c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ xZ	S )
r  a   Local state allows to have attention based only on data (no positional embedding),
    but while setting a constraint on the time window (e.g. decaying penalty term).
    Also a failed experiments with trying to provide some frequency based attention.
    r   r   r   c                 t   t         t        |           ||z  dk7  rt        d      || _        || _        t        j                  ||d      | _        t        j                  ||d      | _	        t        j                  ||d      | _
        t        j                  |||z  d      | _        |rm| j                  j                  xj                  dz  c_        | j                  j                  t        d      d| j                  j                  j                  dd t        j                  ||dz  z   |d      | _        y)z
        Args:
            channels (int): Size of Conv1d layers.
            heads (int, optional):  (default: 4)
            ndecay (int, optional): (default: 4)
        r   z$Channels must be divisible by heads.r   g{Gz?Nzbias must not be None.r   )r   r  r   rb   r   r   r   rP   contentquerykeyquery_decayr   r   biasproj)r   r   r   r   r   s       r   r   z_LocalState.__init__  s    	k4)+eq CDD
yy8Q7YYx15
99Xx399Xuv~qA##((D0($$, !9::,.D!!&&q)IIh2Ha@	r    r$   r!   c                    |j                   \  }}}| j                  }t        j                  ||j                  |j
                        }|dddf   |dddf   z
  }| j                  |      j                  ||d|      }| j                  |      j                  ||d|      }	t        j                  d|	|      }
|
t        j                  |	j                   d         z  }
| j                  rt        j                  d| j                  dz   |j                  |j
                        }| j                  |      j                  ||d|      }t        j                  |      dz  }|j                  ddd       |j                         z  t        j                  | j                        z  }|
t        j                  d||      z  }
|
j!                  t        j"                  ||
j                  t        j$                        d       t        j&                  |
d	      }| j)                  |      j                  ||d|      }t        j                  d
||      }|j+                  |d|      }|| j-                  |      z   S )zLocalState forward call

        Args:
            x (torch.Tensor): input tensor for LocalState

        Returns:
            Tensor
                Output after being run through LocalState layer.
        )r   dtypeNr]   zbhct,bhcs->bhtsrO   r   zfts,bhfs->bhtsir   zbhts,bhct->bhcs)r_   r   r   r   r   r,  r&  r`   r'  einsumr   r   r   r(  sigmoidr  masked_fill_eyer.   softmaxr%  re   r*  )r   r$   rg   rh   rj   r   indexesdeltaquerieskeysdotsdecaysdecay_qdecay_kernelweightsr%  results                    r   r'   z_LocalState.forward6  s    ''1a

,,qA4 747#33**Q-$$Qr15xx{5"a0||-tW=		$**Q-((;;\\!T[[1_QXXQWWUF&&q)..q%Q?GmmG,q0G"KKAq11EIIK?$))DKKBXXLELL!1<IID 	%))AdkkLdS--!,,,q/&&q%Q7/'B2q)499V$$$r    )rN   rN   )
r(   r)   r*   r+   r,   r   r   r0   r'   r1   r2   s   @r   r  r    sA    
A AS Ac A2#% #%%,, #%r    r  c                   f     e Zd ZdZddedef fdZdej                  dej                  fdZ	 xZ
S )	r  zLayer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales diagonally residual outputs close to 0 initially, then learnt.
    r   r   c                     t         |           t        j                  t	        j
                  |d            | _        || j                  j                  dd y)z
        Args:
            channels (int): Size of  rescaling
            init (float, optional): Scale to default to (default: 0)
        T)requires_gradN)r   r   r   	Parameterr   zerosr   r   )r   r   r   r   s      r   r   z_LayerScale.__init__a  s=     	\\%++hd"KL
!

r    r$   r!   c                 .    | j                   dddf   |z  S )zLayerScale forward call

        Args:
            x (torch.Tensor): input tensor for LayerScale

        Returns:
            Tensor
                Output after rescaling tensor.
        N)r   )r   r$   s     r   r'   z_LayerScale.forwardk  s     zz!T'"Q&&r    )r   )r(   r)   r*   r+   r,   r-   r   r   r0   r'   r1   r2   s   @r   r  r  \  s6    " "E "
' 
'%,, 
'r    r  ar7   r8   r!   c                    t        | j                  dd       }t        | j                  d         }t        j                  ||z        }|dz
  |z  |z   }t        j                  | d||z
  g      } t        | j                               D cg c]  }| j                  |       }}|d   dk7  rt        d      |dd |dgz   }|j                  |       |j                  |       | j                  ||      S c c}w )zGiven input of size [*OT, T], output Tensor of size [*OT, F, K]
    with K the kernel size, by extracting frames with the given stride.
    This will pad the input so that `F = ceil(T / K)`.
    see https://github.com/pytorch/pytorch/issues/60466
    Nr]   r   r   )r   r?   zData should be contiguous.)listr_   r,   r   r   ra   r?   r   r   r8   rb   r   
as_strided)	rB  r7   r8   r_   r}   n_frames
tgt_lengthr   stridess	            r   r  r  x  s     "EFyy&)HQ,&(;6J	AAzF234A(-aeeg7qxx}7G7r{a566crlfa[(G	LL	LL<<w'' 8s   C6c                    | j                         D ]  }t        |t        j                  t        j                  t        j
                  t        j                  f      sL|j                  j                         j                         }|dz  dz  }|j                  xj                  |z  c_
        |j                  |j                  xj                  |z  c_
         y)zI
    Rescales initial weight scale for all models within the module.
    g?g      ?N)modules
isinstancer   rP   rw   rQ   rx   r   r   detachr   r)  )modulesubr   r   s       r   r   r     s     ~~ 'cBIIr'9'9299bFXFXYZ**.."))+C3Y3&EJJOOu$Oxx#&'r    r$   n_fftr   r?   c                 z   t        | j                  d d       }t        | j                  d         }| j                  d|      } t	        j
                  | |d|z   z  |t	        j                  |      j                  |       |dddd	      }|j                  \  }}}	|j                  ||	g       |j                  |      S )Nr]   r   Tr   )window
win_length
normalizedcenterreturn_complexpad_mode)
rD  r_   r,   re   r   stfthann_windowtoextendr`   )
r$   rO  r   r?   otherr}   rm   r   r   frames
             r   r   r     s    "EF			"fA

	S  '**1-
	A ggOAue	LL% 66%=r    rm   r}   c           
         t        | j                  d d       }t        | j                  d         }t        | j                  d         }d|z  dz
  }| j                  d||      } |d|z   z  }t	        j
                  | ||t	        j                  |      j                  | j                        |d|d      }	|	j                  \  }
}|j                  |       |	j                  |      S )Nr   r]   rO   r   T)rQ  rR  rS  r}   rT  )
rD  r_   r,   r`   r   istftrX  rY  realr   )rm   r   r}   r?   r[  r   r   rO  rR  r$   r   s              r   r   r     s    "EEFIME	r5&!A1s7#J	  ,//7		A IAv	LL66%=r    r   c                     t        | dd      S )zBuilds low nfft (1024) version of :class:`HDemucs`, suitable for sample rates around 8 kHz.

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    i   r   r   r   r   r   r   s    r   hdemucs_lowrd         7Q77r    c                     t        | dd      S )a  Builds medium nfft (2048) version of :class:`HDemucs`, suitable for sample rates of 16-32 kHz.

    .. note::

        Medium HDemucs has not been tested against the original Hybrid Demucs as this nfft and depth configuration is
        not compatible with the original implementation in https://github.com/facebookresearch/demucs

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    r   r   ra  rb  rc  s    r   hdemucs_mediumrg    s      7Q77r    c                     t        | dd      S )zBuilds medium nfft (4096) version of :class:`HDemucs`, suitable for sample rates of 44.1-48 kHz.

    Args:
        sources (List[str]): See :py:func:`HDemucs`.

    Returns:
        HDemucs:
            HDemucs model.
    r   r   ra  rb  rc  s    r   hdemucs_highri    re  r    )i   r   r   )r   r   r   )r   typingtpr   r   r   r   r   r   torch.nnr   ra   Moduler
   r4   rr   r   rW   r  r  r  r0   r,   r  r   r   r   rp   rd  rg  ri   r    r   <module>ro     s  4   , ,   $#uxx #Lk k\s slMehhoo M`
TUXX__ Tn@UXX__ @FB%")) B%J'")) '8(u|| (# (s (u|| ((
' S C # V[VbVb ( 3 C # V[VbVb .8c 8w 88DI 8' 8&8$s) 8 8r    