
    '}hLc              
           U d dl Z d dlZd dlmZmZmZmZmZmZ d dl	Z	d dl
mZ ddlmZ ddgZ e       s:d dlZ G d d	      Zd
 Zeej$                  d   _        eej$                  d   _        yd dlmZmZmZmZmZmZmZmZmZ  e j>                  e       Z!er	 d dl"m#Z#  G d d      Z& e&       Z'e&e(d<   dde)fdZ* G d d      Zddde)dee+df   deee)df      defdZy# e$$ r e!jK                  d       Y \w xY w)    N)DictListOptionalTupleTYPE_CHECKINGUnion)is_available   )not_noneinit_device_mesh
DeviceMeshc                       e Zd Zy)_DeviceMeshStubN)__name__
__module____qualname__     \/var/www/html/test/engine/venv/lib/python3.12/site-packages/torch/distributed/device_mesh.pyr   r      s    r   r   c                       y Nr   r   r   r   _init_device_mesh_stubr      s    r   ztorch.distributed.device_mesh)	_find_pg_by_ranks_and_tag_get_default_group_get_group_tagget_rankget_world_sizeinit_process_groupis_initialized	new_groupProcessGroup)	ArrayLikezCDeviceMesh requires numpy >= 1.21 to be installed for type checkingc                       e Zd ZddZddZdddededdfd	Zddded   fd
Z	dddee   fdZ
ededefd       Zededefd       ZdddedefdZy)_MeshEnvreturnNc                 .    g | _         i | _        i | _        y r   )
mesh_stackchild_to_parent_mappingparent_to_child_mappingselfs    r   __init__z_MeshEnv.__init__<   s    02DOIKD(TVD(r   r   c                 f    t        | j                        dk(  rt        d      | j                  d   S )Nr   z#No device mesh is currently active!)lenr'   RuntimeErrorr*   s    r   get_current_meshz_MeshEnv.get_current_meshA   s.    4??#q("#HII??2&&r   device_meshmesh_dimmesh_dim_namec                    | j                   j                  |      }|r|j                  |      }|r|S |j                         }|j                  j	                  d|      j                  d|j                  j                  |            }|D ]#  }t        |j                  ||fd      }||v s"|}	% |j                  |   g	_	        || j                  |	<   |	| j                   j                  |i       |<   |	S )Nr.   Fmesh_dim_names_init_backend)r)   getr   meshswapdimsreshapesizer   device_type_dim_group_infosr(   
setdefault)
r+   r2   r3   r4   child_mesh_mappingssub_meshcur_rankpg_ranks_by_dimmesh_1dres_sub_meshs
             r   create_child_meshz_MeshEnv.create_child_meshF   s    #'">">"B"B;"O".22=A#O #++-H)..77HEMMK$$))(3O + ,%++$1#3"'	 w&#+L, .9-I-I(-S,TL)9DD((6  ((33KD  r   c                 :    | j                   j                  |d       S r   )r(   r9   )r+   r2   s     r   get_parent_meshz_MeshEnv.get_parent_meshi   s    //33KFFr   c                     | j                  |      }|j                  }|r.|r,t        |      dk(  sJ d       |d   }| j                  ||      S y)z
            Return the index of the mesh dim in the parent mesh.
            The device_mesh passed in needs to be sliced out from a parent mesh.
               z%The child mesh can only be a 1D mesh.r   N)rI   r7   r/   get_mesh_dim_by_name)r+   r2   parent_meshchild_mesh_dim_nameschild_mesh_dim_names        r   get_parent_mesh_dimz_MeshEnv.get_parent_mesh_diml   sf    
 ..{;K#.#=#= 3,-2;:;2&:1&=#00>QRRr   r>   c                 4    t        |       j                         S r   )_get_device_handledevice_countr>   s    r   num_devices_per_hostz_MeshEnv.num_devices_per_host{   s    %k2??AAr   c                 B    t               t        j                  |       z  S r   )r   r$   rU   rT   s    r   	num_hostsz_MeshEnv.num_hosts   s     "#x'D'D['QQQr   c                     |j                   t        |j                         dk(  rt        d      ||j                   vrt        d| dd|j                          t        |j                   j	                  |            S )Nr   zNo `mesh_dim_names` found.zMesh dimension 'z' does not exist.z.Available mesh dimensions are: mesh_dim_names=)r7   r/   KeyErrorr   index)r+   r2   r4   s      r   rL   z_MeshEnv.get_mesh_dim_by_name   s     **2{112a70  K$>$>>&}o5FGD[E_E_D`a  K66<<]KLLr   r%   Nr%   r   )r   r   r   r,   r1   intstrrG   r   rI   rP   staticmethodrU   rW   rL   r   r   r   r$   r$   ;   s    	W
	'
!	 +!	 7:!	 KN!	 !	 F	G| 	G@V 	G	< 	HSM 	 
	Bc 	Bc 	B 
	B 
	R3 	R3 	R 
	R
	M+	M<?	M	Mr   r$   _mesh_resourcesr>   c                 $    t        t        | d      S )a:  
        Get the module corresponding to the device_type which is cuda or cuda-like device.
        For example, when the device_type is cuda, the module `torch.cuda` is returned.
        Return None when there is no corresponding module for device_type, otherwise
        return the corresponding module.
        N)getattrtorchrT   s    r   rR   rR      s     uk400r   c                      e Zd ZU dZeed<   ej                  ed<   ee	edf      ed<   ddddede
ej                  d	f   dee	edf      d
eddf
dZd Zd ZddZd dZdefdZd ZdedefdZdedd fdZ	 d!dee
eef      de
eee   f   fdZd!dee   defdZedefd       Zede	edf   fd       ZdefdZd!dee
eef      defdZdeee      fdZ y)"r   a  
        DeviceMesh represents a mesh of devices, where layout of devices could be
        represented as a n-d dimension array, and each value of the n-d dimensional
        array is the global id of the default process group ranks.

        DeviceMesh could be used to describe the layout of devices across the cluster,
        and serves as a proxy for communication among the device lists within the cluster.

        DeviceMesh can be used as a context manager.

        .. note::
            DeviceMesh follows SPMD programming model, which means the same PyTorch Python program
            is running on all processes/ranks in the cluster. Therefore, users need to make sure the
            `mesh` array (which describes the layout of devices) should be identical across all ranks.
            Inconsistent `mesh` will lead to silent hang.

        Args:
            device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
            mesh (ndarray): A multi-dimensional array or an integer tensor describing the layout
                of devices, where the IDs are global IDs of the default process group.

        Returns:
            DeviceMesh: A :class:`DeviceMesh` object representing the device layout.

        The following program runs on each process/rank in an SPMD manner. In this example, we have 2
        hosts with 4 GPUs each.
        A reduction over the first dimension of mesh will reduce across
        columns (0, 4), .. and (3, 7), a reduction over the second dimension
        of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7).

        Example::
            >>> # xdoctest: +SKIP("no rank")
            >>> from torch.distributed.device_mesh import DeviceMesh
            >>>
            >>> # Initialize device mesh as (2, 4) to represent the topology
            >>> # of cross-host(dim 0), and within-host (dim 1).
            >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
        r>   r:   .r7   NTr6   r"   r8   r%   c                f   || _         t        |t        j                        r'|j                  j
                  dk7  rt        d|       t        |t        j                        r|j                         j                         n$t        j                  |t        j                        | _        || _        t        | j                  j                         j                               | _        t#        | j                   | j                  j$                  t'        |       f      | _        |dk7  r|r | j+                          | j-                          | j                  t/               k(  j1                         }|j3                  d      dv sJ |j3                  d      dkD  r|d   j                         nd | _        y y )Ncpuz!`mesh` must be a CPU tensor, got )dtypexlar   )r   rK   )r>   
isinstancerc   Tensordevicetype
ValueErrordetachrf   tensorr]   r:   r7   tupleflattentolist_flatten_mesh_listhashshapeid_hash_get_or_create_default_group_init_process_groupsr   nonzeror=   _coordinate_on_dim)r+   r>   r:   r7   r8   rank_coordss         r   r,   zDeviceMesh.__init__   s[     +D$-$++2B2Be2K #DTF!KLL dELL1 !!#\\$eii8 I
 #1D ',DII,=,=,?,F,F,H&ID#t66		DRSDJ e# !557--/  $yyHJ6??A"''*f444/:/?/?/BQ/FKN))+D ' $r   c           	         t               }|s
t                t               }| j                  j	                         |kD  r't        d| j                  j	                          d      t        | j                        }|sZ|rX|j                         }||kD  r'||z  dk7  rt        d| d| d| j                   d      |j                  t               |z         t               S )Nz=Mesh should not be bigger than default world size, but found z ranks!r   z8DeviceMesh only support homogeneous hardware, but found z ranks and  z	 devices!)r   r   r   r:   numelr0   rR   r>   rS   
set_devicer   r   )r+   default_initialized
world_sizedevice_handlerU   s        r   rx   z'DeviceMesh._get_or_create_default_group   s    "0"2&"$')Jyy :-"STXT]T]TcTcTeSffmn  /t/?/?@M&= (5'A'A'C$!55"%99Q>&R%,k2F1GqIYIYHZZce  ((6J)JK%''r   c           	      6   g }| j                   j                  dk(  r~| j                   j                         t               k(  rY|j	                  t        t                     t        t        t                           t               j                  f       || _        y t        | j                   j                        D ]  }| j                   j                  d|      j                  d| j                   j                  |            }|D ]  }|j                         }t        |      }| j                         |v s2t!        |      |kD  rt#        d| j                   d| d      |j	                  t        t%        |            ||j                  f         || _        y )NrK   r.   )rankszFEach device mesh dimension should get only one process group, but got z in !)r:   ndimr   r   appendr   r   listrange
group_namer;   r<   r=   rr   r    r   r/   r0   r   r?   )r+   dim_group_infosdimrD   dim_meshsubgroup_ranks	dim_groups          r   ry   zDeviceMesh._init_process_groups  sx    ACOyy~~"tyy'8N<L'L  &&&'9';<U>#345*,77N %4D!= !0 C '+ii&8&8S&A&I&IDIINN3/'O
 %4 )1):
 %.N$C	  ==?n<"?3c9&2&lmqmzmzl{ |**8)9%<'" !" ,22$28I3F$G$2$-$8$8!"< %4D!r   c                 D    t         j                  j                  |        | S r   )r`   r'   r   r*   s    r   	__enter__zDeviceMesh.__enter__G  s    &&--d3Kr   c                 @    t         j                  j                          y r   )r`   r'   pop)r+   exc_type	exc_valueexc_tracebacks       r   __exit__zDeviceMesh.__exit__M  s    &&**,r   c                     | j                   s d| j                  j                          d}|S d| j                  j                          d| j                    d}|S )NzDeviceMesh()z, mesh_dim_names=)r7   r:   rr   )r+   device_mesh_reprs     r   __repr__zDeviceMesh.__repr__Q  sn     ** dii..013 
 $# #499#3#3#5"66GH[H[G\\]^ 
 $#r   c                     | j                   S r   )rw   r*   s    r   __hash__zDeviceMesh.__hash__Y  s    ::r   otherc                    t        |t              syt        | j                        t        |j                        k(  ry| j                  j                  |j                  j                  k(  xr | j
                  |j
                  k(  S )NFT)ri   r   rv   r:   ru   rs   )r+   r   s     r   __eq__zDeviceMesh.__eq__\  sc    eZ0$))}5::.		5::#3#33 H++u/G/GGr   r4   c                     | j                   j                  dk(  r/| j                  r|| j                  d   k(  r| S t        d| d      t        j                  | |      }t        j                  | ||      }|S )a  
            Slice the current DeviceMesh based on the mesh_dim_name given to create a child
            DeviceMesh.

            Args:
                mesh_dim_name (str): the name of the mesh dimension of the parent DeviceMesh
                to create a child DeviceMesh for.
            Returns:
                A :class:`DeviceMesh` object

            The following program runs on each process/rank in an SPMD manner. In this example, we have 2
            hosts with 4 GPUs each.
            Calling mesh["tp"] on rank 0, 1, 2, 3 would return a 1D child DeviceMesh:([0, 1, 2, 3]).
            Calling mesh["tp"] on rank 4, 5, 6, 7 would return a 1D child DeviceMesh:([4, 5, 6, 7]).
            Calling mesh["dp"] on rank 0, 4 would return a 1D child DeviceMesh:([0, 4]).
            Calling mesh["dp"] on rank 1, 5 would return a 1D child DeviceMesh:([1, 5]).
            Calling mesh["dp"] on rank 2, 6 would return a 1D child DeviceMesh:([2, 6]).
            Calling mesh["dp"] on rank 3, 7 would return a 1D child DeviceMesh:([3, 7]).

            Example::
                >>> # xdoctest: +SKIP("no rank")
                >>> from torch.distributed.device_mesh import DeviceMesh
                >>>
                >>> # Initialize device mesh as (2, 4) to represent the topology
                >>> # of cross-host(dim 0), and within-host (dim 1).
                >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
            rK   r   zInvalid mesh_dim_name z specified.)r:   r   r7   r0   r`   rL   rG   )r+   r4   r3   submeshs       r   __getitem__zDeviceMesh.__getitem__f  s{    8 yy~~"&&=D<O<OPQ<R+RK&0{K  ';;D-PH%77hVGNr   r3   c           
         t        | d      st        d      | j                  j                  dk(  r!t	        t        | j                  d   dd        S |Gt        |t              rt        j                  | |      }t	        t        | j                  |   dd        S g }t        | j                  j                        D ]2  }|j                  t	        t        | j                  |   dd               4 |S )a  
            Returns a list of ProcessGroups corresponding to the mesh dimensions, or
            returns a single ProcessGroup if mesh_dim is specified or the given mesh has
            only one mesh dimension.

            Args:
                mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
                of the mesh dimension. Default is None.

            Returns:
                A list of :class:`ProcessGroup` object when `mesh_dim` is not specified for
                a DeviceMesh with more than 1 dimension; otherwise, returns a single
                :class:`ProcessGroup` object.
            r?   z*DeviceMesh process groups not initialized!rK   r   Nr
   )hasattrr0   r:   r   r   r   r?   ri   r^   r`   rL   r   r   )r+   r3   
dim_groupsith_dims       r   	get_groupzDeviceMesh.get_group  s    " 4!34"#OPPyy~~"-t/D/DQ/G/KL  #h,.CCD(SH-t/D/DX/NrPQ/RS   
$TYY^^4 G%% 5!%!6!6w!?!C "!r   c                 p    || j                   j                         S | j                   j                  |      S r   )r:   r   r=   )r+   r3   s     r   r=   zDeviceMesh.size  s*    (0(8499??$VdiinnX>VVr   c                 .    | j                   j                  S r   )r:   r   r*   s    r   r   zDeviceMesh.ndim  s    99>>!r   c                 @    t        | j                  j                        S r   )rp   r:   ru   r*   s    r   ru   zDeviceMesh.shape  s    ))r   c                     t               S )z:
            Returns the current global rank.
            )r   r*   s    r   r   zDeviceMesh.get_rank  s     :r   c                     | j                   dkD  r&|$t        d| j                  j                    dd      |d}t        | j	                  |            }t        |t              sJ d       t        t        |            S )a{  
            Returns the local rank of the given mesh_dim of the DeviceMesh.

            Args:
                mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
                of the mesh dimension. Default is None.

            Returns:
                An integer denotes the local rank.

            The following program runs on each process/rank in an SPMD manner. In this example, we have 2
            hosts with 4 GPUs each.
            Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 0, 1, 2, 3 would return 0.
            Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 4, 5, 6, 7 would return 1.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 0, 4 would return 0.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 1, 5 would return 1.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 2, 6 would return 2.
            Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 3, 7 would return 3.

            Example::
                >>> # xdoctest: +SKIP("no rank")
                >>> from torch.distributed.device_mesh import DeviceMesh
                >>>
                >>> # Initialize device mesh as (2, 4) to represent the topology
                >>> # of cross-host(dim 0), and within-host (dim 1).
                >>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
            rK   zFound the DeviceMesh have z dimensionszJOptional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.r   z1We expect ProcessGroup before calling `get_rank`!)r   r0   r:   r   r   ri   r!   r   )r+   r3   mesh_dim_groups      r   get_local_rankzDeviceMesh.get_local_rank  s    8 yy1}!1"00@L`  !%dnnX&>?N CBC  H^455r   c                 6    | j                   r| j                   S dS )z
            Return the relative indices of this rank relative to all
            dimensions of the mesh. If this rank is not part of the mesh, return None.
            N)r{   r*   s    r   get_coordinatezDeviceMesh.get_coordinate  s    
 /3.E.E4**O4Or   r\   r[   r   )!r   r   r   __doc__r^   __annotations__rc   rj   r   r   r   boolr,   rx   ry   r   r   r   r   objectr   r   r]   r!   r   r   r=   propertyr   ru   r   r   r   r   r   r   r   r      s   %	N ll sCx11 9="&%	%	 k12%	
 %U38_5%	  %	 %	N	(:4	4l		-	$c 	$		 	4 	&	S &	\ &	R 9=)	"$U38_5)	"<l!334)	"V	W# 	W# 	W 
	"# 	" 
	" 
	*5c? 	* 
	*	c 	(	68E#s(O+D (	6PS (	6T	PHT#Y$7 	Pr   )r7   
mesh_shape.r7   r%   c          	      j   |kt        t        |            t        |      k7  rt        dd|       t        |      t        |      k7  r%t        ddt        |       dt        |       d      t        j                  t        j                  |            j                  |      }t        | ||      }|S )aG  
        Initializes a `DeviceMesh` based on `device_type`, `mesh_shape`, and `mesh_dim_names` parameters.

        This creates a DeviceMesh with an n-dimensional array layout, where `n` is the length of `mesh_shape`.
        If `mesh_dim_names` is provided, each dimension is labeled as `mesh_dim_names[i]`.

        .. note::
            `init_device_mesh` follows SPMD programming model, meaning the same PyTorch Python program
            runs on all processes/ranks in the cluster. Ensure `mesh_shape` (the dimensions of the nD array
            describing device layout) is identical across all ranks. Inconsistent `mesh_shape` may lead to hanging.

        .. note::
            If no process group is found, init_device_mesh will initialize distributed process group/groups
            required for distributed communications behind the scene.

        Args:
            device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
            mesh_shape (Tuple[int]): A tuple defining the dimensions of the multi-dimensional array
                describing the layout of devices.
            mesh_dim_names (Tuple[str], optional): A tuple of mesh dimension names to assign to each dimension
                of the multi-dimensional array describing the layout of devices. Its length must match the length
                of `mesh_shape`. Each string in `mesh_dim_names` must be unique.

        Returns:
            DeviceMesh: A :class:`DeviceMesh` object representing the device layout.

        Example::
            >>> # xdoctest: +SKIP("no rank")
            >>> from torch.distributed.device_mesh import init_device_mesh
            >>>
            >>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,))
            >>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp"))

        z"Each mesh_dim_name must be unique.z/Found repeated mesh_dim_name in mesh_dim_names z6mesh_shape and mesh_dim_names should have same length!zFound len(mesh_dim_names): z and len(mesh_shape):.)r>   r:   r7   )	r/   setr0   rc   arangemathprodviewr   )r>   r   r7   r:   r2   s        r   r   r     s    P %3~&'3~+>>"8EnEUV 
 :#n"55"L1#n2E1FF[\_`j\k[llmn 
 ||DIIj1277
C #)
 r   )cuda),loggingr   typingr   r   r   r   r   r   rc   torch.distributedr	   utils._typing_utilsr   __all__sysr   r   modulesr   r   "torch.distributed.distributed_c10dr   r   r   r   r   r   r   r    r!   	getLoggerr   loggernumpy.typingr"   ImportErrorwarningr$   r`   r   r^   rR   r]   r   r   r   <module>r      s7     D D  * *|
, ~  ?NCKK/0; 0 KK'
 
 
 Wx(F 	.YM YMv !)
OX*1 1XP XP|
 59	<<#s(O< !sCx1	<
 
<K  	NNU	s   C$ $C=<C=