
    7|hc                       d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ  ej@                  e!      Z"dZ#dZ$dZ%dZ&g dZ'g dZ(dgZ)g dZ*e'e(e)e*dZ+ G d de,e      Z- G d de      Z. G d de      Z/ G d de      Z0 G d de      Z1 G d  d!e      Z2d-d"Z3d.d#Z4d/d$Z5d0d%Z6d1d&Z7	 d2	 	 	 	 	 d3d'Z8d4d(Z9d5d)Z:d6d*Z; G d+ d,e      Z<y)7    )annotationsN)Enum)
HTTPStatus)AnyDictListOptionalTuple)Document)get_runtime_environment)get_from_dict_or_env)	BaseModel)Responserequest)RequestException)
BaseLoaderz0.1.1zhttp://localhost:8000zhttps://api.daxa.ai  )
JSONLoaderS3FileLoaderUnstructuredMarkdownLoaderUnstructuredPDFLoaderUnstructuredFileLoaderUnstructuredJsonLoaderPyPDFLoaderGCSFileLoaderAmazonTextractPDFLoader	CSVLoaderUnstructuredExcelLoaderUnstructuredEmailLoader)DirectoryLoaderS3DirLoaderSlackDirectoryLoaderPyPDFDirectoryLoaderNotionDirectoryLoaderDataFrameLoader)NotionDBLoaderGoogleDriveLoaderSharePointLoader)filedir	in-memoryzcloud-folderc                      e Zd ZdZdZdZy)Routesz2Routes available for the Pebblo API as enumerator.z/v1/loader/docz/v1/app/discoverN)__name__
__module____qualname____doc__
loader_docloader_app_discover     c/var/www/html/test/engine/venv/lib/python3.12/site-packages/langchain_community/utilities/pebblo.pyr-   r-   C   s    <!J,r5   r-   c                      e Zd ZU dZded<   y)IndexedDocumentzPebblo Indexed Document.strpb_idNr.   r/   r0   r1   __annotations__r4   r5   r6   r8   r8   J   s    "J$r5   r8   c                      e Zd ZU dZdZded<   	 ded<   	 ded<   	 dZded	<   	 ded
<   	 ded<   	 ded<   	 ded<   	 ded<   	 dZded<   y)RuntimezPebblo Runtime.localr9   typehostpath Optional[str]ipplatformos
os_versionlanguagelanguage_versionruntimeN)r.   r/   r0   r1   r@   r<   rE   rK   r4   r5   r6   r>   r>   Q   se    D#,
I#
I)B4M*GOM(GS4r5   r>   c                  (    e Zd ZU dZded<   	 ded<   y)	FrameworkzPebblo Framework instance.r9   nameversionNr;   r4   r5   r6   rM   rM   j   s    $
I L#r5   rM   c                  p    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   	 ded	<   	 d
ed<   	 ded<   	 d
ed<   y)AppzPebblo AI application.r9   rN   ownerrD   descriptionload_idr>   rK   rM   	frameworkplugin_versionclient_versionNr;   r4   r5   r6   rQ   rQ   s   sJ     
IJ!L-%'**r5   rQ   c                      e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   	 ded<   	 d	ed
<   	 ded<   	 ded<   	 ded<   	 ded<   y)DoczPebblo document.r9   rN   rR   listdocsrV   rT   dictloader_detailsboolloading_endsource_ownerclassifier_locationanonymize_snippetsNr;   r4   r5   r6   rY   rY      s[    
I0J
J.L-+7,%Ur5   rY   c                    | rd| v sd| d   k(  s| dv r| S t        j                  |       }|j                         r|j                         }t	        |      S )zReturn an absolute local path for a local file/directory,
    for a network related path, return as is.

    Args:
        path (str): Relative path to be resolved.

    Returns:
        str: Resolved absolute path.
    z:///r   )unknown-r+   )pathlibPathexistsresolver9   )rB   	full_paths     r6   get_full_pathrl      sY     TM47N11T"I%%'	y>r5   c                L    t         j                         D ]  \  }}| |v s|c S  y)zReturn loader type among, file, dir or in-memory.

    Args:
        loader (str): Name of the loader, whose type is to be resolved.

    Returns:
        str: One of the loader type among, file/dir/in-memory.
    unsupported)LOADER_TYPE_MAPPINGitems)loaderloader_typeloaderss      r6   get_loader_typert      s4     !4 9 9 ; WW r5   c                V   ddl m}m}m}m} d}t        | t              st        j                  d       |S | j                  }	 d|v rUt        | |      rd| j                   d| j                   }nt        | |      rd| j                   d| j                   }nbd	|v r|d	   }|rVd
|v rQ|d
   }|rI| d| }n@d|v r|d   }n5d|v r|d   }n*d|v r-|d   }|rt        |t              rt        |      dkD  r|d   }nt        | |      rd}nt        | |      rd| j                   }n| j                   j"                  dk(  r|j%                  d      r|j%                  d      }	d|	 }n|j%                  d      r6|j%                  dg       }
dj'                  |
D cg c]  }d| d
 c}      }nF|j%                  d      r5|j%                  dg       }dj'                  |D cg c]  }d| d
 c}      }t+        t-        |            S c c}w c c}w # t(        $ r Y )w xY w)zReturn an absolute source path of source of loader based on the
    keys present in Document.

    Args:
        loader (BaseLoader): Langchain document loader, derived from Baseloader.
    r   )r%   r   r&   r   rf   zGloader is not derived from BaseLoader, source location will be unknown!bucketzgc://rd   zs3://sourcechannelrB   	file_path	web_pathsr+   znotiondb://r'   	folder_idz+https://drive.google.com/drive/u/2/folders/file_idsz, z https://drive.google.com/file/d/z/viewdocument_idsz#https://docs.google.com/document/d/z/edit)$langchain_community.document_loadersr%   r   r&   r   
isinstancer   loggererror__dict__rv   blobkeyrZ   lendatabase_id	__class__r.   getjoin	Exceptionrl   r9   )rq   r%   r   r&   r   locationloader_dictrx   rz   r{   r|   file_idr}   doc_ids                 r6   get_loader_full_pathr      sN     Hfj)U	
 //K/{"&-0"6==/6;;-@FL1"6==/6::,?$"8,HI4%i0"*1WI6H{""6*HK'";/HK'#K0IZ	48S^a=O$Q<0"H/$V%7%7$89H&&*=={+'OOK8	HT,&??:r:99 (0# ;7)5I 0*~rB99 '3" >fXUK X''!  s1   E H #H0:H *H7H 
H 	H('H(c                    t               } t        d| j                  dd            }t        j                         }t        |j                  t        j                  d   | j                  dd      |j                  |j                  t               | j                  dd      | j                  d	d      
      }d|j                  v rd|_        d|_        t        j                  d|        t        j                  d|        ||fS )zFetch the current Framework and Runtime details.

    Returns:
        Tuple[Framework, Runtime]: Framework and Runtime for the current app instance.
    	langchainlibrary_versionN)rN   rO   PWDrF   re   rK   runtime_version)rA   rB   rF   rG   rH   rE   rI   rJ   DarwindesktopzMac OSXz
framework zruntime )r   rM   r   rF   unamer>   noderG   environsystemrO   get_ipr@   rK   r   debug)runtime_envrU   r   rK   s       r6   get_runtimer     s     *+K+//2CT"JI NNEZZZZY7<<==8I6$):IF	G 7:: #
LL:i[)*
LL8G9%&gr5   c                     ddl } | j                         }	 | j                  |      }|S # t        $ r | j                  d      }Y |S w xY w)zJFetch local runtime ip address.

    Returns:
        str: IP address
    r   N	localhost)socketgethostnamegethostbynamer   )r   rA   	public_ips      r6   r   r   .  sY     D6((.	   6((5	6s   ) AAc                $   g }g }d}| D ]q  }t        |j                  j                  d            }||kD  r|j                  |g       ?||z   |kD  r|j                  |       g }d}|j                  |       ||z  }s |r|j                  |       |S )a  
    Generate batches of documents based on page_content size.
    Args:
        docs: List of documents to be batched.
        max_batch_size: Maximum size of each batch in bytes. Defaults to 100*1024(100KB)
    Returns:
        List[List[Document]]: List of batches of documents
    r   utf-8)r   page_contentencodeappend)r[   max_batch_sizebatchescurrent_batchcurrent_batch_sizedocdoc_sizes          r6   generate_size_based_batchesr   >  s     %'G$&M +C,,33G<=n$NNC5!!H,~=}- "%&"   %(*!+& }%Nr5   c                    	 ddl }t        j                  |       j                  }|j	                  |      j
                  }|S # t        $ r d}Y |S w xY w)zFetch owner of local file path.

    Args:
        file_path (str): Local file path.

    Returns:
        str: Name of owner.
    r   Nre   )pwdrG   statst_uidgetpwuidpw_namer   )ry   r   file_owner_uidfile_owner_names       r6   get_file_owner_from_pathr   f  sV    $+22,,~6>>   $#$s   >A AAc                   | syd}t         j                  j                  |       r!t         j                  j                  |       }|S t         j                  j	                  |       rd}t        j
                  |       D ]o  \  }}}|D ]d  }t         j                  j                  ||      }t         j                  j                  |      rC|t         j                  j                  |      z  }f q |}|S )zFetch size of source path. Source can be a directory or a file.

    Args:
        source_path (str): Local path of data source.

    Returns:
        int: Source size in bytes.
    r   )rG   rB   isfilegetsizeisdirwalkr   islink)source_pathsize
total_sizedirpath_	filenamesffps           r6   get_source_sizer   y  s     D	ww~~k"ww{+ K 
{	#
%'WW[%9 	6!GQ	 6WW\\'1-ww~~b)"''//""55J6	6
 Kr5   c                >    | j                  d      }t        |      }|S )zCalculate the content size in bytes:
    - Encode the string to bytes using a specific encoding (e.g., UTF-8)
    - Get the length of the encoded bytes.

    Args:
        data (str): Data string.

    Returns:
        int: Size of string in bytes.
    r   )r   r   )dataencoded_contentr   s      r6   calculate_content_sizer     s!     kk'*ODKr5   c                  (    e Zd ZU dZded<   	 dZded<   	 ded<   	 ded<   	 d	Zd
ed<   	 d fdZddZ	 d	 	 	 	 	 	 	 	 	 ddZ	ddZ
dddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZe	 	 d	 	 	 	 	 	 	 	 	 	 	 dd       Ze	 	 	 	 	 	 	 	 dd       Zedd       Z xZS ) PebbloLoaderAPIWrapperzWrapper for Pebblo Loader API.rD   api_keyr?   r9   ra   classifier_url	cloud_urlFr^   rb   c                    t        |ddd      |d<   t        |ddt              |d<   t        |ddt              |d<   t        |   d	i | y)
z%Validate that api key in environment.r   PEBBLO_API_KEYrC   r   PEBBLO_CLASSIFIER_URLr   PEBBLO_CLOUD_URLNr4   )r   _DEFAULT_CLASSIFIER_URL_DEFAULT_PEBBLO_CLOUD_URLsuper__init__)selfkwargsr   s     r6   r   zPebbloLoaderAPIWrapper.__init__  sh    0I/
y $8$&=?V$
  3K!35N
{ 	"6"r5   c                L   d}|j                  d      }| j                  dk(  rL| j                         }| j                   t        j
                  j                   }| j                  d|||      }| j                  r| j                  d      }|rAt        j                  |j                        j                  d      }|j                  d|i       |j                  dt        i       | j                   t        j
                  j                   }| j                  d|||      }yy)	z
        Send app discovery request to Pebblo server & cloud.

        Args:
            app (App): App instance to be discovered.
        NTexclude_unsetr?   POSTcloud_requestpebblo_server_versionpebblo_client_version)r\   ra   _make_headersr   r-   r3   valuemake_requestr   jsonloadstextr   updatePLUGIN_VERSIONr   )	r   apppebblo_resppayloadheadersapp_discover_urlr   pebblo_cloud_urlr   s	            r6   send_loader_discoverz+PebbloLoaderAPIWrapper.send_loader_discover  s    (((.##w.((*G&&'(B(B(H(H'IJ  ++F4DgwWK<<((t(<G(,

;3C3C(D(H(H+)%  79NOPNN3^DE"&..!1&2L2L2R2R1ST!!&*:GWMA r5   c                N   |j                  dd      }t        |      }| j                  |||      \  }}| j                  ||||||      }	i }
| j                  dk(  r| j                         }| j                   t        j                  j                   }	 | j                  d|||	d      }|rJt        j                  |j                        j                  dg       D ]  }|
j                  |d   |i        | j$                  rI| j                  dk(  r| j'                  |	d   |
       |	j)                  d
d	       | j+                  |	       |
S | j                  dk(  r t         j#                  d       t-        d      |
S # t        $ r }t         j#                  d|       Y d	}~d	}~ww xY w)a  
        Send documents to Pebblo server for classification.
        Then send classified documents to Daxa cloud(If api_key is present).

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            app (App): App instance.
            loader_details (dict): Loader details.
            loading_end (bool): Boolean, indicating the halt of data loading by loader.
        r   rC   r?   r   i,  r[   r:   z3An Exception caught in classify_documents: local %sNrb   zpebblo-cloudz4API key is missing for sending docs to Pebblo cloud.)r   r   prepare_docs_for_classificationbuild_classification_payloadra   r   r   r-   r2   r   r   r   r   r   r   r   r   warningr   update_doc_datapopsend_docs_to_pebblo_cloud	NameError)r   docs_with_idr   r]   r_   r   r`   r[   source_aggregate_sizer   classified_docsr   load_doc_urlr   classified_doces                   r6   classify_documentsz)PebbloLoaderAPIWrapper.classify_documents  s   " %((;/<&*&J&J+~'
## 33~|5JK
 ##w.((*G"11263D3D3J3J2KLLY"//L'7C *.**[5E5E*F*J*J6SU*V '..+G4nE <<''72 $$WV_oFKK,d3**73
 	 %%7NNQRRSS!  YTVWXXYs   A!E; ;	F$FF$c                    | j                  d      }| j                   t        j                  j                   }	 | j                  d|||      }y# t        $ r }t        j                  d|       Y d}~yd}~ww xY w)z
        Send documents to Pebblo cloud.

        Args:
            payload (dict): The payload containing documents to be sent.
        Tr   r   z3An Exception caught in classify_documents: cloud %sN)	r   r   r-   r2   r   r   r   r   r   )r   r   r   r   r   r   s         r6   r   z0PebbloLoaderAPIWrapper.send_docs_to_pebblo_cloud  sz     $$4$8"nn-f.?.?.E.E-FG	U!!&*:GWMA 	UNNPRSTT	Us   A 	A:A55A:c                    ddd}|r@| j                   r|j                  d| j                   i       |S t        j                  d       |S )z
        Generate headers for the request.

        args:
            cloud_request (bool): flag indicating whether the request is for Pebblo
            cloud.
        returns:
            dict: Headers for the request.

        zapplication/json)AcceptzContent-Typez	x-api-keyz,API key is missing for Pebblo cloud request.)r   r   r   r   )r   r   r   s      r6   r   z$PebbloLoaderAPIWrapper._make_headers(  sM     ).
 ||T\\:;  MNr5   c                    |j                   |j                  |t        |j                  |d|| j                  | j
                  d
}|du rd|d<   d|v r||d   d<   t        d	i |j                  d      }|S )
a  
        Build the payload for document classification.

        Args:
            app (App): App instance.
            docs (List[dict]): List of documents to be classified.
            loader_details (dict): Loader details.
            source_owner (str): Owner of the source.
            source_aggregate_size (int): Aggregate size of the source.
            loading_end (bool): Boolean indicating the halt of data loading by loader.

        Returns:
            dict: Payload for document classification.
        false)
rN   rR   r[   rV   rT   r]   r_   r`   ra   rb   Ttruer_   r]   r   r   r4   )rN   rR   r   rT   ra   rb   rY   r\   )r   r   r[   r]   r`   r   r_   r   s           r6   r   z3PebbloLoaderAPIWrapper.build_classification_payload?  s    0 HHYY,{{,"(#'#;#;"&"9"9#
 $%+GM"7*) ()*AB ..%%D%9r5   c           
     >   	 t        | ||||      }t        j                  d| |j                   j                  t	        t        |j                   j                  r|j                   j                  ng             t	        |j                               |j                  t        j                  k\  r$t        j                  d|j                          |S |j                  t        j                  k\  r$t        j                  d|j                          |S |j                  t        j                  k7  r"t        j                  d|j                          |S # t        $ r t        j                  d|       Y yt        $ r }t        j                  d|       Y d}~yd}~ww xY w)	a  
        Make a request to the Pebblo API

        Args:
            method (str): HTTP method (GET, POST, PUT, DELETE, etc.).
            url (str): URL for the request.
            headers (dict): Headers for the request.
            payload (Optional[dict]): Payload for the request (for POST, PUT, etc.).
            timeout (int): Timeout for the request in seconds.

        Returns:
            Optional[Response]: Response object if the request is successful.
        )methodurlr   r   timeoutz5Request: method %s, url %s, len %s response status %szPebblo Server: Error z$Pebblo received an invalid payload: z-Pebblo returned an unexpected response code: zUnable to reach server %sz'An Exception caught in make_request: %sN)r   r   r   r
  r9   r   bodystatus_coder   INTERNAL_SERVER_ERRORr   BAD_REQUESTr   OKr   r   )r	  r
  r   r   r  responser   s          r6   r   z#PebbloLoaderAPIWrapper.make_requestk  sd   *	I3gwH LLG  $$C1A1A1F1F((--BOPH(() ##z'G'GG!6x7K7K6LMN O %%)?)??!Ehmm_UV O %%6C++,.
 O 	=NN6<   	INNDaHH	Is,   CE A E A E F4F<FFc           
        g }d}| D cg c]  }|j                          }}d}|D ]Y  }|j                  di       }|j                  dg       }	|d   dk(  rt        |j                  d|d               }
n+t        |j                  d	|j                  d|                  }
|j                  d
t        |
            }|j                  dt	        |
            }t        |j                  d            }t        |      }||z  }|j                  dd      xs d}|j                  ||
||j                  di       j                  d      |d|	rd|	ini |d|ini        |d   dk(  s@|rD|j                  d      |d<   d}\ ||fS c c}w )a  
        Prepare documents for classification.

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            source_path (str): Source path of the documents.
            loader_details (dict): Contains loader info.

        Returns:
            Tuple[List[dict], int]: Documents and the aggregate size
            of the source.
        r   Fmetadataauthorized_identitiesrq   r(   rw   r   rk   rR   r   r   r:   Nlast_modified)r   r   r:   r  
file_ownersource_path_sizesource_full_urlT)r\   r   rl   r   r   r9   r   r   )r   r   r]   r[   r   r   doc_contentsource_path_updatedoc_metadatadoc_authorized_identitiesdoc_source_pathdoc_source_ownerdoc_source_sizer   page_content_sizer   s                   r6   r   z6PebbloLoaderAPIWrapper.prepare_docs_for_classification  s   $  !-9:csxxz::" .	*C77:r2L(4(8(89PRT(U%h'+=="/ $$X~m/LM# #0 $$#$((;?#  ,//1/B  +..v7WXOsww~67L 6| D!%66!WWWd+0qFKK'#2#%(WWZ%<%@%@%Q"2 5 12KL +6 ,_=( x(,>>*0<0@0@AR0S}-%)"].	*^ ***c ;s   Fc           
        | D ]|  }|j                  |d   i       }|j                  |j                  d      |j                  d      |j                  di       |j                  di       d       |j                  d       ~ y)	z
        Update the document data with classified information.

        Args:
            docs (List[dict]): List of document data to be updated.
            classified_docs (dict): The dictionary containing classified documents.
        r:   pb_checksumloader_source_pathentitiestopics)r"  r#  r$  r%  r   N)r   r   r   )r[   r   doc_dataclassified_datas       r6   r   z&PebbloLoaderAPIWrapper.update_doc_data  s      	 H-11(72CRHOOO#2#6#6}#E*9*=*=>R*S / 3 3J C-11(B?	 LL	 r5   )r   r   )r   rQ   returnNone)F)
r   List[IndexedDocument]r   rQ   r]   r\   r_   r^   r(  r\   )r   r\   r(  r)  )r   r^   r(  r\   )r   rQ   r[   
List[dict]r]   r\   r`   r9   r   intr_   r^   r(  r\   )N   )r	  r9   r
  r9   r   r\   r   zOptional[dict]r  r,  r(  zOptional[Response])r   r*  r   r9   r]   r\   r(  zTuple[List[dict], int])r[   r+  r   r\   r(  r)  )r.   r/   r0   r1   r<   ra   rb   r   r   r  r   r   r   staticmethodr   r   r   __classcell__)r   s   @r6   r   r     s   ("&&I!!&!$$U#NH "<+< < 	<
 < 
<|U.** * 	*
 *  #* * 
*X 
 #'/// /  	/
 / 
/ /b D++D+D+ D+ 
 	D+ D+L    r5   r   )rB   r9   r(  r9   )rq   r9   r(  r9   )rq   r   r(  r9   )r(  zTuple[Framework, Runtime])r(  r9   )r   )r[   zList[Document]r   r,  r(  zList[List[Document]])ry   r9   r(  r9   )r   r9   r(  r,  )r   r9   r(  r,  )=
__future__r   r   loggingrG   rg   rF   enumr   httpr   typingr   r   r   r	   r
   langchain_core.documentsr   langchain_core.envr   langchain_core.utilsr   pydanticr   requestsr   r   requests.exceptionsr   )langchain_community.document_loaders.baser   	getLoggerr.   r   r   r   r   BATCH_SIZE_BYTESfile_loader
dir_loader	in_memorycloud_folderro   r9   r-   r8   r>   rM   rQ   rY   rl   rt   r   r   r   r   r   r   r   r   r4   r5   r6   <module>rB     s=   "   	     3 3 - 6 5  & 0 @			8	$1 1  
 	  	 -S$ -%h %5i 52$	 $+) +*V) V2.E(P>" 1;%
%*-%%P&2 W Y W r5   