
    7|hQ,                         d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ  ej<                  e      Z  G d d	e      Z! G d
 de      Z"y)z:Pebblo's safe dataloader is a wrapper for document loaders    N)version)AnyDictIterableIteratorListOptional)Document)
BaseLoader)BATCH_SIZE_BYTESPLUGIN_VERSIONApp	FrameworkIndexedDocumentPebbloLoaderAPIWrappergenerate_size_based_batchesget_full_pathget_loader_full_pathget_loader_typeget_runtimeget_source_sizec                      e Zd ZU dZdZeed<   	 	 	 	 	 dddddeded	ed
ede	e   dede	e   dedefdZ
dee   fdZd dZdee   fdZed d       ZdefdZdee   fdZdedee   fdZdee   fdZdededefdZdeddfdZy)!PebbloSafeLoaderzkPebblo Safe Loader class is a wrapper around document loaders enabling the data
    to be scrutinized.
    F_discover_sentNlocal)classifier_locationanonymize_snippetslangchain_loadernameownerdescriptionapi_keyload_semanticclassifier_urlr   r   c                r   |rt        |t              st        d      || _        t        t	        j
                               | _        || _        t        j                  j                  d      xs || _        || _        || _        t        | j                        | _        g | _        g | _        t        t%        | j                              j'                  d      d   j'                  d      d   }
t)        |
      | _        t-        | j                        | _        t0        | _        |
| j                  | j*                  d| j.                  dkD  rdt        | j.                        ini | _        | j7                         | _        t;        ||||		      | _        | j<                  j?                  | j8                         y )
NzMust specify a valid name.PEBBLO_LOAD_SEMANTIC.'r   )loadersource_pathsource_typesource_path_size)r"   r   r$   r   ) 
isinstancestr	NameErrorapp_nameuuiduuid4load_idr*   osenvirongetr#   r    r!   r   r+   docsdocs_with_idtypesplitr   r,   r   r-   r   
batch_sizeloader_details_get_app_detailsappr   	pb_clientsend_loader_discover)selfr   r   r    r!   r"   r#   r$   r   r   loader_names              j/var/www/html/test/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/pebblo.py__init__zPebbloSafeLoader.__init__%   sn    :dC08994::<(&ZZ^^,BCT}
&/<$&	35$t{{+,2237;AA#FqI*;7 /0@0@ A*!++++	
 ((1, $S)>)>%?@	
 ((*/ 3)1	
 	++DHH5    returnc                 x    | j                   j                         | _        | j                          | j                  S )zxLoad Documents.

        Returns:
            list: Documents fetched from load method of the wrapped `loader`.
        )r*   loadr8   classify_in_batches)rB   s    rD   rI   zPebbloSafeLoader.loadV   s.     KK$$&	  "yyrF   c                    t        | j                  | j                        }g }t        |      }t	        |      D ]  \  }}||dz
  k(  }|| _        | j                         | _        | j                  j                  | j                  | j                  | j                  |      }| j                  |       | j                  r| j                  |      }n| j                         }|j                  |        || _        y)z
        Classify documents in batches.
        This is to avoid API timeouts when sending large number of documents.
        Batches are generated based on the page_content size.
           )loading_endN)r   r8   r<   len	enumerate_index_docsr9   r@   classify_documentsr?   r=   _add_pebblo_specific_metadatar#   _add_semantic_to_docs_unindex_docsextend)	rB   batchesprocessed_docstotal_batchesibatchis_last_batchclassified_docsbatch_processed_docss	            rD   rJ   z$PebbloSafeLoader.classify_in_batchesa   s     )DIIt)
 *,G!'* 	8HAu"#}q'8"8MDI $ 0 0 2D"nn??!!##)	 @ O ..?!!'+'A'A/'R$'+'9'9';$!!"67	8" #	rF   c              #     K   	 | j                   j                         }	 	 t        |      }t        |f      | _	        | j                         | _        | j                  j                  | j                  | j                  | j                         }| j#                  |       | j$                  r| j'                  |      | _	        n| j)                         | _	        | j                  d    # t        $ rI}| j                   j                  j                   d}t
        j                  |       t        |      |d}~ww xY w# t        $ r
 g | _	        Y yw xY ww)zLoad documents in lazy fashion.

        Raises:
            NotImplementedError: raised when lazy_load id not implemented
            within wrapped loader.

        Yields:
            list: Documents from loader's lazy loading.
        z does not implement lazy_load()Nr   )r*   	lazy_loadNotImplementedError	__class____name__loggererrornextStopIterationr8   listrP   r9   r@   rQ   r?   r=   rR   r#   rS   rT   )rB   doc_iteratorexcerr_strdocclassified_docs         rD   r_   zPebbloSafeLoader.lazy_load   s,    	8;;002L
 <( cVDI $ 0 0 2D!^^>>!!488T-@-@N ..~>!! 66~F	 ..0	))A,! 	 # 	8..7788WXGLL!%g.C7	8 ! 	sK   EC' ED< B<E'	D90AD44D99E<EEEEc                     d| _         y )NT)r   )clss    rD   set_discover_sentz"PebbloSafeLoader.set_discover_sent   s
    !rF   c                     t               \  }}t        | j                  | j                  | j                  | j
                  ||t        t        dt        d                  }|S )z\Fetch app details. Internal method.

        Returns:
            App: App details.
        langchain_community)r   r   )r   r    r!   r4   runtime	frameworkplugin_versionclient_version)	r   r   r1   r    r!   r4   r   r   r   )rB   rs   rr   r?   s       rD   r>   z!PebbloSafeLoader._get_app_details   s\     )]	7**((LL)$* 56
 
rF   c                     t        | j                        D cg c])  \  }}t        ddt        |      i|j	                         + }}}|S c c}}w )z
        Indexes the documents and returns a list of IndexedDocument objects.

        Returns:
            List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
        pb_id )rO   r8   r   r/   dict)rB   rY   rk   r9   s       rD   rP   zPebbloSafeLoader._index_docs   sR     $DII.
3 7#a&7CHHJ7
 
 	
s   .Ar\   c                 Z   | j                   D ci c].  }|j                  t        |j                  |j                        0 }}|j                         D ]-  }|j                  d      }||v s| j                  ||   |       / |j                         D cg c]  }| }}|S c c}w c c}w )aF  
        Adds semantic metadata to the given list of documents.

        Args:
            classified_docs (Dict): A dictionary of dictionaries containing the
                classified documents with pb_id as key.

        Returns:
            List[Document]: A list of Document objects with added semantic metadata.
        page_contentmetadatarw   )r9   rw   r
   r|   r}   valuesr7   _add_semantic_to_doc)rB   r\   rk   indexed_docsrl   doc_idsemantic_metadata_docss          rD   rS   z&PebbloSafeLoader._add_semantic_to_docs   s     ((
 IIxS-=-=UU
 

 .446 	PN#''0F%)),v*>O	P
 2>1D1D1F!G##!G!G%%
 "Hs   3B#	B(c                     t        | j                        D cg c]&  \  }}t        |j                  |j                        ( }}}|S c c}}w )z
        Converts a list of IndexedDocument objects to a list of Document objects.

        Returns:
            List[Document]: A list of Document objects.
        r{   )rO   r9   r
   r|   r}   )rB   rY   rk   r8   s       rD   rT   zPebbloSafeLoader._unindex_docs   sM     $D$5$56
3 #"2"2S\\J
 
 	
s   +A	rk   rl   c                     t        |j                  di       j                               |j                  d<   t        |j                  di       j                               |j                  d<   |S )a4  
        Adds semantic metadata to the given document in-place.

        Args:
            doc (Document): A Document object.
            classified_doc (dict): A dictionary containing the classified document.

        Returns:
            Document: The Document object with added semantic metadata.
        entitiespebblo_semantic_entitiestopicspebblo_semantic_topics)rg   r7   keysr}   )rB   rk   rl   s      rD   r   z%PebbloSafeLoader._add_semantic_to_doc   sg     48z2.3354
/0 26x,1132
-. 
rF   c           
         | j                   D ]  }|j                  }| j                  j                  j                  dk(  r)t        |j                  d| j                              |d<   n8t        |j                  d|j                  d| j                                    |d<   |j                  |j                  i       j                  dd      |d<    y)z*Add Pebblo specific metadata to documents.SharePointLoadersource	full_pathpb_checksumN)	r9   r}   r*   ra   rb   r   r7   r+   rw   )rB   r\   rk   doc_metadatas       rD   rR   z.PebbloSafeLoader._add_pebblo_specific_metadata  s    $$ 	C<<L{{$$--1CC,9 $$Xt/?/?@-[) -: $$#\%5%5h@P@P%Q-[)
 +:*=*=cii*L*P*Pt+L'	rF   ) r   NFN)rG   N)rb   
__module____qualname____doc__r   bool__annotations__r   r/   r	   rE   r   r
   rI   rJ   r   r_   classmethodro   r   r>   r   rP   r   rS   rT   ry   r   rR   rx   rF   rD   r   r      sD    !ND  !%#(,/6 $+#(/6$/6 /6 	/6
 /6 #/6 /6 !/6 !/6 !/6b	d8n 	#@ 8H-  D " "# ,T/2 &T &d8n &2tH~  $ 8 &T d rF   r   c                       e Zd ZdZddddddee   dee   deee      deeee	f      deeeee	f         d	dfd
Z
d	ee   fdZd	ee   fdZy)PebbloTextLoaderz
    Loader for text data.

    Since PebbloSafeLoader is a wrapper around document loaders, this loader is
    used to load text data directly into Documents.
    N)r   idsr}   	metadatastextsr   r   r}   r   rG   c                J    || _         || _        || _        || _        || _        y)a  
        Args:
            texts: Iterable of text data.
            source: Source of the text data.
                Optional. Defaults to None.
            ids: List of unique identifiers for each text.
                Optional. Defaults to None.
            metadata: Metadata for all texts.
                Optional. Defaults to None.
            metadatas: List of metadata for each text.
                Optional. Defaults to None.
        N)r   r   r   r}   r   )rB   r   r   r   r}   r   s         rD   rE   zPebbloTextLoader.__init__  s'    * 
 "rF   c              #     K   t        | j                        D ]  \  }}d}| j                  xs i }| j                  rE|t	        | j                        k  r-| j                  |   r|j                  | j                  |          | j                  r'|t	        | j                        k  r| j                  |   }t        |||        yw)zi
        Lazy load text data into Documents.

        Returns:
            Iterator of Documents
        N)idr|   r}   )rO   r   r}   r   rN   updater   r
   )rB   rY   text_idr}   s        rD   r_   zPebbloTextLoader.lazy_load9  s      !, 	IGAtC}}*H~~!c$..&9"9dnnQ>Oq 12xxADHH-hhqkcxHH	Is   CCc                 V    g }| j                         D ]  }|j                  |        |S )z`
        Load text data into Documents.

        Returns:
            List of Documents
        )r_   append)rB   	documentsrk   s      rD   rI   zPebbloTextLoader.loadI  s3     	>># 	"CS!	"rF   )rb   r   r   r   r   r/   r	   r   r   r   rE   r   r
   r_   rI   rx   rF   rD   r   r     s     !%#'-148#}# 	#
 d3i # 4S>*# Dc3h01# 
#6I8H- I 
d8n 
rF   r   )#r   loggingr5   r2   importlib.metadatar   typingr   r   r   r   r   r	   langchain_core.documentsr
   )langchain_community.document_loaders.baser   $langchain_community.utilities.pebblor   r   r   r   r   r   r   r   r   r   r   r   	getLoggerrb   rc   r   r   rx   rF   rD   <module>r      sc    @  	  & @ @ - @    
		8	$uz up=z =rF   