
    ihP5                        d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZ dZd	Zd
ZdZdZdZdZdZdZ  ejB                  e"      Z# eddd       G d dee             Z$y)    N)Path)AnyDictListMappingOptionalSequenceUnion)
deprecated)Document)	BaseModelmodel_validator)
BaseLoaderz#{http://www.w3.org/1999/xhtml}tablexpathidsourcename	structuretagprojectsz#https://api.docugami.com/v1preview1z0.0.24z1.0z!docugami_langchain.DocugamiLoader)sinceremovalalternative_importc                   <   e Zd ZU dZeZeed<   	 ej                  j                  d      Zee   ed<   	 dZeed<   	 dZeed<   	 d	Zeed
<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZeed<   	 dZee   ed<   	 dZeee      ed<   	 eeeeef         ed<   	 dZeed<   	  ed      edeee f   de fd              Z!	 	 d*de"dee   d ee#   de$e%   fd!Z&dede$e   fd"Z'dede$e   fd#Z(d$edefd%Z)	 	 d*d&ededee   d'ee#   de$e%   f
d(Z*de$e%   fd)Z+y)+DocugamiLoaderzdLoad from `Docugami`.

    To use, you should have the ``dgml-utils`` python package installed.
    apiDOCUGAMI_API_KEYaccess_tokeni   max_text_length    min_text_lengthi   max_metadata_lengthFinclude_xml_tagsr   parent_hierarchy_levelsdoc_idparent_id_keysub_chunk_tablesTwhitespace_normalize_textN	docset_iddocument_ids
file_paths(include_project_metadata_in_doc_metadatabefore)modevaluesreturnc                    |j                  d      r|j                  d      rt        d      |j                  d      s|j                  d      st        d      |j                  d      r|j                  d      st        d      |S )zValidate that either local file paths are given, or remote API docset ID.

        Args:
            values: The values to validate.

        Returns:
            The validated values.
        r+   r)   z7Cannot specify both file_paths and remote API docset_idz6Must specify either file_paths or remote API docset_idr   z7Must specify access token if using remote API docset_id)get
ValueError)clsr/   s     k/var/www/html/dev/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/docugami.pyvalidate_local_or_remotez'DocugamiLoader.validate_local_or_remoteT   sr     ::l#

;(?VWWzz,'

;0GUVV::k"6::n+EVWW    contentdocument_nameadditional_doc_metadatac           	      (    	 ddl m} 	 ddlm} ddlm} d|dt        f fd	}|j                  t        j                  |            }|j                         }	 ||	 j                   j                   j                   j                   j                    j"                  
      }
i }|
D ]  } ||      }|j$                  j'                  t(              }|s-|||<   |j*                  s? ||j*                        }|j$                  j'                  t(              }|ss|j,                  s||j$                   j.                  <   |||<    t1        |j3                               S # t        $ r t        d      w xY w# t        $ r t        d      w xY w)z6Parse a single DGML document into a list of Documents.r   etreePCould not import lxml python package. Please install it with `pip install lxml`.)Chunk)
get_chunkszaCould not import from dgml-utils python package. Please install it with `pip install dgml-utils`.dg_chunkr0   c                    t        j                  | j                  j                               j	                         }t
        | j                  t        |t        t        t        | j                  t        | j                  i}| j                  }rj                  r|j                         t!        |d j"                   |      S )N)page_contentmetadata)hashlibmd5textencode	hexdigest	XPATH_KEYr   ID_KEYDOCUMENT_NAME_KEYDOCUMENT_SOURCE_KEYSTRUCTURE_KEYr   TAG_KEYr   r,   updater   r   )rA   
_hashed_idrD   rG   r:   r9   selfs       r5   _build_framework_chunkz:DocugamiLoader._parse_dgml.<locals>._build_framework_chunk   s     X]]%9%9%;<FFHJ8>>
!=#]x11H ==D&@@OO$;<!"8D$8$89! r7   )r!   r   r(   r'   r#   r$   )lxmlr=   ImportErrordgml_utils.modelsr?   dgml_utils.segmentationr@   r   parseioBytesIOgetrootr!   r   r(   r'   r#   r$   rD   r2   rK   parentrC   r&   listr/   )rR   r8   r9   r:   r=   r?   r@   rS   treeroot	dg_chunksframework_chunksrA   framework_chunkchunk_idframework_parent_chunk	parent_ids   ` ``             r5   _parse_dgmlzDocugamiLoader._parse_dgmlj   s   	"	/:	U 	x 	. {{2::g./||~ 00 00&*&D&D!22!22$($@$@
	 13! 
	MH4X>O&//33F;H-< *??-CHOO-T* 6 ? ? C CF KI %;%H%HGP001C1CD6L(3
	M $++-..A  	= 	  	C 	s   E$ E< $E9<Fc                 B   | j                    d| d}g }|rt        j                  |dd| j                   i      }|j                  r7|j                         }|j                  |d          |j                  dd      }nt        d	| d
|j                   d      |r|S )z1Gets all document details for the given docset ID	/docsets/z
/documentsAuthorizationBearer )headers	documentsnextNFailed to download 
 (status: ))	r   requestsr2   r   okjsonextend	Exceptionstatus_code)rR   r)   urlall_documentsresponsedatas         r5   _document_details_for_docset_idz.DocugamiLoader._document_details_for_docset_id   s    
)I;j9||(GD4E4E3F*GHH {{}}$$T+%67hhvt,)#j9M9M8NaP   r7   c                 D   | j                    d| }g }|rt        j                  d|dd| j                   ii       }|j                  r7|j                         }|j                  |d          |j                  dd      }nt        d	| d
|j                   d      |r|S )z0Gets all project details for the given docset IDz/projects?docset.id=GETri   rj   rk   rz   r   rm   Nrn   ro   rp   )
r   rq   requestr   rr   rs   rt   r2   ru   rv   )rR   r)   rw   all_projectsry   rz   s         r5   _project_details_for_docset_idz-DocugamiLoader._project_details_for_docset_id   s    
.yk:''(GD4E4E3F*GH	H {{}}##D$45hhvt,)#j9M9M8NaP    r7   projectc                    |j                  t              }| j                   d| d}g }i }|rt        j                  d|dd| j
                   ii       }|j                  r7|j                         }|j                  |d          |j                  dd	      }n-|j                  d
k(  r|S t        d| d|j                   d      |r|D ]y  }|j                  d      }	|j                  d      }
|j                  d      }|	dk(  s=|
s@|sC|t           }i }t        j                  d|
 ddd| j
                   ii       }|j                  r	 ddlm} |j                  t        j                   |j"                              }|j%                         }|j&                  }|j)                  d|      }|D ]u  }|j)                  d|      d   j*                  }dj-                  |j)                  d|      d   j/                               j1                         }|d	| j2                   ||<   w |||<   jt        d|
 ddz          |S # t        $ r t        d      w xY w)z#Gets project metadata for all filesz
/projects/z/artifacts/latestr}   ri   rj   r~   	artifactsrm   Ni  rn   ro   rp   r   rw   documentzreport-values.xmlz/contentr   r<   r>   z
//pr:Entry)
namespacesz./pr:Heading z
./pr:Valuez	/content z (status: {response.status_code}))r2   rK   r   rq   r   r   rr   rs   rt   rv   ru   rT   r=   rU   rX   rY   rZ   r8   r[   nsmapr   rG   joinitertextstripr"   )rR   r   
project_idrw   all_artifactsper_file_metadatary   rz   artifactartifact_nameartifact_urlartifact_docr%   rD   r=   artifact_treeartifact_rootnsentriesentryheadingvalues                         r5   _metadata_for_projectz$DocugamiLoader._metadata_for_project   s   [[(

*ZL0AB"$''(GD4E4E3F*GH	H {{}}$$T+%67hhvt,%%,(()#j9M9M8NaP  & & (	H$LL0M#<<.L#<<
3L 33%f-!# $++#nH-,8I8I7J.KL	 ;;. %*KK

8;K;K0L$MM$1$9$9$;M&,,B+11,21NG!( N"'++n+"LQ"O"T"T #!KKKDQGPPR!%'  -22LD4L4L,M)N 19%f-#-l^9E<= K(	T ! - ' )I s   I

Idocument_idadditional_metadatac                    | j                    d| d| d}t        j                  d|dd| j                   ii       }|j                  r| j                  |j                  ||      S t        d	| d
|j                   d      )zLoad chunks for a document.rh   z/documents/z/dgmlr}   ri   rj   r~   )r8   r9   r:   rn   ro   rp   )	r   rq   r   r   rr   rf   r8   ru   rv   )rR   r   r)   r9   r   rw   ry   s          r5   _load_chunks_for_documentz(DocugamiLoader._load_chunks_for_document(  s     
)I;k+eL##$0A0A/B&CD	
 ;;## ((+(; $   %cU*X5I5I4J!L r7   c                 ^   g }| j                   r!| j                  r| j                  | j                        }| j                  r%|D cg c]  }|t           | j                  v s| }}| j                  | j                        }i }|rO| j                  rC|D ]>  }| j                  |      }|D ]&  }||vr	||   ||<   ||   j                  ||          ( @ |D ]S  }	|	t           }
|	j                  t              }|j                  |
      }|| j                  |
| j                  ||      z  }U |S | j                  r_| j                  D ]P  }t        |      }t        |d      5 }|| j                  |j!                         |j"                        z  }ddd       R |S c c}w # 1 sw Y   cxY w)zLoad documents.)r   r)   r9   r   rb)r8   r9   N)r   r)   r{   r*   rK   r   r,   r   rP   r2   rL   r   r+   r   openrf   readr   )rR   chunks_document_detailsd_project_detailscombined_project_metadatar   rD   file_iddocr%   doc_namedoc_metadatapathfiles                  r5   loadzDocugamiLoader.loadD  s   !# $ D DT^^ T  0%AfIARAR4RA%! %  $BB4>>R9;%D$Q$Q  0 YG#99'BH#+ Y"*CCAI'AR5g>5g>EEhwFWX	YY ) 	V77#458<<VD$88 &"nn"*(4	 9  		(  __ Dz$% d.. $		&*ii /  F  K%> s   F#F#/F##F,	)NN),__name__
__module____qualname____doc__DEFAULT_API_ENDPOINTr   str__annotations__osenvironr2   r   r   r   intr!   r"   r#   boolr$   r&   r'   r(   r)   r*   r	   r
   r   r,   r   classmethodr   r   r6   bytesr   r   r   rf   r{   r   r   r   r    r7   r5   r   r      s   
 $C#+"$**..1C"DL(3-D/OS,OSS""/"d"8#$S$K!M3!)"d"9&*t*'  $Ix}#,,0L(8C=)0/%c	"2344&59,d9V(#d38n    $. (,59	I/I/  }I/ "*'!2	I/
 
hI/V d * T
 .E!T E!d E!V (,15   }	
 &g. 
h8-d8n -r7   r   )%rE   rY   loggingr   pathlibr   typingr   r   r   r   r   r	   r
   rq   langchain_core._api.deprecationr   langchain_core.documentsr   pydanticr   r   )langchain_community.document_loaders.baser   
TABLE_NAMErJ   rK   rM   rL   rN   rO   PROJECTS_KEYr   	getLoggerr   loggerr   r   r7   r5   <module>r      s     	  	  F F F  6 - / @2
		  
< 			8	$ 
:
NZ N
Nr7   