
    7|hQ                         d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ  G d dee      Z G d d	e      Z G d
 de      Zy)    N)ABCabstractmethod)DictIteratorOptionalTupleUnion)Document)
BaseLoaderc            !       @   e Zd ZdZdddddddd	d	dddd
d
ddedededeeef   dededededededeeef   deeef   deeef   dee   dee   dd
f dZ	de
e   fdZedefd       Zdedefd Zd!ed"ede
e   fd#Z	 d(d!eded$ee   de
e   fd%Zd&edeeef   fd'Zy
))DedocBaseLoadera  
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
            (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
            are split according to the `split` parameter.
            For attachments, langchain Document object has an additional metadata field
            `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds to one
            langchain Document object.
            For tables, Document object has additional metadata fields `type`="table"
            and `text_as_html` with table HTML representation.
    documentTF
   
auto_tabbyrus+eng:autoN)splitwith_tableswith_attachmentsrecursion_deep_attachmentspdf_with_text_layerlanguagepagesis_one_column_documentdocument_orientationneed_header_footer_analysisneed_binarizationneed_pdf_table_analysis	delimiterencoding	file_pathr   r   r   r   r   r   r   r   r   r   r   r   r    r!   returnc                l   t               j                         D ci c]  \  }}|dvr|| c}}| _        h d| _        || j                  vrt	        d| d| j                   d      || _        || _        || _        | j
                  dk(  rdnd}|| j                  d	<   || j                  d
<   yc c}}w )a
  
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
                    ODP)
                "node": split document text into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        >   selfr   r"   r   >   linenodepager   Got $ for `split`, but should be one of ``r'   treelinearstructure_typeneed_content_analysisN)localsitemsparsing_parametersvalid_split_values
ValueErrorr   r   r"   )r%   r"   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   keyvaluer.   s                      i/var/www/html/test/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/dedoc.py__init__zDedocBaseLoader.__init__#   s    F %hnn.#
UGG J#

 #G///ug ++,A/  
&"#'::#7X4B 01;K 78##
s   B0c              #     K   ddl }	 ddlm}  || j	                               }d|j
                  d   _        |j                         5 }|j                  | j                  i | j                  d|i	      }ddd       | j                  j                         j                         | j                  
      E d{    y# t        $ r t        d      w xY w# 1 sw Y   cxY w7 )w)Lazily load documents.r   N)DedocManagerzE`dedoc` package not found, please install it with `pip install dedoc`)manager_configTloggerattachments_dir)r"   
parametersdocument_treer   )tempfilededocr;   ImportError_make_configconfigdisabledTemporaryDirectoryparser"   r2   _split_documentto_api_schemadictr   )r%   rB   r;   dedoc_managertmpdirrA   s         r7   	lazy_loadzDedocBaseLoader.lazy_loadw   s     	*
 %D4E4E4GH26X&/((* 	f)//..Qd55Q7H&Q 0 M	
 '''557<<>djj ( 
 	
 	
  	W 		 	
	
s@   C'C ;C'	.C7AC';C%<C'CC'C"C'c                      y)zu
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        N r%   s    r7   rE   zDedocBaseLoader._make_config   s     	    	paragraphc                     dj                  |d   D cg c]  }| j                  |       c}      }|r|d    d| }|S |d   }|S c c}w )z1Get text (recursively) of the document tree node.
subparagraphstext)join	_json2txt)r%   rT   subparagraphsubparagraphs_textrX   s        r7   rZ   zDedocBaseLoader._json2txt   s{    !YY %.o$>  |,
 "  !$6#78 	
  6" 	
 s   ArA   document_metadatac              #      K   t        |d         dkD  r&|d   D ]  }| j                  ||      E d{     yt        |d   i ||d          y7 !w)z4Parse recursively document tree obtained by `dedoc`.rW   r   rA   r]   NrX   metadatapage_contentr`   )len_parse_subparagraphsr
   )r%   rA   r]   r[   s       r7   rd   z$DedocBaseLoader._parse_subparagraphs   s      }_-.2 -o > 44".BS 5   
 *62K-Kz1JK 	s   0AA"Aadditional_metadatac              #     K   |d   }|ri ||}|dk(  r)| j                  |d   d         }t        ||       n|dk(  r|d   d   d   }|d	   d   d
   }d}|D ]P  }	|	d   d
   |k(  r|| j                  |	      z  }#t        |i |d
|i       |	d   d
   }| j                  |	      }R t        |i |d
|i       n|dk(  r8|d   d   d   D ])  }	|	d   }
t        | j                  |	      i ||
       + nC|dk(  r"| j                  |d   d   |      E d{    nt        d| d| j                   d      | j
                  r:|d   d   D ]/  }| j                  |      \  }}t        |i |d   d|d       1 |d   D ]*  }| j                  || j                  ddi      E d{    , y7 7 	w)z=Split document into parts according to the `split` parameter.r`   r   content	structure)rT   ra   r(   rW   r   page_id r&   r'   r_   Nr)   r*   r+   tablestable)typetext_as_htmlattachmentsrm   
attachment)rA   r   re   )	rZ   r
   rd   r4   r3   r   
_get_tablerJ   r   )r%   rA   r   re   r]   rX   nodesri   	page_textr'   line_metadatarl   
table_text
table_htmlrp   s                  r7   rJ   zDedocBaseLoader._split_document   s     **5 L#4 L8K LJ>>M),D[,Q>RD7HIIf_!),[9/JEAhz*95GI 	5
#I.'9!55I"%.!J$5!Jy'!J  #:.y9G $t 4I	5 &B-By'B 
 f_%i0=oN  $Z 0!%!5C 1C]C  f_00+I6{C"3 1    ug ++,A/ 
 &y1(; 	)-)?&
J!+
+ '(2 	 (6 	J++(jj%+\$: ,   	/0s%   D"G$F?%BG7G8GGrl   c           
      l   d}|d   D ]1  }|D ]%  }|dj                  d |d   D              z  }|dz  }' |dz  }3 d}|d   D ]g  }|d	z  }|D ]V  }dj                  d
 |d   D              }t        j                  |      }|dz  }|d   r|dz  }|d|d    d|d    d| dz  }X |dz  }i |dz  }||fS )z.Get text and HTML representation of the table.rj   cells c              3   &   K   | ]	  }|d      ywrX   NrQ   .0r&   s     r7   	<genexpr>z-DedocBaseLoader._get_table.<locals>.<genexpr>  s     &NtF|&N   lines	rV   zK<table border="1" style="border-collapse: collapse; width: 100%;">
<tbody>
z<tr>
c              3   &   K   | ]	  }|d      ywr{   rQ   r|   s     r7   r~   z-DedocBaseLoader._get_table.<locals>.<genexpr>  s     %Mtd6l%Mr   z<td	invisiblez style="display: none" z
 colspan="colspanz" rowspan="rowspanz">z</td>
z</tr>
z</tbody>
</table>)rY   htmlescape)r%   rl   ru   rowcellrv   	cell_texts          r7   rq   zDedocBaseLoader._get_table   s-   
> 	C #chh&NW&NNN
d"
# $J		 	 > 	$C("J 	 II%MtG}%MM	 KK	2	e#
$";;J i 1 2Y(9+W>
	 )#J	$ 	**
:%%rS   N)__name__
__module____qualname____doc__strboolr	   intr   r8   r   r
   rO   r   rL   rE   rZ   rd   rJ   r   rq   rQ   rS   r7   r   r      s   (   -2*,#/!&,$*8=.348#'"&#RLRL 	RL
 RL  T	*RL %(RL !RL RL RL !$RL "RL &+39%5RL !d+RL "'sDy!1RL  C=!RL" 3-#RL$ 
%RLh
8H- 
, d  4 C !6:	(	& /3	II I &d^	I
 
(	IV& &sCx &rS   r   c                       e Zd ZdZdefdZy)DedocFileLoaderaw  
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is gives at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    r#   c                 `    ddl m}  || j                  | j                  | j                        S )Nr   )make_manager_config)r"   parsing_paramsr   )dedoc.utils.langchainr   r"   r2   r   )r%   r   s     r7   rE   zDedocFileLoader._make_config`  s)    ="nn22**
 	
rS   N)r   r   r   r   rL   rE   rQ   rS   r7   r   r     s    @D
d 
rS   r   c            #       
    e Zd ZdZdddddddd	d
d
dddddddededededeeef   dededededededeeef   deeef   deeef   dee   dee   ddf" fdZ	de
e   fdZdefd Zdeded!edeeeeeef   f   fd"Z xZS )#DedocAPIFileLoaderaU  
    Load files using `dedoc` API.
    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
        https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
            https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    zhttp://0.0.0.0:1231r   TFr   r   r   r   r   N)urlr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r#   c                n    t         |   ||||||||	|
||||||       || _        d| j                  d<   y)a
  Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document object
                    (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes (title nodes, list item nodes,
                    raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        )r"   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   jsonreturn_formatN)superr8   r   r2   )r%   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   	__class__s                    r7   r8   zDedocAPIFileLoader.__init__  s^    B 	#-'A 3#9!5(C/$; 	 	
" 390rS   c              #      K   | j                  | j                  | j                  | j                        }| j	                  || j
                        E d{    y7 w)r:   )r   r"   r?   r@   N)
_send_filer   r"   r2   rJ   r   )r%   doc_trees     r7   rO   zDedocAPIFileLoader.lazy_load	  sM     ??DNNt?V?V # 
 ''hdjj'QQQs   AAAAc                     i S r   rQ   rR   s    r7   rE   zDedocAPIFileLoader._make_config  s    	rS   r?   c                    ddl }t        j                  j                  |      }t	        |d      5 }d||fi}|j                  | d||      }ddd       j                  dk7  r&t        d|j                  j                                t        j                  |j                  j                               }	|	S # 1 sw Y   mxY w)	z7Send POST-request to `dedoc` API and return the resultsr   Nrbfilez/upload)filesdata   zError during file handling: )requestsospathbasenameopenpoststatus_coder4   rg   decoder   loads)
r%   r   r"   r?   r   	file_namer   r   rresults
             r7   r   zDedocAPIFileLoader._send_file  s     	GG$$Y/	)T" 	Mdi./EWoULA	M ==C;AII<L<L<N;OPQQAII,,./	M 	Ms   B::C)r   r   r   r   r   r   r	   r   r   r8   r   r
   rO   rL   rE   r   listr   __classcell__)r   s   @r7   r   r   j  s~   GZ ) -2*,#/!&,$*8=.348#'"&%S:S: 	S:
 S: S:  T	*S: %(S: !S: S: S: !$S: "S: &+39%5S: !d+S:  "'sDy!1!S:" C=#S:$ 3-%S:& 
'S:jR8H- Rd #&48	c5tS))	*rS   r   )r   r   r   abcr   r   typingr   r   r   r   r	   langchain_core.documentsr
   )langchain_community.document_loaders.baser   r   r   r   rQ   rS   r7   <module>r      sQ      	 #  . @H&j# H&VJ
o J
Zx xrS   