
    ih                         d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ  e j                   e      Z G d de      Zy)	    N)Path)AnyIteratorListMappingOptional)Document)
BaseLoader)BibtexparserWrapperc                       e Zd ZdZdddddddedee   d	ee   d
ee   dedefdZ	de
eef   dee   fdZdee   fdZy)BibtexLoadera  Load a `bibtex` file.

    Each document represents one entry from the bibtex file.

    If a PDF file is present in the `file` bibtex field, the original PDF
    is loaded into the document text. If no such file entry is present,
    the `abstract` field is used instead.
    Ni  Fz
[^:]+\.pdf)parsermax_docsmax_content_charsload_extra_metadatafile_pattern	file_pathr   r   r   r   r   c                    || _         |xs
 t               | _        || _        || _        || _        t        j                  |      | _        y)a  Initialize the BibtexLoader.

        Args:
            file_path: Path to the bibtex file.
            parser: The parser to use. If None, a default parser is used.
            max_docs: Max number of associated documents to load. Use -1 means
                           no limit.
            max_content_chars: Maximum number of characters to load from the PDF.
            load_extra_metadata: Whether to load extra metadata from the PDF.
            file_pattern: Regex pattern to match the file name in the bibtex.
        N)	r   r   r   r   r   r   recompile
file_regex)selfr   r   r   r   r   r   s          i/var/www/html/dev/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/bibtex.py__init__zBibtexLoader.__init__   sB    * #5 3 5 !2#6 **\2    entryreturnc                 x   dd l }t        | j                        j                  }| j                  j                  |j                  dd            }|sy g }|D ]8  }	 |j                  ||z        5 }|j                  d |D               d d d        : dj                  |      xs |j                  dd      }	| j                  r|	d | j                   }	| j                  j                  || j                         }
t#        |	|
      S # 1 sw Y   xY w# t        $ r}t        j                  |       Y d }~d }~ww xY w)	Nr   file c              3   <   K   | ]  }|j                           y w)N)get_text).0pages     r   	<genexpr>z+BibtexLoader._load_entry.<locals>.<genexpr>@   s      ?T ?s   
abstract)
load_extra)page_contentmetadata)fitzr   r   parentr   findallgetopenextendFileNotFoundErrorloggerdebugjoinr   r   get_metadatar   r	   )r   r   r+   
parent_dir
file_namestexts	file_namefecontentr*   s              r   _load_entryzBibtexLoader._load_entry4   s    $..)00
__,,UYYvr-BC
# 	 I YYzI56 @!LL ?Q ??@	  ))E"?eii
B&?!!6 6 67G;;++Ed>V>V+W 
 	
@ @$  Q s0   D.DDD	
D	D9D44D9c              #     K   	 ddl }| j                  j                  | j                        }| j
                  r|d| j
                   }|D ]  }| j                  |      }|s|  y# t        $ r t        d      w xY ww)a  Load bibtex file using bibtexparser and get the article texts plus the
        article metadata.
        See https://bibtexparser.readthedocs.io/en/master/

        Returns:
            a list of documents with the document.page_content in text format
        r   NzGPyMuPDF package not found, please install it with `pip install pymupdf`)r+   ImportErrorr   load_bibtex_entriesr   r   r=   )r   r+   entriesr   docs        r   	lazy_loadzBibtexLoader.lazy_loadL   s     	 ++11$..A==o.G 	E""5)C		  	( 	s"   B A( AB !B (A==B )__name__
__module____qualname____doc__strr   r   intboolr   r   r   r	   r=   r   rC    r   r   r   r      s     15"&+0$))33 ,-	3
 3-3 $C=3 "3 38
c!2 
x7I 
08H- r   r   )loggingr   pathlibr   typingr   r   r   r   r   langchain_core.documentsr	   )langchain_community.document_loaders.baser
   $langchain_community.utilities.bibtexr   	getLoggerrD   r2   r   rK   r   r   <module>rS      s=     	  9 9 - @ D			8	$T: Tr   