
    7|hb
                         d dl Z d dlZd dlmZ d dlmZmZmZ d dlm	Z	 d dl
mZ  ej                  e      Z G d de      Zy)    N)Path)DictIteratorUnion)Document)
BaseLoaderc                   d    e Zd ZdZ	 	 	 d
deeef   deedf   deedf   deddf
dZde	e
   fd	Zy)MHTMLLoaderz)Parse `MHTML` files with `BeautifulSoup`.N	file_pathopen_encoding	bs_kwargsget_text_separatorreturnc                     	 ddl }|| _        || _        |ddi}|| _        || _        y# t        $ r t        d      w xY w)a  initialize with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: Path to file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
            get_text_separator: The separator to use when getting the text
                from the soup.
        r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`featureslxml)bs4ImportErrorr   r   r   r   )selfr   r   r   r   r   s         i/var/www/html/test/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/mhtml.py__init__zMHTMLLoader.__init__   s]    "	 #*#V,I""4  	/ 	s   ) >c              #     K   ddl m} t        | j                  d| j                        5 }t        j                  |j                               }|j                         }t        |t              s|g}|D ]  }|j                         dk(  s|j                  d      j                         } ||fi | j                  }|j                  | j                        }|j                   r t#        |j                   j$                        }	nd}	t#        | j                        |	d	}
t'        ||

        ddd       y 	 ddd       y# 1 sw Y   yxY ww)z*Load MHTML document into document objects.r   )BeautifulSoupr)encodingz	text/htmlT)decode )sourcetitle)page_contentmetadataN)r   r   openr   r   emailmessage_from_stringreadget_payload
isinstancelistget_content_typer   r   get_textr   r   strstringr   )r   r   fmessagepartsparthtmlsouptextr   r!   s              r   	lazy_loadzMHTMLLoader.lazy_load0   s#     	&$..#0B0BC 	q//9G'')EeT* 	 ((*k9++4+8??AD(@@D==)@)@ADzz #DJJ$5$5 6 " #&dnn"5!&=H #xHH1	 		 	 	s0   )EAEB$E/	E8E:	EEE)NNr   )__name__
__module____qualname____doc__r   r+   r   dictr   r   r   r4        r   r
   r
      sr    3
 +/'+"$5d#5 S$Y'5 t$	5
  5 
5@8H- r;   r
   )r#   loggingpathlibr   typingr   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   	getLoggerr5   loggerr
   r:   r;   r   <module>rC      s:       ( ( - @			8	$@* @r;   