
    ih                         d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z	 d dl
mZ  e j                  e      Z G d de      Zy)    N)Path)IteratorOptionalSequenceUnion)Document)
BaseLoaderc                       e Zd ZdZ	 	 	 	 ddeeef   dee   deee	      dee
   dee
   f
dZd	 Zd
efdZd
ee   fdZy)MWDumpLoadera  Load `MediaWiki` dump from an `XML` file.

    Example:
        .. code-block:: python

            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.document_loaders import MWDumpLoader

            loader = MWDumpLoader(
                file_path="myWiki.xml",
                encoding="utf8"
            )
            docs = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            texts = text_splitter.split_documents(docs)


    :param file_path: XML local file path
    :type file_path: str
    :param encoding: Charset encoding, defaults to "utf8"
    :type encoding: str, optional
    :param namespaces: The namespace of pages you want to parse.
        See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
        for a list of all common namespaces
    :type namespaces: List[int],optional
    :param skip_redirects: TR=rue to skip pages that redirect to other pages,
        False to keep them. False by default
    :type skip_redirects: bool, optional
    :param stop_on_error: False to skip over pages that cause parsing errors,
        True to stop. True by default
    :type stop_on_error: bool, optional
    N	file_pathencoding
namespacesskip_redirectsstop_on_errorc                     t        |t              r|n
t        |      | _        || _        || _        || _        || _        y )N)
isinstancestrr   r   r   r   r   )selfr   r   r   r   r   s         p/var/www/html/dev/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/mediawikidump.py__init__zMWDumpLoader.__init__0   s8     '1C&@c)n $,*    c                     	 dd l }|j                  j                  t	        | j
                  | j                              S # t        $ r}t        d      |d }~ww xY w)Nr   zBUnable to import 'mwxml'. Please install with `pip install mwxml`.)r   )mwxmlImportErrorDump	from_fileopenr   r   )r   r   es      r   _load_dump_filezMWDumpLoader._load_dump_file?   sU    	 zz##D$--$PQQ  	T	s   A   	A	AAreturnc                     	 ddl }|D ]M  }|j                  |j                        }|j	                  ddd      }d|j
                  i}t        ||      c S  y# t        $ r}t        d      |d}~ww xY w)	zParse a single page.r   NzXUnable to import 'mwparserfromhell'. Please install with `pip install mwparserfromhell`.TF)	normalizecollapsekeep_template_paramssource)page_contentmetadata)mwparserfromhellr   parsetext
strip_codetitler   )r   pager(   r   revisioncoder*   r'   s           r   _load_single_page_from_dumpz(MWDumpLoader._load_single_page_from_dumpI   s    	#  	BH#))(--8D??E # D !$**-HAA	B  	3 	s   A 	A3"A..A3c              #     K   | j                         }|j                  D ]T  }| j                  r|j                  r| j                  r|j
                  | j                  vrA	 | j                  |       V y# t        $ r<}t        j                  dj                  |             | j                  r|Y d}~d}~ww xY ww)zLazy load from a file path.zParsing error: {}N)r   pagesr   redirectr   	namespacer0   	Exceptionloggererrorformatr   )r   dumpr-   r   s       r   	lazy_loadzMWDumpLoader.lazy_loadZ   s     
 ##%JJ 	D""t}}4>>#H66t<<	  077:;%%Gs0   AB?!A74B?7	B< 2B72B?7B<<B?)utf8NFT)__name__
__module____qualname____doc__r   r   r   r   r   intboolr   r   r   r0   r   r:    r   r   r   r      s    !L #).2).(,+d#+ 3-+ Xc]+	+
 !+  ~+RB8 B"	(	r   r   )loggingpathlibr   typingr   r   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser	   	getLoggerr<   r6   r   rB   r   r   <module>rI      s7      6 6 - @			8	$a: ar   