
    ihB                     Z    d dl mZmZmZmZ d dlmZmZ d dlm	Z	 d dl
mZ  G d de      Zy)    )AnyIteratorListOptional)urljoinurlparse)Document)WebBaseLoaderc                        e Zd ZdZ	 	 	 	 	 ddededee   dededef fdZd	ee	   fd
Z
	 ddedee   d	ee	   fdZded	ee   fdZ xZS )GitbookLoaderztLoad `GitBook` data.

    1. load from either a single page, or
    2. load all (relative) paths in the navbar.
    web_pageload_all_pathsbase_urlcontent_selectorcontinue_on_failureshow_progressc                     |xs || _         | j                   j                  d      r| j                   dd | _         |r| j                    d}t        |   |f||       || _        || _        y)a  Initialize with web page and whether to load all paths.

        Args:
            web_page: The web page to load or the starting point from where
                relative paths are discovered.
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`.
            base_url: If `load_all_paths` is True, the relative paths are
                appended to this base url. Defaults to `web_page`.
            content_selector: The CSS selector for the content to load.
                Defaults to "main".
            continue_on_failure: whether to continue loading the sitemap if an error
                occurs loading a url, emitting a warning instead of raising an
                exception. Setting this to True makes the loader more robust, but also
                may result in missing data. Default: False
            show_progress: whether to show a progress bar while loading. Default: True
        /Nz/sitemap.xml)	web_pathsr   r   )r   endswithsuper__init__r   r   )selfr   r   r   r   r   r   	__class__s          j/var/www/html/dev/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/gitbook.pyr   zGitbookLoader.__init__   sw    4 !,H==!!#& MM#2.DM--5Hk 3' 	 	

 - 0    returnc              #     K   | j                   r| j                         }| j                  |      }|D cg c]  }t        | j                  |       }}| j                  |      }t        ||      D ]  \  }}| j                  ||      }|s|   y| j                         }| j                  || j                        }|r| yyc c}w w)z(Fetch text from one single GitBook page.N)	r   scrape
_get_pathsr   r   
scrape_allzip_get_documentweb_path)r   	soup_inforelative_pathspathurls
soup_infosurldocs           r   	lazy_loadzGitbookLoader.lazy_load8   s     I!__Y7N=KLTGDMM40LDL.J"%j$"7 	3((C8I I$$Y>C	  Ms   2CC9CA Csoup
custom_urlc                     |j                  | j                        }|sy|j                  d      j                         }|j                  d      }|r|j                  nd}|xs | j
                  |d}t        ||      S )z,Fetch content from page and return Document.N
)	separatorh1 )sourcetitle)page_contentmetadata)findr   get_textstriptextr%   r	   )r   r.   r/   page_content_rawcontenttitle_if_existsr6   r8   s           r   r$   zGitbookLoader._get_documentJ   sz      99T%:%:;"++d+;AAC*//5(7$$R(9DMMEJWx@@r   c                     |j                  d      D cg c]!  }t        |j                        j                  # c}S c c}w )z'Fetch all relative paths in the navbar.loc)find_allr   r<   r(   )r   r.   rA   s      r   r!   zGitbookLoader._get_pathsW   s.    37==3GHC"''HHHs   &=)FNmainFT)N)__name__
__module____qualname____doc__strboolr   r   r   r	   r-   r   r$   r   r!   __classcell__)r   s   @r   r   r   	   s      %"& &$)"&1&1 &1 3-	&1
 &1 "&1 &1P8H- & 6:AA%-c]A	(	AIs ItCy Ir   r   N)typingr   r   r   r   urllib.parser   r   langchain_core.documentsr	   -langchain_community.document_loaders.web_baser
   r    r   r   <module>rP      s$    0 0 * - GPIM PIr   