
    ihP0                     Z    d dl Z d dlmZmZmZ d dlmZ d dlmZ d dl	m
Z
  G d de      Zy)    N)IteratorLiteralOptional)
BaseLoader)Document)get_from_envc                       e Zd ZdZdedefdZdedefdZdddddd	ed
ee   dee   de	d   dee   f
dZ
dee   fdZy)FireCrawlLoadera!
  
    FireCrawlLoader document loader integration

    Setup:
        Install ``firecrawl-py``,``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.

        .. code-block:: bash

            pip install -U firecrawl-py langchain_community
            export FIRECRAWL_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import FireCrawlLoader

            loader = FireCrawlLoader(
                url = "https://firecrawl.dev",
                mode = "crawl"
                # other params = ...
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
             Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
             Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    paramsreturnc                    d}g d}|D ]  }|j                  |      sd} n |rt        j                  dt               d|v r|d   du r|d   |d<   |d= d|v r|d   du r|d   |d<   |d= d	|v r|d	   du r|d	   |d
<   |d	= d|v r|d   du r|d   |d<   |d= d|v r-t	        |d   t
              r| j                  |d         |d<   |d= |S )NF)includesexcludesallowBackwardCrawlingallowExternalContentLinkspageOptionsTBDeprecated parameters detected. See Firecrawl v1 docs for updates.r   includePathsr   excludePathsr   allowBackwardLinksr   allowExternalLinksr   scrapeOptions)getwarningswarnDeprecationWarning
isinstancedictlegacy_scrape_options_adapter)selfr   use_legacy_optionslegacy_keyskeys        l/var/www/html/dev/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/firecrawl.pylegacy_crawler_options_adapterz.FireCrawlLoader.legacy_crawler_options_adapterC   sH   "
  	Czz#%)"	
 MMT" V#*%--3J-?F>*:&V#*%--3J-?F>*:&&&012d:39:Q3RF/023*f456$>39:U3VF/067&f]3T:.2.P.P}-/F?+ =)    c                    d}dg}d|v rd|d   v r|d   d   dk(  s|d   d   dk(  s|d   d   dk(  rld}d	|d   v r,|d   d	   r|d   d	   |d
<   n|d   j                  d	d      |d
<   d|d   v r|d   d   r|d   d   |d<   d|d   v r|d   d   r|d   d   |d
<   |d= g d}|D ]  }|j                  |      sd} n |rt        j                  dt               d|v r|d   du r|j	                  d       |d= d|v r|d   du r|j                  d       |d= d|v r|d   du r|j                  d       |d= d|v r|d   du r|j                  d       |d= d|v r|d   du r|j                  d       |d= d|v r|d   du r|j                  d       |d= d|v r|d   du r|j                  d       |d= d|v r|d   du r|d   |d<   |d= d|v r|d   du r|d   |d <   |d= d!|vr||d!<   |S )"NFmarkdownextractorOptionsmodezllm-extractionzllm-extraction-from-raw-htmlzllm-extraction-from-markdownTextractionPromptpromptz-Extract page information based on the schema.extractionSchemaschema
userPrompt)	includeMarkdownincludeHtmlincludeRawHtmlincludeExtractincludeLinks
screenshotfullPageScreenshotonlyIncludeTags
removeTagsr   r0   r1   htmlr2   rawHtmlr3   extractr4   linksr5   r6   zscreenshot@fullPager7   includeTagsr8   excludeTagsformats)r   r   r   r   removeappend)r    r   r!   r?   scrape_keysr#   s         r$   r   z-FireCrawlLoader.legacy_scrape_options_adapters   s   ",' 233-.v6:JJ01&95601&956 *.&)V4F-GG!"456HI/56H/I 20F8, 066H/I/M/M 2 O0F8,
 *V4F-GG!"456HI/56H/I 20F8, $v.@'AA!"45lC/56H/I,/WF8,12

  	Czz#%)"	
 MMT" !F*+,5NN:.,-&-(D0NN6*=)6)*+t3NN9-+,6)*+t3NN9-+,'.)T1NN7+>*v%,'4/NN<0<(#v-./47NN#89/0 F*+,4,23D,EF=),-v%,'4/,2<,@F=)<(F" 'F9r&   Ncrawl)api_keyapi_urlr*   r   urlrD   rE   r*   )rC   scrapemapc                    	 ddl m} |dvrt        d| d      |st        d      |xs t	        dd	      } |||
      | _         || _        || _        |xs i | _        y# t        $ r t        d      w xY w)aR  Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified will be read from env var
                FIRECRAWL_API_KEY. Get an API key
            api_url: The Firecrawl API URL. If not specified will be read from env var
                FIRECRAWL_API_URL or defaults to https://api.firecrawl.dev.
            mode: The mode to run the loader in. Default is "crawl".
                 Options include "scrape" (single url),
                 "crawl" (all accessible sub pages),
                 "map" (returns list of links that are semantically related).
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py
        r   )FirecrawlAppzD`firecrawl` package not found, please run `pip install firecrawl-py`)rC   rG   searchrH   Invalid mode 'z/'. Allowed: 'crawl', 'scrape', 'search', 'map'.zUrl must be providedrD   FIRECRAWL_API_KEY)rD   rE   N)	firecrawlrJ   ImportError
ValueErrorr   rF   r*   r   )r    rF   rD   rE   r*   r   rJ   s          r$   __init__zFireCrawlLoader.__init__   s    4	.
 ;; &UV  344I\)5HI%gwG	l!  	V 	s   A! !A6c              #     K   | j                   dk(  rC| j                  j                  | j                  | j	                  | j
                              g}n| j                   dk(  rj| j                  st        d      | j                  j                  | j                  | j                  | j
                              }|j                  dg       }n| j                   dk(  rI| j                  st        d      | j                  j                  | j                  | j
                        }n3| j                   dk(  rt        d	      t        d
| j                    d      |D ]r  }| j                   dk(  r|}i }nJ|j                  d      xs% |j                  d      xs |j                  dd      }|j                  di       }|sdt        ||       t y w)NrG   )r   rC   zURL is required for crawl modedatarH   zURL is required for map moderK   z?Search mode is not supported in this version, please downgrade.rL   z%'. Allowed: 'crawl', 'scrape', 'map'.r(   r9   r:    metadata)page_contentrU   )r*   rN   
scrape_urlrF   r   r   rP   	crawl_urlr%   r   map_urlr   )r    firecrawl_docscrawl_responsedocrV   rU   s         r$   	lazy_loadzFireCrawlLoader.lazy_load  s    99 ))HHT%G%G%T * N
 YY'!88 !ABB!^^55!D!DT[[!Q 6 N ,//;NYY%88 !?@@!^^33DHHT[[3QNYY("Q   +PQ  " 	CyyE!" GGJ'T3776?TcggiQS>T  77:r2)! 	s   GG)__name__
__module____qualname____doc__r   r%   r   strr   r   rQ   r   r   r]    r&   r$   r
   r
   	   s    7r.T .d .`jD jT j` "&!%29!%,#,# #	,#
 #,# ./,# ,#\(8H- (r&   r
   )r   typingr   r   r   langchain_core.document_loadersr   langchain_core.documentsr   langchain_core.utilsr   r
   rc   r&   r$   <module>rh      s%     . . 6 - -lj lr&   