
    ih                     v    d Z ddlZddlmZmZmZ ddlmZ ddlm	Z	  ej                  e      Z G d de	      Zy)z1Loader that uses unstructured to load HTML files.    N)AnyIteratorList)Document)
BaseLoaderc                   h    e Zd ZdZ	 	 	 	 ddee   dedededededd	fd
Zdee	   fdZ
dee	   fdZy	)NewsURLLoadera/  Load news articles from URLs using `Unstructured`.

    Args:
        urls: URLs to load. Each is loaded into its own document.
        text_mode: If True, extract text from URL and use that for page content.
            Otherwise, extract raw HTML.
        nlp: If True, perform NLP on the extracted contents, like providing a summary
            and extracting keywords.
        continue_on_failure: If True, continue loading documents even if
            loading fails for a particular URL.
        show_progress_bar: If True, use tqdm to show a loading progress bar. Requires
            tqdm to be installed, ``pip install tqdm``.
        **newspaper_kwargs: Any additional named arguments to pass to
            newspaper.Article().

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import NewsURLLoader

            loader = NewsURLLoader(
                urls=["<url-1>", "<url-2>"],
            )
            docs = loader.load()

    Newspaper reference:
        https://newspaper.readthedocs.io/en/latest/
    urls	text_modenlpcontinue_on_failureshow_progress_barnewspaper_kwargsreturnNc                     	 ddl }|j                  | _        || _        || _        || _        || _        || _        || _	        y# t        $ r t        d      w xY w)zInitialize with file path.r   NzMnewspaper package not found, please install it with `pip install newspaper3k`)
	newspaper__version___NewsURLLoader__versionImportErrorr
   r   r   r   r   r   )selfr
   r   r   r   r   r   r   s           g/var/www/html/dev/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/news.py__init__zNewsURLLoader.__init__+   si    	&22DN 	"#6  0!2  	, 	s   A Ac                     | j                         }| j                  r	 ddlm}  ||      }t	        |      S # t        $ r}t        d      |d }~ww xY w)Nr   )tqdmzPackage tqdm must be installed if show_progress_bar=True. Please install with 'pip install tqdm' or set show_progress_bar=False.)	lazy_loadr   r   r   list)r   iterr   es       r   loadzNewsURLLoader.loadF   s]    ~~!!% :DDz  !/ 	s   7 	A AAc              #     K   	 ddl m} | j                  D ]	  }	  ||fi | j                  }|j                          |j                          | j                  r|j                          t        |dd      t        |dt        |d	d            t        |d
g       t        |dd      t        |dd      t        |dd      d}| j                  r|j                  }n|j                  }| j                  r t        |dg       |d<   t        |dd      |d<   t!        ||        y # t        $ r}t        d      |d }~ww xY w# t        $ r4}| j                  r!t        j                  d| d|        Y d }~a|d }~ww xY ww)Nr   )ArticlezFCannot import newspaper, please install with `pip install newspaper3k`zError fetching or processing z, exception: title urlcanonical_linkauthors	meta_langmeta_descriptionpublish_date)r"   linkr&   languagedescriptionr)   keywordssummary)page_contentmetadata)r   r!   r   r
   r   downloadparser   	Exceptionr   loggererrorgetattrr   texthtmlr   )r   r!   r   r$   articler0   contents          r   r   zNewsURLLoader.lazy_loadT   sy    	) 99 "	DC!#?)>)>?  "88KKM !'26AQSU0VW"7Ir:#G["=&w0BBG ' DH ~~!,,!,,xx'.w
B'G$&-gy"&E#(CCE"	D  	X	  ++LL#@]STRU!VWGsX   FD$ FAE*B:F$	D>-D99D>>F	E>
'E91F7E99E>>F)TFTF)__name__
__module____qualname____doc__r   strboolr   r   r   r   r   r        r   r	   r	      s    @ $("'33i3 3 	3
 "3  3  3 
36d8n *D8H- *DrB   r	   )r>   loggingtypingr   r   r   langchain_core.documentsr   )langchain_community.document_loaders.baser   	getLoggerr;   r4   r	   rA   rB   r   <module>rH      s9    7  & & - @			8	$qDJ qDrB   