
    ihg                       d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZmZ d dlmZmZmZmZmZ d dlm Z   ejB                  e"      Z#	 	 	 	 	 	 	 	 	 	 	 	 ddZ$ G d	 d
ee      Z%y)    )annotationsN)AnyDictIterableListLiteralMappingOptionalSequenceSetTupleUnioncast)
Embeddings)from_envget_pydantic_field_namessecret_from_env)	BaseModel
ConfigDictField	SecretStrmodel_validator)Selfc                
   t        |       D cg c]  }g  }}t        |       D cg c]  }g  }}t        t        |            D ]S  }|rt        ||         dk(  r|||      j                  ||          |||      j                  t        ||                U g }	t        |       D ]  }||   }
t        |
      dk(  r|	j                  d        (t        |
      dk(  r|	j                  |
d          Kt        ||         }t	        |
 D cg c]$  }t        d t	        |||         D              |z  & }}t        d |D              dz  }|	j                  |D cg c]  }||z  	 c}        |	S c c}w c c}w c c}w c c}w )N   r   c              3  ,   K   | ]  \  }}||z    y wN ).0valweights      ^/var/www/html/dev/engine/venv/lib/python3.12/site-packages/langchain_openai/embeddings/base.py	<genexpr>z6_process_batched_chunked_embeddings.<locals>.<genexpr>K   s       #V &Ls   c              3  &   K   | ]	  }|d z    yw)   Nr   )r   r    s     r"   r#   z6_process_batched_chunked_embeddings.<locals>.<genexpr>U   s     6sCF6   g      ?)rangelenappendsumzip)	num_textstokensbatched_embeddingsindices
skip_empty_resultsnum_tokens_in_batchi
embeddings_resulttotal_weight	embeddingaverage	magnituder    s                   r"   #_process_batched_chunked_embeddingsr;      s    5:)4D'Eq'EG'E
 9>i8H+I1B+I+I3w<  ?#0349
""#5a#89GAJ'..s6!9~>	? /1J9  D%,QZw<1 d#\Qgaj) 2156L "%g   '*96I!6L'M  	G  6g66#=I'B3sYBCA DD c (F
 ,J< Cs   	E1	E6)E;F 
c                     e Zd ZU dZ edd      Zded<    edd      Zded<   dZd	ed
<   dZ	ded<   	 eZ
ded<    e edd      d      Zded<   	  ed edd            Zded<   	  e edd            Zded<    e edd            Zded<   dZded<   	  ed  ed!d            Zd"ed#<   	  ed$ ed%d&gd            Zded'<   	 dZd(ed)<   dZd*ed+<   d,Zded-<   	 d.Zded/<   	  edd01      Zd2ed3<   	 dZded4<   dZd5ed6<   	 dZded7<   	 d8Zd5ed9<   	  ee      Zd:ed;<   	 d8Zd5ed<<   	 dZ d=ed><   dZ!d?ed@<   dAZ"dedB<   	 dCZ#dedD<   	 dZ$dEedF<   	 dZ%dEedG<   	 dZ&d5edH<   	  e'dIddJK      Z( e)dLM      e*dZdN              Z+ e)dOM      d[dP       Z,e-d\dQ       Z.	 	 	 	 	 	 d]dRZ/ddS	 	 	 	 	 	 	 d^dTZ0ddS	 	 	 	 	 	 	 d^dUZ1	 d_	 	 	 	 	 d`dVZ2	 d_	 	 	 	 	 d`dWZ3dadXZ4dadYZ5y)bOpenAIEmbeddingsu	  OpenAI embedding model integration.

    Setup:
        Install ``langchain_openai`` and set environment variable ``OPENAI_API_KEY``.

        .. code-block:: bash

            pip install -U langchain_openai
            export OPENAI_API_KEY="your-api-key"

    Key init args — embedding params:
        model: str
            Name of OpenAI model to use.
        dimensions: Optional[int] = None
            The number of dimensions the resulting output embeddings should have.
            Only supported in `text-embedding-3` and later models.

    Key init args — client params:
        api_key: Optional[SecretStr] = None
            OpenAI API key.
        organization: Optional[str] = None
            OpenAI organization ID. If not passed in will be read
            from env var OPENAI_ORG_ID.
        max_retries: int = 2
            Maximum number of retries to make when generating.
        request_timeout: Optional[Union[float, Tuple[float, float], Any]] = None
            Timeout for requests to OpenAI completion API

    See full list of supported init args and their descriptions in the params section.

    Instantiate:
        .. code-block:: python

            from langchain_openai import OpenAIEmbeddings

            embed = OpenAIEmbeddings(
                model="text-embedding-3-large"
                # With the `text-embedding-3` class
                # of models, you can specify the size
                # of the embeddings you want returned.
                # dimensions=1024
            )

    Embed single text:
        .. code-block:: python

            input_text = "The meaning of life is 42"
            vector = embeddings.embed_query("hello")
            print(vector[:3])

        .. code-block:: python

            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Embed multiple texts:
        .. code-block:: python

            vectors = embeddings.embed_documents(["hello", "goodbye"])
            # Showing only the first 3 coordinates
            print(len(vectors))
            print(vectors[0][:3])

        .. code-block:: python

            2
            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Async:
        .. code-block:: python

            await embed.aembed_query(input_text)
            print(vector[:3])

            # multiple:
            # await embed.aembed_documents(input_texts)

        .. code-block:: python

            [-0.009100092574954033, 0.005071679595857859, -0.0029193938244134188]
    NT)defaultexcluder   clientasync_clientztext-embedding-ada-002strmodelOptional[int]
dimensionszOptional[str]
deploymentOPENAI_API_VERSION)r>   api_version)default_factoryaliasopenai_api_versionbase_urlOPENAI_API_BASE)rJ   rI   openai_api_baseOPENAI_API_TYPE)rI   openai_api_typeOPENAI_PROXYopenai_proxyi  intembedding_ctx_lengthapi_keyOPENAI_API_KEYzOptional[SecretStr]openai_api_keyorganizationOPENAI_ORG_IDOPENAI_ORGANIZATIONopenai_organizationz%Union[Literal['all'], Set[str], None]allowed_specialz4Union[Literal['all'], Set[str], Sequence[str], None]disallowed_speciali  
chunk_sizer%   max_retriestimeout)r>   rJ   z0Optional[Union[float, Tuple[float, float], Any]]request_timeoutheadersbooltiktoken_enabledtiktoken_model_nameFshow_progress_barDict[str, Any]model_kwargsr0   zUnion[Mapping[str, str], None]default_headersz!Union[Mapping[str, object], None]default_query   retry_min_seconds   retry_max_secondszUnion[Any, None]http_clienthttp_async_clientcheck_embedding_ctx_lengthforbidr   )extrapopulate_by_nameprotected_namespacesbefore)modec           
     `   t        |       }|j                  di       }t        |      D ]M  }||v rt        d| d      ||vst	        j
                  d| d| d| d       |j                  |      ||<   O |j                  |j                               }|rt        d| d	      ||d<   |S )
z>Build extra kwargs from additional params that were passed in.rh   zFound z supplied twice.z	WARNING! z/ is not default parameter.
                    zJ was transferred to model_kwargs.
                    Please confirm that z is what you intended.zParameters za should be specified explicitly. Instead they were passed in as part of `model_kwargs` parameter.)	r   getlist
ValueErrorwarningswarnpopintersectionkeys)clsvaluesall_required_field_namesrs   
field_nameinvalid_model_kwargss         r"   build_extrazOpenAIEmbeddings.build_extra  s     $<C#@ 

>2.v, 		;JU" 6*5E!FGG!99!* .L !))34JN
 %+JJz$:j!		;  8DDUZZ\R23 4S T 
 "'~    afterc                4   | j                   dv rt        d      | j                  r| j                  j                         nd| j                  | j
                  | j                  | j                  | j                  | j                  d}| j                  rP| j                  s| j                  r8| j                  }| j                  }| j                  }t        d|d|d|      | j                  sr| j                  r2| j                  s&	 ddl}|j!                  | j                  
      | _        d| j                  i}t#        j$                  di ||j&                  | _        | j(                  sr| j                  r2| j                  s&	 ddl}|j+                  | j                  
      | _        d| j                  i}t#        j,                  di ||j&                  | _        | S # t        $ r}t        d	      |d}~ww xY w# t        $ r}t        d	      |d}~ww xY w)z?Validate that api key and python package exists in environment.)azureazure_adazureadzEIf you are using Azure, please use the `AzureOpenAIEmbeddings` class.N)rU   rX   rL   r`   r_   ri   rj   zwCannot specify 'openai_proxy' if one of 'http_client'/'http_async_client' is already specified. Received:
openai_proxy=z
http_client=z
http_async_client=r   zRCould not import httpx python package. Please install it with `pip install httpx`.)proxyro   r   )rP   r{   rW   get_secret_valuer[   rN   ra   r_   ri   rj   rR   ro   rp   r@   httpxImportErrorClientopenaiOpenAIr5   rA   AsyncClientAsyncOpenAI)	selfclient_paramsrR   ro   rp   r   esync_specificasync_specifics	            r"   validate_environmentz%OpenAIEmbeddings.validate_environment*  s-    #CC@  ;?:M:M##446SW 44,,++++#33!//

 $"2"2d6L6L,,L**K $ 6 6!/K>1F4E3GI 
 {{  )9)9  $)<<d6G6G<#H *D,<,<=M --I-I=ITTDK    )?)?  */):):ARAR):)S&+T-C-CDN & 2 2 !! ! j  / # %F  # %F s0   G  G=  	G:)G55G:=	HHHc                p    d| j                   i| j                  }| j                  | j                  |d<   |S )NrC   rE   )rC   rh   rE   )r   paramss     r"   _invocation_paramsz#OpenAIEmbeddings._invocation_paramsd  s8    At/@/@A??&#'??F< r   c                2   g }g }| j                   xs | j                  }| j                  s	 ddlm} |j                  |      }t        |      D ]  \  }}	|j                  |	d      }
t        dt        |
      | j                        D ]G  }|
||| j                  z    }|j                  |      }|j                  |       |j                  |       I  n	 t        j                   |      }| j&                  | j(                  dj+                         D ci c]
  \  }}||| }}}t        |      D ]  \  }}	| j                  j-                  d	      r|	j/                  d
d      }	|r |j                  |	fi |}n|j1                  |	      }t        dt        |      | j                        D ]4  }|j                  |||| j                  z           |j                  |       6  | j2                  r$	 ddlm}  |t        dt        |      |            }nt        dt        |      |      }|||fS # t
        $ r t        d      w xY w# t"        $ r t        j$                  d      }Y ~w xY wc c}}w # t
        $ r t        dt        |      |      }Y iw xY w)a  
        Take the input `texts` and `chunk_size` and return 3 iterables as a tuple:

        We have `batches`, where batches are sets of individual texts
        we want responses from the openai api. The length of a single batch is
        `chunk_size` texts.

        Each individual text is also split into multiple texts based on the
        `embedding_ctx_length` parameter (based on number of tokens).

        This function returns a 3-tuple of the following:

        _iter: An iterable of the starting index in `tokens` for each *batch*
        tokens: A list of tokenized texts, where each text has already been split
            into sub-texts based on the `embedding_ctx_length` parameter. In the
            case of tiktoken, this is a list of token arrays. In the case of
            HuggingFace transformers, this is a list of strings.
        indices: An iterable of the same length as `tokens` that maps each token-array
            to the index of the original text in `texts`.
        r   )AutoTokenizerzCould not import transformers python package. This is needed for OpenAIEmbeddings to work without `tiktoken`. Please install it with `pip install transformers`. )pretrained_model_name_or_pathF)add_special_tokenscl100k_base)r\   r]   001
 )tqdm)re   rC   rd   transformersr   r   r{   from_pretrained	enumerateencoder'   r(   rT   decoder)   tiktokenencoding_for_modelKeyErrorget_encodingr\   r]   itemsendswithreplaceencode_ordinaryrf   	tqdm.autor   )r   textsr^   r-   r/   
model_namer   	tokenizerr4   text	tokenizedjtoken_chunk
chunk_textencodingkvencoder_kwargstokenr   _iters                        r"   	_tokenizezOpenAIEmbeddings._tokenizek  s   . /1--;
 $$6 &55.8 6 I %U+ &4'0'7'7QV'7'W	 q#i.$2K2KL &A-6A 9 99.K
 '0&6&6{&CJMM*-NN1%&&@#66zB (,';';*.*A*A %'.Aq = 1.N . %U+ &4::&&u-  <<c2D!+HOODCNCE$44T:E q#e*d.G.GH &AMM%A0I0I,I"JKNN1%&&" !!:*"&uQFZ'H"I !S[*5Efg%%   V 6  @#00?@.<  :aVj9:s5   H1 I	 I.3"I4 1I	I+*I+4JJ)r^   c          	         |xs  j                   } j                  ||      \  }}}g }|D ]k  }	  j                  j                  dd||	|	|z    i j                  }
t        |
t              s|
j                         }
|j                  d |
d   D               m t        t        |      ||| j                        }dd fd}|D cg c]  }||n |        c}S c c}w )al  
        Generate length-safe embeddings for a list of texts.

        This method handles tokenization and embedding generation, respecting the
        set embedding context length and chunk size. It supports both tiktoken
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        inputc              3  &   K   | ]	  }|d      ywr8   Nr   r   rs     r"   r#   z<OpenAIEmbeddings._get_len_safe_embeddings.<locals>.<genexpr>       %Oan%Or&   dataNc                     S j                   j                  dddij                  } t        | t              s| j                         } | d   d   d   S Nr    r   r   r8   r   )r@   creater   
isinstancedict
model_dumpaverage_embedded_cached_empty_embeddingr   s    r"   empty_embeddingzBOpenAIEmbeddings._get_len_safe_embeddings.<locals>.empty_embedding  sm    &.#54;;#5#5 $$ $ 7 7$  ""2D9'7'B'B'D$*:6*B1*Ek*R'**r   r   returnList[float])r^   r   r@   r   r   r   r   r   extendr;   r(   r0   r   r   enginer^   _chunk_sizer   r-   r/   r.   r4   responser5   r   r   r   s   `             @r"   _get_len_safe_embeddingsz)OpenAIEmbeddings._get_len_safe_embeddings  s    $ !3DOO!%{!Cvw02 	PA)t{{)) Q[1595L5LH h-#..0%%%Ohv>N%OO	P 9J 2GT__

 :>		+ DNNaQ](99NNNs   
Cc          	     @   K   |xs  j                   } j                  ||      \  }}}g }|xs  j                   }t        dt        |      |      D ]s  }	  j                  j
                  dd||	|	|z    i j                   d{   }
t        |
t              s|
j                         }
|j                  d |
d   D               u t        t        |      ||| j                        }dd fd}|D cg c]  }||n |        d{    c}S 7 7 c c}w w)	a  
        Asynchronously generate length-safe embeddings for a list of texts.

        This method handles tokenization and asynchronous embedding generation,
        respecting the set embedding context length and chunk size. It supports both
        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        r   r   Nc              3  &   K   | ]	  }|d      ywr   r   r   s     r"   r#   z=OpenAIEmbeddings._aget_len_safe_embeddings.<locals>.<genexpr>  r   r&   r   c                    K   [ j                   j                  dddij                   d {   } t        | t              s| j                         } | d   d   d   S 7 1wr   )rA   r   r   r   r   r   r   s    r"   r   zCOpenAIEmbeddings._aget_len_safe_embeddings.<locals>.empty_embedding#  s}     &.)A):):)A)A ** $ 7 7* $  ""2D9'7'B'B'D$*:6*B1*Ek*R'**$s   .A%A#2A%r   r   )r^   r   r'   r(   rA   r   r   r   r   r   r   r;   r0   r   s   `             @r"   _aget_len_safe_embeddingsz*OpenAIEmbeddings._aget_len_safe_embeddings  s1    & !3DOO!%{!Cvw02 3DOOq#f+{3 	PA5T..55 Q[1595L5L H h-#..0%%%Ohv>N%OO	P 9J 2GT__

 :>		+ JTTAQ]o.?(??TT10 )@Ts7   BDDA0D8DD
DDDDc           	        |xs | j                   }| j                  sg }t        dt        |      |      D ]k  } | j                  j
                  dd||||z    i| j                  }t        |t              s|j                         }|j                  d |d   D               m |S t        t        | j                        }| j                  ||      S )aM  Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
        r   r   c              3  &   K   | ]	  }|d      ywr   r   r   s     r"   r#   z3OpenAIEmbeddings.embed_documents.<locals>.<genexpr>F       !KQ!K.!Kr&   r   r   r   )r^   rq   r'   r(   r@   r   r   r   r   r   r   rB   rF   r   r   r   r^   chunk_size_r5   r4   r   r   s           r"   embed_documentsz OpenAIEmbeddings.embed_documents0  s     !3DOO..,.J1c%j+6 L-4;;-- AO48<8O8O "(D1'}}H!!!K(6:J!KKL  c4??+,,U6,BBr   c           	       K   |xs | j                   }| j                  sg }t        dt        |      |      D ]s  } | j                  j
                  dd||||z    i| j                   d{   }t        |t              s|j                         }|j                  d |d   D               u |S t        t        | j                        }| j                  ||       d{   S 7 x7 w)aS  Call out to OpenAI's embedding endpoint async for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
        r   r   Nc              3  &   K   | ]	  }|d      ywr   r   r   s     r"   r#   z4OpenAIEmbeddings.aembed_documents.<locals>.<genexpr>d  r   r&   r   r   r   )r^   rq   r'   r(   rA   r   r   r   r   r   r   rB   rF   r   r   s           r"   aembed_documentsz!OpenAIEmbeddings.aembed_documentsN  s      !3DOO..,.J1c%j+6 L!9!2!2!9!9 "AO4"8<8O8O"  "(D1'}}H!!!K(6:J!KKL  c4??+33E&3III Js%   A)C(+C$,A3C(C& C(&C(c                ,    | j                  |g      d   S )zCall out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        r   )r   )r   r   s     r"   embed_queryzOpenAIEmbeddings.embed_queryl  s     ##TF+A..r   c                L   K   | j                  |g       d{   }|d   S 7 	w)zCall out to OpenAI's embedding endpoint async for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        Nr   )r   )r   r   r5   s      r"   aembed_queryzOpenAIEmbeddings.aembed_queryw  s,       00$88
!} 9s   $"
$)r   rg   r   r   )r   r   )r   rg   )r   	List[str]r^   rS   r   z<Tuple[Iterable[int], List[Union[List[int], str]], List[int]])r   r   r   rB   r^   rD   r   List[List[float]]r   )r   r   r^   z
int | Noner   r   )r   rB   r   r   )6__name__
__module____qualname____doc__r   r@   __annotations__rA   rC   rE   rF   r   rK   rN   rP   rR   rT   r   rW   r[   r\   r]   r^   r_   ra   rb   rd   re   rf   r   rh   r0   ri   rj   rl   rn   ro   rp   rq   r   model_configr   classmethodr   r   propertyr   r   r   r   r   r   r   r   r   r   r"   r=   r=   [   s   Ob d3FC3dD9L#9)E3) $J$
 !&J%(- !5tD)  T%*(3Dd*S&O]  &+ !2DA&O]  #( >#L-  !%#$8*/9ISW)X+N'  P). 34d
*  O=AO:AOSLSJ8K<HMIIOE GS!d!G)--J $t#8#(#>L.>VJ!6:O3:7;M4; s7s7$(K!( +/'.S'++- BL (#  $2 '"7 #7r  ^&^&,/^&	E^&F MQ-O-O+.-O<I-O	-Od MQ0U0U+.0U<I0U	0Uf :>CC,6C	C> :>JJ,6J	J<	/
r   r=   )r,   rS   r-   zList[Union[List[int], str]]r.   r   r/   z	List[int]r0   rc   r   zList[Optional[List[float]]])&
__future__r   loggingr|   typingr   r   r   r   r   r	   r
   r   r   r   r   r   r   r   langchain_core.embeddingsr   langchain_core.utilsr   r   r   pydanticr   r   r   r   r   typing_extensionsr   	getLoggerr   loggerr;   r=   r   r   r"   <module>r     s    "        0 T T M M "			8	$::': *: 	:
 : !:zfy* fr   