
    9|h                    F   d Z ddlmZ ddlZddlZddlZddlZddlZddl	Z	ddl
m
Z
mZ ddlmZmZmZmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lm Z m!Z! ddl"m#Z#m$Z$m%Z% ddl"m&Z' ddl"m(Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl4m7Z8 ddl9m:Z:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZA ddlBmCZC ddlDmEZE ddlFmGZG ddlHmIZI ddlJmKZK ddlLmMZMmNZNmOZO ddlPmQZR ddlSm&ZT ddlSmUZUmVZV erddlWZX ej                  eZ      Z[eeg eeIe#f   f   eee\gef   e#eIf   Z]eeg eeIe#f   f   ef   Z^ G d de_      Z` G d  d!e\      Za G d" d#e\      Zb	 dM	 	 	 	 	 dNd$ZcdOd%Zd G d& d'eG      ZedPd(Zf	 	 	 	 	 	 dQd)Zg	 	 	 	 	 	 	 	 dRd*Zh	 	 	 	 	 	 	 	 dSd+Zi	 	 	 	 	 	 	 	 	 	 dTd,Zj	 	 	 	 	 	 dUd-Zk	 	 	 	 	 	 dVd.Zl	 	 	 	 	 	 dWd/Zm	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dXd0Zn	 	 	 	 	 	 	 	 	 	 dYd1Zo	 	 	 	 	 	 	 	 	 	 	 	 	 	 dZd2Zpddddd3	 	 	 	 	 	 	 	 	 	 	 	 	 d[d4Zqdddd5	 	 	 	 	 	 	 	 	 	 	 	 	 d\d6Zrdd7	 	 	 	 	 	 	 	 	 d]d8Zsdddd5	 	 	 	 	 	 	 	 	 	 	 	 	 d^d9Ztdddd5	 	 	 	 	 	 	 	 	 	 	 	 	 d_d:Zudd7	 	 	 	 	 	 	 	 	 d]d;Zv	 	 	 d`	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dad<Zw G d= d>eGd?@      Zxej                   G dA dB             ZzdbdCZ{dcdDZ|dEZ}dddFddd?ddG	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dddHZ~dddFddd?ddG	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dddIZdJZee_         ej                  dKdL      e~_         y)ez>Utilities for running language models or Chains over datasets.    )annotationsN)datetimetimezone)	TYPE_CHECKINGAnyCallableDictListOptionalTupleUnioncast)warn_deprecated)	Callbacks)BaseLanguageModel)BaseMessagemessages_from_dict)
ChatResult	LLMResult)RunnableRunnableConfigRunnableLambdaconfig)utils)EvaluatorCallbackHandlerwait_for_all_evaluators)LangChainTracer)Client)get_git_infoget_langchain_env_var_metadata)EvaluationResultRunEvaluator)run_evaluator)as_runnableis_traceable_function)DatasetDataTypeExampleRunTracerSession)LangSmithError)	HTTPError)	TypedDict)Chain)load_evaluator)EvaluatorTypePairwiseStringEvaluatorStringEvaluator)
evaluation)name_generationprogressc                      e Zd ZdZy)InputFormatErrorz(Raised when the input format is invalid.N)__name__
__module____qualname____doc__     f/var/www/html/test/engine/venv/lib/python3.12/site-packages/langchain/smith/evaluation/runner_utils.pyr8   r8   N   s    2r>   r8   c                  $    e Zd ZdZ	 	 ddZddZy)
TestResultz1A dictionary of the results of a single test run.c                   | j                         }|j                  D cg c];  }|j                  d      s&|j                  d      s|dv s|j                  d      r|= }}|j                  d      j	                  |d      S c c}w )	zReturn quantiles for the feedback scores.

        This method calculates and prints the quantiles for the feedback scores
        across all feedback keys.

        Returns:
            A DataFrame containing the quantiles for each feedback key.
        inputs.outputs.>   inputoutput	referenceall)include   )axis)to_dataframecolumns
startswithdescribedrop)selfdfcolto_drops       r?   get_aggregate_feedbackz!TestResult.get_aggregate_feedbackX   s       zz
~~i(~~j)))~~k* 
 
 {{5{)..wQ.??
s   A Bc           	        	 ddl }g }g }| d   j                         D ]e  \  }}|d   }|j                  d      }t	        |t
              r'|j                         D 	
ci c]  \  }	}
d|	 |
 }}	}
n	|i }nd|i}i |d   j                         D 	
ci c]  \  }	}
d	|	 |
 c}
}	|}d
|v rUt	        |d
   t
              r:|j                  |d
   j                         D 	
ci c]  \  }	}
d|	 |
 c}
}	       n|d
   |d
<   |j                  i |D ci c]  }d|j                   |j                   c}|j                  d      |d   |j                  d      d       |j                  |       |j                  |       h  |j                  ||      S # t        $ r}t        d      |d}~ww xY wc c}
}	w c c}
}	w c c}
}	w c c}w )z#Convert the results to a dataframe.r   NzfPandas is required to convert the results to a dataframe. to install pandas, run `pip install pandas`.resultsfeedbackrF   rD   rE   rC   rG   z
reference.z	feedback.Errorexecution_timerun_id)errorrZ   r[   )index)pandasImportErroritemsget
isinstancedictupdatekeyscoreappend	DataFrame)rQ   pdeindicesrecords
example_idresultrX   output_kvrF   rfs                 r?   rL   zTestResult.to_dataframeo   s   	 "&y/"7"7"9 	'Jj)Hjj*G'4(8?H1HQC.!+HH"G,06w0E0E0GH1WQC=!#HA f$f[148HH9?9L9R9R9TUA:aS)1,U &,K%8AkNHH=EF155'*AGG3F#ZZ0&,-=&>$jj2	 NN1NN:&=	'@ r||G733Q  	@ 	 I I V Gs/   F F7F=
 G!G		F4#F//F4N)returnpd.DataFrame)r9   r:   r;   r<   rU   rL   r=   r>   r?   rA   rA   U   s    ;@	@.,4r>   rA   c                  ,     e Zd ZdZd fdZddZ xZS )	EvalErrorz"Your architecture raised an error.c                (    t        |   dd|i| y )NrY   r=   )super__init__)rQ   rY   kwargs	__class__s      r?   rz   zEvalError.__init__   s    /u//r>   c                F    	 | |   S # t         $ r t        d| d      w xY w)Nz%'EvalError' object has no attribute '')KeyErrorAttributeError)rQ   names     r?   __getattr__zEvalError.__getattr__   s8    	R: 	R #Ha!PQQ	Rs     )rY   BaseExceptionr{   r   rt   None)r   strrt   r   )r9   r:   r;   r<   rz   r   __classcell__)r|   s   @r?   rw   rw      s    ,0Rr>   rw   c           	     P  	
 t        | t              r^| j                  j                  }| j                  5j                  j                  j                  }t        d| d| d| d      fdS t        | t              r| S t        | t              r| 		fdS t        |       rt        |       rt        t        t        |             

fdS 	  |        }t        t        |       t        |t              r|S t        t        t        |            rt        t        t        |            

fdS t        |t              sfdS S | S # t        $ rQ t        t        |       }t        j                  |      }t         j#                  d| d	       t%        |      fd
cY S w xY w)zForgive the user if they pass in a chain without memory instead of a chain
    factory. It's a common mistake. Raise a more helpful error message as well.a$  Cannot directly evaluate a chain with stateful memory. To evaluate this chain, pass in a chain constructor that initializes fresh memory each time it is called.  This will safegaurd against information leakage between dataset examples.
For example:

def chain_constructor():
    new_memory = z(...)
    return z*(memory=new_memory, ...)

run_on_dataset("z", chain_constructor, ...)c                      S Nr=   )chains   r?   <lambda>z(_wrap_in_chain_factory.<locals>.<lambda>   s    u r>   c                      S r   r=   )lcfs   r?   r   z(_wrap_in_chain_factory.<locals>.<lambda>   s    s r>   c                      S r   r=   	runnable_s   r?   r   z(_wrap_in_chain_factory.<locals>.<lambda>       9 r>   zWrapping function z as RunnableLambda.c                      S r   r=   )wrappeds   r?   r   z(_wrap_in_chain_factory.<locals>.<lambda>   s    7 r>   c                      S r   r=   r   s   r?   r   z(_wrap_in_chain_factory.<locals>.<lambda>   r   r>   c                     t               S r   )r   )constructors   r?   r   z(_wrap_in_chain_factory.<locals>.<lambda>   s    >+6 r>   )rb   r/   r|   r9   memory
ValueErrorr   r   callabler&   r%   r   r   	TypeErrorinspect	signatureloggerinfor   )llm_or_chain_factorydataset_namechain_classmemory_class_model	user_funcsigr   r   r   r   r   s          @@@@@r?   _wrap_in_chain_factoryr      s    &.$oo..&&2 <<11::L$ %1> 2)]##/.0J
L  	(*;	<##	((	3"	&	' !56#D3G$HII$$	#)+F 8%9:f/0 M"4&#9:#D6$:;I$$FH-66 +  	#X';<I##I.CKK,SE1DEF$Y/G""	#s   E AF%$F%c                   | st        d      g }d| v r>t        | d   t              s$t        dt        | d         j                         | d   g}nd| v rRt        | d   t
              rt        d | d   D              s$t        dt        | d         j                         | d   }nt        |       dk(  rit        t        | j                                     }t        |t              r|g}nAt        |t
              rt        d |D              r|}nt        d	|        t        d
|        t        |      dk(  r|d   S t        dt        |       d      )zGet prompt from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A string prompt.
    Raises:
        InputFormatError: If the input format is invalid.
    Inputs should not be empty.promptz"Expected string for 'prompt', got promptsc              3  <   K   | ]  }t        |t                y wr   rb   r   .0is     r?   	<genexpr>z_get_prompt.<locals>.<genexpr>   s      >
#$Jq#>
   z,Expected list of strings for 'prompts', got rJ   c              3  <   K   | ]  }t        |t                y wr   r   r   s     r?   r   z_get_prompt.<locals>.<genexpr>
  s     .Saz!S/A.Sr   z)LLM Run expects string prompt input. Got z5LLM Run expects 'prompt' or 'prompts' in inputs. Got r   z)LLM Run expects single prompt input. Got z	 prompts.)r8   rb   r   typer9   listrH   lennextitervalues)inputsr   prompt_s      r?   _get_promptr      s~    <==G6&*C0"4T&:J5K5T5T4UV  (#$	f	&+T2# >
(.y(9>
 ;
 #VI./889;  #	V	tFMMO,-gs#iG&3.S7.S+SG"%Nvh#WXXCF8L
 	
 7|qqz7G~YO
 	
r>   c                      e Zd ZU dZded<   y)ChatModelInputzVInput for a chat model.

    Parameters:
        messages: List of chat messages.
    zList[BaseMessage]messagesNr9   r:   r;   r<   __annotations__r=   r>   r?   r   r     s      r>   r   c                   | st        d      | j                         }d| v r|j                  d      |d<   n3t        |       dk(  r%t	        t        | j                                     |d<   d|v rV|d   }t        |t              rt        d |D              r|g}t        |      dk(  rt        |d         |d<   |S t        d      t        d|        )	zGet Chat Messages from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A list of chat messages.
    Raises:
        InputFormatError: If the input format is invalid.
    r   r   rE   rJ   c              3  <   K   | ]  }t        |t                y wr   )rb   rc   r   s     r?   r   z _get_messages.<locals>.<genexpr>8  s      2
$%Jq$2
r   r   zGBatch messages not supported. Please provide a single list of messages.zMChat Run expects single List[dict] or List[List[dict]] 'messages' input. Got )r8   copypopr   r   r   r   rb   r   rH   r   )r   
input_copyraw_messagess      r?   _get_messagesr   $  s     <==JV(nnZ8
7	V	"4#89
7*!'*lD)c 2
)52
 /
 )>L|!"4\!_"EJw 	 #,  !($
 	
r>   c                   |ra || j                         }t        |t              s>t        |t              rt	        d |D              st        d| dt        |       d      y y 	 t        | j                          y # t
        $ r? 	 t        | j                          Y y # t
        $ r t        d| j                    d      w xY ww xY w)Nc              3  <   K   | ]  }t        |t                y wr   rb   r   r   msgs     r?   r   z>_validate_example_inputs_for_language_model.<locals>.<genexpr>T  s     ISJsK0Ir   zWhen using an input_mapper to prepare dataset example inputs for an LLM or chat model, the output must a single string or a list of chat messages.
Got: 	 of type .zvExample inputs do not match language model input format. Expected a dictionary with messages or a single prompt. Got: z Please update your dataset OR provide an input_mapper to convert the example.inputs to a compatible format for the llm or chat model you wish to evaluate.)	r   rb   r   r   rH   r8   r   r   r   )first_exampleinput_mapperprompt_inputs      r?   +_validate_example_inputs_for_language_modelr   L  s     #M$8$89,,|T*ILII" 'yl1C0DAG  J -	,,- 	
m223# &*112GG 	s   %A; ;	CB#B??Cc                &   |r || j                         }t        |j                        j                  |      }t	        |t
              st        d| dt        |       d      |r)t        d|j                   d|j                                y| j                   }t        |j                        j                  |      }t        |      dk(  rt        |j                        dk(  ry|r)t        d|j                   d|j                                y)	z<Validate that the example inputs match the chain input keys.zvWhen using an input_mapper to prepare dataset example inputs for a chain, the mapped value must be a dictionary.
Got: r   r   zAMissing keys after loading example using input_mapper.
Expected: z. Got: rJ   zExample inputs missing expected chain input keys. Please provide an input_mapper to convert the example.inputs to a compatible format for the chain you wish to evaluate.Expected: N)
r   set
input_keys
differencerb   rc   r8   r   keysr   )r   r   r   first_inputsmissing_keyss        r?   "_validate_example_inputs_for_chainr   m  s2    #M$8$895++,77E,-"&yl1C0DAG 
 "$//08I8I8K7LN   %++5++,77E|!c%*:*:&;q&@ " #--. /$))+,	.  r>   c                    t        |t              rt        | |       y |       }t        |t              rt	        | ||       yt        |t
              rt        j                  d|        yy)z9Validate that the example inputs are valid for the model.zSkipping input validation for N)rb   r   r   r/   r   r   r   debug)exampler   r   r   s       r?   _validate_example_inputsr     s[     &(9:3G\J$&eU#.w|Lx(LL9%AB )r>   c           	     :   |rt        | t              rd\  }}d}nEd} |        }t        |t              r|j                  nd}t        |t              r|j                  nd}t        ||||d   j                  rt        |d   j                        nd||      }|S d}|S )z<Configure the evaluators to run on the results of the chain.)NNllmr   Nr   )rb   r   r/   r   output_keys_load_run_evaluatorsoutputsr   )	r   examplesr4   	data_type
run_inputsrun_outputsrun_typer   run_evaluatorss	            r?   _setup_evaluationr     s     *,=>&0#JHH(*E-7u-E))4J/9%/G%++TK-)1!)<)<D!$$%$
  r>   c                   d }| j                   r0| j                   }|r ||vrt        j                  d| d| d       |S |rt        |      dk(  r|d   }|S |'t        |      dkD  rt        j                  d| d       |S )Nz
Input key z% not in chain's specified input keys '. Evaluation behavior may be undefined.rJ   r   z#Chain expects multiple input keys: z, Evaluator is likely to fail. Evaluation behavior may be undefined. Specify an input_key in the RunEvalConfig to avoid this warning.)	input_keyr   warningr   )r   r   r   s      r?   _determine_input_keyr     s     I$$	):5NNYK ()l*QS  
J1,qM	  
	C
Oa$71* >P P	
 r>   c                   d }| j                   r0| j                   }|r ||vrt        j                  d| d| d       |S |rt        |      dk(  r|d   }|S |'t        |      dkD  rt        j                  d| d       |S )NzPrediction key z& not in chain's specified output keys r   rJ   r   z$Chain expects multiple output keys: zl, Evaluation behavior may be undefined. Specify a prediction_key in the RunEvalConfig to avoid this warning.)prediction_keyr   r   r   )r   r   r   s      r?   _determine_prediction_keyr     s     N..><NN!.!1 2  +},SU  
[)Q.$Q  
	 S%5%92;- @; ;	

 r>   c                    | j                   r%| j                   }|r||vrt        d| d|       |S |rt        |      dk(  rt        |      d   }|S d }|S )NzReference key z! not in Dataset example outputs: rJ   r   )reference_keyr   r   r   )r   example_outputsr   s      r?   _determine_reference_keyr     s     ,,}OC  0%%4$57  	 
S1Q6_-a0  r>   c           	     n   t        | t              r| S t        | t        t        f      r5t        | t              st        |       } t	        | |      }| j
                  }	nt        | t        j                        rd|i| j                         }
t	        | j                  fi |
}| j                  j
                  }	t        | t        j                        r^| j                  xs |}| j                  xs |}| j                  xs |}n-t        |       rt        |       S t!        dt#        |              t        |t$              rI|j&                  r|t!        d|	 d| d      t(        j*                  j-                  |||||||	g      }|S t        |t.              rt1        d|	 d	      t1        d|	 d
      )N)r   r   zUnknown evaluator type: zPMust specify reference_key in smith_eval.RunEvalConfig to use evaluator of type z) with dataset with multiple output keys: r   )r   r   r   tagszRun evaluator for z is not implemented. PairwiseStringEvaluators compare the outputs of two different models rather than the output of a single model. Did you mean to use a StringEvaluator instead?
See: https://python.langchain.com/docs/guides/evaluation/string/z is not implemented)rb   r#   r1   r   r0   valuesmith_eval_config
EvalConfig
get_kwargsevaluator_typeSingleKeyEvalConfigr   r   r   r   run_evaluator_decr   r   r3   requires_reference
smith_evalStringRunEvaluatorChainfrom_run_and_data_typer2   NotImplementedError)eval_configeval_llmr   r   r   r   r   r   
evaluator_eval_type_tagr{   r$   s               r?   _construct_run_evaluatorr	    s    +|,+s34+}5'4K#KX>
#))	K!2!=!=	>>[%;%;%=>#K$>$>I&I
#2288k#4#H#HI#--:I(77I>N'55FM	+	 --3D4E3FGHH*o.((]-B&&3_ 577F6GqJ 
 #::QQ)' R 
,  
J 7	8!  0Q Q
 	
 " /BC
 	
r>   c                T    t        | |      }t        | |      }t        | |      }|||fS r   )r   r   r   )r   r   r   r   r   r   r   s          r?   	_get_keysr  H  s5     %VZ8I.v{CN,V_EMnm33r>   c                   g }d\  }}}	| j                   s=| j                  rCt        | j                  D 
cg c]  }
t        |
t               c}
      rt        | |||      \  }}}	| j                   D ]/  }t        || j                  ||||	||      }|j                  |       1 | j                  xs g }|D ]  }t        |t              r|j                  |       %t        |t              r5|j                  t        j                  j                  ||||||	             jt        |      r|j                  t        |             t        d| d       |S c c}
w )z
    Load run evaluators from a configuration.

    Args:
        config: Configuration for the run evaluators.

    Returns:
        A list of run evaluators.
    NNN)r   r   r   zUnsupported custom evaluator: z+. Expected RunEvaluator or StringEvaluator.)
evaluatorscustom_evaluatorsanyrb   r3   r  r	  r  rg   r#   r  r  r  r   r   r   )r   r   r   r   r   r   r   r   r   r   rj   r  r$   r  custom_evaluators                  r?   r   r   T  s   " N/?,I~}  9Q9QRAA/RS3<J_4
0	>= (( -0OO	
 	m,- 006B- &5!!"23(/:!!22II$'#1"/ J 	 &'!!"34D"EF01A0B C= > #, Q Ss   Er   	callbacksr   metadatac                 K   |x ||      }t        |t              s"t        |t              rAt        d |D              r/| j	                  |t        ||xs g |xs i              d{   S t        d| d      	 t        |      }| j	                  |t        ||xs g |xs i              d{   }|S 7 P7 # t        $ rB t        |      }	 | j                  di |	dt        ||xs g |xs i       i d{  7  }Y |S w xY ww)	a  Asynchronously run the language model.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map inputs to the expected format.

    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    Nc              3  <   K   | ]  }t        |t                y wr   r   r   s     r?   r   z_arun_llm.<locals>.<genexpr>       OSJsK0Or   r  r   r  r   z%Input mapper returned invalid format 3
Expected a single string or list of chat messages.r   r=   )	rb   r   r   rH   ainvoker   r8   r   r   )
r   r   r   r  r   r  prompt_or_messagesr   
llm_output
llm_inputss
             r?   	_arun_llmr    sK    0 )&1)3/,d3O<NOO"%'djb8>r %    #&'FG 	 (F8;%'djb8>r 9D 9 3J = 3   	&v.J*s{{   %'djb8>r   J 	sU   A'D)B:*D>5B> 3B<4B> 8D<B> >AD	?D D	DD		Dr   r   r  c          	       K   ||n ||      }t        | t              r}t        |t              rmt        |      dk(  r_| j                  rSt        t        |j                                     }| j                  |t        ||xs g |xs i              d{   }|S t        |xs g ||xs i       }	| j                  ||	       d{   }|S 7 97 w)z%Run a chain asynchronously on inputs.NrJ   r  r   r   r  r  )
rb   r/   rc   r   r   r   r   r   r  r   
r   r   r  r   r   r  inputs_valrF   runnable_configs
             r?   _arun_chainr&    s      %,f,v2FG5% w%LA4()*}}!#$*"x~2 % 
 
 M	 )y8>r
 }}W_}EEM
 Fs$   BCC2CC	CC)r   c                 K   t        |t              rdnd}d}	 t        |t              r9t        || j                  |d   |d   ||j	                  d             d{   }n? |       }t        || j                  |d   |d   ||j	                  d             d{   }|}|S 7 H7 
# t        $ rT}t        j                  | d| j                   d	| j                   d
t        |              t        |      }Y d}~|S d}~ww xY ww)a  Asynchronously run the Chain or language model.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map the input to the expected format.

    Returns:
        A list of outputs.
    LLMr/   Nr   r  r  r   failed for example  with inputs 
rY   )rb   r   r  r   ra   r&  	Exceptionr   r   idreprrw   )	r   r   r   r   chain_or_llmrn   rF   r   rj   s	            r?   _arun_llm_or_chainr1    s#    ( 02CD'  F$*,=> )$F^ -)J/! F )*E&F^ -)J/ F  M5  $n0 =">>*a		

 #M$sT   D
AB* B&?B* B(B* $D
&B* (B* *	D3A	D<D
DD
c                  |r ||      }t        |t              s"t        |t              r;t        d |D              r)| j	                  |t        ||xs g |xs i             }|S t        d| d      	 t        |      }| j	                  |t        ||xs g |xs i             }|S # t        $ r4 t        |      }	 | j                  di |	dt        ||xs i       i}Y |S w xY w)	a  
    Run the language model on the example.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        callbacks: The callbacks to use during the run.
        tags: Optional tags to add to the run.
        input_mapper: function to map to the inputs dictionary from an Example
    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    c              3  <   K   | ]  }t        |t                y wr   r   r   s     r?   r   z_run_llm.<locals>.<genexpr>N  r  r   r  r   z'Input mapper returned invalid format:  r  r   )r  r  r=   )	rb   r   r   rH   invoker   r8   r   r   )
r   r   r  r   r   r  r  r  llm_promptsr  s
             r?   _run_llmr6  0  s%   2 )&1)3/,d3O<NOO25**"%'djb8>r 3= 3J6 ) #&'FG 	%f-K%'djb8>r $ J    	&v.J# %	HNPRSJ 	s   62B* *9C'&C'c          	     z   ||n ||      }t        | t              rut        |t              ret        |      dk(  rW| j                  rKt        t        |j                                     }| j                  |t        ||xs g |xs i             }|S t        |xs g ||xs i       }	| j                  ||	      }|S )zRun a chain on inputs.rJ   r  r   r!  )
rb   r/   rc   r   r   r   r   r   r4  r   r"  s
             r?   
_run_chainr8  n  s     %,f,v2FG5% w%LA4()*!#$*"x~2  
 M	 )y8>r
 go>Mr>   c                  t        |t              rdnd}d}	 t        |t              r1t        || j                  |d   |d   ||j	                  d            }n7 |       }t        || j                  |d   |d   ||j	                  d            }|}|S # t        $ rc}t        |      j                  }	t        j                  | d| j                   d	| j                   d
|	 d| 	       t        |      }Y d}~|S d}~ww xY w)a  
    Run the Chain or language model synchronously.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.

    Returns:
        Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
          The outputs of the model or chain.
    r(  r/   Nr  r   r  r  r)  r*  z
Error Type: z, Message: r,  )rb   r   r6  r   ra   r8  r-  r   r9   r   r   r.  rw   )
r   r   r   r   r0  rn   rF   r   rj   
error_types
             r?   _run_llm_or_chainr;    s   * 02CD'  F$*,=>"${#F^)J/F )*E{#F^)J/F  M  $!W%%
n0 =">>*ZLA38	

 #M$s   A:B 	D AC;;D c           
        t        ||      }| j                  |      }t        | j                  |j                  |            }	|	st        d| d      |	D 
cg c]  }
|
j                  s|
j                   }}
|rt        |      nd }|r|j                         nd }	 |xs i }t               }|ri |d|i}||d<   | j                  ||j                  |rd|ini |      }|j                   d|j                   z   }t#        d| d| d| d|j                    d       ||||	fS c c}
w # t        t
        t        f$ rE}d	t        |      vr|t        j                         }d
| d| d| d}t        d| d|       d }~ww xY w)N)r   )
dataset_idas_ofzDataset z has no example rows.gitdataset_versionr   )reference_dataset_idproject_extrar  zalready exists z+
run_on_dataset(
    ...
    project_name="z - z", # Update since z already exists
)
zTest project z/ already exists. Please use a different name:

z/compare?selectedSessions=z)View the evaluation results for project 'z' at:
z

View all tests for Dataset z at:
T)flush)r   read_datasetr   list_examplesr.  r   modified_atmax	isoformatr    create_projectr-   r,   r   uuiduuid4urlprint)clientr   r   project_nameproject_metadatar   r@  wrapped_modeldatasetr   exrF  max_modified_atinferred_versiongit_infoprojectrj   uidexample_msgcomparison_urls                       r?   _prepare_eval_runr[    s    ++?NM!!|!<GF((GJJo(VWH8L>1FGHH,4Gb2>>GKG +6c+&4O6E0024
+1r> " x 
 /?*+''!(,064.b%	 ( 
( [[%?

|#LLN	
3L> B  !&&2^6'++	H 	 '7H44W H, z>2 
CF*Gjjl  .C5(:<. I L> *-!
 	

s&   D),D) AD. .FA FFc                  0    e Zd ZU dZded<   ded<   ded<   y)	
_RowResultz5A dictionary of the results for a single example row.z Optional[List[EvaluationResult]]rX   zOptional[float]rZ   Optional[str]r[   Nr   r=   r>   r?   r]  r]    s    ?..##r>   r]  F)totalc                      e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   dZded<   	 	 	 	 	 	 ddZddZddZ	 	 	 	 ddZ	dddZ
e	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Zy)_DatasetRunContainerz3A container to help manage the state of a eval run.r   rN  r+   rW  MCFrQ  List[Example]r   zList[RunnableConfig]configsNz6Optional[List[smith_eval_config.BATCH_EVALUATOR_LIKE]]batch_evaluatorsc           	     `   i }t        | j                  |      D ]  \  }}t        t        |j	                  t        |j                        i             }|j                  |j	                  dg       |j	                  d      |j	                  d      d|t        |j                        <   t        |t              r&|j                  |t        |j                           d<   n||t        |j                           d<   |j                  s|j                  |t        |j                           d<    |S )NrX   rZ   r[   )rE   rX   rZ   r[   rY   rF   rG   )zipr   r   r]  ra   r   r.  r   rb   rw   rY   r   )rQ   batch_resultsall_eval_resultsrW   r   rF   
row_results          r?   _merge_test_outputsz(_DatasetRunContainer._merge_test_outputs  s    
 "4==-@ 	HOGVj*:*>*>s7::PR*STJ &NN:r:",..1A"B$..2	(GC

O$ &),4:LLGJJ(15;GJJ(28?GJJ(5	H r>   c           	        | j                   }|sg S | j                  D cg c]  }|t        |j                            }}g }t        j
                  j                         5 }|D ]  }	  ||| j                        }t        |t              r|j                         }|j                  t        t        |              |j                  | j                  j                  fi |d | j                  j                  d  	 d d d        |S c c}w # t         $ r.}	t"        j%                  dt'        |       d|	        Y d }	~	d }	~	ww xY w# 1 sw Y   |S xY w)N)r[   
project_idzError running batch evaluator z: )re  r   r   r.  
concurrentfuturesThreadPoolExecutorrb   r"   rc   rg   r   submitrN  create_feedbackrW  r-  r   r\   r/  )
rQ   runsr  r   	runs_listaggregate_feedbackexecutor	evaluatorrn   rj   s
             r?   _run_batch_evaluatorsz*_DatasetRunContainer._run_batch_evaluators,  s8   **
I:>--HwT#gjj/*H	H224 	' 	&y$--@F!&*:;!'&--d4.@A#HOO33   $#'<<??		" "!' I ! LL8i8IA3O 	" "!s<   D E'BD9E	E$D=8E=EEEc                   i }i }| j                   D ]>  }t        t        |d         D ]$  }t        |t              rT|j
                  }|j                         D ]4  \  \  }}}|j                  t        |      i       j                  d|i       6 ht        |t              sy|j                  }	|	r3|	j                  r'|	j                  |	j                  z
  j                         nd }
|	rt        |	j                        nd }|j                  t        |j                         i       j                  |
||	d       |	|t        |j                         <   ' A t        t"        t        t$        f   |      |fS )Nr  rX   )rZ   r[   run)rd  r   r   rb   r   logged_eval_resultsr`   
setdefaultr   rd   r   
latest_runend_time
start_timetotal_secondsr.  rm   r	   r]  )rQ   ri  all_runsccallbackeval_results_rm   rq   rz  rZ   r[   s               r?   _collect_metricsz%_DatasetRunContainer._collect_metricsE  sX   !# 	=A q~6 =h(@A#+#?#?L.:.@.@.B *J(33C
ORHOO'O  /:"--C 3<< 6EEG! #
 -0S[TF$//H4G4G0H"MTT.<&,#& :=HS!4!456-=	=0 Dj)+;<hFFr>   c                6   t         j                  d       t                | j                         \  }}d }| j                  r&t         j                  d       | j                  |      }| j                  ||      }t        | j                  j                  ||      S )Nz#Waiting for evaluators to complete.zRunning session evaluators.)rO  rW   aggregate_metrics)
r   r   r   r  re  rx  rk  rA   rW  r   )rQ   rh  ri  r  ru  rW   s         r?   _collect_test_resultsz*_DatasetRunContainer._collect_test_resultsb  s     	9:!%)%:%:%<"(!  KK56!%!;!;H!E**=:JK**0
 	
r>   c                   | j                  |      }|r	 |j                         }t        |       	 | j                  j                  | j                  j                  t        j                  t        j                               |S # t        $ r+}t        j                  dt        |              Y d }~d }~ww xY w# t        $ r,}t        j                  dt        |              Y d }~|S d }~ww xY w)Nz$Failed to print aggregate feedback: )r~  zFailed to close project: )r  rU   _display_aggregate_resultsr-  r   r   r/  rN  update_projectrW  r.  r   nowr   utc)rQ   rh  verboserW   agg_feedbackrj   s         r?   finishz_DatasetRunContainer.finisht  s    ,,];O&==?*<8	@KK&&(,,x||*D ' 
   OCDG9MNNO  	@LL4T!WI>??	@s/   B AB< 	B9!B44B9<	C1!C,,C1c                ,   |xs t        j                         }|
r|	si }	|	j                  d|
i       t        |||||	||      \  }}}}|xs g }|j                  j                  d      xs i j                         D ]  \  }}|j                  d| d|         d|j                  d   i}|
r|
|d<   t        |      }t        ||||j                  xs t        j                        }t        |d   ||       t        j                  t!        |            }|D cg c]O  }t#        t%        |j&                  ||j(                        t+        |xs g ||j(                  d	      |g|||
      Q }} | ||||||r|j,                        S d       S c c}w )Nrevision_id)rP  r   r@  r?  zgit:=r@  r   )rO  rN  rm   )r  rN  rm   max_concurrency)r  r   r  r  )rN  rW  rQ  r   rd  re  )r5   random_namerd   r[  r  ra   r`   rg   r   r   r   r(   kvr   r6   ProgressBarCallbackr   r   r   r   r.  r   re  )clsrN  r   r   rO  r4   r   r   concurrency_levelrP  r  r@  rQ  rW  rR  r   rp   rq   run_metadatar   progress_barr   rd  s                          r?   preparez_DatasetRunContainer.prepare  s    $D'B'B'D##% ##]K$@A4E -+5
1w zr%%))%06B==? 	(DAqKK$qc1#'	()7+;+;<M+NO*5L'./CD*8Z1B1B1Qhkk
 	!!m\J33CMB* $)
( ' #%,\\%#*::
 -#1#7R%#*::()	 !  1%#
 
, '<FZ88
 	
 MQ
 	
-
s   AF)rh  r   ri  zDict[str, _RowResult]rt   rc   )rs  zDict[str, Run]rt   z
List[dict])rt   z,Tuple[Dict[str, _RowResult], Dict[str, Run]])rh  z-List[Union[dict, str, LLMResult, ChatResult]]rt   rA   )F)rh  r   r  boolrt   rA   )NNN   NNN)rN  r   r   r   r   MODEL_OR_CHAIN_FACTORYrO  r^  r4   "Optional[smith_eval.RunEvalConfig]r   Optional[List[str]]r   Optional[Callable[[Dict], Any]]r  intrP  Optional[Dict[str, Any]]r  r^  r@  Optional[Union[datetime, str]]rt   ra  )r9   r:   r;   r<   r   re  rk  rx  r  r  r  classmethodr  r=   r>   r?   ra  ra    s%   =N!!OSLS 0 
	,"2G:
D
 

$"  :>$(8<!"59%):>E
E
 E
 5	E

 $E
 7E
 "E
 6E
 E
 3E
 #E
 8E
 
E
 E
r>   ra  c                 ~    	 ddl m}   |        } |        d uxr dt        t        |            v S # t        $ r Y yw xY w)Nr   )get_ipythonzmqshellF)IPythonr  r   r   r_   )r  ress     r?   _is_jupyter_environmentr    sB    'm}D(IZ3tCy>-II s   -0 	<<c                    t               rddlm}m}  | |d              ||        y | j	                  d d      }t        d       t        |       y )Nr   )HTMLdisplayz<h3>Experiment Results:</h3>c                
    | dS )Nz.2fr=   )xs    r?   r   z,_display_aggregate_results.<locals>.<lambda>  s
    aW r>   right)float_formatjustifyz
 Experiment Results:)r  IPython.displayr  r  	to_stringrM  )aggregate_resultsr  r  formatted_strings       r?   r  r    sQ     1345!",66-w 7 
 	&'r>   a  The input_mapper argument is deprecated and will be removed in a future release. Please add a  RunnableLambda to your chain to map inputs to the expected format instead. Example:
def construct_chain():
    my_chain = ...
    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}
    return input_mapper | my_chain
run_on_dataset(..., llm_or_chain_factory=construct_chain)
(See https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableLambda.html)r  )r4   r@  r  rO  rP  r  r  c                 K   |
j                  dd       }|rt        dt        d       |	t               j	                  d      }	|
j                  dd       }|rt        ddd       |
r t        dd	|
j                          d
d       | xs
 t               } t        j                  | |||||||||	|      }t        j                  |j                  d   j	                  d      gt        t        j                  t        |j                   |      |j"                  |j                          d {   }|j%                  ||      S 7 w)Nr   0.0.305Tmessagependingr  r   0.1.9qThe tags argument is deprecated and will be removed in a future release. Please specify project_metadata instead.PThe following arguments are deprecated and will be removed in a future release: r   r  removalrP  r  r@  r   r  r   r   r  )r   r   _INPUT_MAPPER_DEP_WARNINGr!   ra   r   r   ra  r  runnable_utilsgather_with_concurrencyrd  map	functoolspartialr1  rQ  r   r  )rN  r   r   r4   r@  r  rO  rP  r  r  r{   r   r   	containerrh  s                  r?   arun_on_datasetr    sd     ::nd3L	+DdS46::=I::fd#DU		
 4{{}oQ  	
 vxF$,,)' - I )@@!  !23	"%.%<%<)
 

 M M7;;s   D,E.E/Ec               N   |
j                  dd       }|rt        dt        d       |
j                  dd       }|rt        ddd       |	t               j	                  d      }	|
r t        dd	|
j                          d
d       | xs
 t               } t        j                  | |||||||||	|      }|dk(  rJt        |j                  |j                        D cg c]  \  }}t        |||j                  |        }}}nt        j                  |j                  d         5 }t!        |j#                  t%        j&                  t        |j                  |      |j                  |j                              }d d d        |j)                  |      S c c}}w # 1 sw Y   "xY w)Nr   r  Tr  r   r  r  r  r  r   r  r  r   r  r  )r   r   r  r!   ra   r   r   ra  r  rg  r   rd  r;  rQ  r%  get_executor_for_configr   r  r  r  r  )rN  r   r   r4   r@  r  rO  rP  r  r  r{   r   r   r  r   r   rh  rv  s                     r?   run_on_datasetr  8  s    ::nd3L	+DdS::fd#DU		
 46::=I4{{}oQ  	
 vxF$,,)' - I A $'y'9'99;L;L#M
   %.%<%<)	
 
 44Y5F5Fq5IJ 	h %%)-6-D-D%1
 &&%%
M	 M7;;/
	 	s   #F%AFF$a1  
Run the Chain or language model on a dataset and store traces
to the specified project name.

Args:
    dataset_name: Name of the dataset to run the chain on.
    llm_or_chain_factory: Language model or Chain constructor to run
        over the dataset. The Chain constructor is used to permit
        independent calls on each example without carrying over state.
    evaluation: Configuration for evaluators to run on the
        results of the chain
    concurrency_level: The number of async tasks to run concurrently.
    project_name: Name of the project to store the traces in.
        Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information the test variant.
        (prompt version, model version, etc.)
    client: LangSmith client to use to access the dataset and to
        log feedback and run traces.
    verbose: Whether to print progress.
    tags: Tags to add to each run in the project.
    revision_id: Optional revision identifier to assign this test run to
        track the performance of different versions of your system.
Returns:
    A dictionary containing the run's project name and the resulting model outputs.


For the (usually faster) async version of this function, see :func:`arun_on_dataset`.

Examples
--------

.. code-block:: python

    from langsmith import Client
    from langchain_openai import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = smith_eval.RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            smith_eval.RunEvalConfig.Criteria("helpfulness"),
            smith_eval.RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = smith_eval.RunEvalConfig(
        custom_evaluators = [MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
zrun_on_dataset(zawait arun_on_dataset()z<my_dataset>)r   r  r   r   rt   rb  )r   Dict[str, Any]rt   r   )r   r  rt   rc   )r   r)   r   r  rt   r   )r   r)   r   r/   r   r  rt   r   )r   r)   r   rb  r   r  rt   r   )
r   rb  r   rc  r4   r  r   r(   rt   zOptional[List[RunEvaluator]])r   smith_eval.RunEvalConfigr   r  rt   r^  )r   r  r   r  rt   r^  )r   r  r   r  rt   r^  )r  zYUnion[smith_eval_config.SINGLE_EVAL_CONFIG_TYPE, smith_eval_config.CUSTOM_EVALUATOR_TYPE]r  zOptional[BaseLanguageModel]r   r   r   r(   r   r  r   r^  r   r^  r   r^  rt   r#   )
r   r  r   r  r   r  r   r  rt   z2Tuple[Optional[str], Optional[str], Optional[str]])r   r  r   r   r   r(   r   r  r   r  r   r  rt   zList[RunEvaluator])r   r   r   r  r   r  r  r   r   r  r  r  rt   Union[str, BaseMessage])r   Union[Chain, Runnable]r   r  r  r   r   r  r   r  r  r  rt   zUnion[dict, str])
r   r)   r   r   r   rb  r   r  rt   z'Union[dict, str, LLMResult, ChatResult])r   r   r   r  r  r   r   r  r   r  r  r  rt   r  )r   r  r   r  r  r   r   r  r   r  r  r  rt   zUnion[Dict, str]r  )rN  r   r   r   r   r  rO  r   rP  r  r   r  r@  zOptional[Union[str, datetime]]rt   z1Tuple[MCF, TracerSession, Dataset, List[Example]])rt   r  )r  ru   rt   r   )rN  zOptional[Client]r   r   r   r  r4   r  r@  r  r  r  rO  r^  rP  r  r  r  r  r^  r{   r   rt   r  )r<   
__future__r   concurrent.futuresrn  dataclassesr  r   loggingrJ  r   r   typingr   r   r   r	   r
   r   r   r   r   langchain_core._apir    langchain_core.callbacks.managerr   langchain_core.language_modelsr   langchain_core.messagesr   r   langchain_core.outputsr   r   langchain_core.runnablesr   r   r   r   r%  r   r  !langchain_core.tracers.evaluationr   r    langchain_core.tracers.langchainr   langsmith.clientr   langsmith.envr    r!   langsmith.evaluationr"   r#   r$   r   langsmith.run_helpersr%   r&   langsmith.schemasr'   r(   r)   r*   r+   langsmith.utilsr,   requestsr-   typing_extensionsr.   langchain.chains.baser/   langchain.evaluation.loadingr0   langchain.evaluation.schemar1   r2   r3   langchain.smithr4   r  langchain.smith.evaluationr   r5   r6   r^   ri   	getLoggerr9   r   rc   r  rb  r-  r8   rA   rw   r   r   r   r   r   r   r   r   r   r   r   r	  r  r   r  r&  r1  r6  r8  r;  r[  r]  	dataclassra  r  r  r  r  r  _RUN_ON_DATASET_DOCSTRINGreplacer=   r>   r?   <module>r     s"   D "       '
 
 
 0 6 < C 8 M M > < = # F E L L *  ' ' 7 
 5 B @			8	$Ruh''(dVS[		  HRuh//02CCD3y 3F4 F4R
R 
R ': 0: :  	: z/
d Y  $
P1 
B""" 2" 
	"JCCC 2C 
	C( 3 	
 "<$# 0$$ .$( $@@
 *@ @ @ )@ !@ @ "@ @F	4$	4#	4 %	4 )		4
 8	4=$== = )	=
 $= %= =N !%48)-=	== 	=
 = 2= '= =J !%48)-! 
  2 ' J 59333 	3
 23 -3| !%48)-;	;; ;
 ; 2; '; ;F !%48)-! 
  2 ' J 59555 	5
 25 -5z 26 $6::5:5:5 1:5 	:5
 /:5 :5 4:5 7:5z%  
 
 
D 
: , 6:6:"&15!%><><>< 1><
 3>< 4>< ><  >< />< >< >< >< ><L 6:6:"&15!%J<J<J< 1J<
 3J< 4J< J<  J< /J< J< J< J< J<Zj V 3 3;;/ r>   