
"""LLM Chains for evaluating question answering."""

from __future__ import annotations

import re
import string
from typing import Any, List, Optional, Sequence, Tuple

from langchain_core.callbacks.manager import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import PromptTemplate
from pydantic import ConfigDict

from langchain.chains.llm import LLMChain
from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import RUN_KEY


def _get_score(text: str) -> Optional[Tuple[str, int]]:
    # Look for an explicit "GRADE: CORRECT/INCORRECT" verdict first.
    match = re.search(r"grade:\s*(correct|incorrect)", text.strip(), re.IGNORECASE)
    if match:
        if match.group(1).upper() == "CORRECT":
            return "CORRECT", 1
        elif match.group(1).upper() == "INCORRECT":
            return "INCORRECT", 0
    try:
        # Fall back to the first and last words, ignoring punctuation.
        first_word = (
            text.strip().split()[0].translate(str.maketrans("", "", string.punctuation))
        )
        if first_word.upper() == "CORRECT":
            return "CORRECT", 1
        elif first_word.upper() == "INCORRECT":
            return "INCORRECT", 0
        last_word = (
            text.strip().split()[-1].translate(str.maketrans("", "", string.punctuation))
        )
        if last_word.upper() == "CORRECT":
            return "CORRECT", 1
        elif last_word.upper() == "INCORRECT":
            return "INCORRECT", 0
    except IndexError:
        pass
    return None


def _parse_string_eval_output(text: str) -> dict:
    """Parse the output text.

    Args:
        text (str): The output text to parse.

    Returns:
        Any: The parsed output.
    """
    reasoning = text.strip()
    parsed_scores = _get_score(reasoning)
    if parsed_scores is None:
        value, score = None, None
    else:
        value, score = parsed_scores
    return {
        "reasoning": reasoning,
        "value": value,
        "score": score,
    }


class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
    """LLM Chain for evaluating question answering."""

    output_key: str = "results"  #: :meta private:

    model_config = ConfigDict(
        extra="ignore",
    )

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @property
    def evaluation_name(self) -> str:
        return "correctness"

    @property
    def requires_reference(self) -> bool:
        return True

    @property
    def requires_input(self) -> bool:
        return True

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        prompt: Optional[PromptTemplate] = None,
        **kwargs: Any,
    ) -> QAEvalChain:
        """Load QA Eval Chain from LLM.

        Args:
            llm (BaseLanguageModel): the base language model to use.

            prompt (PromptTemplate): A prompt template containing the input_variables
                'query', 'answer' and 'result' that will be used as the prompt
                for evaluation. Defaults to PROMPT.

            **kwargs: additional keyword arguments.

        Returns:
            QAEvalChain: the loaded QA eval chain.
        """
        prompt = prompt or PROMPT
        expected_input_vars = {"query", "answer", "result"}
        if expected_input_vars != set(prompt.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt.input_variables}"
            )
        return cls(llm=llm, prompt=prompt, **kwargs)

    def evaluate(
        self,
        examples: Sequence[dict],
        predictions: Sequence[dict],
        question_key: str = "query",
        answer_key: str = "answer",
        prediction_key: str = "result",
        *,
        callbacks: Callbacks = None,
    ) -> List[dict]:
        """Evaluate question answering examples and predictions."""
        inputs = [
            {
                "query": example[question_key],
                "answer": example[answer_key],
                "result": predictions[i][prediction_key],
            }
            for i, example in enumerate(examples)
        ]
        return self.apply(inputs, callbacks=callbacks)

    def _prepare_output(self, result: dict) -> dict:
        parsed_result = _parse_string_eval_output(result[self.output_key])
        if RUN_KEY in result:
            parsed_result[RUN_KEY] = result[RUN_KEY]
        return parsed_result

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (str): the LLM or chain prediction to evaluate.
            reference (Optional[str], optional): the reference label
                to evaluate against.
            input (Optional[str], optional): the input to consider during evaluation.
            callbacks (Callbacks, optional): the callbacks to use for tracing.
            include_run_info (bool, optional): whether to include run info in the
                returned results.
            **kwargs: additional keyword arguments, including callbacks, tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """
        result = self(
            {
                "query": input,
                "answer": reference,
                "result": prediction,
            },
            callbacks=callbacks,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)

    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        result = await self.acall(
            inputs={"query": input, "answer": reference, "result": prediction},
            callbacks=callbacks,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)


class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
    """LLM Chain for evaluating QA w/o GT based on context"""

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @property
    def requires_reference(self) -> bool:
        """Whether the chain requires a reference string."""
        return True

    @property
    def requires_input(self) -> bool:
        """Whether the chain requires an input string."""
        return True

    model_config = ConfigDict(
        extra="ignore",
    )

    @classmethod
    def _validate_input_vars(cls, prompt: PromptTemplate) -> None:
        expected_input_vars = {"query", "context", "result"}
        if expected_input_vars != set(prompt.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt.input_variables}"
            )

    @property
    def evaluation_name(self) -> str:
        return "Contextual Accuracy"

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        prompt: Optional[PromptTemplate] = None,
        **kwargs: Any,
    ) -> ContextQAEvalChain:
        """Load QA Eval Chain from LLM.

        Args:
            llm (BaseLanguageModel): the base language model to use.

            prompt (PromptTemplate): A prompt template containing the input_variables
                'query', 'context' and 'result' that will be used as the prompt
                for evaluation. Defaults to CONTEXT_PROMPT.

            **kwargs: additional keyword arguments.

        Returns:
            ContextQAEvalChain: the loaded QA eval chain.
        """
        prompt = prompt or CONTEXT_PROMPT
        cls._validate_input_vars(prompt)
        return cls(llm=llm, prompt=prompt, **kwargs)

    def evaluate(
        self,
        examples: Sequence[dict],
        predictions: Sequence[dict],
        question_key: str = "query",
        context_key: str = "context",
        prediction_key: str = "result",
        *,
        callbacks: Callbacks = None,
    ) -> List[dict]:
        """Evaluate question answering examples and predictions."""
        inputs = [
            {
                "query": example[question_key],
                "context": example[context_key],
                "result": predictions[i][prediction_key],
            }
            for i, example in enumerate(examples)
        ]
        return self.apply(inputs, callbacks=callbacks)

    def _prepare_output(self, result: dict) -> dict:
        parsed_result = _parse_string_eval_output(result[self.output_key])
        if RUN_KEY in result:
            parsed_result[RUN_KEY] = result[RUN_KEY]
        return parsed_result

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        result = self(
            {
                "query": input,
                "context": reference,
                "result": prediction,
            },
            callbacks=callbacks,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)

    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        result = await self.acall(
            inputs={"query": input, "context": reference, "result": prediction},
            callbacks=callbacks,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)


class CotQAEvalChain(ContextQAEvalChain):
    """LLM Chain for evaluating QA using chain of thought reasoning."""

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @property
    def evaluation_name(self) -> str:
        return "COT Contextual Accuracy"

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        prompt: Optional[PromptTemplate] = None,
        **kwargs: Any,
    ) -> CotQAEvalChain:
        """Load QA Eval Chain from LLM."""
        prompt = prompt or COT_PROMPT
        cls._validate_input_vars(prompt)
        return cls(llm=llm, prompt=prompt, **kwargs)