
    7|h{/                    B   d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZ d dlmZmZ  ej6                  e      Z edd	
      Z G d d	ee      Z G d de      Z  G d de!e
      Z" ed       G d d             Z#ddZ$y)    )annotationsN)ABCabstractmethod)	dataclass)Enum)AbstractSetAnyCallable
CollectionIterableListLiteralOptionalSequenceTypeTypeVarUnion)BaseDocumentTransformerDocumentTSTextSplitter)boundc                      e Zd ZdZddedddf	 	 	 	 	 	 	 	 	 	 	 	 	 ddZedd       Z	 d	 	 	 	 	 dd	Zdd
Z	ddZ
ddZedd       Zedd e       df	 	 	 	 	 	 	 	 	 	 	 	 	 dd       Z	 	 	 	 	 	 ddZy)r   z)Interface for splitting text into chunks.i     FTc                    ||kD  rt        d| d| d      || _        || _        || _        || _        || _        || _        y)ad  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                            in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)self
chunk_sizechunk_overlaplength_functionkeep_separatoradd_start_indexstrip_whitespaces          \/var/www/html/test/engine/venv/lib/python3.12/site-packages/langchain_text_splitters/base.py__init__zTextSplitter.__init__!   sd    * :%.}o ><46  &+ /- /!1    c                     y)z$Split text into multiple components.N )r#   texts     r*   
split_textzTextSplitter.split_textB   s    r,   Nc           	        |xs i gt        |      z  }g }t        |      D ]  \  }}d}d}| j                  |      D ]  }	t        j                  ||         }
| j
                  r>||z   | j                  z
  }|j                  |	t        d|            }||
d<   t        |	      }t        |	|
      }|j                  |         |S )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater0   copydeepcopyr!   r   findmaxr   append)r#   texts	metadatas
_metadatas	documentsir/   indexprevious_chunk_lenchunkr4   offsetnew_docs                r*   create_documentszTextSplitter.create_documentsF   s     32$U"3
	 ' 	*GAtE!". *==A7(("%77$:M:MMF IIeSF^<E.3H]+),U&"I  )*	* r,   c                    g g }}|D ]8  }|j                  |j                         |j                  |j                         : | j                  ||      S )zSplit documents.)r=   )r;   r3   r4   rF   )r#   r?   r<   r=   docs        r*   split_documentszTextSplitter.split_documentsZ   sV    ry 	+CLL))*S\\*	+ $$Ui$@@r,   c                l    |j                  |      }| j                  r|j                         }|dk(  ry |S )N )joinr"   strip)r#   docs	separatorr/   s       r*   
_join_docszTextSplitter._join_docsb   s3    ~~d#!!::<D2:Kr,   c                d   | j                  |      }g }g }d}|D ]m  }| j                  |      }||z   t        |      dkD  r|ndz   | j                  kD  r
|| j                  kD  r%t        j	                  d| d| j                          t        |      dkD  r| j                  ||      }	|	|j                  |	       || j                  kD  s*||z   t        |      dkD  r|ndz   | j                  kD  ro|dkD  rj|| j                  |d         t        |      dkD  r|ndz   z  }|dd  }|| j                  kD  r?||z   t        |      dkD  r|ndz   | j                  kD  r|dkD  rj|j                  |       ||t        |      dkD  r|ndz   z  }p | j                  ||      }	|	|j                  |	       |S )Nr   zCreated a chunk of size z%, which is longer than the specified    )r   r5   r   loggerwarningrP   r;   r   )
r#   splitsrO   separator_lenrN   current_doctotald_lenrH   s
             r*   _merge_splitszTextSplitter._merge_splitsk   s    --i8!# 	KA((+D[1AA1E1M""# 4+++NN25' :>>B>N>N=OQ {#a'//+yACC(  $"5"55[9IA9MSTU**+!AI!6!6{1~!F-0-=-AMq"  '2!"o  $"5"55[9IA9MSTU**+!AI q!Tc+.>.B]JJE9	K: ook95?KKr,   c                    	 ddl m} t        |      st        d      dfd} | dd|i|S # t        $ r t        d      w xY w)	z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBasec                8    t        j                  |             S Nr5   encoder/   	tokenizers    r*   _huggingface_tokenizer_lengthzNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_length   s    9++D122r,   z`Could not import transformers python package. Please install it with `pip install transformers`.r&   r/   strreturnintr.   )transformersr]   
isinstancer   ImportError)clsrc   kwargsr]   rd   s    `   r*   from_huggingface_tokenizerz'TextSplitter.from_huggingface_tokenizer   se    	<i)@A W 3 K#@KFKK  	E 	s	   #0 Agpt2allc                   	 	 ddl }||j                  |      	n|j                  |      	d	fd}t	        | t
              r||d}i ||} | dd|i|S # t        $ r t        d      w xY w)	z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.c                >    t        j                  |             S N)allowed_specialdisallowed_specialr`   )r/   rt   ru   encs    r*   _tiktoken_encoderz=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder   s*    

$3'9   r,   )encoding_name
model_namert   ru   r&   re   r.   )tiktokenrk   encoding_for_modelget_encoding
issubclassTokenTextSplitter)
rl   rx   ry   rt   ru   rm   rz   rw   extra_kwargsrv   s
      ``    @r*   from_tiktoken_encoderz"TextSplitter.from_tiktoken_encoder   s    	 !--j9C''6C	 c,-!.(#2&8	L 0/,/F?#4???=  	A 	s   A A2c                6    | j                  t        |            S )z2Transform sequence of documents by splitting them.)rI   list)r#   r?   rm   s      r*   transform_documentsz TextSplitter.transform_documents   s     ##DO44r,   )r$   rh   r%   rh   r&   zCallable[[str], int]r'   z$Union[bool, Literal['start', 'end']]r(   boolr)   r   rg   Noner/   rf   rg   	List[str]r_   )r<   r   r=   zOptional[List[dict]]rg   List[Document])r?   zIterable[Document]rg   r   )rN   r   rO   rf   rg   Optional[str])rU   zIterable[str]rO   rf   rg   r   )rc   r	   rm   r	   rg   r   )rl   zType[TS]rx   rf   ry   r   rt   'Union[Literal['all'], AbstractSet[str]]ru   &Union[Literal['all'], Collection[str]]rm   r	   rg   r   )r?   Sequence[Document]rm   r	   rg   r   )__name__
__module____qualname____doc__r5   r+   r   r0   rF   rI   rP   r[   classmethodrn   setr   r   r.   r,   r*   r   r      sQ   3  03?D %!%22 2 .	2
 =2 2 2 
2B 3 3 CG+?	(A(T L L(  $$(CF5EJ)@)@)@ ")@ A	)@
 C)@ )@ 
)@ )@V5+57:5	5r,   c                  V     e Zd ZdZdd e       df	 	 	 	 	 	 	 	 	 	 	 d fdZddZ xZS )	r~   z/Splitting text to tokens using model tokenizer.ro   Nrp   c                    t        |   di | 	 ddl}||j	                  |      }n|j                  |      }|| _        || _        || _        y# t        $ r t        d      w xY w)zCreate a new TextSplitter.r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r.   )	superr+   rz   rk   r{   r|   
_tokenizer_allowed_special_disallowed_special)	r#   rx   ry   rt   ru   rm   rz   rv   	__class__s	           r*   r+   zTokenTextSplitter.__init__   s     	"6"	 !--j9C''6C /#5   	A 	s   A A&c                     d fd}t         j                   j                   j                  j                  |      }t        ||      S )a  Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text (str): The input text to be split into smaller chunks.

        Returns:
            List[str]: A list of text chunks, where each chunk is derived from a portion
            of the input text based on the tokenization and chunking rules.
        c                h    j                   j                  | j                  j                        S rs   )r   ra   r   r   )_textr#   s    r*   _encodez-TokenTextSplitter.split_text.<locals>._encode  s4    ??)) $ 5 5#'#;#; *  r,   )r%   tokens_per_chunkdecodera   rb   )r   rf   rg   z	List[int])	Tokenizerr   r   r   r   split_text_on_tokens)r#   r/   r   rc   s   `   r*   r0   zTokenTextSplitter.split_text   sE     	 --!--??))	
	 $CCr,   )rx   rf   ry   r   rt   r   ru   r   rm   r	   rg   r   r   )r   r   r   r   r   r+   r0   __classcell__)r   s   @r*   r~   r~      s]    9 $$(CF5EJ66 "6 A	6
 C6 6 
66Dr,   r~   c                  x    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZy)Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolcluaperlhaskellelixir
powershellN)r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLCLUAPERLHASKELLELIXIR
POWERSHELLr.   r,   r*   r   r     s    ,
C	BDF	B	B
CEF
CDDEEHED
CFEA
CDGFJr,   r   T)frozenc                  @    e Zd ZU dZded<   	 ded<   	 ded<   	 ded<   y	)
r   zTokenizer data class.rh   r%   r   zCallable[[List[int]], str]r   zCallable[[str], List[int]]ra   N)r   r   r   r   __annotations__r.   r,   r*   r   r   ;  s)    *,&&=&&=r,   r   c                   g }|j                  |       }d}t        ||j                  z   t        |            }||| }|t        |      k  r|j	                  |j                  |             |t        |      k(  r	 |S ||j                  |j                  z
  z  }t        ||j                  z   t        |            }||| }|t        |      k  r|S )z6Split incoming text and return chunks using tokenizer.r   )ra   minr   r5   r;   r   r%   )r/   rc   rU   	input_ids	start_idxcur_idx	chunk_idss          r*   r   r   I  s    F  &II)i888#i.IG)G,I
c)n
$i&&y12c)n$ M 	Y//)2I2III	i)"<"<<c)nMi0	 c)n
$ Mr,   )r/   rf   rc   r   rg   r   )%
__future__r   r7   loggingabcr   r   dataclassesr   enumr   typingr   r	   r
   r   r   r   r   r   r   r   r   r   langchain_core.documentsr   r   	getLoggerr   rS   r   r   r~   rf   r   r   r   r.   r,   r*   <module>r      s    "   # !     G			8	$T(|5*C |5~<D <D~sD > $
> 
> 
>r,   