
    ihIH                    P   d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ  ej:                  e      ZdgZ e G d d             Z!ddhZ"h dZ#ddZ$ G d de      Z% G d de      Z&e G d de             Z'y)zLoads YouTube transcript.    )annotationsN)Enum)Path)AnyDict	GeneratorListOptionalSequenceUnion)parse_qsurlparse)
ParseError)Document)model_validator)	dataclass)
BaseLoaderz0https://www.googleapis.com/auth/youtube.readonlyc                      e Zd ZU dZ ej
                         dz  dz  Zded<    ej
                         dz  dz  Zded<    ej
                         dz  dz  Z	ded<   dd	Z
 ed
      edd              ZddZy)GoogleApiClienta  Generic Google API Client.

    To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
    python package installed.
    As the google api expects credentials you need to set up a google account and
    register your Service. "https://developers.google.com/docs/api/quickstart/python"

    *Security Note*: Note that parsing of the transcripts relies on the standard
        xml library but the input is viewed as trusted in this case.


    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )

    z.credentialszcredentials.jsonr   credentials_pathservice_account_pathz
token.json
token_pathc                .    | j                         | _        y N)_load_credentialscredsselfs    j/var/www/html/dev/engine/venv/lib/python3.12/site-packages/langchain_community/document_loaders/youtube.py__post_init__zGoogleApiClient.__post_init__2   s    ++-
    beforemodec                    |j                   j                  d      s&|j                   j                  d      st        d      |j                   S )DValidate that either folder_id or document_ids is set, but not both.r   r   -Must specify either channel_name or video_idskwargsget
ValueErrorclsvaluess     r   #validate_channel_or_videoIds_is_setz3GoogleApiClient.validate_channel_or_videoIds_is_set5   sC    
 }}  !34V]]=N=N">
 LMM}}r!   c                   	 ddl m} ddlm} ddlm} ddlm} ddlm	} d}| j                  j                         r.|j
                  j                  t        | j                              S | j                  j                         r)|j!                  t        | j                        t"              }|r|j$                  s|r/|j&                  r#|j(                  r|j+                   |              n;|j-                  t        | j.                        t"              }|j1                  d	      }t3        | j                  d
      5 }|j5                  |j7                                ddd       |S |S # t        $ r t        d      w xY w# 1 sw Y   |S xY w)zLoad credentials.r   )Request)service_account)Credentials)InstalledAppFlowYouTubeTranscriptApiYou must run`pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib youtube-transcript-api` to use the Google Drive loaderN)portw)google.auth.transport.requestsr1   google.oauth2r2   google.oauth2.credentialsr3   google_auth_oauthlib.flowr4   youtube_transcript_apir6   ImportErrorr   existsfrom_service_account_filestrr   from_authorized_user_fileSCOPESvalidexpiredrefresh_tokenrefreshfrom_client_secrets_filer   run_local_serveropenwriteto_json)	r   r1   r2   r3   r4   r6   r   flowtokens	            r   r   z!GoogleApiClient._load_credentials@   sH   	>5=BC $$++-"..HHD--.  ??!!#99#doo:NPVWEEKK5+>+>gi('@@--. --1-5doos+ -uEMMO,- u;  	1 	4- s   E) = F)E>FNreturnNoner.   r   rQ   r   )rQ   r   )__name__
__module____qualname____doc__r   homer   __annotations__r   r   r    r   classmethodr/   r    r!   r   r   r      s    * 'TYY[>9<NNdN!*~!=@R!R$R tyy{^3lBJB. (#  $&r!   r   httphttps>   vid.plusyoutu.beyoutube.comm.youtube.comwww.youtube.comwww.youtube-nocookie.comc                   t        |       }|j                  t        vry|j                  t        vry|j
                  }|j                  d      r9|j                  }t        |      }d|v r|d   }t        |t              r|n|d   }n0y|j
                  j                  d      }|j                  d      d   }t        |      dk7  ry|S )zEParse a YouTube URL and return the video ID if valid, otherwise None.Nz/watchvr   /   )r   schemeALLOWED_SCHEMESnetlocALLOWED_NETLOCSpathendswithqueryr   
isinstancerB   lstripsplitlen)url
parsed_urlrm   ro   parsed_queryidsvideo_ids          r   _parse_video_idry   t   s    #J//??D}}X  ,s#C(c2sAH%%c*::c?2&
8}Or!   c                      e Zd ZdZdZdZdZy)TranscriptFormatz3Output formats of transcripts from `YoutubeLoader`.textlineschunksN)rT   rU   rV   rW   TEXTLINESCHUNKSr[   r!   r   r{   r{      s    =DEFr!   r{   c                      e Zd ZdZdddej
                  ddf	 	 	 	 	 	 	 	 	 	 	 	 	 ddZedd       Ze	dd       Z
	 	 	 	 	 	 dd	Z	 	 	 	 dd
ZddZddZy)YoutubeLoaderz!Load `YouTube` video transcripts.FenNx   c                    || _         d|i| _        || _        || _        t	        |t
              r	|g| _        n|| _        || _        || _        || _        || _	        y)z!Initialize with YouTube video ID.sourceN)
rx   	_metadataadd_video_infolanguagerp   rB   translationtranscript_formatcontinue_on_failurechunk_size_seconds)r   rx   r   r   r   r   r   r   s           r   __init__zYoutubeLoader.__init__   s`     !"H-, h$%JDM$DM&!2#6 "4r!   c                >    t        |       }|st        d|  d      |S )z*Extract video ID from common YouTube URLs.z.Could not determine the video ID for the URL "z".)ry   r+   )youtube_urlrx   s     r   extract_video_idzYoutubeLoader.extract_video_id   s1     #;/@RP  r!   c                6    | j                  |      } | |fi |S )z|Given a YouTube URL, construct a loader.
        See `YoutubeLoader()` constructor for a list of keyword arguments.
        )r   )r-   r   r)   rx   s       r   from_youtube_urlzYoutubeLoader.from_youtube_url   s$    
 ''48&v&&r!   c                    t        |d      \  }}t        |d      \  }}t        dj                  t        d |            i | j                  ||dd|dd|dd| j
                   d| dd	
      S )z0Create Document from chunk of transcript pieces.<    c                *    | d   j                  d      S Nr|   r   strip)chunk_pieces    r   <lambda>z4YoutubeLoader._make_chunk_document.<locals>.<lambda>   s    F(;(A(A#(F r!   02d: https://www.youtube.com/watch?v=z&t=s)start_secondsstart_timestampr   page_contentmetadata)divmodr   joinmapr   rx   )r   chunk_pieceschunk_start_secondsmr   hs         r   _make_chunk_documentz"YoutubeLoader._make_chunk_document   s     )2.1a}1FU..!4&'WAaWAaW#= 34==/)*!-	
 	
r!   c              #    K   g }d}| j                   }|D ]L  }|d   |d   z   }||kD  r)|r| j                  ||       g }|}|| j                   z  }|j                  |       N t        |      dkD  r| j                  ||       y y w)Nr   startduration)r   r   appendrs   )r   transcript_piecesr   r   chunk_time_limittranscript_piece	piece_ends          r   _get_transcript_chunksz$YoutubeLoader._get_transcript_chunks   s      .022 1 		2(14DZ4PPI++33LBUVV!&6# D$;$;;  01		2 |q ++L:MNN !s   BBc                v   	 ddl m}m}m} | j
                  r+| j                         }| j                  j                  |       	 |j                  | j                        }	 |j                  | j                        }| j                  |j                  | j                        }|j                         }| j                   t"        j$                  k(  r4dj'                  t)        d |            }t+        || j                        gS | j                   t"        j,                  k(  rt/        t)        d |            S | j                   t"        j0                  k(  rt/        | j3                  |            S t5        d	      # t        $ r t	        d      w xY w# |$ r g cY S w xY w# |$ r |j                  dg      }Y <w xY w)
z1Load YouTube transcripts into `Document` objects.r   )NoTranscriptFoundTranscriptsDisabledr6   zvCould not import "youtube_transcript_api" Python package. Please install it with `pip install youtube-transcript-api`.r   r   c                *    | d   j                  d      S r   r   r   s    r   r   z$YoutubeLoader.load.<locals>.<lambda>  s    -=f-E-K-KC-P r!   r   c           
         t        | d   j                  d      t        t        d | j	                                           S )Nr|   r   c                    | d   dk7  S )Nr   r|   r[   )items    r   r   z6YoutubeLoader.load.<locals>.<lambda>.<locals>.<lambda>"  s    T!W-> r!   r   )r   r   dictfilteritemsr   s    r   r   z$YoutubeLoader.load.<locals>.<lambda>  s=    X%5f%=%C%CC%H!%" >@P@V@V@X". r!   zUnknown transcript format.)r>   r   r   r6   r?   r   _get_video_infor   updatelist_transcriptsrx   find_transcriptr   r   	translatefetchr   r{   r   r   r   r   r   listr   r   r+   )r   r   r   r6   
video_infotranscript_list
transcriptr   s           r   loadzYoutubeLoader.load   s   
	   --/JNN!!*-	2CCDMMRO	A(88GJ '#--d.>.>?J2<2B2B2D!!%5%:%::P%J *t~~NOO##'7'='== &
  ##'7'>'>>334EFGG 9::k  	O 	 # 	I	
 ! 	A(88$@J	As/   
E8 F  F 8FFFF87F8c                x   	 ddl m}  |d| j                         }|j                  xs d|j
                  xs d|j                  xs d|j                  xs d|j                  r|j                  j                  d      nd|j                  xs d|j                  xs dd}|S # t        $ r t        d      w xY w)zGet important video information.

        Components include:
            - title
            - description
            - thumbnail URL,
            - publish_date
            - channel author
            - and more.
        r   )YouTubezVCould not import "pytube" Python package. Please install it with `pip install pytube`.r   Unknownz%Y-%m-%d %H:%M:%S)titledescription
view_countthumbnail_urlpublish_datelengthauthor)pytuber   r?   rx   r   r   viewsr   r   strftimer   r   )r   r   ytr   s       r   r   zYoutubeLoader._get_video_info/  s    	& 7GHXX*>>6Y((-a--: OO445HIiin1ii,9


 #  	? 	s   B$ $B9)rx   rB   r   boolr   zUnion[str, Sequence[str]]r   Optional[str]r   r{   r   r   r   int)r   rB   rQ   rB   )r   rB   r)   r   rQ   r   )r   
List[Dict]r   r   rQ   r   )r   r   rQ   zGenerator[Document, None, None]rQ   List[Document])rQ   r   )rT   rU   rV   rW   r{   r   r   staticmethodr   rZ   r   r   r   r   r   r[   r!   r   r   r      s    +
  %.2%).>.C.C$)"%55 5 ,	5
 #5 ,5 "5  50   ' '
&
=@
	
*O!+O	(O(=;~r!   r   c                      e Zd ZU dZded<   dZded<   dZded<   d	Zd
ed<   dZded<   dZ	d
ed<   ddZ
ddZ ed      edd              ZddZd dZd!dZd"dZd#dZd$dZy)%GoogleApiYoutubeLoadera  Load all Videos from a `YouTube` Channel.

    To use, you should have the ``googleapiclient,youtube_transcript_api``
    python package installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additionally you have to either provide a channel name or a list of videoids
    "https://developers.google.com/docs/api/quickstart/python"



    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            from langchain_community.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name = "CodeAesthetic"
            )
            load.load()

    r   google_api_clientNr   channel_namezOptional[List[str]]	video_idsTr   r   r   rB   captions_languageFr   c                X    | j                  | j                  j                        | _        y r   )_build_youtube_clientr   r   youtube_clientr   s    r   r    z$GoogleApiYoutubeLoader.__post_init__v  s!    "889O9O9U9UVr!   c                b    	 ddl m} ddlm}  |dd|      S # t        $ r t	        d      w xY w)Nr   )buildr5   r7   youtubev3)credentials)googleapiclient.discoveryr   r>   r6   r?   )r   r   r   r6   s       r   r   z,GoogleApiYoutubeLoader._build_youtube_clienty  sA    	7C Y%88  	1 	s    .r"   r#   c                    |j                   j                  d      s&|j                   j                  d      st        d      |j                   S )r&   r   r   r'   r(   r,   s     r   r/   z:GoogleApiYoutubeLoader.validate_channel_or_videoIds_is_set  s=     }}  09J9J;9WLMM}}r!   c                T   ddl m}m} |j                  |      }	 |j	                  | j
                  g      }j                         }dj                  |D cg c]  }|d   j                  d       c}      S # |$ r% |D ]  }|j                  | j
                        } Y jw xY wc c}w )Nr   )r   r6   r   r|   )
r>   r   r6   r   r   r   r   r   r   r   )	r   rx   r   r6   r   r   available_transcriptr   ts	            r   _get_transcripe_for_video_idz3GoogleApiYoutubeLoader._get_transcripe_for_video_id  s    R.??I	(88$:P:P9QRJ ',,.xx7HI!6-IJJ ! 	(7 $1;;D<R<RS
	 Js   A8 B%8'B"!B"c                    | j                  |      }| j                  j                         j                  d|      j	                         }t        ||j                  d      d         S )N
id,snippetpartidr   r   r   )r   r   videosr   executer   r*   )r   rx   r)   captionsvideo_responses        r   _get_document_for_video_idz1GoogleApiYoutubeLoader._get_document_for_video_id  sl    44X>&&(T!   WY 	 !#''03
 	
r!   c                    | j                   j                         j                  d|dd      }|j                         }|d   d   d   d   }|S )Nr   channel   )r   qtype
maxResultsr   r   	channelId)r   searchr   r   )r   r   requestresponse
channel_ids        r   _get_channel_idz&GoogleApiYoutubeLoader._get_channel_id  s_    %%,,.33	 4 
 ??$g&q)$/<
r!   c                    | j                   j                         j                  d|      }|j                         }|d   d   d   d   d   S )NcontentDetailsr   r   r   relatedPlaylistsuploads)r   channelsr   r   )r   r  r	  r
  s       r   _get_uploads_playlist_idz/GoogleApiYoutubeLoader._get_uploads_playlist_id  s[    %%..055! 6 
 ??$ #$456HI)TTr!   c           	        	 ddl m}m} | j	                  |      }| j                  |      }| j                  j                         j                  d|d      }g }||j                         }	|	d   D ]s  }
|
d   d	   d
   }d
|i}| j                  r(|
d   j                  d       |j                  |
d          	 | j                  |      }|j                  t        ||             u | j                  j)                         j+                  ||	      }||S # t        $ r t        d      w xY w# ||t         f$ r>}| j"                  r%t$        j'                  dd|
d   d
    d| z          n|Y d }~d }~ww xY w)Nr   )r   r   zTYou must run`pip install --upgrade youtube-transcript-api` to use the youtube loaderr   2   )r   
playlistIdr  r   snippet
resourceIdvideoId
thumbnailsr   zError fetching transscript r   r   z, exception: )r>   r   r   r?   r  r  r   playlistItemsr   r   r   popr   r   r   r   r   r   loggererrorr  	list_next)r   r  r)   r   r   r  uploads_playlist_idr	  r   r
  r   rx   	meta_datar   es                  r   _get_document_for_channelz0GoogleApiYoutubeLoader._get_document_for_channel  s   	 ))'2
";;JG%%335::* ; 

 	!(H !) 	?<8C&1	&&O''5$$T)_5#'#D#DX#NL$$ )5%.. ))002<<WhOG7 !: [  	, 	D ,->
K //9 !$t*Y"7!8aSIJ
  s#   D 8-D.D+.E8:3E33E8c                "   g }| j                   r,|j                  | j                  | j                                |S | j                  r:|j                  | j                  D cg c]  }| j	                  |       c}       |S t        d      c c}w )zLoad documents.r'   )r   extendr"  r   r   r+   )r   document_listrx   s      r   r   zGoogleApiYoutubeLoader.load  s      !?!?@Q@Q!RS  ^^   %)NN  33H=  LMMs    BrP   )r   r   rQ   r   rS   )rx   rB   rQ   rB   )rx   rB   r)   r   rQ   r   )r   rB   rQ   rB   )r  rB   rQ   rB   )r  rB   r)   r   rQ   r   r   )rT   rU   rV   rW   rY   r   r   r   r   r   r    r   r   rZ   r/   r   r   r  r  r"  r   r[   r!   r   r   r   Q  s    8 '&"&L-&%)I")ND!s! %%W9  (#  $K
	U3jr!   r   )rt   rB   rQ   r   )(rW   
__future__r   loggingenumr   pathlibr   typingr   r   r   r	   r
   r   r   urllib.parser   r   xml.etree.ElementTreer   langchain_core.documentsr   pydanticr   pydantic.dataclassesr   )langchain_community.document_loaders.baser   	getLoggerrT   r  rD   r   rj   rl   ry   r{   r   r   r[   r!   r   <module>r2     s     "    H H H + , - $ * @			8	$
<	= N N Nb 7#<t tJ tn rZ r rr!   