
    7|h`                    X    d dl mZ d dlmZmZmZmZmZ er
d dlm	Z	m
Z
mZ  G d d      Zy)    )annotations)TYPE_CHECKINGAnyIterableListOptional)	DataFrameRowSparkSessionc                      e Zd ZdZ	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 ddZe	 d	 	 	 	 	 	 	 dd       ZddZddZddZ	dddZ
dd	Zdd
ZddZdddZdddZdddZy)SparkSQLz;SparkSQL is a utility class for interacting with Spark SQL.Nc                H   	 ddl m} |r|n|j                  j	                         | _        |%| j
                  j                  j                  |       |%| j
                  j                  j                  |       t        | j                               | _        |rt        |      n	t               | _        | j                  r*| j                  | j                  z
  }|rt        d| d      |rt        |      n	t               | _        | j                  r*| j                  | j                  z
  }|rt        d| d      | j                         }	|	rt        |	      n| j                  | _        t#        |t$              st'        d      || _        y# t        $ r t        d      w xY w)	a  Initialize a SparkSQL object.

        Args:
            spark_session: A SparkSession object.
              If not provided, one will be created.
            catalog: The catalog to use.
              If not provided, the default catalog will be used.
            schema: The schema to use.
              If not provided, the default schema will be used.
            ignore_tables: A list of tables to ignore.
              If not provided, all tables will be used.
            include_tables: A list of tables to include.
              If not provided, all tables will be used.
            sample_rows_in_table_info: The number of rows to include in the table info.
              Defaults to 3.
        r   r   Fpyspark is not installed. Please install it with `pip install pyspark`Nzinclude_tables  not found in databasezignore_tables z,sample_rows_in_table_info must be an integer)pyspark.sqlr   ImportErrorbuildergetOrCreate_sparkcatalogsetCurrentCatalogsetCurrentDatabaseset_get_all_table_names_all_tables_include_tables
ValueError_ignore_tablesget_usable_table_names_usable_tables
isinstanceint	TypeError_sample_rows_in_table_info)
selfspark_sessionr   schemaignore_tablesinclude_tablessample_rows_in_table_infor   missing_tablesusable_tabless
             f/var/www/html/test/engine/venv/lib/python3.12/site-packages/langchain_community/utilities/spark_sql.py__init__zSparkSQL.__init__   s   2	0 +M0D0D0P0P0R 	 KK11':KK226:t88:;6Ds>2#%!11D4D4DDN %n%55KL  5Bc-0su!0043C3CCN $^$44JK  3354Ac-0tGWGW3S9JKK*C'E  	X 	s   F F!c                    	 ddl m} |j                  j	                  |      j                         } | |fi |S # t        $ r t        d      w xY w)zzCreating a remote Spark Session via Spark connect.
        For example: SparkSQL.from_uri("sc://localhost:15002")
        r   r   r   )r   r   r   r   remoter   )clsdatabase_uriengine_argskwargsr   sparks         r.   from_urizSparkSQL.from_uriK   s\    	0 $$++L9EEG5#F##  	X 	s	   : Ac                v    | j                   r| j                   S t        | j                  | j                  z
        S )zGet names of tables available.)r   sortedr   r   )r&   s    r.   r    zSparkSQL.get_usable_table_names\   s3    '''d&&)<)<<==    c                    | j                   j                  d      j                  d      j                         }t	        t        d |            S )NzSHOW TABLES	tableNamec                    | j                   S N)r<   )rows    r.   <lambda>z/SparkSQL._get_all_table_names.<locals>.<lambda>e   s
    CMM r:   )r   sqlselectcollectlistmap)r&   rowss     r.   r   zSparkSQL._get_all_table_namesc   s;    {{}-44[AIIKC14899r:   c                    | j                   j                  d|       j                         d   j                  }|j	                  d      }|d | dz   S )NzSHOW CREATE TABLE r   USING;)r   rA   rC   createtab_stmtfind)r&   table	statementusing_clause_indexs       r.   _get_create_table_stmtzSparkSQL._get_create_table_stmtg   sX    KKOO089AACAFUU 	 '^^G4,,-33r:   c                X   | j                         }|-t        |      j                  |      }|rt        d| d      |}g }|D ]R  }| j	                  |      }| j
                  r"|dz  }|d| j                  |       dz  }|dz  }|j                  |       T dj                  |      }|S )Nztable_names r   z

/*
z*/z

)	r    r   
differencer   rO   r%   _get_sample_spark_rowsappendjoin)r&   table_namesall_table_namesr,   tables
table_name
table_info	final_strs           r.   get_table_infozSparkSQL.get_table_infoo   s    557" -88IN </??U!VWW)O) 	&J44Z@J..h&
4#>#>z#J"K2NN
d"
MM*%	& KK'	r:   c                   d| d| j                    }| j                  j                  |      }dj                  t	        t        d |j                  j                                    }	 | j                  |      }dj                  |D cg c]  }dj                  |       c}      }| j                    d| d| d| S c c}w # t        $ r d}Y )w xY w)	NzSELECT * FROM z LIMIT 	c                    | j                   S r>   )name)fs    r.   r@   z1SparkSQL._get_sample_spark_rows.<locals>.<lambda>   s
    166 r:   rQ    z rows from z table:
)
r%   r   rA   rU   rD   rE   r(   fields_get_dataframe_results	Exception)r&   rL   querydfcolumns_strsample_rowsr?   sample_rows_strs           r.   rS   zSparkSQL._get_sample_spark_rows   s     wt/N/N.OP[[__U#iiS)9299;K;K%L MN	!55b9K"ii;(OC3(OPO
 ../{5'm2!	
	 )P 	! O	!s$   ( C B> C >C CCc                l    t        t        t        |j                         j	                                     S r>   )tuplerE   strasDictvalues)r&   r?   s     r.   _convert_row_as_tuplezSparkSQL._convert_row_as_tuple   s#    Scjjl113455r:   c                \    t        t        | j                  |j                                     S r>   )rD   rE   rp   rC   )r&   rg   s     r.   rd   zSparkSQL._get_dataframe_results   s     C22BJJLABBr:   c                    | j                   j                  |      }|dk(  r|j                  d      }t        | j	                  |            S )None   )r   rA   limitrm   rd   )r&   commandfetchrg   s       r.   runzSparkSQL.run   s>    [[__W%E>!B4..r233r:   c                ^    	 | j                  |      S # t        $ r}	 d| cY d}~S d}~ww xY w)af  Get information about specified tables.

        Follows best practices as specified in: Rajkumar et al, 2022
        (https://arxiv.org/abs/2204.00498)

        If `sample_rows_in_table_info`, the specified number of sample rows will be
        appended to each table description. This can increase performance as
        demonstrated in the paper.
        Error: N)r\   r   )r&   rV   es      r.   get_table_info_no_throwz SparkSQL.get_table_info_no_throw   s7    	!&&{33 	!*QC= 	!s    	,',,c                `    	 | j                  ||      S # t        $ r}	 d| cY d}~S d}~ww xY w)a*  Execute a SQL command and return a string representing the results.

        If the statement returns rows, a string of the results is returned.
        If the statement returns no rows, an empty string is returned.

        If the statement throws an error, the error message is returned.
        rz   N)rx   re   )r&   rv   rw   r{   s       r.   run_no_throwzSparkSQL.run_no_throw   s7    	!88GU++ 	!*QC= 	!s    	-(--)NNNNN   )r'   zOptional[SparkSession]r   Optional[str]r(   r   r)   Optional[List[str]]r*   r   r+   r#   r>   )r3   rm   r4   zOptional[dict]r5   r   returnr   )r   zIterable[str])rL   rm   r   rm   )rV   r   r   rm   )r?   r
   r   rl   )rg   r	   r   rD   )all)rv   rm   rw   rm   r   rm   )__name__
__module____qualname____doc__r/   classmethodr7   r    r   rO   r\   rS   rp   rd   rx   r|   r~    r:   r.   r   r   	   s    E 15!% $-1.2)*=D-=D =D 	=D
 +=D ,=D $'=D~ >B$$-;$NQ$	$ $ >:4$
"6C4! !r:   r   N)
__future__r   typingr   r   r   r   r   r   r	   r
   r   r   r   r:   r.   <module>r      s#    " ? ?88q! q!r:   