
    bkh
                     `    d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dl	Z	d dl
Z
 G d d      Zy)    N)extract_text)convert_from_pathc                       e Zd Zd Zd Zd Zy)PdfExtractorc                 @    t        j                  t              | _        y )N)logging	getLogger__name__logger)selfs    BE:\xampp\htdocs\new-grp\engine\services\extractor\pdf_extractor.py__init__zPdfExtractor.__init__   s    ''1    c                    	 t        j                  t        j                  j	                  |            j                  d      }| j                  j                  d|        d}	 t        j                  |      }|D ]  }||j                  d      z  } |j                          |j                         r| j                  |      S | j                  j                  d       	 t#        |      }|j                         r| j                  |      S | j                  j                  d       	 t%        |      }|D ]  }|t'        j(                  |d	
      z  } | j                  |      S # t        $ r1}| j                  j                  dt!        |              Y d }~d }~ww xY w# t        $ r1}| j                  j                  dt!        |              Y d }~d }~ww xY w# t        $ r1}| j                  j                  dt!        |              Y d }~nd }~ww xY w|j                         st+        d      |S # t        $ r-}| j                  j                  dt!        |               d }~ww xY w)Nutf-8zExtracting text from PDF:  textz.No text found with PyMuPDF, trying PDFMiner...zPyMuPDF extraction failed: z*No text found with PDFMiner, trying OCR...zPDFMiner extraction failed: eng)langzOCR extraction failed: z'No text could be extracted from the PDFzPDF extraction failed: )base64	b64decodeurllibparseunquotedecoder   infofitzopenget_textclosestrip_detect_encoding	Exceptionerrorstrr   r   pytesseractimage_to_string
ValueError)	r   encoded_pathpdf_pathextracted_textdocpageeimagesimgs	            r   r   zPdfExtractor.extract_text   s+   -	''(<(<\(JKRRSZ[HKK9(DEN
Jii)D"dmmF&;;N  		!'')00@@  !QR
K!-h!7!'')00@@  !MN
F*84!C"k&A&A#E&RRN ",,^<<#  J!!$?Ax"HIIJ  K!!$@Q"IJJK  F!!$;CF8"DEEF "'') !JKK!! 	KK 7Ax@A	s   A!H: $A E& E& !+F# F# )<G  &	F /'FH: F  H: #	G,'GH: GH:  	H)'HH: HH: :	I0(I++I0c                    	 t        j                  |j                               }|d   }|r"|j                  |      j                  dd      S |S # t        $ r3}| j
                  j                  dt        |              |cY d}~S d}~ww xY w)z*Detect encoding and convert text to UTF-8.encodingr   ignore)errorszEncoding detection failed: N)chardetdetectencoder   r#   r   r$   r%   )r   r   resultr2   r.   s        r   r"   zPdfExtractor._detect_encoding>   s    	^^DKKM2Fj)H{{8,33GH3MMK 	KK ;CF8DEK	s$   AA A 	B(BBBN)r
   
__module____qualname__r   r   r"    r   r   r   r   
   s    2.`
r   r   )r   urllib.parser   r   pdfminer.high_levelr   	pdf2imager   r&   r5   r   r   r;   r   r   <module>r?      s'       , '   > >r   