o
    MhX                     @  s:  d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZmZ edZe  Z!e!"e#d 									d2d3d$d%Z$									d2d4d(d)Z%									d2d5d,d-Z&									d6d7d0d1Z'dS )8    )annotationsN)PathLike)BinaryIO   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encodingcut_sequence_chunks	iana_nameidentify_sig_or_bomis_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s      皙?TF皙?	sequencesbytes | bytearraystepsint
chunk_size	thresholdfloatcp_isolationlist[str] | Nonecp_exclusionpreemptive_behaviourboolexplainlanguage_thresholdenable_fallbackreturnr   c
           2      C  s	  t | ttfstdt| |rtj}
tt	 t
t t| }|dkrGtd |r;tt	 t
|
p9tj tt| dddg dgS |dur]ttd	d
| dd |D }ng }|duruttdd
| dd |D }ng }||| krttd||| d}|}|dkr|| |k rt|| }t| tk }t| tk}|rttd| n|rttd| g }|rt| nd}|dur|| ttd| t }g }g }d}d}d}t }t }t| \}}|dur|| ttdt|| |d d|vr|d |t D ]7}|r$||vr$q|r.||v r.q||v r5q|| d}||k}|oFt|}|dv rX|sXttd| q|dv ri|sittd| qzt|}W n t t!fy   ttd| Y qw z9|r|du rt"|du r| dtd n	| t|td |d nt"|du r| n| t|d |d}W n+ t#t$fy } zt |t$sttd|t"| || W Y d}~qd}~ww d} |D ]}!t%||!rd}  nq| rttd||! qt&|sdnt||t|| }"|o&|duo&t||k }#|#r1ttd | tt|"d! }$t'|$d"}$d}%d}&g }'g }(zLt(| ||"||||||	D ]=})|'|) |(t)|)||du ordt|  kopd"kn   |(d# |kr|%d7 }%|%|$ks|r|du r nqSW n! t#y } zttd$|t"| |$}%d}&W Y d}~nd}~ww |&s|r|sz| td%d j*|d&d' W n# t#y } zttd(|t"| || W Y d}~qd}~ww |(rt+|(t|( nd}*|*|ks|%|$krJ|| ttd)||%t,|*d* d+d, |	rH|dd|d-d.fv rH|&sHt| |||g ||d/}+||kr>|+}n
|dkrF|+}n|+}qttd0|t,|*d* d+d, |s`t-|},nt.|},|,rsttd1|t"|, g }-|dkr|'D ]})t/|)||,rd2|,nd}.|-|. q|t0|-}/|/rttd3|/| t| ||*||/|du s||ddfv r|nd|d/}0||0 ||ddfv r|*d4k r|*dkrtd5|0j1 |rtt	 t
|
 t|0g  S ||0 t|r-|du s||v r-d|v r-d|v r-|2 }1td5|1j1 |r&tt	 t
|
 t|1g  S ||krNtd6| |rEtt	 t
|
 t|| g  S qt|dkr|s`|s`|rfttd7 |rvtd8|j1 || n2|r~|du s|r|r|j3|j3ks|durtd9 || n|rtd: || |rtd;|2 j1t|d  ntd< |rtt	 t
|
 |S )=af  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z3Expected object of type bytes or bytearray, got: {}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 S     g | ]}t |d qS Fr   .0cp r5   W/var/www/html/Persson_Maskin/env/lib/python3.10/site-packages/charset_normalizer/api.py
<listcomp>[       zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 S  r/   r0   r1   r2   r5   r5   r6   r7   f   r8   z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z\Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.>   utf_7zREncoding %s won't be tested as-is because detection is unreliable without BOM/SIG.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %sTzW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      zaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sg     j@strict)errorsz^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsr:   r;   )preemptive_declarationz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {},z We detected language {} using {}r   z.Encoding detection: %s is most likely the one.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)4
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerloggingWARNINGr   r   logjoinr    r   r   r   appendsetr   r
   addr   r   ModuleNotFoundErrorImportErrorstrUnicodeDecodeErrorLookupErrorr   rangemaxr   r   decodesumroundr   r   r   r	   r=   bestfingerprint)2r   r   r!   r"   r$   r&   r'   r)   r*   r+   previous_logger_levellengthis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failurefallback_asciifallback_u8fallback_specifiedresultsearly_stop_resultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderesimilar_soft_failure_testencoding_soft_failedr_multi_byte_bonusmax_chunk_gave_upearly_stop_countlazy_str_hard_failure	md_chunks	md_ratioschunkmean_mess_ratiofallback_entrytarget_languages	cd_ratioschunk_languagescd_ratios_mergedcurrent_matchprobable_resultr5   r5   r6   
from_bytes!   s$  












	





&






	

























r   fpr   c
           
      C  s   t |  |||||||||	
S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r   read)
r   r   r!   r"   r$   r&   r'   r)   r*   r+   r5   r5   r6   from_fp!  s   r   pathstr | bytes | PathLikec
                 C  sH   t | d}
t|
|||||||||	
W  d   S 1 sw   Y  dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr   )r   r   r!   r"   r$   r&   r'   r)   r*   r+   r   r5   r5   r6   	from_path?  s   $r   fp_or_path_or_payload!PathLike | str | BinaryIO | bytesc
                 C  s   t | ttfrt| |||||||||	d
}
|
 S t | ttfr0t| |||||||||	d
}
|
 S t| |||||||||	d
}
|
 S )a)  
    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
    )	r   r!   r"   r$   r&   r'   r)   r*   r+   )rH   r_   r   r   rJ   rI   r   r   )r   r   r!   r"   r$   r&   r'   r)   r*   r+   guessesr5   r5   r6   	is_binary^  s\   -r   )	r   r   r   NNTFr   T)r   r   r   r    r!   r    r"   r#   r$   r%   r&   r%   r'   r(   r)   r(   r*   r#   r+   r(   r,   r   )r   r   r   r    r!   r    r"   r#   r$   r%   r&   r%   r'   r(   r)   r(   r*   r#   r+   r(   r,   r   )r   r   r   r    r!   r    r"   r#   r$   r%   r&   r%   r'   r(   r)   r(   r*   r#   r+   r(   r,   r   )	r   r   r   NNTFr   F)r   r   r   r    r!   r    r"   r#   r$   r%   r&   r%   r'   r(   r)   r(   r*   r#   r+   r(   r,   r(   )(
__future__r   rV   osr   typingr   cdr   r   r   r	   constantr
   r   r   r   mdr   modelsr   r   utilsr   r   r   r   r   r   r   	getLoggerrN   StreamHandlerrQ   setFormatter	Formatterr   r   r   r   r5   r5   r5   r6   <module>   st    $

     !