class CharsetMatch:
    """
    One candidate decoding of a bytes payload.

    Holds the raw payload, the guessed encoding, the mean mess ("chaos") ratio,
    whether a signature/BOM was seen, and the detected languages as a sequence
    of ``(language, coherence_ratio)`` pairs. The decoded text is produced
    lazily by ``str(match)`` unless it was supplied at construction time.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Name of the encoding used to decode ``payload``.
        :param mean_mess_ratio: Mess ratio, exposed through ``chaos``.
        :param has_sig_or_bom: True when the payload carries a signature or BOM.
        :param languages: ``(language, ratio)`` pairs; the first one is used
            for ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text, if available, so that
            ``str(match)`` does not have to decode again.
        """
        self._payload = payload  # type: bytes

        self._encoding = guessed_encoding  # type: str
        self._mean_mess_ratio = mean_mess_ratio  # type: float
        self._languages = languages  # type: CoherenceMatches
        self._has_sig_or_bom = has_sig_or_bom  # type: bool
        self._unicode_ranges = None  # type: Optional[List[str]]

        # Matches whose decoded output is identical to this one (see add_submatch).
        self._leaves = []  # type: List[CharsetMatch]
        self._mean_coherence_ratio = 0.0  # type: float

        # Cache of the last re-encoded payload and the encoding it was made with.
        self._output_payload = None  # type: Optional[bytes]
        self._output_encoding = None  # type: Optional[str]

        self._string = decoded_payload  # type: Optional[str]

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both the encoding and the fingerprint agree.

        :raises TypeError: When ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        "Lower" means "more probable".

        :raises ValueError: When ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference = abs(self.chaos - other.chaos)  # type: float
        coherence_difference = abs(self.coherence - other.coherence)  # type: float

        # When having a tough decision (exact tie on both metrics), use the result
        # that decoded as many multi-byte as possible. This check must come before
        # the coherence rule below: an exact coherence tie can never satisfy
        # "coherence_difference > 0.02".
        if (
            chaos_difference == 0.0
            and self.coherence == other.coherence
            and self._decoding_is_affordable()
            and other._decoding_is_affordable()
        ):
            return self.multi_byte_usage > other.multi_byte_usage

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence

        return self.chaos < other.chaos

    def _decoding_is_affordable(self) -> bool:
        """
        Tell whether len(str(self)) can be obtained without decoding a heavy payload.
        True when the text is already decoded or the payload is not too big.
        """
        return self._string is not None or len(self._payload) <= TOO_BIG_SEQUENCE

    @property
    def multi_byte_usage(self) -> float:
        """
        Share of the payload that was consumed by multi-byte characters:
        1.0 - (decoded characters / raw bytes). An empty payload yields 0.0.
        """
        if not self._payload:
            # Nothing to measure; also avoids a division by zero.
            return 0.0
        return 1.0 - len(str(self)) / len(self.raw)

    @property
    def chaos_secondary_pass(self) -> float:
        """
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "chaos_secondary_pass is deprecated and will be removed in 3.0",
            DeprecationWarning,
        )
        return mess_ratio(str(self), 1.0)

    @property
    def coherence_non_latin(self) -> float:
        """
        Coherence ratio on the first non-latin language detected if ANY.
        Always returns 0.0 nowadays.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "coherence_non_latin is deprecated and will be removed in 3.0",
            DeprecationWarning,
        )
        return 0.0

    @property
    def w_counter(self) -> Counter:
        """
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        """
        warnings.warn(
            "w_counter is deprecated and will be removed in 3.0", DeprecationWarning
        )

        string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())

        return Counter(string_printable_only.split())

    def __str__(self) -> str:
        """Return the decoded payload, decoding it (strictly) on first use."""
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register ``other`` as an alternative encoding for this match.

        :raises ValueError: When ``other`` is not a CharsetMatch or equals ``self``.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as = []  # type: List[str]
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Coherence ratio of the first language, or 0.0 when none was detected."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """Sorted, de-duplicated unicode range names of the decoded text (cached)."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges = [
            unicode_range(char) for char in str(self)
        ]  # type: List[Optional[str]]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def first(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def best(self) -> "CharsetMatch":
        """
        Kept for BC reasons. Will be removed in 3.0.
        """
        return self

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be represented in the target encoding are REPLACED by the
        encoder (error handler "replace"), they are neither dropped nor raise an exception.
        The result is cached until a different target encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        The payload is re-encoded with the default target of output(), i.e. UTF-8.
        """
        return sha256(self.output()).hexdigest()
class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches; they are sorted on the way in.
            ``None`` or an empty list gives an empty container.
        """
        self._results = sorted(results) if results else []  # type: List[CharsetMatch]

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: When an integer position is out of range.
        :raises KeyError: When no match could originate from the requested encoding.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
        for result in self._results:
            if item in result.could_be_from_charset:
                return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: When ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if self._results and len(item.raw) <= TOO_BIG_SEQUENCE:
            # The fingerprint re-encodes and hashes the payload on every access:
            # compute it once for the new item instead of once per stored match.
            item_chaos = item.chaos
            item_fingerprint = item.fingerprint
            for match in self._results:
                # Cheap float comparison first; only hash the stored match when needed.
                if match.chaos == item_chaos and match.fingerprint == item_fingerprint:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match. Strict equivalent to matches[0],
        except that an empty container gives None instead of raising.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()