@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Look up the name of the Unicode range that contains a single character.

    :param character: a one-character string; ``ord`` raises ``TypeError``
        for anything else.
    :return: the first key of ``UNICODE_RANGES_COMBINED`` whose range holds
        the character's code point, or ``None`` when no range matches.
    """
    code_point = ord(character)  # type: int

    # First matching range wins, in the mapping's own iteration order.
    return next(
        (
            label
            for label, span in UNICODE_RANGES_COMBINED.items()
            if code_point in span
        ),
        None,
    )
def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Search the head of a byte sequence for a declared encoding.

    The first ``search_zone`` bytes are decoded as ASCII (undecodable bytes
    are dropped) and scanned with ``RE_POSSIBLE_ENCODING_INDICATION``. Each
    hit is lower-cased, has ``-`` replaced by ``_``, and is then compared
    against both the keys and the values of ``encodings.aliases.aliases``.

    :param sequence: raw payload; must be a ``bytes`` instance.
    :param search_zone: maximum number of leading bytes to inspect.
    :return: the matching value from the alias table, or ``None`` when
        nothing recognisable was declared.
    :raises TypeError: if ``sequence`` is not ``bytes``.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    head = sequence[: min(len(sequence), search_zone)]
    declared = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        head.decode("ascii", errors="ignore"),
    )  # type: List[str]

    # An empty result list simply skips the loop and falls through to None.
    for raw_label in declared:
        label = raw_label.lower().replace("-", "_")

        for alias, canonical in aliases.items():
            if label in (alias, canonical):
                return canonical

    return None
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Tell whether the encoding with the given name is a multi-byte one.

    The UTF family is answered from a fixed set of names. Any other name is
    resolved by importing ``encodings.<name>`` and checking whether its
    ``IncrementalDecoder`` derives from ``MultibyteIncrementalDecoder``.

    :param name: codec module name as used under the ``encodings`` package
        (e.g. ``"utf_8"``, ``"big5"``).
    :return: ``True`` for a multi-byte encoding, ``False`` otherwise.
    """
    if name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    }:
        return True

    # Import errors for unknown names propagate to the caller unchanged.
    codec_module = importlib.import_module("encodings.{}".format(name))
    return issubclass(
        codec_module.IncrementalDecoder,  # type: ignore
        MultibyteIncrementalDecoder,
    )
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Detect a leading signature / byte-order mark in the given sequence.

    Every entry of ``ENCODING_MARKS`` is tried in iteration order; an entry
    may hold either a single ``bytes`` mark or a list of them.

    :param sequence: raw payload to inspect.
    :return: ``(encoding_name, mark)`` for the first mark the sequence starts
        with, or ``(None, b"")`` when none matches.
    """
    for encoding_name, declared_marks in ENCODING_MARKS.items():
        # Normalise the single-mark form to a one-element list.
        candidates = (
            [declared_marks] if isinstance(declared_marks, bytes) else declared_marks
        )  # type: List[bytes]

        for candidate in candidates:
            if sequence.startswith(candidate):
                return encoding_name, candidate

    return None, b""
def iana_name(cp_name: str, strict: bool = True) -> str:
    """
    Resolve an encoding label to the name used by ``encodings.aliases``.

    The label is lower-cased and ``-`` is replaced by ``_`` before it is
    compared with both the keys and the values of the alias table.

    :param cp_name: encoding label to resolve (e.g. ``"UTF-8"``).
    :param strict: when true, an unknown label raises; when false, the
        normalised label is returned as-is.
    :return: the alias table value for the first matching entry, or the
        normalised label if nothing matched and ``strict`` is false.
    :raises ValueError: if nothing matched and ``strict`` is true.
    """
    normalized = cp_name.lower().replace("-", "_")

    for alias, canonical in aliases.items():
        if normalized == alias or normalized == canonical:
            return canonical

    if not strict:
        return normalized

    raise ValueError("Unable to retrieve IANA for '{}'".format(normalized))
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Tell whether two code pages are listed as similar.

    The answer is a lookup in ``IANA_SUPPORTED_SIMILAR``, which is documented
    as having been generated with ``cp_similarity`` using an 80% similarity
    threshold.

    :param iana_name_a: name used as the key into ``IANA_SUPPORTED_SIMILAR``.
    :param iana_name_b: name searched for in the entry of ``iana_name_a``.
    :return: ``True`` only if ``iana_name_a`` has an entry and that entry
        contains ``iana_name_b``.
    """
    if iana_name_a not in IANA_SUPPORTED_SIMILAR:
        return False

    return iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]