[docs]classMessDetectorPlugin:""" Base abstract class used for mess detection plugins. All detectors MUST extend and implement given methods. """
[docs]defeligible(self,character:str)->bool:""" Determine if given character should be fed in. """raiseNotImplementedError# pragma: nocover
[docs]deffeed(self,character:str)->None:""" The main routine to be executed upon character. Insert the logic in witch the text would be considered chaotic. """raiseNotImplementedError# pragma: nocover
[docs]defreset(self)->None:# pragma: no cover""" Permit to reset the plugin to the initial state. """raiseNotImplementedError
@propertydefratio(self)->float:""" Compute the chaos ratio based on what your feed() has seen. Must NOT be lower than 0.; No restriction gt 0. """raiseNotImplementedError# pragma: nocover
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag content where punctuation/symbols make up a large share of the
    printable characters (>= 30 %). Consecutive repeats of the same character
    are counted once.
    """

    def __init__(self) -> None:
        self._punctuation_count = 0  # type: int
        self._symbol_count = 0  # type: int
        self._character_count = 0  # type: int

        # Last printable char seen; used to skip immediate repeats.
        self._last_printable_char = None  # type: Optional[str]
        self._frenzy_symbol_in_word = False  # type: bool

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # Symbols weigh double compared to punctuation.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        # BUGFIX: stale dedup state must not survive a reset, otherwise the
        # first character fed after reset() may be wrongly skipped as a repeat.
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count  # type: float

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0


class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag content where accentuated letters make up >= 35 % of the alphabetic
    characters.
    """

    def __init__(self) -> None:
        self._character_count = 0  # type: int
        self._accentuated_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return character.isalpha()

    def feed(self, character: str) -> None:
        self._character_count += 1
        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._accentuated_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0
        ratio_of_accentuation = (
            self._accentuated_count / self._character_count
        )  # type: float
        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0


class UnprintablePlugin(MessDetectorPlugin):
    """
    Flag unprintable (non-whitespace) characters; each occurrence weighs
    heavily (x8) in the ratio.
    """

    def __init__(self) -> None:
        self._unprintable_count = 0  # type: int
        self._character_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if (
            character.isspace() is False  # includes \n \t \r \v
            and character.isprintable() is False
            and character != "\x1A"  # Why? Its the ASCII substitute character.
        ):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._unprintable_count = 0
        # BUGFIX: the denominator must be reset too, otherwise the ratio is
        # diluted by characters fed before the reset.
        self._character_count = 0

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count


class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Flag successive accentuated Latin letters, which are rare in natural text.
    """

    def __init__(self) -> None:
        self._successive_count = 0  # type: int
        self._character_count = 0  # type: int
        self._last_latin_character = None  # type: Optional[str]

    def eligible(self, character: str) -> bool:
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        self._character_count += 1
        if (
            self._last_latin_character is not None
            and is_accentuated(character)
            and is_accentuated(self._last_latin_character)
        ):
            if character.isupper() and self._last_latin_character.isupper():
                self._successive_count += 1
            # Worse if its the same char duplicated with different accent.
            if remove_accent(character) == remove_accent(self._last_latin_character):
                self._successive_count += 1
        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        self._successive_count = 0
        self._character_count = 0
        self._last_latin_character = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return (self._successive_count * 2) / self._character_count


class SuspiciousRange(MessDetectorPlugin):
    """
    Flag adjacent printable characters belonging to Unicode ranges that are
    suspicious when seen next to each other (see
    is_suspiciously_successive_range).
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count = 0  # type: int
        self._character_count = 0  # type: int
        self._last_printable_seen = None  # type: Optional[str]

    def eligible(self, character: str) -> bool:
        return character.isprintable()

    def feed(self, character: str) -> None:
        self._character_count += 1

        if (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        ):
            # Separators break the pair: do not compare across them.
            self._last_printable_seen = None
            return

        if self._last_printable_seen is None:
            self._last_printable_seen = character
            return

        unicode_range_a = unicode_range(
            self._last_printable_seen
        )  # type: Optional[str]
        unicode_range_b = unicode_range(character)  # type: Optional[str]

        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._suspicious_successive_range_count = 0
        self._last_printable_seen = None

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        ratio_of_suspicious_range_usage = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count  # type: float

        if ratio_of_suspicious_range_usage < 0.1:
            return 0.0

        return ratio_of_suspicious_range_usage


class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Buffer alphabetic runs into "words" and flag those that look garbled:
    heavily accentuated, very long non-Latin-looking runs, or words mixed
    with stray symbols.
    """

    def __init__(self) -> None:
        self._word_count = 0  # type: int
        self._bad_word_count = 0  # type: int
        self._foreign_long_count = 0  # type: int

        self._is_current_word_bad = False  # type: bool
        # Set while the current word contains chars worth length-watching.
        self._foreign_long_watch = False  # type: bool

        self._character_count = 0  # type: int
        self._bad_character_count = 0  # type: int

        self._buffer = ""  # type: str
        self._buffer_accent_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer = "".join([self._buffer, character])
            if is_accentuated(character):
                self._buffer_accent_count += 1
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # Word boundary reached: judge the buffered word, then flush.
            self._word_count += 1
            buffer_length = len(self._buffer)  # type: int

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with a upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
            if buffer_length >= 24 and self._foreign_long_watch:
                self._foreign_long_count += 1
                self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol glued inside a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # Too small a sample unless a long foreign-looking run was seen.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count


class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB(Chinese) based encoding often render the stop incorrectly when the
    content does not fit and can be easily detected. Searching for the overuse
    of '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count = 0  # type: int
        self._cjk_character_count = 0  # type: int

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
            return
        if is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        self._wrong_stop_count = 0
        self._cjk_character_count = 0

    @property
    def ratio(self) -> float:
        # Need a minimal CJK sample before judging.
        if self._cjk_character_count < 16:
            return 0.0
        return self._wrong_stop_count / self._cjk_character_count


class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Flag unnatural uPpEr/LoWeR case alternation inside non-ASCII chunks of
    reasonable length (<= 64 chars between separators).
    """

    def __init__(self) -> None:
        self._buf = False  # type: bool

        self._character_count_since_last_sep = 0  # type: int

        self._successive_upper_lower_count = 0  # type: int
        self._successive_upper_lower_count_final = 0  # type: int

        self._character_count = 0  # type: int

        self._last_alpha_seen = None  # type: Optional[str]
        self._current_ascii_only = True  # type: bool

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Chunk ended: commit the alternation count only for short,
            # non-ASCII-only chunks, then reset per-chunk state.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and is_ascii(character) is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    # Second alternation in a row: count the pair.
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    """
    # An unresolvable range next to anything is always suspicious.
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    def either_contains(token: str) -> bool:
        # True when at least one of the two range names mentions *token*.
        return token in unicode_range_a or token in unicode_range_b

    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
        return False

    if either_contains("Emoticons"):
        return False

    # Latin characters can be accompanied with a combining diacritical mark
    # eg. Vietnamese.
    if either_contains("Latin") and either_contains("Combining"):
        return False

    # Ranges sharing a significant (non-secondary) keyword are compatible.
    tokens_a = unicode_range_a.split(" ")
    tokens_b = unicode_range_b.split(" ")

    for token in tokens_a:
        if token in UNICODE_SECONDARY_RANGE_KEYWORD:
            continue
        if token in tokens_b:
            return False

    # Japanese Exception
    a_is_kana = unicode_range_a in ("Hiragana", "Katakana")
    b_is_kana = unicode_range_b in ("Hiragana", "Katakana")

    if (a_is_kana or b_is_kana) and either_contains("CJK"):
        return False
    if a_is_kana and b_is_kana:
        return False

    if either_contains("Hangul"):
        if either_contains("CJK"):
            return False
        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
            return False

    # Chinese/Japanese use dedicated range for punctuation and/or separators.
    if either_contains("CJK") or (a_is_kana and b_is_kana):
        if either_contains("Punctuation"):
            return False
        if either_contains("Forms"):
            return False

    return True
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.

    :param decoded_sequence: Text to be analyzed (results are lru-cached).
    :param maximum_threshold: Stop feeding detectors once the summed ratio reaches this value.
    :param debug: Print each detector's class and ratio before returning.
    :return: Sum of all detector ratios at the last checkpoint, rounded to 3 decimals.
    """
    detectors = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]  # type: List[MessDetectorPlugin]

    length = len(decoded_sequence) + 1  # type: int

    mean_mess_ratio = 0.0  # type: float

    # Check the accumulated ratios more often on short payloads, less often
    # on long ones, so the early-exit threshold can trigger cheaply.
    if length < 512:
        intermediary_mean_mess_ratio_calc = 32  # type: int
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    # The trailing "\n" guarantees buffer-based detectors flush their last word.
    # (enumerate replaces the original zip(seq + "\n", range(length)) — same
    # pairs, since length == len(decoded_sequence + "\n").)
    for index, character in enumerate(decoded_sequence + "\n"):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)
            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        for dt in detectors:  # pragma: nocover
            print(dt.__class__, dt.ratio)

    return round(mean_mess_ratio, 3)