@@ -159,6 +159,8 @@ def from_bytes(
159159
160160 results : CharsetMatches = CharsetMatches ()
161161
162+ early_stop_results : CharsetMatches = CharsetMatches ()
163+
162164 sig_encoding , sig_payload = identify_sig_or_bom (sequences )
163165
164166 if sig_encoding is not None :
@@ -221,16 +223,20 @@ def from_bytes(
221223 try :
222224 if is_too_large_sequence and is_multi_byte_decoder is False :
223225 str (
224- sequences [: int (50e4 )]
225- if strip_sig_or_bom is False
226- else sequences [len (sig_payload ) : int (50e4 )],
226+ (
227+ sequences [: int (50e4 )]
228+ if strip_sig_or_bom is False
229+ else sequences [len (sig_payload ) : int (50e4 )]
230+ ),
227231 encoding = encoding_iana ,
228232 )
229233 else :
230234 decoded_payload = str (
231- sequences
232- if strip_sig_or_bom is False
233- else sequences [len (sig_payload ) :],
235+ (
236+ sequences
237+ if strip_sig_or_bom is False
238+ else sequences [len (sig_payload ) :]
239+ ),
234240 encoding = encoding_iana ,
235241 )
236242 except (UnicodeDecodeError , LookupError ) as e :
@@ -367,7 +373,13 @@ def from_bytes(
367373 and not lazy_str_hard_failure
368374 ):
369375 fallback_entry = CharsetMatch (
370- sequences , encoding_iana , threshold , False , [], decoded_payload
376+ sequences ,
377+ encoding_iana ,
378+ threshold ,
379+ False ,
380+ [],
381+ decoded_payload ,
382+ preemptive_declaration = specified_encoding ,
371383 )
372384 if encoding_iana == specified_encoding :
373385 fallback_specified = fallback_entry
@@ -421,28 +433,58 @@ def from_bytes(
421433 ),
422434 )
423435
424- results .append (
425- CharsetMatch (
426- sequences ,
427- encoding_iana ,
428- mean_mess_ratio ,
429- bom_or_sig_available ,
430- cd_ratios_merged ,
431- decoded_payload ,
432- )
436+ current_match = CharsetMatch (
437+ sequences ,
438+ encoding_iana ,
439+ mean_mess_ratio ,
440+ bom_or_sig_available ,
441+ cd_ratios_merged ,
442+ (
443+ decoded_payload
444+ if (
445+ is_too_large_sequence is False
446+ or encoding_iana in [specified_encoding , "ascii" , "utf_8" ]
447+ )
448+ else None
449+ ),
450+ preemptive_declaration = specified_encoding ,
433451 )
434452
453+ results .append (current_match )
454+
435455 if (
436456 encoding_iana in [specified_encoding , "ascii" , "utf_8" ]
437457 and mean_mess_ratio < 0.1
438458 ):
459+ # If md says nothing to worry about, then... stop immediately!
460+ if mean_mess_ratio == 0.0 :
461+ logger .debug (
462+ "Encoding detection: %s is most likely the one." ,
463+ current_match .encoding ,
464+ )
465+ if explain :
466+ logger .removeHandler (explain_handler )
467+ logger .setLevel (previous_logger_level )
468+ return CharsetMatches ([current_match ])
469+
470+ early_stop_results .append (current_match )
471+
472+ if (
473+ len (early_stop_results )
474+ and (specified_encoding is None or specified_encoding in tested )
475+ and "ascii" in tested
476+ and "utf_8" in tested
477+ ):
478+ probable_result : CharsetMatch = early_stop_results .best () # type: ignore[assignment]
439479 logger .debug (
440- "Encoding detection: %s is most likely the one." , encoding_iana
480+ "Encoding detection: %s is most likely the one." ,
481+ probable_result .encoding ,
441482 )
442483 if explain :
443484 logger .removeHandler (explain_handler )
444485 logger .setLevel (previous_logger_level )
445- return CharsetMatches ([results [encoding_iana ]])
486+
487+ return CharsetMatches ([probable_result ])
446488
447489 if encoding_iana == sig_encoding :
448490 logger .debug (
0 commit comments