1+ from __future__ import annotations
2+
13import logging
24from os import PathLike
3- from typing import BinaryIO , List , Optional , Set , Union
5+ from typing import BinaryIO
46
57from .cd import (
68 coherence_ratio ,
2123 should_strip_sig_or_bom ,
2224)
2325
24- # Will most likely be controversial
25- # logging.addLevelName(TRACE, "TRACE")
2626logger = logging .getLogger ("charset_normalizer" )
2727explain_handler = logging .StreamHandler ()
2828explain_handler .setFormatter (
3131
3232
3333def from_bytes (
34- sequences : Union [ bytes , bytearray ] ,
34+ sequences : bytes | bytearray ,
3535 steps : int = 5 ,
3636 chunk_size : int = 512 ,
3737 threshold : float = 0.2 ,
38- cp_isolation : Optional [ List [ str ]] = None ,
39- cp_exclusion : Optional [ List [ str ]] = None ,
38+ cp_isolation : list [ str ] | None = None ,
39+ cp_exclusion : list [ str ] | None = None ,
4040 preemptive_behaviour : bool = True ,
4141 explain : bool = False ,
4242 language_threshold : float = 0.1 ,
@@ -62,7 +62,7 @@ def from_bytes(
6262
6363 if not isinstance (sequences , (bytearray , bytes )):
6464 raise TypeError (
65- "Expected object of type bytes or bytearray, got: {0 }" .format (
65+ "Expected object of type bytes or bytearray, got: {}" .format (
6666 type (sequences )
6767 )
6868 )
@@ -76,7 +76,7 @@ def from_bytes(
7676
7777 if length == 0 :
7878 logger .debug ("Encoding detection on empty bytes, assuming utf_8 intention." )
79- if explain :
79+ if explain : # Defensive: ensure exit path clean handler
8080 logger .removeHandler (explain_handler )
8181 logger .setLevel (previous_logger_level or logging .WARNING )
8282 return CharsetMatches ([CharsetMatch (sequences , "utf_8" , 0.0 , False , [], "" )])
@@ -135,9 +135,9 @@ def from_bytes(
135135 ),
136136 )
137137
138- prioritized_encodings : List [str ] = []
138+ prioritized_encodings : list [str ] = []
139139
140- specified_encoding : Optional [ str ] = (
140+ specified_encoding : str | None = (
141141 any_specified_encoding (sequences ) if preemptive_behaviour else None
142142 )
143143
@@ -149,16 +149,18 @@ def from_bytes(
149149 specified_encoding ,
150150 )
151151
152- tested : Set [str ] = set ()
153- tested_but_hard_failure : List [str ] = []
154- tested_but_soft_failure : List [str ] = []
152+ tested : set [str ] = set ()
153+ tested_but_hard_failure : list [str ] = []
154+ tested_but_soft_failure : list [str ] = []
155155
156- fallback_ascii : Optional [ CharsetMatch ] = None
157- fallback_u8 : Optional [ CharsetMatch ] = None
158- fallback_specified : Optional [ CharsetMatch ] = None
156+ fallback_ascii : CharsetMatch | None = None
157+ fallback_u8 : CharsetMatch | None = None
158+ fallback_specified : CharsetMatch | None = None
159159
160160 results : CharsetMatches = CharsetMatches ()
161161
162+ early_stop_results : CharsetMatches = CharsetMatches ()
163+
162164 sig_encoding , sig_payload = identify_sig_or_bom (sequences )
163165
164166 if sig_encoding is not None :
@@ -187,7 +189,7 @@ def from_bytes(
187189
188190 tested .add (encoding_iana )
189191
190- decoded_payload : Optional [ str ] = None
192+ decoded_payload : str | None = None
191193 bom_or_sig_available : bool = sig_encoding == encoding_iana
192194 strip_sig_or_bom : bool = bom_or_sig_available and should_strip_sig_or_bom (
193195 encoding_iana
@@ -221,16 +223,20 @@ def from_bytes(
221223 try :
222224 if is_too_large_sequence and is_multi_byte_decoder is False :
223225 str (
224- sequences [: int (50e4 )]
225- if strip_sig_or_bom is False
226- else sequences [len (sig_payload ) : int (50e4 )],
226+ (
227+ sequences [: int (50e4 )]
228+ if strip_sig_or_bom is False
229+ else sequences [len (sig_payload ) : int (50e4 )]
230+ ),
227231 encoding = encoding_iana ,
228232 )
229233 else :
230234 decoded_payload = str (
231- sequences
232- if strip_sig_or_bom is False
233- else sequences [len (sig_payload ) :],
235+ (
236+ sequences
237+ if strip_sig_or_bom is False
238+ else sequences [len (sig_payload ) :]
239+ ),
234240 encoding = encoding_iana ,
235241 )
236242 except (UnicodeDecodeError , LookupError ) as e :
@@ -286,7 +292,7 @@ def from_bytes(
286292 early_stop_count : int = 0
287293 lazy_str_hard_failure = False
288294
289- md_chunks : List [str ] = []
295+ md_chunks : list [str ] = []
290296 md_ratios = []
291297
292298 try :
@@ -367,7 +373,13 @@ def from_bytes(
367373 and not lazy_str_hard_failure
368374 ):
369375 fallback_entry = CharsetMatch (
370- sequences , encoding_iana , threshold , False , [], decoded_payload
376+ sequences ,
377+ encoding_iana ,
378+ threshold ,
379+ False ,
380+ [],
381+ decoded_payload ,
382+ preemptive_declaration = specified_encoding ,
371383 )
372384 if encoding_iana == specified_encoding :
373385 fallback_specified = fallback_entry
@@ -385,7 +397,7 @@ def from_bytes(
385397 )
386398
387399 if not is_multi_byte_decoder :
388- target_languages : List [str ] = encoding_languages (encoding_iana )
400+ target_languages : list [str ] = encoding_languages (encoding_iana )
389401 else :
390402 target_languages = mb_encoding_languages (encoding_iana )
391403
@@ -421,36 +433,66 @@ def from_bytes(
421433 ),
422434 )
423435
424- results .append (
425- CharsetMatch (
426- sequences ,
427- encoding_iana ,
428- mean_mess_ratio ,
429- bom_or_sig_available ,
430- cd_ratios_merged ,
431- decoded_payload ,
432- )
436+ current_match = CharsetMatch (
437+ sequences ,
438+ encoding_iana ,
439+ mean_mess_ratio ,
440+ bom_or_sig_available ,
441+ cd_ratios_merged ,
442+ (
443+ decoded_payload
444+ if (
445+ is_too_large_sequence is False
446+ or encoding_iana in [specified_encoding , "ascii" , "utf_8" ]
447+ )
448+ else None
449+ ),
450+ preemptive_declaration = specified_encoding ,
433451 )
434452
453+ results .append (current_match )
454+
435455 if (
436456 encoding_iana in [specified_encoding , "ascii" , "utf_8" ]
437457 and mean_mess_ratio < 0.1
438458 ):
459+ # If the mean mess ratio is exactly 0.0 there is nothing to worry about: stop immediately.
460+ if mean_mess_ratio == 0.0 :
461+ logger .debug (
462+ "Encoding detection: %s is most likely the one." ,
463+ current_match .encoding ,
464+ )
465+ if explain : # Defensive: ensure exit path clean handler
466+ logger .removeHandler (explain_handler )
467+ logger .setLevel (previous_logger_level )
468+ return CharsetMatches ([current_match ])
469+
470+ early_stop_results .append (current_match )
471+
472+ if (
473+ len (early_stop_results )
474+ and (specified_encoding is None or specified_encoding in tested )
475+ and "ascii" in tested
476+ and "utf_8" in tested
477+ ):
478+ probable_result : CharsetMatch = early_stop_results .best () # type: ignore[assignment]
439479 logger .debug (
440- "Encoding detection: %s is most likely the one." , encoding_iana
480+ "Encoding detection: %s is most likely the one." ,
481+ probable_result .encoding ,
441482 )
442- if explain :
483+ if explain : # Defensive: ensure exit path clean handler
443484 logger .removeHandler (explain_handler )
444485 logger .setLevel (previous_logger_level )
445- return CharsetMatches ([results [encoding_iana ]])
486+
487+ return CharsetMatches ([probable_result ])
446488
447489 if encoding_iana == sig_encoding :
448490 logger .debug (
449491 "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
450492 "the beginning of the sequence." ,
451493 encoding_iana ,
452494 )
453- if explain :
495+ if explain : # Defensive: ensure exit path clean handler
454496 logger .removeHandler (explain_handler )
455497 logger .setLevel (previous_logger_level )
456498 return CharsetMatches ([results [encoding_iana ]])
@@ -504,8 +546,8 @@ def from_fp(
504546 steps : int = 5 ,
505547 chunk_size : int = 512 ,
506548 threshold : float = 0.20 ,
507- cp_isolation : Optional [ List [ str ]] = None ,
508- cp_exclusion : Optional [ List [ str ]] = None ,
549+ cp_isolation : list [ str ] | None = None ,
550+ cp_exclusion : list [ str ] | None = None ,
509551 preemptive_behaviour : bool = True ,
510552 explain : bool = False ,
511553 language_threshold : float = 0.1 ,
@@ -530,12 +572,12 @@ def from_fp(
530572
531573
532574def from_path (
533- path : Union [ str , bytes , PathLike ] , # type: ignore[type-arg]
575+ path : str | bytes | PathLike , # type: ignore[type-arg]
534576 steps : int = 5 ,
535577 chunk_size : int = 512 ,
536578 threshold : float = 0.20 ,
537- cp_isolation : Optional [ List [ str ]] = None ,
538- cp_exclusion : Optional [ List [ str ]] = None ,
579+ cp_isolation : list [ str ] | None = None ,
580+ cp_exclusion : list [ str ] | None = None ,
539581 preemptive_behaviour : bool = True ,
540582 explain : bool = False ,
541583 language_threshold : float = 0.1 ,
@@ -561,12 +603,12 @@ def from_path(
561603
562604
563605def is_binary (
564- fp_or_path_or_payload : Union [ PathLike , str , BinaryIO , bytes ] , # type: ignore[type-arg]
606+ fp_or_path_or_payload : PathLike | str | BinaryIO | bytes , # type: ignore[type-arg]
565607 steps : int = 5 ,
566608 chunk_size : int = 512 ,
567609 threshold : float = 0.20 ,
568- cp_isolation : Optional [ List [ str ]] = None ,
569- cp_exclusion : Optional [ List [ str ]] = None ,
610+ cp_isolation : list [ str ] | None = None ,
611+ cp_exclusion : list [ str ] | None = None ,
570612 preemptive_behaviour : bool = True ,
571613 explain : bool = False ,
572614 language_threshold : float = 0.1 ,
0 commit comments