@@ -203,6 +203,7 @@ class LangDetectConfig:
     :param proxy: HTTP proxy for downloads
     :param allow_fallback: Whether to fallback to small model
     :param disable_verify: Whether to disable MD5 verification
+    :param normalize_input: Whether to normalize input text (e.g. lowercase for uppercase text)
     """

     def __init__(
@@ -213,6 +214,7 @@ def __init__(
         allow_fallback: bool = True,
         disable_verify: bool = False,
         verify_hash: Optional[str] = None,
+        normalize_input: bool = False,
     ):
         self.cache_dir = cache_dir or CACHE_DIRECTORY
         self.custom_model_path = custom_model_path
@@ -221,9 +223,11 @@ def __init__(
         # Only verify large model
         self.disable_verify = disable_verify
         self.verify_hash = verify_hash
+        self.normalize_input = normalize_input
         if self.custom_model_path and not Path(self.custom_model_path).exists():
             raise FileNotFoundError(f"fast-langdetect: Target model file not found: {self.custom_model_path}")

+
 class LangDetector:
     """Language detector using FastText models."""
     VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
@@ -286,8 +290,9 @@ def detect(
             DetectError: If detection fails
         """
         model = self._get_model(low_memory)
+        normalized_text = _normalize_text(text, self.config.normalize_input)
         try:
-            labels, scores = model.predict(text)
+            labels, scores = model.predict(normalized_text)
             return {
                 "lang": labels[0].replace("__label__", ""),
                 "score": min(float(scores[0]), 1.0),
@@ -317,8 +322,9 @@ def detect_multilingual(
             DetectError: If detection fails
         """
         model = self._get_model(low_memory)
+        normalized_text = _normalize_text(text, self.config.normalize_input)
         try:
-            labels, scores = model.predict(text, k=k, threshold=threshold)
+            labels, scores = model.predict(normalized_text, k=k, threshold=threshold)
             results = [
                 {
                     "lang": label.replace("__label__", ""),
@@ -336,12 +342,37 @@ def detect_multilingual(
 _default_detector = LangDetector()


+def _normalize_text(text: str, should_normalize: bool = False) -> str:
+    """
+    Normalize text based on configuration.
+
+    Currently handles:
+    - Lowercasing uppercase text to prevent misdetection as Japanese
+
+    :param text: Input text
+    :param should_normalize: Whether normalization should be applied
+    :return: Normalized text
+    """
+    if not should_normalize:
+        return text
+
+    # Check if text is all uppercase (or mostly uppercase)
+    if text.isupper() or (
+        len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
+        and len(text) > 5
+    ):
+        return text.lower()
+
+    return text
+
+
 def detect(
     text: str,
     *,
     low_memory: bool = True,
     model_download_proxy: Optional[str] = None,
     use_strict_mode: bool = False,
+    normalize_input: bool = True,
 ) -> Dict[str, Union[str, float]]:
     """
     Simple interface for language detection.
@@ -354,6 +385,7 @@ def detect(
     :param low_memory: Whether to use memory-efficient model
     :param model_download_proxy: Optional proxy for model download
     :param use_strict_mode: Disable fallback to small model
+    :param normalize_input: Whether to normalize input text (lowercase all-uppercase text)

     :return: Dictionary with language and confidence score
     """
@@ -362,9 +394,11 @@ def detect(
             "fast-langdetect: Text contains newline characters or is too long. "
             "You should only pass a single sentence for accurate prediction."
         )
-    if model_download_proxy or use_strict_mode:
+    if model_download_proxy or use_strict_mode or normalize_input:
         config = LangDetectConfig(
-            proxy=model_download_proxy, allow_fallback=not use_strict_mode
+            proxy=model_download_proxy,
+            allow_fallback=not use_strict_mode,
+            normalize_input=normalize_input
         )
         detector = LangDetector(config)
         return detector.detect(text, low_memory=low_memory)
@@ -379,6 +413,7 @@ def detect_multilingual(
     k: int = 5,
     threshold: float = 0.0,
     use_strict_mode: bool = False,
+    normalize_input: bool = True,
 ) -> List[Dict[str, Any]]:
     """
     Simple interface for multi-language detection.
@@ -393,6 +428,7 @@ def detect_multilingual(
     :param k: Number of top languages to return
     :param threshold: Minimum confidence threshold
     :param use_strict_mode: Disable fallback to small model
+    :param normalize_input: Whether to normalize input text (lowercase all-uppercase text)

     :return: List of dictionaries with languages and scores
     """
@@ -401,9 +437,11 @@ def detect_multilingual(
             "fast-langdetect: Text contains newline characters or is too long. "
            "You should only pass a single sentence for accurate prediction."
         )
-    if model_download_proxy or use_strict_mode:
+    if model_download_proxy or use_strict_mode or normalize_input:
         config = LangDetectConfig(
-            proxy=model_download_proxy, allow_fallback=not use_strict_mode
+            proxy=model_download_proxy,
+            allow_fallback=not use_strict_mode,
+            normalize_input=normalize_input
         )
         detector = LangDetector(config)
         return detector.detect_multilingual(
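A minimal usage sketch of the normalize_input flag introduced by this change. It assumes the package's public import path (fast_langdetect) and uses only the function names and parameters visible in the diff above; the printed results are illustrative expectations, not verified output.

from fast_langdetect import detect, detect_multilingual

# All-uppercase Latin text is the case this change targets: per the
# _normalize_text docstring, such text can otherwise be misdetected
# (e.g. as Japanese). With normalize_input=True the text is lowercased
# internally before being passed to model.predict().
print(detect("NOTICE: THE MEETING IS CANCELLED", normalize_input=True))
# expected roughly: {'lang': 'en', 'score': ...}

# Opting out passes the raw text through unchanged.
print(detect("NOTICE: THE MEETING IS CANCELLED", normalize_input=False))

# detect_multilingual() gains the same flag and forwards it to
# LangDetectConfig in the same way as detect().
print(detect_multilingual("Hello, world!", k=3, normalize_input=True))

Note on defaults as shown in the diff: normalize_input defaults to True in the module-level detect() and detect_multilingual() helpers but to False on LangDetectConfig, so the simple interface normalizes by default while a directly constructed LangDetector does not unless its config enables it.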