@@ -214,7 +214,7 @@ def __init__(
         allow_fallback: bool = True,
         disable_verify: bool = False,
         verify_hash: Optional[str] = None,
-        normalize_input: bool = False,
+        normalize_input: bool = True,
     ):
         self.cache_dir = cache_dir or CACHE_DIRECTORY
         self.custom_model_path = custom_model_path
@@ -242,6 +242,35 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
         self.config = config or LangDetectConfig()
         self._model_loader = ModelLoader()
 
+    def _normalize_text(self, text: str, should_normalize: bool = False) -> str:
+        """
+        Normalize text based on configuration.
+
+        Currently handles:
+        - Removing newline characters for better prediction
+        - Lowercasing uppercase text to prevent misdetection as Japanese
+
+        :param text: Input text
+        :param should_normalize: Whether normalization should be applied
+        :return: Normalized text
+        """
+        # If normalization is not needed, return the text unchanged
+        if not should_normalize:
+            return text
+
+        # Replace newline characters with spaces for better prediction
+        if "\n" in text:
+            text = text.replace("\n", " ")
+
+        # Check if text is all uppercase or mostly uppercase
+        if text.isupper() or (
+            len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
+            and len(text) > 5
+        ):
+            return text.lower()
+
+        return text
+
     def _get_model(self, low_memory: bool = True) -> Any:
         """Get or load appropriate model."""
         cache_key = "low_memory" if low_memory else "high_memory"
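Reviewer note: a rough, hypothetical sketch of what the new normalization does; it assumes the package exports LangDetector and LangDetectConfig as used elsewhere in this diff, and is not part of the commit itself.

# Hypothetical usage sketch (not part of this commit).
from fast_langdetect import LangDetector, LangDetectConfig  # assumed public exports

detector = LangDetector(LangDetectConfig(normalize_input=True))

# Mostly-uppercase Latin text is lowercased to avoid misdetection as Japanese.
print(detector._normalize_text("NOT GONNA GET US", should_normalize=True))   # not gonna get us

# Newline characters are replaced with spaces before prediction.
print(detector._normalize_text("first line\nsecond line", should_normalize=True))  # first line second line

# With should_normalize=False the text is returned unchanged.
print(detector._normalize_text("NOT GONNA GET US", should_normalize=False))  # NOT GONNA GET US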
@@ -290,7 +319,18 @@ def detect(
             DetectError: If detection fails
         """
         model = self._get_model(low_memory)
-        normalized_text = _normalize_text(text, self.config.normalize_input)
+        normalized_text = self._normalize_text(text, self.config.normalize_input)
+        if len(normalized_text) > 100:
+            logger.warning(
+                "fast-langdetect: Text is too long. "
+                "Consider passing only a single sentence for accurate prediction."
+            )
+        if "\n" in normalized_text:
+            logger.warning(
+                "fast-langdetect: Text contains newline characters. "
+                "Removing newlines for better prediction accuracy."
+            )
+            normalized_text = normalized_text.replace("\n", " ")
         try:
             labels, scores = model.predict(normalized_text)
             return {
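Reviewer note: a short, hypothetical illustration of the new guard rails in LangDetector.detect() (values and output are illustrative, not from the commit):

# Hypothetical sketch (not part of this commit).
from fast_langdetect import LangDetector, LangDetectConfig  # assumed public exports

detector = LangDetector(LangDetectConfig(normalize_input=True))

# Callers no longer need to strip newlines themselves; they are replaced with spaces,
# and a warning is logged when the text exceeds 100 characters.
result = detector.detect("Hello\nworld")
print(result)  # e.g. {"lang": "en", "score": 0.98} (illustrative output)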
@@ -322,7 +362,7 @@ def detect_multilingual(
             DetectError: If detection fails
         """
         model = self._get_model(low_memory)
-        normalized_text = _normalize_text(text, self.config.normalize_input)
+        normalized_text = self._normalize_text(text, self.config.normalize_input)
         try:
             labels, scores = model.predict(normalized_text, k=k, threshold=threshold)
             results = [
@@ -342,66 +382,52 @@ def detect_multilingual(
 _default_detector = LangDetector()
 
 
-def _normalize_text(text: str, should_normalize: bool = False) -> str:
-    """
-    Normalize text based on configuration.
-
-    Currently handles:
-    - Lowercasing uppercase text to prevent misdetection as Japanese
-
-    :param text: Input text
-    :param should_normalize: Whether normalization should be applied
-    :return: Normalized text
-    """
-    if not should_normalize:
-        return text
-
-    # Check if text is all uppercase (or mostly uppercase)
-    if text.isupper() or (
-        len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
-        and len(text) > 5
-    ):
-        return text.lower()
-
-    return text
-
-
 def detect(
     text: str,
     *,
     low_memory: bool = True,
     model_download_proxy: Optional[str] = None,
     use_strict_mode: bool = False,
-    normalize_input: bool = True,
+    config: Optional[LangDetectConfig] = None,
 ) -> Dict[str, Union[str, float]]:
     """
     Simple interface for language detection.
-
-    Before passing a text to this function, you remove all the newline characters.
-
+
     Too long or too short text will affect the accuracy of the prediction.
 
     :param text: Input text without newline characters
     :param low_memory: Whether to use memory-efficient model
-    :param model_download_proxy: Optional proxy for model download
-    :param use_strict_mode: Disable fallback to small model
-    :param normalize_input: Whether to normalize input text (lowercase all-uppercase text)
+    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
+    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
+    :param config: Optional LangDetectConfig object for advanced configuration
 
     :return: Dictionary with language and confidence score
     """
-    if "\n" in text or len(text) > 1000:
+    # Provide config
+    if config is not None:
+        detector = LangDetector(config)
+        return detector.detect(text, low_memory=low_memory)
+
+    # Check if any custom parameters are provided
+    has_custom_params = any([
+        model_download_proxy is not None,
+        use_strict_mode,
+    ])
+    if has_custom_params:
+        # Show warning if using individual parameters
         logger.warning(
-            "fast-langdetect: Text contains newline characters or is too long. "
-            "You should only pass a single sentence for accurate prediction."
+            "fast-langdetect: Using individual parameters is deprecated. "
+            "Consider using LangDetectConfig for better configuration management. "
+            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
         )
-    if model_download_proxy or use_strict_mode or normalize_input:
-        config = LangDetectConfig(
+        custom_config = LangDetectConfig(
             proxy=model_download_proxy,
             allow_fallback=not use_strict_mode,
-            normalize_input=normalize_input
         )
-        detector = LangDetector(config)
+        detector = LangDetector(custom_config)
         return detector.detect(text, low_memory=low_memory)
+
+    # Use default detector
     return _default_detector.detect(text, low_memory=low_memory)
 
 
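Reviewer note: to make the migration path this hunk encourages concrete, a minimal hypothetical sketch (the proxy URL is a placeholder; the import names are assumed from the package's public API):

# Hypothetical sketch (not part of this commit).
from fast_langdetect import detect, LangDetectConfig  # assumed public exports

# Old style: still accepted, but now logs the deprecation warning above.
detect("hello world", model_download_proxy="http://127.0.0.1:7890", use_strict_mode=True)

# New style: collect the options in a LangDetectConfig and pass it via `config=`.
config = LangDetectConfig(
    proxy="http://127.0.0.1:7890",   # placeholder proxy
    allow_fallback=False,            # equivalent of use_strict_mode=True
    normalize_input=True,
)
print(detect("hello world", config=config))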
@@ -413,40 +439,52 @@ def detect_multilingual(
     k: int = 5,
     threshold: float = 0.0,
     use_strict_mode: bool = False,
-    normalize_input: bool = True,
+    config: Optional[LangDetectConfig] = None,
 ) -> List[Dict[str, Any]]:
     """
     Simple interface for multi-language detection.
 
-    Before passing a text to this function, you remove all the newline characters.
-
     Too long or too short text will affect the accuracy of the prediction.
 
     :param text: Input text without newline characters
     :param low_memory: Whether to use memory-efficient model
-    :param model_download_proxy: Optional proxy for model download
     :param k: Number of top languages to return
     :param threshold: Minimum confidence threshold
-    :param use_strict_mode: Disable fallback to small model
-    :param normalize_input: Whether to normalize input text (lowercase all-uppercase text)
+    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
+    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
+    :param config: Optional LangDetectConfig object for advanced configuration
 
     :return: List of dictionaries with languages and scores
     """
-    if "\n" in text or len(text) > 100:
+    # Use provided config or create new config
+    if config is not None:
+        detector = LangDetector(config)
+        return detector.detect_multilingual(
+            text, low_memory=low_memory, k=k, threshold=threshold
+        )
+
+    # Check if any custom parameters are provided
+    has_custom_params = any([
+        model_download_proxy is not None,
+        use_strict_mode,
+    ])
+    if has_custom_params:
+        # Show warning if using individual parameters
         logger.warning(
-            "fast-langdetect: Text contains newline characters or is too long. "
-            "You should only pass a single sentence for accurate prediction."
+            "fast-langdetect: Using individual parameters is deprecated. "
+            "Consider using LangDetectConfig for better configuration management. "
+            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
        )
-    if model_download_proxy or use_strict_mode or normalize_input:
-        config = LangDetectConfig(
+        custom_config = LangDetectConfig(
             proxy=model_download_proxy,
             allow_fallback=not use_strict_mode,
-            normalize_input=normalize_input
         )
-        detector = LangDetector(config)
+        detector = LangDetector(custom_config)
         return detector.detect_multilingual(
             text, low_memory=low_memory, k=k, threshold=threshold
         )
+
+    # Use default detector
     return _default_detector.detect_multilingual(
         text, low_memory=low_memory, k=k, threshold=threshold
     )
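Reviewer note: and the multilingual counterpart, under the same assumptions (output shown is illustrative):

# Hypothetical sketch (not part of this commit).
from fast_langdetect import detect_multilingual, LangDetectConfig  # assumed public exports

results = detect_multilingual(
    "Hello, world! Bonjour le monde!",
    k=3,
    config=LangDetectConfig(allow_fallback=True),
)
for item in results:
    print(item)  # e.g. {"lang": "en", "score": 0.6} (illustrative output)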