@@ -214,7 +214,7 @@ def __init__(
         allow_fallback: bool = True,
         disable_verify: bool = False,
         verify_hash: Optional[str] = None,
-        normalize_input: bool = False,
+        normalize_input: bool = True,
     ):
         self.cache_dir = cache_dir or CACHE_DIRECTORY
         self.custom_model_path = custom_model_path
@@ -242,6 +242,35 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
         self.config = config or LangDetectConfig()
         self._model_loader = ModelLoader()
 
+    def _normalize_text(self, text: str, should_normalize: bool = False) -> str:
+        """
+        Normalize text based on configuration.
+
+        Currently handles:
+        - Removing newline characters for better prediction
+        - Lowercasing uppercase text to prevent misdetection as Japanese
+
+        :param text: Input text
+        :param should_normalize: Whether normalization should be applied
+        :return: Normalized text
+        """
+        # If normalization is not needed, return the text unchanged
+        if not should_normalize:
+            return text
+
+        # Replace newline characters with spaces for better prediction
+        if "\n" in text:
+            text = text.replace("\n", " ")
+
+        # Check if text is all uppercase or mostly uppercase
+        if text.isupper() or (
+            len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
+            and len(text) > 5
+        ):
+            return text.lower()
+
+        return text
+
     def _get_model(self, low_memory: bool = True) -> Any:
         """Get or load appropriate model."""
         cache_key = "low_memory" if low_memory else "high_memory"
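Reviewer note: a rough, hypothetical sketch of what the new normalization does; it assumes the package exports LangDetector and LangDetectConfig as used elsewhere in this diff, and is not part of the commit itself.

# Hypothetical usage sketch (not part of this commit).
from fast_langdetect import LangDetector, LangDetectConfig  # assumed public exports

detector = LangDetector(LangDetectConfig(normalize_input=True))

# Mostly-uppercase Latin text is lowercased to avoid misdetection as Japanese.
print(detector._normalize_text("NOT GONNA GET US", should_normalize=True))   # not gonna get us

# Newline characters are replaced with spaces before prediction.
print(detector._normalize_text("first line\nsecond line", should_normalize=True))  # first line second line

# With should_normalize=False the text is returned unchanged.
print(detector._normalize_text("NOT GONNA GET US", should_normalize=False))  # NOT GONNA GET US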
@@ -290,7 +319,18 @@ def detect(
             DetectError: If detection fails
         """
         model = self._get_model(low_memory)
-        normalized_text = _normalize_text(text, self.config.normalize_input)
+        normalized_text = self._normalize_text(text, self.config.normalize_input)
+        if len(normalized_text) > 100:
+            logger.warning(
+                "fast-langdetect: Text is too long. "
+                "Consider passing only a single sentence for accurate prediction."
+            )
+        if "\n" in normalized_text:
+            logger.warning(
+                "fast-langdetect: Text contains newline characters. "
+                "Removing newlines for better prediction accuracy."
+            )
+            normalized_text = normalized_text.replace("\n", " ")
         try:
             labels, scores = model.predict(normalized_text)
             return {
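Reviewer note: a short, hypothetical illustration of the new guard rails in LangDetector.detect() (values and output are illustrative, not from the commit):

# Hypothetical sketch (not part of this commit).
from fast_langdetect import LangDetector, LangDetectConfig  # assumed public exports

detector = LangDetector(LangDetectConfig(normalize_input=True))

# Callers no longer need to strip newlines themselves; they are replaced with spaces,
# and a warning is logged when the text exceeds 100 characters.
result = detector.detect("Hello\nworld")
print(result)  # e.g. {"lang": "en", "score": 0.98} (illustrative output)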
@@ -322,7 +362,7 @@ def detect_multilingual(
             DetectError: If detection fails
         """
         model = self._get_model(low_memory)
-        normalized_text = _normalize_text(text, self.config.normalize_input)
+        normalized_text = self._normalize_text(text, self.config.normalize_input)
         try:
             labels, scores = model.predict(normalized_text, k=k, threshold=threshold)
             results = [
@@ -342,66 +382,52 @@ def detect_multilingual(
 _default_detector = LangDetector()
 
 
-def _normalize_text(text: str, should_normalize: bool = False) -> str:
-    """
-    Normalize text based on configuration.
-
-    Currently handles:
-    - Lowercasing uppercase text to prevent misdetection as Japanese
-
-    :param text: Input text
-    :param should_normalize: Whether normalization should be applied
-    :return: Normalized text
-    """
-    if not should_normalize:
-        return text
-
-    # Check if text is all uppercase (or mostly uppercase)
-    if text.isupper() or (
-        len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
-        and len(text) > 5
-    ):
-        return text.lower()
-
-    return text
-
-
 def detect(
     text: str,
     *,
     low_memory: bool = True,
     model_download_proxy: Optional[str] = None,
     use_strict_mode: bool = False,
-    normalize_input: bool = True,
+    config: Optional[LangDetectConfig] = None,
 ) -> Dict[str, Union[str, float]]:
     """
     Simple interface for language detection.
-
-    Before passing a text to this function, you remove all the newline characters.
-
+
     Too long or too short text will affect the accuracy of the prediction.
 
     :param text: Input text without newline characters
     :param low_memory: Whether to use memory-efficient model
-    :param model_download_proxy: Optional proxy for model download
-    :param use_strict_mode: Disable fallback to small model
-    :param normalize_input: Whether to normalize input text (lowercase all-uppercase text)
+    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
+    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
+    :param config: Optional LangDetectConfig object for advanced configuration
 
     :return: Dictionary with language and confidence score
     """
-    if "\n" in text or len(text) > 1000:
+    # Provide config
+    if config is not None:
+        detector = LangDetector(config)
+        return detector.detect(text, low_memory=low_memory)
+
+    # Check if any custom parameters are provided
+    has_custom_params = any([
+        model_download_proxy is not None,
+        use_strict_mode,
+    ])
+    if has_custom_params:
+        # Show warning if using individual parameters
         logger.warning(
-            "fast-langdetect: Text contains newline characters or is too long. "
-            "You should only pass a single sentence for accurate prediction."
+            "fast-langdetect: Using individual parameters is deprecated. "
+            "Consider using LangDetectConfig for better configuration management. "
+            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
         )
-    if model_download_proxy or use_strict_mode or normalize_input:
-        config = LangDetectConfig(
+        custom_config = LangDetectConfig(
             proxy=model_download_proxy,
             allow_fallback=not use_strict_mode,
-            normalize_input=normalize_input
         )
-        detector = LangDetector(config)
+        detector = LangDetector(custom_config)
         return detector.detect(text, low_memory=low_memory)
+
+    # Use default detector
     return _default_detector.detect(text, low_memory=low_memory)
 
 
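Reviewer note: to make the migration path this hunk encourages concrete, a minimal hypothetical sketch (the proxy URL is a placeholder; the import names are assumed from the package's public API):

# Hypothetical sketch (not part of this commit).
from fast_langdetect import detect, LangDetectConfig  # assumed public exports

# Old style: still accepted, but now logs the deprecation warning above.
detect("hello world", model_download_proxy="http://127.0.0.1:7890", use_strict_mode=True)

# New style: collect the options in a LangDetectConfig and pass it via `config=`.
config = LangDetectConfig(
    proxy="http://127.0.0.1:7890",   # placeholder proxy
    allow_fallback=False,            # equivalent of use_strict_mode=True
    normalize_input=True,
)
print(detect("hello world", config=config))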
@@ -413,40 +439,52 @@ def detect_multilingual(
     k: int = 5,
     threshold: float = 0.0,
     use_strict_mode: bool = False,
-    normalize_input: bool = True,
+    config: Optional[LangDetectConfig] = None,
 ) -> List[Dict[str, Any]]:
     """
     Simple interface for multi-language detection.
 
-    Before passing a text to this function, you remove all the newline characters.
-
     Too long or too short text will affect the accuracy of the prediction.
 
     :param text: Input text without newline characters
     :param low_memory: Whether to use memory-efficient model
-    :param model_download_proxy: Optional proxy for model download
     :param k: Number of top languages to return
     :param threshold: Minimum confidence threshold
-    :param use_strict_mode: Disable fallback to small model
-    :param normalize_input: Whether to normalize input text (lowercase all-uppercase text)
+    :param model_download_proxy: [DEPRECATED] Optional proxy for model download
+    :param use_strict_mode: [DEPRECATED] Disable fallback to small model
+    :param config: Optional LangDetectConfig object for advanced configuration
 
     :return: List of dictionaries with languages and scores
     """
-    if "\n" in text or len(text) > 100:
+    # Use provided config or create new config
+    if config is not None:
+        detector = LangDetector(config)
+        return detector.detect_multilingual(
+            text, low_memory=low_memory, k=k, threshold=threshold
+        )
+
+    # Check if any custom parameters are provided
+    has_custom_params = any([
+        model_download_proxy is not None,
+        use_strict_mode,
+    ])
+    if has_custom_params:
+        # Show warning if using individual parameters
         logger.warning(
-            "fast-langdetect: Text contains newline characters or is too long. "
-            "You should only pass a single sentence for accurate prediction."
+            "fast-langdetect: Using individual parameters is deprecated. "
+            "Consider using LangDetectConfig for better configuration management. "
+            "Will be removed in next major release. see https://github.com/LlmKira/fast-langdetect/pull/16"
        )
-    if model_download_proxy or use_strict_mode or normalize_input:
-        config = LangDetectConfig(
+        custom_config = LangDetectConfig(
             proxy=model_download_proxy,
             allow_fallback=not use_strict_mode,
-            normalize_input=normalize_input
         )
-        detector = LangDetector(config)
+        detector = LangDetector(custom_config)
         return detector.detect_multilingual(
             text, low_memory=low_memory, k=k, threshold=threshold
         )
+
+    # Use default detector
     return _default_detector.detect_multilingual(
         text, low_memory=low_memory, k=k, threshold=threshold
     )
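Reviewer note: and the multilingual counterpart, under the same assumptions (output shown is illustrative):

# Hypothetical sketch (not part of this commit).
from fast_langdetect import detect_multilingual, LangDetectConfig  # assumed public exports

results = detect_multilingual(
    "Hello, world! Bonjour le monde!",
    k=3,
    config=LangDetectConfig(allow_fallback=True),
)
for item in results:
    print(item)  # e.g. {"lang": "en", "score": 0.6} (illustrative output)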