@@ -242,11 +242,12 @@ def __init__(self, config: Optional[LangDetectConfig] = None):
242242 self .config = config or LangDetectConfig ()
243243 self ._model_loader = ModelLoader ()
244244
245- def _normalize_text (self , text : str , should_normalize : bool = False ) -> str :
245+ @staticmethod
246+ def _normalize_text (text : str , should_normalize : bool = False ) -> str :
246247 """
247248 Normalize text based on configuration.
248249
249- Currently handles:
250+ Currently handles:
250251 - Removing newline characters for better prediction
251252 - Lowercasing uppercase text to prevent misdetection as Japanese
252253
@@ -257,12 +258,9 @@ def _normalize_text(self, text: str, should_normalize: bool = False) -> str:
257258 # If no normalization is needed, return the text unchanged
258259 if not should_normalize :
259260 return text
260-
261- # Check and record newline and long text
262- if "\n " in text :
263- text = text .replace ("\n " , " " )
264261
265262 # Check if text is all uppercase or mostly uppercase
263+ # https://github.com/LlmKira/fast-langdetect/issues/14
266264 if text .isupper () or (
267265 len (re .findall (r'[A-Z]' , text )) > 0.8 * len (re .findall (r'[A-Za-z]' , text ))
268266 and len (text ) > 5
@@ -322,13 +320,13 @@ def detect(
322320 normalized_text = self ._normalize_text (text , self .config .normalize_input )
323321 if len (normalized_text ) > 100 :
324322 logger .warning (
325- "fast-langdetect: Text is too long. "
323+ "fast-langdetect: Text may be too long. "
326324 "Consider passing only a single sentence for accurate prediction."
327325 )
328326 if "\n " in normalized_text :
329327 logger .warning (
330- "fast-langdetect: Text contains newline characters. "
331- "Removing newlines for better prediction accuracy ."
328+ "fast-langdetect: Input should not contain newline characters. "
329+ "Removing them or FastText will raise an error ."
332330 )
333331 normalized_text = normalized_text .replace ("\n " , " " )
334332 try :
@@ -407,7 +405,7 @@ def detect(
407405 if config is not None :
408406 detector = LangDetector (config )
409407 return detector .detect (text , low_memory = low_memory )
410-
408+
411409 # Check if any custom parameters are provided
412410 has_custom_params = any ([
413411 model_download_proxy is not None ,
@@ -426,7 +424,7 @@ def detect(
426424 )
427425 detector = LangDetector (custom_config )
428426 return detector .detect (text , low_memory = low_memory )
429-
427+
430428 # Use default detector
431429 return _default_detector .detect (text , low_memory = low_memory )
432430
@@ -462,7 +460,7 @@ def detect_multilingual(
462460 return detector .detect_multilingual (
463461 text , low_memory = low_memory , k = k , threshold = threshold
464462 )
465-
463+
466464 # Check if any custom parameters are provided
467465 has_custom_params = any ([
468466 model_download_proxy is not None ,
@@ -483,7 +481,7 @@ def detect_multilingual(
483481 return detector .detect_multilingual (
484482 text , low_memory = low_memory , k = k , threshold = threshold
485483 )
486-
484+
487485 # Use default detector
488486 return _default_detector .detect_multilingual (
489487 text , low_memory = low_memory , k = k , threshold = threshold
0 commit comments