Skip to content

Commit 2b4e38f

Browse files
committed
✨ feat(app): add input normalization to language detection
Normalize text input to improve detection accuracy, particularly for all-uppercase text. This prevents misdetection as Japanese by converting uppercase text to lowercase. This enhancement ensures more reliable language predictions.
1 parent 5ef57c5 commit 2b4e38f

File tree

2 files changed

+45
-7
lines changed

2 files changed

+45
-7
lines changed

src/fast_langdetect/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def detect_language(sentence: str, *, low_memory: bool = True):
1919
:param low_memory: bool (default: True) whether to use low memory mode
2020
:return: ZH, EN, JA, KO, FR, DE, ES, .... (two uppercase letters)
2121
"""
22-
lang_code = detect(sentence.lower(), low_memory=low_memory).get("lang").upper()
22+
lang_code = detect(sentence, low_memory=low_memory).get("lang").upper()
2323
if lang_code == "JA" and not is_japanese(sentence):
2424
lang_code = "ZH"
2525
return lang_code

src/fast_langdetect/infer.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ class LangDetectConfig:
203203
:param proxy: HTTP proxy for downloads
204204
:param allow_fallback: Whether to fallback to small model
205205
:param disable_verify: Whether to disable MD5 verification
206+
:param normalize_input: Whether to normalize input text before detection (e.g. lowercase all-uppercase text)
206207
"""
207208

208209
def __init__(
@@ -213,6 +214,7 @@ def __init__(
213214
allow_fallback: bool = True,
214215
disable_verify: bool = False,
215216
verify_hash: Optional[str] = None,
217+
normalize_input: bool = False,
216218
):
217219
self.cache_dir = cache_dir or CACHE_DIRECTORY
218220
self.custom_model_path = custom_model_path
@@ -221,9 +223,11 @@ def __init__(
221223
# Only verify large model
222224
self.disable_verify = disable_verify
223225
self.verify_hash = verify_hash
226+
self.normalize_input = normalize_input
224227
if self.custom_model_path and not Path(self.custom_model_path).exists():
225228
raise FileNotFoundError(f"fast-langdetect: Target model file not found: {self.custom_model_path}")
226229

230+
227231
class LangDetector:
228232
"""Language detector using FastText models."""
229233
VERIFY_FASTTEXT_LARGE_MODEL = "01810bc59c6a3d2b79c79e6336612f65"
@@ -286,8 +290,9 @@ def detect(
286290
DetectError: If detection fails
287291
"""
288292
model = self._get_model(low_memory)
293+
normalized_text = _normalize_text(text, self.config.normalize_input)
289294
try:
290-
labels, scores = model.predict(text)
295+
labels, scores = model.predict(normalized_text)
291296
return {
292297
"lang": labels[0].replace("__label__", ""),
293298
"score": min(float(scores[0]), 1.0),
@@ -317,8 +322,9 @@ def detect_multilingual(
317322
DetectError: If detection fails
318323
"""
319324
model = self._get_model(low_memory)
325+
normalized_text = _normalize_text(text, self.config.normalize_input)
320326
try:
321-
labels, scores = model.predict(text, k=k, threshold=threshold)
327+
labels, scores = model.predict(normalized_text, k=k, threshold=threshold)
322328
results = [
323329
{
324330
"lang": label.replace("__label__", ""),
@@ -336,12 +342,37 @@ def detect_multilingual(
336342
_default_detector = LangDetector()
337343

338344

345+
def _normalize_text(text: str, should_normalize: bool = False) -> str:
346+
"""
347+
Normalize text based on configuration.
348+
349+
Currently handles:
350+
- Lowercasing uppercase text to prevent misdetection as Japanese
351+
352+
:param text: Input text
353+
:param should_normalize: Whether normalization should be applied
354+
:return: Normalized text
355+
"""
356+
if not should_normalize:
357+
return text
358+
359+
# Check if text is all uppercase (or mostly uppercase)
360+
if text.isupper() or (
361+
len(re.findall(r'[A-Z]', text)) > 0.8 * len(re.findall(r'[A-Za-z]', text))
362+
and len(text) > 5
363+
):
364+
return text.lower()
365+
366+
return text
367+
368+
339369
def detect(
340370
text: str,
341371
*,
342372
low_memory: bool = True,
343373
model_download_proxy: Optional[str] = None,
344374
use_strict_mode: bool = False,
375+
normalize_input: bool = True,
345376
) -> Dict[str, Union[str, float]]:
346377
"""
347378
Simple interface for language detection.
@@ -354,6 +385,7 @@ def detect(
354385
:param low_memory: Whether to use memory-efficient model
355386
:param model_download_proxy: Optional proxy for model download
356387
:param use_strict_mode: Disable fallback to small model
388+
:param normalize_input: Whether to normalize input text (lowercase all-uppercase or mostly-uppercase text)
357389
358390
:return: Dictionary with language and confidence score
359391
"""
@@ -362,9 +394,11 @@ def detect(
362394
"fast-langdetect: Text contains newline characters or is too long. "
363395
"You should only pass a single sentence for accurate prediction."
364396
)
365-
if model_download_proxy or use_strict_mode:
397+
if model_download_proxy or use_strict_mode or normalize_input:
366398
config = LangDetectConfig(
367-
proxy=model_download_proxy, allow_fallback=not use_strict_mode
399+
proxy=model_download_proxy,
400+
allow_fallback=not use_strict_mode,
401+
normalize_input=normalize_input
368402
)
369403
detector = LangDetector(config)
370404
return detector.detect(text, low_memory=low_memory)
@@ -379,6 +413,7 @@ def detect_multilingual(
379413
k: int = 5,
380414
threshold: float = 0.0,
381415
use_strict_mode: bool = False,
416+
normalize_input: bool = True,
382417
) -> List[Dict[str, Any]]:
383418
"""
384419
Simple interface for multi-language detection.
@@ -393,6 +428,7 @@ def detect_multilingual(
393428
:param k: Number of top languages to return
394429
:param threshold: Minimum confidence threshold
395430
:param use_strict_mode: Disable fallback to small model
431+
:param normalize_input: Whether to normalize input text (lowercase all-uppercase or mostly-uppercase text)
396432
397433
:return: List of dictionaries with languages and scores
398434
"""
@@ -401,9 +437,11 @@ def detect_multilingual(
401437
"fast-langdetect: Text contains newline characters or is too long. "
402438
"You should only pass a single sentence for accurate prediction."
403439
)
404-
if model_download_proxy or use_strict_mode:
440+
if model_download_proxy or use_strict_mode or normalize_input:
405441
config = LangDetectConfig(
406-
proxy=model_download_proxy, allow_fallback=not use_strict_mode
442+
proxy=model_download_proxy,
443+
allow_fallback=not use_strict_mode,
444+
normalize_input=normalize_input
407445
)
408446
detector = LangDetector(config)
409447
return detector.detect_multilingual(

0 commit comments

Comments
 (0)