Add pythainlp.util.analyze_thai_text (#1149)

wannaphong · web-flow · commit c94002aeb4b8 · 2025-09-29T13:19:14.000+07:00
* Add Thai character mapping and analysis function

Added a comprehensive mapping of Thai characters to their descriptive names, including consonants, vowels, tone marks, punctuation, and digits. Implemented the analyze_thai_text function to analyze Thai text and return a list of classified characters.

* Document analyze_thai_text function

Added documentation for analyze_thai_text function.

* Add analyze_thai_text to module exports

* Add tests for analyze_thai_text function

Added unit tests for analyze_thai_text function.

* Change analyze_thai_text return type to dict

Updated the analyze_thai_text function to return a single dictionary instead of a list of dictionaries, reflecting changes in the return type and documentation.

* Update util.rst

* Fix typo in analyze_thai_text docstring

Corrected the description in the analyze_thai_text function.

* Fix expected output format in Thai text analysis tests

* Update thai.py
diff --git a/docs/api/util.rst b/docs/api/util.rst
@@ -7,6 +7,11 @@ The :mod:`pythainlp.util` module serves as a treasure trove of utility functions
 Modules
 -------
 
+.. autofunction:: analyze_thai_text
+    :noindex:
+
+    Analyzes a string of Thai text and returns a dictionary that maps each character's descriptive name (or the character itself) to the number of times it occurs in the text.
+
 .. autofunction:: abbreviation_to_full_text
     :noindex:
 
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
@@ -69,6 +69,7 @@
     "tone_detector",
     "tone_to_spelling",
     "words_to_num",
+    "analyze_thai_text",
 ]
 
 from pythainlp.util import spell_words
@@ -121,6 +122,7 @@
     isthai,
     isthaichar,
     thai_word_tone_detector,
+    analyze_thai_text,
 )
 from pythainlp.util.thai_lunar_date import th_zodiac, to_lunar_date
 from pythainlp.util.thaiwordcheck import is_native_thai
diff --git a/pythainlp/util/thai.py b/pythainlp/util/thai.py
@@ -8,6 +8,7 @@
 
 import string
 from typing import Tuple
+from collections import defaultdict
 
 from pythainlp import (
     thai_above_vowels,
@@ -26,6 +27,52 @@
 _TH_FIRST_CHAR_ASCII = 3584
 _TH_LAST_CHAR_ASCII = 3711
 
+# A comprehensive map of Thai characters to their descriptive names.
+THAI_CHAR_NAMES = {
+    # Consonants
+    **{char: char for char in thai_consonants},
+    # Vowels and Signs
+    "\u0e24": "ฤ",
+    "\u0e26": "ฦ",
+    "\u0e30": "สระ อะ",
+    "\u0e31": "ไม้หันอากาศ",
+    "\u0e32": "สระ อา",
+    "\u0e33": "สระ อำ",
+    "\u0e34": "สระ อิ",
+    "\u0e35": "สระ อี",
+    "\u0e36": "สระ อึ",
+    "\u0e37": "สระ อือ",
+    "\u0e38": "สระ อุ",
+    "\u0e39": "สระ อู",
+    "\u0e40": "สระ เอ",
+    "\u0e41": "สระ แอ",
+    "\u0e42": "สระ โอ",
+    "\u0e43": "สระ ใอ",
+    "\u0e44": "สระ ไอ",
+    "\u0e45": "ไม้ม้วน",
+    "\u0e4d": "นฤคหิต",
+    "\u0e47": "ไม้ไต่คู้",
+    # Tone Marks
+    "\u0e48": "ไม้เอก",
+    "\u0e49": "ไม้โท",
+    "\u0e4a": "ไม้ตรี",
+    "\u0e4b": "ไม้จัตวา",
+    # Other Signs
+    "\u0e2f": "ไปยาลน้อย",
+    "\u0e3a": "พินทุ",
+    "\u0e46": "ไม้ยมก",
+    "\u0e4c": "การันต์",
+    "\u0e4e": "ยามักการ",
+    # Punctuation
+    "\u0e4f": "ฟองมัน",
+    "\u0e5a": "อังคั่นคู่",
+    "\u0e5b": "โคมุต",
+    # Digits
+    **{char: char for char in thai_digits},
+    # Symbol
+    "\u0e3f": "฿",
+}
+
 
 def isthaichar(ch: str) -> bool:
     """Check if a character is a Thai character.
@@ -269,3 +316,38 @@ def count_thai_chars(text: str) -> dict:
         else:
             _dict["non_thai"] += 1
     return _dict
+
+
+def analyze_thai_text(text: str) -> dict:
+    """
+    Analyzes a string of Thai text and returns a dictionaries,
+    where each values represents a single classified character from the text.
+
+    The function processes the text character by character and maps each Thai
+    character to its descriptive name or itself (for consonants and digits).
+
+    :param str text: The Thai text string to be analyzed.
+    :rtype: list[dict]
+    :return: A dictionaries, with each item containing
+                    a single character and a count of 1.
+
+    Examples:
+        >>> analyze_thai_text("คนดี")
+        {'ค': 1, 'น': 1, 'ด': 1, 'สระ อี': 1}
+
+        >>> analyze_thai_text("เล่น")
+        {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1}
+    """
+    results = defaultdict(int)
+
+    # Iterate over each character in the input string
+    for char in text:
+        # Check if the character is in our mapping
+        if char in THAI_CHAR_NAMES:
+            name = THAI_CHAR_NAMES[char]
+            results[name]+=1
+        else:
+            # If the character is not a known Thai character, classify it as character
+            results[char]+=1
+
+    return dict(results)
diff --git a/tests/core/test_util.py b/tests/core/test_util.py
@@ -67,6 +67,7 @@
     tone_detector,
     words_to_num,
     spelling,
+    analyze_thai_text,
 )
 from pythainlp.util.morse import morse_decode, morse_encode
 
@@ -874,3 +875,13 @@ def test_longest_common_subsequence(self):
         self.assertEqual(longest_common_subsequence("", "ABC"), "")
         self.assertEqual(longest_common_subsequence("ABC", ""), "")
         self.assertEqual(longest_common_subsequence("", ""), "")
+
+    def test_analyze_thai_text(self):
+        self.assertEqual(
+            analyze_thai_text("คนดี"),
+            {"ค": 1, "น": 1, "ด": 1, "สระ อี": 1}
+        )
+        self.assertEqual(
+            analyze_thai_text("เล่น"),
+            {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1}
+        )

Original file line number	Diff line number	Diff line change
`@@ -69,6 +69,7 @@`
`69`	`69`	`"tone_detector",`
`70`	`70`	`"tone_to_spelling",`
`71`	`71`	`"words_to_num",`
	`72`	`+ "analyze_thai_text",`
`72`	`73`	`]`
`73`	`74`
`74`	`75`	`from pythainlp.util import spell_words`
`@@ -121,6 +122,7 @@`
`121`	`122`	`isthai,`
`122`	`123`	`isthaichar,`
`123`	`124`	`thai_word_tone_detector,`
	`125`	`+ analyze_thai_text,`
`124`	`126`	`)`
`125`	`127`	`from pythainlp.util.thai_lunar_date import th_zodiac, to_lunar_date`
`126`	`128`	`from pythainlp.util.thaiwordcheck import is_native_thai`