Skip to content

Commit c94002a

Browse files
authored
Add pythainlp.util.analyze_thai_text (#1149)
* Add Thai character mapping and analysis function Added a comprehensive mapping of Thai characters to their descriptive names, including consonants, vowels, tone marks, punctuation, and digits. Implemented the analyze_thai_text function to analyze Thai text and return a list of classified characters. * Document analyze_thai_text function Added documentation for analyze_thai_text function. * Add analyze_thai_text to module exports * Add tests for analyze_thai_text function Added unit tests for analyze_thai_text function. * Change analyze_thai_text return type to dict Updated the analyze_thai_text function to return a single dictionary instead of a list of dictionaries, reflecting changes in the return type and documentation. * Update util.rst * Fix typo in analyze_thai_text docstring Corrected the description in the analyze_thai_text function. * Fix expected output format in Thai text analysis tests * Update thai.py
1 parent bf24627 commit c94002a

File tree

4 files changed

+100
-0
lines changed

4 files changed

+100
-0
lines changed

docs/api/util.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ The :mod:`pythainlp.util` module serves as a treasure trove of utility functions
77
Modules
88
-------
99

10+
.. autofunction:: analyze_thai_text
11+
:noindex:
12+
13+
Analyzes a string of Thai text and returns a dictionary in which each key is a classified character from the text (its descriptive name, or the character itself) and each value is the number of times it occurs.
14+
1015
.. autofunction:: abbreviation_to_full_text
1116
:noindex:
1217

pythainlp/util/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
"tone_detector",
7070
"tone_to_spelling",
7171
"words_to_num",
72+
"analyze_thai_text",
7273
]
7374

7475
from pythainlp.util import spell_words
@@ -121,6 +122,7 @@
121122
isthai,
122123
isthaichar,
123124
thai_word_tone_detector,
125+
analyze_thai_text,
124126
)
125127
from pythainlp.util.thai_lunar_date import th_zodiac, to_lunar_date
126128
from pythainlp.util.thaiwordcheck import is_native_thai

pythainlp/util/thai.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import string
1010
from typing import Tuple
11+
from collections import defaultdict
1112

1213
from pythainlp import (
1314
thai_above_vowels,
@@ -26,6 +27,52 @@
2627
_TH_FIRST_CHAR_ASCII = 3584
2728
_TH_LAST_CHAR_ASCII = 3711
2829

30+
# A comprehensive map of Thai characters to their descriptive names.
31+
# Lookup table from a single Thai code point to the label used when
# classifying text. Consonants and digits are labelled with the character
# itself; vowels, tone marks and other signs get a descriptive Thai name.
THAI_CHAR_NAMES = {
    # Consonants
    **{char: char for char in thai_consonants},
    # Vowels and Signs
    "\u0e24": "ฤ",
    "\u0e26": "ฦ",
    "\u0e30": "สระ อะ",
    "\u0e31": "ไม้หันอากาศ",
    "\u0e32": "สระ อา",
    "\u0e33": "สระ อำ",
    "\u0e34": "สระ อิ",
    "\u0e35": "สระ อี",
    "\u0e36": "สระ อึ",
    "\u0e37": "สระ อือ",
    "\u0e38": "สระ อุ",
    "\u0e39": "สระ อู",
    "\u0e40": "สระ เอ",
    "\u0e41": "สระ แอ",
    "\u0e42": "สระ โอ",
    "\u0e43": "สระ ใอ",
    "\u0e44": "สระ ไอ",
    # U+0E45 is LAKKHANGYAO (the long tail written after U+0E24/U+0E26),
    # not mai muan; mai muan is U+0E43, already covered above.
    "\u0e45": "ลากข้างยาว",
    "\u0e4d": "นฤคหิต",
    "\u0e47": "ไม้ไต่คู้",
    # Tone Marks
    "\u0e48": "ไม้เอก",
    "\u0e49": "ไม้โท",
    "\u0e4a": "ไม้ตรี",
    "\u0e4b": "ไม้จัตวา",
    # Other Signs
    "\u0e2f": "ไปยาลน้อย",
    "\u0e3a": "พินทุ",
    "\u0e46": "ไม้ยมก",
    "\u0e4c": "การันต์",
    "\u0e4e": "ยามักการ",
    # Punctuation
    "\u0e4f": "ฟองมัน",
    "\u0e5a": "อังคั่นคู่",
    "\u0e5b": "โคมุต",
    # Digits
    **{char: char for char in thai_digits},
    # Symbol
    "\u0e3f": "฿",
}
75+
2976

3077
def isthaichar(ch: str) -> bool:
3178
"""Check if a character is a Thai character.
@@ -269,3 +316,38 @@ def count_thai_chars(text: str) -> dict:
269316
else:
270317
_dict["non_thai"] += 1
271318
return _dict
319+
320+
321+
def analyze_thai_text(text: str) -> dict:
    """
    Count the classified characters in a string of Thai text.

    The text is processed character by character. A character found in
    ``THAI_CHAR_NAMES`` is counted under its mapped label (a descriptive
    name, or the character itself for consonants and digits). Any other
    character is counted under the character itself.

    :param str text: The text to be analyzed.
    :rtype: dict
    :return: A dictionary mapping each label to the number of times it
        occurs in ``text``, in order of first appearance. An empty
        string yields an empty dictionary.

    Examples:
        >>> analyze_thai_text("คนดี")
        {'ค': 1, 'น': 1, 'ด': 1, 'สระ อี': 1}

        >>> analyze_thai_text("เล่น")
        {'สระ เอ': 1, 'ล': 1, 'ไม้เอก': 1, 'น': 1}
    """
    results = defaultdict(int)

    for char in text:
        # Single lookup: unmapped characters fall back to themselves.
        results[THAI_CHAR_NAMES.get(char, char)] += 1

    # Return a plain dict so callers do not get defaultdict's
    # insert-on-read behaviour.
    return dict(results)

tests/core/test_util.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
tone_detector,
6868
words_to_num,
6969
spelling,
70+
analyze_thai_text,
7071
)
7172
from pythainlp.util.morse import morse_decode, morse_encode
7273

@@ -874,3 +875,13 @@ def test_longest_common_subsequence(self):
874875
self.assertEqual(longest_common_subsequence("", "ABC"), "")
875876
self.assertEqual(longest_common_subsequence("ABC", ""), "")
876877
self.assertEqual(longest_common_subsequence("", ""), "")
878+
879+
def test_analyze_thai_text(self):
    """Check analyze_thai_text against known per-character counts."""
    # Each input text is paired with the exact dictionary it should yield.
    expected_by_text = {
        "คนดี": {"ค": 1, "น": 1, "ด": 1, "สระ อี": 1},
        "เล่น": {"สระ เอ": 1, "ล": 1, "ไม้เอก": 1, "น": 1},
    }
    for sample, expected in expected_by_text.items():
        self.assertEqual(analyze_thai_text(sample), expected)

0 commit comments

Comments
 (0)