Skip to content

Commit 436d4ed

Browse files
authored
add demo rules for Estonian (#157)
* add rules for Estonian * test_rules.py: add newline * better rules and data * prepare merge * fix pickler
1 parent 7cf261d commit 436d4ed

File tree

3 files changed

+59
-1
lines changed

3 files changed

+59
-1
lines changed

simplemma/strategies/defaultrules/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from .de import apply_de
66
from .en import apply_en
7+
from .et import apply_et
78
from .fi import apply_fi
89
from .lv import apply_lv
910
from .nl import apply_nl
@@ -13,6 +14,7 @@
1314
DEFAULT_RULES: Dict[str, Callable[[str], Optional[str]]] = {
1415
"de": apply_de,
1516
"en": apply_en,
17+
"et": apply_et,
1618
"fi": apply_fi,
1719
"lv": apply_lv,
1820
"nl": apply_nl,
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import re
2+
from typing import Optional
3+
4+
from .generic import apply_rules
5+
6+
7+
# NOTE: this is only a demo; the rules are very basic and their coverage is limited.
8+
9+
10+
# Suffix-rewriting rules for Estonian.
#
# Each key is a compiled pattern matching one of a fixed set of inflected
# endings, anchored at the end of the token (`$`); each value is the ending
# that replaces the matched text. Entries keep their insertion order.
DEFAULT_RULES = {
    # adjectives
    # https://en.wiktionary.org/wiki/-line
    re.compile(
        r"(?:lise|list|lisse|lisesse|lises|lisest|lisele|lisel|liselt|liseks|liseni|lisena|liseta|lisega|lised|liste|lisi|listesse|lisisse|listes|lisis|listest|lisist|listele|lisile|listel|lisil|listelt|lisilt|listeks|lisiks|listeni|listena|listeta|listega)$"
    ): "line",
    # https://en.wiktionary.org/wiki/-mine
    re.compile(
        r"(?:mise|mist|misse|misesse|mises|misest|misele|misel|miselt|miseks|miseni|misena|miseta|misega|mised|miste|misi|mistesse|misisse|mistes|misis|mistest|misist|mistele|misile|mistel|misil|mistelt|misilt|misteks|misiks|misteni|mistena|misteta|mistega)$"
    ): "mine",
    # nouns
    # https://en.wiktionary.org/wiki/-dus
    re.compile(
        r"(?:duse|dust|dusse|dusesse|duses|dusest|dusele|dusel|duselt|duseks|duseni|dusena|duseta|dusega|dused|duste|dusi|dustesse|dusisse|dustes|dusis|dustest|dusist|dustele|dusile|dustel|dusil|dustelt|dusilt|dusteks|dusiks|dusteni|dustena|dusteta|dustega)$"
    ): "dus",
    # https://en.wiktionary.org/wiki/-lik
    # https://en.wiktionary.org/wiki/-nik
    # Only the shared "ik..." tail is matched, so the preceding consonant
    # (l/n) of the stem is left untouched by the substitution.
    re.compile(
        r"(?:iku|ikku|ikusse|ikus|ikust|ikule|ikul|ikult|ikuks|ikuni|ikuna|ikuta|ikuga|ikud|ike|ikudde|ikke|ikusid|ikesse|ikkudesse|ikes|ikkudes|ikest|ikkudest|ikele|ikkudele|ikel|ikkudel|ikelt|ikkudelt|ikeks|ikkudeks|ikeni|ikkudeni|ikena|ikkudena|iketa|ikkudeta|ikega|ikkudega)$"
    ): "ik",
    # https://en.wiktionary.org/wiki/-kond
    re.compile(
        r"(?:konna|konda|konnasse|konnas|konnast|konnale|konnal|konnalt|konnaks|konnani|konnana|konnata|konnaga|konnad|kondade|kondi|kondasid|kondadesse|konnisse|kondades|konnis|kondadest|konnist|kondadele|konnile|kondadel|konnil|kondadelt|konnilt|kondadeks|konniks|kondadeni|kondadena|kondadeta|kondadega)$"
    ): "kond",
}
35+
36+
37+
def apply_et(token: str) -> Optional[str]:
    """Apply pre-defined rules for Estonian.

    Args:
        token: The word form to process.

    Returns:
        ``None`` when the token is shorter than 8 characters or starts with
        an uppercase character; otherwise whatever
        ``apply_rules(token, DEFAULT_RULES)`` returns.
    """
    # The length test runs first, so an empty token never reaches token[0].
    # NOTE(review): the uppercase check presumably skips proper nouns — confirm.
    if len(token) < 8 or token[0].isupper():
        return None

    return apply_rules(token, DEFAULT_RULES)

tests/strategies/defaultrules/test_rules.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,28 @@
44
def test_DEFAULT_RULES() -> None:
    """Test rules on all available languages."""
    rules_strategy = RulesStrategy()

    # German: the rule applies only when the German rule set is selected.
    assert rules_strategy.get_lemma("Pfifferlinge", "de") == "Pfifferling"
    assert rules_strategy.get_lemma("atonements", "de") is None

    # English
    assert rules_strategy.get_lemma("atonements", "en") == "atonement"
    assert rules_strategy.get_lemma("Pfifferlinge", "en") is None

    # Dutch
    assert rules_strategy.get_lemma("brieven", "nl") == "brief"

    # Finnish
    assert rules_strategy.get_lemma("liikenaisessa", "fi") == "liikenainen"

    # Polish
    assert rules_strategy.get_lemma("pracowaliście", "pl") == "pracować"

    # Russian
    assert rules_strategy.get_lemma("безгра́мотностью", "ru") == "безгра́мотность"

    # Latvian: the capitalized token yields no lemma.
    assert rules_strategy.get_lemma("Rīga", "lv") is None
    assert rules_strategy.get_lemma("šķirkļiem", "lv") == "šķirklis"
    assert rules_strategy.get_lemma("mācībām", "lv") == "mācība"

    # Estonian: the capitalized token yields no lemma; the rest cover one
    # word per suffix rule.
    assert rules_strategy.get_lemma("Läänemere", "et") is None
    assert rules_strategy.get_lemma("tavalised", "et") == "tavaline"
    assert rules_strategy.get_lemma("peamisteks", "et") == "peamine"
    assert rules_strategy.get_lemma("tähendustena", "et") == "tähendus"
    assert rules_strategy.get_lemma("kunstnikud", "et") == "kunstnik"
    assert rules_strategy.get_lemma("keelkondade", "et") == "keelkond"

0 commit comments

Comments
 (0)