-
Notifications
You must be signed in to change notification settings - Fork 0
/
Normalizer.py
57 lines (47 loc) · 1.58 KB
/
Normalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env python
# coding: utf-8
import unicodedata
import string
import re
from num2words import num2words
def normalize(text):
    """Lower-case Turkish text and keep only letters, ". ! ?" and spaces.

    Steps, in order:
      1. Map the Turkish capitals "I" -> "ı" and "İ" -> "i" by hand.
         ``str.lower()`` is locale-independent: it would turn "I" into "i"
         (wrong for Turkish) and "İ" into "i" followed by U+0307.
      2. Lower-case the text and trim surrounding whitespace.
      3. Insert a single space before every ".", "!" or "?".
      4. Delete every character that is not an ASCII letter, one of
         ç ğ ı ü ö ş, one of . ! ?, or a space (digits are removed too).

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # Turkish-aware casing has to happen before the generic lower().
    text = text.replace("I", "ı").replace("İ", "i")
    text = text.lower().strip()
    # Detach sentence punctuation from the preceding word.
    text = re.sub(r"([.!?])", r" \1", text)
    # Whitelist filter; anything outside the class is dropped, not replaced.
    text = re.sub(r"[^a-zA-Zçğıüöş.!? ]+", r"", text)
    return text
def remove_puncation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters dropped are exactly those listed in
    ``string.punctuation``; everything else is kept in its original order.
    """
    punctuation_chars = set(string.punctuation)
    kept = [ch for ch in text if ch not in punctuation_chars]
    return "".join(kept)
def number2string(text):
    """Replace every run of digits in ``text`` with its Turkish spelling.

    Each maximal digit run is parsed with ``int`` and handed to
    ``num2words`` with ``lang="tr"``; the returned words take the place of
    the digits. Text without digits is returned unchanged.
    """
    def _spell(digits_match):
        # group(1) is the whole digit run captured by the pattern.
        return num2words(int(digits_match.group(1)), lang="tr")

    return re.sub(r'(\d+)', _spell, text)
def date2string(text, puncation="."):
    """Spell out numeric day/month/year dates found in ``text`` in Turkish.

    A date is three digit runs joined by ``puncation`` (for example
    ``01.02.2020`` with the default separator). The day and year are passed
    to ``num2words`` with ``lang="tr"`` and the month number is looked up in
    a table of Turkish month names; the three parts are joined by spaces and
    substituted for the original date.

    Args:
        text: Input string.
        puncation: Literal separator between day, month and year. It is
            matched literally, not as a regular expression.

    Returns:
        ``text`` with every convertible date replaced. Candidates whose month
        is outside 1..12, or that ``num2words`` fails on, are left as-is.
    """
    tr_months = ["ocak", "şubat", "mart", "nisan", "mayıs", "haziran",
                 "temmuz", "ağustos", "eylül", "ekim", "kasım", "aralık"]

    if not puncation:
        # An empty separator cannot delimit three fields; nothing to convert.
        return text

    # Escape the separator: the default "." would otherwise match any
    # character and turn things like "12:30:45" into date candidates.
    sep = re.escape(puncation)
    date_pattern = re.compile(r'(\d+)' + sep + r'(\d+)' + sep + r'(\d+)')

    def _spell(match):
        day, month, year = (int(part) for part in match.groups())
        # Reject month 0 explicitly: index -1 would silently yield "aralık".
        if not 1 <= month <= len(tr_months):
            return match.group(0)
        try:
            return (num2words(day, lang="tr") + " " + tr_months[month - 1]
                    + " " + num2words(year, lang="tr"))
        except Exception:
            # Best-effort: keep this date untouched if spelling fails, but
            # still let the remaining dates in the text be converted.
            return match.group(0)

    # Substituting per match (instead of str.replace on the matched text)
    # keeps a short date from being rewritten inside a longer one.
    return date_pattern.sub(_spell, text)
def time2string(text):
    """Spell out ``H:M`` style times found in ``text`` in Turkish.

    Every occurrence of two digit runs joined by ":" is replaced by the
    ``num2words`` spelling (``lang="tr"``) of the first number, a space, and
    the spelling of the second number.

    Args:
        text: Input string.

    Returns:
        ``text`` with each convertible time replaced. A match that
        ``num2words`` fails on is left as-is and later matches are still
        processed.
    """
    def _spell(match):
        hours, minutes = match.groups()
        try:
            return (num2words(int(hours), lang="tr") + " "
                    + num2words(int(minutes), lang="tr"))
        except Exception:
            # Best-effort: leave this occurrence untouched rather than
            # abandoning the rest of the text.
            return match.group(0)

    # Substituting per match keeps "1:30" from being rewritten inside
    # "11:30", which str.replace on the matched text would do.
    return re.sub(r'(\d+):(\d+)', _spell, text)
def full_normalization(text):
    """Run the complete normalization pipeline over ``text``.

    The stages are applied in a fixed order: times, then dates, then any
    remaining digit runs are spelled out, and finally ``normalize`` is
    applied to the result.
    """
    pipeline = (time2string, date2string, number2string, normalize)
    for stage in pipeline:
        text = stage(text)
    return text