Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- Dockerfile +29 -0
- phonemizer.py +124 -0
- pos_tagger.model +3 -0
Dockerfile
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python runtime as the base image
FROM python:3.9-slim

# Set working directory in the container
WORKDIR /app

# Install system dependencies:
#   espeak-ng - invoked by phonemizer.py to produce IPA output
#   git       - needed only for the clone step below
RUN apt-get update && apt-get install -y \
    espeak-ng \
    git \
    && rm -rf /var/lib/apt/lists/*

# Clone ParsNorm repository and install it, plus the web-server packages.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN git clone https://github.com/saeedzou/ParsNorm.git \
    && cd ParsNorm \
    && pip install --no-cache-dir -e . \
    && pip install --no-cache-dir -r requirements.txt \
    && pip install --no-cache-dir fastapi uvicorn

# POS tagger model; phonemizer.py loads it as ./pos_tagger.model
COPY pos_tagger.model .

# Copy the application module into the container
COPY phonemizer.py .

# Expose the port FastAPI will run on
EXPOSE 7860

# Run the FastAPI app. The application object `app` lives in phonemizer.py,
# so the import string is "phonemizer:app" (there is no app.py in /app).
CMD ["uvicorn", "phonemizer:app", "--host", "0.0.0.0", "--port", "7860"]
|
phonemizer.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
import re
|
| 3 |
+
import string
|
| 4 |
+
from fastapi import FastAPI, Request
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
from hazm import POSTagger, word_tokenize
|
| 7 |
+
from parsnorm import ParsNorm
|
| 8 |
+
|
| 9 |
+
# Application object that the route decorators below attach to.
app = FastAPI()

# Shared resources, created once when the module is imported.
normalizer = ParsNorm(remove_diacritics=False)
# The model path is relative, so the file must be in the working directory.
tagger = POSTagger(model='./pos_tagger.model')
# ASCII punctuation followed by the Persian punctuation marks.
punctuation = "".join((string.punctuation, "؟:؛»«،"))
# Zero-width match at every boundary between a word character and one of the
# punctuation characters (in either order); used to insert a space there.
pattern = r"(?<=\w)(?=[{0}])|(?<=[{0}])(?=\w)".format(re.escape(punctuation))
|
| 15 |
+
|
| 16 |
+
# Homograph table: written form -> its two candidate pronunciations.
# 'phonemes' and 'pos' are parallel lists: phonemes[i] is the reading chosen
# when the POS tagger assigns pos[i] to the word.  The 'diff' flag is True for
# every entry and is not read anywhere in this module.
ambiguity_dict = {
    written: {'phonemes': list(readings), 'pos': list(tags), 'diff': True}
    for written, readings, tags in (
        ('بعد', ('baʔd', 'boʔd'), ('ADP', 'NOUN')),
        ('شش', ('ʃeʃ', 'ʃoʃ'), ('NOUN', 'NUM')),
        ('سقط', ('saqat', 'seqt'), ('NOUN', 'ADJ')),
        ('می', ('mej', 'mi'), ('NOUN', 'ADP')),
        ('روی', ('ravi', 'ruj'), ('VERB', 'NOUN')),
        ('رو', ('ro', 'ru'), ('ADP', 'NOUN')),
        ('ولو', ('valo', 'velo'), ('SCONJ', 'ADJ')),
        ('ده', ('dah', 'deh'), ('NUM', 'NOUN')),
        ('خیر', ('xejr', 'xajjer'), ('NOUN', 'ADJ')),
        ('اولی', ('ʔavvali', 'ʔolɑ'), ('NUM', 'ADJ')),
        ('مایل', ('mɑjel', 'mɑjl'), ('ADJ', 'NOUN')),
        ('سنی', ('sonni', 'senni'), ('ADJ', 'NOUN')),
        ('سبک', ('sabk', 'sabok'), ('NOUN', 'ADJ')),
        ('کر', ('kor', 'kar'), ('NOUN', 'ADJ')),
        ('نرم', ('naram', 'narm'), ('VERB', 'ADJ')),
        ('جدا', ('dʒeddan', 'dʒodɑ'), ('ADV', 'ADJ')),
        ('معین', ('moʔin', 'moʔajjan'), ('ADJ', 'NOUN')),
        ('خلقی', ('xalqi', 'xolqi'), ('NOUN', 'ADJ')),
        ('بردار', ('bardɑr', 'bordɑr'), ('VERB', 'NOUN')),
        ('مرد', ('mord', 'mard'), ('VERB', 'NOUN')),
        ('مقدم', ('moqaddam', 'maqdam'), ('ADJ', 'NOUN')),
        ('پست', ('past', 'post'), ('ADJ', 'NOUN')),
        ('شما', ('ʃemɑ', 'ʃomɑ'), ('NOUN', 'PRON')),
        ('تنگ', ('tonɡ', 'tanɡ'), ('NOUN', 'ADJ')),
        ('صفر', ('safar', 'sefr'), ('NUM', 'NOUN')),
        ('پر', ('por', 'par'), ('NOUN', 'ADJ')),
        ('مصر', ('moserr', 'mesr'), ('ADJ', 'NOUN')),
        ('کشت', ('koʃt', 'keʃt'), ('VERB', 'NOUN')),
        ('کی', ('kej', 'ki'), ('ADV', 'NOUN')),
        ('جور', ('dʒur', 'dʒor'), ('ADJ', 'NOUN')),
        ('کرد', ('kord', 'kard'), ('NOUN', 'VERB')),
        ('علی', ('ʔali', 'ʔelli'), ('NOUN', 'ADJ')),
        ('شست', ('ʃast', 'ʃost'), ('NOUN', 'VERB')),
        ('دهم', ('dahom', 'daham'), ('NUM', 'VERB')),
    )
}
|
| 52 |
+
def get_phoneme_for_pos(entry, target_pos):
    """Pick the pronunciation in ``entry`` that is paired with ``target_pos``.

    Args:
        entry: Mapping with parallel sequences under ``'pos'`` and
            ``'phonemes'``.
        target_pos: POS tag to look for in ``entry['pos']``.

    Returns:
        The element of ``entry['phonemes']`` at the position of the first tag
        equal to ``target_pos``, or ``None`` when no tag matches.
    """
    matching_positions = (
        position
        for position, candidate in enumerate(entry['pos'])
        if candidate == target_pos
    )
    first_match = next(matching_positions, None)
    if first_match is None:
        return None
    return entry['phonemes'][first_match]
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def get_phonemes(word):
    """Return the IPA transcription of ``word`` from espeak-ng's Persian voice.

    espeak-ng is started directly from an argument list (no shell), so
    quotes, backticks, ``$(...)`` and similar characters in ``word`` reach it
    as plain text instead of being interpreted as shell syntax.  The clean-up
    formerly done by two ``sed`` stages happens here: the marks ``ˈ``, ``ˌ``
    and ``ː`` are deleted, then ``q1`` is rewritten to ``q``.

    Args:
        word: Text to transcribe.

    Returns:
        The cleaned transcription with surrounding whitespace stripped; an
        empty string when espeak-ng cannot be started or prints nothing;
        ``None`` when its output cannot be decoded as UTF-8.
    """
    # "--" ends option parsing, so a token that starts with "-" is treated as
    # text to speak rather than as a command-line flag.
    cmd = ["espeak-ng", "-v", "fa", "--ipa", "-q", "--", word]
    try:
        # Decode explicitly as UTF-8: IPA output is non-ASCII and must not
        # depend on the container's locale.
        result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8")
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}\n{word}")
        return None
    except OSError as e:
        # The old shell pipeline produced empty output when espeak-ng was
        # unavailable; keep that contract but say why.
        print(f"Could not run espeak-ng: {e}")
        return ""

    # Same order as the old pipeline: drop the marks first, then fix "q1".
    without_marks = result.stdout.translate(str.maketrans("", "", "ˈˌː"))
    return without_marks.replace("q1", "q").strip()
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _apply_ezafe(phonemes):
    """Return ``phonemes`` with the Ezafe linking sound attached."""
    # Already ends in the linking sound (e.g. برای): leave untouched.
    if phonemes.endswith(('jeː', 'je')):
        return phonemes
    # After a long-vowel mark, "i" (e.g. زندگی) or "e" (e.g. مدرسه) the link
    # is "je".
    # NOTE(review): get_phonemes strips "ː", so the two "ː" cases can only
    # fire if that clean-up changes; they are kept to preserve the rules.
    if phonemes.endswith(('ː', 'i', 'e')):
        return phonemes + 'je'
    return phonemes + 'e'


def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation.

    Args:
        sentence: Text to convert.
        tagger: Object whose ``tag(tokens)`` result is indexed as
            ``result[i][1]`` to read the tag of token ``i`` (tags may carry
            an ``EZ`` marker, e.g. ``"NOUN,EZ"``).  Assumed to return one
            entry per input token - confirm against the tagger in use.
        pattern: Regex whose matches in ``sentence`` are replaced by a space
            before tokenising.
        punctuation: String of characters treated as punctuation.

    Returns:
        The phonemes and punctuation tokens joined by single spaces.
    """
    sentence = re.sub(pattern, ' ', sentence)
    words = word_tokenize(sentence)
    tagged_words = tagger.tag(words)

    phoneme_list = []
    # Pair each token with its own tag positionally; the tagger is given
    # every token (punctuation included), so a separately advanced counter
    # could drift out of step after a punctuation token.
    for word, tagged in zip(words, tagged_words):
        # Tokens made only of punctuation characters ("،", "...", "!!") are
        # passed through unchanged.
        if all(char in punctuation for char in word):
            phoneme_list.append(word)
            continue

        tag = tagged[1]
        pos = tag.replace(',EZ', '')
        word = word.replace('_', ' ').replace('\u200c', ' ')

        # Homographs: use the table when it has a reading for this POS and
        # only fall back to espeak-ng otherwise.
        entry = ambiguity_dict.get(word)
        if entry and pos in entry['pos']:
            phonemes = get_phoneme_for_pos(entry, pos)
        else:
            phonemes = get_phonemes(word)

        # get_phonemes returns None on a decoding failure; keep going with
        # an empty transcription instead of crashing on it.
        if phonemes is None:
            phonemes = ''

        # If word has Ezafe (EZ tag), modify phoneme
        if 'EZ' in tag:
            phonemes = _apply_ezafe(phonemes)

        phoneme_list.append(phonemes)

    phoneme_text = ' '.join(phoneme_list)
    return re.sub(r"\s+", " ", phoneme_text)
|
| 114 |
+
|
| 115 |
+
# FastAPI input model
|
| 116 |
+
class InputText(BaseModel):
    """Request body accepted by the ``/phonemize`` route."""

    # Text to be normalised and converted to phonemes.
    text: str
|
| 118 |
+
|
| 119 |
+
# Route
|
| 120 |
+
@app.post("/phonemize")
async def phonemize(input_data: InputText):
    """Normalise the submitted text and return its phoneme transcription.

    Args:
        input_data: Request body carrying the text to convert.

    Returns:
        A dict with the single key ``"phonemes"`` holding the transcription.
    """
    # NOTE(review): both steps below are synchronous calls made from an async
    # handler - confirm that blocking here is acceptable for the expected load.
    cleaned_text = normalizer.normalize(input_data.text, remove_punct=False)
    return {"phonemes": process_sentence(cleaned_text, tagger, pattern, punctuation)}
|
pos_tagger.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b89995f2fdd39e6efa897e824ec38824e399366821d1afcb01e81a9160dd9a0d
|
| 3 |
+
size 19246648
|