import re
import string
import subprocess

import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel
from hazm import POSTagger, word_tokenize
from parsnorm import ParsNorm


def map_words_dict(csv_file="map_words.csv"):
    """Load the word-replacement mapping from a CSV with 'original' and 'corrected' columns."""
    map_df = pd.read_csv(csv_file)
    mapping_dict = dict(zip(map_df['original'], map_df['corrected']))
    # Keep only non-null pairs and coerce everything to strings
    return {str(k): str(v) for k, v in mapping_dict.items() if pd.notna(k) and pd.notna(v)}


def create_pattern_from_mapping_dict_words(mapping_dict):
    # Use \b (word boundary) so that only full words are matched
    return r"\b(" + "|".join(map(re.escape, mapping_dict.keys())) + r")\b"


def multiple_replace(text, mapping_dict, mapping_pattern):
    # Replace every mapped word in a single regex pass
    return re.sub(mapping_pattern, lambda m: mapping_dict[m.group()], str(text))


app = FastAPI()

# Setup
normalizer = ParsNorm(remove_diacritics=False)
words_mapping_dict = map_words_dict('./final_map_words.csv')
words_mapping_pattern = create_pattern_from_mapping_dict_words(words_mapping_dict)
tagger = POSTagger(model='./pos_tagger.model')  # Make sure this model file is present

# ASCII punctuation plus Persian punctuation marks
punctuation = string.punctuation + "؟:؛»«،"
# Matches the boundary between a word character and an adjacent punctuation mark (in either order)
pattern = rf"(?<=\w)(?=[{re.escape(punctuation)}])|(?<=[{re.escape(punctuation)}])(?=\w)"

# Heteronyms: words whose pronunciation depends on their part of speech.
# 'phonemes' and 'pos' are parallel lists; the POS tag selects the matching pronunciation.
ambiguity_dict = {
    'بعد': {'phonemes': ['baʔd', 'boʔd'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'شش': {'phonemes': ['ʃeʃ', 'ʃoʃ'], 'pos': ['NOUN', 'NUM'], 'diff': True},
    'سقط': {'phonemes': ['saqat', 'seqt'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'می': {'phonemes': ['mej', 'mi'], 'pos': ['NOUN', 'ADP'], 'diff': True},
    'روی': {'phonemes': ['ravi', 'ruj'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'رو': {'phonemes': ['ro', 'ru'], 'pos': ['ADP', 'NOUN'], 'diff': True},
    'ولو': {'phonemes': ['valo', 'velo'], 'pos': ['SCONJ', 'ADJ'], 'diff': True},
    'ده': {'phonemes': ['dah', 'deh'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'خیر': {'phonemes': ['xejr', 'xajjer'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'اولی': {'phonemes': ['ʔavvali', 'ʔolɑ'], 'pos': ['NUM', 'ADJ'], 'diff': True},
    'مایل': {'phonemes': ['mɑjel', 'mɑjl'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سنی': {'phonemes': ['sonni', 'senni'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'سبک': {'phonemes': ['sabk', 'sabok'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'کر': {'phonemes': ['kor', 'kar'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'نرم': {'phonemes': ['naram', 'narm'], 'pos': ['VERB', 'ADJ'], 'diff': True},
    'جدا': {'phonemes': ['dʒeddan', 'dʒodɑ'], 'pos': ['ADV', 'ADJ'], 'diff': True},
    'معین': {'phonemes': ['moʔin', 'moʔajjan'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'خلقی': {'phonemes': ['xalqi', 'xolqi'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'بردار': {'phonemes': ['bardɑr', 'bordɑr'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مرد': {'phonemes': ['mord', 'mard'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'مقدم': {'phonemes': ['moqaddam', 'maqdam'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'پست': {'phonemes': ['past', 'post'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'شما': {'phonemes': ['ʃemɑ', 'ʃomɑ'], 'pos': ['NOUN', 'PRON'], 'diff': True},
    'تنگ': {'phonemes': ['tonɡ', 'tanɡ'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'صفر': {'phonemes': ['safar', 'sefr'], 'pos': ['NUM', 'NOUN'], 'diff': True},
    'پر': {'phonemes': ['por', 'par'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'مصر': {'phonemes': ['moserr', 'mesr'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کشت': {'phonemes': ['koʃt', 'keʃt'], 'pos': ['VERB', 'NOUN'], 'diff': True},
    'کی': {'phonemes': ['kej', 'ki'], 'pos': ['ADV', 'NOUN'], 'diff': True},
    'جور': {'phonemes': ['dʒur', 'dʒor'], 'pos': ['ADJ', 'NOUN'], 'diff': True},
    'کرد': {'phonemes': ['kord', 'kard'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'علی': {'phonemes': ['ʔali', 'ʔelli'], 'pos': ['NOUN', 'ADJ'], 'diff': True},
    'شست': {'phonemes': ['ʃast', 'ʃost'], 'pos': ['NOUN', 'VERB'], 'diff': True},
    'دهم': {'phonemes': ['dahom', 'daham'], 'pos': ['NUM', 'VERB'], 'diff': True},
}


def get_phoneme_for_pos(entry, target_pos):
    """Return the phoneme string registered for target_pos, or None if that tag is not listed."""
    for i, pos_tag in enumerate(entry['pos']):
        if pos_tag == target_pos:
            return entry['phonemes'][i]
    return None


def get_phonemes(word):
    """Get the phonemes of a word via espeak-ng without playing audio, stripping IPA stress and length marks."""
    cmd = f'espeak-ng -v fa --ipa -q "{word}" | sed "s/[ˈˌː]//g" | sed "s/q1/q/g"'
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
        # Strip trailing whitespace/newlines from espeak-ng's output
        return result.stdout.strip()
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}\n{word}")
        return None  # Or handle the error appropriately


def process_sentence(sentence, tagger, pattern, punctuation):
    """Convert Persian text to phonemes with Ezafe handling while keeping punctuation."""
    # Put a space between words and adjacent punctuation so they tokenize separately
    sentence = re.sub(pattern, ' ', sentence)
    words = word_tokenize(sentence)
    tagged_words = tagger.tag(words)
    phoneme_list = []

    for word, tag in tagged_words:
        if word in punctuation:
            # Re-attach punctuation to the preceding phoneme chunk
            if phoneme_list:
                phoneme_list[-1] += word
            else:
                phoneme_list.append(word)
        else:
            pos = tag.replace(',EZ', '')
            # Split compounds written with underscores or ZWNJ (half-space) into sub-words
            sub_words = word.replace('_', ' ').split('\u200c')
            phoneme_for_word_list = []
            for w in sub_words:
                phonemes = get_phonemes(w)
                entry = ambiguity_dict.get(w)
                if entry and pos in entry['pos']:
                    # Heteronym: pick the pronunciation that matches the POS tag
                    phonemes = get_phoneme_for_pos(entry, pos)

                # If the word carries an Ezafe (EZ tag), append the Ezafe vowel
                if phonemes and 'EZ' in tag:
                    if phonemes.endswith('jeː'):
                        pass
                    elif phonemes.endswith('ː'):  # ends in a long vowel
                        phonemes += 'je'
                    elif phonemes.endswith('i'):  # e.g. زندگی
                        phonemes += 'je'
                    elif phonemes.endswith('je'):  # e.g. برای
                        pass
                    elif phonemes.endswith('e'):  # e.g. مدرسه
                        phonemes += 'je'
                    else:
                        phonemes += 'e'

                phoneme_for_word_list.append(phonemes or '')

            phoneme_list.append(''.join(phoneme_for_word_list))

    phoneme_text = ' '.join(phoneme_list)
    phoneme_text = re.sub(r'\s+', ' ', phoneme_text)
    return phoneme_text


# FastAPI input model
class InputText(BaseModel):
    text: str


# Routes
@app.get("/")
async def root():
    return {"message": "Welcome to the Persian Phonemizer API. Use the /phonemize endpoint to process text."}


@app.post("/phonemize")
async def phonemize(input_data: InputText):
    normalized = normalizer.normalize(input_data.text, remove_punct=False)
    normalized = multiple_replace(normalized, words_mapping_dict, words_mapping_pattern)
    result = process_sentence(normalized, tagger, pattern, punctuation)
    return {"phonemes": result}
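

# ---------------------------------------------------------------------------
# Optional local entry point: a minimal sketch, not part of the original
# service definition. It assumes the `uvicorn` package is installed and that
# ./final_map_words.csv and ./pos_tagger.model sit next to this file.
# Example request once the server is running (the payload text is arbitrary):
#   curl -X POST http://127.0.0.1:8000/phonemize \
#        -H "Content-Type: application/json" \
#        -d '{"text": "سلام دنیا"}'
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import uvicorn  # assumed dependency; any ASGI server can serve `app`

    uvicorn.run(app, host="127.0.0.1", port=8000)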