"""Gradio demo that adds Arabic diacritics (tashkeel) using the Fine-Tashkeel model.

The script loads the seq2seq model once at import time, exposes a small web UI
and renders the diacritized text together with a few statistics.
"""

import html
import re
import textwrap  # splits long lines into chunks the model can handle
import time
import traceback

import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

print("\n⏳ جاري تحميل نموذج Fine-Tashkeel (الدقيق)...")
model_name = "basharalrfooh/Fine-Tashkeel"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = torch.device("cpu")
model.to(device)
model.eval()
print(f"✅ جاهز على {device}!\n")

# Placeholder shown in the output area while the model is running.
LOADING_HTML = (
    "\n\n⏳\n\n"
    "جاري العمل على التشكيل...\n\n"
    "هذا النموذج دقيق وبطيء، نقوم بمعالجة النص جزءاً بجزء...\n\n"
)

# Every combining mark this app treats as a diacritic (single source of truth
# for stripping, counting and highlighting).
DIACRITICS = (
    "\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652"
    "\u0653\u0654\u0655\u0656\u0657\u0658\u0670"
)
_DIACRITIC_SET = frozenset(DIACRITICS)
_STRIP_DIACRITICS = str.maketrans("", "", DIACRITICS)
_DIACRITIC_RUN = re.compile(f"[{DIACRITICS}]+")

# Maximum characters per chunk sent to the model. Short chunks leave the model
# enough room inside MAX_TOKENS to emit the added diacritics.
CHUNK_WIDTH = 250
# Token budget used both for tokenizer truncation and for generation.
MAX_TOKENS = 1024
# Colour applied to diacritics when highlighting is enabled.
HIGHLIGHT_COLOR = "#e74c3c"

APP_CSS = """
body { font-family: 'Arial', sans-serif; }
.gradio-container { direction: rtl; }
"""

HEADER_MARKDOWN = (
    "\n# 🚀 مُشَكِّل النصوص (النموذج الدقيق 98%+)\n\n"
    "⚠️ تنبيه: هذا النموذج هو الأدق، ولكنه بطيء مع النصوص الطويلة.\n\n"
)


def remove_diacritics(text: str) -> str:
    """Return *text* with every character listed in ``DIACRITICS`` removed."""
    return text.translate(_STRIP_DIACRITICS)


def count_diacritics(text: str) -> int:
    """Return how many characters of *text* are listed in ``DIACRITICS``."""
    return sum(1 for ch in text if ch in _DIACRITIC_SET)


def _diacritize_chunk(chunk: str) -> str:
    """Run the model on one chunk of undiacritized text and decode the result."""
    inputs = tokenizer(
        chunk,
        return_tensors="pt",
        max_length=MAX_TOKENS,
        truncation=True,
    ).to(device)  # keep the tensors on the same device as the model
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=MAX_TOKENS,  # give the output the full budget
            num_beams=1,  # greedy decoding, for speed
            early_stopping=False,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def _diacritize_line(line: str) -> str:
    """Diacritize a single line, splitting it into chunks of ``CHUNK_WIDTH``."""
    line = line.strip()
    if not line:
        return ""  # keep blank lines so the paragraph structure survives
    chunks = textwrap.wrap(
        line,
        width=CHUNK_WIDTH,
        break_long_words=False,
        replace_whitespace=False,
    )
    return " ".join(_diacritize_chunk(chunk) for chunk in chunks if chunk.strip())


def run_model(text):
    """Strip existing diacritics from *text* and re-diacritize it with the model.

    Returns:
        A 4-tuple ``(clean_text, result_text, stats, status_message)``.
        On empty input or on failure the first two items are ``None`` and
        ``stats`` is ``{'error': message}``.
    """
    if not text or not text.strip():
        error_msg = "❌ يرجى إدخال نص"
        return None, None, {'error': error_msg}, error_msg

    try:
        start = time.perf_counter()

        # 1. Remove whatever diacritics the input already has.
        full_clean_text = remove_diacritics(text)

        # 2-3. Process line by line so the original line structure is kept.
        final_result = "\n".join(
            _diacritize_line(line) for line in full_clean_text.split("\n")
        )

        elapsed = time.perf_counter() - start
        words_count = len(full_clean_text.split())
        stats = {
            "elapsed": elapsed,
            "words_count": words_count,
            "chars_count": len(final_result),
            "diacritics_count": count_diacritics(final_result),
            "speed": round(words_count / elapsed, 1) if elapsed > 0 else 0,
        }
        return full_clean_text, final_result, stats, "✅ تم التشكيل بنجاح!"
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the app.
        print(f"ERROR: {e}")
        traceback.print_exc()
        error_msg = f"❌ خطأ: {e}"
        return None, None, {'error': error_msg}, error_msg


def _to_html(text, highlight=False):
    """Escape *text* for HTML, optionally colour diacritics, keep line breaks."""
    safe = html.escape(text or "")
    if highlight:
        # Escaping never produces diacritic characters, so this only touches
        # the user's text, not the generated entities.
        safe = _DIACRITIC_RUN.sub(
            lambda m: f'<span style="color: {HIGHLIGHT_COLOR};">{m.group(0)}</span>',
            safe,
        )
    # HTML collapses newlines; emit explicit breaks so line structure is visible.
    return safe.replace("\n", "<br>\n")


def generate_final_html(clean_text, result_text, stats, show_comparison, highlight_mode):
    """Build the HTML shown in the output area.

    Args:
        clean_text: Input text with diacritics removed.
        result_text: Diacritized text produced by ``run_model``.
        stats: Statistics dict from ``run_model`` (or ``{'error': ...}``).
        show_comparison: Whether to prepend a before/after comparison.
        highlight_mode: Whether to colour the diacritics in the result.

    Returns:
        The HTML string, or ``None`` when there is neither a result nor an error.
    """
    if not result_text:
        if stats and 'error' in stats:
            return f"\n\n{html.escape(str(stats['error']))}\n\n"
        return None

    comparison_html = ""
    if show_comparison:
        comparison_html = (
            "\n\n⬅️ قبل التشكيل\n\n"
            f"{_to_html(clean_text)}"
            "\n\n➡️\n\n➡️ بعد التشكيل\n\n"
            f"{_to_html(result_text)}"
            "\n\n"
        )

    highlighted_result = _to_html(result_text, highlight=bool(highlight_mode))

    # (icon, value, label) for each statistic card, in display order.
    cards = [
        ("⚡", f"{stats.get('elapsed', 0):.2f}s", "الوقت"),
        ("📝", stats.get('words_count', 0), "كلمة"),
        ("📊", stats.get('chars_count', 0), "حرف"),
        ("✨", stats.get('diacritics_count', 0), "علامة"),
        ("🎯", "98%+", "الدقة"),
    ]
    stats_html = "\n\n📊 إحصائيات التشكيل\n\n" + "".join(
        f"{icon}\n\n{value}\n\n{label}\n\n" for icon, value, label in cards
    )

    return f"{comparison_html}\n\n{highlighted_result}\n\n{stats_html} "


# NOTE(review): the layout nesting below was reconstructed from a
# whitespace-collapsed source — confirm it matches the intended UI.
with gr.Blocks(
    title="🎯 مُشَكِّل (الدقيق 98%)",
    theme=gr.themes.Soft(),
    css=APP_CSS,
) as demo:
    gr.Markdown(HEADER_MARKDOWN)

    # Results of the last model run, kept so the display toggles can re-render
    # without running the model again.
    clean_text_state = gr.State(None)
    result_text_state = gr.State(None)
    stats_state = gr.State({})

    with gr.Row():
        with gr.Column(scale=2):
            input_text = gr.Textbox(
                label="النص",
                placeholder="أدخل النص العربي هنا (مشكول أو بدون تشكيل)...",
                lines=10,
                max_lines=20,
            )
            with gr.Row():
                show_comparison = gr.Checkbox(label="🔄 مقارنة النصين", value=False)
                highlight_diacritics = gr.Checkbox(label="🎨 تلوين الحركات", value=False)
            submit_btn = gr.Button("✨ إضافة التشكيل", variant="primary", size="lg")

    output_html = gr.HTML()
    status = gr.Textbox(label="الحالة", interactive=False)

    gr.Examples(
        [
            ["السلام عليكم ورحمة الله وبركاته"],
            ["اللغة العربية لغة القران الكريم"],
        ],
        inputs=input_text,
        label="أمثلة سريعة",
    )

    def show_loading():
        """Return the loading placeholder and status text shown before inference."""
        return LOADING_HTML, "⏳ جاري التشكيل..."

    render_inputs = [
        clean_text_state,
        result_text_state,
        stats_state,
        show_comparison,
        highlight_diacritics,
    ]

    # Show the placeholder, run the model, then render the final HTML.
    submit_btn.click(
        fn=show_loading,
        inputs=None,
        outputs=[output_html, status],
    ).then(
        fn=run_model,
        inputs=[input_text],
        outputs=[clean_text_state, result_text_state, stats_state, status],
    ).then(
        fn=generate_final_html,
        inputs=render_inputs,
        outputs=[output_html],
    )

    # Toggling a display option re-renders from the stored state only.
    show_comparison.change(
        fn=generate_final_html,
        inputs=render_inputs,
        outputs=[output_html],
    )
    highlight_diacritics.change(
        fn=generate_final_html,
        inputs=render_inputs,
        outputs=[output_html],
    )

demo.launch()