Spaces:

fadimari
/

tashkeel-accurate

Sleeping

App Files Community

fadimari committed 19 days ago

Commit

13e1692

verified ·

1 Parent(s): 2c70247

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -30

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 import time
 print("\n⏳ جاري تحميل نموذج Fine-Tashkeel (الدقيق)...")
@@ -19,7 +20,7 @@ LOADING_HTML = """
 <div style="text-align: center; padding: 2rem;">
     <div style="display: inline-block; animation: spin 1s linear infinite; font-size: 2.5rem;">⏳</div>
     <div style="font-size: 1.3rem; color: #667eea; margin-top: 1rem; font-weight: bold;">جاري العمل على التشكيل...</div>
-    <div style="color: #999; margin-top: 0.5rem;">هذا النموذج دقيق وبطيء (20-30 ثانية)، يرجى الانتظار</div>
 </div>
 <style>
     @keyframes spin {
@@ -47,6 +48,7 @@ def count_diacritics(text):
     ]
     return sum(text.count(d) for d in diacritics)
 def run_model(text):
     if not text or not text.strip():
         error_msg = "❌ يرجى إدخال نص"
@@ -56,42 +58,53 @@ def run_model(text):
     try:
         start = time.time()
-        # 1. تنظيف النص الأصلي
         full_clean_text = remove_diacritics(text)
-        # 2. تقسيم النص إلى فقرات بناءً على الأسطر الجديدة
-        # هذا يضمن أننا لا نرسل نصاً طويلاً جداً دفعة واحدة
         lines = full_clean_text.split('\n')
         final_result_parts = []
-        # 3. معالجة كل سطر لوحده
         for line in lines:
-            if not line.strip():
-                final_result_parts.append(line) # الحفاظ على الأسطر الفارغة
                 continue
-            # تجهيز المدخلات للسطر الحالي
-            inputs = tokenizer(
-                line,
-                return_tensors="pt",
-                max_length=1024,
-                truncation=True, # سيقطع السطر فقط إذا كان السطر الواحد أطول من 1024 توكن (نادر جداً)
-                padding="max_length"
-            )
-            with torch.no_grad():
-                outputs = model.generate(
-                    **inputs,
                     max_length=1024,
-                    num_beams=1,
-                    early_stopping=True
                 )
-            # فك التشفير للسطر الحالي
-            chunk_result = tokenizer.decode(outputs[0], skip_special_tokens=True)
-            final_result_parts.append(chunk_result)
-        # 4. تجميع النتائج مرة أخرى
         final_result = '\n'.join(final_result_parts)
         elapsed = time.time() - start
@@ -183,11 +196,6 @@ def generate_final_html(clean_text, result_text, stats, show_comparison, highlig
                 <div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">{stats.get('diacritics_count', 0)}</div>
                 <div style="color: #666; font-size: 0.85rem; margin-top: 0.5rem;">علامة</div>
             </div>
-            <div style="background: white; padding: 1.5rem; border-radius: 12px; text-align: center; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.2); border-left: 4px solid #667eea;">
-                <div style="font-size: 2.5rem;">🚀</div>
-                <div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">{stats.get('speed', 0)}</div>
-                <div style="color: #666; font-size: 0.85rem; margin-top: 0.5rem;">كلمة/ثانية</div>
-            </div>
             <div style="background: white; padding: 1.5rem; border-radius: 12px; text-align: center; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.2); border-left: 4px solid #667eea;">
                 <div style="font-size: 2.5rem;">🎯</div>
                 <div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">98%+</div>

 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
 import time
+import textwrap  # <--- مكتبة مهمة لتقسيم النص
 print("\n⏳ جاري تحميل نموذج Fine-Tashkeel (الدقيق)...")
 <div style="text-align: center; padding: 2rem;">
     <div style="display: inline-block; animation: spin 1s linear infinite; font-size: 2.5rem;">⏳</div>
     <div style="font-size: 1.3rem; color: #667eea; margin-top: 1rem; font-weight: bold;">جاري العمل على التشكيل...</div>
+    <div style="color: #999; margin-top: 0.5rem;">هذا النموذج دقيق وبطيء، نقوم بمعالجة النص جزءاً بجزء...</div>
 </div>
 <style>
     @keyframes spin {
     ]
     return sum(text.count(d) for d in diacritics)
+# --- دالة التشكيل المعدلة (الحل هنا) ---
 def run_model(text):
     if not text or not text.strip():
         error_msg = "❌ يرجى إدخال نص"
     try:
         start = time.time()
+        # 1. تنظيف النص
         full_clean_text = remove_diacritics(text)
+        # 2. تقسيم النص الأصلي حسب الأسطر للحفاظ على الهيكلية
         lines = full_clean_text.split('\n')
         final_result_parts = []
+        # 3. معالجة كل سطر
         for line in lines:
+            line = line.strip()
+            if not line:
+                final_result_parts.append("") # سطر فارغ
                 continue
+            # --- التعديل الجوهري: تقسيم السطر الطويل إلى قطع صغيرة ---
+            # نقسم السطر إلى أجزاء طولها 600 حرف تقريباً
+            # هذا يضمن أن النموذج يملك مساحة كافية لإضافة الحركات
+            chunks = textwrap.wrap(line, width=600, break_long_words=False, replace_whitespace=False)
+            line_result_parts = []
+            for chunk in chunks:
+                if not chunk.strip():
+                    continue
+                inputs = tokenizer(
+                    chunk,
+                    return_tensors="pt",
                     max_length=1024,
+                    truncation=True
                 )
+                with torch.no_grad():
+                    outputs = model.generate(
+                        **inputs,
+                        max_length=1024,  # نعطيه مساحة كاملة
+                        num_beams=1,      # سرعة أكبر
+                        early_stopping=False # لا تتوقف حتى تنتهي تماماً
+                    )
+                chunk_result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+                line_result_parts.append(chunk_result)
+            # تجميع أجزاء السطر الواحد
+            final_result_parts.append(" ".join(line_result_parts))
+        # 4. تجميع النص النهائي
         final_result = '\n'.join(final_result_parts)
         elapsed = time.time() - start
                 <div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">{stats.get('diacritics_count', 0)}</div>
                 <div style="color: #666; font-size: 0.85rem; margin-top: 0.5rem;">علامة</div>
             </div>
             <div style="background: white; padding: 1.5rem; border-radius: 12px; text-align: center; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.2); border-left: 4px solid #667eea;">
                 <div style="font-size: 2.5rem;">🎯</div>
                 <div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">98%+</div>