Spaces:

fadimari
/

tashkeel-accurate

Sleeping

App Files Files Community

tashkeel-accurate / app.py

fadimari

Update app.py

4d3a6f1 verified 18 days ago

raw

history blame contribute delete

12.8 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	import torch
	import time
	import textwrap # <--- مكتبة مهمة لتقسيم النص

	# Announce start-up on the console (the message is Arabic for
	# "loading the Fine-Tashkeel model").
	print("\n⏳ جاري تحميل نموذج Fine-Tashkeel (الدقيق)...")

	# Checkpoint identifier handed to both from_pretrained() calls below.
	model_name = "basharalrfooh/Fine-Tashkeel"
	# Inference is pinned to the CPU.
	device = torch.device("cpu")

	tokenizer = AutoTokenizer.from_pretrained(model_name)
	# Module.to() returns the module itself, so loading and placement are chained.
	model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
	# Evaluation mode: the model is only used for generation in this file.
	model.eval()

	print(f"✅ جاهز على {device}!\n")

	# Placeholder markup returned by show_loading() while a request is running:
	# an hourglass emoji rotated by the `spin` keyframe animation declared in the
	# trailing <style> element, followed by two Arabic status lines.
	LOADING_HTML = """
	<div style="text-align: center; padding: 2rem;">
	<div style="display: inline-block; animation: spin 1s linear infinite; font-size: 2.5rem;">⏳</div>
	<div style="font-size: 1.3rem; color: #667eea; margin-top: 1rem; font-weight: bold;">جاري العمل على التشكيل...</div>
	<div style="color: #999; margin-top: 0.5rem;">هذا النموذج دقيق وبطيء، نقوم بمعالجة النص جزءاً بجزء...</div>
	</div>
	<style>
	@keyframes spin {
	0% { transform: rotate(0deg); }
	100% { transform: rotate(360deg); }
	}
	</style>
	"""

	# Deletion table for str.translate: every code point in U+064B..U+0658 plus
	# U+0670 (the same fifteen marks the original list spelled out) maps to None,
	# which tells translate() to drop the character.
	_DIACRITIC_DELETIONS = dict.fromkeys([*range(0x064B, 0x0659), 0x0670])


	def remove_diacritics(text):
	    """Return ``text`` with all Arabic diacritic marks removed.

	    The marks removed are U+064B through U+0658 and U+0670. Every other
	    character, including letters, digits, whitespace and newlines, is
	    returned unchanged and in the same order.

	    Args:
	        text: The string to clean. An empty string yields an empty string.

	    Returns:
	        A new string without the diacritic characters.
	    """
	    # One pass over the string instead of one .replace() pass per mark.
	    return text.translate(_DIACRITIC_DELETIONS)

	def count_diacritics(text):
	    """Count the Arabic diacritic marks contained in ``text``.

	    The marks counted are U+064B through U+0658 and U+0670.

	    Args:
	        text: The string to inspect.

	    Returns:
	        The total number of diacritic characters found (0 if none).
	    """
	    marks = (
	        '\u064B', '\u064C', '\u064D', '\u064E', '\u064F',
	        '\u0650', '\u0651', '\u0652', '\u0653', '\u0654',
	        '\u0655', '\u0656', '\u0657', '\u0658', '\u0670',
	    )
	    total = 0
	    for mark in marks:
	        total += text.count(mark)
	    return total

	def _diacritize_chunk(chunk, max_length):
	    """Send one chunk of text through the seq2seq model and decode the output."""
	    inputs = tokenizer(
	        chunk,
	        return_tensors="pt",
	        max_length=max_length,
	        truncation=True,
	    )

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_length=max_length,  # same budget for the output as for the input
	            num_beams=1,  # no beam search: chosen for speed
	            # Kept from the original call; per the transformers docs this flag
	            # only influences beam-based decoding.
	            early_stopping=False,
	        )

	    return tokenizer.decode(outputs[0], skip_special_tokens=True)


	def run_model(text, chunk_width=250, max_length=1024):
	    """Diacritize ``text`` line by line and collect timing statistics.

	    Existing diacritics are stripped, the text is split on newlines, and each
	    non-empty line is cut into chunks that are diacritized one at a time.
	    Chunking keeps every model call short so the generated text, which is
	    longer than its input because of the added marks, has room to finish.
	    Chunks of a line are rejoined with single spaces and lines with newlines,
	    so blank lines in the input survive as blank lines in the output.

	    Args:
	        text: Raw user input; may already contain diacritics.
	        chunk_width: Maximum number of characters per chunk, passed to
	            ``textwrap.wrap``. Words are never broken, so a single word longer
	            than this becomes its own chunk.
	        max_length: Token limit applied to both tokenizer truncation and
	            generation.

	    Returns:
	        A 4-tuple ``(clean_text, result_text, stats, status_message)``.
	        On success ``stats`` holds ``elapsed``, ``words_count``,
	        ``chars_count``, ``diacritics_count`` and ``speed``. On empty input
	        or any exception, the first two items are ``None`` and ``stats`` is
	        ``{'error': message}``. Exceptions are printed with their traceback
	        and reported through the tuple rather than raised.
	    """
	    if not text or not text.strip():
	        error_msg = "❌ يرجى إدخال نص"
	        return None, None, {'error': error_msg}, error_msg

	    try:
	        start = time.time()

	        # 1. Strip any diacritics already present in the input.
	        full_clean_text = remove_diacritics(text)

	        # 2. Work line by line so the original line structure is preserved.
	        final_result_parts = []
	        for line in full_clean_text.split('\n'):
	            line = line.strip()
	            if not line:
	                final_result_parts.append("")  # keep the blank line
	                continue

	            # 3. Cut long lines into chunks of at most chunk_width characters.
	            chunks = textwrap.wrap(
	                line,
	                width=chunk_width,
	                break_long_words=False,
	                replace_whitespace=False,
	            )
	            line_result_parts = [
	                _diacritize_chunk(chunk, max_length)
	                for chunk in chunks
	                if chunk.strip()
	            ]

	            # Reassemble the chunks of this line.
	            final_result_parts.append(" ".join(line_result_parts))

	        # 4. Reassemble the whole text.
	        final_result = '\n'.join(final_result_parts)

	        elapsed = time.time() - start

	        words_count = len(full_clean_text.split())
	        stats = {
	            "elapsed": elapsed,
	            "words_count": words_count,
	            "chars_count": len(final_result),
	            "diacritics_count": count_diacritics(final_result),
	            # Words per second; guarded against a zero-length interval.
	            "speed": round(words_count / elapsed, 1) if elapsed > 0 else 0,
	        }

	        return full_clean_text, final_result, stats, "✅ تم التشكيل بنجاح!"

	    except Exception as e:
	        # UI boundary: log the failure and hand it back as a status message.
	        print(f"ERROR: {str(e)}")
	        import traceback
	        traceback.print_exc()
	        error_msg = f"❌ خطأ: {str(e)}"
	        return None, None, {'error': error_msg}, error_msg

	def generate_final_html(clean_text, result_text, stats, show_comparison, highlight_mode):
	    """Build the HTML shown in the output pane for one diacritization result.

	    All text that originates from the user or from an exception message is
	    HTML-escaped before being placed in the markup, and newlines are turned
	    into ``<br>`` so the line structure of the result stays visible.

	    Args:
	        clean_text: Input text with diacritics removed (the "before" panel).
	        result_text: Diacritized text. When empty or ``None`` no result is
	            rendered.
	        stats: Dict with ``elapsed``, ``words_count``, ``chars_count`` and
	            ``diacritics_count``, or ``{'error': message}`` after a failure.
	            ``None`` is treated as an empty dict.
	        show_comparison: When true, prepend a before/after comparison grid.
	        highlight_mode: When true, wrap every diacritic of the result in a
	            coloured ``<span>``.

	    Returns:
	        The HTML string; an error box when there is no result but ``stats``
	        carries an ``error`` entry; ``None`` when there is nothing to show.
	    """
	    # Function-scope import: `html` is not among the file's top-level imports.
	    import html

	    if not result_text:
	        if stats and 'error' in stats:
	            # Exception text can echo user input, so escape it as well.
	            error_text = html.escape(str(stats['error']))
	            return f"""
	<div style="text-align: center; padding: 2rem;">
	<div style="color: #e74c3c; font-size: 1.2rem;">{error_text}</div>
	</div>
	"""
	        return None

	    stats = stats or {}

	    def _as_html(raw):
	        # Neutralise markup characters first, then make line breaks visible;
	        # plain newlines would be collapsed by the browser.
	        return html.escape(str(raw)).replace('\n', '<br>')

	    safe_result = _as_html(result_text)

	    comparison_html = ""
	    if show_comparison:
	        safe_clean = _as_html(clean_text)
	        comparison_html = f"""
	<div style="display: grid; grid-template-columns: 1fr auto 1fr; gap: 1rem; padding: 1.5rem; background: #f8f9fa; border-radius: 15px; border: 2px solid #ffc107;">
	<div style="text-align: right;">
	<h4 style="color: #667eea; margin-bottom: 1rem;">⬅️ قبل التشكيل</h4>
	<div style="background: white; padding: 1.5rem; border-radius: 10px; border: 2px solid #ddd; font-size: 1.1rem; line-height: 2.2; direction: rtl; text-align: right;">
	{safe_clean}
	</div>
	</div>
	<div style="display: flex; align-items: center; justify-content: center; font-size: 2rem; color: #667eea;">➡️</div>
	<div style="text-align: right;">
	<h4 style="color: #28a745; margin-bottom: 1rem;">➡️ بعد التشكيل</h4>
	<div style="background: white; padding: 1.5rem; border-radius: 10px; border: 2px solid #28a745; font-size: 1.1rem; line-height: 2.2; direction: rtl; text-align: right;">
	{safe_result}
	</div>
	</div>
	</div>
	"""

	    highlighted_result = safe_result
	    if highlight_mode:
	        span_open = '<span style="color: #fff; background: #e74c3c; font-weight: bold; padding: 2px 5px; border-radius: 3px; margin: 0 2px;">'
	        # Map each diacritic (U+064B..U+0658 and U+0670) to itself wrapped in
	        # the highlight span; translate() applies all of them in one pass.
	        # Escaping never produces these characters, so doing it first is safe.
	        highlight_table = {
	            code: f'{span_open}{chr(code)}</span>'
	            for code in [*range(0x064B, 0x0659), 0x0670]
	        }
	        highlighted_result = safe_result.translate(highlight_table)

	    # Statistics tiles; the last tile ("98%+") is a fixed label, not a
	    # value computed from this run.
	    stats_html = f"""
	<div style="margin-top: 2rem; padding: 1.5rem; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); border-radius: 15px;">
	<h3 style="color: #667eea; text-align: center; margin-bottom: 1.5rem;">📊 إحصائيات التشكيل</h3>
	<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 1rem;">
	<div style="background: white; padding: 1.5rem; border-radius: 12px; text-align: center; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.2); border-left: 4px solid #667eea;">
	<div style="font-size: 2.5rem;">⚡</div>
	<div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">{stats.get('elapsed', 0):.2f}s</div>
	<div style="color: #666; font-size: 0.85rem; margin-top: 0.5rem;">الوقت</div>
	</div>
	<div style="background: white; padding: 1.5rem; border-radius: 12px; text-align: center; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.2); border-left: 4px solid #667eea;">
	<div style="font-size: 2.5rem;">📝</div>
	<div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">{stats.get('words_count', 0)}</div>
	<div style="color: #666; font-size: 0.85rem; margin-top: 0.5rem;">كلمة</div>
	</div>
	<div style="background: white; padding: 1.5rem; border-radius: 12px; text-align: center; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.2); border-left: 4px solid #667eea;">
	<div style="font-size: 2.5rem;">📊</div>
	<div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">{stats.get('chars_count', 0)}</div>
	<div style="color: #666; font-size: 0.85rem; margin-top: 0.5rem;">حرف</div>
	</div>
	<div style="background: white; padding: 1.5rem; border-radius: 12px; text-align: center; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.2); border-left: 4px solid #667eea;">
	<div style="font-size: 2.5rem;">✨</div>
	<div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">{stats.get('diacritics_count', 0)}</div>
	<div style="color: #666; font-size: 0.85rem; margin-top: 0.5rem;">علامة</div>
	</div>
	<div style="background: white; padding: 1.5rem; border-radius: 12px; text-align: center; box-shadow: 0 4px 15px rgba(102, 126, 234, 0.2); border-left: 4px solid #667eea;">
	<div style="font-size: 2.5rem;">🎯</div>
	<div style="font-size: 1.8rem; font-weight: bold; color: #667eea;">98%+</div>
	<div style="color: #666; font-size: 0.85rem; margin-top: 0.5rem;">الدقة</div>
	</div>
	</div>
	</div>
	"""

	    output = comparison_html
	    output += f"""
	<div style="margin-top: 1.5rem; padding: 1.5rem; background: #f8f9fa; border-radius: 12px; font-size: 1.2rem; line-height: 2.2; border-right: 4px solid #28a745; direction: rtl; text-align: right;">
	{highlighted_result}
	</div>
	{stats_html}
	"""

	    return output

	# Gradio interface: a text box, two display-option checkboxes and a submit
	# button feed run_model(); the rendered result goes to an HTML pane and a
	# read-only status box.
	# NOTE(review): the nesting of the `with` blocks below is not visible in this
	# copy of the file — confirm which components belong inside the Row/Column
	# before editing the layout.
	with gr.Blocks(
	title="🎯 مُشَكِّل (الدقيق 98%)",
	theme=gr.themes.Soft(),
	css="""
	body { font-family: 'Arial', sans-serif; }
	.gradio-container { direction: rtl; }
	"""
	) as demo:

	gr.Markdown("""
	# 🚀 مُشَكِّل النصوص (النموذج الدقيق 98%+)
	<p style='direction: rtl; color: #e74c3c; font-weight: bold;'>
	⚠️ تنبيه: هذا النموذج هو الأدق، ولكنه بطيء مع النصوص الطويلة.
	</p>
	""")

	# State holders for the last run_model() output. They let the checkbox
	# handlers below re-render the HTML without calling the model again.
	clean_text_state = gr.State(None)
	result_text_state = gr.State(None)
	stats_state = gr.State({})

	with gr.Row():
	with gr.Column(scale=2):
	input_text = gr.Textbox(
	label="النص",
	placeholder="أدخل النص العربي هنا (مشكول أو بدون تشكيل)...",
	lines=10,
	max_lines=20
	)

	# Display options; both start unchecked.
	with gr.Row():
	show_comparison = gr.Checkbox(label="🔄 مقارنة النصين", value=False)
	highlight_diacritics = gr.Checkbox(label="🎨 تلوين الحركات", value=False)

	submit_btn = gr.Button("✨ إضافة التشكيل", variant="primary", size="lg")

	output_html = gr.HTML()
	status = gr.Textbox(label="الحالة", interactive=False)

	# Two sample sentences that fill the input box.
	gr.Examples(
	[
	["السلام عليكم ورحمة الله وبركاته"],
	["اللغة العربية لغة القران الكريم"],
	],
	inputs=input_text,
	label="أمثلة سريعة"
	)

	# First step of the submit chain: show the spinner and a "working" status.
	def show_loading():
	return LOADING_HTML, "⏳ جاري التشكيل..."

	# Arguments for generate_final_html(), in its parameter order.
	render_inputs = [
	clean_text_state,
	result_text_state,
	stats_state,
	show_comparison,
	highlight_diacritics
	]

	# Submit chain: 1) show the loading placeholder, 2) run the model and store
	# its outputs in the state holders and status box, 3) render the final HTML
	# from those stored values.
	submit_btn.click(
	fn=show_loading,
	inputs=None,
	outputs=[output_html, status]
	).then(
	fn=run_model,
	inputs=[input_text],
	outputs=[clean_text_state, result_text_state, stats_state, status]
	).then(
	fn=generate_final_html,
	inputs=render_inputs,
	outputs=[output_html]
	)

	# Toggling either checkbox re-renders from the stored result only.
	show_comparison.change(
	fn=generate_final_html,
	inputs=render_inputs,
	outputs=[output_html]
	)

	highlight_diacritics.change(
	fn=generate_final_html,
	inputs=render_inputs,
	outputs=[output_html]
	)

	# Start the web server for the app.
	demo.launch()