Spaces:

mahesh1209
/

titanic-dataset-summary

Sleeping

App Files Files Community

titanic-dataset-summary / app.py

mahesh1209

Update app.py

cc78537 verified 4 months ago

raw

history blame contribute delete

2.65 kB

	import gradio as gr
	import pandas as pd
	import seaborn as sns
	import matplotlib.pyplot as plt
	from transformers import pipeline
	import tempfile
	import os

	# Load and clean Titanic dataset
	def load_and_clean_data(
	    url="https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
	):
	    """Read the Titanic CSV and return a cleaned DataFrame.

	    Args:
	        url: Location of the CSV. Anything ``pandas.read_csv`` accepts
	            (URL, local path, or file-like object). Defaults to the public
	            datasciencedojo copy of the dataset.

	    Returns:
	        A DataFrame in which missing ``Age`` values are replaced by the
	        column median, missing ``Embarked`` values are replaced by the most
	        frequent value, and the ``Cabin``, ``Ticket`` and ``Name`` columns
	        are removed.

	    Raises:
	        Whatever ``pandas.read_csv`` raises when the source cannot be read,
	        and ``KeyError`` when an expected column is absent.
	    """
	    df = pd.read_csv(url)
	    # Assign the filled column back instead of calling fillna(inplace=True)
	    # on df[...]: the chained in-place form operates on a temporary object
	    # and leaves df untouched when pandas Copy-on-Write is active.
	    df['Age'] = df['Age'].fillna(df['Age'].median())
	    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
	    return df.drop(columns=['Cabin', 'Ticket', 'Name'])

	# Generate EDA plots
	def generate_plot_images(df):
	    """Render each EDA plot to its own PNG file and return the file paths.

	    Args:
	        df: DataFrame handed unchanged to every plotting function.

	    Returns:
	        A list of PNG file paths, one per plot, in the order survival count
	        then survival by gender. The files are created with ``delete=False``
	        and are therefore not removed automatically; cleaning them up is
	        left to the caller.
	    """
	    temp_files = []
	    for plot_func in [plot_survival_count, plot_survival_by_gender]:
	        fig, ax = plt.subplots()
	        try:
	            plot_func(df, ax)
	            # Only the unique path is needed. Closing the handle right away
	            # avoids leaking a file descriptor per plot and lets savefig
	            # reopen the path on platforms that forbid a second open.
	            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
	                path = tmp.name
	            fig.savefig(path)
	        finally:
	            # Release the figure even when plotting or saving fails.
	            plt.close(fig)
	        temp_files.append(path)
	    return temp_files

	def plot_survival_count(df, ax):
	    """Draw a count plot of the ``Survived`` column of ``df`` onto ``ax``."""
	    sns.countplot(data=df, x='Survived', ax=ax)

	def plot_survival_by_gender(df, ax):
	    """Draw a count plot of ``Sex``, split by ``Survived``, onto ``ax``."""
	    sns.countplot(data=df, x='Sex', hue='Survived', ax=ax)

	# Summarization pipeline, constructed once at module level and reused by
	# summarize_eda for every request.
	summarizer = pipeline(
	    task="summarization",
	    model="sshleifer/distilbart-cnn-12-6",
	)

	def summarize_eda(df):
	    """Return a model-written summary of a fixed description of ``df``.

	    The prompt is a canned paragraph into which the row count and the most
	    frequent ``Embarked`` value are interpolated. Any exception raised while
	    building the prompt or running the summarizer is caught and reported in
	    the returned string instead of propagating.
	    """
	    try:
	        row_count = len(df)
	        top_port = df['Embarked'].mode()[0]
	        prompt = f"""
	Titanic dataset has {row_count} rows. Missing values in Age and Embarked were filled.
	Columns Cabin, Ticket, and Name were dropped. Females had higher survival rates.
	Most passengers embarked from {top_port}.
	"""
	        results = summarizer(prompt, max_length=100, min_length=30, do_sample=False)
	        return results[0]['summary_text']
	    except Exception as err:
	        return f"Summarization failed: {err!s}"

	# Markdown rendering with a plain-text fallback
	def safe_markdown(df):
	    """Render the first rows of ``df`` as Markdown, or as plain text on failure.

	    ``df.head().to_markdown()`` is attempted first; if it raises any
	    exception, the same rows are rendered with ``to_string()`` instead.
	    """
	    try:
	        rendered = df.head().to_markdown()
	    except Exception:
	        # Markdown rendering is best-effort; fall back to the plain table.
	        rendered = df.head().to_string()
	    return rendered

	# Main app logic
	def run_titanic_eda():
	    """Run the whole pipeline and return the four values the UI displays.

	    Returns:
	        ``(data sample text, first plot path, second plot path, summary)``.
	        If any step raises, returns
	        ``("Data load failed", None, None, "Error: <message>")`` instead.
	    """
	    try:
	        frame = load_and_clean_data()
	        image_paths = generate_plot_images(frame)
	        llm_summary = summarize_eda(frame)
	        sample_table = safe_markdown(frame)
	        outcome = (sample_table, image_paths[0], image_paths[1], llm_summary)
	    except Exception as err:
	        return "Data load failed", None, None, f"Error: {err!s}"
	    else:
	        return outcome

	# Gradio interface
	# Output components, listed in the order run_titanic_eda returns its values.
	_OUTPUT_COMPONENTS = [
	    gr.Markdown(label="Cleaned Data Sample"),
	    gr.Image(label="Survival Count"),
	    gr.Image(label="Survival by Gender"),
	    gr.Textbox(label="LLM Summary of EDA"),
	]

	demo = gr.Interface(
	    fn=run_titanic_eda,
	    inputs=[],
	    outputs=_OUTPUT_COMPONENTS,
	    title="🚢 Titanic EDA + LLM Summary",
	    description="Cleaned dataset, EDA plots, and LLM-generated summary — all in one page.",
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()