# Hugging Face Space: mahesh1209/titanic-dataset-summary (status: Sleeping)
# File size: 2,649 bytes

import gradio as gr
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import pipeline
import tempfile
import os

# Load and clean Titanic dataset
def load_and_clean_data(url=None):
    """Read the Titanic CSV and return a cleaned DataFrame.

    Cleaning steps:
      * missing ``Age`` values are replaced by the column median;
      * missing ``Embarked`` values are replaced by the most frequent value
        (left untouched when the column has no non-missing value at all);
      * the ``Cabin``, ``Ticket`` and ``Name`` columns are dropped.

    Args:
        url: Anything ``pandas.read_csv`` accepts (URL, path, or file-like
            object). Defaults to the datasciencedojo Titanic CSV on GitHub.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: If one of the expected columns is absent from the CSV.
    """
    if url is None:
        url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    df = pd.read_csv(url)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that chained in-place form is deprecated in pandas
    # 2.x and does not modify ``df`` once Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # mode() is empty when every value is missing; indexing it would raise.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    return df.drop(columns=['Cabin', 'Ticket', 'Name'])

# Generate EDA plots
def generate_plot_images(df):
    """Render each EDA plot to its own temporary PNG file.

    One figure is created per plotting function (survival count, then
    survival by gender), saved to a ``.png`` temp file and closed.

    Args:
        df: DataFrame handed unchanged to each plotting function.

    Returns:
        List of PNG file paths, in the same order as the plots above. The
        files are created with ``delete=False``, so they persist after this
        function returns and the caller is responsible for removing them.
    """
    temp_files = []
    for plot_func in (plot_survival_count, plot_survival_by_gender):
        fig, ax = plt.subplots()
        try:
            plot_func(df, ax)
            # Only the unique path is needed. Close the handle right away so
            # the descriptor is not leaked and so platforms that forbid
            # reopening an open temp file (Windows) can still write to it.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                png_path = tmp.name
            try:
                fig.savefig(png_path)
            except Exception:
                # Do not leave an empty/partial file behind on failure.
                os.remove(png_path)
                raise
            temp_files.append(png_path)
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)
    return temp_files

def plot_survival_count(df, ax):
    """Draw a count plot of the ``Survived`` column of *df* onto *ax*."""
    sns.countplot(data=df, x='Survived', ax=ax)

def plot_survival_by_gender(df, ax):
    """Draw a count plot of ``Sex``, split by ``Survived`` hue, onto *ax*."""
    sns.countplot(data=df, x='Sex', hue='Survived', ax=ax)

# Summarizer pipeline
# Built once at import time and shared by every call to summarize_eda().
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)

def summarize_eda(df):
    """Return a model-generated summary of the EDA findings for *df*.

    A short fixed description (row count and most common ``Embarked`` value
    are interpolated from *df*) is passed to the module-level ``summarizer``.

    Returns:
        The ``summary_text`` of the first summarizer result, or the string
        ``"Summarization failed: <error>"`` if anything raises along the way.
    """
    try:
        row_count = len(df)
        top_port = df['Embarked'].mode()[0]
        prompt = f"""
        Titanic dataset has {row_count} rows. Missing values in Age and Embarked were filled.
        Columns Cabin, Ticket, and Name were dropped. Females had higher survival rates.
        Most passengers embarked from {top_port}.
        """
        results = summarizer(prompt, max_length=100, min_length=30, do_sample=False)
        return results[0]['summary_text']
    except Exception as err:
        # Best-effort: report the failure as text instead of raising.
        return f"Summarization failed: {err}"

# Markdown fallback logic
def safe_markdown(df):
    """Render the first rows of *df* (``df.head()``) as text.

    Tries ``to_markdown()`` first; if that raises for any reason, falls back
    to the plain-text ``to_string()`` rendering of the same rows.
    """
    try:
        rendered = df.head().to_markdown()
    except Exception:
        # Fallback path: plain-text table instead of Markdown.
        rendered = df.head().to_string()
    return rendered

# Main app logic
def run_titanic_eda():
    """Run the whole pipeline and return the four values shown in the UI.

    Returns:
        A 4-tuple ``(data_sample, first_plot_path, second_plot_path, summary)``.
        If any step raises, the tuple is
        ``("Data load failed", None, None, "Error: <message>")`` instead.
    """
    try:
        frame = load_and_clean_data()
        image_paths = generate_plot_images(frame)
        llm_text = summarize_eda(frame)
        outputs = (safe_markdown(frame), image_paths[0], image_paths[1], llm_text)
    except Exception as err:
        # UI boundary: surface the error in the summary box rather than crash.
        outputs = ("Data load failed", None, None, f"Error: {err}")
    return outputs

# Gradio interface
# One callback with no inputs; its four return values map, in order, onto the
# four output components below.
demo = gr.Interface(
    title="🚢 Titanic EDA + LLM Summary",
    description="Cleaned dataset, EDA plots, and LLM-generated summary — all in one page.",
    fn=run_titanic_eda,
    inputs=[],
    outputs=[
        gr.Markdown(label="Cleaned Data Sample"),
        gr.Image(label="Survival Count"),
        gr.Image(label="Survival by Gender"),
        gr.Textbox(label="LLM Summary of EDA"),
    ],
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()