Spaces:
Sleeping
Sleeping
File size: 2,649 Bytes
840f5fb cc78537 840f5fb cc78537 840f5fb cc78537 840f5fb cc78537 840f5fb cc78537 840f5fb cc78537 840f5fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
import gradio as gr
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import pipeline
import tempfile
import os
# Load and clean Titanic dataset
def load_and_clean_data(url="https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"):
    """Load the Titanic CSV and return a cleaned DataFrame.

    Cleaning steps:
      * missing ``Age`` values are replaced with the column median;
      * missing ``Embarked`` values are replaced with the most frequent value;
      * the ``Cabin``, ``Ticket`` and ``Name`` columns are dropped.

    Args:
        url: Source passed straight to ``pd.read_csv`` (URL, local path or
            file-like object). Defaults to the public Titanic dataset.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    df = pd.read_csv(url)

    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place form operates on an
    # intermediate object and does not update ``df`` under pandas
    # Copy-on-Write (it already emits a FutureWarning on pandas 2.x).
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # ``mode()`` is empty when every value is missing; there is nothing to
    # fill with in that case, so leave the column untouched.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    df = df.drop(columns=['Cabin', 'Ticket', 'Name'])
    return df
# Generate EDA plots
def generate_plot_images(df):
    """Render each EDA plot to its own PNG file and return the file paths.

    One figure is created per plotting function (``plot_survival_count`` and
    ``plot_survival_by_gender``), saved to a uniquely named temporary ``.png``
    file, and then closed.

    Args:
        df: DataFrame handed to each plotting function.

    Returns:
        List of file paths, one per plot, in the order the plots were drawn.
        The files are created with ``delete=False``, so they persist after
        this function returns; removing them is left to the caller.
    """
    temp_files = []
    for plot_func in [plot_survival_count, plot_survival_by_gender]:
        fig, ax = plt.subplots()
        try:
            plot_func(df, ax)
            # Reserve a unique path, then close the handle before matplotlib
            # writes to it so no file descriptor is left open.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                image_path = tmp.name
            fig.savefig(image_path)
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close(fig)
        temp_files.append(image_path)
    return temp_files
def plot_survival_count(df, ax):
    """Draw a count plot of the ``Survived`` column of ``df`` onto ``ax``."""
    sns.countplot(data=df, x='Survived', ax=ax)
def plot_survival_by_gender(df, ax):
    """Draw a count plot of ``Sex`` split by ``Survived`` onto ``ax``."""
    sns.countplot(data=df, x='Sex', hue='Survived', ax=ax)
# Summarization pipeline, built once at import time and reused by summarize_eda.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
def summarize_eda(df):
    """Return a model-generated summary of the cleaning/EDA notes for ``df``.

    A short description is built from the row count and the most frequent
    ``Embarked`` value, then passed to the module-level ``summarizer``.

    Any exception raised while building the text or running the summarizer
    is caught and reported as a ``"Summarization failed: ..."`` string
    instead of propagating.
    """
    try:
        # Pieces are evaluated left to right: row count first, then the mode.
        prompt = (
            "\n"
            f"Titanic dataset has {len(df)} rows. Missing values in Age and Embarked were filled.\n"
            "Columns Cabin, Ticket, and Name were dropped. Females had higher survival rates.\n"
            f"Most passengers embarked from {df['Embarked'].mode()[0]}.\n"
        )
        results = summarizer(prompt, max_length=100, min_length=30, do_sample=False)
        return results[0]['summary_text']
    except Exception as err:
        return f"Summarization failed: {err}"
# Markdown fallback logic
def safe_markdown(df):
    """Render the first rows of ``df`` as text, preferring Markdown.

    Tries ``to_markdown()`` on ``df.head()``; if that raises for any reason,
    falls back to the plain ``to_string()`` rendering of the same rows.
    """
    preview = df.head()
    try:
        rendered = preview.to_markdown()
    except Exception:
        rendered = preview.to_string()
    return rendered
# Main app logic
def run_titanic_eda():
    """Run the full pipeline and return the four values shown in the UI.

    Returns:
        A 4-tuple ``(table_text, first_plot_path, second_plot_path, summary)``.
        If any step raises, the tuple is
        ``("Data load failed", None, None, "Error: <message>")`` instead.
    """
    try:
        frame = load_and_clean_data()
        image_paths = generate_plot_images(frame)
        llm_summary = summarize_eda(frame)
        table_text = safe_markdown(frame)
        first_plot, second_plot = image_paths[0], image_paths[1]
    except Exception as err:
        return "Data load failed", None, None, f"Error: {err}"
    return table_text, first_plot, second_plot, llm_summary
# Gradio interface
# Output components, listed in the same order as run_titanic_eda's return values.
_output_components = [
    gr.Markdown(label="Cleaned Data Sample"),
    gr.Image(label="Survival Count"),
    gr.Image(label="Survival by Gender"),
    gr.Textbox(label="LLM Summary of EDA"),
]

# The interface takes no inputs; it simply runs the pipeline.
demo = gr.Interface(
    fn=run_titanic_eda,
    inputs=[],
    outputs=_output_components,
    title="🚢 Titanic EDA + LLM Summary",
    description="Cleaned dataset, EDA plots, and LLM-generated summary — all in one page.",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|