# Market-Basket / app.py
# mahesh1209's picture
# Create app.py
# 1799a54 verified
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
import gradio as gr
from transformers import pipeline
# Build the summarization pipeline once at import time so that every
# request handled by the app reuses the same loaded model.
summarizer = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
# Synthetic transaction data: each inner list is one basket of purchased items.
transactions = [
    ["Milk", "Bread"],
    ["Milk", "Diapers", "Beer"],
    ["Bread", "Diapers", "Eggs"],
    ["Milk", "Bread", "Diapers", "Beer"],
    ["Bread", "Eggs"],
    ["Milk", "Eggs"],
    ["Beer", "Diapers"],
    ["Milk", "Bread", "Eggs"],
    ["Bread", "Diapers"],
    ["Milk", "Beer"],
]

# One row per basket; TransactionID is a 1-based running number.
df = pd.DataFrame(
    {
        "TransactionID": range(1, len(transactions) + 1),
        "Items": transactions,
    }
)
# One-hot encode the baskets: the binarizer learns the item vocabulary from
# the "Items" column and the resulting matrix is wrapped in a DataFrame whose
# columns are the learned item names.
mlb = MultiLabelBinarizer()
encoded = mlb.fit_transform(df["Items"])
encoded_df = pd.DataFrame(
    data=encoded,
    columns=mlb.classes_,
)
# 🧠 Clustering + LLM summary
def cluster_and_summarize(n_clusters):
    """Cluster the encoded transactions with KMeans and summarize each cluster.

    For every cluster, the five most frequent items (with their counts) are
    formatted into a sentence, which is passed through the module-level
    ``summarizer`` pipeline.

    Args:
        n_clusters: Desired number of clusters. Floats are accepted and
            rounded to the nearest integer, because a UI slider may deliver
            values such as ``3.0`` or ``3.27``.

    Returns:
        One ``"🧠 Cluster <i>: <summary>"`` paragraph per cluster, joined by
        blank lines.

    Raises:
        ValueError: If the rounded cluster count is below 1 or exceeds the
            number of transactions.
    """
    # range() and KMeans both require a true integer, so normalize first.
    n_clusters = int(round(n_clusters))
    n_samples = len(encoded_df)
    if not 1 <= n_clusters <= n_samples:
        raise ValueError(
            f"n_clusters must be between 1 and {n_samples}, got {n_clusters}"
        )

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    # Keep the labels local instead of writing a "Cluster" column into the
    # shared module-level DataFrame: overlapping requests would otherwise
    # overwrite each other's assignments while summaries are being built.
    labels = kmeans.fit_predict(encoded_df)

    summaries = []
    for i in range(n_clusters):
        # Flatten the baskets of this cluster into one item per row.
        cluster_items = df.loc[labels == i, "Items"].explode()
        top_items = cluster_items.value_counts().head(5).to_dict()
        item_list = ", ".join(f"{k} ({v})" for k, v in top_items.items())
        raw_summary = f"Cluster {i} contains transactions with top items: {item_list}"
        llm_output = summarizer(
            raw_summary, max_length=50, min_length=10, do_sample=False
        )[0]["summary_text"]
        summaries.append(f"🧠 Cluster {i}: {llm_output}")
    return "\n\n".join(summaries)
# 🚀 Gradio UI
# The interface is bound to a name so it can be referenced after construction
# (e.g. by tooling that looks for a module-level app object).
demo = gr.Interface(
    fn=cluster_and_summarize,
    # step=1 keeps the slider on whole numbers: the handler feeds this value
    # to range() and KMeans(n_clusters=...), both of which need an integer.
    inputs=gr.Slider(2, 5, value=3, step=1, label="Number of Clusters"),
    outputs="text",
    title="🛒 Market Basket Clustering + LLM Summary",
    description="Fast, error-free clustering of synthetic transactions with LLM-powered summaries.",
    cache_examples=False,
)
demo.launch()