"""Market-basket clustering demo.

One-hot encodes a small synthetic set of shopping baskets, groups them with
KMeans, and asks a summarization model to phrase a short description of the
most frequent items in each cluster. The result is served through a Gradio UI.
"""

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
import gradio as gr
from transformers import pipeline

# Emoji are spelled as named escapes so the source stays pure ASCII and cannot
# be corrupted again by a wrong-codec round trip.
_BRAIN = "\N{BRAIN}"
_CART = "\N{SHOPPING TROLLEY}"

# Load the summarizer once at import time; every request reuses this instance.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Synthetic transaction data: one list of purchased items per basket.
transactions = [
    ["Milk", "Bread"],
    ["Milk", "Diapers", "Beer"],
    ["Bread", "Diapers", "Eggs"],
    ["Milk", "Bread", "Diapers", "Beer"],
    ["Bread", "Eggs"],
    ["Milk", "Eggs"],
    ["Beer", "Diapers"],
    ["Milk", "Bread", "Eggs"],
    ["Bread", "Diapers"],
    ["Milk", "Beer"],
]
df = pd.DataFrame(
    {"TransactionID": range(1, len(transactions) + 1), "Items": transactions}
)

# One-hot encode the baskets: one row per transaction, one column per item.
mlb = MultiLabelBinarizer()
encoded = mlb.fit_transform(df["Items"])
encoded_df = pd.DataFrame(encoded, columns=mlb.classes_)


def _describe_cluster(cluster_id, items):
    """Build the plain-text sentence listing a cluster's five most common items."""
    top_items = items.explode().value_counts().head(5)
    listing = ", ".join(f"{name} ({count})" for name, count in top_items.items())
    return f"Cluster {cluster_id} contains transactions with top items: {listing}"


def cluster_and_summarize(n_clusters):
    """Cluster the encoded baskets and return one LLM summary per cluster.

    Args:
        n_clusters: Number of KMeans clusters. UI sliders may deliver this as
            a float, so it is coerced to ``int`` before use.

    Returns:
        A single string with one summary paragraph per cluster, separated by
        blank lines.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 1 or larger than the
            number of transactions.
    """
    # range() and KMeans both require a true integer.
    n_clusters = int(n_clusters)
    if not 1 <= n_clusters <= len(encoded_df):
        raise ValueError(
            f"n_clusters must be between 1 and {len(encoded_df)}, got {n_clusters}"
        )

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init="auto")
    labels = kmeans.fit_predict(encoded_df)
    # Still published on the module-level frame for anything that inspects it,
    # but the summaries below read the local ``labels`` so a concurrent request
    # overwriting the column cannot change this call's output.
    df["Cluster"] = labels

    summaries = []
    for i in range(n_clusters):
        cluster_items = df.loc[labels == i, "Items"]
        if cluster_items.empty:
            # Nothing to summarize; avoid feeding the model an empty listing.
            summaries.append(f"{_BRAIN} Cluster {i}: (no transactions)")
            continue
        raw_summary = _describe_cluster(i, cluster_items)
        llm_output = summarizer(
            raw_summary, max_length=50, min_length=10, do_sample=False
        )[0]["summary_text"]
        summaries.append(f"{_BRAIN} Cluster {i}: {llm_output}")
    return "\n\n".join(summaries)


# Gradio UI. ``step=1`` keeps the slider on whole numbers of clusters.
demo = gr.Interface(
    fn=cluster_and_summarize,
    inputs=gr.Slider(2, 5, value=3, step=1, label="Number of Clusters"),
    outputs="text",
    title=f"{_CART} Market Basket Clustering + LLM Summary",
    description="Fast, error-free clustering of synthetic transactions with LLM-powered summaries.",
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()