# -*- coding: utf-8 -*-
"""Fine Tuned Llama 2 for Comment Analysis

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1NX5z-wVpsEp8UigB0q7vZSZMFRa6nnEE

##**Extract Youtube Comments**
"""

# !pip uninstall gradio
# !pip3 install gradio -q
# !pip install --upgrade fastapi -q
# !pip install typing-extensions --upgrade

# import locale
# locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

# !pip3 install typing-extensions==4.2.0
# !pip3 install gradio -q
# !pip3 install --upgrade tensorflow

import csv
import io

import pandas as pd
import gradio as gr
from googleapiclient.discovery import build
from PIL import Image

api_key = 'AIzaSyANfQYiumNUfJ8_YaDg-Hfr0BRXFhXnbvQ'
def video_comments(video_id):
    # Create a CSV file to store comments
    with open('comments.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Comment']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Counter to limit the number of comments
        comment_count = 0

        # Create the YouTube resource object
        youtube = build('youtube', 'v3', developerKey=api_key)

        # Retrieve the first page of comment threads for the video
        video_response = youtube.commentThreads().list(
            part='snippet,replies',
            videoId=video_id,
            maxResults=100  # Adjust the number of comments per page as needed
        ).execute()

        # Iterate over the paginated responses
        while video_response:
            # Extract the required info from each result object
            for item in video_response['items']:
                # Extract the top-level comment text
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']

                # Write the comment to the CSV file
                writer.writerow({'Comment': comment})
                comment_count += 1

                # Stop once the maximum comment count is reached
                if comment_count >= 50:
                    return

            # Fetch the next page of comments, if any
            if 'nextPageToken' in video_response:
                video_response = youtube.commentThreads().list(
                    part='snippet,replies',
                    videoId=video_id,
                    pageToken=video_response['nextPageToken'],
                    maxResults=100  # Adjust the number of comments per page as needed
                ).execute()
            else:
                break
def execution_function(video_id):
    # Fetch the comments for this video into comments.csv
    video_comments(video_id)

    # Read back the comment file created above
    # (written to the working directory; in the original Colab run this was /content/comments.csv)
    file_path = "comments.csv"
    df = pd.read_csv(file_path)

    # Rename the column to 'comments'
    df.rename(columns={'Comment': 'comments'}, inplace=True)

    # Keep only the first 10 comments for a quick analysis
    df = df.head(10)
    return df

# return_distribution()
# comments_df = execution_function("6ydFDwv-n8w")
# comments_df = comments_df.head(20)
# comments_df.head()
| """##**Fine - tune Llama 2** | |
| IMP: This notebook runs on a T4 GPU. | |
| """ | |
| # !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 | |
| import os | |
| import torch | |
| from datasets import load_dataset | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| BitsAndBytesConfig, | |
| HfArgumentParser, | |
| TrainingArguments, | |
| pipeline, | |
| logging, | |
| ) | |
| from peft import LoraConfig, PeftModel | |
| from trl import SFTTrainer | |
| # The model that you want to train from the Hugging Face hub | |
| model_name = "NousResearch/Llama-2-7b-chat-hf" | |
| # The instruction dataset to use | |
| # dataset_name = "mlabonne/guanaco-llama2-1k" | |
| # Fine-tuned model name | |
| # new_model = "llama-2-7b-miniguanaco" | |
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with the same length
# (saves memory and speeds up training considerably)
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}
# Load dataset (you can process it here)
# dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)
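
# NOTE (sketch, not executed here): the script imports SFTTrainer and defines the QLoRA,
# TrainingArguments, and SFT parameters above, but never actually runs a fine-tuning step;
# the base chat model is used as-is for inference below. If you did want to fine-tune with
# the pinned trl==0.4.7, the wiring would look roughly like the following, assuming the
# commented-out "mlabonne/guanaco-llama2-1k" dataset and "llama-2-7b-miniguanaco" output
# name mentioned above (both placeholders, not values this app uses):
#
# dataset = load_dataset("mlabonne/guanaco-llama2-1k", split="train")
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset,
#     peft_config=peft_config,
#     dataset_text_field="text",     # the guanaco-llama2-1k prompts live in a "text" column
#     max_seq_length=max_seq_length,
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing=packing,
# )
# trainer.train()
# trainer.model.save_pretrained("llama-2-7b-miniguanaco")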
def extract_between_inst_and_newline(text):
    # Pull the text between the closing [/INST] tag and the next newline
    start_tag = "[/INST]"
    end_char = "\n"

    start_index = text.find(start_tag)
    if start_index != -1:
        end_index = text.find(end_char, start_index)
        if end_index != -1:
            extracted_text = text[start_index + len(start_tag):end_index]
            return extracted_text.strip()
    return None

import re
from functools import lru_cache

def extract_classification_and_remark(output):
    # Parse lines of the form "Classification: <label>" and "Remark: <sentence>"
    classification_match = re.search(r'Classification: (.*?)\n', output)
    remark_match = re.search(r'Remark: (.*?)$', output)

    classification = classification_match.group(1) if classification_match else None
    remark = remark_match.group(1) if remark_match else None
    return classification, remark
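
# Quick sanity check (hypothetical model output, shown only to illustrate the format the
# regexes above expect):
#
#   extract_classification_and_remark("Classification: happy\nRemark: The viewer loved the video.")
#   # -> ('happy', 'The viewer loved the video.')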
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Prompt template for the text generation pipeline run with our model
prompt = '''Can you classify the human input as either happy, sad, angry, surprised, confused or neutral and tell me why it was classified as such in one short sentence.
Don't reply anything besides the classification and the remark. Separate the classification and remark with :
Human input: {}'''

def process_comment(comment):
    formatted_prompt = prompt.format(comment)
    # Llama 2 is a causal (decoder-only) model, so use the "text-generation" task
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=150)
    result = pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    extract_output = result[0]['generated_text']
    classification, remark = extract_classification_and_remark(extract_output)
    return comment, classification, remark
import matplotlib.pyplot as plt
import seaborn as sns

def return_distribution(new_formatted_df):
    # Bar chart of how many comments fall into each sentiment class
    sentiment_counts = new_formatted_df['classification'].value_counts()

    fig = plt.figure()
    sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.title('Sentiment Distribution')
    return fig

from wordcloud import WordCloud

def return_highest_sentiment_worldcloud(new_formatted_df, sentiment):
    # Create a word cloud for a specific sentiment, e.g. 'happy'
    happy_comments = new_formatted_df[new_formatted_df['classification'] == sentiment]['comments']
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(happy_comments))

    fig = plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud for the Strongest Sentiment')
    return fig
import pandas as pd

def concatenate_remarks_based_on_classification(dataset):
    # Empty dictionary to store the concatenated remarks for each classification type
    concatenated_remarks = {}

    # Iterate through the dataset and concatenate remarks per classification
    for index, row in dataset.iterrows():
        classification = row['classification']
        remarks = row['remark']

        # Check if the classification already exists in the dictionary
        if classification in concatenated_remarks:
            if remarks is not None:
                concatenated_remarks[classification] += ' ' + str(remarks)
        else:
            if remarks is not None:
                concatenated_remarks[classification] = str(remarks)

    # Create a new DataFrame with the concatenated remarks
    concatenated_remarks_df = pd.DataFrame(list(concatenated_remarks.items()), columns=['classification', 'concatenated_remarks'])
    return concatenated_remarks_df
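
# Example of the shape this produces (illustrative values only):
#
#   df = pd.DataFrame({'classification': ['happy', 'happy', 'sad'],
#                      'remark': ['Loves the song.', 'Excited for more.', 'Misses the old style.']})
#   concatenate_remarks_based_on_classification(df)
#   # -> one row per sentiment, e.g. ('happy', 'Loves the song. Excited for more.')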
# !pip install dask -q

# Prompt template for summarizing the concatenated remarks with the same model
prompt1 = '''Can you summarize the following text in a paragraph of no more than 100 words. Don't respond with anything besides the summary.
Human input: {}'''

def summarize_text(comment):
    formatted_prompt = prompt1.format(comment)
    # As above, use the causal "text-generation" task for the Llama 2 model
    new_pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=3000)
    new_result = new_pipe(f"<s>[INST] {formatted_prompt} [/INST]")
    return new_result
## Function for the first tab
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
# from multiprocessing import Pool
# num_processes = 4

# Define a Gradio interface
def sentiment_distribution_interface(video_id):
    # Call the execution function with the video_id
    return_df = execution_function(video_id)
    print(return_df.head())

    def process_row(row):  # ~3.9 s per comment
        comment, classification, remark = process_comment(row.comments)
        return comment, classification, remark

    # Classify the comments in parallel; adjust the number of workers as needed
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(process_row, return_df.itertuples()))

    print(type(results))
    print(results)
    print("__________________________________________________________________")

    # Split the (comment, classification, remark) tuples into columns
    comments, classification, remark = zip(*results)
    df = pd.DataFrame({'comments': comments, 'classification': classification, 'remark': remark})
    print(df.head())
    print("__________________________________________________________________")

    # Build the distribution plot and a word cloud for the most common sentiment
    plot = return_distribution(df)
    word_cloud = return_highest_sentiment_worldcloud(df, df['classification'].value_counts().idxmax())

    # Persist the processed comments so the second tab can reuse them
    df.to_csv('processed_comments.csv', index=False)  # index=False prevents writing row numbers as a column

    # Concatenating remarks for the different sentiments
    # concatenated_remarks_df = concatenate_remarks_based_on_classification(df)
    # print(concatenated_remarks_df)

    return plot, word_cloud
# Function for the second tab
def function_for_second_tab(input_val):
    # Reload the comments processed on the first tab
    final_analysed_df = pd.read_csv('processed_comments.csv')
    print(final_analysed_df.head())

    # Word cloud for the sentiment the user asked about
    word_cloud = return_highest_sentiment_worldcloud(final_analysed_df, input_val)

    # Summarize the remarks collected for that sentiment
    concatenated_remarks_df = concatenate_remarks_based_on_classification(final_analysed_df)
    comments = concatenated_remarks_df.loc[concatenated_remarks_df['classification'] == input_val, 'concatenated_remarks'].values[0]
    summarized_text = summarize_text(comments)
    extract_output_summary = summarized_text[0]['generated_text']
    final_extract = extract_output_summary.split('[/INST]')[1].strip()
    return word_cloud, final_extract
# Define the first tab
outputs = [gr.Plot(), gr.Plot()]
iface = gr.Interface(fn=sentiment_distribution_interface, inputs="text", outputs=outputs)

# Define the second tab
output_second_tab = [gr.Plot(), "text"]
inputs = "text"
description = "Enter the sentiment for which you want a detailed report"
app2 = gr.Interface(fn=function_for_second_tab, inputs=inputs, outputs=output_second_tab, description=description)

# Launch the app as a two-tab Gradio interface
demo = gr.TabbedInterface([iface, app2], ["Welcome page", "Visualization page"])

if __name__ == "__main__":
    demo.queue().launch()