Spaces:

Raheel31
/

Marks.Guitar-Assistant

Sleeping

File size: 13,237 Bytes

c31d1ca

import os
import re
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from logger import get_logger # pylint: disable=import-error


# Module-level logger; the helpers in this file use it to report errors before re-raising.
logger = get_logger(__name__)

def clean_string(vars : list) -> list: # pylint: disable=redefined-builtin
    """
    Normalise strings into lowercase, underscore-separated identifiers.

    Each value is stripped and lowercased, every character other than
    ``a-z``, ``0-9`` and whitespace is removed, and each remaining run of
    whitespace is replaced by a single underscore.

    Args:
        vars (list): Strings to normalise; a single string is treated as a
            one-element list.

    Returns:
        list: Normalised strings, in input order.

    Raises:
        Exception: Any error raised while processing is logged and re-raised.
    """
    try:
        items = [vars] if isinstance(vars, str) else vars
        return [
            re.sub(r'\s+', '_', re.sub(r'[^a-z0-9\s]', '', item.strip().lower()))
            for item in items
        ]
    except Exception as e:
        logger.error("Error cleaning string : %s", e)
        raise

def mandatory_column_check(df : pd.DataFrame, columns : list) -> bool:
    """
    Check whether every mandatory column is present in the dataframe.

    Both the requested names and the dataframe headers are normalised with
    ``clean_string`` before comparison, so the match ignores case,
    punctuation and surrounding whitespace.

    Args:
        df (pd.DataFrame): Dataframe whose headers are inspected.
        columns (list): Mandatory column names; a single string is accepted.

    Returns:
        bool: True when all mandatory columns are present, False otherwise.

    Raises:
        Exception: Any error raised while checking is logged and re-raised.
    """
    try:
        if isinstance(columns, str):
            columns = [columns]
        # clean_string already lowercases its output, so no extra case folding
        # is needed; a set built once gives constant-time lookups.
        available = set(clean_string(vars=df.columns.to_list()))
        missing_col = [col for col in clean_string(vars=columns)
                       if col not in available]
        if missing_col:
            logger.warning("Missing mandatory columns : %s", missing_col)
            return False
        return True
    except Exception as e:
        logger.error("Error checking for mandatory columns : %s", e)
        raise

def drop_dataframe_columns(df : pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Return a copy of the dataframe without the given columns.

    Args:
        df (pd.DataFrame): Source dataframe; it is not modified.
        columns (list): Labels of the columns to remove.

    Returns:
        pd.DataFrame: New dataframe lacking the requested columns.

    Raises:
        Exception: Any error from pandas (for example an unknown label) is
            logged and re-raised.
    """
    try:
        return df.drop(labels=columns, axis=1)
    except Exception as e:
        logger.error("Error droping columns : %s", e)
        raise

def dataframe_join(df1 :pd.DataFrame, df2 : pd.DataFrame,
                   join_column: str) -> pd.DataFrame:
    """
    Inner-join two dataframes on a column after normalising their headers.

    Every column name of both frames, and ``join_column`` itself, is passed
    through ``clean_string`` so that the join key matches even when the two
    sources spell it differently. The renaming is done on copies, so the
    dataframes passed in by the caller keep their original headers.

    Args:
        df1 (pd.DataFrame): Left dataframe.
        df2 (pd.DataFrame): Right dataframe.
        join_column (str): Name of the column to join on (normalised before use).

    Returns:
        pd.DataFrame: Inner join of both dataframes with normalised headers.

    Raises:
        Exception: Any error raised while renaming or merging is logged and
            re-raised.
    """
    try:
        formatted_join_col = clean_string(join_column)[0]
        normalised_frames = []
        for frame in (df1, df2):
            original_names = frame.columns.to_list()
            # Build the whole mapping once instead of renaming column by column.
            name_map = dict(zip(original_names, clean_string(original_names)))
            # rename() without inplace returns a new frame and leaves the input alone.
            normalised_frames.append(frame.rename(columns=name_map))
        left, right = normalised_frames
        return pd.merge(left, right, on=formatted_join_col, how='inner')
    except Exception as e:
        logger.error("Error merging dataframes : %s", e)
        raise

def marks_custom_encoder(df : pd.DataFrame) -> pd.DataFrame:
    """
    Derive chord and length features for the songs dataframe.

    The ``chords`` column is stripped of ``<...>`` tags and split on
    whitespace. The following columns are added to ``df`` in place:

    * ``formatted_chords``: tag-free, single-spaced chord string.
    * ``song_length_temp``: character length of ``formatted_chords``.
    * ``song_length``: ``song_length_temp`` min-max scaled to ``[0, 15]``
      (``0.0`` for every row when all lengths are equal).
    * ``distinct_chords``: set of unique chords in the row.
    * ``barre_chords_metric``: number of distinct chords found in the barre
      chord list, multiplied by 2.
    * ``major_minor_chords_metric``: number of distinct chords found in the
      major/minor chord list.
    * ``special_chords``: number of distinct chords that match neither list
      (ignoring case), multiplied by 3.

    Args:
        df (pd.DataFrame): Dataframe with a string ``chords`` column.

    Returns:
        pd.DataFrame: The same dataframe object with the feature columns added.

    Raises:
        Exception: Any error raised while encoding is logged and re-raised.
    """
    try:
        barre_chords = {"F", "F♯", "G♯", "A♯", "B", "Fm", "F♯m", "Gm", "G♯m", "A♯m",
                        "Bm", "Bb", "Cm", "C♯m", "D♯m", "F7", "F♯7", "G♯7", "A♯7",
                        "C♯7", "D♯7", "Fm7", "F♯m7", "Gm7", "G♯m7", "Am7", "A♯m7",
                        "Bm7", "Cm7", "C♯m7", "Dm7", "D♯m7", "E7"}
        major_minor_chords = {"A", "C", "D", "E", "G", "Am", "Dm", "Em"}
        # "Special" is decided case-insensitively, whereas the two metrics
        # above it use exact (case-sensitive) matches.
        known_chords_lower = {chord.lower() for chord in barre_chords | major_minor_chords}

        df['formatted_chords'] = df['chords'].apply(lambda x:
            re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', x)).strip())

        df['song_length_temp'] = df['formatted_chords'].map(len)
        shortest = df['song_length_temp'].min()
        length_range = df['song_length_temp'].max() - shortest
        if length_range > 0:
            df['song_length'] = ((df['song_length_temp'] - shortest) / length_range) * 15
        else:
            # All rows share one length (or there is at most one row): min-max
            # scaling would divide by zero and fill the column with NaN.
            df['song_length'] = 0.0

        df['distinct_chords'] = df["formatted_chords"].apply(lambda x: set(x.split()))
        df['barre_chords_metric'] = (df['distinct_chords'].apply(lambda x:
            sum(chord in barre_chords for chord in x)) * 2)
        df['major_minor_chords_metric'] = df['distinct_chords'].apply(lambda x:
            sum(chord in major_minor_chords for chord in x))
        df['special_chords'] = (df['distinct_chords'].apply(lambda x:
            sum(chord.lower() not in known_chords_lower for chord in x)) * 3)
        return df
    except Exception as e:
        logger.error("Error one hot encoding data : %s", e)
        raise

def exercise_custom_encoder(df: pd.DataFrame)-> pd.DataFrame :
    """
    Derive chord and tempo features for the exercise dataframe.

    ``chord_progression`` is treated as a comma-separated list of chord
    names. The metrics are computed over the distinct chords of each row,
    mirroring ``marks_custom_encoder``. The following columns are added to
    ``df`` in place:

    * ``barre_chords_metric``: number of distinct chords found in the barre
      chord list, multiplied by 2.
    * ``major_minor_chords_metric``: number of distinct chords found in the
      major/minor chord list.
    * ``special_chords``: number of distinct chords that match neither list
      (ignoring case), multiplied by 3.
    * ``tempo_mattric``: ``(tempo - 40) / (200 - 40)``.

    Args:
        df (pd.DataFrame): Dataframe with a string ``chord_progression``
            column and a numeric ``tempo`` column.

    Returns:
        pd.DataFrame: The same dataframe object with the feature columns added.

    Raises:
        Exception: Any error raised while encoding is logged and re-raised.
    """
    try:
        barre_chords = {"F", "F♯", "G♯", "A♯", "B", "Fm", "F♯m", "Gm", "G♯m", "A♯m",
                        "Bm", "Bb", "Cm", "C♯m", "D♯m", "F7", "F♯7", "G♯7", "A♯7",
                        "C♯7", "D♯7", "Fm7", "F♯m7", "Gm7", "G♯m7", "Am7", "A♯m7",
                        "Bm7", "Cm7", "C♯m7", "Dm7", "D♯m7", "E7"}
        major_minor_chords = {"A", "C", "D", "E", "G", "Am", "Dm", "Em"}
        # "Special" is decided case-insensitively, whereas the two metrics
        # above it use exact (case-sensitive) matches.
        known_chords_lower = {chord.lower() for chord in barre_chords | major_minor_chords}

        # Split each progression into chord names first. Iterating the raw
        # string would walk it character by character, so "Am" would be
        # counted as "A" and multi-character chords would never match.
        distinct_chords = df['chord_progression'].apply(lambda x:
            {chord.strip() for chord in x.split(',') if chord.strip()})

        df['barre_chords_metric'] = (distinct_chords.apply(lambda x:
            sum(chord in barre_chords for chord in x)) * 2)
        df['major_minor_chords_metric'] = distinct_chords.apply(lambda x:
            sum(chord in major_minor_chords for chord in x))
        df['special_chords'] = (distinct_chords.apply(lambda x:
            sum(chord.lower() not in known_chords_lower for chord in x)) * 3)
        # The misspelt column name is kept on purpose: the rest of the
        # pipeline refers to 'tempo_mattric' / 'tempo_mattric_scaled'.
        df['tempo_mattric'] = ((df['tempo'] - 40) / (200 - 40))
        return df
    except Exception as e:
        logger.error("Error encoding exercise data : %s", e)
        raise

def get_universal_chords (df: pd.DataFrame) -> list:
    """
    Collect every unique chord that appears in the dataframe.

    ``<...>`` tags are removed from the ``chords`` column and whitespace is
    collapsed; the cleaned text is stored in a ``formatted_chords`` column on
    ``df`` (added or overwritten in place) and then split on whitespace.

    The result is returned as a sorted list so that its order is the same on
    every run. A set of strings iterates in an order that can change between
    interpreter runs, which would shift the positions of any vector built by
    iterating over it.

    Args:
        df (pd.DataFrame): Dataframe with a string ``chords`` column.

    Returns:
        list: Sorted list of all unique chords.

    Raises:
        Exception: Any error raised while extracting is logged and re-raised.
    """
    try:
        df['formatted_chords'] = df['chords'].apply(lambda x:
            re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', x)).strip())
        unique_chords = {chord for row in df['formatted_chords'] for chord in row.split()}
        return sorted(unique_chords)
    except Exception as e:
        logger.error("Error getting universal chords list : %s", e)
        raise

def chords_to_vector(chord_list, universal_chords):
    """
    Convert a collection of chords into a binary vector.

    Args:
        chord_list: Container of the chords that are present; anything that
            supports the ``in`` operator works.
        universal_chords: Iterable of all known chords. Its iteration order
            defines the position of each chord in the output.

    Returns:
        list: One entry per element of ``universal_chords``: 1 when that
        chord is in ``chord_list``, otherwise 0.

    Raises:
        Exception: Any error raised while building the vector is logged and
            re-raised.
    """
    try:
        return [1 if chord in chord_list else 0 for chord in universal_chords]
    except Exception as e:
        # The placeholder must be %s; the previous "5s" left the argument
        # unformatted and made the logging call itself fail to render.
        logger.error("Error in coverting chord to vector : %s", e)
        raise
    
def scaler_function(df:pd.DataFrame, columns:list)-> pd.DataFrame:
    """
    Min-max scale the given columns and append the results to the dataframe.

    A ``MinMaxScaler`` is fitted on ``df[columns]``. For each input column a
    new ``<name>_scaled`` column is appended; the original columns are kept.

    Args:
        df (pd.DataFrame): Dataframe holding the columns to scale.
        columns (list): Names of the columns to scale; a single string is
            accepted.

    Returns:
        pd.DataFrame: New dataframe with the scaled columns appended. Its
        index is reset to a default integer index.

    Raises:
        Exception: Any error raised while scaling is logged and re-raised.
    """
    try:
        if isinstance(columns, str):
            columns = [columns]
        scaled_values = MinMaxScaler().fit_transform(df[columns])
        # The scaled frame gets a default integer index, so the source frame
        # is re-indexed the same way to make the rows line up in concat.
        df_scaled = pd.DataFrame(scaled_values, columns=[c + "_scaled" for c in columns])
        return pd.concat([df.reset_index(drop=True), df_scaled], axis=1)
    except Exception as e:
        # The placeholder must be %s; the previous "5s" left the argument
        # unformatted and made the logging call itself fail to render.
        logger.error("Error in scaling columns: %s", e)
        raise

def create_feature_vector(df:pd.DataFrame, columns:list)-> pd.DataFrame:
    """
    Build the final feature vector for every row.

    Each row's ``chord_vector`` is extended with the row's values from
    ``columns`` and stored in a new ``feature_vector`` column on ``df``.

    Args:
        df (pd.DataFrame): Dataframe with a ``chord_vector`` column and the
            columns named in ``columns``.
        columns (list): Names of the extra feature columns to append, in
            order; a single string is accepted.

    Returns:
        pd.DataFrame: The same dataframe object with ``feature_vector`` added.

    Raises:
        Exception: Any error raised while building is logged and re-raised.
    """
    try:
        # Accept a bare column name, like the other helpers in this module;
        # otherwise the string would be iterated character by character.
        if isinstance(columns, str):
            columns = [columns]
        # list() makes the concatenation work for tuple/array chord vectors
        # as well; on an array "+" would mean element-wise addition.
        df['feature_vector'] = df.apply(
            lambda row: list(row['chord_vector']) + [row[col] for col in columns],
            axis=1)
        return df
    except Exception as e:
        # The placeholder must be %s; the previous "5s" left the argument
        # unformatted and made the logging call itself fail to render.
        logger.error("Error in creating feature vectors: %s", e)
        raise

def exercise_build_vector(row):
    """
    Build the feature vector for one row of the exercise dataframe.

    Args:
        row: Mapping-like row (for example a ``pd.Series``) providing
            ``chord_vector`` and the four scaled feature columns listed below.

    Returns:
        list: The chord vector followed by the scaled barre, major/minor,
        special-chord and tempo features, each converted to ``float``.

    Raises:
        Exception: Any error raised while building is logged and re-raised.
    """
    try:
        feature_cols = ['barre_chords_metric_scaled','major_minor_chords_metric_scaled',
                        'special_chords_scaled','tempo_mattric_scaled']
        chord_vec = list(row['chord_vector'])
        extra = [float(row[col]) for col in feature_cols]
        return chord_vec + extra
    except Exception as e:
        # The placeholder must be %s; the previous "5s" left the argument
        # unformatted and made the logging call itself fail to render.
        logger.error("Error in building feature vector for exercise df: %s", e)
        raise

def marks_build_vector(row):
    """
    Build the feature vector for one row of the marks (songs) dataframe.

    Args:
        row: Mapping-like row (for example a ``pd.Series``) providing
            ``chord_vector`` and the four scaled feature columns listed below.

    Returns:
        list: The chord vector followed by the scaled barre, major/minor,
        special-chord and song-length features, each converted to ``float``.

    Raises:
        Exception: Any error raised while building is logged and re-raised.
    """
    try:
        feature_cols = ['barre_chords_metric_scaled','major_minor_chords_metric_scaled',
                        'special_chords_scaled','song_length_scaled']
        chord_vec = list(row['chord_vector'])
        extra = [float(row[col]) for col in feature_cols]
        return chord_vec + extra
    except Exception as e:
        # Two fixes to the message: "c\building" contained a backspace escape
        # (\b), and the "5s" placeholder left the argument unformatted.
        logger.error("Error in building feature vector for marks df: %s", e)
        raise


if __name__ == '__main__':
    # Script entry point: read the raw datasets, engineer the features and
    # write the processed song and exercise tables as parquet files.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    song_data_path = os.path.join(base_dir, '..','data','raw','songs_data.parquet')
    spotify_data_path = os.path.join(base_dir,'..','data','raw','spotify_tracks.parquet')
    exercise_data_path = os.path.join(base_dir, '..','data','raw','chord_exercises.csv')
    processed_dir = os.path.join(base_dir, '..', 'data', 'processed')

    song_data = pd.read_parquet(song_data_path)
    spotify_data = pd.read_parquet(spotify_data_path)
    exercise_data = pd.read_csv(exercise_data_path)

    # Join songs with their Spotify metadata, then drop the columns that are
    # not used for feature engineering. The names below are in the form that
    # dataframe_join leaves them in after normalising the headers.
    marks_data = dataframe_join(song_data, spotify_data, join_column='spotify_song_id')
    marks_data = drop_dataframe_columns(marks_data, columns=['id','releasedate','decade',
                                                             'rockgenre','artistid',
                                                             'spotifysongid','spotifyartistid'])

    marks_data_ohe = marks_custom_encoder(df=marks_data)
    exercise_data_ohe = exercise_custom_encoder(exercise_data)

    # Both datasets are vectorised against the chord vocabulary of the songs
    # so that their chord vectors share the same positions.
    all_chords = get_universal_chords(marks_data_ohe)
    marks_data_ohe['chord_vector'] = marks_data_ohe['distinct_chords'].apply(
        lambda x: chords_to_vector(x, all_chords))
    exercise_data_ohe['chord_vector'] = exercise_data_ohe['chord_progression'].apply(
        lambda x: chords_to_vector(x.split(','), all_chords)
    )

    exercise_data_final = scaler_function(
        df = exercise_data_ohe, columns=['barre_chords_metric',
                                         'major_minor_chords_metric',
                                         'special_chords','tempo_mattric'])
    exercise_data_final['feature_vector'] = exercise_data_final.apply(exercise_build_vector, axis=1)
    marks_data_final = scaler_function(df = marks_data_ohe, columns=
                                       ['barre_chords_metric', 'major_minor_chords_metric',
                                        'special_chords','song_length'])
    marks_data_final['feature_vector'] = marks_data_final.apply(marks_build_vector, axis=1)

    # Create the output directory if it is missing so the writes below do not
    # fail on a fresh checkout.
    os.makedirs(processed_dir, exist_ok=True)
    marks_data_final.to_parquet(os.path.join(processed_dir, 'marks_data.parquet'))
    exercise_data_ohe_path = os.path.join(processed_dir, 'chord_exercises.parquet')
    exercise_data_final.to_parquet(exercise_data_ohe_path)