Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import pandas as pd | |
| from sklearn.preprocessing import MinMaxScaler | |
| from logger import get_logger # pylint: disable=import-error | |
| logger = get_logger(__name__) | |
def clean_string(vars: list) -> list:  # pylint: disable=redefined-builtin
    """
    Function to normalise strings into lowercase, underscore-separated tokens
    Each value is stripped, lower-cased, has every character other than
    a-z, 0-9 and whitespace removed, and finally has each run of
    whitespace replaced by a single underscore.
    Args:
        vars (list): strings to normalise; a single string is treated as a
            one-element list
    Returns:
        list: normalised strings, in the same order as the input
    """
    try:
        items = [vars] if isinstance(vars, str) else vars
        return [
            re.sub(r'\s+', '_', re.sub(r'[^a-z0-9\s]', '', item.strip().lower()))
            for item in items
        ]
    except Exception as e:
        logger.error("Error cleaning string : %s", e)
        raise
def mandatory_column_check(df: pd.DataFrame, columns: list) -> bool:
    """
    Function to check if the given columns are present in the dataframe
    Both the requested names and the dataframe headers are normalised with
    clean_string before comparison, so the check ignores case, punctuation
    and surrounding whitespace.
    Args:
        df (pd.DataFrame): Dataframe to check for columns presence
        columns (list): Mandatory columns to check for; a single string is
            treated as a one-element list
    Returns:
        bool: True when every mandatory column is present, False otherwise
    """
    try:
        if isinstance(columns, str):
            columns = [columns]
        mandatory_columns = clean_string(vars=columns)
        # Build the lookup once; clean_string already lower-cases its output.
        df_headers = set(clean_string(vars=df.columns.to_list()))
        missing_col = [col for col in mandatory_columns if col not in df_headers]
        if missing_col:
            # Report which columns are absent so a False result is actionable.
            logger.warning("Missing mandatory columns : %s", missing_col)
        return not missing_col
    except Exception as e:
        logger.error("Error checking for mandatory columns : %s", e)
        raise
def drop_dataframe_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Function to remove the given columns from a dataframe
    The input dataframe is left untouched; a new dataframe is returned.
    Args:
        df (pd.DataFrame): Source dataframe
        columns (list): Labels of the columns to remove
    Returns:
        pd.DataFrame: Dataframe without the listed columns
    """
    try:
        return df.drop(columns=columns)
    except Exception as e:
        logger.error("Error droping columns : %s", e)
        raise
def dataframe_join(df1: pd.DataFrame, df2: pd.DataFrame,
                   join_column: str) -> pd.DataFrame:
    """
    Function to inner-join two dataframes on a particular column
    Column names of both dataframes and the join column name are
    normalised with clean_string before merging, so the join key only has
    to match after normalisation. The inputs are not modified; the merge
    is performed on renamed copies.
    Args:
        df1 (pd.DataFrame): Left dataframe
        df2 (pd.DataFrame): Right dataframe
        join_column (str): Join column
    Returns:
        pd.DataFrame: Inner join of both dataframes, with normalised
            column names
    """
    try:
        formatted_join_col = clean_string(join_column)[0]
        # rename() without inplace returns a copy, so the caller's frames
        # keep their original headers.
        left = df1.rename(
            columns=dict(zip(df1.columns, clean_string(df1.columns.to_list()))))
        right = df2.rename(
            columns=dict(zip(df2.columns, clean_string(df2.columns.to_list()))))
        return pd.merge(left, right, on=formatted_join_col, how='inner')
    except Exception as e:
        logger.error("Error merging dataframes : %s", e)
        raise
def marks_custom_encoder(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to derive chord-based metrics for the songs dataframe
    The following columns are added to df (df is modified in place and
    also returned):
        formatted_chords: 'chords' with "<...>" tags removed and
            whitespace collapsed
        song_length_temp: character length of formatted_chords
        song_length: song_length_temp min-max scaled to the range 0-15
            (0.0 for every row when all lengths are equal)
        distinct_chords: set of whitespace-separated chords in the row
        barre_chords_metric: number of distinct barre chords * 2
        major_minor_chords_metric: number of distinct open major/minor chords
        special_chords: number of distinct chords in neither list * 3
    Args:
        df (pd.DataFrame): Dataframe with a string 'chords' column
    Returns:
        pd.DataFrame: The same dataframe with the metric columns added
    """
    try:
        barre_chords_temp = ["F", "F♯", "G♯", "A♯", "B", "Fm", "F♯m", "Gm", "G♯m", "A♯m",
                             "Bm", "Bb", "Cm", "C♯m", "D♯m", "F7", "F♯7", "G♯7", "A♯7",
                             "C♯7", "D♯7", "Fm7", "F♯m7", "Gm7", "G♯m7", "Am7", "A♯m7",
                             "Bm7", "Cm7", "C♯m7", "Dm7", "D♯m7", "E7"]
        major_minor_chords_temp = ["A", "C", "D", "E", "G", "Am", "Dm", "Em"]
        barre_set = set(barre_chords_temp)
        major_minor_set = set(major_minor_chords_temp)
        # Lower-cased lookups used only to decide which chords are "special".
        major_minor_lower = {chord.lower() for chord in major_minor_chords_temp}
        barre_lower = {chord.lower() for chord in barre_chords_temp} - major_minor_lower

        df['formatted_chords'] = df['chords'].apply(
            lambda x: re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', x)).strip())
        distinct = df['formatted_chords'].apply(lambda x: set(x.split()))

        # A chord is "special" when its lower-cased name is in neither list.
        # NOTE(review): this test is case-insensitive while the barre and
        # major/minor metrics below match case-sensitively, so a chord such
        # as "am" is counted by none of the three metrics - confirm intent.
        special_chords = {
            chord for chords in distinct for chord in chords
            if chord.lower() not in barre_lower
            and chord.lower() not in major_minor_lower
        }

        df['song_length_temp'] = df['formatted_chords'].apply(len)
        shortest = df['song_length_temp'].min()
        length_range = df['song_length_temp'].max() - shortest
        if length_range > 0:
            df['song_length'] = ((df['song_length_temp'] - shortest) / length_range) * 15
        else:
            # All rows share one length (or there is a single row): avoid
            # the 0/0 division that would fill the column with NaN.
            df['song_length'] = 0.0

        df['distinct_chords'] = distinct
        df['barre_chords_metric'] = distinct.apply(lambda x: len(x & barre_set)) * 2
        df['major_minor_chords_metric'] = distinct.apply(lambda x: len(x & major_minor_set))
        df['special_chords'] = distinct.apply(lambda x: len(x & special_chords)) * 3
        return df
    except Exception as e:
        logger.error("Error one hot encoding data : %s", e)
        raise
def exercise_custom_encoder(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to encode features in the exercise dataframe
    The following columns are added to df (df is modified in place and
    also returned):
        barre_chords_metric: number of distinct barre chords * 2
        major_minor_chords_metric: number of distinct open major/minor chords
        special_chords: number of distinct chords in neither list * 3
        tempo_mattric: 'tempo' rescaled so that 40 maps to 0 and 200 to 1
    Chords are taken from the comma-separated 'chord_progression' string;
    whitespace around each chord is ignored and empty entries are skipped.
    Args:
        df (pd.DataFrame): Dataframe with a comma-separated string
            'chord_progression' column and a numeric 'tempo' column
    Returns:
        pd.DataFrame: The same dataframe with the metric columns added
    """
    try:
        barre_chords_temp = ["F", "F♯", "G♯", "A♯", "B", "Fm", "F♯m", "Gm", "G♯m", "A♯m",
                             "Bm", "Bb", "Cm", "C♯m", "D♯m", "F7", "F♯7", "G♯7", "A♯7",
                             "C♯7", "D♯7", "Fm7", "F♯m7", "Gm7", "G♯m7", "Am7", "A♯m7",
                             "Bm7", "Cm7", "C♯m7", "Dm7", "D♯m7", "E7"]
        major_minor_chords_temp = ["A", "C", "D", "E", "G", "Am", "Dm", "Em"]
        barre_set = set(barre_chords_temp)
        major_minor_set = set(major_minor_chords_temp)
        # Lower-cased lookups used only to decide which chords are "special".
        major_minor_lower = {chord.lower() for chord in major_minor_chords_temp}
        barre_lower = {chord.lower() for chord in barre_chords_temp} - major_minor_lower

        # Split each progression into its chords first: iterating the raw
        # string would compare single characters against chord names.
        distinct = df['chord_progression'].apply(
            lambda x: {chord.strip() for chord in x.split(',') if chord.strip()})

        special_chords = {
            chord for chords in distinct for chord in chords
            if chord.lower() not in barre_lower
            and chord.lower() not in major_minor_lower
        }

        df['barre_chords_metric'] = distinct.apply(lambda x: len(x & barre_set)) * 2
        df['major_minor_chords_metric'] = distinct.apply(lambda x: len(x & major_minor_set))
        df['special_chords'] = distinct.apply(lambda x: len(x & special_chords)) * 3
        # Fixed 40-200 scaling bounds; values outside them fall outside 0-1.
        df['tempo_mattric'] = ((df['tempo'] - 40) / (200 - 40))
        return df
    except Exception as e:
        logger.error("Error encoding exercise data : %s", e)
        raise
def get_universal_chords(df: pd.DataFrame) -> list:
    """
    Function to get a list of all unique chords
    As a side effect a 'formatted_chords' column is written to df,
    holding 'chords' with "<...>" tags removed and whitespace collapsed.
    Args:
        df (pd.DataFrame): Dataframe with a string 'chords' column
    Returns:
        list: Sorted list of all unique chords. Sorting gives a stable
            order, so vectors built from this list have the same layout
            on every run.
    """
    try:
        df['formatted_chords'] = df['chords'].apply(
            lambda x: re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', x)).strip())
        unique_chords = {chord for row in df['formatted_chords'] for chord in row.split()}
        # Set iteration order for strings varies between interpreter runs
        # (hash randomisation); return a sorted list instead.
        return sorted(unique_chords)
    except Exception as e:
        logger.error("Error getting universal chords list : %s", e)
        raise
def chords_to_vector(chord_list, universal_chords):
    """
    Convert list of chords into a binary vector based on universal chords.
    Args:
        chord_list: container of chords; membership is tested with `in`
        universal_chords: iterable of all known chords; its iteration
            order defines the positions of the vector
    Returns:
        list: one entry per universal chord, 1 when that chord is in
            chord_list and 0 otherwise
    """
    try:
        return [1 if chord in chord_list else 0 for chord in universal_chords]
    except Exception as e:
        logger.error("Error in coverting chord to vector : %s", e)
        raise
def scaler_function(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Function to min-max scale the given columns
    A MinMaxScaler is fitted on the selected columns and the scaled values
    are appended as new columns named "<column>_scaled". The index of the
    returned dataframe is reset.
    Args:
        df (pd.DataFrame): Dataframe to manipulate
        columns (list): Columns to scale; a single string is treated as a
            one-element list
    Returns:
        pd.DataFrame: New dataframe with the original columns followed by
            the scaled columns
    """
    try:
        if isinstance(columns, str):
            columns = [columns]
        scaler = MinMaxScaler()
        chords_scaled = scaler.fit_transform(df[columns])
        df_scaled = pd.DataFrame(chords_scaled, columns=[c + "_scaled" for c in columns])
        # Both sides get a fresh 0..n-1 index so the concat aligns row by row.
        df = pd.concat([df.reset_index(drop=True), df_scaled.reset_index(drop=True)], axis=1)
        return df
    except Exception as e:
        logger.error("Error in scaling columns: %s", e)
        raise
def create_feature_vector(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Function to create final feature vector
    For every row a 'feature_vector' list is built from the row's
    'chord_vector' followed by the values of the given columns. df is
    modified in place and also returned.
    Args:
        df (pd.DataFrame): Dataframe with a 'chord_vector' column
        columns (list): Columns whose values are appended to the chord
            vector; a single string is treated as a one-element list
    Returns:
        pd.DataFrame: The same dataframe with a 'feature_vector' column
    """
    try:
        if isinstance(columns, str):
            columns = [columns]
        # list() so that non-list sequences (tuples, arrays) are
        # concatenated rather than added element-wise.
        df['feature_vector'] = df.apply(
            lambda row: list(row['chord_vector']) + [row[col] for col in columns],
            axis=1)
        return df
    except Exception as e:
        logger.error("Error in creating feature vectors: %s", e)
        raise
def exercise_build_vector(row):
    """
    Function to build the feature vector for one exercise row
    Args:
        row: mapping-like row (e.g. a dataframe row) providing
            'chord_vector' and the four scaled exercise metric columns
    Returns:
        list: the chord vector followed by the scaled barre, major/minor,
            special-chord and tempo metrics as floats
    """
    try:
        feature_cols = ['barre_chords_metric_scaled', 'major_minor_chords_metric_scaled',
                        'special_chords_scaled', 'tempo_mattric_scaled']
        chord_vec = list(row['chord_vector'])
        extra = [float(row[col]) for col in feature_cols]
        return chord_vec + extra
    except Exception as e:
        logger.error("Error in building feature vector for exercise df: %s", e)
        raise
def marks_build_vector(row):
    """
    Function to build the feature vector for one songs (marks) row
    Args:
        row: mapping-like row (e.g. a dataframe row) providing
            'chord_vector' and the four scaled song metric columns
    Returns:
        list: the chord vector followed by the scaled barre, major/minor,
            special-chord and song-length metrics as floats
    """
    try:
        feature_cols = ['barre_chords_metric_scaled', 'major_minor_chords_metric_scaled',
                        'special_chords_scaled', 'song_length_scaled']
        chord_vec = list(row['chord_vector'])
        extra = [float(row[col]) for col in feature_cols]
        return chord_vec + extra
    except Exception as e:
        logger.error("Error in building feature vector for marks df: %s", e)
        raise
if __name__ == '__main__':
    # Script entry point: build feature vectors for the songs ("marks") and
    # exercise datasets and write both to data/processed as parquet files.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    song_data_path = os.path.join(base_dir, '..', 'data', 'raw', 'songs_data.parquet')
    spotify_data_path = os.path.join(base_dir, '..', 'data', 'raw', 'spotify_tracks.parquet')
    exercise_data_path = os.path.join(base_dir, '..', 'data', 'raw', 'chord_exercises.csv')
    processed_dir = os.path.join(base_dir, '..', 'data', 'processed')

    song_data = pd.read_parquet(song_data_path)
    spotify_data = pd.read_parquet(spotify_data_path)
    exercise_data = pd.read_csv(exercise_data_path)

    # dataframe_join normalises headers with clean_string, which removes
    # underscores - hence the underscore-free names dropped below.
    marks_data = dataframe_join(song_data, spotify_data, join_column='spotify_song_id')
    marks_data = drop_dataframe_columns(marks_data, columns=['id', 'releasedate', 'decade',
                                                             'rockgenre', 'artistid',
                                                             'spotifysongid', 'spotifyartistid'])

    marks_data_ohe = marks_custom_encoder(df=marks_data)
    exercise_data_ohe = exercise_custom_encoder(exercise_data)

    # Sort so that both datasets share one reproducible vector layout,
    # independent of set iteration order.
    all_chords = sorted(get_universal_chords(marks_data_ohe))
    marks_data_ohe['chord_vector'] = marks_data_ohe['distinct_chords'].apply(
        lambda x: chords_to_vector(x, all_chords))
    exercise_data_ohe['chord_vector'] = exercise_data_ohe['chord_progression'].apply(
        lambda x: chords_to_vector([chord.strip() for chord in x.split(',')], all_chords)
    )

    exercise_data_final = scaler_function(
        df=exercise_data_ohe, columns=['barre_chords_metric',
                                       'major_minor_chords_metric',
                                       'special_chords', 'tempo_mattric'])
    exercise_data_final['feature_vector'] = exercise_data_final.apply(exercise_build_vector, axis=1)

    marks_data_final = scaler_function(
        df=marks_data_ohe, columns=['barre_chords_metric', 'major_minor_chords_metric',
                                    'special_chords', 'song_length'])
    marks_data_final['feature_vector'] = marks_data_final.apply(marks_build_vector, axis=1)

    # The output directory may not exist on a fresh checkout.
    os.makedirs(processed_dir, exist_ok=True)
    marks_data_final.to_parquet(os.path.join(processed_dir, 'marks_data.parquet'))
    exercise_data_ohe_path = os.path.join(processed_dir, 'chord_exercises.parquet')
    exercise_data_final.to_parquet(exercise_data_ohe_path)