Raheel Abdul Rehman committed on
Commit
c31d1ca
·
1 Parent(s): 1606aeb

Initial Push

Browse files
.dockerignore ADDED
File without changes
.gitignore ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
175
+
176
+ # Cursor
177
+ # Cursor is an AI-powered code editor.`.cursorignore` specifies files/directories to
178
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
+ # refer to https://docs.cursor.com/context/ignore-files
180
+ .cursorignore
181
+ .cursorindexingignore
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Unprivileged runtime user (uid 1000).
RUN useradd -m -u 1000 user

WORKDIR /app

# Create the log directory while still root and hand /app to the runtime user,
# so the build does not depend on which owner the builder gives to WORKDIR.
RUN mkdir -p /app/logs && chown -R user:user /app

USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Install dependencies first so this layer is cached across source changes.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=user ./src ./src
COPY --chown=user ./data/processed ./data/processed

EXPOSE 7860

CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "7860"]
20
+
README.md CHANGED
@@ -1,12 +1,22 @@
 
 
 
 
 
1
  ---
2
- title: Marks.Guitar Assistant
3
- emoji: 📊
4
- colorFrom: green
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- short_description: A guitar ML assistant which is tailored to the users skill
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
+ # 🎸 Marks : AI-Powered Guitar Exercise Recommender
2
+
3
+ An AI-driven music recommendation system that suggests guitar exercises and practice routines based on user skill level, tempo preferences, and chord progression complexity.
4
+ This project leverages **machine learning, FastAPI, and Docker** to provide a scalable production-ready API for real-time recommendations.
5
+
6
  ---
7
+
8
+ ## 🚀 Features
9
+ - **Intelligent Recommendations** – Suggests exercises based on chord progressions, difficulty, and tempo.
10
+ - **Clustering & ML Models** – Groups exercises dynamically by difficulty using machine learning techniques.
11
+ - **REST API with FastAPI** – Lightweight, high-performance backend for serving recommendations.
12
+ - **Scalable & Containerized** – Fully Dockerized for easy deployment across cloud platforms.
13
+ - **Interactive Data Handling** – Uses `pandas`, `numpy`, and `scikit-learn` for data processing and ML.
14
+ - **Spotify Integration (Optional)** – Pulls metadata via [Spotipy](https://spotipy.readthedocs.io/) to enrich recommendations.
15
+
16
  ---
17
 
18
+ ## 🏗️ Tech Stack
19
+ - **Backend:** FastAPI, Uvicorn
20
+ - **ML / Data Science:** Scikit-learn, Pandas, Numpy, Matplotlib
21
+ - **Deployment:** Docker, Render (or any cloud hosting)
22
+ - **Testing:** Pytest
data/processed/chord_exercises.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6411955b2b75db12e9636397d87fc97b144a9042834e26a6f0116cd1ebaf3d12
3
+ size 13013
data/processed/prod_data.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73006bd788a51c4df96131171459c721b6bc187cb47c549c128998af458ec328
3
+ size 24639044
notebooks/data_visualisation.ipynb ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "88f47d25",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "from pathlib import Path\n",
13
+ "from sklearn.metrics import silhouette_score\n",
14
+ "import matplotlib.pyplot as plt\n",
15
+ "from sklearn.decomposition import PCA\n",
16
+ "from sklearn.cluster import MiniBatchKMeans\n",
17
+ "from sklearn.metrics.pairwise import cosine_similarity"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "id": "b34b67b4",
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "marks_file_path = Path(\"..\",\"data\",\"processed\",\"marks_data.parquet\")#songs_df = pd.read_parquet(marks_file_path)\n",
28
+ "exercise_data_path = Path(\"..\",\"data\",\"processed\",\"chord_exercises.parquet\")\n",
29
+ "exercise_df = pd.read_parquet(exercise_data_path)\n",
30
+ "prod_data_path = Path(\"..\",\"data\",\"processed\",\"prod_data.parquet\")\n",
31
+ "prod_data = pd.read_parquet(prod_data_path)"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 6,
37
+ "id": "2422497a",
38
+ "metadata": {},
39
+ "outputs": [
40
+ {
41
+ "name": "stdout",
42
+ "output_type": "stream",
43
+ "text": [
44
+ "Running PCA to reduce from 3024 → 100 dimensions...\n",
45
+ "PCA complete. Shape: (168630, 100)\n",
46
+ "Explained variance ratio (first 10 comps): [0.13954444 0.10823043 0.06132166 0.04828419 0.03467049 0.03131611\n",
47
+ " 0.02613195 0.02348917 0.02307746 0.02195262]\n",
48
+ "Running clustering for different k values...\n",
49
+ "k=2 → inertia=695637.38, silhouette=0.1310\n",
50
+ "k=3 → inertia=644517.48, silhouette=0.1130\n",
51
+ "k=4 → inertia=618581.60, silhouette=0.1005\n",
52
+ "k=5 → inertia=585383.74, silhouette=0.1049\n",
53
+ "k=6 → inertia=566055.58, silhouette=0.1035\n",
54
+ "k=7 → inertia=545570.39, silhouette=0.1041\n",
55
+ "k=8 → inertia=536465.33, silhouette=0.1070\n",
56
+ "k=9 → inertia=532107.15, silhouette=0.0945\n",
57
+ "k=10 → inertia=528159.87, silhouette=0.0938\n",
58
+ "k=11 → inertia=517973.04, silhouette=0.0972\n",
59
+ "k=12 → inertia=510710.85, silhouette=0.0988\n",
60
+ "k=13 → inertia=505189.73, silhouette=0.1004\n",
61
+ "k=14 → inertia=498099.02, silhouette=0.1037\n",
62
+ "k=15 → inertia=485065.30, silhouette=0.0970\n",
63
+ "k=16 → inertia=476857.74, silhouette=0.0964\n",
64
+ "k=17 → inertia=472460.50, silhouette=0.0972\n",
65
+ "k=18 → inertia=468655.84, silhouette=0.0975\n",
66
+ "k=19 → inertia=467330.06, silhouette=0.0960\n"
67
+ ]
68
+ },
69
+ {
70
+ "data": {
71
+ "image/png": "",
72
+ "text/plain": [
73
+ "<Figure size 1200x500 with 2 Axes>"
74
+ ]
75
+ },
76
+ "metadata": {},
77
+ "output_type": "display_data"
78
+ },
79
+ {
80
+ "name": "stdout",
81
+ "output_type": "stream",
82
+ "text": [
83
+ "Final model trained with k=5\n",
84
+ "Centroids shape: (5, 100)\n"
85
+ ]
86
+ }
87
+ ],
88
+ "source": [
89
+ "\n",
90
+ "X = np.array(songs_df['feature_vector'].to_list())\n",
91
+ "\n",
92
+ "# --------------------------\n",
93
+ "# Step 1: Dimensionality Reduction with PCA\n",
94
+ "pca_components = 100 # adjust based on explained variance\n",
95
+ "print(f\"Running PCA to reduce from {X.shape[1]} → {pca_components} dimensions...\")\n",
96
+ "pca = PCA(n_components=pca_components, random_state=42)\n",
97
+ "X_reduced = pca.fit_transform(X)\n",
98
+ "\n",
99
+ "print(f\"PCA complete. Shape: {X_reduced.shape}\")\n",
100
+ "print(f\"Explained variance ratio (first 10 comps): {pca.explained_variance_ratio_[:10]}\")\n",
101
+ "\n",
102
+ "# --------------------------\n",
103
+ "# Step 2: Find optimal K with inertia + silhouette\n",
104
+ "inertias = []\n",
105
+ "silhouettes = []\n",
106
+ "K_range = range(2, 20) # You can extend this if needed\n",
107
+ "\n",
108
+ "print(\"Running clustering for different k values...\")\n",
109
+ "for k in K_range:\n",
110
+ " kmeans = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=2048, n_init=\"auto\")\n",
111
+ " labels = kmeans.fit_predict(X_reduced)\n",
112
+ " \n",
113
+ " inertia = kmeans.inertia_\n",
114
+ " inertias.append(inertia)\n",
115
+ " \n",
116
+ " sil_score = silhouette_score(X_reduced, labels, sample_size=10000, random_state=42)\n",
117
+ " silhouettes.append(sil_score)\n",
118
+ " \n",
119
+ " print(f\"k={k} → inertia={inertia:.2f}, silhouette={sil_score:.4f}\")\n",
120
+ "\n",
121
+ "# --------------------------\n",
122
+ "# Step 3: Plot inertia & silhouette to decide optimal k\n",
123
+ "plt.figure(figsize=(12,5))\n",
124
+ "\n",
125
+ "plt.subplot(1,2,1)\n",
126
+ "plt.plot(K_range, inertias, 'o-', label='Inertia')\n",
127
+ "plt.xlabel(\"Number of clusters (k)\")\n",
128
+ "plt.ylabel(\"Inertia\")\n",
129
+ "plt.title(\"Elbow Method\")\n",
130
+ "plt.legend()\n",
131
+ "\n",
132
+ "plt.subplot(1,2,2)\n",
133
+ "plt.plot(K_range, silhouettes, 'o-', color='green', label='Silhouette Score')\n",
134
+ "plt.xlabel(\"Number of clusters (k)\")\n",
135
+ "plt.ylabel(\"Silhouette\")\n",
136
+ "plt.title(\"Silhouette Method\")\n",
137
+ "plt.legend()\n",
138
+ "\n",
139
+ "plt.tight_layout()\n",
140
+ "plt.show()\n",
141
+ "\n",
142
+ "# --------------------------\n",
143
+ "# Step 4: Fit final model with optimal k (replace with your choice)\n",
144
+ "optimal_k = 5 # <-- set manually after looking at the plots\n",
145
+ "final_kmeans = MiniBatchKMeans(n_clusters=optimal_k, random_state=42, batch_size=2048, n_init=\"auto\")\n",
146
+ "final_labels = final_kmeans.fit_predict(X_reduced)\n",
147
+ "\n",
148
+ "# Get final centroids in PCA space\n",
149
+ "centroids = final_kmeans.cluster_centers_\n",
150
+ "\n",
151
+ "print(f\"Final model trained with k={optimal_k}\")\n",
152
+ "print(\"Centroids shape:\", centroids.shape)\n"
153
+ ]
154
+ }
155
+ ],
156
+ "metadata": {
157
+ "kernelspec": {
158
+ "display_name": "Python 3",
159
+ "language": "python",
160
+ "name": "python3"
161
+ },
162
+ "language_info": {
163
+ "codemirror_mode": {
164
+ "name": "ipython",
165
+ "version": 3
166
+ },
167
+ "file_extension": ".py",
168
+ "mimetype": "text/x-python",
169
+ "name": "python",
170
+ "nbconvert_exporter": "python",
171
+ "pygments_lexer": "ipython3",
172
+ "version": "3.11.9"
173
+ }
174
+ },
175
+ "nbformat": 4,
176
+ "nbformat_minor": 5
177
+ }
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ scikit-learn
4
+ pandas
5
+ numpy
6
+ pyarrow
src/__init__.py ADDED
File without changes
src/data_input.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from logger import get_logger # pylint: disable=import-error
3
+
4
+ logger = get_logger(__name__)
5
+
6
def read_chord_data(url: str) -> pd.DataFrame:
    """
    Load the song chord dataset into a dataframe.

    Args:
        url (str): Location of the chord dataset CSV, passed straight to
            ``pandas.read_csv``.

    Returns:
        pd.DataFrame: The dataset, with every column read as a string.

    Raises:
        Exception: Any error raised while reading is logged and re-raised.
    """
    try:
        logger.info('Reading Chord Data')
        return pd.read_csv(url, dtype=str)
    except Exception as e:
        logger.error("Error reading chord data : %s", e)
        raise
23
+
24
if __name__ == '__main__':
    import os

    chord_data_url = "hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv"
    df = read_chord_data(url=chord_data_url)

    # Resolve the output directory from this file's location (../data/raw)
    # instead of the current working directory, and make sure it exists,
    # so the script works no matter where it is launched from.
    raw_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data", "raw")
    os.makedirs(raw_dir, exist_ok=True)
    df.to_parquet(os.path.join(raw_dir, "songs_data.parquet"), engine="pyarrow", index=False)
src/data_processing.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import pandas as pd
4
+ from sklearn.preprocessing import MinMaxScaler
5
+ from logger import get_logger # pylint: disable=import-error
6
+
7
+
8
+ logger = get_logger(__name__)
9
+
10
def clean_string(vars: list) -> list:  # pylint: disable=redefined-builtin
    """
    Normalise strings into lowercase, underscore-separated identifiers.

    Each value is stripped and lowercased, every character that is not
    ``a-z``, ``0-9`` or whitespace is removed (this includes underscores),
    and each remaining run of whitespace becomes a single underscore.

    Args:
        vars (list): Strings to normalise. A single string is accepted and
            treated as a one-element list.

    Returns:
        list: The normalised strings, in input order.

    Raises:
        Exception: Any error (e.g. a non-string element) is logged and re-raised.
    """
    try:
        items = [vars] if isinstance(vars, str) else vars
        return [
            re.sub(r'\s+', '_', re.sub(r'[^a-z0-9\s]', '', item.strip().lower()))
            for item in items
        ]
    except Exception as e:
        logger.error("Error cleaning string : %s", e)
        raise
32
+
33
def mandatory_column_check(df: pd.DataFrame, columns: list) -> bool:
    """
    Check whether all the given columns are present in the dataframe.

    Both the requested names and the dataframe headers are normalised with
    ``clean_string`` before they are compared.

    Args:
        df (pd.DataFrame): Dataframe whose headers are inspected.
        columns (list): Mandatory column names; a single string is accepted.

    Returns:
        bool: True when every mandatory column is present, False otherwise.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        required = [columns] if isinstance(columns, str) else columns
        required_clean = clean_string(vars=required)
        available = {header.lower() for header in clean_string(vars=df.columns.to_list())}
        return all(name.lower() in available for name in required_clean)
    except Exception as e:
        logger.error("Error checking for mandatory columns : %s", e)
        raise
61
+
62
def drop_dataframe_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Return a copy of the dataframe without the given columns.

    Args:
        df (pd.DataFrame): Source dataframe (left unmodified).
        columns (list): Names of the columns to remove.

    Returns:
        pd.DataFrame: New dataframe without the listed columns.

    Raises:
        Exception: Any error (e.g. an unknown column) is logged and re-raised.
    """
    try:
        return df.drop(columns=columns)
    except Exception as e:
        logger.error("Error droping columns : %s", e)
        raise
79
+
80
def dataframe_join(df1: pd.DataFrame, df2: pd.DataFrame,
                   join_column: str) -> pd.DataFrame:
    """
    Inner-join two dataframes on a column after normalising their headers.

    Every column name of both frames, and the join column itself, is passed
    through ``clean_string`` so that headers differing only in case,
    punctuation or spacing line up. The inputs are not modified: the
    renaming is done on copies.

    Args:
        df1 (pd.DataFrame): Left dataframe.
        df2 (pd.DataFrame): Right dataframe.
        join_column (str): Name of the join column (normalised before use).

    Returns:
        pd.DataFrame: Inner join of both dataframes, with normalised headers.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        join_key = clean_string(join_column)[0]
        # rename() without inplace returns a new frame, so the caller's
        # dataframes keep their original headers.
        left, right = (
            frame.rename(columns=dict(zip(frame.columns,
                                          clean_string(frame.columns.to_list()))))
            for frame in (df1, df2)
        )
        return pd.merge(left, right, on=join_key, how='inner')
    except Exception as e:
        logger.error("Error merging dataframes : %s", e)
        raise
105
+
106
def marks_custom_encoder(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive chord-difficulty features for the songs dataframe.

    The ``chords`` column is cleaned (``<...>`` tags removed, whitespace
    collapsed) and these columns are added to ``df`` in place:

    * ``formatted_chords``: cleaned, space-separated chord string.
    * ``song_length_temp``: character length of ``formatted_chords``.
    * ``song_length``: ``song_length_temp`` min-max scaled to 0-15
      (0.0 for every row when all lengths are equal).
    * ``distinct_chords``: set of distinct chords in the song.
    * ``barre_chords_metric``: number of distinct barre chords, times 2.
    * ``major_minor_chords_metric``: number of distinct open major/minor chords.
    * ``special_chords``: number of distinct chords in neither list, times 3.

    Args:
        df (pd.DataFrame): Dataframe with a string ``chords`` column.

    Returns:
        pd.DataFrame: The same dataframe with the feature columns added.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        barre_chords = {"F", "F♯", "G♯", "A♯", "B", "Fm", "F♯m", "Gm", "G♯m", "A♯m",
                        "Bm", "Bb", "Cm", "C♯m", "D♯m", "F7", "F♯7", "G♯7", "A♯7",
                        "C♯7", "D♯7", "Fm7", "F♯m7", "Gm7", "G♯m7", "Am7", "A♯m7",
                        "Bm7", "Cm7", "C♯m7", "Dm7", "D♯m7", "E7"}
        major_minor_chords = {"A", "C", "D", "E", "G", "Am", "Dm", "Em"}
        # Chords are classified as "special" case-insensitively.
        barre_lower = {chord.lower() for chord in barre_chords}
        major_minor_lower = {chord.lower() for chord in major_minor_chords}

        df['formatted_chords'] = df['chords'].apply(
            lambda x: re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', x)).strip())
        all_chords = {chord for row in df['formatted_chords'] for chord in row.split()}
        special_chords = {
            chord for chord in all_chords
            if chord.lower() not in barre_lower and chord.lower() not in major_minor_lower
        }

        lengths = df['formatted_chords'].apply(len)
        df['song_length_temp'] = lengths
        length_min = lengths.min()
        length_span = lengths.max() - length_min
        if length_span == 0:
            # All songs have the same length (or there is one row):
            # avoid the 0/0 division that would fill the column with NaN.
            df['song_length'] = 0.0
        else:
            df['song_length'] = ((lengths - length_min) / length_span) * 15

        df['distinct_chords'] = df['formatted_chords'].apply(lambda x: set(x.split()))
        # Counting is case-sensitive against the reference chord spellings.
        df['barre_chords_metric'] = (
            df['distinct_chords'].apply(lambda x: len(x & barre_chords)) * 2)
        df['major_minor_chords_metric'] = (
            df['distinct_chords'].apply(lambda x: len(x & major_minor_chords)))
        df['special_chords'] = (
            df['distinct_chords'].apply(lambda x: len(x & special_chords)) * 3)
        return df
    except Exception as e:
        logger.error("Error one hot encoding data : %s", e)
        raise
147
+
148
def exercise_custom_encoder(df: pd.DataFrame) -> pd.DataFrame:
    """
    Derive chord-difficulty and tempo features for the exercise dataframe.

    ``chord_progression`` is a comma-separated chord string. It is split
    into its distinct chords before counting, so that whole chords are
    matched rather than single characters. The following columns are added
    to ``df`` in place:

    * ``barre_chords_metric``: number of distinct barre chords, times 2.
    * ``major_minor_chords_metric``: number of distinct open major/minor chords.
    * ``special_chords``: number of distinct chords in neither list, times 3.
    * ``tempo_mattric``: ``tempo`` rescaled with the fixed range 40-200.

    Args:
        df (pd.DataFrame): Dataframe with ``chord_progression`` and ``tempo``.

    Returns:
        pd.DataFrame: The same dataframe with the feature columns added.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        barre_chords = {"F", "F♯", "G♯", "A♯", "B", "Fm", "F♯m", "Gm", "G♯m", "A♯m",
                        "Bm", "Bb", "Cm", "C♯m", "D♯m", "F7", "F♯7", "G♯7", "A♯7",
                        "C♯7", "D♯7", "Fm7", "F♯m7", "Gm7", "G♯m7", "Am7", "A♯m7",
                        "Bm7", "Cm7", "C♯m7", "Dm7", "D♯m7", "E7"}
        major_minor_chords = {"A", "C", "D", "E", "G", "Am", "Dm", "Em"}
        # Chords are classified as "special" case-insensitively.
        barre_lower = {chord.lower() for chord in barre_chords}
        major_minor_lower = {chord.lower() for chord in major_minor_chords}

        # One set of distinct, whitespace-trimmed chords per exercise.
        # Iterating the raw string would yield characters, not chords.
        chord_sets = df['chord_progression'].apply(
            lambda x: {chord.strip() for chord in x.split(',') if chord.strip()})
        all_chords = set().union(*chord_sets)
        special_chords = {
            chord for chord in all_chords
            if chord.lower() not in barre_lower and chord.lower() not in major_minor_lower
        }

        df['barre_chords_metric'] = chord_sets.apply(lambda x: len(x & barre_chords)) * 2
        df['major_minor_chords_metric'] = chord_sets.apply(
            lambda x: len(x & major_minor_chords))
        df['special_chords'] = chord_sets.apply(lambda x: len(x & special_chords)) * 3
        # Column name (including its spelling) is relied on by downstream code.
        df['tempo_mattric'] = (df['tempo'] - 40) / (200 - 40)
        return df
    except Exception as e:
        logger.error("Error encoding exercise data : %s", e)
        raise
184
+
185
def get_universal_chords(df: pd.DataFrame) -> list:
    """
    Collect every distinct chord that appears in the songs dataframe.

    The ``chords`` column is cleaned (``<...>`` tags removed, whitespace
    collapsed) and stored in ``df['formatted_chords']`` as a side effect.
    The chords are returned sorted so that the positions of the chord
    vectors built from this list are the same on every run.

    Args:
        df (pd.DataFrame): Dataframe with a string ``chords`` column.

    Returns:
        list: Sorted list of all unique chords.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        df['formatted_chords'] = df['chords'].apply(
            lambda x: re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', x)).strip())
        unique_chords = {chord for row in df['formatted_chords'] for chord in row.split()}
        # A set has no stable iteration order across interpreter runs.
        return sorted(unique_chords)
    except Exception as e:
        logger.error("Error getting universal chords list : %s", e)
        raise
204
+
205
def chords_to_vector(chord_list, universal_chords):
    """
    Convert a collection of chords into a binary presence vector.

    Args:
        chord_list: Chords present in one song or exercise.
        universal_chords: Ordered iterable of all known chords; it fixes the
            length and the position meaning of the returned vector.

    Returns:
        list: One entry per chord in ``universal_chords``: 1 when that chord
        is in ``chord_list``, else 0.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        # Hash lists/tuples once so each membership test is O(1).
        if isinstance(chord_list, (list, tuple)):
            chord_list = set(chord_list)
        return [1 if chord in chord_list else 0 for chord in universal_chords]
    except Exception as e:
        logger.error("Error in coverting chord to vector : %s", e)
        raise
214
+
215
def scaler_function(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Min-max scale the given columns and append them as ``<name>_scaled``.

    Args:
        df (pd.DataFrame): Dataframe holding the columns to scale.
        columns (list): Column names to scale; a single string is accepted.

    Returns:
        pd.DataFrame: New dataframe with a reset index, containing the
        original columns followed by one ``<name>_scaled`` column per input.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        if isinstance(columns, str):
            columns = [columns]
        scaled_values = MinMaxScaler().fit_transform(df[columns])
        scaled_frame = pd.DataFrame(scaled_values,
                                    columns=[c + "_scaled" for c in columns])
        # scaled_frame has a fresh 0..n-1 index; reset df's index to match it
        # so the concat lines rows up by position.
        return pd.concat([df.reset_index(drop=True), scaled_frame], axis=1)
    except Exception as e:
        logger.error("Error in scaling columns: %s", e)
        raise
237
+
238
def create_feature_vector(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Build the final feature vector for every row.

    The row's ``chord_vector`` is concatenated with the values of the given
    columns and stored in a new ``feature_vector`` column on ``df``.

    Args:
        df (pd.DataFrame): Dataframe with a ``chord_vector`` column.
        columns (list): Columns whose values are appended to the chord
            vector; a single string is accepted.

    Returns:
        pd.DataFrame: The same dataframe with ``feature_vector`` added.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        if isinstance(columns, str):
            columns = [columns]
        # list(...) forces concatenation even when chord_vector is a numpy
        # array, for which "+" would mean element-wise addition.
        df['feature_vector'] = df.apply(
            lambda row: list(row['chord_vector']) + [row[col] for col in columns],
            axis=1)
        return df
    except Exception as e:
        logger.error("Error in creating feature vectors: %s", e)
        raise
257
+
258
def exercise_build_vector(row):
    """
    Build the feature vector for one exercise row.

    Args:
        row: Mapping-like row with ``chord_vector`` and the four scaled
            exercise metrics (barre, major/minor, special, tempo).

    Returns:
        list: The chord vector followed by the four metrics as floats.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        feature_cols = ('barre_chords_metric_scaled', 'major_minor_chords_metric_scaled',
                        'special_chords_scaled', 'tempo_mattric_scaled')
        return list(row['chord_vector']) + [float(row[col]) for col in feature_cols]
    except Exception as e:
        logger.error("Error in building feature vector for exercise df: %s", e)
        raise
268
+
269
def marks_build_vector(row):
    """
    Build the feature vector for one song row.

    Args:
        row: Mapping-like row with ``chord_vector`` and the four scaled
            song metrics (barre, major/minor, special, song length).

    Returns:
        list: The chord vector followed by the four metrics as floats.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        feature_cols = ('barre_chords_metric_scaled', 'major_minor_chords_metric_scaled',
                        'special_chords_scaled', 'song_length_scaled')
        return list(row['chord_vector']) + [float(row[col]) for col in feature_cols]
    except Exception as e:
        logger.error("Error in building feature vector for marks df: %s", e)
        raise
279
+
280
+
281
if __name__ == '__main__':
    # Offline pipeline: raw songs + Spotify metadata + exercises -> processed
    # parquet files with chord vectors and scaled feature vectors.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    raw_dir = os.path.join(base_dir, '..', 'data', 'raw')
    processed_dir = os.path.join(base_dir, '..', 'data', 'processed')

    song_data = pd.read_parquet(os.path.join(raw_dir, 'songs_data.parquet'))
    spotify_data = pd.read_parquet(os.path.join(raw_dir, 'spotify_tracks.parquet'))
    exercise_data = pd.read_csv(os.path.join(raw_dir, 'chord_exercises.csv'))

    # Column names are normalised by the join, hence the squashed spellings
    # (e.g. 'spotifysongid') in the drop list below.
    marks_data = dataframe_join(song_data, spotify_data, join_column='spotify_song_id')
    marks_data = drop_dataframe_columns(marks_data, columns=['id', 'releasedate', 'decade',
                                                             'rockgenre', 'artistid',
                                                             'spotifysongid',
                                                             'spotifyartistid'])

    marks_data_ohe = marks_custom_encoder(df=marks_data)
    exercise_data_ohe = exercise_custom_encoder(exercise_data)

    # Both datasets are vectorised against the same chord vocabulary.
    all_chords = get_universal_chords(marks_data_ohe)
    marks_data_ohe['chord_vector'] = marks_data_ohe['distinct_chords'].apply(
        lambda x: chords_to_vector(x, all_chords))
    exercise_data_ohe['chord_vector'] = exercise_data_ohe['chord_progression'].apply(
        lambda x: chords_to_vector(x.split(','), all_chords))

    exercise_data_final = scaler_function(
        df=exercise_data_ohe,
        columns=['barre_chords_metric', 'major_minor_chords_metric',
                 'special_chords', 'tempo_mattric'])
    exercise_data_final['feature_vector'] = exercise_data_final.apply(
        exercise_build_vector, axis=1)

    marks_data_final = scaler_function(
        df=marks_data_ohe,
        columns=['barre_chords_metric', 'major_minor_chords_metric',
                 'special_chords', 'song_length'])
    marks_data_final['feature_vector'] = marks_data_final.apply(marks_build_vector, axis=1)

    marks_data_final.to_parquet(os.path.join(processed_dir, 'marks_data.parquet'))
    exercise_data_final.to_parquet(os.path.join(processed_dir, 'chord_exercises.parquet'))
src/logger.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

LOG_FILE = os.path.join(BASE_DIR, "..", "logs", "app.log")

# FileHandler does not create missing parent directories; without this,
# importing the module fails when the logs/ directory does not exist yet.
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

# Configure logging once for the whole process (root logger).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE),  # Writes logs to logs/app.log
        logging.StreamHandler()         # Also prints logs to console
    ]
)
17
+
18
+ # Function to get logger for each module
19
def get_logger(name: str) -> logging.Logger:
    """
    Return the named logger for a module.

    Args:
        name (str): Logger name, typically the caller's ``__name__``.

    Returns:
        logging.Logger: The logger registered under ``name``.
    """
    module_logger = logging.getLogger(name)
    return module_logger
src/main.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import pandas as pd
4
+ from fastapi import FastAPI, Query
5
+ import uvicorn
6
+
7
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
8
+ from model import recommend_songs as model1 # pylint: disable=import-error
9
+ from model import recommend_songs_random as model2 # pylint: disable=import-error
10
+ from logger import get_logger # pylint: disable=import-error
11
+
12
+ logger = get_logger(__name__)
13
+
14
+ base_dir = os.path.dirname(os.path.abspath(__file__))
15
+ data_dir = os.path.join(base_dir, "..", "data", "processed")
16
+
17
+ prod_file = os.path.join(data_dir, "prod_data.parquet")
18
+ exercise_file = os.path.join(data_dir, "chord_exercises.parquet")
19
+ recommended_history = set()
20
+
21
+ app = FastAPI(title="Exercise Recommendation API")
22
+
23
@app.get("/")
def home():
    """Return a static welcome message for the API root."""
    greeting = "Welcome to the Exercise Recommendation API"
    return {"message": greeting}
26
+
27
@app.get("/random_exercises")
def random_exercises(genre: str = Query(..., description="Genre of exercises")):
    """
    Return random recommendations for a genre.

    Only the rows of the production dataset whose ``maingenre`` equals
    ``genre`` are loaded (parquet filter push-down), and they are handed to
    the random recommender together with a fresh, empty cache set. Whatever
    the recommender puts into that set is merged into the module-level
    ``recommended_history``.

    Returns:
        The recommender's result, or ``{"error": <message>}`` on failure.
    """
    try:
        # NOTE(review): the recommender always receives an empty cache;
        # recommended_history is only written here, never read. Confirm
        # whether the history was meant to be passed in instead.
        recommended_temp = set()
        prod_df = pd.read_parquet(prod_file, filters=[("maingenre", "=", genre)])
        result = model2(genre=genre, songs_df=prod_df, recommended_cache=recommended_temp)

        recommended_history.update(recommended_temp)
        return result
    except Exception as e:
        # logger.exception keeps the traceback in the log; the response
        # shape returned to clients is unchanged.
        logger.exception("Error fetching API: %s", e)
        return {"error": str(e)}
42
+
43
+
44
@app.get("/recommendations")
def recommendations(
    tempo: int = Query(..., description="Tempo value"),
    exercise_id: int = Query(..., description="Exercise ID"),
    genre: str = Query(..., description="Genre"),
):
    """
    Return song recommendations for a given exercise, tempo and genre.

    Only the matching exercise row and the songs of the requested genre are
    loaded (parquet filter push-down) before they are passed to the
    recommender.

    Returns:
        The recommender's result, or ``{"error": <message>}`` on failure.
    """
    try:
        exercise_df = pd.read_parquet(
            exercise_file,
            filters=[("exercise_id", "=", exercise_id)],
        )
        prod_df = pd.read_parquet(
            prod_file,
            filters=[("maingenre", "=", genre)],
        )
        return model1(
            exercise_df=exercise_df,
            prod_df=prod_df,
            tempo=tempo,
            exercise_id=exercise_id,
            genre=genre,
        )
    except Exception as e:
        # logger.exception keeps the traceback in the log; the response
        # shape returned to clients is unchanged.
        logger.exception("Error fetching API: %s", e)
        return {"error": str(e)}
72
+
73
+ if __name__ == "__main__":
74
+ uvicorn.run("main:app", host="0.0.0.0", port=7860)
src/model.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.decomposition import PCA
5
+ from sklearn.cluster import MiniBatchKMeans
6
+
7
+ from logger import get_logger # pylint: disable=import-error
8
+
9
+ logger = get_logger(__name__)
10
+
11
def cluster_function(df: pd.DataFrame, *, pca_components: int = 100,
                     n_clusters: int = 14, random_state: int = 42) -> pd.DataFrame:
    """
    Cluster songs on their feature vectors and label each with a difficulty level.

    The feature vectors are reduced with PCA, clustered with MiniBatchKMeans,
    and every cluster is then assigned to one of three difficulty levels
    according to the tercile of its mean ``difficulty_score``.

    Args:
        df (pd.DataFrame): Must contain ``feature_vector`` (equal-length
            numeric sequences) and the four scaled metric columns
            ``barre_chords_metric_scaled``, ``major_minor_chords_metric_scaled``,
            ``special_chords_scaled`` and ``song_length_scaled``.
            The frame is modified in place.
        pca_components (int): Upper bound on the number of PCA components.
            It is capped at ``min(n_samples, n_features)`` because PCA cannot
            extract more components than that.
        n_clusters (int): Number of k-means clusters. The cluster means are
            split into three quantile bins, so very small values may make
            the binning fail.
        random_state (int): Seed shared by PCA and k-means.

    Returns:
        pd.DataFrame: The same ``df`` object with ``cluster``,
        ``difficulty_score`` and ``difficulty_level`` columns added.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        features = np.array(df['feature_vector'].to_list())

        # Asking PCA for more components than the data supports raises, so
        # small datasets get as many components as they can provide.
        n_components = min(pca_components, *features.shape)
        pca = PCA(n_components=n_components, random_state=random_state)
        reduced = pca.fit_transform(features)

        kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=random_state,
                                 batch_size=2048, n_init="auto")
        df['cluster'] = kmeans.fit_predict(reduced)

        # Unweighted sum of the four pre-scaled difficulty metrics.
        df['difficulty_score'] = (df['barre_chords_metric_scaled'] +
                                  df['major_minor_chords_metric_scaled'] +
                                  df['special_chords_scaled'] +
                                  df['song_length_scaled'])

        cluster_difficulty = df.groupby("cluster")["difficulty_score"].mean().reset_index()
        cluster_difficulty = cluster_difficulty.sort_values("difficulty_score").reset_index(drop=True)

        # Difficulty is assigned per cluster, not per song: clusters are
        # split into equal-sized quantile bins of their mean score.
        difficulty_levels = ["Beginner", "Intermediate", "Advanced"]
        cluster_difficulty["difficulty_level"] = pd.qcut(
            cluster_difficulty["difficulty_score"],
            q=len(difficulty_levels),
            labels=difficulty_levels,
        )

        cluster_map = dict(zip(cluster_difficulty["cluster"], cluster_difficulty["difficulty_level"]))
        df["difficulty_level"] = df["cluster"].map(cluster_map)
        return df
    except Exception as e:
        logger.error("Error in clustering marks dataset : %s", e)
        raise
49
+
50
def recommend_songs(exercise_df, prod_df, exercise_id, tempo, genre, top_n=5):
    """
    Recommend the ``top_n`` songs of a genre most similar to an exercise.

    Similarity is the cosine similarity between the exercise's
    ``feature_vector`` and each song's ``feature_vector``.

    Args:
        exercise_df (pd.DataFrame): Exercises with ``exercise_id``, ``tempo``
            and ``feature_vector`` columns.
        prod_df (pd.DataFrame): Songs with ``maingenre``, ``feature_vector``,
            ``trackname``, ``artistnames``, ``chords`` and
            ``difficulty_level`` columns.
        exercise_id: Exercise to match (compared with ``==``).
        tempo: Tempo to match (compared with ``==``).
        genre: Value matched exactly against ``maingenre``.
        top_n (int): Number of songs to return. Defaults to 5.

    Returns:
        pd.DataFrame: Up to ``top_n`` rows, most similar first, with the
        columns ``trackname``, ``artistnames``, ``maingenre``, ``chords``
        and ``difficulty_level``. Original index labels are kept.

    Raises:
        ValueError: If no exercise matches ``exercise_id`` and ``tempo``, if
            no song matches ``genre``, or if vector lengths are incompatible.
    """
    try:
        exercise_row = exercise_df[
            (exercise_df['exercise_id'] == exercise_id)
            & (exercise_df['tempo'] == tempo)
        ]
        if exercise_row.empty:
            raise ValueError("No exercise found with given ID and tempo")

        genre_songs = prod_df[prod_df['maingenre'] == genre]
        if genre_songs.empty:
            raise ValueError(f"No songs found in genre '{genre}'")

        exercise_vector = np.asarray(
            exercise_row['feature_vector'].iloc[0], dtype=float
        ).ravel()
        song_matrix = np.stack([
            np.asarray(vec, dtype=float).ravel()
            for vec in genre_songs['feature_vector']
        ])

        # Cosine similarity for all songs at once. It is undefined when
        # either vector has zero length; those entries stay NaN.
        denominators = np.linalg.norm(song_matrix, axis=1) * np.linalg.norm(exercise_vector)
        similarities = np.full(len(genre_songs), np.nan)
        np.divide(song_matrix @ exercise_vector, denominators,
                  out=similarities, where=denominators > 0)

        # argsort places NaN last, so undefined similarities rank at the
        # bottom; the stable sort keeps input order among ties.
        ranking = np.argsort(-similarities, kind="stable")
        top_recommendations = genre_songs.iloc[ranking[:top_n]]
        return top_recommendations[['trackname', 'artistnames', 'maingenre', 'chords', 'difficulty_level']]
    except Exception as e:
        logger.error("Error in generating recommendations : %s", e)
        raise
80
+
81
def recommend_songs_random(genre, songs_df, recommended_cache, n=5) -> "tuple[list, set] | dict":
    """
    Pick up to ``n`` random songs of a genre that are not already in the cache.

    Args:
        genre (str): Value matched exactly against the ``maingenre`` column.
        songs_df (pd.DataFrame): Candidate songs with ``trackname``,
            ``artistnames``, ``maingenre``, ``chords`` and
            ``difficulty_level`` columns.
        recommended_cache (set): Track names to exclude. It is updated in
            place with the track names of the songs selected by this call.
        n (int, optional): Maximum number of songs to return. Defaults to 5.

    Returns:
        tuple | dict: On success, ``(records, recommended_cache)`` where
        ``records`` is a list of one dict per selected song and
        ``recommended_cache`` is the same object that was passed in.
        When ``songs_df`` is empty, or every song of the genre is already
        cached, a dict of the form ``{"error": <message>}`` is returned.

    Raises:
        Exception: Any unexpected failure is logged and re-raised unchanged.
    """
    try:
        if songs_df.empty:
            return {"error": "Dataset not loaded"}

        in_genre = songs_df[songs_df["maingenre"] == genre]
        unseen = in_genre[~in_genre["trackname"].isin(recommended_cache)]
        if unseen.empty:
            return {"error": f"No new songs available for genre: {genre}"}

        # Never request more rows than exist: sampling without replacement
        # would raise otherwise.
        selected = unseen.sample(min(n, len(unseen)), replace=False)
        recommended_cache.update(selected["trackname"].tolist())

        records = selected[["trackname", "artistnames", "maingenre", "chords",
                            "difficulty_level"]].to_dict(orient="records")
        return records, recommended_cache
    except Exception as e:
        logger.error("Error retrieving random recommendations: %s", e)
        raise
112
+
113
if __name__ == '__main__':
    # Offline step: cluster the marks dataset and write the result as the
    # production dataset.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    processed_dir = os.path.join(base_dir, '..', 'data', 'processed')

    marks_data_file_path = os.path.join(processed_dir, 'marks_data.parquet')
    df_prod_file_path = os.path.join(processed_dir, 'prod_data.parquet')

    # The exercise dataset is not needed for clustering, so it is no longer
    # loaded here.
    marks_df = pd.read_parquet(marks_data_file_path)
    df_prod = cluster_function(marks_df)
    df_prod.to_parquet(df_prod_file_path)
src/spotify_data_fetch.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import random
4
+ import spotipy
5
+ import pandas as pd
6
+ from spotipy.oauth2 import SpotifyClientCredentials
7
+ from logger import get_logger # pylint: disable=import-error
8
+
9
+ logger = get_logger(__name__)
10
+
11
def _merge_track_rows(existing: pd.DataFrame, rows: list) -> pd.DataFrame:
    """Return previously saved tracks followed by newly fetched ``rows``."""
    frames = []
    if not existing.empty:
        frames.append(existing)
    if rows:
        frames.append(pd.DataFrame(rows))
    if not frames:
        # Keep the schema even when there is nothing to store, so a later
        # run can still read the ``spotify_song_id`` column.
        return pd.DataFrame(columns=["spotify_song_id", "track_name", "artist_names"])
    return pd.concat(frames, ignore_index=True)


def read_spotify_track_data(track_ids: list,
                            output_file: str,
                            save_every: int = 5000,
                            max_records: int = 25000) -> pd.DataFrame:
    """
    Fetch Spotify track metadata in batches, resuming from earlier runs.

    Ids already present in ``output_file`` are skipped, so the function can
    be called repeatedly until every id has been processed. A short random
    pause follows each batch to stay below the API rate limit.

    Credentials are not stored in the source: spotipy reads them from the
    ``SPOTIPY_CLIENT_ID`` and ``SPOTIPY_CLIENT_SECRET`` environment
    variables, which must be set before calling this function.

    Args:
        track_ids (list): Spotify track ids. Duplicates are fetched once.
        output_file (str): Parquet file holding the accumulated results. It
            is read at start-up (if it exists) and rewritten at every
            checkpoint and at the end.
        save_every (int): Write a checkpoint after roughly this many
            requested ids (rounded down to whole batches, at least one).
        max_records (int): Maximum number of ids to request in one run.

    Returns:
        pd.DataFrame: Previously saved plus newly fetched tracks with the
        columns ``spotify_song_id``, ``track_name`` and ``artist_names``.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    batch_size = 50  # the "several tracks" endpoint accepts at most 50 ids
    try:
        # No arguments: client id/secret are taken from the environment.
        sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

        # Load partial results if they exist
        if os.path.exists(output_file):
            existing = pd.read_parquet(output_file)
            processed_ids = (
                set(existing["spotify_song_id"])
                if "spotify_song_id" in existing.columns else set()
            )
        else:
            existing = pd.DataFrame()
            processed_ids = set()

        # Checkpoints are written mid-run, so the target directory must
        # exist before the first batch is fetched.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Keep unprocessed ids only, drop duplicates while preserving order,
        # and limit the run to max_records.
        remaining_ids = [
            tid for tid in dict.fromkeys(track_ids) if tid not in processed_ids
        ][:max_records]

        # At least one batch per checkpoint, which also avoids a modulo by
        # zero when save_every is smaller than a batch.
        batches_per_checkpoint = max(1, save_every // batch_size)

        results = []
        batch_starts = range(0, len(remaining_ids), batch_size)
        for batch_no, start in enumerate(batch_starts, start=1):
            response = sp.tracks(remaining_ids[start:start + batch_size])

            # Unknown ids come back as None and are skipped.
            results.extend(
                {
                    "spotify_song_id": track["id"],
                    "track_name": track["name"],
                    "artist_names": ", ".join(
                        artist["name"] for artist in track["artists"]
                    ),
                }
                for track in response["tracks"]
                if track is not None
            )

            # Tiny random delay to avoid hitting rate limit
            time.sleep(random.uniform(0.2, 0.5))

            if batch_no % batches_per_checkpoint == 0 and results:
                _merge_track_rows(existing, results).to_parquet(output_file, index=False)

        # Final save
        final_df = _merge_track_rows(existing, results)
        final_df.to_parquet(output_file, index=False)
        print(f"Run complete. Total saved: {len(final_df)} tracks")

        return final_df
    except Exception as e:
        logger.error("Failed fetching spotify data : %s", e)
        raise
87
+
88
if __name__ == '__main__':
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(BASE_DIR, "..", "data", "raw", "songs_data.parquet")
    output_path = os.path.join(BASE_DIR, "..", "data", "raw", "spotify_tracks.parquet")

    chord_data = pd.read_parquet(file_path)
    # Drop missing ids first (NaN would otherwise become the string 'nan'),
    # then discard ids stored as the literal text 'None'.
    track_id_list = [
        str(song_id)
        for song_id in chord_data["spotify_song_id"].dropna()
        if str(song_id).lower() != 'none'
    ]

    # The fetch writes to output_path while it runs, so the directory has to
    # exist before it starts, not after.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    spotify_track_data = read_spotify_track_data(track_id_list, output_path)
    spotify_track_data.to_parquet(output_path, index=False)
src/utils.py ADDED
File without changes
tests/test_utils.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi.testclient import TestClient
2
+ from src.main import app # adjust import if needed
3
+
4
+ client = TestClient(app)
5
+
6
def test_recommend_random():
    """GET /random_exercises answers successfully for a genre query."""
    # The app registers this route as /random_exercises and it accepts only
    # the required ``genre`` query parameter.
    response = client.get("/random_exercises", params={"genre": "rock"})
    assert response.status_code == 200
    # Failures are reported as {"error": ...} with status 200, so the status
    # code alone cannot tell success from failure.
    assert "error" not in response.json()
11
+
12
def test_recommend_filtered():
    """GET /recommendations answers successfully for tempo + exercise_id + genre."""
    # The app registers this route as a GET endpoint that reads its inputs
    # from the query string, not from a JSON body.
    params = {
        "tempo": 120,
        "exercise_id": 1,
        "genre": "rock",
    }
    response = client.get("/recommendations", params=params)
    assert response.status_code == 200
    # Failures are reported as {"error": ...} with status 200, so the status
    # code alone cannot tell success from failure.
    assert "error" not in response.json()