In [2]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("whitegrid")

/kaggle/input/music-education-performance-data/music_education_dataset.csv

Music Education Dataset Overview¶

This dataset evaluates the effectiveness of music education, collecting data on student performance, physiological information, and engagement, enhanced by IoT devices and AI algorithms.

Features¶

1. Student Information¶

Student_ID (Categorical)
Age (Numerical)
Gender (Categorical)
Class_Level (Categorical)

2. Music Performance Metrics¶

Accuracy (Numerical)
Rhythm (Numerical)
Tempo (Numerical)
Pitch_Accuracy (Numerical)
Duration (Numerical)
Volume (Numerical)

3. Physiological Data¶

Heart_Rate (Numerical)
Blood_Pressure (Numerical)
Stress_Level (Numerical)

4. Engagement and Behavioral Data¶

Engagement_Level (Numerical)
Focus_Time (Numerical)
Behavioral_Patterns (Categorical: 0 = No distractions, 1 = Mild distractions, 2 = Heavy distractions)

5. Learning Outcomes¶

Skill_Development (Categorical: 1 = Poor, 2 = Fair, 3 = Good, 4 = Very good, 5 = Excellent)

6. Lesson Information¶

Lesson_Type (Categorical)
Instrument_Type (Categorical)

In [3]:

df = pd.read_csv("/kaggle/input/music-education-performance-data/music_education_dataset.csv")
df.sample(5).T

Out[3]:

	70	46	84	11	15
Student_ID	S071	S047	S085	S012	S016
Age	14	17	12	15	15
Gender	Female	Female	Male	Female	Male
Class_Level	Beginner	Beginner	Advanced	Advanced	Beginner
Accuracy	76.529266	84.398024	97.879331	86.230771	79.080848
Rhythm	87.631394	78.199874	92.114773	88.511385	83.031635
Tempo	168.605189	132.84571	111.747719	123.844442	107.992242
Pitch_Accuracy	86.143872	83.379878	89.835529	98.038937	93.79111
Duration	525.675868	329.605049	397.407988	196.438237	267.08537
Volume	94.08109	97.223411	71.263968	61.971047	92.364228
Heart_Rate	94.785401	70.542072	77.681513	65.956249	99.097995
Blood_Pressure	110.935595	125.331827	129.799659	129.868737	111.482686
Stress_Level	9	7	9	10	6
Engagement_Level	6	9	10	5	9
Focus_Time	451.570417	105.436437	290.172471	274.862624	277.564585
Behavioral_Patterns	2	1	1	1	1
Performance_Score	99.270922	66.334577	84.377816	90.594479	61.151382
Skill_Development	3	2	3	2	4
Engagement_Score	2.674506	4.475308	5.965837	1.01425	9.897318
Timestamp	2024-12-09 14:06:00	2024-12-09 00:22:00	2024-12-09 06:16:00	2024-12-09 00:23:00	2024-12-09 11:12:00
Lesson_Type	Practical	Theory	Theory	Theory	Theory
Instrument_Type	Guitar	Piano	Piano	Guitar	Violin

In [4]:

df.shape

Out[4]:

(100, 22)

In [5]:

df.isna().sum()

Out[5]:

Student_ID             0
Age                    0
Gender                 0
Class_Level            0
Accuracy               0
Rhythm                 0
Tempo                  0
Pitch_Accuracy         0
Duration               0
Volume                 0
Heart_Rate             0
Blood_Pressure         0
Stress_Level           0
Engagement_Level       0
Focus_Time             0
Behavioral_Patterns    0
Performance_Score      0
Skill_Development      0
Engagement_Score       0
Timestamp              0
Lesson_Type            0
Instrument_Type        0
dtype: int64

Point to take note of:¶

The dataset is very small(100 rows), so dropping rows should be out of the question.
Thankfully, there are no nan values in the dataset.

In [6]:

fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))

sns.countplot(df, x="Gender", ax=ax1)
sns.countplot(df, x="Class_Level", ax=ax2)
sns.countplot(df, x="Lesson_Type", ax=ax3)
sns.countplot(df, x="Instrument_Type", ax=ax4)

Out[6]:

<Axes: xlabel='Instrument_Type', ylabel='count'>

No description has been provided for this image

In [7]:

fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))

sns.histplot(df,kde=True, x="Skill_Development", hue="Gender", ax=ax1, multiple="dodge")
sns.histplot(df,kde=True, x="Skill_Development", hue="Class_Level", ax=ax2, multiple="dodge")
sns.histplot(df,kde=True, x="Skill_Development", hue="Lesson_Type", ax=ax3, multiple="dodge")
sns.histplot(df,kde=True, x="Skill_Development", hue="Instrument_Type", ax=ax4, multiple="dodge")

Out[7]:

<Axes: xlabel='Skill_Development', ylabel='Count'>

In [36]:

df.sample(2).T

Out[36]:

	67	3
Student_ID	S068	S004
Age	13	16
Gender	Female	Female
Class_Level	Beginner	Intermediate
Accuracy	73.001505	77.101801
Rhythm	79.602953	90.951063
Tempo	169.239948	100.547725
Pitch_Accuracy	89.947919	99.51541
Duration	544.770562	259.091623
Volume	60.09066	61.625075
Heart_Rate	63.122992	76.125796
Blood_Pressure	114.844849	124.20161
Stress_Level	4	7
Engagement_Level	3	5
Focus_Time	439.493412	346.641786
Behavioral_Patterns	1	2
Performance_Score	72.763855	67.387286
Skill_Development	1	2
Engagement_Score	2.074393	4.553207
Timestamp	2024-12-09 11:08:00	2024-12-09 03:29:00
Lesson_Type	Theory	Theory
Instrument_Type	Piano	Violin

In [21]:

fig, [[ax1, ax2, ax3], [ax4, ax5, ax6]] = plt.subplots(2,3, figsize=(15,10))

sns.boxplot(df, x="Skill_Development", y="Accuracy", ax=ax1)
sns.boxplot(df, x="Skill_Development", y="Rhythm", ax=ax2)
sns.boxplot(df, x="Skill_Development", y="Tempo", ax=ax3)
sns.boxplot(df, x="Skill_Development", y="Pitch_Accuracy", ax=ax4)
sns.boxplot(df, x="Skill_Development", y="Focus_Time", ax=ax5)
sns.boxplot(df, x="Skill_Development", y="Performance_Score", ax=ax6)

Out[21]:

<Axes: xlabel='Skill_Development', ylabel='Performance_Score'>

In [30]:

fig, [ax1, ax2] = plt.subplots(1,2, figsize=(10,5))

sns.scatterplot(df, x="Age", y="Accuracy", hue="Skill_Development", ax=ax1, palette="Set1")
sns.scatterplot(df, x="Accuracy", y="Engagement_Score", hue="Skill_Development", ax=ax2, palette="Set1")

Out[30]:

<Axes: xlabel='Accuracy', ylabel='Engagement_Score'>

In [40]:

sns.boxplot(df, x="Skill_Development", y="Age")

Out[40]:

<Axes: xlabel='Skill_Development', ylabel='Age'>

In [31]:

df.columns

Out[31]:

Index(['Student_ID', 'Age', 'Gender', 'Class_Level', 'Accuracy', 'Rhythm',
       'Tempo', 'Pitch_Accuracy', 'Duration', 'Volume', 'Heart_Rate',
       'Blood_Pressure', 'Stress_Level', 'Engagement_Level', 'Focus_Time',
       'Behavioral_Patterns', 'Performance_Score', 'Skill_Development',
       'Engagement_Score', 'Timestamp', 'Lesson_Type', 'Instrument_Type'],
      dtype='object')

In [35]:

sns.heatmap(df[['Accuracy', 'Rhythm',
       'Tempo', 'Pitch_Accuracy', 'Duration', 'Volume', 'Heart_Rate',
       'Blood_Pressure', 'Performance_Score', 'Focus_Time' ]].corr())

Out[35]:

<Axes: >

In [58]:

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

y = df['Skill_Development']
X = df.drop(['Student_ID','Skill_Development','Timestamp'], axis=1)

In [59]:

X.sample(1).T

Out[59]:

	66
Age	15
Gender	Male
Class_Level	Intermediate
Accuracy	99.621783
Rhythm	80.44033
Tempo	90.688699
Pitch_Accuracy	91.916575
Duration	436.330554
Volume	60.95494
Heart_Rate	70.856061
Blood_Pressure	110.679336
Stress_Level	3
Engagement_Level	10
Focus_Time	199.726929
Behavioral_Patterns	2
Performance_Score	94.511324
Engagement_Score	2.394919
Lesson_Type	Theory
Instrument_Type	Piano

In [60]:

num_cols = ['Age','Accuracy','Rhythm','Tempo','Pitch_Accuracy','Duration','Volume','Heart_Rate', 'Blood_Pressure', 'Performance_Score', 'Focus_Time', 'Engagement_Score']
one_hot_cols = ['Gender','Class_Level','Lesson_Type','Instrument_Type']
label_cols = ['Stress_Level','Engagement_Level','Behavioral_Patterns']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('ohe', OneHotEncoder(drop="first"), one_hot_cols),
    ('passthrough', 'passthrough', label_cols)
])

In [61]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:

preprocessor = preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

In [65]:

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [73]:

from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix

# Evaluating
error_dfs = []
def train_and_generate_error(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    
    error_df = pd.DataFrame([], columns=["Error Name","Error Value","Error Range"])
    
    def generate_error_row(error_name: str, err_val: float, range: str):
        error_df.loc[len(error_df)] = [error_name, err_val, range]
    
    generate_error_row("Accuracy", acc, "[0, 1]")
    generate_error_row("Precision", precision, "[0, 1]")
    generate_error_row("F1 score", f1, "[0, 1]")
    generate_error_row("Recall", recall, "[0, 1]")

    error_dfs.append(error_df)

In [74]:

error_dfs = []

train_and_generate_error(LogisticRegression(multi_class='ovr'))
train_and_generate_error(SVC(class_weight='balanced', max_iter=3000))
train_and_generate_error(DecisionTreeClassifier(max_features="log2"))
train_and_generate_error(MLPClassifier(max_iter=500, hidden_layer_sizes=(32, 64,)))
train_and_generate_error(RandomForestClassifier(n_estimators=200))

/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

In [75]:

# Heatmapping errors on different models
model_names = ["LogisticRegression", "SVC", "DecisionTreeClassifier", "MLPClassifier", "RandomForestClassifier"]

def plot_model_perf_heatmap(model_names):
    error_values = {model_names[i] : list(map(lambda err: err[1], mdl.values)) for i, mdl in enumerate(error_dfs)}
    
    model_error_df = pd.DataFrame(error_values, index=["Accuracy","Precision","F1 score","Recall"]).T
    sns.heatmap(model_error_df, annot=True)

plot_model_perf_heatmap(model_names)

Conclusion¶

The models could be optimized, but it is highly likely they will resule in a low score due to the limited rows of data available.