In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid")
/kaggle/input/music-education-performance-data/music_education_dataset.csv
Music Education Dataset Overview¶
This dataset evaluates the effectiveness of music education, collecting data on student performance, physiological information, and engagement, enhanced by IoT devices and AI algorithms.
Features¶
1. Student Information¶
- Student_ID (Categorical)
- Age (Numerical)
- Gender (Categorical)
- Class_Level (Categorical)
2. Music Performance Metrics¶
- Accuracy (Numerical)
- Rhythm (Numerical)
- Tempo (Numerical)
- Pitch_Accuracy (Numerical)
- Duration (Numerical)
- Volume (Numerical)
3. Physiological Data¶
- Heart_Rate (Numerical)
- Blood_Pressure (Numerical)
- Stress_Level (Numerical)
4. Engagement and Behavioral Data¶
- Engagement_Level (Numerical)
- Focus_Time (Numerical)
- Behavioral_Patterns (Categorical: 0 = No distractions, 1 = Mild distractions, 2 = Heavy distractions)
5. Learning Outcomes¶
- Skill_Development (Categorical: 1 = Poor, 2 = Fair, 3 = Good, 4 = Very good, 5 = Excellent)
6. Lesson Information¶
- Lesson_Type (Categorical)
- Instrument_Type (Categorical)
In [3]:
df = pd.read_csv("/kaggle/input/music-education-performance-data/music_education_dataset.csv")
df.sample(5).T
Out[3]:
70 | 46 | 84 | 11 | 15 | |
---|---|---|---|---|---|
Student_ID | S071 | S047 | S085 | S012 | S016 |
Age | 14 | 17 | 12 | 15 | 15 |
Gender | Female | Female | Male | Female | Male |
Class_Level | Beginner | Beginner | Advanced | Advanced | Beginner |
Accuracy | 76.529266 | 84.398024 | 97.879331 | 86.230771 | 79.080848 |
Rhythm | 87.631394 | 78.199874 | 92.114773 | 88.511385 | 83.031635 |
Tempo | 168.605189 | 132.84571 | 111.747719 | 123.844442 | 107.992242 |
Pitch_Accuracy | 86.143872 | 83.379878 | 89.835529 | 98.038937 | 93.79111 |
Duration | 525.675868 | 329.605049 | 397.407988 | 196.438237 | 267.08537 |
Volume | 94.08109 | 97.223411 | 71.263968 | 61.971047 | 92.364228 |
Heart_Rate | 94.785401 | 70.542072 | 77.681513 | 65.956249 | 99.097995 |
Blood_Pressure | 110.935595 | 125.331827 | 129.799659 | 129.868737 | 111.482686 |
Stress_Level | 9 | 7 | 9 | 10 | 6 |
Engagement_Level | 6 | 9 | 10 | 5 | 9 |
Focus_Time | 451.570417 | 105.436437 | 290.172471 | 274.862624 | 277.564585 |
Behavioral_Patterns | 2 | 1 | 1 | 1 | 1 |
Performance_Score | 99.270922 | 66.334577 | 84.377816 | 90.594479 | 61.151382 |
Skill_Development | 3 | 2 | 3 | 2 | 4 |
Engagement_Score | 2.674506 | 4.475308 | 5.965837 | 1.01425 | 9.897318 |
Timestamp | 2024-12-09 14:06:00 | 2024-12-09 00:22:00 | 2024-12-09 06:16:00 | 2024-12-09 00:23:00 | 2024-12-09 11:12:00 |
Lesson_Type | Practical | Theory | Theory | Theory | Theory |
Instrument_Type | Guitar | Piano | Piano | Guitar | Violin |
In [4]:
df.shape
Out[4]:
(100, 22)
In [5]:
df.isna().sum()
Out[5]:
Student_ID 0 Age 0 Gender 0 Class_Level 0 Accuracy 0 Rhythm 0 Tempo 0 Pitch_Accuracy 0 Duration 0 Volume 0 Heart_Rate 0 Blood_Pressure 0 Stress_Level 0 Engagement_Level 0 Focus_Time 0 Behavioral_Patterns 0 Performance_Score 0 Skill_Development 0 Engagement_Score 0 Timestamp 0 Lesson_Type 0 Instrument_Type 0 dtype: int64
Point to take note of:¶
- The dataset is very small(100 rows), so dropping rows should be out of the question.
- Thankfully, there are no nan values in the dataset.
In [6]:
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))
sns.countplot(df, x="Gender", ax=ax1)
sns.countplot(df, x="Class_Level", ax=ax2)
sns.countplot(df, x="Lesson_Type", ax=ax3)
sns.countplot(df, x="Instrument_Type", ax=ax4)
Out[6]:
<Axes: xlabel='Instrument_Type', ylabel='count'>
In [7]:
fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))
sns.histplot(df,kde=True, x="Skill_Development", hue="Gender", ax=ax1, multiple="dodge")
sns.histplot(df,kde=True, x="Skill_Development", hue="Class_Level", ax=ax2, multiple="dodge")
sns.histplot(df,kde=True, x="Skill_Development", hue="Lesson_Type", ax=ax3, multiple="dodge")
sns.histplot(df,kde=True, x="Skill_Development", hue="Instrument_Type", ax=ax4, multiple="dodge")
Out[7]:
<Axes: xlabel='Skill_Development', ylabel='Count'>
In [36]:
df.sample(2).T
Out[36]:
67 | 3 | |
---|---|---|
Student_ID | S068 | S004 |
Age | 13 | 16 |
Gender | Female | Female |
Class_Level | Beginner | Intermediate |
Accuracy | 73.001505 | 77.101801 |
Rhythm | 79.602953 | 90.951063 |
Tempo | 169.239948 | 100.547725 |
Pitch_Accuracy | 89.947919 | 99.51541 |
Duration | 544.770562 | 259.091623 |
Volume | 60.09066 | 61.625075 |
Heart_Rate | 63.122992 | 76.125796 |
Blood_Pressure | 114.844849 | 124.20161 |
Stress_Level | 4 | 7 |
Engagement_Level | 3 | 5 |
Focus_Time | 439.493412 | 346.641786 |
Behavioral_Patterns | 1 | 2 |
Performance_Score | 72.763855 | 67.387286 |
Skill_Development | 1 | 2 |
Engagement_Score | 2.074393 | 4.553207 |
Timestamp | 2024-12-09 11:08:00 | 2024-12-09 03:29:00 |
Lesson_Type | Theory | Theory |
Instrument_Type | Piano | Violin |
In [21]:
fig, [[ax1, ax2, ax3], [ax4, ax5, ax6]] = plt.subplots(2,3, figsize=(15,10))
sns.boxplot(df, x="Skill_Development", y="Accuracy", ax=ax1)
sns.boxplot(df, x="Skill_Development", y="Rhythm", ax=ax2)
sns.boxplot(df, x="Skill_Development", y="Tempo", ax=ax3)
sns.boxplot(df, x="Skill_Development", y="Pitch_Accuracy", ax=ax4)
sns.boxplot(df, x="Skill_Development", y="Focus_Time", ax=ax5)
sns.boxplot(df, x="Skill_Development", y="Performance_Score", ax=ax6)
Out[21]:
<Axes: xlabel='Skill_Development', ylabel='Performance_Score'>
In [30]:
fig, [ax1, ax2] = plt.subplots(1,2, figsize=(10,5))
sns.scatterplot(df, x="Age", y="Accuracy", hue="Skill_Development", ax=ax1, palette="Set1")
sns.scatterplot(df, x="Accuracy", y="Engagement_Score", hue="Skill_Development", ax=ax2, palette="Set1")
Out[30]:
<Axes: xlabel='Accuracy', ylabel='Engagement_Score'>
In [40]:
sns.boxplot(df, x="Skill_Development", y="Age")
Out[40]:
<Axes: xlabel='Skill_Development', ylabel='Age'>
In [31]:
df.columns
Out[31]:
Index(['Student_ID', 'Age', 'Gender', 'Class_Level', 'Accuracy', 'Rhythm', 'Tempo', 'Pitch_Accuracy', 'Duration', 'Volume', 'Heart_Rate', 'Blood_Pressure', 'Stress_Level', 'Engagement_Level', 'Focus_Time', 'Behavioral_Patterns', 'Performance_Score', 'Skill_Development', 'Engagement_Score', 'Timestamp', 'Lesson_Type', 'Instrument_Type'], dtype='object')
In [35]:
sns.heatmap(df[['Accuracy', 'Rhythm',
'Tempo', 'Pitch_Accuracy', 'Duration', 'Volume', 'Heart_Rate',
'Blood_Pressure', 'Performance_Score', 'Focus_Time' ]].corr())
Out[35]:
<Axes: >
In [58]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
y = df['Skill_Development']
X = df.drop(['Student_ID','Skill_Development','Timestamp'], axis=1)
In [59]:
X.sample(1).T
Out[59]:
66 | |
---|---|
Age | 15 |
Gender | Male |
Class_Level | Intermediate |
Accuracy | 99.621783 |
Rhythm | 80.44033 |
Tempo | 90.688699 |
Pitch_Accuracy | 91.916575 |
Duration | 436.330554 |
Volume | 60.95494 |
Heart_Rate | 70.856061 |
Blood_Pressure | 110.679336 |
Stress_Level | 3 |
Engagement_Level | 10 |
Focus_Time | 199.726929 |
Behavioral_Patterns | 2 |
Performance_Score | 94.511324 |
Engagement_Score | 2.394919 |
Lesson_Type | Theory |
Instrument_Type | Piano |
In [60]:
num_cols = ['Age','Accuracy','Rhythm','Tempo','Pitch_Accuracy','Duration','Volume','Heart_Rate', 'Blood_Pressure', 'Performance_Score', 'Focus_Time', 'Engagement_Score']
one_hot_cols = ['Gender','Class_Level','Lesson_Type','Instrument_Type']
label_cols = ['Stress_Level','Engagement_Level','Behavioral_Patterns']
preprocessor = ColumnTransformer([
('num', StandardScaler(), num_cols),
('ohe', OneHotEncoder(drop="first"), one_hot_cols),
('passthrough', 'passthrough', label_cols)
])
In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [64]:
preprocessor = preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)
In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
In [73]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
# Evaluating
error_dfs = []
def train_and_generate_error(model):
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
error_df = pd.DataFrame([], columns=["Error Name","Error Value","Error Range"])
def generate_error_row(error_name: str, err_val: float, range: str):
error_df.loc[len(error_df)] = [error_name, err_val, range]
generate_error_row("Accuracy", acc, "[0, 1]")
generate_error_row("Precision", precision, "[0, 1]")
generate_error_row("F1 score", f1, "[0, 1]")
generate_error_row("Recall", recall, "[0, 1]")
error_dfs.append(error_df)
In [74]:
error_dfs = []
train_and_generate_error(LogisticRegression(multi_class='ovr'))
train_and_generate_error(SVC(class_weight='balanced', max_iter=3000))
train_and_generate_error(DecisionTreeClassifier(max_features="log2"))
train_and_generate_error(MLPClassifier(max_iter=500, hidden_layer_sizes=(32, 64,)))
train_and_generate_error(RandomForestClassifier(n_estimators=200))
/opt/conda/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) /opt/conda/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
In [75]:
# Heatmapping errors on different models
model_names = ["LogisticRegression", "SVC", "DecisionTreeClassifier", "MLPClassifier", "RandomForestClassifier"]
def plot_model_perf_heatmap(model_names):
error_values = {model_names[i] : list(map(lambda err: err[1], mdl.values)) for i, mdl in enumerate(error_dfs)}
model_error_df = pd.DataFrame(error_values, index=["Accuracy","Precision","F1 score","Recall"]).T
sns.heatmap(model_error_df, annot=True)
plot_model_perf_heatmap(model_names)
Conclusion¶
The models could be optimized, but it is highly likely they will resule in a low score due to the limited rows of data available.