# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import warnings # To suppress some warnings
# Suppress the specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
/kaggle/input/playground-series-s4e11/sample_submission.csv
/kaggle/input/playground-series-s4e11/train.csv
/kaggle/input/playground-series-s4e11/test.csv
Introduction
Time taken ⏱️: ~5 hours
This notebook showcases extensive EDA (Exploratory Data Analysis), feature engineering, data cleaning, and predictions using three different models (Logistic Regression, a neural network with TensorFlow, and CatBoost classification).
The dataset for this competition (both train and test) was generated from a deep learning model trained on the Depression Survey/Dataset for Analysis dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.
Exploratory Data Analysis
train_df = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
train_df.sample(2).T
138815 | 12651 | |
---|---|---|
id | 138815 | 12651 |
Name | Abhishek | Abhishek |
Gender | Male | Male |
Age | 20.0 | 33.0 |
City | Hyderabad | Kolkata |
Working Professional or Student | Student | Student |
Profession | NaN | NaN |
Academic Pressure | 2.0 | 5.0 |
Work Pressure | NaN | NaN |
CGPA | 7.25 | 7.92 |
Study Satisfaction | 5.0 | 3.0 |
Job Satisfaction | NaN | NaN |
Sleep Duration | 5-6 hours | 5-6 hours |
Dietary Habits | Healthy | Moderate |
Degree | Class 12 | MSc |
Have you ever had suicidal thoughts ? | Yes | Yes |
Work/Study Hours | 6.0 | 11.0 |
Financial Stress | 1.0 | 5.0 |
Family History of Mental Illness | Yes | No |
Depression | 0 | 1 |
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype
---  ------                                 --------------   -----
 0   id                                     140700 non-null  int64
 1   Name                                   140700 non-null  object
 2   Gender                                 140700 non-null  object
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object
 5   Working Professional or Student        140700 non-null  object
 6   Profession                             104070 non-null  object
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   float64
 11  Job Satisfaction                       112790 non-null  float64
 12  Sleep Duration                         140700 non-null  object
 13  Dietary Habits                         140696 non-null  object
 14  Degree                                 140698 non-null  object
 15  Have you ever had suicidal thoughts ?  140700 non-null  object
 16  Work/Study Hours                       140700 non-null  float64
 17  Financial Stress                       140696 non-null  float64
 18  Family History of Mental Illness       140700 non-null  object
 19  Depression                             140700 non-null  int64
dtypes: float64(8), int64(2), object(10)
memory usage: 21.5+ MB
train_df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
id | 140700.0 | 70349.500000 | 40616.735775 | 0.00 | 35174.75 | 70349.50 | 105524.25 | 140699.0 |
Age | 140700.0 | 40.388621 | 12.384099 | 18.00 | 29.00 | 42.00 | 51.00 | 60.0 |
Academic Pressure | 27897.0 | 3.142273 | 1.380457 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
Work Pressure | 112782.0 | 2.998998 | 1.405771 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
CGPA | 27898.0 | 7.658636 | 1.464466 | 5.03 | 6.29 | 7.77 | 8.92 | 10.0 |
Study Satisfaction | 27897.0 | 2.944940 | 1.360197 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
Job Satisfaction | 112790.0 | 2.974404 | 1.416078 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
Work/Study Hours | 140700.0 | 6.252679 | 3.853615 | 0.00 | 3.00 | 6.00 | 10.00 | 12.0 |
Financial Stress | 140696.0 | 2.988983 | 1.413633 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
Depression | 140700.0 | 0.181713 | 0.385609 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 |
train_df.isna().sum()
id                                            0
Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64
Distribution of different categories
fig, [[ax1, ax2],[ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Gender", ax=ax1)
sns.countplot(train_df, x="Working Professional or Student", ax=ax2)
sns.countplot(train_df, x="Have you ever had suicidal thoughts ?", ax=ax3)
sns.countplot(train_df, x="Family History of Mental Illness", ax=ax4)
<Axes: xlabel='Family History of Mental Illness', ylabel='count'>
Huge dimensions from the City and Profession columns
The categorical columns will be one-hot encoded later, hence the unique values are referred to as dimensions.
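A quick sketch to gauge that blow-up before encoding (nothing beyond `nunique` is assumed here; each unique value roughly becomes one dummy column):

```python
# Rough gauge of the one-hot blow-up per categorical feature
for col in ["City", "Profession", "Degree", "Sleep Duration"]:
    print(f"{col}: {train_df[col].nunique()} unique values")
```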
fig, ax = plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="City")
<Axes: xlabel='City', ylabel='count'>
fig, ax = plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Profession")
<Axes: xlabel='Profession', ylabel='count'>
fig, [[ax1, ax2],[ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))
plt.xticks(rotation=90)
sns.countplot(train_df.fillna("missing"), x="Academic Pressure", ax=ax1)
sns.countplot(train_df.fillna("missing"), x="Work Pressure", ax=ax2)
sns.countplot(train_df.fillna("missing"), x="Study Satisfaction", ax=ax3)
sns.countplot(train_df.fillna("missing"), x="Job Satisfaction", ax=ax4)
<Axes: xlabel='Job Satisfaction', ylabel='count'>
Why are there so many missing values for academic and work pressure?
# Students with work pressure
train_df[(train_df["Work Pressure"].notna()) & (train_df["Working Professional or Student"] == "Student")]
id | Name | Gender | Age | City | Working Professional or Student | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
21880 | 21880 | Aarush | Male | 38.0 | Chennai | Student | NaN | NaN | 5.0 | NaN | NaN | 4.0 | 5-6 hours | Healthy | Class 12 | No | 2.0 | 3.0 | No | 0 |
75007 | 75007 | Aarav | Male | 21.0 | Lucknow | Student | NaN | NaN | 2.0 | NaN | NaN | 1.0 | 7-8 hours | Moderate | Class 12 | Yes | 3.0 | 3.0 | Yes | 0 |
129756 | 129756 | Kian | Male | 18.0 | Rajkot | Student | NaN | NaN | 5.0 | NaN | NaN | 4.0 | 7-8 hours | Moderate | Class 12 | Yes | 9.0 | 4.0 | No | 1 |
# Workers with academic pressure
train_df[(train_df["Academic Pressure"].notna()) & (train_df["Working Professional or Student"] == "Working Professional")]
id | Name | Gender | Age | City | Working Professional or Student | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18752 | 18752 | Aarti | Female | 18.0 | Kalyan | Working Professional | NaN | 2.0 | NaN | 8.14 | 5.0 | NaN | Less than 5 hours | Healthy | Class 12 | Yes | 7.0 | 1.0 | Yes | 1 |
41915 | 41915 | Advait | Male | 18.0 | Kolkata | Working Professional | NaN | 4.0 | NaN | 8.04 | 3.0 | NaN | 5-6 hours | Healthy | Class 12 | Yes | 1.0 | 5.0 | Yes | 1 |
55827 | 55827 | Prachi | Female | 19.0 | Kalyan | Working Professional | NaN | 3.0 | NaN | 8.11 | 2.0 | NaN | 5-6 hours | Unhealthy | Class 12 | Yes | 4.0 | 4.0 | No | 1 |
99062 | 99062 | Tanisha | Female | 24.0 | Surat | Working Professional | Content Writer | 4.0 | NaN | 5.42 | 4.0 | NaN | Less than 5 hours | Moderate | B.Ed | Yes | 1.0 | 5.0 | No | 1 |
101189 | 101189 | Keshav | Male | 34.0 | Rajkot | Working Professional | NaN | 4.0 | NaN | 8.24 | 3.0 | NaN | More than 8 hours | Moderate | MD | No | 11.0 | 1.0 | No | 0 |
Conclusion
There is a negligible number of students with work pressure and a negligible number of working professionals with academic pressure, so I am going to merge the two columns into a single Work/Study Pressure column.
# If the row is a student, academic pressure takes precedence
# Else, work pressure takes precedence
# If both are missing, the result stays NaN
def merge_pressure(row):
    return row["Academic Pressure"] if row["Working Professional or Student"] == "Student" else row["Work Pressure"]
pressure_col = train_df.apply(merge_pressure, axis=1)
pressure_col = pd.Series(pressure_col, name="Work/Study Pressure")
train_df = train_df.drop(["Academic Pressure","Work Pressure"], axis=1)
train_df = train_df.join(pressure_col)
train_df["Work/Study Pressure"].isna().sum() # The remaining can be dropped since the dataset is huge
29
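Side note: the row-wise apply works but is slow on ~140k rows. A vectorized sketch with np.where (not what the notebook runs, and it would have to execute before the two source columns are dropped) does the same selection, with NaN propagating naturally:

```python
# Vectorized equivalent of merge_pressure: pick per row based on the role
is_student = train_df["Working Professional or Student"] == "Student"
pressure = np.where(is_student,
                    train_df["Academic Pressure"],
                    train_df["Work Pressure"])
```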
Same thing for study and job satisfaction
# Students with job satisfaction
train_df[(train_df["Job Satisfaction"].notna()) & (train_df["Working Professional or Student"] == "Student")]
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1428 | 1428 | Rishi | Male | 29.0 | Srinagar | Student | NaN | 9.63 | 3.0 | 3.0 | 7-8 hours | Moderate | M.Pharm | No | 8.0 | 5.0 | No | 0 | 3.0 |
21880 | 21880 | Aarush | Male | 38.0 | Chennai | Student | NaN | NaN | NaN | 4.0 | 5-6 hours | Healthy | Class 12 | No | 2.0 | 3.0 | No | 0 | NaN |
70453 | 70453 | Veda | Female | 20.0 | Ahmedabad | Student | NaN | NaN | NaN | 2.0 | Less than 5 hours | Moderate | Class 12 | Yes | 12.0 | 3.0 | Yes | 1 | NaN |
75007 | 75007 | Aarav | Male | 21.0 | Lucknow | Student | NaN | NaN | NaN | 1.0 | 7-8 hours | Moderate | Class 12 | Yes | 3.0 | 3.0 | Yes | 0 | NaN |
105773 | 105773 | Anand | Male | 18.0 | Ahmedabad | Student | NaN | NaN | NaN | 1.0 | Less than 5 hours | Moderate | Class 12 | Yes | 9.0 | 5.0 | No | 1 | NaN |
116703 | 116703 | Neil | Male | 25.0 | Bangalore | Student | NaN | 9.44 | 5.0 | 2.0 | Less than 5 hours | Unhealthy | PhD | Yes | 4.0 | 1.0 | No | 1 | 4.0 |
129756 | 129756 | Kian | Male | 18.0 | Rajkot | Student | NaN | NaN | NaN | 4.0 | 7-8 hours | Moderate | Class 12 | Yes | 9.0 | 4.0 | No | 1 | NaN |
134830 | 134830 | Aaradhya | Female | 24.0 | Meerut | Student | NaN | NaN | NaN | 2.0 | More than 8 hours | Unhealthy | Class 12 | No | 0.0 | 5.0 | No | 0 | NaN |
# Workers with study satisfaction
train_df[(train_df["Study Satisfaction"].notna()) & (train_df["Working Professional or Student"] == "Working Professional")]
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18752 | 18752 | Aarti | Female | 18.0 | Kalyan | Working Professional | NaN | 8.14 | 5.0 | NaN | Less than 5 hours | Healthy | Class 12 | Yes | 7.0 | 1.0 | Yes | 1 | NaN |
41915 | 41915 | Advait | Male | 18.0 | Kolkata | Working Professional | NaN | 8.04 | 3.0 | NaN | 5-6 hours | Healthy | Class 12 | Yes | 1.0 | 5.0 | Yes | 1 | NaN |
55827 | 55827 | Prachi | Female | 19.0 | Kalyan | Working Professional | NaN | 8.11 | 2.0 | NaN | 5-6 hours | Unhealthy | Class 12 | Yes | 4.0 | 4.0 | No | 1 | NaN |
60348 | 60348 | Nishant | Male | 29.0 | Agra | Working Professional | NaN | 9.72 | 2.0 | NaN | 5-6 hours | Moderate | B.Ed | No | 7.0 | 2.0 | Yes | 0 | NaN |
99062 | 99062 | Tanisha | Female | 24.0 | Surat | Working Professional | Content Writer | 5.42 | 4.0 | NaN | Less than 5 hours | Moderate | B.Ed | Yes | 1.0 | 5.0 | No | 1 | NaN |
101189 | 101189 | Keshav | Male | 34.0 | Rajkot | Working Professional | NaN | 8.24 | 3.0 | NaN | More than 8 hours | Moderate | MD | No | 11.0 | 1.0 | No | 0 | NaN |
# If the row is a student, Study Satisfaction takes precedence
# Else, Job Satisfaction takes precedence
# If both are missing, the result stays NaN
def merge_satisfaction(row):
    return row["Study Satisfaction"] if row["Working Professional or Student"] == "Student" else row["Job Satisfaction"]
satisfaction_col = train_df.apply(merge_satisfaction, axis=1)
satisfaction_col = pd.Series(satisfaction_col, name="Job/Study Satisfaction")
train_df = train_df.drop(["Study Satisfaction","Job Satisfaction"], axis=1)
train_df = train_df.join(satisfaction_col)
train_df["Job/Study Satisfaction"].isna().sum() # The remaining can be dropped since the dataset is huge
27
Dropping rows with NA values for Work/Study Pressure, Job/Study Satisfaction, and Financial Stress, due to their insignificant counts
train_df = train_df[train_df["Work/Study Pressure"].notna()]
train_df = train_df[train_df["Job/Study Satisfaction"].notna()]
train_df = train_df[train_df["Financial Stress"].notna()]
train_df.sample(2)
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
81178 | 81178 | Nandini | Female | 24.0 | Faridabad | Working Professional | Data Scientist | NaN | More than 8 hours | Moderate | B.Tech | No | 4.0 | 3.0 | No | 0 | 3.0 | 2.0 |
10710 | 10710 | Vibha | Female | 59.0 | Patna | Working Professional | Content Writer | NaN | Less than 5 hours | Moderate | B.Tech | Yes | 3.0 | 4.0 | Yes | 0 | 2.0 | 1.0 |
Why drop 114 rows of data with rare Degree values?
train_df
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Aaradhya | Female | 49.0 | Ludhiana | Working Professional | Chef | NaN | More than 8 hours | Healthy | BHM | No | 1.0 | 2.0 | No | 0 | 5.0 | 2.0 |
1 | 1 | Vivan | Male | 26.0 | Varanasi | Working Professional | Teacher | NaN | Less than 5 hours | Unhealthy | LLB | Yes | 7.0 | 3.0 | No | 1 | 4.0 | 3.0 |
2 | 2 | Yuvraj | Male | 33.0 | Visakhapatnam | Student | NaN | 8.97 | 5-6 hours | Healthy | B.Pharm | Yes | 3.0 | 1.0 | No | 1 | 5.0 | 2.0 |
3 | 3 | Yuvraj | Male | 22.0 | Mumbai | Working Professional | Teacher | NaN | Less than 5 hours | Moderate | BBA | Yes | 10.0 | 1.0 | Yes | 1 | 5.0 | 1.0 |
4 | 4 | Rhea | Female | 30.0 | Kanpur | Working Professional | Business Analyst | NaN | 5-6 hours | Unhealthy | BBA | Yes | 9.0 | 4.0 | Yes | 0 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
140695 | 140695 | Vidya | Female | 18.0 | Ahmedabad | Working Professional | NaN | NaN | 5-6 hours | Unhealthy | Class 12 | No | 2.0 | 4.0 | Yes | 1 | 5.0 | 4.0 |
140696 | 140696 | Lata | Female | 41.0 | Hyderabad | Working Professional | Content Writer | NaN | 7-8 hours | Moderate | B.Tech | Yes | 6.0 | 5.0 | Yes | 0 | 5.0 | 4.0 |
140697 | 140697 | Aanchal | Female | 24.0 | Kolkata | Working Professional | Marketing Manager | NaN | More than 8 hours | Moderate | B.Com | No | 4.0 | 4.0 | No | 0 | 3.0 | 1.0 |
140698 | 140698 | Prachi | Female | 49.0 | Srinagar | Working Professional | Plumber | NaN | 5-6 hours | Moderate | ME | Yes | 10.0 | 1.0 | No | 0 | 5.0 | 2.0 |
140699 | 140699 | Sai | Male | 27.0 | Patna | Student | NaN | 9.24 | Less than 5 hours | Healthy | BCA | Yes | 2.0 | 3.0 | Yes | 1 | 4.0 | 1.0 |
140659 rows × 18 columns
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Degree")
plt.axhline(y=2000, c="red", ls="--")
<matplotlib.lines.Line2D at 0x7a6cb39b11b0>
(s := train_df["Degree"].value_counts())[s < 2000].sum()
114
Conclusion
We are not going to use LabelEncoder for this column, since there is no ordinality between the categories. Instead, we are going to use OneHotEncoder to create dummy columns. However, doing so would introduce many dimensions, making model training sub-optimal. Hence, we are going to remove the 114 rows whose Degree values have insignificant frequencies (<2000, indicated by the red dashed line), reducing the dimensions as a result.
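The same prune-below-threshold pattern is repeated below for Sleep Duration, City, and Profession; a small helper like this (hypothetical, not part of the original notebook) captures it:

```python
# Hypothetical helper for the recurring prune-below-threshold pattern
def drop_rare_categories(df, col, min_count):
    """Drop rows whose value in `col` appears fewer than `min_count` times."""
    counts = df[col].value_counts()
    rare = counts[counts < min_count].index
    return df[~df[col].isin(rare)]

# e.g. train_df = drop_rare_categories(train_df, "Degree", 2000)
```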
degree_val_counts = train_df["Degree"].value_counts()
insignificant_degree_col_values = degree_val_counts[degree_val_counts < 2000].keys()
insignificant_degree_col_values
Index(['M.Arch', 'UX/UI Designer', 'B.Sc', 'Kalyan', 'M', 'BArch', 'MEd', 'BPharm', 'P.Com', 'Jhanvi', 'LLBA', 'Degree', 'B', 'Bhopal', 'BEd', 'Nalini', 'LL B.Ed', 'L.Ed', '5.88', 'HCA', 'Marsh', 'S.Arch', 'Pihu', 'Lata', 'LHM', '8.56', 'Entrepreneur', 'Aarav', 'LLTech', 'BB', 'M_Tech', 'B.Student', 'E.Tech', 'M.S', 'Navya', 'Mihir', 'RCA', 'B B.Com', 'LCA', 'N.Pharm', 'Doctor', 'CGPA', 'LLEd', 'LLS', 'Esha', 'Working Professional', 'Mthanya', 'B.3.79', 'K.Ed', 'Mahika', '24', 'Vrinda', 'Brithika', 'ACA', 'Badhya', 'HR Manager', 'Unite', 'P.Pharm', 'MPharm', 'Data Scientist', 'LL.Com', 'Business Analyst', 'H_Pharm', 'Class 11', '20', 'S.Tech', 'Veda', 'BH', 'MPA', 'S.Pharm', 'M. Business Analyst', 'Bhavesh', 'Brit', 'B.B.Arch', '7.06', 'B BA', '5.56', 'Ritik', 'B.03', '5.61', '0', 'Plumber', 'BPA', 'Vivaan', 'MTech', '29', 'LLCom', 'Advait'], dtype='object', name='Degree')
train_df = train_df[~train_df["Degree"].isin(insignificant_degree_col_values)]
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Degree")
plt.title("Reduced a whole lot of dimensions")
Text(0.5, 1.0, 'Reduced a whole lot of dimensions')
Relationships with target
fig, [[ax1, ax2],[ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))
# plt.xticks(rotation=90)
sns.histplot(train_df, x="Age",kde=True, hue="Depression", ax=ax1)
sns.countplot(train_df, x="Family History of Mental Illness", hue="Depression", ax=ax2)
sns.histplot(train_df, x="Work/Study Pressure",multiple="fill", hue="Depression", ax=ax3)
sns.histplot(train_df, x="Job/Study Satisfaction",multiple="fill", hue="Depression", ax=ax4)
<Axes: xlabel='Job/Study Satisfaction', ylabel='Count'>
train_df.sample(5)
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
119395 | 119395 | Sanya | Male | 42.0 | Agra | Working Professional | Teacher | NaN | Less than 5 hours | Unhealthy | B.Ed | No | 10.0 | 4.0 | Yes | 0 | 1.0 | 2.0 |
85675 | 85675 | Ishaani | Female | 42.0 | Ghaziabad | Working Professional | Consultant | NaN | 7-8 hours | Moderate | BBA | No | 4.0 | 1.0 | No | 0 | 5.0 | 2.0 |
129275 | 129275 | Yuvraj | Male | 43.0 | Ahmedabad | Working Professional | Yuvraj | NaN | 7-8 hours | Moderate | MSc | No | 3.0 | 1.0 | No | 0 | 3.0 | 3.0 |
20132 | 20132 | Chirag | Male | 57.0 | Rajkot | Working Professional | Chef | NaN | 7-8 hours | Unhealthy | BHM | No | 7.0 | 4.0 | No | 0 | 4.0 | 3.0 |
6370 | 6370 | Raunak | Male | 21.0 | Kalyan | Working Professional | Teacher | NaN | 5-6 hours | Moderate | BCA | No | 12.0 | 2.0 | Yes | 0 | 4.0 | 3.0 |
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Sleep Duration")
plt.axhline(y=2000, c="red", ls="--")
<matplotlib.lines.Line2D at 0x7a6cb39dead0>
# Just removing the values that have insignificant frequencies
sleep_duration_val_counts = train_df["Sleep Duration"].value_counts()
insignificant_sleep_duration_col_values = sleep_duration_val_counts[sleep_duration_val_counts < 2000].keys()
insignificant_sleep_duration_col_values
Index(['3-4 hours', '6-7 hours', '4-5 hours', '2-3 hours', '4-6 hours', '6-8 hours', '1-6 hours', 'No', '9-11 hours', '10-11 hours', 'Sleep_Duration', 'Unhealthy', '45', '8-9 hours', '10-6 hours', '9-5', '45-48 hours', '3-6 hours', 'Work_Study_Hours', '49 hours', 'than 5 hours', 'Pune', '9-6 hours', '8 hours', '35-36 hours', 'Indore', '1-3 hours', '55-66 hours', 'Moderate', '40-45 hours', '1-2 hours', '9-5 hours'], dtype='object', name='Sleep Duration')
train_df = train_df[~train_df["Sleep Duration"].isin(insignificant_sleep_duration_col_values)]
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="Sleep Duration", hue="Depression", multiple="fill")
plt.title("Removed insignificant dimensions")
Text(0.5, 1.0, 'Removed insignificant dimensions')
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="City", hue="Depression", kde=True)
plt.axhline(y=250, c="red", ls="--")
<matplotlib.lines.Line2D at 0x7a6cb86fc880>
# Just removing the values that have insignificant frequencies
city_val_counts = train_df["City"].value_counts()
insignificant_city_col_values = city_val_counts[city_val_counts < 2000].keys()
insignificant_city_col_values
Index(['Mihir', 'Nandini', 'Saanvi', 'City', 'Pratyush', 'Harsha', 'Bhavna', 'Mahi', 'Vidya', 'MCA', 'Atharv', 'M.Com', 'Molkata', 'Nalini', 'Keshav', 'Ayush', 'Tushar', 'MSc', 'Parth', 'Chhavi', 'Vaishnavi', 'Kibara', 'No', 'Rashi', 'Kashish', 'ME', 'Itheg', 'Researcher', 'Kagan', 'Armaan', 'Ithal', 'Nalyan', 'Dhruv', 'Galesabad', 'Harsh', 'Aaradhya', 'Pooja', 'Khushi', 'Khaziabad', 'Reyansh', 'Plata', 'Gaurav', 'Vaanya', 'Ishanabad', 'Vidhi', 'Gurgaon', 'Krishna', 'Aishwarya', 'Aditya', 'Malyansh', 'Raghavendra', 'M.Tech', 'Less Delhi', '3.0', 'Less than 5 Kalyan', 'Mira', 'Moreadhyay', 'Morena', 'Ishkarsh', 'Kashk', 'Tolkata', 'Anvi', 'Krinda', 'Ayansh', 'Shrey', 'Ivaan', 'Jhanvi'], dtype='object', name='City')
train_df = train_df[~train_df["City"].isin(insignificant_city_col_values)] # 97 rows
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="City", hue="Depression", kde=True)
<Axes: xlabel='City', ylabel='Count'>
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="Profession", hue="Depression")
plt.axhline(y=100, c="red", ls="--")
<matplotlib.lines.Line2D at 0x7a6cb367cc40>
# Just removing the values that have insignificant frequencies
profession_val_counts = train_df["Profession"].value_counts()
insignificant_profession_col_values = profession_val_counts[profession_val_counts < 100].keys()
insignificant_profession_col_values
Index(['Student', 'Academic', 'Unemployed', 'Profession', 'Yogesh', 'BCA', 'MBA', 'LLM', 'PhD', 'Patna', 'Analyst', 'Pranav', 'Visakhapatnam', 'M.Ed', 'Moderate', 'Nagpur', 'B.Ed', 'City Manager', 'MBBS', 'Working Professional', 'Medical Doctor', 'BBA', 'FamilyVirar', 'Dev', 'BE', 'B.Com', 'Family Consultant', 'Yuvraj'], dtype='object', name='Profession')
train_df = train_df[~train_df["Profession"].isin(insignificant_profession_col_values)] # 49 rows
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="Profession", hue="Depression")
<Axes: xlabel='Profession', ylabel='Count'>
def assign_student_if_student(row):
    return "Student" if row["Working Professional or Student"] == "Student" else row["Profession"]
train_df["Profession"] = train_df.apply(assign_student_if_student, axis=1)
# Fill the remaining values as Unknown
train_df = train_df.fillna({
    "Profession": "Unknown"
})
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="Profession", hue="Depression")
<Axes: xlabel='Profession', ylabel='Count'>
train_df.sample(3)
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
119917 | 119917 | Ila | Female | 39.0 | Lucknow | Working Professional | Chef | NaN | More than 8 hours | Healthy | MHM | Yes | 2.0 | 2.0 | Yes | 0 | 2.0 | 4.0 |
72221 | 72221 | Ritika | Female | 28.0 | Nashik | Student | Student | 9.46 | Less than 5 hours | Healthy | MBBS | No | 11.0 | 4.0 | Yes | 1 | 2.0 | 4.0 |
119829 | 119829 | Ishwar | Male | 31.0 | Delhi | Working Professional | Lawyer | NaN | 7-8 hours | Unhealthy | LLB | Yes | 9.0 | 3.0 | Yes | 1 | 4.0 | 2.0 |
Preprocessing
y = train_df["Depression"]
X = train_df.drop(["id","Name","CGPA","Depression"], axis=1) # Dropping CGPA because I don't have time for it
X[:3]
Gender | Age | City | Working Professional or Student | Profession | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 49.0 | Ludhiana | Working Professional | Chef | More than 8 hours | Healthy | BHM | No | 1.0 | 2.0 | No | 5.0 | 2.0 |
1 | Male | 26.0 | Varanasi | Working Professional | Teacher | Less than 5 hours | Unhealthy | LLB | Yes | 7.0 | 3.0 | No | 4.0 | 3.0 |
2 | Male | 33.0 | Visakhapatnam | Student | Student | 5-6 hours | Healthy | B.Pharm | Yes | 3.0 | 1.0 | No | 5.0 | 2.0 |
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
X.columns
Index(['Gender', 'Age', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Work/Study Pressure', 'Job/Study Satisfaction'], dtype='object')
cat_columns = ["Gender","City","Working Professional or Student","Profession","Sleep Duration","Dietary Habits","Degree","Have you ever had suicidal thoughts ?","Family History of Mental Illness"]
num_columns = list(set(X.columns) - set(cat_columns))
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_columns),
    ("num", StandardScaler(), num_columns)
])
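An alternative to dropping rare rows by hand: newer scikit-learn versions can bucket infrequent categories inside the encoder itself. A sketch, assuming scikit-learn ≥ 1.1 for min_frequency (drop="first" is omitted here for simplicity):

```python
# Alternative sketch: group categories seen fewer than 2000 times into a
# single "infrequent" bucket instead of dropping their rows
preprocessor_alt = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore", min_frequency=2000), cat_columns),
    ("num", StandardScaler(), num_columns)
])
```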
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
preprocessor = preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)
/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:202: UserWarning: Found unknown categories in columns [5] during transform. These unknown categories will be encoded as all zeros warnings.warn(
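As a quick sanity check on the dimensionality claims above (get_feature_names_out assumes scikit-learn ≥ 1.0):

```python
# How wide did the design matrix become after one-hot encoding?
print(X_train.shape)                              # (rows, encoded feature count)
print(len(preprocessor.get_feature_names_out()))  # same count, with feature names
```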
Training
Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
# Evaluating
error_dfs = []

def train_and_generate_error(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    error_df = pd.DataFrame([], columns=["Error Name", "Error Value", "Error Range"])
    def generate_error_row(error_name: str, err_val: float, err_range: str):
        # err_range renamed from `range` to avoid shadowing the built-in
        error_df.loc[len(error_df)] = [error_name, err_val, err_range]
    generate_error_row("Accuracy", acc, "[0, 1]")
    generate_error_row("Precision", precision, "[0, 1]")
    generate_error_row("F1 score", f1, "[0, 1]")
    generate_error_row("Recall", recall, "[0, 1]")
    error_dfs.append(error_df)
    return model
error_dfs = []
reg_model = train_and_generate_error(LogisticRegression(max_iter=500))
# Heatmapping errors on different models
model_names = ["LogisticRegression"]
error_values = {model_names[i] : list(map(lambda err: err[1], mdl.values)) for i, mdl in enumerate(error_dfs)}
model_error_df = pd.DataFrame(error_values, index=["Accuracy","Precision","F1 score","Recall"]).T
sns.heatmap(model_error_df, annot=True)
<Axes: >
TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
nn_model = tf.keras.Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
nn_model.summary()
Model: "sequential"
Layer (type) | Output Shape | Param #
---|---|---
dense (Dense) | (None, 32) | 3,968
dropout (Dropout) | (None, 32) | 0
dense_1 (Dense) | (None, 64) | 2,112
dropout_1 (Dropout) | (None, 64) | 0
dense_2 (Dense) | (None, 256) | 16,640
dropout_2 (Dropout) | (None, 256) | 0
dense_3 (Dense) | (None, 32) | 8,224
dense_4 (Dense) | (None, 1) | 33
Total params: 30,977 (121.00 KB)
Trainable params: 30,977 (121.00 KB)
Non-trainable params: 0 (0.00 B)
nn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", "precision", "r2_score", "recall"]
)
# nn_model.fit(X_train.toarray(), y_train, epochs=60, batch_size=128, validation_data=(X_test.toarray(), y_test))
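If the commented-out fit above were run, a hedged variant with early stopping would look like this (the callback and patience value are assumptions, not part of the original run; .toarray() is needed because the preprocessor outputs a sparse matrix):

```python
# Sketch: train with early stopping on the held-out split
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True)
history = nn_model.fit(X_train.toarray(), y_train,
                       epochs=60, batch_size=128,
                       validation_data=(X_test.toarray(), y_test),
                       callbacks=[early_stop])
```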
test_df = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
test_df.head(3)
id | Name | Gender | Age | City | Working Professional or Student | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 140700 | Shivam | Male | 53.0 | Visakhapatnam | Working Professional | Judge | NaN | 2.0 | NaN | NaN | 5.0 | Less than 5 hours | Moderate | LLB | No | 9.0 | 3.0 | Yes |
1 | 140701 | Sanya | Female | 58.0 | Kolkata | Working Professional | Educational Consultant | NaN | 2.0 | NaN | NaN | 4.0 | Less than 5 hours | Moderate | B.Ed | No | 6.0 | 4.0 | No |
2 | 140702 | Yash | Male | 53.0 | Jaipur | Working Professional | Teacher | NaN | 4.0 | NaN | NaN | 1.0 | 7-8 hours | Moderate | B.Arch | Yes | 12.0 | 4.0 | No |
pressure_col = test_df.apply(merge_pressure, axis=1)
pressure_col = pd.Series(pressure_col, name="Work/Study Pressure")
test_df = test_df.drop(["Academic Pressure","Work Pressure"], axis=1)
test_df = test_df.join(pressure_col)
satisfaction_col = test_df.apply(merge_satisfaction, axis=1)
satisfaction_col = pd.Series(satisfaction_col, name="Job/Study Satisfaction")
test_df = test_df.drop(["Study Satisfaction","Job Satisfaction"], axis=1)
test_df = test_df.join(satisfaction_col)
X = test_df.drop(["id","Name","CGPA"], axis=1)
X = preprocessor.transform(X)
/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:202: UserWarning: Found unknown categories in columns [1, 3, 4, 5, 6] during transform. These unknown categories will be encoded as all zeros warnings.warn(
# y_pred = nn_model.predict(X.toarray())
# final_output = pd.DataFrame({
#     "id": test_df["id"],
#     "Depression": (y_pred > 0.5).astype(int).flatten()
# })
# final_output.to_csv("nn_model_final.csv", index=False)
CatBoost
y = train_df["Depression"]
X = train_df.drop(["id","Name","CGPA","Depression"], axis=1) # Dropping CGPA because I don't have time for it
for col in X.columns:
    # Get the unique values from the column, excluding NaN
    unique_vals = X[col].dropna().unique()
    # For columns with NaNs, replace them with random choices from the unique values
    if len(unique_vals) > 0:
        nan_indices = X[col].isna()  # Boolean mask of the NaN positions
        X.loc[nan_indices, col] = np.random.choice(unique_vals, size=nan_indices.sum())
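This random imputation is not reproducible without seeding np.random, and it injects noise into the features. A hedged alternative sketch: give the categorical columns a constant missing bucket and leave numeric NaNs alone, since CatBoost handles missing numeric values natively:

```python
# Alternative sketch: constant "Unknown" bucket for categoricals,
# numeric NaNs left as-is for CatBoost's native missing-value handling
for col in cat_columns:
    X[col] = X[col].fillna("Unknown")
```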
X[cat_columns] = X[cat_columns].apply(lambda col: col.astype('category'))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from catboost import CatBoostClassifier, Pool
cb_model = CatBoostClassifier(
    iterations=1000,
    depth=10,
    learning_rate=0.05,
    loss_function='Logloss',
    cat_features=cat_columns,
    verbose=200
)
cb_model.fit(X_train, y_train)
0:	learn: 0.6021241	total: 265ms	remaining: 4m 24s
200:	learn: 0.1247284	total: 34.4s	remaining: 2m 16s
400:	learn: 0.1038873	total: 1m 11s	remaining: 1m 46s
600:	learn: 0.0871013	total: 1m 45s	remaining: 1m 10s
800:	learn: 0.0734592	total: 2m 20s	remaining: 35s
999:	learn: 0.0613709	total: 2m 54s	remaining: 0us
<catboost.core.CatBoostClassifier at 0x7a6c5f6fb130>
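The learn loss is still falling at iteration 999 while held-out accuracy (below) sits around 94%, which hints at overfitting; a sketch of the same fit with an eval set and early stopping (the rounds value is an assumption):

```python
# Sketch: monitor a held-out set and stop when it stops improving
cb_model.fit(X_train, y_train,
             eval_set=(X_test, y_test),
             early_stopping_rounds=50,
             verbose=200)
```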
from sklearn.metrics import classification_report
y_pred = cb_model.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     22933
           1       0.85      0.81      0.83      5131

    accuracy                           0.94     28064
   macro avg       0.90      0.89      0.90     28064
weighted avg       0.94      0.94      0.94     28064
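Since the models are compared on accuracy, the default 0.5 decision threshold is not necessarily optimal; a quick sweep over predict_proba (a sketch) can check:

```python
# Sketch: sweep the decision threshold on the held-out set
proba = cb_model.predict_proba(X_test)[:, 1]
for t in [0.40, 0.45, 0.50, 0.55, 0.60]:
    acc = accuracy_score(y_test, (proba > t).astype(int))
    print(f"threshold {t:.2f}: accuracy {acc:.4f}")
```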
test_df = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
pressure_col = test_df.apply(merge_pressure, axis=1)
pressure_col = pd.Series(pressure_col, name="Work/Study Pressure")
test_df = test_df.drop(["Academic Pressure","Work Pressure"], axis=1)
test_df = test_df.join(pressure_col)
satisfaction_col = test_df.apply(merge_satisfaction, axis=1)
satisfaction_col = pd.Series(satisfaction_col, name="Job/Study Satisfaction")
test_df = test_df.drop(["Study Satisfaction","Job Satisfaction"], axis=1)
test_df = test_df.join(satisfaction_col)
X = test_df.drop(["id","Name","CGPA"], axis=1) # Dropping CGPA because I don't have time for it
for col in X.columns:
    # Get the unique values from the *training* column, excluding NaN
    unique_vals = train_df[col].dropna().unique()
    # For columns with NaNs, replace them with random choices from the training values
    if len(unique_vals) > 0:
        nan_indices = X[col].isna()  # Boolean mask of the NaN positions
        X.loc[nan_indices, col] = np.random.choice(unique_vals, size=nan_indices.sum())
cb_final_pred = cb_model.predict(X)
final_output = pd.DataFrame({
    "id": test_df["id"],
    "Depression": cb_final_pred,
})
final_output.to_csv("final_cb_model_output.csv", index=False)
Conclusion
Reviewing all the models and their results, the TensorFlow model performed the best (~94% accuracy), followed by the Logistic Regression model (~93.9%), and finally the CatBoost model (~93.3%).