# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import warnings # To suppress some warnings
# Suppress the specific FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
/kaggle/input/playground-series-s4e11/sample_submission.csv
/kaggle/input/playground-series-s4e11/train.csv
/kaggle/input/playground-series-s4e11/test.csv
Introduction
Time taken ⏱️: ~5 hours
This notebook showcases extensive EDA (Exploratory Data Analysis), feature engineering, data cleaning, and predictions using three different models (Logistic Regression, a neural network with TensorFlow, and CatBoost classification).
The dataset for this competition (both train and test) was generated from a deep learning model trained on the Depression Survey/Dataset for Analysis dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.
Exploratory Data Analysis
train_df = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv")
train_df.sample(2).T
138815 | 12651 | |
---|---|---|
id | 138815 | 12651 |
Name | Abhishek | Abhishek |
Gender | Male | Male |
Age | 20.0 | 33.0 |
City | Hyderabad | Kolkata |
Working Professional or Student | Student | Student |
Profession | NaN | NaN |
Academic Pressure | 2.0 | 5.0 |
Work Pressure | NaN | NaN |
CGPA | 7.25 | 7.92 |
Study Satisfaction | 5.0 | 3.0 |
Job Satisfaction | NaN | NaN |
Sleep Duration | 5-6 hours | 5-6 hours |
Dietary Habits | Healthy | Moderate |
Degree | Class 12 | MSc |
Have you ever had suicidal thoughts ? | Yes | Yes |
Work/Study Hours | 6.0 | 11.0 |
Financial Stress | 1.0 | 5.0 |
Family History of Mental Illness | Yes | No |
Depression | 0 | 1 |
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype
---  ------                                 --------------   -----
 0   id                                     140700 non-null  int64
 1   Name                                   140700 non-null  object
 2   Gender                                 140700 non-null  object
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object
 5   Working Professional or Student        140700 non-null  object
 6   Profession                             104070 non-null  object
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   float64
 11  Job Satisfaction                       112790 non-null  float64
 12  Sleep Duration                         140700 non-null  object
 13  Dietary Habits                         140696 non-null  object
 14  Degree                                 140698 non-null  object
 15  Have you ever had suicidal thoughts ?  140700 non-null  object
 16  Work/Study Hours                       140700 non-null  float64
 17  Financial Stress                       140696 non-null  float64
 18  Family History of Mental Illness       140700 non-null  object
 19  Depression                             140700 non-null  int64
dtypes: float64(8), int64(2), object(10)
memory usage: 21.5+ MB
train_df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
id | 140700.0 | 70349.500000 | 40616.735775 | 0.00 | 35174.75 | 70349.50 | 105524.25 | 140699.0 |
Age | 140700.0 | 40.388621 | 12.384099 | 18.00 | 29.00 | 42.00 | 51.00 | 60.0 |
Academic Pressure | 27897.0 | 3.142273 | 1.380457 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
Work Pressure | 112782.0 | 2.998998 | 1.405771 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
CGPA | 27898.0 | 7.658636 | 1.464466 | 5.03 | 6.29 | 7.77 | 8.92 | 10.0 |
Study Satisfaction | 27897.0 | 2.944940 | 1.360197 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
Job Satisfaction | 112790.0 | 2.974404 | 1.416078 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
Work/Study Hours | 140700.0 | 6.252679 | 3.853615 | 0.00 | 3.00 | 6.00 | 10.00 | 12.0 |
Financial Stress | 140696.0 | 2.988983 | 1.413633 | 1.00 | 2.00 | 3.00 | 4.00 | 5.0 |
Depression | 140700.0 | 0.181713 | 0.385609 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 |
train_df.isna().sum()
id                                            0
Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64
Distribution of different categories
fig, [[ax1, ax2],[ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Gender", ax=ax1)
sns.countplot(train_df, x="Working Professional or Student", ax=ax2)
sns.countplot(train_df, x="Have you ever had suicidal thoughts ?", ax=ax3)
sns.countplot(train_df, x="Family History of Mental Illness", ax=ax4)
<Axes: xlabel='Family History of Mental Illness', ylabel='count'>
Huge dimensions from the City and Profession columns
The categorical columns will be one-hot encoded later, hence the unique values are referred to as dimensions.
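A quick sketch to gauge that blow-up before encoding (nothing beyond `nunique` is assumed here; each unique value roughly becomes one dummy column):

```python
# Rough gauge of the one-hot blow-up per categorical feature
for col in ["City", "Profession", "Degree", "Sleep Duration"]:
    print(f"{col}: {train_df[col].nunique()} unique values")
```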
fig, ax = plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="City")
<Axes: xlabel='City', ylabel='count'>
fig, ax = plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Profession")
<Axes: xlabel='Profession', ylabel='count'>
fig, [[ax1, ax2],[ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))
plt.xticks(rotation=90)
sns.countplot(train_df.fillna("missing"), x="Academic Pressure", ax=ax1)
sns.countplot(train_df.fillna("missing"), x="Work Pressure", ax=ax2)
sns.countplot(train_df.fillna("missing"), x="Study Satisfaction", ax=ax3)
sns.countplot(train_df.fillna("missing"), x="Job Satisfaction", ax=ax4)
<Axes: xlabel='Job Satisfaction', ylabel='count'>
Why are there so many missing values for academic and work pressure?
# Students with work pressure
train_df[(train_df["Work Pressure"].notna()) & (train_df["Working Professional or Student"] == "Student")]
id | Name | Gender | Age | City | Working Professional or Student | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
21880 | 21880 | Aarush | Male | 38.0 | Chennai | Student | NaN | NaN | 5.0 | NaN | NaN | 4.0 | 5-6 hours | Healthy | Class 12 | No | 2.0 | 3.0 | No | 0 |
75007 | 75007 | Aarav | Male | 21.0 | Lucknow | Student | NaN | NaN | 2.0 | NaN | NaN | 1.0 | 7-8 hours | Moderate | Class 12 | Yes | 3.0 | 3.0 | Yes | 0 |
129756 | 129756 | Kian | Male | 18.0 | Rajkot | Student | NaN | NaN | 5.0 | NaN | NaN | 4.0 | 7-8 hours | Moderate | Class 12 | Yes | 9.0 | 4.0 | No | 1 |
# Workers with academic pressure
train_df[(train_df["Academic Pressure"].notna()) & (train_df["Working Professional or Student"] == "Working Professional")]
id | Name | Gender | Age | City | Working Professional or Student | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18752 | 18752 | Aarti | Female | 18.0 | Kalyan | Working Professional | NaN | 2.0 | NaN | 8.14 | 5.0 | NaN | Less than 5 hours | Healthy | Class 12 | Yes | 7.0 | 1.0 | Yes | 1 |
41915 | 41915 | Advait | Male | 18.0 | Kolkata | Working Professional | NaN | 4.0 | NaN | 8.04 | 3.0 | NaN | 5-6 hours | Healthy | Class 12 | Yes | 1.0 | 5.0 | Yes | 1 |
55827 | 55827 | Prachi | Female | 19.0 | Kalyan | Working Professional | NaN | 3.0 | NaN | 8.11 | 2.0 | NaN | 5-6 hours | Unhealthy | Class 12 | Yes | 4.0 | 4.0 | No | 1 |
99062 | 99062 | Tanisha | Female | 24.0 | Surat | Working Professional | Content Writer | 4.0 | NaN | 5.42 | 4.0 | NaN | Less than 5 hours | Moderate | B.Ed | Yes | 1.0 | 5.0 | No | 1 |
101189 | 101189 | Keshav | Male | 34.0 | Rajkot | Working Professional | NaN | 4.0 | NaN | 8.24 | 3.0 | NaN | More than 8 hours | Moderate | MD | No | 11.0 | 1.0 | No | 0 |
Conclusion
There is a negligible number of students with work pressure and a negligible number of working professionals with academic pressure, so I am going to merge the two columns into a single Work/Study Pressure column.
# If the row is a student, academic pressure takes precedence
# Else, work pressure takes precedence
# If both are missing, the result stays NaN
def merge_pressure(row):
    return row["Academic Pressure"] if row["Working Professional or Student"] == "Student" else row["Work Pressure"]
pressure_col = train_df.apply(merge_pressure, axis=1)
pressure_col = pd.Series(pressure_col, name="Work/Study Pressure")
train_df = train_df.drop(["Academic Pressure","Work Pressure"], axis=1)
train_df = train_df.join(pressure_col)
train_df["Work/Study Pressure"].isna().sum() # The remaining can be dropped since the dataset is huge
29
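Side note: the row-wise apply works but is slow on ~140k rows. A vectorized sketch with np.where (not what the notebook runs, and it would have to execute before the two source columns are dropped) does the same selection, with NaN propagating naturally:

```python
# Vectorized equivalent of merge_pressure: pick per row based on the role
is_student = train_df["Working Professional or Student"] == "Student"
pressure = np.where(is_student,
                    train_df["Academic Pressure"],
                    train_df["Work Pressure"])
```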
Same thing for study and job satisfaction
# Students with job satisfaction
train_df[(train_df["Job Satisfaction"].notna()) & (train_df["Working Professional or Student"] == "Student")]
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1428 | 1428 | Rishi | Male | 29.0 | Srinagar | Student | NaN | 9.63 | 3.0 | 3.0 | 7-8 hours | Moderate | M.Pharm | No | 8.0 | 5.0 | No | 0 | 3.0 |
21880 | 21880 | Aarush | Male | 38.0 | Chennai | Student | NaN | NaN | NaN | 4.0 | 5-6 hours | Healthy | Class 12 | No | 2.0 | 3.0 | No | 0 | NaN |
70453 | 70453 | Veda | Female | 20.0 | Ahmedabad | Student | NaN | NaN | NaN | 2.0 | Less than 5 hours | Moderate | Class 12 | Yes | 12.0 | 3.0 | Yes | 1 | NaN |
75007 | 75007 | Aarav | Male | 21.0 | Lucknow | Student | NaN | NaN | NaN | 1.0 | 7-8 hours | Moderate | Class 12 | Yes | 3.0 | 3.0 | Yes | 0 | NaN |
105773 | 105773 | Anand | Male | 18.0 | Ahmedabad | Student | NaN | NaN | NaN | 1.0 | Less than 5 hours | Moderate | Class 12 | Yes | 9.0 | 5.0 | No | 1 | NaN |
116703 | 116703 | Neil | Male | 25.0 | Bangalore | Student | NaN | 9.44 | 5.0 | 2.0 | Less than 5 hours | Unhealthy | PhD | Yes | 4.0 | 1.0 | No | 1 | 4.0 |
129756 | 129756 | Kian | Male | 18.0 | Rajkot | Student | NaN | NaN | NaN | 4.0 | 7-8 hours | Moderate | Class 12 | Yes | 9.0 | 4.0 | No | 1 | NaN |
134830 | 134830 | Aaradhya | Female | 24.0 | Meerut | Student | NaN | NaN | NaN | 2.0 | More than 8 hours | Unhealthy | Class 12 | No | 0.0 | 5.0 | No | 0 | NaN |
# Workers with study satisfaction
train_df[(train_df["Study Satisfaction"].notna()) & (train_df["Working Professional or Student"] == "Working Professional")]
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18752 | 18752 | Aarti | Female | 18.0 | Kalyan | Working Professional | NaN | 8.14 | 5.0 | NaN | Less than 5 hours | Healthy | Class 12 | Yes | 7.0 | 1.0 | Yes | 1 | NaN |
41915 | 41915 | Advait | Male | 18.0 | Kolkata | Working Professional | NaN | 8.04 | 3.0 | NaN | 5-6 hours | Healthy | Class 12 | Yes | 1.0 | 5.0 | Yes | 1 | NaN |
55827 | 55827 | Prachi | Female | 19.0 | Kalyan | Working Professional | NaN | 8.11 | 2.0 | NaN | 5-6 hours | Unhealthy | Class 12 | Yes | 4.0 | 4.0 | No | 1 | NaN |
60348 | 60348 | Nishant | Male | 29.0 | Agra | Working Professional | NaN | 9.72 | 2.0 | NaN | 5-6 hours | Moderate | B.Ed | No | 7.0 | 2.0 | Yes | 0 | NaN |
99062 | 99062 | Tanisha | Female | 24.0 | Surat | Working Professional | Content Writer | 5.42 | 4.0 | NaN | Less than 5 hours | Moderate | B.Ed | Yes | 1.0 | 5.0 | No | 1 | NaN |
101189 | 101189 | Keshav | Male | 34.0 | Rajkot | Working Professional | NaN | 8.24 | 3.0 | NaN | More than 8 hours | Moderate | MD | No | 11.0 | 1.0 | No | 0 | NaN |
# If the row is a student, Study Satisfaction takes precedence
# Else, Job Satisfaction takes precedence
# If both are missing, the result stays NaN
def merge_satisfaction(row):
    return row["Study Satisfaction"] if row["Working Professional or Student"] == "Student" else row["Job Satisfaction"]
satisfaction_col = train_df.apply(merge_satisfaction, axis=1)
satisfaction_col = pd.Series(satisfaction_col, name="Job/Study Satisfaction")
train_df = train_df.drop(["Study Satisfaction","Job Satisfaction"], axis=1)
train_df = train_df.join(satisfaction_col)
train_df["Job/Study Satisfaction"].isna().sum() # The remaining can be dropped since the dataset is huge
27
Dropping rows with NA values for Work/Study Pressure, Job/Study Satisfaction, and Financial Stress, due to their insignificant counts
train_df = train_df[train_df["Work/Study Pressure"].notna()]
train_df = train_df[train_df["Job/Study Satisfaction"].notna()]
train_df = train_df[train_df["Financial Stress"].notna()]
train_df.sample(2)
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
81178 | 81178 | Nandini | Female | 24.0 | Faridabad | Working Professional | Data Scientist | NaN | More than 8 hours | Moderate | B.Tech | No | 4.0 | 3.0 | No | 0 | 3.0 | 2.0 |
10710 | 10710 | Vibha | Female | 59.0 | Patna | Working Professional | Content Writer | NaN | Less than 5 hours | Moderate | B.Tech | Yes | 3.0 | 4.0 | Yes | 0 | 2.0 | 1.0 |
Why drop 114 rows of data with rare Degree values?
train_df
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | Aaradhya | Female | 49.0 | Ludhiana | Working Professional | Chef | NaN | More than 8 hours | Healthy | BHM | No | 1.0 | 2.0 | No | 0 | 5.0 | 2.0 |
1 | 1 | Vivan | Male | 26.0 | Varanasi | Working Professional | Teacher | NaN | Less than 5 hours | Unhealthy | LLB | Yes | 7.0 | 3.0 | No | 1 | 4.0 | 3.0 |
2 | 2 | Yuvraj | Male | 33.0 | Visakhapatnam | Student | NaN | 8.97 | 5-6 hours | Healthy | B.Pharm | Yes | 3.0 | 1.0 | No | 1 | 5.0 | 2.0 |
3 | 3 | Yuvraj | Male | 22.0 | Mumbai | Working Professional | Teacher | NaN | Less than 5 hours | Moderate | BBA | Yes | 10.0 | 1.0 | Yes | 1 | 5.0 | 1.0 |
4 | 4 | Rhea | Female | 30.0 | Kanpur | Working Professional | Business Analyst | NaN | 5-6 hours | Unhealthy | BBA | Yes | 9.0 | 4.0 | Yes | 0 | 1.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
140695 | 140695 | Vidya | Female | 18.0 | Ahmedabad | Working Professional | NaN | NaN | 5-6 hours | Unhealthy | Class 12 | No | 2.0 | 4.0 | Yes | 1 | 5.0 | 4.0 |
140696 | 140696 | Lata | Female | 41.0 | Hyderabad | Working Professional | Content Writer | NaN | 7-8 hours | Moderate | B.Tech | Yes | 6.0 | 5.0 | Yes | 0 | 5.0 | 4.0 |
140697 | 140697 | Aanchal | Female | 24.0 | Kolkata | Working Professional | Marketing Manager | NaN | More than 8 hours | Moderate | B.Com | No | 4.0 | 4.0 | No | 0 | 3.0 | 1.0 |
140698 | 140698 | Prachi | Female | 49.0 | Srinagar | Working Professional | Plumber | NaN | 5-6 hours | Moderate | ME | Yes | 10.0 | 1.0 | No | 0 | 5.0 | 2.0 |
140699 | 140699 | Sai | Male | 27.0 | Patna | Student | NaN | 9.24 | Less than 5 hours | Healthy | BCA | Yes | 2.0 | 3.0 | Yes | 1 | 4.0 | 1.0 |
140659 rows × 18 columns
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Degree")
plt.axhline(y=2000, c="red", ls="--")
<matplotlib.lines.Line2D at 0x7a6cb39b11b0>
(s := train_df["Degree"].value_counts())[s < 2000].sum()
114
Conclusion
We are not going to use LabelEncoder for this column, since there is no ordinality between the categories. Instead, we are going to use OneHotEncoder to create dummy columns. However, doing so would introduce many dimensions, making model training sub-optimal. Hence, we are going to remove the 114 rows whose Degree values have insignificant frequencies (<2000, indicated by the red dashed line), reducing the dimensions as a result.
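The same prune-below-threshold pattern is repeated below for Sleep Duration, City, and Profession; a small helper like this (hypothetical, not part of the original notebook) captures it:

```python
# Hypothetical helper for the recurring prune-below-threshold pattern
def drop_rare_categories(df, col, min_count):
    """Drop rows whose value in `col` appears fewer than `min_count` times."""
    counts = df[col].value_counts()
    rare = counts[counts < min_count].index
    return df[~df[col].isin(rare)]

# e.g. train_df = drop_rare_categories(train_df, "Degree", 2000)
```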
degree_val_counts = train_df["Degree"].value_counts()
insignificant_degree_col_values = degree_val_counts[degree_val_counts < 2000].keys()
insignificant_degree_col_values
Index(['M.Arch', 'UX/UI Designer', 'B.Sc', 'Kalyan', 'M', 'BArch', 'MEd', 'BPharm', 'P.Com', 'Jhanvi', 'LLBA', 'Degree', 'B', 'Bhopal', 'BEd', 'Nalini', 'LL B.Ed', 'L.Ed', '5.88', 'HCA', 'Marsh', 'S.Arch', 'Pihu', 'Lata', 'LHM', '8.56', 'Entrepreneur', 'Aarav', 'LLTech', 'BB', 'M_Tech', 'B.Student', 'E.Tech', 'M.S', 'Navya', 'Mihir', 'RCA', 'B B.Com', 'LCA', 'N.Pharm', 'Doctor', 'CGPA', 'LLEd', 'LLS', 'Esha', 'Working Professional', 'Mthanya', 'B.3.79', 'K.Ed', 'Mahika', '24', 'Vrinda', 'Brithika', 'ACA', 'Badhya', 'HR Manager', 'Unite', 'P.Pharm', 'MPharm', 'Data Scientist', 'LL.Com', 'Business Analyst', 'H_Pharm', 'Class 11', '20', 'S.Tech', 'Veda', 'BH', 'MPA', 'S.Pharm', 'M. Business Analyst', 'Bhavesh', 'Brit', 'B.B.Arch', '7.06', 'B BA', '5.56', 'Ritik', 'B.03', '5.61', '0', 'Plumber', 'BPA', 'Vivaan', 'MTech', '29', 'LLCom', 'Advait'], dtype='object', name='Degree')
train_df = train_df[~train_df["Degree"].isin(insignificant_degree_col_values)]
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Degree")
plt.title("Reduced a whole lot of dimensions")
Text(0.5, 1.0, 'Reduced a whole lot of dimensions')
Relationships with target
fig, [[ax1, ax2],[ax3, ax4]] = plt.subplots(2,2, figsize=(10,10))
# plt.xticks(rotation=90)
sns.histplot(train_df, x="Age",kde=True, hue="Depression", ax=ax1)
sns.countplot(train_df, x="Family History of Mental Illness", hue="Depression", ax=ax2)
sns.histplot(train_df, x="Work/Study Pressure",multiple="fill", hue="Depression", ax=ax3)
sns.histplot(train_df, x="Job/Study Satisfaction",multiple="fill", hue="Depression", ax=ax4)
<Axes: xlabel='Job/Study Satisfaction', ylabel='Count'>
train_df.sample(5)
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
119395 | 119395 | Sanya | Male | 42.0 | Agra | Working Professional | Teacher | NaN | Less than 5 hours | Unhealthy | B.Ed | No | 10.0 | 4.0 | Yes | 0 | 1.0 | 2.0 |
85675 | 85675 | Ishaani | Female | 42.0 | Ghaziabad | Working Professional | Consultant | NaN | 7-8 hours | Moderate | BBA | No | 4.0 | 1.0 | No | 0 | 5.0 | 2.0 |
129275 | 129275 | Yuvraj | Male | 43.0 | Ahmedabad | Working Professional | Yuvraj | NaN | 7-8 hours | Moderate | MSc | No | 3.0 | 1.0 | No | 0 | 3.0 | 3.0 |
20132 | 20132 | Chirag | Male | 57.0 | Rajkot | Working Professional | Chef | NaN | 7-8 hours | Unhealthy | BHM | No | 7.0 | 4.0 | No | 0 | 4.0 | 3.0 |
6370 | 6370 | Raunak | Male | 21.0 | Kalyan | Working Professional | Teacher | NaN | 5-6 hours | Moderate | BCA | No | 12.0 | 2.0 | Yes | 0 | 4.0 | 3.0 |
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.countplot(train_df, x="Sleep Duration")
plt.axhline(y=2000, c="red", ls="--")
<matplotlib.lines.Line2D at 0x7a6cb39dead0>
# Just removing the values that have insignificant frequencies
sleep_duration_val_counts = train_df["Sleep Duration"].value_counts()
insignificant_sleep_duration_col_values = sleep_duration_val_counts[sleep_duration_val_counts < 2000].keys()
insignificant_sleep_duration_col_values
Index(['3-4 hours', '6-7 hours', '4-5 hours', '2-3 hours', '4-6 hours', '6-8 hours', '1-6 hours', 'No', '9-11 hours', '10-11 hours', 'Sleep_Duration', 'Unhealthy', '45', '8-9 hours', '10-6 hours', '9-5', '45-48 hours', '3-6 hours', 'Work_Study_Hours', '49 hours', 'than 5 hours', 'Pune', '9-6 hours', '8 hours', '35-36 hours', 'Indore', '1-3 hours', '55-66 hours', 'Moderate', '40-45 hours', '1-2 hours', '9-5 hours'], dtype='object', name='Sleep Duration')
train_df = train_df[~train_df["Sleep Duration"].isin(insignificant_sleep_duration_col_values)]
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="Sleep Duration", hue="Depression", multiple="fill")
plt.title("Removed insignificant dimensions")
Text(0.5, 1.0, 'Removed insignificant dimensions')
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="City", hue="Depression", kde=True)
plt.axhline(y=250, c="red", ls="--")
<matplotlib.lines.Line2D at 0x7a6cb86fc880>
# Just removing the values that have insignificant frequencies
city_val_counts = train_df["City"].value_counts()
insignificant_city_col_values = city_val_counts[city_val_counts < 2000].keys()
insignificant_city_col_values
Index(['Mihir', 'Nandini', 'Saanvi', 'City', 'Pratyush', 'Harsha', 'Bhavna', 'Mahi', 'Vidya', 'MCA', 'Atharv', 'M.Com', 'Molkata', 'Nalini', 'Keshav', 'Ayush', 'Tushar', 'MSc', 'Parth', 'Chhavi', 'Vaishnavi', 'Kibara', 'No', 'Rashi', 'Kashish', 'ME', 'Itheg', 'Researcher', 'Kagan', 'Armaan', 'Ithal', 'Nalyan', 'Dhruv', 'Galesabad', 'Harsh', 'Aaradhya', 'Pooja', 'Khushi', 'Khaziabad', 'Reyansh', 'Plata', 'Gaurav', 'Vaanya', 'Ishanabad', 'Vidhi', 'Gurgaon', 'Krishna', 'Aishwarya', 'Aditya', 'Malyansh', 'Raghavendra', 'M.Tech', 'Less Delhi', '3.0', 'Less than 5 Kalyan', 'Mira', 'Moreadhyay', 'Morena', 'Ishkarsh', 'Kashk', 'Tolkata', 'Anvi', 'Krinda', 'Ayansh', 'Shrey', 'Ivaan', 'Jhanvi'], dtype='object', name='City')
train_df = train_df[~train_df["City"].isin(insignificant_city_col_values)] # 97 rows
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="City", hue="Depression", kde=True)
<Axes: xlabel='City', ylabel='Count'>
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="Profession", hue="Depression")
plt.axhline(y=100, c="red", ls="--")
<matplotlib.lines.Line2D at 0x7a6cb367cc40>
# Just removing the values that have insignificant frequencies
profession_val_counts = train_df["Profession"].value_counts()
insignificant_profession_col_values = profession_val_counts[profession_val_counts < 100].keys()
insignificant_profession_col_values
Index(['Student', 'Academic', 'Unemployed', 'Profession', 'Yogesh', 'BCA', 'MBA', 'LLM', 'PhD', 'Patna', 'Analyst', 'Pranav', 'Visakhapatnam', 'M.Ed', 'Moderate', 'Nagpur', 'B.Ed', 'City Manager', 'MBBS', 'Working Professional', 'Medical Doctor', 'BBA', 'FamilyVirar', 'Dev', 'BE', 'B.Com', 'Family Consultant', 'Yuvraj'], dtype='object', name='Profession')
train_df = train_df[~train_df["Profession"].isin(insignificant_profession_col_values)] # 49 rows
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="Profession", hue="Depression")
<Axes: xlabel='Profession', ylabel='Count'>
def assign_student_if_student(row):
    return "Student" if row["Working Professional or Student"] == "Student" else row["Profession"]
train_df["Profession"] = train_df.apply(assign_student_if_student, axis=1)
# Fill the remaining values as Unknown
train_df = train_df.fillna({
    "Profession": "Unknown"
})
plt.subplots(1,1, figsize=(20,5))
plt.xticks(rotation=90)
sns.histplot(train_df, x="Profession", hue="Depression")
<Axes: xlabel='Profession', ylabel='Count'>
train_df.sample(3)
id | Name | Gender | Age | City | Working Professional or Student | Profession | CGPA | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Depression | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
119917 | 119917 | Ila | Female | 39.0 | Lucknow | Working Professional | Chef | NaN | More than 8 hours | Healthy | MHM | Yes | 2.0 | 2.0 | Yes | 0 | 2.0 | 4.0 |
72221 | 72221 | Ritika | Female | 28.0 | Nashik | Student | Student | 9.46 | Less than 5 hours | Healthy | MBBS | No | 11.0 | 4.0 | Yes | 1 | 2.0 | 4.0 |
119829 | 119829 | Ishwar | Male | 31.0 | Delhi | Working Professional | Lawyer | NaN | 7-8 hours | Unhealthy | LLB | Yes | 9.0 | 3.0 | Yes | 1 | 4.0 | 2.0 |
Preprocessing
y = train_df["Depression"]
X = train_df.drop(["id","Name","CGPA","Depression"], axis=1) # Dropping CGPA because I don't have time for it
X[:3]
Gender | Age | City | Working Professional or Student | Profession | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | Work/Study Pressure | Job/Study Satisfaction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Female | 49.0 | Ludhiana | Working Professional | Chef | More than 8 hours | Healthy | BHM | No | 1.0 | 2.0 | No | 5.0 | 2.0 |
1 | Male | 26.0 | Varanasi | Working Professional | Teacher | Less than 5 hours | Unhealthy | LLB | Yes | 7.0 | 3.0 | No | 4.0 | 3.0 |
2 | Male | 33.0 | Visakhapatnam | Student | Student | 5-6 hours | Healthy | B.Pharm | Yes | 3.0 | 1.0 | No | 5.0 | 2.0 |
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
X.columns
Index(['Gender', 'Age', 'City', 'Working Professional or Student', 'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'Work/Study Pressure', 'Job/Study Satisfaction'], dtype='object')
cat_columns = ["Gender","City","Working Professional or Student","Profession","Sleep Duration","Dietary Habits","Degree","Have you ever had suicidal thoughts ?","Family History of Mental Illness"]
num_columns = list(set(X.columns) - set(cat_columns))
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_columns),
    ("num", StandardScaler(), num_columns)
])
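An alternative to dropping rare rows by hand: newer scikit-learn versions can bucket infrequent categories inside the encoder itself. A sketch, assuming scikit-learn ≥ 1.1 for min_frequency (drop="first" is omitted here for simplicity):

```python
# Alternative sketch: group categories seen fewer than 2000 times into a
# single "infrequent" bucket instead of dropping their rows
preprocessor_alt = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore", min_frequency=2000), cat_columns),
    ("num", StandardScaler(), num_columns)
])
```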
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
preprocessor = preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)
/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:202: UserWarning: Found unknown categories in columns [5] during transform. These unknown categories will be encoded as all zeros warnings.warn(
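As a quick sanity check on the dimensionality claims above (get_feature_names_out assumes scikit-learn ≥ 1.0):

```python
# How wide did the design matrix become after one-hot encoding?
print(X_train.shape)                              # (rows, encoded feature count)
print(len(preprocessor.get_feature_names_out()))  # same count, with feature names
```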
Training
Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix
# Evaluating
error_dfs = []

def train_and_generate_error(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    error_df = pd.DataFrame([], columns=["Error Name", "Error Value", "Error Range"])
    def generate_error_row(error_name: str, err_val: float, err_range: str):
        # err_range renamed from `range` to avoid shadowing the built-in
        error_df.loc[len(error_df)] = [error_name, err_val, err_range]
    generate_error_row("Accuracy", acc, "[0, 1]")
    generate_error_row("Precision", precision, "[0, 1]")
    generate_error_row("F1 score", f1, "[0, 1]")
    generate_error_row("Recall", recall, "[0, 1]")
    error_dfs.append(error_df)
    return model
error_dfs = []
reg_model = train_and_generate_error(LogisticRegression(max_iter=500))
# Heatmapping errors on different models
model_names = ["LogisticRegression"]
error_values = {model_names[i] : list(map(lambda err: err[1], mdl.values)) for i, mdl in enumerate(error_dfs)}
model_error_df = pd.DataFrame(error_values, index=["Accuracy","Precision","F1 score","Recall"]).T
sns.heatmap(model_error_df, annot=True)
<Axes: >
TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
nn_model = tf.keras.Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
nn_model.summary()
Model: "sequential"
Layer (type) | Output Shape | Param #
---|---|---
dense (Dense) | (None, 32) | 3,968
dropout (Dropout) | (None, 32) | 0
dense_1 (Dense) | (None, 64) | 2,112
dropout_1 (Dropout) | (None, 64) | 0
dense_2 (Dense) | (None, 256) | 16,640
dropout_2 (Dropout) | (None, 256) | 0
dense_3 (Dense) | (None, 32) | 8,224
dense_4 (Dense) | (None, 1) | 33
Total params: 30,977 (121.00 KB)
Trainable params: 30,977 (121.00 KB)
Non-trainable params: 0 (0.00 B)
nn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", "precision", "r2_score", "recall"]
)
# nn_model.fit(X_train.toarray(), y_train, epochs=60, batch_size=128, validation_data=(X_test.toarray(), y_test))
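If the commented-out fit above were run, a hedged variant with early stopping would look like this (the callback and patience value are assumptions, not part of the original run; .toarray() is needed because the preprocessor outputs a sparse matrix):

```python
# Sketch: train with early stopping on the held-out split
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True)
history = nn_model.fit(X_train.toarray(), y_train,
                       epochs=60, batch_size=128,
                       validation_data=(X_test.toarray(), y_test),
                       callbacks=[early_stop])
```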
test_df = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
test_df.head(3)
id | Name | Gender | Age | City | Working Professional or Student | Profession | Academic Pressure | Work Pressure | CGPA | Study Satisfaction | Job Satisfaction | Sleep Duration | Dietary Habits | Degree | Have you ever had suicidal thoughts ? | Work/Study Hours | Financial Stress | Family History of Mental Illness | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 140700 | Shivam | Male | 53.0 | Visakhapatnam | Working Professional | Judge | NaN | 2.0 | NaN | NaN | 5.0 | Less than 5 hours | Moderate | LLB | No | 9.0 | 3.0 | Yes |
1 | 140701 | Sanya | Female | 58.0 | Kolkata | Working Professional | Educational Consultant | NaN | 2.0 | NaN | NaN | 4.0 | Less than 5 hours | Moderate | B.Ed | No | 6.0 | 4.0 | No |
2 | 140702 | Yash | Male | 53.0 | Jaipur | Working Professional | Teacher | NaN | 4.0 | NaN | NaN | 1.0 | 7-8 hours | Moderate | B.Arch | Yes | 12.0 | 4.0 | No |
pressure_col = test_df.apply(merge_pressure, axis=1)
pressure_col = pd.Series(pressure_col, name="Work/Study Pressure")
test_df = test_df.drop(["Academic Pressure","Work Pressure"], axis=1)
test_df = test_df.join(pressure_col)
satisfaction_col = test_df.apply(merge_satisfaction, axis=1)
satisfaction_col = pd.Series(satisfaction_col, name="Job/Study Satisfaction")
test_df = test_df.drop(["Study Satisfaction","Job Satisfaction"], axis=1)
test_df = test_df.join(satisfaction_col)
X = test_df.drop(["id","Name","CGPA"], axis=1)
X = preprocessor.transform(X)
/opt/conda/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:202: UserWarning: Found unknown categories in columns [1, 3, 4, 5, 6] during transform. These unknown categories will be encoded as all zeros warnings.warn(
# y_pred = nn_model.predict(X.toarray())
# final_output = pd.DataFrame({
#     "id": test_df["id"],
#     "Depression": (y_pred > 0.5).astype(int).flatten()
# })
# final_output.to_csv("nn_model_final.csv", index=False)
CatBoost
y = train_df["Depression"]
X = train_df.drop(["id","Name","CGPA","Depression"], axis=1) # Dropping CGPA because I don't have time for it
for col in X.columns:
    # Get the unique values from the column, excluding NaN
    unique_vals = X[col].dropna().unique()
    # For columns with NaNs, replace them with random choices from the unique values
    if len(unique_vals) > 0:
        nan_indices = X[col].isna()  # Boolean mask of the NaN positions
        X.loc[nan_indices, col] = np.random.choice(unique_vals, size=nan_indices.sum())
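This random imputation is not reproducible without seeding np.random, and it injects noise into the features. A hedged alternative sketch: give the categorical columns a constant missing bucket and leave numeric NaNs alone, since CatBoost handles missing numeric values natively:

```python
# Alternative sketch: constant "Unknown" bucket for categoricals,
# numeric NaNs left as-is for CatBoost's native missing-value handling
for col in cat_columns:
    X[col] = X[col].fillna("Unknown")
```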
X[cat_columns] = X[cat_columns].apply(lambda col: col.astype('category'))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from catboost import CatBoostClassifier, Pool
cb_model = CatBoostClassifier(
    iterations=1000,
    depth=10,
    learning_rate=0.05,
    loss_function='Logloss',
    cat_features=cat_columns,
    verbose=200
)
cb_model.fit(X_train, y_train)
0:	learn: 0.6021241	total: 265ms	remaining: 4m 24s
200:	learn: 0.1247284	total: 34.4s	remaining: 2m 16s
400:	learn: 0.1038873	total: 1m 11s	remaining: 1m 46s
600:	learn: 0.0871013	total: 1m 45s	remaining: 1m 10s
800:	learn: 0.0734592	total: 2m 20s	remaining: 35s
999:	learn: 0.0613709	total: 2m 54s	remaining: 0us
<catboost.core.CatBoostClassifier at 0x7a6c5f6fb130>
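The learn loss is still falling at iteration 999 while held-out accuracy (below) sits around 94%, which hints at overfitting; a sketch of the same fit with an eval set and early stopping (the rounds value is an assumption):

```python
# Sketch: monitor a held-out set and stop when it stops improving
cb_model.fit(X_train, y_train,
             eval_set=(X_test, y_test),
             early_stopping_rounds=50,
             verbose=200)
```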
from sklearn.metrics import classification_report
y_pred = cb_model.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     22933
           1       0.85      0.81      0.83      5131

    accuracy                           0.94     28064
   macro avg       0.90      0.89      0.90     28064
weighted avg       0.94      0.94      0.94     28064
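Since the models are compared on accuracy, the default 0.5 decision threshold is not necessarily optimal; a quick sweep over predict_proba (a sketch) can check:

```python
# Sketch: sweep the decision threshold on the held-out set
proba = cb_model.predict_proba(X_test)[:, 1]
for t in [0.40, 0.45, 0.50, 0.55, 0.60]:
    acc = accuracy_score(y_test, (proba > t).astype(int))
    print(f"threshold {t:.2f}: accuracy {acc:.4f}")
```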
test_df = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
pressure_col = test_df.apply(merge_pressure, axis=1)
pressure_col = pd.Series(pressure_col, name="Work/Study Pressure")
test_df = test_df.drop(["Academic Pressure","Work Pressure"], axis=1)
test_df = test_df.join(pressure_col)
satisfaction_col = test_df.apply(merge_satisfaction, axis=1)
satisfaction_col = pd.Series(satisfaction_col, name="Job/Study Satisfaction")
test_df = test_df.drop(["Study Satisfaction","Job Satisfaction"], axis=1)
test_df = test_df.join(satisfaction_col)
X = test_df.drop(["id","Name","CGPA"], axis=1) # Dropping CGPA because I don't have time for it
for col in X.columns:
    # Get the unique values from the *training* column, excluding NaN
    unique_vals = train_df[col].dropna().unique()
    # For columns with NaNs, replace them with random choices from the training values
    if len(unique_vals) > 0:
        nan_indices = X[col].isna()  # Boolean mask of the NaN positions
        X.loc[nan_indices, col] = np.random.choice(unique_vals, size=nan_indices.sum())
cb_final_pred = cb_model.predict(X)
final_output = pd.DataFrame({
    "id": test_df["id"],
    "Depression": cb_final_pred,
})
final_output.to_csv("final_cb_model_output.csv", index=False)
Conclusion
Reviewing all the models and their results, the TensorFlow model performed the best (~94% accuracy), followed by the Logistic Regression model (~93.9%), and finally the CatBoost model (~93.3%).