Code samples - how to experiment with knobs
In [1]:
#!pip install plotly
In [85]:
import pandas as pd
import pyreadstat
import plotly.graph_objects as go
import plotly.subplots as psp
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
from plotly.offline import init_notebook_mode, iplot, plot
from plotly.subplots import make_subplots
init_notebook_mode()
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
##. Add KNN model
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibrationDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
#from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.utils import resample
In [86]:
#!pip install pyreadstat
#!pip install sklearn
print ("vsersion = {}.''")
vsersion = {}.'' Resampling Notebook¶Read Data files¶In [87]:
sdf, smeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_screener_SAS/nsch_2020_screener.sas7bdat')
tdf20, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_topical_SAS/nsch_2020_topical.sas7bdat')
tdf19, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2019_topical_SAS/nsch_2019_topical.sas7bdat')
tdf18, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2018_topical_SAS/nsch_2018_topical.sas7bdat')
tdf17, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2017_topical_SAS/nsch_2017_topical.sas7bdat')
#tdf16, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2016_topical_SAS/nsch_2016_topical.sas7bdat')
In [88]:
tdf20['year']=2020
tdf19['year']=2019
tdf18['year']=2018
tdf17['year']=2017
tdf19['CONFIRMINJURY'] = tdf19['K2Q46A']
tdf18['CONFIRMINJURY'] = tdf18['K2Q46A']
tdf17['CONFIRMINJURY'] = tdf17['K2Q46A']
combined_17_18 = pd.concat([tdf18,tdf17])
combined_17_19 = pd.concat([tdf19,tdf18,tdf17])
combined_all = pd.concat([tdf20,tdf19,tdf18,tdf17])
In [89]:
print (tdf17.shape, tdf18.shape, tdf19.shape, tdf20.shape)
(21599, 433) (30530, 444) (29433, 445) (42777, 444) In [90]:
combined_17_19.head()
Out[90]:
5 rows × 461 columns In [8]:
combined_all.head()
Out[8]:
5 rows × 470 columns In [9]:
# Pre built CSV files
#input_file = 'NSCH_dataset/work/adhd-17-20-downsample-remove-missing-sc.csv'
#input_file = 'NSCH_dataset/work/adhd-17-20-remove-missing-sc.csv'
#input_file = 'NSCH_dataset/work/adhd-20-all-sc.csv'
Feature Columns¶In [10]:
feature_cols = ['Childs_age'
,'Mothers_age'
,'Family_structure'
,'Race'
,'Mothers_education'
,'Sex'
,'Premature'
,'Low_Birth_Weight'
,'Very_Low_Birth_Weight'
,'Insurance'
,'Headaches'
,'Depression'
,'Asthma'
,'Arthritis'
,'Anxiety'
,'Allergies'
,'Alcohol']
In [11]:
def read_adhd_data():
#sdf, smeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_screener_SAS/nsch_2020_screener.sas7bdat')
tdf = pd.read_csv(input_file)
return tdf
def feature_subset_new(tdf):
tdf['target'] = 2-tdf['K2Q31A']
## check this
# BIRTH_YR does not work in 17-19-downsample
#tdf['Childs_age'] = tdf['year'] - tdf['BIRTH_YR'].fillna(-1).astype(int)
#tdf['Childs_age'] = 2020 - tdf['BIRTH_YR'].fillna(-1).astype(int)
### Remove under 4 years
# v8
#tdf = tdf[tdf['Childs_age']>4]
# it was removed before Oct 25
tdf['Childs_age'] = tdf['SC_AGE_YEARS'].fillna(-1).astype(int)
tdf['Mothers_age'] = tdf['MOMAGE'].fillna(-1).astype(int)
tdf['Family_structure'] = tdf['FAMILY_R'].fillna(-1).astype(int)
tdf['Race'] = tdf['SC_RACER'].fillna(-1).astype(int)
tdf['Mothers_education'] = tdf['HIGRADE'].fillna(-1).astype(int)
tdf['Sex'] = tdf['SC_SEX'].fillna(-1).astype(int)
tdf['Premature'] = tdf['K2Q05'].fillna(-1).astype(int)
tdf['Low_Birth_Weight'] = tdf['BIRTHWT_L'].fillna(-1).astype(int)
tdf['Very_Low_Birth_Weight'] = tdf['BIRTHWT_VL'].fillna(-1).astype(int)
tdf['Insurance'] = tdf['HCCOVOTH'].fillna(-1).astype(int)
##Health Questions
tdf['Headaches'] = tdf['HEADACHE'].fillna(2).astype(int)
tdf['Depression'] = tdf['K2Q32A'].fillna(2).astype(int)
#tdf['Brain_injury'] = tdf['CONFIRMINJURY'].fillna(2).astype(int)
tdf['Asthma'] = tdf['K2Q40A'].fillna(2).astype(int)
tdf['Arthritis'] = tdf['ARTHRITIS'].fillna(2).astype(int)
tdf['Anxiety'] = tdf['K2Q33A'].fillna(2).astype(int)
tdf['Allergies'] = tdf['ALLERGIES'].fillna(2).astype(int)
tdf['Alcohol'] = tdf['ACE9'].fillna(2).astype(int)
#v8 added
#df_train = pd.DataFrame()
#df_train = tdf[feature_cols+['target']].dropna()
#df_train = df_train[~df_train['target'].isnull()].copy()
return tdf
def feature_clean(df):
df = tdf[feature_cols+['target']].dropna()
df = df_train[~df_train['target'].isnull()].copy()
def remove_missing(tdf):
tdf = tdf[(tdf['Mothers_age']>=0) |
(tdf['Family_structure']>=0)|
(tdf['Race']>=0)|
(tdf['Mothers_education']>=0)|
(tdf['Sex']>=0)|
(tdf['Premature']>=0)|
(tdf['Low_Birth_Weight']>=0)|
(tdf['Very_Low_Birth_Weight']>=0)|
(tdf['Insurance']>=0)]
return tdf
def downsample(tdf, sub=True):
adhd = tdf[tdf["K2Q31A"] == 1]
if(sub==True):
noadhd = resample(tdf[tdf["K2Q31A"] == 2],
replace = False,
n_samples = 2 * len(adhd),
random_state = 50)
else:
noadhd = tdf[tdf["K2Q31A"] == 2]
df = pd.concat([adhd,noadhd],axis=0)
return df
def plot_results(pipeline, X, y, X_v1_train, X_v1_test, y_v1_train, y_v1_test, X_v1_valid= None, y_v1_valid=None):
clf = pipeline.fit(X, y )
#X_v1_train, X_v1_test, y_v1_train, y_v1_test = train_test_split(X_v1, y_v1, test_size=0.3)
print ("X_v1_train - ",X_v1_train.shape)
print ("X_v1_test - ",X_v1_test.shape)
print ("y_v1_train - ",y_v1_train.shape)
print ("y_v1_test - ",y_v1_test.shape)
print ("X_v1_valid - ",X_v1_valid.shape)
print ("y_v1_valid - ",y_v1_valid.shape)
# Apply The Full Featured Classifier To The Test Data
prediction_train = clf.predict(X_v1_train)
ac_train = accuracy_score(y_v1_train, prediction_train)
print ("accuracy score on training set- ",ac_train)
prediction = clf.predict(X_v1_test)
ac = accuracy_score(y_test, prediction)
print ("accuracy score on test set- ",ac)
if X_v1_valid is not None :
prediction_valid = clf.predict(X_v1_valid)
ac_valid = accuracy_score(y_v1_valid, prediction_valid)
print ("accuracy score on validation set- ",ac_valid)
labels_names = clf.classes_
target_names = clf.classes_
#print(classification_report(y_true, y_pred,labels=labels_names, target_names=target_names))
#print(classification_report(X_v1_train, y_pred,labels=labels_names, target_names=target_names))
# label should be binary (-1,1) or (0,1)
# returns fpr, tpr, threshold
fpr_train, tpr_train, t_train = metrics.roc_curve(y_train, [x[1] for x in pipeline.predict_proba(X_v1_train)])
fpr_test, tpr_test, t_test = metrics.roc_curve(y_test, [x[1] for x in pipeline.predict_proba(X_v1_test)])
if X_v1_valid is not None :
fpr_valid, tpr_valid, t_valid = metrics.roc_curve(y_valid, [x[1] for x in pipeline.predict_proba(X_v1_valid)])
fig = go.Figure()
fig.update_layout(legend=dict(
yanchor="bottom",
y=0,
xanchor="right",
x=1
))
fig.layout.height = 500
fig.layout.width = 500
fig.layout.xaxis.range = [0,1]
fig.layout.yaxis.range = [0,1]
fig.layout.title = 'ROC Curve'
fig.layout.xaxis.title = 'FPR'
fig.layout.yaxis.title = 'TRP (Sensitivity)'
fig.add_trace(go.Scatter(
x=fpr_train
,y=tpr_train
,name=f'Train AUC:{metrics.auc(fpr_train,tpr_train):,.03f}'
))
fig.add_trace(go.Scatter(
x=fpr_test
,y=tpr_test
,name=f'Test AUC:{metrics.auc(fpr_test,tpr_test):,.03f}'
))
fig.show()
# Added new
fig = go.Figure()
fig.update_layout(legend=dict(
yanchor="bottom",
y=0,
xanchor="right",
x=1
))
fig.layout.height = 500
fig.layout.width = 500
fig.layout.xaxis.range = [0,1]
fig.layout.yaxis.range = [0,1]
fig.layout.title = 'ROC Curve on validation'
fig.layout.xaxis.title = 'FPR'
fig.layout.yaxis.title = 'TRP (Sensitivity)'
fig.add_trace(go.Scatter(
x=fpr_train
,y=tpr_train
,name=f'Train AUC:{metrics.auc(fpr_train,tpr_train):,.03f}'
))
fig.add_trace(go.Scatter(
x=fpr_valid
,y=tpr_valid
,name=f'Validation AUC:{metrics.auc(fpr_valid,tpr_valid):,.03f}'
))
fig.show()
precision_train, recall_train, thresholds_train = metrics.precision_recall_curve(y_train, [x[1] for x in pipeline.predict_proba(X_v1_train)])
precision_test, recall_test, thresholds_test = metrics.precision_recall_curve(y_test, [x[1] for x in pipeline.predict_proba(X_v1_test)])
if X_v1_valid is not None :
precision_valid, recall_valid, thresholds_valid = metrics.precision_recall_curve(y_valid, [x[1] for x in pipeline.predict_proba(X_v1_valid)])
fig = go.Figure()
fig.layout.height = 500
fig.layout.width = 500
fig.layout.xaxis.range = [0,1]
fig.layout.yaxis.range = [0,1]
fig.layout.xaxis.title = 'Recall'
fig.layout.yaxis.title = 'Precision'
fig.layout.title = 'PR Curve on Test'
fig.add_trace(go.Scatter(
x=recall_train
,y=precision_train
,name='Train'
))
fig.add_trace(go.Scatter(
x=recall_test
,y=precision_test
,name='Test'
))
fig.show()
fig = go.Figure()
fig.layout.height = 500
fig.layout.width = 500
fig.layout.xaxis.range = [0,1]
fig.layout.yaxis.range = [0,1]
fig.layout.xaxis.title = 'Recall'
fig.layout.yaxis.title = 'Precision'
fig.layout.title = 'PR Curve on Validation'
fig.add_trace(go.Scatter(
x=recall_train
,y=precision_train
,name='Train'
))
fig.add_trace(go.Scatter(
x=recall_valid
,y=precision_valid
,name='Valid'
))
fig.show()
"""
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
color = 'white'
matrix = plot_confusion_matrix(knn, X_test, y_test, cmap=plt.cm.Blues)
matrix.ax_.set_title('Confusion Matrix', color=color)
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)
plt.gcf().axes[0].tick_params(colors=color)
plt.gcf().axes[1].tick_params(colors=color)
plt.show()
"""
cm = metrics.confusion_matrix(y_test,pipeline.predict(X_v1_test))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.show()
cm_valid = metrics.confusion_matrix(y_valid,pipeline.predict(X_v1_valid))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_valid, display_labels=clf.classes_)
disp.plot()
plt.show()
"""
print ("call 2nd confusion metrics")
labels=clf.classes_
print ("labels=", labels)
plot_confusion_metric(y_v1_test, prediction, labels)
"""
In [12]:
train_test_val_option = 1
Option 1 (2017-2020)¶In [13]:
# 42777,443 from ss7bdat
# 12918, 444 from csv
#. Read from file
#tdf = read_adhd_data()
# Read from Memory
if train_test_val_option == 1:
tdf = combined_all
output_file_name = 'combined_17_20.csv'
elif train_test_val_option == 2:
tdf = combined_17_19
output_file_name = 'combined_17_19.csv'
elif train_test_val_option ==3:
tdf = combined_17_18
output_file_name = 'combined_17_18.csv'
Option 2 (2017-2019), 2020¶In [14]:
# option 1 - 124339, 2 - 81562 3- 59963
tdf.shape
Out[14]:
(124339, 470) In [15]:
tdf.head()
Out[15]:
5 rows × 470 columns Data Prep and cleaning¶In [16]:
df_train = feature_subset_new(tdf)
/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:9: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:22: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:24: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:25: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:26: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:27: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:28: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:29: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:30: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:31: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:32: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:35: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:36: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:38: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:39: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:40: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:41: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` /var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:42: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` In [17]:
df_train.shape
Out[17]:
(124339, 488) In [18]:
# 124339 124339 rows 2. 81562 3. 59963 to 59963
tdf2 = remove_missing(df_train)
In [19]:
tdf2.shape
Out[19]:
(124339, 488) In [20]:
# 1 124339 to 37206. changing 14582*488. tp. 24288*461 3. 59963 to 18099
#tdf3 = downsample(tdf2)
#tdf3.shape
#tdf = tdf3
In [21]:
tdf = tdf2
In [22]:
#df_train = feature_subset(tdf)
# 2017-2019
#df_train = feature_subset_new(tdf)
In [23]:
df_train = feature_subset_new(tdf)
In [24]:
df_train.shape
Out[24]:
(124339, 488) In [25]:
df_train_20 = feature_subset_new(tdf20)
df_train_20 = tdf20[feature_cols+['target']].dropna()
df_train_20 = df_train_20[~df_train_20['target'].isnull()].copy()
df_train_19 = feature_subset_new(tdf19)
df_train_19 = tdf19[feature_cols+['target']].dropna()
df_train_19 = df_train_19[~df_train_19['target'].isnull()].copy()
In [26]:
# original has 19 columns
# adhd-20-all has 18 columns. removed brain injury
df_train = tdf[feature_cols+['target']].dropna()
df_train = df_train[~df_train['target'].isnull()].copy()
df_train.head(1000)
Out[26]:
1000 rows × 18 columns In [27]:
#. option 2. reduced from 81562 to 80906
df_train.shape
Out[27]:
(123495, 18) Write file¶In [28]:
df_train.to_csv(output_file_name)
df_train_20.to_csv('only_20.csv')
df_train_19.to_csv('only_19.csv')
In [29]:
y = df_train['target']
#X =df_train[['Anxiety']] # need 2d array, double bracket
X = df_train[feature_cols]
#X_train, X_test, y_train, y_test = train_test_split(column_trans, y, test_size=0.1)
#clf=RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3, class_weight={0:0.10, 1:0.90})
#clf.fit(X_train,y_train)
In [30]:
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
In [31]:
import pandas_profiling as pp
pp.ProfileReport(df_train)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s] Generate report structure: 0%| | 0/1 [00:00<?, ?it/s] Render HTML: 0%| | 0/1 [00:00<?, ?it/s] Out[31]:
Create Train Test Validation Set¶In [32]:
## Option 1 - Divide 2017-2020 in train test validation set
## Option 2 - Divide 2017-2019 in train test set , 2020 as validation set
In [33]:
if train_test_val_option == 1 :
# Option 1 - Divide 2017-2020 in train test validation set
percentage_test= .2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=percentage_test, random_state=1)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=percentage_test, random_state=1)
elif train_test_val_option == 2:
percentage_test= .2
# 2017-2019 data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=percentage_test, random_state=1)
y_valid = df_train_20['target']
#X =df_train[['Anxiety']] # need 2d array, double bracket
X_valid = df_train_20[feature_cols]
elif train_test_val_option == 3:
X_train = X
y_train = y
X_test = df_train_19[feature_cols]
y_test = df_train_19['target']
#X =df_train[['Anxiety']] # need 2d array, double bracket
X_valid = df_train_19[feature_cols]
y_valid = df_train_20['target']
#X =df_train[['Anxiety']] # need 2d array, double bracket
X_valid = df_train_20[feature_cols]
In [34]:
## Option 2 - Divide 2017-2019 in train test set , 2020 as validation set
In [ ]:
In [35]:
# option 1 79036, 24669. 19760
# option 2 64724. 16182. 42589
# option 3 59445 29246 42589
print (" train test validation X " , X_train.shape, X_test.shape, X_valid.shape)
train test validation X (79036, 17) (24699, 17) (19760, 17) In [36]:
print (" train test validation y " , y_train.shape, y_test.shape, y_valid.shape)
train test validation y (79036,) (24699,) (19760,) Oversample¶In [37]:
oversample_option = 1
In [38]:
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
"""
# does not work for categorical
if oversample_option == 1:
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train,y_train)
"""
if oversample_option == 2:
oversample = SMOTENC(categorical_features=[0, 1,2,3,4,5,6,7],random_state=0)
#oversample = SMOTENC(categorical_features=[0, 7], random_state=0)
X_train, y_train = oversample.fit_resample(X_train,y_train)
print ("resampled")
#X_resampled, y_resampled = smote_nc.fit_resample(X, y)
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Input In [38], in <cell line: 1>() ----> 1 from imblearn.over_sampling import SMOTE 2 from imblearn.over_sampling import SMOTENC 4 """ 5 # does not work for categorical 6 (...) 10 11 """ ModuleNotFoundError: No module named 'imblearn' In [39]:
print (" train test validation X " , X_train.shape, X_test.shape, X_valid.shape)
train test validation X (79036, 17) (24699, 17) (19760, 17) Profile¶In [40]:
import pandas_profiling as pp
pp.ProfileReport(X)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s] Generate report structure: 0%| | 0/1 [00:00<?, ?it/s] Render HTML: 0%| | 0/1 [00:00<?, ?it/s] Out[40]:
In [41]:
pp.ProfileReport(X_test)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s] Generate report structure: 0%| | 0/1 [00:00<?, ?it/s] Render HTML: 0%| | 0/1 [00:00<?, ?it/s] Out[41]:
In [42]:
pp.ProfileReport(X_valid)
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s] Generate report structure: 0%| | 0/1 [00:00<?, ?it/s] Render HTML: 0%| | 0/1 [00:00<?, ?it/s] Out[42]:
Model Training¶Make column transformer¶In [43]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
In [44]:
ct = make_column_transformer(
(StandardScaler (), ['Childs_age', 'Mothers_age']),
(OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex','Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis','Anxiety', 'Allergies', 'Alcohol' ]),
remainder='drop' # drop other columns
)
In [45]:
ct
Out[45]:
ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Childs_age', 'Mothers_age']), ('onehotencoder', OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'])])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Childs_age', 'Mothers_age']), ('onehotencoder', OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'])]) ['Childs_age', 'Mothers_age'] StandardScaler() ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'] OneHotEncoder() All Model Architecture¶RF pipeline¶In [46]:
from sklearn.ensemble import RandomForestClassifier
pipeline = make_pipeline(
ct,
RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3,class_weight={0:0.10, 1:0.90})
)
Gradient Boosted Decision Tree¶In [47]:
from sklearn.ensemble import GradientBoostingClassifier
pipeline = make_pipeline(
ct,
GradientBoostingClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3)
)
Naive SVC¶List of modeling architecture -¶Pipelines with feature transformtion are part of it In [48]:
# Create classifiers
lr = LogisticRegression()
gnb = GaussianNB()
rfc = RandomForestClassifier()
gr = GradientBoostingClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3)
rfc2 = RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3,class_weight={0:0.10, 1:0.90})
gr_pipeline = make_pipeline(
ct,
GradientBoostingClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3)
)
knn_pipeline = make_pipeline(
ct,
KNeighborsClassifier(n_neighbors=10)
)
rfc_pipeline3 = make_pipeline(
ct,
RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3,class_weight={0:0.10, 1:0.90})
)
rfc_pipeline4 = make_pipeline(
ct,
RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=4,class_weight={0:0.10, 1:0.90})
)
List of architecture to run¶In [49]:
"""
clf_list = [
(gr_pipeline, "Gradient"),
(knn_pipeline, "K Nearest Neighbor"),
(rfc_pipeline4, "Random forest 4"),
(rfc_pipeline3, "Random forest 3"),
]
"""
clf_list = [
(gr_pipeline, "Gradient"),
(rfc_pipeline4, "Random forest 4"),
(rfc_pipeline3, "Random forest 3"),
]
In [50]:
print("data is", X.shape, y.shape, X_test.shape, y_test.shape)
data is (123495, 17) (123495,) (24699, 17) (24699,) In [51]:
#!pip install sklearn
In [52]:
from matplotlib.gridspec import GridSpec
from sklearn.metrics import DetCurveDisplay, RocCurveDisplay
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(4, 2)
colors = plt.cm.get_cmap("Dark2")
ax_calibration_curve = fig.add_subplot(gs[:2, :2])
# prepare plots
fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(11, 5))
calibration_displays = {}
for i, (clf, name) in enumerate(clf_list):
"""
pipeline = make_pipeline(
ct,
clf(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3)
)
pipeline.fit(X, y )
"""
clf.fit(X_train, y_train)
RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax_roc, name=name)
DetCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax_det, name=name)
display = CalibrationDisplay.from_estimator(
clf,
X_test,
y_test,
n_bins=10,
name=name,
ax=ax_calibration_curve,
color=colors(i),
)
calibration_displays[name] = display
ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
ax_det.set_title("Detection Error Tradeoff (DET) curves")
ax_roc.grid(linestyle="--")
ax_det.grid(linestyle="--")
ax_calibration_curve.grid()
ax_calibration_curve.set_title("Calibration plots")
plt.legend()
plt.show()
In [53]:
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(5, 3)
colors = plt.cm.get_cmap("Dark2")
# Add histogram
grid_positions = [(0, 0), ( 0,1), (0, 2), (2,0), (2,1 ), (2, 2),(4,0) , (4,1)]
"""
gs = GridSpec(4, 2)
colors = plt.cm.get_cmap("Dark2")
# Add histogram
grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
"""
for i, (_, name) in enumerate(clf_list):
row, col = grid_positions[i]
ax = fig.add_subplot(gs[row, col])
ax.hist(
calibration_displays[name].y_prob,
range=(0, 1),
bins=10,
label=name,
color=colors(i),
)
ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count")
plt.tight_layout()
plt.show()
In [54]:
for i, (clf, name) in enumerate(clf_list):
print (name)
#plot_results(clf, X, y, X_train, X_test, y_train, y_test)
plot_results(clf, X, y, X_train, X_test, y_train, y_test, X_valid, y_valid)
Gradient X_v1_train - (79036, 17) X_v1_test - (24699, 17) y_v1_train - (79036,) y_v1_test - (24699,) X_v1_valid - (19760, 17) y_v1_valid - (19760,) accuracy score on training set- 0.9049293992610962 accuracy score on test set- 0.9084578323009028 accuracy score on validation set- 0.9035931174089069 Random forest 4 X_v1_train - (79036, 17) X_v1_test - (24699, 17) y_v1_train - (79036,) y_v1_test - (24699,) X_v1_valid - (19760, 17) y_v1_valid - (19760,) accuracy score on training set- 0.7912723315957285 accuracy score on test set- 0.7970363172598081 accuracy score on validation set- 0.793825910931174 Random forest 3 X_v1_train - (79036, 17) X_v1_test - (24699, 17) y_v1_train - (79036,) y_v1_test - (24699,) X_v1_valid - (19760, 17) y_v1_valid - (19760,) accuracy score on training set- 0.8300015182954603 accuracy score on test set- 0.8342847888578485 accuracy score on validation set- 0.833502024291498 Model Selected¶Calibrator¶In [55]:
"""
rfc_pipeline = make_pipeline( ct, RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3,class_weight={0:0.10, 1:0.90})
)
"""
#rfc_selected = rfc_pipeline4
rfc_selected = rfc_pipeline3
In [56]:
plot_results(clf, X, y, X_train, X_test, y_train, y_test, X_valid, y_valid)
X_v1_train - (79036, 17) X_v1_test - (24699, 17) y_v1_train - (79036,) y_v1_test - (24699,) X_v1_valid - (19760, 17) y_v1_valid - (19760,) accuracy score on training set- 0.8074168733235487 accuracy score on test set- 0.8108020567634318 accuracy score on validation set- 0.8092611336032388 In [63]:
for i, (clf, name) in enumerate(clf_list):
print (name)
Gradient Random forest 4 Random forest 3 In [64]:
clf
Out[64]:
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Childs_age', 'Mothers_age']), ('onehotencoder', OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'])])), ('randomforestclassifier', RandomForestClassifier(class_weight={0: 0.1, 1: 0.9}, max_depth=3, min_samples_leaf=10, min_samples_split=20, n_estimators=200))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Childs_age', 'Mothers_age']), ('onehotencoder', OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'])])), ('randomforestclassifier', RandomForestClassifier(class_weight={0: 0.1, 1: 0.9}, max_depth=3, min_samples_leaf=10, min_samples_split=20, n_estimators=200))]) ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Childs_age', 'Mothers_age']), ('onehotencoder', OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'])]) ['Childs_age', 'Mothers_age'] StandardScaler() ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'] OneHotEncoder() RandomForestClassifier(class_weight={0: 0.1, 1: 0.9}, max_depth=3, min_samples_leaf=10, min_samples_split=20, n_estimators=200) In [65]:
from joblib import dump, load
In [66]:
dump(clf, 'App/models/model_nov15.joblib')
Out[66]:
['App/models/model_nov15.joblib'] In [67]:
X_train.head()
Out[67]:
In [68]:
#rclf = clf_list[2]
In [69]:
#rclf
In [70]:
pred_probs_train = clf.predict_proba(X_train)[:,1]
pred_probs_test = clf.predict_proba(X_test)[:,1]
pred_probs_valid = clf.predict_proba(X_valid)[:,1]
In [71]:
X_valid.head()
Out[71]:
In [72]:
pred_probs_train.shape
Out[72]:
(79036,) In [73]:
pred_probs_test.shape
Out[73]:
(24699,) In [74]:
pred_probs_valid.shape
Out[74]:
(19760,) In [75]:
X_valid["score_raw"] = [x[1] for x in clf.predict_proba(X_valid)]
X_valid["target"] = y_valid
X_valid.to_csv('App/data/df_validation_baseline_nov15.csv',index=False)
In [76]:
model_str = "randomforest3"
model_to_probs = {}
model_to_probs[model_str] = {'train': pred_probs_train, 'test': pred_probs_test, 'valid': pred_probs_valid}
In [77]:
model_to_probs[model_str]
Out[77]:
{'train': array([0.46206363, 0.25033351, 0.24809988, ..., 0.45276311, 0.43508985, 0.35771168]), 'test': array([0.45276311, 0.25722418, 0.35730152, ..., 0.50808493, 0.33125581, 0.43508985]), 'valid': array([0.33975222, 0.35771168, 0.38497885, ..., 0.38614429, 0.51071714, 0.82690017])} In [78]:
import seaborn as sns
plt.figure(figsize=(20,4))
plt.subplot(1,3,1)
sns.histplot(pred_probs_train)
plt.title(f"{model_str} - train", fontsize=20)
plt.subplot(1,3,2)
sns.histplot(pred_probs_test)
plt.title(f"{model_str} - test", fontsize=20)
plt.subplot(1,3,3)
sns.histplot(pred_probs_valid)
plt.title(f"{model_str} - validation", fontsize=20)
Out[78]:
Text(0.5, 1.0, 'randomforest3 - validation') In [79]:
model_str_to_trained_model = {}
model_str_to_trained_model[model_str] = clf
In [80]:
for model_str, pred_prob_dict in model_to_probs.items():
pred_probs = pred_prob_dict['test']
pred_probs_space = np.linspace(pred_probs.min(), pred_probs.max(), 10)
empirical_probs = []
pred_probs_midpoints = []
for i in range(len(pred_probs_space)-1):
empirical_probs.append(np.mean(y_test[(pred_probs > pred_probs_space[i]) & (pred_probs < pred_probs_space[i+1])]))
pred_probs_midpoints.append((pred_probs_space[i] + pred_probs_space[i+1])/2)
plt.figure(figsize=(10,4))
plt.plot(pred_probs_midpoints, empirical_probs, linewidth=2, marker='o')
plt.title(f"{model_str}", fontsize=20)
plt.xlabel('predicted prob', fontsize=14)
plt.ylabel('empirical prob', fontsize=14)
plt.plot([0,1],[0,1],linestyle='--',color='gray')
plt.legend(['original', 'ideal'], fontsize=20)
Use Calibration¶clf = model_str_to_trained_model['rf']¶In [81]:
clf
Out[81]:
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Childs_age', 'Mothers_age']), ('onehotencoder', OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'])])), ('randomforestclassifier', RandomForestClassifier(class_weight={0: 0.1, 1: 0.9}, max_depth=3, min_samples_leaf=10, min_samples_split=20, n_estimators=200))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Childs_age', 'Mothers_age']), ('onehotencoder', OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'])])), ('randomforestclassifier', RandomForestClassifier(class_weight={0: 0.1, 1: 0.9}, max_depth=3, min_samples_leaf=10, min_samples_split=20, n_estimators=200))]) ColumnTransformer(transformers=[('standardscaler', StandardScaler(), ['Childs_age', 'Mothers_age']), ('onehotencoder', OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'])]) ['Childs_age', 'Mothers_age'] StandardScaler() ['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol'] OneHotEncoder() RandomForestClassifier(class_weight={0: 0.1, 1: 0.9}, max_depth=3, min_samples_leaf=10, min_samples_split=20, n_estimators=200) In [82]:
### Rewrite this function
def my_prediction_calibrated(age,mage,fs,race,me,sex,pre,lwth,vlwth,insu,head,depr,ast,art,anx,alle,alc):
# make a list
x_test = pd.Series()
# fill values
x_test["Childs_age"] = age
x_test["Mothers_age"] = mage
x_test["Family_structure"] = fs
x_test["Race"] = race
x_test["Mothers_education"] = me
x_test["Sex"] = sex
x_test["Premature"] = pre
x_test["Low_Birth_Weight"] = lwth
x_test["Very_Low_Birth_Weight"] = vlwth
x_test["Insurance"] = insu
x_test["Headaches"] = head
x_test["Depression"] = depr
x_test["Asthma"] = ast
x_test["Arthritis"] = art
x_test["Anxiety"] = anx
x_test["Allergies"] = alle
x_test["Alcohol"] = alc
#make prediction
ans= pipeline.predict(X=pd.DataFrame([x_test]))
return (ans)
Calibrator¶In [83]:
model_str_to_calibrator = {}
for model_str, pred_prob_dict in model_to_probs.items():
#train calibration model
lr_model = LogisticRegression()
lr_model.fit(pred_prob_dict['test'].reshape(-1,1), y_test)
pred_probs = pred_prob_dict['valid']
pred_probs_space = np.linspace(pred_probs.min(), pred_probs.max(), 10)
empirical_probs = []
pred_probs_midpoints = []
for i in range(len(pred_probs_space)-1):
empirical_probs.append(np.mean(y_valid[(pred_probs > pred_probs_space[i]) & (pred_probs < pred_probs_space[i+1])]))
pred_probs_midpoints.append((pred_probs_space[i] + pred_probs_space[i+1])/2)
calibrated_probs = lr_model.predict_proba(np.array([0.0]+pred_probs_midpoints+[1.0]).reshape(-1,1))[:,1]
plt.figure(figsize=(10,4))
plt.plot(pred_probs_midpoints, empirical_probs, linewidth=2, marker='o')
plt.title(f"{model_str}", fontsize=20)
plt.xlabel('predicted prob', fontsize=14)
plt.ylabel('empirical prob', fontsize=14)
plt.plot([0.0]+pred_probs_midpoints+[1.0], calibrated_probs, linewidth=2, marker='o')
plt.plot([0,1],[0,1],linestyle='--',color='gray')
plt.legend(['original', 'calibrated', 'ideal'], fontsize=20)
model_str_to_calibrator[model_str] = lr_model
In [84]:
print(" complete")
complete In [ ]:
In [ ]:
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||