#!pip install plotly


import pandas as pd
import pyreadstat
import plotly.graph_objects as go
import plotly.subplots as psp
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'
from plotly.offline import init_notebook_mode, iplot, plot
from plotly.subplots import make_subplots
init_notebook_mode()

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

##. Add KNN model
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor

import matplotlib.pyplot as plt



import numpy as np

from matplotlib.colors import ListedColormap

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay

from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline


from sklearn.calibration import CalibrationDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

#from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.utils import resample


#!pip install pyreadstat
#!pip install sklearn
print ("vsersion =  {}.''")

vsersion =  {}.''


sdf, smeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_screener_SAS/nsch_2020_screener.sas7bdat')
tdf20, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_topical_SAS/nsch_2020_topical.sas7bdat')

tdf19, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2019_topical_SAS/nsch_2019_topical.sas7bdat')
tdf18, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2018_topical_SAS/nsch_2018_topical.sas7bdat')
tdf17, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2017_topical_SAS/nsch_2017_topical.sas7bdat')
#tdf16, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2016_topical_SAS/nsch_2016_topical.sas7bdat')


tdf20['year']=2020
tdf19['year']=2019
tdf18['year']=2018
tdf17['year']=2017

tdf19['CONFIRMINJURY'] = tdf19['K2Q46A']
tdf18['CONFIRMINJURY'] = tdf18['K2Q46A']
tdf17['CONFIRMINJURY'] = tdf17['K2Q46A']

combined_17_18 = pd.concat([tdf18,tdf17])
combined_17_19 = pd.concat([tdf19,tdf18,tdf17])
combined_all = pd.concat([tdf20,tdf19,tdf18,tdf17])


print (tdf17.shape, tdf18.shape, tdf19.shape, tdf20.shape)

(21599, 433) (30530, 444) (29433, 445) (42777, 444)


combined_17_19.head()


combined_all.head()


# Pre built CSV files
#input_file = 'NSCH_dataset/work/adhd-17-20-downsample-remove-missing-sc.csv'
#input_file = 'NSCH_dataset/work/adhd-17-20-remove-missing-sc.csv'

#input_file = 'NSCH_dataset/work/adhd-20-all-sc.csv'


feature_cols = ['Childs_age'
    ,'Mothers_age'
    ,'Family_structure'
    ,'Race'
    ,'Mothers_education'
    ,'Sex'
    ,'Premature'
    ,'Low_Birth_Weight'
    ,'Very_Low_Birth_Weight'
    ,'Insurance'
    ,'Headaches'
    ,'Depression'
    ,'Asthma'
    ,'Arthritis'
    ,'Anxiety'
    ,'Allergies'
    ,'Alcohol']


def read_adhd_data():
    #sdf, smeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_screener_SAS/nsch_2020_screener.sas7bdat')
    
    tdf = pd.read_csv(input_file)
    
    return tdf

def feature_subset_new(tdf):
    tdf['target'] = 2-tdf['K2Q31A']
    ## check this
    
    # BIRTH_YR does not work in 17-19-downsample
    #tdf['Childs_age'] = tdf['year'] - tdf['BIRTH_YR'].fillna(-1).astype(int) 
    #tdf['Childs_age'] = 2020 - tdf['BIRTH_YR'].fillna(-1).astype(int) 
    
    ### Remove under 4 years
    
    # v8
    #tdf = tdf[tdf['Childs_age']>4]
    
    # it was removed before Oct 25
    tdf['Childs_age'] = tdf['SC_AGE_YEARS'].fillna(-1).astype(int)
    
    tdf['Mothers_age'] = tdf['MOMAGE'].fillna(-1).astype(int)
    tdf['Family_structure'] = tdf['FAMILY_R'].fillna(-1).astype(int)
    tdf['Race'] = tdf['SC_RACER'].fillna(-1).astype(int)
    tdf['Mothers_education'] = tdf['HIGRADE'].fillna(-1).astype(int)
    tdf['Sex'] = tdf['SC_SEX'].fillna(-1).astype(int)
    tdf['Premature'] = tdf['K2Q05'].fillna(-1).astype(int)
    tdf['Low_Birth_Weight'] = tdf['BIRTHWT_L'].fillna(-1).astype(int)
    tdf['Very_Low_Birth_Weight'] = tdf['BIRTHWT_VL'].fillna(-1).astype(int)
    tdf['Insurance'] = tdf['HCCOVOTH'].fillna(-1).astype(int)
    
    ##Health Questions
    tdf['Headaches'] = tdf['HEADACHE'].fillna(2).astype(int)
    tdf['Depression'] = tdf['K2Q32A'].fillna(2).astype(int)
    #tdf['Brain_injury'] = tdf['CONFIRMINJURY'].fillna(2).astype(int)
    tdf['Asthma'] = tdf['K2Q40A'].fillna(2).astype(int)
    tdf['Arthritis'] = tdf['ARTHRITIS'].fillna(2).astype(int)
    tdf['Anxiety'] = tdf['K2Q33A'].fillna(2).astype(int)
    tdf['Allergies'] = tdf['ALLERGIES'].fillna(2).astype(int)
    tdf['Alcohol'] = tdf['ACE9'].fillna(2).astype(int)
    
    #v8 added
    #df_train = pd.DataFrame()
    #df_train = tdf[feature_cols+['target']].dropna()
    #df_train = df_train[~df_train['target'].isnull()].copy()
    
    return tdf


    
def feature_clean(df):
    df = tdf[feature_cols+['target']].dropna()
    df = df_train[~df_train['target'].isnull()].copy()



def remove_missing(tdf):
    tdf = tdf[(tdf['Mothers_age']>=0) | 
            (tdf['Family_structure']>=0)|
            (tdf['Race']>=0)|
            (tdf['Mothers_education']>=0)|
            (tdf['Sex']>=0)|
            (tdf['Premature']>=0)|
            (tdf['Low_Birth_Weight']>=0)|
            (tdf['Very_Low_Birth_Weight']>=0)|
            (tdf['Insurance']>=0)]
    
    return tdf


def downsample(tdf, sub=True):
    adhd = tdf[tdf["K2Q31A"] == 1]
    if(sub==True):
        noadhd = resample(tdf[tdf["K2Q31A"] == 2],
                          replace = False,
                          n_samples = 2 * len(adhd),
                          random_state = 50)
    else:
        noadhd = tdf[tdf["K2Q31A"] == 2]
    df = pd.concat([adhd,noadhd],axis=0)
    return df


def plot_results(pipeline, X, y, X_v1_train, X_v1_test, y_v1_train, y_v1_test, X_v1_valid= None, y_v1_valid=None):
    clf = pipeline.fit(X, y )
    

    #X_v1_train, X_v1_test, y_v1_train, y_v1_test = train_test_split(X_v1, y_v1, test_size=0.3)
    print ("X_v1_train - ",X_v1_train.shape)
    print ("X_v1_test - ",X_v1_test.shape)
    print ("y_v1_train - ",y_v1_train.shape)
    print ("y_v1_test - ",y_v1_test.shape)
    
    print ("X_v1_valid - ",X_v1_valid.shape)
    print ("y_v1_valid - ",y_v1_valid.shape)
    # Apply The Full Featured Classifier To The Test Data
    
    prediction_train = clf.predict(X_v1_train)
    ac_train = accuracy_score(y_v1_train, prediction_train)
    print ("accuracy score on training set- ",ac_train)
    
    prediction = clf.predict(X_v1_test)
    ac = accuracy_score(y_test, prediction)
    print ("accuracy score on test set- ",ac)
    
    if X_v1_valid is not None :
        prediction_valid = clf.predict(X_v1_valid)
        ac_valid = accuracy_score(y_v1_valid, prediction_valid)
        print ("accuracy score on validation set- ",ac_valid)
    
    labels_names = clf.classes_
    target_names = clf.classes_
    #print(classification_report(y_true, y_pred,labels=labels_names, target_names=target_names)) 
    #print(classification_report(X_v1_train, y_pred,labels=labels_names, target_names=target_names)) 
    
    # label should be binary (-1,1) or (0,1)
    # returns fpr, tpr, threshold
    fpr_train, tpr_train, t_train = metrics.roc_curve(y_train, [x[1] for x in pipeline.predict_proba(X_v1_train)])
    fpr_test, tpr_test, t_test = metrics.roc_curve(y_test, [x[1] for x in pipeline.predict_proba(X_v1_test)])
    
    if X_v1_valid is not None :
        fpr_valid, tpr_valid, t_valid = metrics.roc_curve(y_valid, [x[1] for x in pipeline.predict_proba(X_v1_valid)])
    
    
    fig = go.Figure()
    fig.update_layout(legend=dict(
        yanchor="bottom",
        y=0,
        xanchor="right",
        x=1
    ))
    fig.layout.height = 500
    fig.layout.width = 500
    fig.layout.xaxis.range = [0,1]
    fig.layout.yaxis.range = [0,1]
    fig.layout.title = 'ROC Curve'
    fig.layout.xaxis.title = 'FPR'
    fig.layout.yaxis.title = 'TRP (Sensitivity)'
    fig.add_trace(go.Scatter(
        x=fpr_train
        ,y=tpr_train
        ,name=f'Train AUC:{metrics.auc(fpr_train,tpr_train):,.03f}'
    ))
    fig.add_trace(go.Scatter(
        x=fpr_test
        ,y=tpr_test
        ,name=f'Test AUC:{metrics.auc(fpr_test,tpr_test):,.03f}'
    ))
    fig.show()

    # Added new
    fig = go.Figure()
    fig.update_layout(legend=dict(
        yanchor="bottom",
        y=0,
        xanchor="right",
        x=1
    ))
    fig.layout.height = 500
    fig.layout.width = 500
    fig.layout.xaxis.range = [0,1]
    fig.layout.yaxis.range = [0,1]
    fig.layout.title = 'ROC Curve on validation'
    fig.layout.xaxis.title = 'FPR'
    fig.layout.yaxis.title = 'TRP (Sensitivity)'
    fig.add_trace(go.Scatter(
        x=fpr_train
        ,y=tpr_train
        ,name=f'Train AUC:{metrics.auc(fpr_train,tpr_train):,.03f}'
    ))
    fig.add_trace(go.Scatter(
        x=fpr_valid
        ,y=tpr_valid
        ,name=f'Validation AUC:{metrics.auc(fpr_valid,tpr_valid):,.03f}'
    ))
    fig.show()
    
    
    precision_train, recall_train, thresholds_train = metrics.precision_recall_curve(y_train, [x[1] for x in pipeline.predict_proba(X_v1_train)])
    precision_test, recall_test, thresholds_test = metrics.precision_recall_curve(y_test, [x[1] for x in pipeline.predict_proba(X_v1_test)])
    
    if X_v1_valid is not None :
        precision_valid, recall_valid, thresholds_valid = metrics.precision_recall_curve(y_valid, [x[1] for x in pipeline.predict_proba(X_v1_valid)])

    fig = go.Figure()
    fig.layout.height = 500
    fig.layout.width = 500
    fig.layout.xaxis.range = [0,1]
    fig.layout.yaxis.range = [0,1]
    fig.layout.xaxis.title = 'Recall'
    fig.layout.yaxis.title = 'Precision'
    fig.layout.title = 'PR Curve on Test'
    fig.add_trace(go.Scatter(
        x=recall_train
        ,y=precision_train
        ,name='Train'
    ))
    fig.add_trace(go.Scatter(
        x=recall_test
        ,y=precision_test
        ,name='Test'
    ))

    fig.show()
    
    
    fig = go.Figure()
    fig.layout.height = 500
    fig.layout.width = 500
    fig.layout.xaxis.range = [0,1]
    fig.layout.yaxis.range = [0,1]
    fig.layout.xaxis.title = 'Recall'
    fig.layout.yaxis.title = 'Precision'
    fig.layout.title = 'PR Curve on Validation'
    fig.add_trace(go.Scatter(
        x=recall_train
        ,y=precision_train
        ,name='Train'
    ))
    fig.add_trace(go.Scatter(
        x=recall_valid
        ,y=precision_valid
        ,name='Valid'
    ))

    fig.show()
    
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import plot_confusion_matrix
 
    color = 'white'
    matrix = plot_confusion_matrix(knn, X_test, y_test, cmap=plt.cm.Blues)
    matrix.ax_.set_title('Confusion Matrix', color=color)
    plt.xlabel('Predicted Label', color=color)
    plt.ylabel('True Label', color=color)
    plt.gcf().axes[0].tick_params(colors=color)
    plt.gcf().axes[1].tick_params(colors=color)
    plt.show()
    """


    
    cm = metrics.confusion_matrix(y_test,pipeline.predict(X_v1_test))
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    disp.plot()
    plt.show()
    
    
    cm_valid = metrics.confusion_matrix(y_valid,pipeline.predict(X_v1_valid))
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_valid, display_labels=clf.classes_)
    disp.plot()
    plt.show()
    
    """

    print ("call 2nd confusion metrics")
    labels=clf.classes_
    print ("labels=", labels)
    plot_confusion_metric(y_v1_test, prediction, labels)
    
    """


train_test_val_option = 1


# 42777,443 from ss7bdat
# 12918, 444 from csv

#. Read from file
#tdf = read_adhd_data()


# Read from Memory
if train_test_val_option == 1:
    tdf = combined_all
    output_file_name = 'combined_17_20.csv'
elif train_test_val_option == 2:
    tdf = combined_17_19
    output_file_name = 'combined_17_19.csv'
elif train_test_val_option ==3:
    tdf = combined_17_18
    output_file_name = 'combined_17_18.csv'


# option 1 - 124339,  2 - 81562  3- 59963
tdf.shape

(124339, 470)


tdf.head()


df_train = feature_subset_new(tdf)

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:9: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:22: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:24: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:25: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:26: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:27: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:28: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:29: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:30: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:31: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:32: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:35: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:36: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:38: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:39: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:40: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:41: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`

/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:42: PerformanceWarning:

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


df_train.shape

(124339, 488)


#  124339 124339 rows   2.   81562      3.  59963 to 59963
tdf2 = remove_missing(df_train)


tdf2.shape

(124339, 488)


# 1 124339 to 37206. changing 14582*488.   tp. 24288*461   3. 59963 to 18099
#tdf3 = downsample(tdf2)
#tdf3.shape
#tdf = tdf3


tdf = tdf2


#df_train = feature_subset(tdf)
# 2017-2019
#df_train = feature_subset_new(tdf)


df_train = feature_subset_new(tdf)


df_train.shape

(124339, 488)


df_train_20 = feature_subset_new(tdf20)
df_train_20 = tdf20[feature_cols+['target']].dropna()
df_train_20 = df_train_20[~df_train_20['target'].isnull()].copy()

df_train_19 = feature_subset_new(tdf19)
df_train_19 = tdf19[feature_cols+['target']].dropna()
df_train_19 = df_train_19[~df_train_19['target'].isnull()].copy()


# original has 19 columns
# adhd-20-all has 18 columns.  removed brain injury
df_train = tdf[feature_cols+['target']].dropna()
df_train = df_train[~df_train['target'].isnull()].copy()
df_train.head(1000)


#. option 2.  reduced from 81562 to 80906
df_train.shape

(123495, 18)


df_train.to_csv(output_file_name)

df_train_20.to_csv('only_20.csv')
df_train_19.to_csv('only_19.csv')


y = df_train['target']
#X =df_train[['Anxiety']]  # need 2d array, double bracket
X = df_train[feature_cols]
#X_train, X_test, y_train, y_test = train_test_split(column_trans, y, test_size=0.1)
#clf=RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3, class_weight={0:0.10, 1:0.90})
#clf.fit(X_train,y_train)


#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip


import pandas_profiling as pp
pp.ProfileReport(df_train)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]


## Option 1 - Divide 2017-2020 in train test validation set 
## Option 2 - Divide 2017-2019 in train test set , 2020 as validation set


if train_test_val_option == 1 :
    # Option 1 - Divide 2017-2020 in train test validation set 
    percentage_test= .2
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=percentage_test, random_state=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=percentage_test, random_state=1)
elif train_test_val_option == 2:
    percentage_test= .2
    # 2017-2019 data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=percentage_test, random_state=1)

    y_valid = df_train_20['target']
    #X =df_train[['Anxiety']]  # need 2d array, double bracket
    X_valid = df_train_20[feature_cols]
elif train_test_val_option == 3:
    X_train = X
    y_train = y
    X_test = df_train_19[feature_cols]
    y_test = df_train_19['target']
    #X =df_train[['Anxiety']]  # need 2d array, double bracket
    X_valid = df_train_19[feature_cols]
    y_valid = df_train_20['target']
    #X =df_train[['Anxiety']]  # need 2d array, double bracket
    X_valid = df_train_20[feature_cols]


## Option 2 - Divide 2017-2019 in train test set , 2020 as validation set


# option 1 79036, 24669. 19760
# option 2 64724. 16182.  42589
# option 3 59445 29246 42589
print (" train test validation X " , X_train.shape, X_test.shape, X_valid.shape)

 train test validation X  (79036, 17) (24699, 17) (19760, 17)


print (" train test validation y " , y_train.shape, y_test.shape, y_valid.shape)

 train test validation y  (79036,) (24699,) (19760,)


oversample_option = 1


from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

"""
# does not work for categorical

if oversample_option == 1:
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train,y_train)
    
"""
if oversample_option == 2:
    oversample = SMOTENC(categorical_features=[0, 1,2,3,4,5,6,7],random_state=0)
    #oversample = SMOTENC(categorical_features=[0, 7], random_state=0)
    X_train, y_train = oversample.fit_resample(X_train,y_train)
    
    print ("resampled")
    #X_resampled, y_resampled = smote_nc.fit_resample(X, y)

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Input In [38], in <cell line: 1>()
----> 1 from imblearn.over_sampling import SMOTE
      2 from imblearn.over_sampling import SMOTENC
      4 """
      5 # does not work for categorical
      6 
   (...)
     10     
     11 """

ModuleNotFoundError: No module named 'imblearn'


print (" train test validation X " , X_train.shape, X_test.shape, X_valid.shape)

 train test validation X  (79036, 17) (24699, 17) (19760, 17)


import pandas_profiling as pp
pp.ProfileReport(X)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]


pp.ProfileReport(X_test)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]


pp.ProfileReport(X_valid)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]


from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


ct = make_column_transformer(
    (StandardScaler (), ['Childs_age', 'Mothers_age']),
    (OneHotEncoder(), ['Family_structure', 'Race', 'Mothers_education', 'Sex','Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis','Anxiety', 'Allergies',  'Alcohol' ]),
    remainder='drop'  # drop other columns
)

ct

ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                 ['Childs_age', 'Mothers_age']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Family_structure', 'Race',
                                  'Mothers_education', 'Sex', 'Premature',
                                  'Low_Birth_Weight', 'Very_Low_Birth_Weight',
                                  'Insurance', 'Headaches', 'Depression',
                                  'Asthma', 'Arthritis', 'Anxiety', 'Allergies',
                                  'Alcohol'])])

ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                 ['Childs_age', 'Mothers_age']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Family_structure', 'Race',
                                  'Mothers_education', 'Sex', 'Premature',
                                  'Low_Birth_Weight', 'Very_Low_Birth_Weight',
                                  'Insurance', 'Headaches', 'Depression',
                                  'Asthma', 'Arthritis', 'Anxiety', 'Allergies',
                                  'Alcohol'])])

['Childs_age', 'Mothers_age']

StandardScaler()

['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol']

OneHotEncoder()


from sklearn.ensemble import RandomForestClassifier
pipeline = make_pipeline(
    ct,
    RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3,class_weight={0:0.10, 1:0.90})
)


from sklearn.ensemble import GradientBoostingClassifier

pipeline = make_pipeline(
    ct,
    GradientBoostingClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3)
)


    
# Create classifiers
lr = LogisticRegression()
gnb = GaussianNB()
rfc = RandomForestClassifier()
gr = GradientBoostingClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3)
rfc2 = RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3,class_weight={0:0.10, 1:0.90})

gr_pipeline = make_pipeline(
    ct,
    GradientBoostingClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3)
    )
knn_pipeline = make_pipeline(
    ct,
       KNeighborsClassifier(n_neighbors=10)
    )
 
    
rfc_pipeline3 = make_pipeline(
    ct,
    RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3,class_weight={0:0.10, 1:0.90})

    )
rfc_pipeline4 = make_pipeline(
    ct,
    RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=4,class_weight={0:0.10, 1:0.90})

    )


"""

clf_list = [
    (gr_pipeline, "Gradient"),
    (knn_pipeline, "K Nearest Neighbor"),
    (rfc_pipeline4, "Random forest 4"),
    (rfc_pipeline3, "Random forest 3"),

]

"""
clf_list = [
    (gr_pipeline, "Gradient"),
    (rfc_pipeline4, "Random forest 4"),
    (rfc_pipeline3, "Random forest 3"),
]


print("data is", X.shape, y.shape, X_test.shape, y_test.shape)

data is (123495, 17) (123495,) (24699, 17) (24699,)


#!pip install sklearn


from matplotlib.gridspec import GridSpec
from sklearn.metrics import DetCurveDisplay, RocCurveDisplay

fig = plt.figure(figsize=(10, 10))
gs = GridSpec(4, 2)
colors = plt.cm.get_cmap("Dark2")
ax_calibration_curve = fig.add_subplot(gs[:2, :2])
# prepare plots
fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(11, 5))

calibration_displays = {}

for i, (clf, name) in enumerate(clf_list):
    
    """ 
    pipeline = make_pipeline(
    ct,
    clf(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3)
    )
    pipeline.fit(X, y )
    """
    
    
    
 
    clf.fit(X_train, y_train)
    RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax_roc, name=name)
    DetCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax_det, name=name)
    

  
    display = CalibrationDisplay.from_estimator(
        clf,
        X_test,
        y_test,
        n_bins=10,
        name=name,
        ax=ax_calibration_curve,
        color=colors(i),
    )
    calibration_displays[name] = display

    

ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
ax_det.set_title("Detection Error Tradeoff (DET) curves")

ax_roc.grid(linestyle="--")
ax_det.grid(linestyle="--")
    
ax_calibration_curve.grid()
ax_calibration_curve.set_title("Calibration plots")

plt.legend()
plt.show()


fig = plt.figure(figsize=(10, 10))
gs = GridSpec(5, 3)
colors = plt.cm.get_cmap("Dark2")


# Add histogram
grid_positions = [(0, 0), ( 0,1), (0, 2), (2,0), (2,1 ), (2, 2),(4,0) , (4,1)]


"""
gs = GridSpec(4, 2)
colors = plt.cm.get_cmap("Dark2")


# Add histogram
grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
"""

for i, (_, name) in enumerate(clf_list):
    row, col = grid_positions[i]
    ax = fig.add_subplot(gs[row, col])

    ax.hist(
        calibration_displays[name].y_prob,
        range=(0, 1),
        bins=10,
        label=name,
        color=colors(i),
    )
    ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count")

plt.tight_layout()
plt.show()


for i, (clf, name) in enumerate(clf_list):
    print (name)
    #plot_results(clf, X, y, X_train, X_test, y_train, y_test)
    plot_results(clf, X, y, X_train, X_test, y_train, y_test, X_valid, y_valid)

Gradient
X_v1_train -  (79036, 17)
X_v1_test -  (24699, 17)
y_v1_train -  (79036,)
y_v1_test -  (24699,)
X_v1_valid -  (19760, 17)
y_v1_valid -  (19760,)
accuracy score on training set-  0.9049293992610962
accuracy score on test set-  0.9084578323009028
accuracy score on validation set-  0.9035931174089069

Random forest 4
X_v1_train -  (79036, 17)
X_v1_test -  (24699, 17)
y_v1_train -  (79036,)
y_v1_test -  (24699,)
X_v1_valid -  (19760, 17)
y_v1_valid -  (19760,)
accuracy score on training set-  0.7912723315957285
accuracy score on test set-  0.7970363172598081
accuracy score on validation set-  0.793825910931174

Random forest 3
X_v1_train -  (79036, 17)
X_v1_test -  (24699, 17)
y_v1_train -  (79036,)
y_v1_test -  (24699,)
X_v1_valid -  (19760, 17)
y_v1_valid -  (19760,)
accuracy score on training set-  0.8300015182954603
accuracy score on test set-  0.8342847888578485
accuracy score on validation set-  0.833502024291498


""" 
rfc_pipeline = make_pipeline( ct, RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3,class_weight={0:0.10, 1:0.90})
)
"""

#rfc_selected = rfc_pipeline4

rfc_selected = rfc_pipeline3


plot_results(clf, X, y, X_train, X_test, y_train, y_test, X_valid, y_valid)

X_v1_train -  (79036, 17)
X_v1_test -  (24699, 17)
y_v1_train -  (79036,)
y_v1_test -  (24699,)
X_v1_valid -  (19760, 17)
y_v1_valid -  (19760,)
accuracy score on training set-  0.8074168733235487
accuracy score on test set-  0.8108020567634318
accuracy score on validation set-  0.8092611336032388


for i, (clf, name) in enumerate(clf_list):
    print (name)

Gradient
Random forest 4
Random forest 3

clf

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['Childs_age',
                                                   'Mothers_age']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Family_structure', 'Race',
                                                   'Mothers_education', 'Sex',
                                                   'Premature',
                                                   'Low_Birth_Weight',
                                                   'Very_Low_Birth_Weight',
                                                   'Insurance', 'Headaches',
                                                   'Depression', 'Asthma',
                                                   'Arthritis', 'Anxiety',
                                                   'Allergies', 'Alcohol'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight={0: 0.1, 1: 0.9},
                                        max_depth=3, min_samples_leaf=10,
                                        min_samples_split=20,
                                        n_estimators=200))])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['Childs_age',
                                                   'Mothers_age']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Family_structure', 'Race',
                                                   'Mothers_education', 'Sex',
                                                   'Premature',
                                                   'Low_Birth_Weight',
                                                   'Very_Low_Birth_Weight',
                                                   'Insurance', 'Headaches',
                                                   'Depression', 'Asthma',
                                                   'Arthritis', 'Anxiety',
                                                   'Allergies', 'Alcohol'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight={0: 0.1, 1: 0.9},
                                        max_depth=3, min_samples_leaf=10,
                                        min_samples_split=20,
                                        n_estimators=200))])

ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                 ['Childs_age', 'Mothers_age']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Family_structure', 'Race',
                                  'Mothers_education', 'Sex', 'Premature',
                                  'Low_Birth_Weight', 'Very_Low_Birth_Weight',
                                  'Insurance', 'Headaches', 'Depression',
                                  'Asthma', 'Arthritis', 'Anxiety', 'Allergies',
                                  'Alcohol'])])

['Childs_age', 'Mothers_age']

StandardScaler()

['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol']

OneHotEncoder()

RandomForestClassifier(class_weight={0: 0.1, 1: 0.9}, max_depth=3,
                       min_samples_leaf=10, min_samples_split=20,
                       n_estimators=200)


from joblib import dump, load


dump(clf, 'App/models/model_nov15.joblib')

['App/models/model_nov15.joblib']


X_train.head()


#rclf = clf_list[2]


#rclf


pred_probs_train = clf.predict_proba(X_train)[:,1]
pred_probs_test = clf.predict_proba(X_test)[:,1]
pred_probs_valid = clf.predict_proba(X_valid)[:,1]


X_valid.head()


pred_probs_train.shape

(79036,)


pred_probs_test.shape

(24699,)


pred_probs_valid.shape

(19760,)


X_valid["score_raw"] = [x[1] for x in clf.predict_proba(X_valid)]
X_valid["target"] = y_valid
X_valid.to_csv('App/data/df_validation_baseline_nov15.csv',index=False)


model_str = "randomforest3"
model_to_probs = {}
model_to_probs[model_str] = {'train': pred_probs_train, 'test': pred_probs_test, 'valid': pred_probs_valid}


model_to_probs[model_str]

{'train': array([0.46206363, 0.25033351, 0.24809988, ..., 0.45276311, 0.43508985,
        0.35771168]),
 'test': array([0.45276311, 0.25722418, 0.35730152, ..., 0.50808493, 0.33125581,
        0.43508985]),
 'valid': array([0.33975222, 0.35771168, 0.38497885, ..., 0.38614429, 0.51071714,
        0.82690017])}


import seaborn as sns
plt.figure(figsize=(20,4))
    
plt.subplot(1,3,1)
sns.histplot(pred_probs_train)
plt.title(f"{model_str} - train", fontsize=20)
    
plt.subplot(1,3,2)
sns.histplot(pred_probs_test)
plt.title(f"{model_str} - test", fontsize=20)
    
plt.subplot(1,3,3)
sns.histplot(pred_probs_valid)
plt.title(f"{model_str} - validation", fontsize=20)

Text(0.5, 1.0, 'randomforest3 - validation')


model_str_to_trained_model = {}
model_str_to_trained_model[model_str] = clf


for model_str, pred_prob_dict in model_to_probs.items():
    pred_probs = pred_prob_dict['test']

    pred_probs_space = np.linspace(pred_probs.min(), pred_probs.max(), 10)

    empirical_probs = []
    pred_probs_midpoints = []

    for i in range(len(pred_probs_space)-1):
        empirical_probs.append(np.mean(y_test[(pred_probs > pred_probs_space[i]) & (pred_probs < pred_probs_space[i+1])]))
        pred_probs_midpoints.append((pred_probs_space[i] + pred_probs_space[i+1])/2)

    plt.figure(figsize=(10,4))
    plt.plot(pred_probs_midpoints, empirical_probs, linewidth=2, marker='o')
    plt.title(f"{model_str}", fontsize=20)
    plt.xlabel('predicted prob', fontsize=14)
    plt.ylabel('empirical prob', fontsize=14)
    
    plt.plot([0,1],[0,1],linestyle='--',color='gray')
    
    plt.legend(['original', 'ideal'], fontsize=20)

clf

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['Childs_age',
                                                   'Mothers_age']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Family_structure', 'Race',
                                                   'Mothers_education', 'Sex',
                                                   'Premature',
                                                   'Low_Birth_Weight',
                                                   'Very_Low_Birth_Weight',
                                                   'Insurance', 'Headaches',
                                                   'Depression', 'Asthma',
                                                   'Arthritis', 'Anxiety',
                                                   'Allergies', 'Alcohol'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight={0: 0.1, 1: 0.9},
                                        max_depth=3, min_samples_leaf=10,
                                        min_samples_split=20,
                                        n_estimators=200))])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['Childs_age',
                                                   'Mothers_age']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Family_structure', 'Race',
                                                   'Mothers_education', 'Sex',
                                                   'Premature',
                                                   'Low_Birth_Weight',
                                                   'Very_Low_Birth_Weight',
                                                   'Insurance', 'Headaches',
                                                   'Depression', 'Asthma',
                                                   'Arthritis', 'Anxiety',
                                                   'Allergies', 'Alcohol'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight={0: 0.1, 1: 0.9},
                                        max_depth=3, min_samples_leaf=10,
                                        min_samples_split=20,
                                        n_estimators=200))])

ColumnTransformer(transformers=[('standardscaler', StandardScaler(),
                                 ['Childs_age', 'Mothers_age']),
                                ('onehotencoder', OneHotEncoder(),
                                 ['Family_structure', 'Race',
                                  'Mothers_education', 'Sex', 'Premature',
                                  'Low_Birth_Weight', 'Very_Low_Birth_Weight',
                                  'Insurance', 'Headaches', 'Depression',
                                  'Asthma', 'Arthritis', 'Anxiety', 'Allergies',
                                  'Alcohol'])])

['Childs_age', 'Mothers_age']

StandardScaler()

['Family_structure', 'Race', 'Mothers_education', 'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight', 'Insurance', 'Headaches', 'Depression', 'Asthma', 'Arthritis', 'Anxiety', 'Allergies', 'Alcohol']

OneHotEncoder()

RandomForestClassifier(class_weight={0: 0.1, 1: 0.9}, max_depth=3,
                       min_samples_leaf=10, min_samples_split=20,
                       n_estimators=200)


### Rewrite this function
def my_prediction_calibrated(age,mage,fs,race,me,sex,pre,lwth,vlwth,insu,head,depr,ast,art,anx,alle,alc):
    
    # make a list
    x_test = pd.Series()
    
    # fill values
    x_test["Childs_age"] = age
    x_test["Mothers_age"] = mage
    x_test["Family_structure"] = fs
    x_test["Race"] = race
    x_test["Mothers_education"] = me
    x_test["Sex"] = sex
    x_test["Premature"] = pre
    x_test["Low_Birth_Weight"] = lwth
    x_test["Very_Low_Birth_Weight"] = vlwth
    x_test["Insurance"] = insu
    x_test["Headaches"] = head
    x_test["Depression"] = depr
    x_test["Asthma"] = ast
    x_test["Arthritis"] = art
    x_test["Anxiety"] = anx
    x_test["Allergies"] = alle
    x_test["Alcohol"] = alc
    
    #make prediction
    ans= pipeline.predict(X=pd.DataFrame([x_test]))
    
    return (ans)


model_str_to_calibrator = {}

for model_str, pred_prob_dict in model_to_probs.items():
    #train calibration model
    lr_model = LogisticRegression()
    lr_model.fit(pred_prob_dict['test'].reshape(-1,1), y_test)
    
    pred_probs = pred_prob_dict['valid']

    pred_probs_space = np.linspace(pred_probs.min(), pred_probs.max(), 10)

    empirical_probs = []
    pred_probs_midpoints = []

    for i in range(len(pred_probs_space)-1):
        empirical_probs.append(np.mean(y_valid[(pred_probs > pred_probs_space[i]) & (pred_probs < pred_probs_space[i+1])]))
        pred_probs_midpoints.append((pred_probs_space[i] + pred_probs_space[i+1])/2)

    calibrated_probs = lr_model.predict_proba(np.array([0.0]+pred_probs_midpoints+[1.0]).reshape(-1,1))[:,1]
    
    plt.figure(figsize=(10,4))
    plt.plot(pred_probs_midpoints, empirical_probs, linewidth=2, marker='o')
    plt.title(f"{model_str}", fontsize=20)
    plt.xlabel('predicted prob', fontsize=14)
    plt.ylabel('empirical prob', fontsize=14)
    
    plt.plot([0.0]+pred_probs_midpoints+[1.0], calibrated_probs, linewidth=2, marker='o')
    
    plt.plot([0,1],[0,1],linestyle='--',color='gray')
    
    plt.legend(['original', 'calibrated', 'ideal'], fontsize=20)
    
    model_str_to_calibrator[model_str] = lr_model


print("  complete")

  complete

Home

Code samples - how to experiment with knobs

Resampling Notebook¶

Read Data files¶

Feature Columns¶

Option 1 (2017-2020)¶

Option 2 (2017-2019), 2020¶

Data Prep and cleaning¶

Write file¶

Create Train Test Validation Set¶

Oversample¶

Profile¶

Model Training¶

Make column transformer¶

All Model Architecture¶

RF pipeline¶

Gradient Boosted Decision Tree¶

Naive SVC¶

List of modeling architecture -¶

List of architecture to run¶

Model Selected¶

Calibrator¶

Use Calibration¶

clf = model_str_to_trained_model['rf']¶

Calibrator¶

Topics

Related Links