Data Preparation 6

Back To Index

Untitled1
In [2]:
###################################################
#  Filename : 2_data_preparation_6                #
#  Purpose : To explore the distributions of      #
#    encoded data across the positive and         # 
#    negative class categories                    #
#  Author : Niel S.                               #
#  (c) The English Tea Company LLC                #
###################################################
import sys
sys.path.append('C:\\Users\\singa72\\Desktop\\Euler\\')

import Euler as Eu
from matplotlib import pyplot as plt

# Hard-coded absolute Windows path of the folder that holds the working data.
# NOTE(review): machine-specific; adjust before running on another machine.
data_folder = 'C:\\Users\\singa72\\Desktop\\Tutorial2\\bank-additional\\bank-additional\\'
# Full path of the database file; main() hands this to Eu.connection.
data_work   = data_folder+'data_work.db'

#M A I N   F U N C T I O N 
def main():
    """Open the working database and plot the class split of ``marital``.

    A connection is obtained from ``Eu.connection(data_work)``; the coded
    ``marital`` variable is then plotted through ``analyze_continuous_var``
    and its x tick labels are rotated by 90 degrees. Any exception raised
    after the connection is opened is reported via ``Eu.print_error``.
    """
    # NOTE(review): the connection is never explicitly closed in this function.
    conn = Eu.connection(data_work)
    try:
        # Plot and compare the distributions of the coded variable "marital".
        marital_plot = analyze_continuous_var(
            varName='marital',
            varLabel='marital',
            minVar=-0.2,
            maxVar=1.3,
            step_size=0.1,
            conn=conn,
        )
        # Unpacking fails (and is reported below) if the helper returned None.
        fig_marital, ax_marital = marital_plot

        ax_marital.xaxis.set_tick_params(rotation=90)

        # Exercise: do the same for every variable in the following list.
        # Bonus: try to be creative rather than using brute force.
        vvarsL = (
            'marital', 'job', 'education', 'deflt',
            'housing', 'contact', 'campaign', 'month', 'loan',
            'day_of_week', 'poutcome', 'previous',
            'age', 'duration', 'pdays', 'emp_var_rate',
            'cons_price_idx', 'cons_conf_idx',
            'euribor_m', 'nr_employed', 'y',
        )

    except Exception as err:
        Eu.print_error(err)

def analyze_continuous_var(varName,varLabel,minVar,maxVar,step_size,conn):
    """Plot overlaid histograms of one coded variable, split by outcome class.

    Two queries are run against the ``bank_coded`` table: rows where
    ``y*1 != 1`` (drawn as a navy outline labelled ``<varLabel>_Failure``)
    and rows where ``y*1 = 1`` (drawn as a filled red histogram labelled
    ``<varLabel>_Success``). Both series share the same bin edges.

    Parameters
    ----------
    varName : str
        Column name substituted verbatim into the SQL text. It is NOT
        escaped, so it must come from trusted code, never from user input.
    varLabel : str
        Prefix used for the two legend entries.
    minVar, maxVar : float
        Lower and upper bound of the histogram range. The bin edges start at
        ``minVar`` and the last edge is at or just above ``maxVar``.
    step_size : float
        Width of each bin; must be positive.
    conn :
        Connection object forwarded to ``Eu.get_ncols_bysql``.

    Returns
    -------
    tuple or None
        ``(figure, axes)`` on success. If anything raises, the error is
        reported through ``Eu.print_error`` and ``None`` is returned.
    """
    try:
        if step_size <= 0:
            raise ValueError('step_size must be positive')
        if maxVar <= minVar:
            raise ValueError('maxVar must be greater than minVar')

        # One query text for both classes; only the comparison on y differs.
        sql_template = '''
        select 1*?var?
        from bank_coded where
        y*1 ?op? 1
        '''.replace('?var?',varName)

        # presumably each call returns the values of the single selected
        # column as a sequence -- TODO confirm against Euler.get_ncols_bysql
        var_data = Eu.get_ncols_bysql(sql_template.replace('?op?','!='),1,conn)
        var_data2 = Eu.get_ncols_bysql(sql_template.replace('?op?','='),1,conn)

        # Number of bins: round instead of truncating so that float noise
        # (e.g. 0.3/0.1 == 2.9999999999999996) does not drop a whole bin.
        span_in_steps = (maxVar-minVar)/step_size
        nbins = int(round(span_in_steps))
        if nbins < span_in_steps - 1e-9:
            # Range is not a whole number of steps: add a bin to reach maxVar.
            nbins += 1

        # A bin sequence is interpreted as EDGES, so nbins bins need nbins+1
        # edges. Without the final edge the top step of the range is dropped.
        var_buckets = [minVar+i*step_size for i in range(nbins+1)]

        ### Draw both class histograms on a single set of axes.
        fig1, ax1 = plt.subplots(1,1)

        n1,b1,_ = ax1.hist(var_data,
                          bins=var_buckets,
                          histtype='step',
                          linewidth=2,
                          color='navy',
                          label = varLabel+'_Failure'
                          )

        n2,_,_ = ax1.hist(var_data2,
                          bins=var_buckets,
                          histtype='stepfilled',
                          linewidth=2,
                          color='red',
                          label = varLabel+'_Success'
                          )

        ### Configure the x-axis ticks (the first two edges are left unticked)
        ax1.set_xticks(var_buckets[2:])
        ### Erase the right and top frame spines
        ax1.spines['right'].set_color('none')
        ax1.spines['top'].set_color('none')
        ### Set axes labels
        ax1.set_ylabel('Number of People')
        ### y-axis height: tallest bar of EITHER class + 40% headroom, so the
        ### success histogram is not clipped when it exceeds the failure one.
        ymax = max(n1.tolist()+n2.tolist())
        if ymax > 0:
            ax1.set_ylim([0,ymax+0.40*ymax])
        ax1.legend()
        # Annotate the failure-class bars via the Euler helper.
        Eu.add_vals_ontop(ax1,b1,n1,threshold=10)
        return fig1,ax1

    except Exception as err:
        Eu.print_error(err)

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()

Back To Index