Data Exploration-3

Back To Index

1_data_exploration_3
In [3]:
###################################################
#  Filename : 1_data_exploration_3                #
#  Purpose : To develop data exploration function #
#   using Euler toolkit                           #
#            1. Use generic function for          # 
#            exploring the discrete variables.    #
#  Author : Niel S.                               #
#  (c) The English Tea Company LLC                #
###################################################
import sys
sys.path.append('C:\\Users\\singa72\\Desktop\\Euler\\')

import Euler as Eu
from matplotlib import pyplot as plt

data_folder = 'C:\\Users\\singa72\\Desktop\\Tutorial2\\bank-additional\\bank-additional\\'
data_work   = data_folder+'data_work.db'
def main():
    try:
        conn = Eu.connection(data_work)
        
        '''
        Uncomment following and fix errors
        '''

        #Call analyze_var on 'age'        
        ##f1,a1 = analyze_continuous_var(
        ##        varName='age',varLabel = 'Age',
        ##        minVar=5,maxVar=105,step_size=10,conn= conn)
        ##a1.set_xlabel('Age')
        ##a1.legend()
        
        ##Call analyze_var on 'duration'
        ###f1,a1 = analyze_continuous_var(
        ###        varName='duration',varLabel = 'Call Duration',
        ###        minVar=0,maxVar=2000,step_size=100,conn= conn)
        ##Some specific settings
        ###a1.xaxis.set_tick_params(rotation=45)
        ###a1.set_xlabel('Time (s)')        
        ###a1.legend()
        
        analyze_discrete_var(varName='marital',
                             varLabel='Marital Status'
                             ,conn=conn)
        
        #Uncomment following and study the visuals
        f1,a1,f2,(a21,a22) = analyze_discrete_var(varName='job',varLabel='Job',conn=conn)
        #analyze_discrete_var(varName='education',varLabel='Education',conn=conn)
        #analyze_discrete_var(varName='deflt',varLabel='Loan Default',conn=conn)
        #analyze_discrete_var(varName='housing',varLabel='Housing Status',conn=conn)
        #analyze_discrete_var(varName='contact',varLabel='Medium of Contact',conn=conn)
        #analyze_discrete_var(varName='nr_employed',varLabel='nr_employed',conn=conn)
        
        ##Do following : What does the result suggest?
        #analyze_discrete_var(varName='euribor_m',varLabel='Euribor3m',conn=conn)
        
        ###
        ###DISCUSSION : What can be done to fix these?
        ###
        ##Call analyze_continous_var on 'euribor_m'
        #Eu.run('select distinct euribor_m from bank',conn)
        #f3,a3 = analyze_continuous_var(
        #        varName='euribor_m',varLabel = 'Call Duration',
        #        minVar=0,maxVar=5,step_size=0.1,conn= conn)
        #Some specific settings
        #a3.xaxis.set_tick_params(rotation=45)
        #a3.set_xlabel('Time (s)')        
        #a3.legend()
        
                          
        plt.show()       
       
    except Exception as err:
        Eu.print_error(err)
    finally:
        conn.close()

#inspect_categorical_var(varName='marital',conn=conn)


def analyze_discrete_var(varName,varLabel,conn):
    try:
        
        #Getting all the possible values of variable and the counts
        sql ='''
        select ?var?, count(*) npeople
        from bank group by ?var?
        '''.replace('?var?',varName)
        vvar, freqs = Eu.get_ncols_bysql(sql,2,conn)
        
        #Make a Pie chart to show the counts
        fig1,ax1 = plt.subplots(1,1)
        ax1.pie(freqs,labels=vvar)
        ax1.set_title(varLabel)
        
        
        #Doing the bar plots compare success vs failing sample
        #A sql query to fetch only failing records
        sql = '''
        select ?var?,count(*) from
        bank where y = 'no'
        group by ?var?
        '''.replace('?var?',varName)
        vvar2, freqs2 = Eu.get_ncols_bysql(sql,2,conn)
        
        #Some dictionary stuff to ensure ordering
        vvar2_freq2 = dict(zip(vvar2,freqs2))
        freqs2_n = [vvar2_freq2[vv] \
                   if vv in vvar2_freq2.keys() else 0\
                   for vv in vvar]
        
        freqs2 = freqs2_n
        
        fig2,(ax2,ax2_b) = plt.subplots(2,1,sharex=True)
        width = 0.25
        xvar2 = [i-width/2 for i in range(0,len(vvar))]
        ax2.bar(xvar2,freqs2,width,label='Failure')
        ax2.set_ylabel('People')
        
        #Changing the SQL query to fetch success records
        sql = sql.replace("'no'","'yes'")
        vvar1, freqs1 = Eu.get_ncols_bysql(sql,2,conn)
        #Some dictionary stuff to ensure ordering
        vvar1_freq1 = dict(zip(vvar1,freqs1))
        freqs1_n = [vvar1_freq1[vv] \
                   if vv in vvar1_freq1.keys() else 0\
                   for vv in vvar]
        freqs1 = freqs1_n

        xvar1 = [i+width/2 for i in range(0,len(vvar))]
        ax2.bar(xvar1,freqs1,width,label='success')
        
        #Setting appropriate ticks
        ax2_b.set_xticks([x for x in range(0,len(vvar))])
        ax2_b.set_xticklabels(vvar,rotation='90')
        
        #Getting the probability of success
        ### freqs = [Total number of people] 
        ### freqs1 = [Total number of successes]
        ### [Probability] =  [freqs1]/[freqs]
        prob = [f2/f if f>0 else 0 for f,f2 in zip(freqs,freqs1)]
        xvar = [i for i in range(0,len(prob))]
        ax2_b.plot(xvar,prob,marker='^',color='red')
        ax2_b.set_ylabel('Probability (Success)')
        ax2.legend()
        
        ax2.spines['right'].set_color('none')
        ax2.spines['top'].set_color('none')
        ax2_b.spines['top'].set_color('none')
        ax2_b.spines['right'].set_color('none')

        ax2.set_title(varLabel)
        return fig1,ax1,fig2,(ax2,ax2_b)

    except Exception as err:
        Eu.print_error(err)




def analyze_continuous_var(varName,varLabel,minVar,maxVar,step_size,conn):
    try:
        sql ='''
        select 1*?var?
        from bank
        '''.replace('?var?',varName)
        var_data = Eu.get_ncols_bysql(sql,1,conn)
        
       
        nbins = int((maxVar-minVar)/step_size)
        var_buckets = [minVar+i*step_size for i in range(0,nbins)]
        ### Draw a histogram for the age variable.
        fig1, ax1 = plt.subplots(1,1)
        n1,b1,p = ax1.hist(var_data,
                          bins=var_buckets,
                          histtype='step',
                          linewidth=2,
                          color='navy',
                          label = varLabel
                          )


        ###Set aXis ticks configured
        ax1.set_xticks([b+0.5*step_size for b in var_buckets])
        ###Erase the vertical frame spines
        ax1.spines['right'].set_color('none')
        ax1.spines['top'].set_color('none')
        ###Set aXes lables
        ax1.set_ylabel('Number of People')
        ###What should be the y-axis height?
        ###y_max_value + 20% extra
        ymax = max(n1.tolist())
        ylim = ymax+0.20*ymax
        ax1.set_ylim([0,ylim])
        ax1.legend()
        Eu.add_vals_ontop(ax1,b1,n1,threshold=10)
        return fig1,ax1

    except Exception as err:
        Eu.print_error(err)

if __name__ == '__main__':
    main()

Back To Index