Data Exploration 2

Back To Index

1_data_exploration_2
In [2]:
###################################################
#  Filename : 1_data_exploration_2                #
#  Purpose : To develop data exploration function #
#   using Euler toolkit                           #
#            1. Use generic function for          # 
#            exploring the continuous variables.  #
#  Author : Niel S.                               #
#  (c) The English Tea Company LLC                #
###################################################
import sys
sys.path.append('C:\\Users\\singa72\\Desktop\\Euler\\')

import Euler as Eu
from matplotlib import pyplot as plt

# Folder holding the bank-additional tutorial data, and the path of the
# file that main() hands to Eu.connection.
data_folder = (
    'C:\\Users\\singa72\\Desktop\\Tutorial2'
    '\\bank-additional\\bank-additional\\'
)
data_work = '{}data_work.db'.format(data_folder)

def main():
    """Plot histograms of the 'age' and 'duration' columns and show them.

    Opens a connection on ``data_work``, builds one figure per variable via
    ``analyze_continuous_var``, applies per-plot axis tweaks and finally
    displays all figures. Any exception is reported through
    ``Eu.print_error``; the connection is closed on every path where it was
    actually opened.
    """
    # Bind the name up front so the ``finally`` clause is safe even when
    # Eu.connection itself raises (otherwise ``conn`` would be unbound there
    # and the resulting UnboundLocalError would mask the real error).
    conn = None
    try:
        conn = Eu.connection(data_work)

        # Call analyze_continuous_var on 'age'
        f1,a1 = analyze_continuous_var(
                varName='age',varLabel = 'Age',
                minVar=5,maxVar=105,step_size=10,conn= conn)
        a1.set_xlabel('Age')
        a1.legend()

        # Call analyze_continuous_var on 'duration'
        f1,a1 = analyze_continuous_var(
                varName='duration',varLabel = 'Call Duration',
                minVar=0,maxVar=2000,step_size=100,conn= conn)
        # Some specific settings: the tick labels are rotated for this plot
        a1.xaxis.set_tick_params(rotation=45)
        a1.set_xlabel('Time (s)')
        a1.legend()

        plt.show()

    except Exception as err:
        Eu.print_error(err)
    finally:
        if conn is not None:
            conn.close()


def analyze_continuous_var(varName,varLabel,minVar,maxVar,step_size,conn):
    """Draw a step histogram of one column of the ``bank`` table.

    Args:
        varName: Column name. It is substituted verbatim into the SQL text,
            so it must come from trusted code, never from user input.
        varLabel: Legend label for the histogram line.
        minVar: Left edge of the first bin.
        maxVar: Upper end of the binned range. It is the right edge of the
            last bin when ``maxVar - minVar`` is a whole multiple of
            ``step_size``; otherwise the last edge is the largest
            ``minVar + k*step_size`` not exceeding it.
        step_size: Width of every bin.
        conn: Connection object handed to ``Eu.get_ncols_bysql``.

    Returns:
        ``(figure, axes)`` on success. If anything raises, the error is
        reported through ``Eu.print_error`` and the function falls through,
        returning ``None``.
    """
    try:
        # The column name is spliced into the statement text with a
        # placeholder replace rather than bound as a parameter.
        sql ='''
        select 1*?var?
        from bank
        '''.replace('?var?',varName)
        var_data = Eu.get_ncols_bysql(sql,1,conn)

        nbins = int((maxVar-minVar)/step_size)
        # nbins bins need nbins+1 edges. Generating only nbins edges dropped
        # the last bin, so values in the final step below maxVar were
        # silently left out of the plot.
        bin_edges = [minVar+i*step_size for i in range(nbins+1)]

        ### Draw a histogram for the requested variable.
        fig1, ax1 = plt.subplots(1,1)
        n1,b1,_ = ax1.hist(var_data,
                           bins=bin_edges,
                           histtype='step',
                           linewidth=2,
                           color='navy',
                           label = varLabel
                           )

        ### Put one tick at the centre of every bin (each left edge + half a step)
        ax1.set_xticks([edge+0.5*step_size for edge in bin_edges[:-1]])
        ### Erase the right and top frame spines
        ax1.spines['right'].set_color('none')
        ax1.spines['top'].set_color('none')
        ### Set axes labels
        ax1.set_ylabel('Number of People')
        ### y-axis height: tallest bin + 20% extra headroom
        ymax = max(n1.tolist())
        ylim = ymax+0.20*ymax
        ax1.set_ylim([0,ylim])
        ax1.legend()
        Eu.add_vals_ontop(ax1,b1,n1,threshold=10)
        return fig1,ax1

    except Exception as err:
        Eu.print_error(err)

# Run the exploration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Back To Index