Data Exploration-1

Back To Index

Aim : Developing a generic data exploration function.

1_data_exploration_0
In [5]:
###################################################
#  Filename : 1_data_exploration_0                #
#  Purpose : To develop data exploration function #
#   using Euler toolkit                           #
#            1. Use generic function for          # 
#            continuous variable inspection       #
#  Author : Niel S.                               #
#  (c) The English Tea Company LLC                #
###################################################

# The following lines add the Euler toolkit folder to the module search path
import sys
sys.path.append('C:\\Users\\singa72\\Desktop\\Euler\\')
import Euler as Eu


from matplotlib import pyplot as plt


# Folder that holds the tutorial data files.
data_folder = 'C:\\Users\\singa72\\Desktop\\Tutorial2\\bank-additional\\bank-additional\\'
# Full path of the database file handed to Eu.connection() in main().
data_work = ''.join((data_folder, 'data_work.db'))



#M A I N   F U N C T I O N  
def main():
    """Explore the ``age`` column of the ``bank`` table and plot histograms.

    Steps performed:
      1. Fetch ``age`` for the first five rows as stored and print the list.
      2. Fetch ``1*age`` for the same rows (numeric form) and print the list.
      3. Fetch ``1*age`` for every row and draw two histograms: a plain one
         with hand-written bucket edges, and a styled one whose bucket
         edges are computed from ``min_age``, ``max_age`` and ``step_size``.

    Any exception is reported through ``Eu.print_error``.  The database
    connection is closed on the way out only if it was actually opened.
    """
    # Bound before the try so the finally clause can tell whether
    # Eu.connection() succeeded.
    conn = None
    try:
        conn = Eu.connection(data_work)

        # SQL to fetch age from the first five records
        sql = '''
        select age from bank limit 5
        '''

        # Pull the fetched data into the program in the following format
        # format: [age_1, age_2, age_3, age_4, age_5]
        age = Eu.get_ncols_bysql(sql, 1, conn)

        # Note : All values are in string format
        print(age)

        ## SQL to fetch age from the first five records in numeric format
        sql = '''
        select 1*age from bank limit 5
        '''

        ## Pull the fetched data into the program in the following format
        ## format: [age_1, age_2, age_3, age_4, age_5]
        age = Eu.get_ncols_bysql(sql, 1, conn)

        ## Note : All values are in numeric format
        print(age)

        ### Visual analysis of variable age
        sql = '''
        select 1*age from bank
        '''
        age = Eu.get_ncols_bysql(sql, 1, conn)
        age_buckets = [5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 105]
        ### Draw a histogram for the age variable.
        fig, ax = plt.subplots(1, 1)
        ax.hist(age, bins=age_buckets)

        ### Adding some cosmetics
        ### Try using the following way to build the buckets
        min_age, max_age, step_size = 5, 105, 10.
        nbins = int((max_age - min_age) / step_size)
        # nbins buckets need nbins + 1 edges; without the closing edge
        # (max_age) the last bucket would be silently dropped.
        age_buckets = [min_age + i * step_size for i in range(nbins + 1)]
        ### Draw a histogram for the age variable.
        fig1, ax1 = plt.subplots(1, 1)
        # n1 (counts) and b1 (edges) are kept for the optional steps below.
        n1, b1, _ = ax1.hist(age,
                             bins=age_buckets,
                             histtype='step',
                             linewidth=2,
                             color='navy',
                             label='Age'
                             )

        ### Set axis ticks at the centre of every bucket
        #### ax1.set_xticks([b + step_size / 2 for b in age_buckets[:-1]])
        ### Erase the right and top frame spines
        #### ax1.spines['right'].set_color('none')
        #### ax1.spines['top'].set_color('none')
        ### Set axes labels
        #### ax1.set_xlabel('Age')
        #### ax1.set_ylabel('Number of People')
        ### What should be the y-axis height?
        ### y_max_value + 20% extra
        #### ymax = max(n1.tolist())
        #### ylim = ymax + 0.20 * ymax
        #### ax1.set_ylim([0, ylim])
        #### ax1.legend()

        # Use some of Euler's magic to add numbers on top.
        ### Eu.add_vals_ontop(ax1, b1, n1, threshold=10)

        # EXERCISE: Fix the function below
        # and uncomment the following code
        #
        # analyze_var(varName='age',
        #             minVar=5, maxVar=105,
        #             conn=conn)

        plt.show()

    except Exception as err:
        Eu.print_error(err)
    finally:
        # Closing an unopened connection would raise and hide the real error.
        if conn is not None:
            conn.close()


def analyze_var(varName, minVar, maxVar, conn, step_size=10.):
    """Draw a step histogram of one numeric column of the ``bank`` table.

    Parameters:
        varName   -- column name; it is substituted into the SQL text, so
                     it must be a trusted identifier (see the note below).
        minVar    -- lower edge of the first bucket.
        maxVar    -- upper edge of the last bucket.
        conn      -- open connection as returned by ``Eu.connection``.
        step_size -- bucket width (default 10, the value previously
                     hard-coded).

    The figure is created but not shown; the caller is expected to call
    ``plt.show()``.  Any exception, including invalid bucket settings, is
    reported through ``Eu.print_error`` rather than propagated.
    """
    try:
        if step_size <= 0 or maxVar <= minVar:
            raise ValueError('analyze_var needs step_size > 0 and maxVar > minVar')

        # NOTE: a column name cannot be bound as a SQL parameter, so it is
        # spliced into the statement; never pass untrusted text as varName.
        sql = '''
        select 1*?var?
        from bank
        '''.replace('?var?', varName)
        var_data = Eu.get_ncols_bysql(sql, 1, conn)

        nbins = int((maxVar - minVar) / step_size)
        # nbins buckets need nbins + 1 edges so the last bucket is kept.
        var_buckets = [minVar + i * step_size for i in range(nbins + 1)]

        ### Draw a histogram for the requested variable.
        fig1, ax1 = plt.subplots(1, 1)
        n1, b1, _ = ax1.hist(var_data,
                             bins=var_buckets,
                             histtype='step',
                             linewidth=2,
                             color='navy',
                             label=varName
                             )

        ### Set axis ticks at the centre of every bucket
        ax1.set_xticks([b + step_size / 2. for b in var_buckets[:-1]])
        ### Erase the right and top frame spines
        ax1.spines['right'].set_color('none')
        ax1.spines['top'].set_color('none')
        ### Set axes labels
        ax1.set_xlabel(varName)
        ax1.set_ylabel('Number of People')
        ### y-axis height: tallest bucket + 20% extra
        ymax = max(n1.tolist())
        if ymax > 0:
            # With no data in range keep matplotlib's default limits.
            ax1.set_ylim([0, ymax + 0.20 * ymax])
        ax1.legend()
        # Use some of Euler's magic to add numbers on top.
        ### Eu.add_vals_ontop(ax1, b1, n1, threshold=10)

    except Exception as err:
        Eu.print_error(err)

# Run the exploration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
['56', '57', '37', '40', '56']
[56, 57, 37, 40, 56]

Back To Index