Data Preparation-3

Back To Index

2_data_preparation_3
In [ ]:
###################################################
#  Filename : 2_data_preparation_3                #
#  Purpose : To demonstrate how to construct      #
#   a coding dictionary for a discrete variable.  # 
#   We use likelihood encoding.                   #
#  Author : Niel S.                               #
#  (c) The English Tea Company LLC                #
###################################################
import sys
sys.path.append('C:\\Users\\singa72\\Desktop\\Euler\\')

import Euler as Eu
from matplotlib import pyplot as plt

data_folder = 'C:\\Users\\singa72\\Desktop\\Tutorial2\\bank-additional\\bank-additional\\'
data_work   = data_folder+'data_work.db'

#M A I N   F U N C T I O N 
def main():
    #Make probability encodings for all the categorical 
    ### variables.
    conn = Eu.connection(data_work)
    try:
           
        probability_encoding_categorical('marital',conn)
        probability_encoding_categorical(varName='job',conn=conn)
        probability_encoding_categorical(varName='education',conn=conn)
        probability_encoding_categorical(varName='deflt',conn=conn)
        probability_encoding_categorical(varName='housing',conn=conn)
        probability_encoding_categorical(varName='contact',conn=conn)
        probability_encoding_categorical(varName='campaign',conn=conn)
        probability_encoding_categorical(varName='month',conn=conn)
        probability_encoding_categorical(varName='day_of_week',conn=conn)
        probability_encoding_categorical(varName='poutcome',conn=conn)
        probability_encoding_categorical(varName='y',conn=conn)
  
        Eu.run('select * from y_coding',conn)


        #Challenge : Suggest a way to reduce the number of lines
        ## in the code above.

    except Exception as err:
        Eu.print_error(err)
    finally:
        conn.close()
    
 

def probability_encoding_categorical(varName,conn):
    try:
        sql = '''
        DROP TABLE if exists ?vvar?_coding;
        CREATE TABLE ?vvar?_coding AS
        SELECT
        ?vvar?,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN 0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        ?vvar?,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by ?vvar?
        ) as T1
        '''.replace('?vvar?',varName)

        Eu.execute(sql,conn=conn)

    except Exception as err:
        Eu.print_error(err)
 
    
def main_alternative():
    conn = Eu.connection(data_work)
    try:
        cat_vars = ('marital','job', 'education', 'deflt', 
        'housing','contact','campaign','month', 
        'day_of_week','poutcome','y')
  
        for cat_var in cat_vars:
            probability_encoding_categorical(varName='y',conn=conn)
            Eu.run('select * from ?vv?_coding'.replace('?vv?',cat_var)
                   ,conn)

    
   
    except Exception as err:
        Eu.print_error(err)
    finally:
        conn.close()



if __name__ == '__main__':
    main()

Back To Index