Data Preparation 4

Back To Index

2_data_preparation_4
In [1]:
###################################################
#  Filename : 2_data_preparation_4                #
#  Purpose : To demonstrate how to construct      #
#   a coding dictionary for a continuous/discrete # 
#   variable.                                     # 
#                                                 #
#  Author : Niel S.                               #
#  (c) The English Tea Company LLC                #
###################################################
import sys
sys.path.append('C:\\Users\\singa72\\Desktop\\Euler\\')

import Euler as Eu
from matplotlib import pyplot as plt

data_folder = 'C:\\Users\\singa72\\Desktop\\Tutorial2\\bank-additional\\bank-additional\\'
data_work   = data_folder+'data_work.db'

#M A I N   F U N C T I O N 
def main():
    conn = Eu.connection(data_work)
  
    try:
        #A list (tupple actually) of variables for probability encoding.
        cat_vars = ('marital','job', 'education', 'deflt', 
        'housing','contact','campaign','month', 'loan',
        'day_of_week','poutcome','previous','y')
        #Creating encoding dictionaries for variables.
        for cat_var in cat_vars:
            probability_encoding_categorical(varName=cat_var,conn=conn)
            Eu.run('select * from ?vv?_coding'.replace('?vv?',cat_var)
                   ,conn)
   
    
        cont_vars = ('age','duration','pdays','emp_var_rate',
                     'cons_price_idx','cons_conf_idx',
                     'euribor_m','nr_employed')

        for cont_var in cont_vars:
            min_max_encoding_continuous(cont_var,conn)
            Eu.run('select * from ?vv?_coding limit 10'.replace('?vv?',cont_var)
                   ,conn)
            

        
    except Exception as err:
        Eu.print_error(err)
    finally:
        conn.close()


def min_max_encoding_continuous(varName,conn):
    try:
        '''
        Min-Max Encoding: The code is obtained as follows:
            xi_code = (xi - min_x)/(max_x - min_x)
        '''
        
        sql = '''
        SELECT min(1.*?var?),max(1.*?var?)
        from bank
        '''.replace('?var?',varName)
        
        mmin,mmax = Eu.get_ncols_bysql(sql,2,conn)
        mmin,mmax = mmin[0],mmax[0]
        
        sql = '''
        drop table if exists ?var?_coding;
        create table ?var?_coding as
        select
        ?var?,
        ROUND(
        CASE
         WHEN ?max? - ?min? = 0.0 THEN -0.0001
         ELSE ( 1.*?var? - ?min?)/ (?max? - ?min?)
        END,4) code
        from bank
        '''
        sql = sql.replace('?var?',varName)
        sql = sql.replace('?min?',str(mmin))
        sql = sql.replace('?max?',str(mmax))
        Eu.execute(sql,conn)
        
    except Exception as err:
        Eu.print_error(err)



def probability_encoding_categorical(varName,conn):
    try:
        sql = '''
        DROP TABLE if exists ?vvar?_coding;
        CREATE TABLE ?vvar?_coding AS
        SELECT
        ?vvar?,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        ?vvar?,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by ?vvar?
        ) as T1
        '''.replace('?vvar?',varName)

        Eu.execute(sql,conn=conn)

    except Exception as err:
        Eu.print_error(err)
 
    


if __name__ == '__main__':
    main()
************************************************
*                    EULER                     *
*    A SQLITE POWERED DATA SCIENCE TOOLKIT     *
*          SINGH.AP79@GMAIL.NOSPAM.COM         *
************************************************


        DROP TABLE if exists marital_coding

        CREATE TABLE marital_coding AS
        SELECT
        marital,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        marital,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by marital
        ) as T1
        
============
marital,code
============
divorced,0.1032
married,0.1016
single,0.14
unknown,0.15
============


        DROP TABLE if exists job_coding

        CREATE TABLE job_coding AS
        SELECT
        job,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        job,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by job
        ) as T1
        
========
job,code
========
admin.,0.1297
blue-collar,0.0689
entrepreneur,0.0852
housemaid,0.1
management,0.1122
retired,0.2523
self-employed,0.1049
services,0.0814
student,0.3143
technician,0.1083
unemployed,0.142
unknown,0.1121
========


        DROP TABLE if exists education_coding

        CREATE TABLE education_coding AS
        SELECT
        education,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        education,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by education
        ) as T1
        
==============
education,code
==============
basic.4y,0.1025
basic.6y,0.082
basic.9y,0.0782
high.school,0.1084
illiterate,0.2222
professional.course,0.1135
university.degree,0.1372
unknown,0.145
==============


        DROP TABLE if exists deflt_coding

        CREATE TABLE deflt_coding AS
        SELECT
        deflt,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        deflt,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by deflt
        ) as T1
        
==========
deflt,code
==========
no,0.1288
unknown,0.0515
yes,-0.0001
==========


        DROP TABLE if exists housing_coding

        CREATE TABLE housing_coding AS
        SELECT
        housing,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        housing,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by housing
        ) as T1
        
============
housing,code
============
no,0.1088
unknown,0.1081
yes,0.1162
============


        DROP TABLE if exists contact_coding

        CREATE TABLE contact_coding AS
        SELECT
        contact,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        contact,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by contact
        ) as T1
        
============
contact,code
============
cellular,0.1474
telephone,0.0523
============


        DROP TABLE if exists campaign_coding

        CREATE TABLE campaign_coding AS
        SELECT
        campaign,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        campaign,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by campaign
        ) as T1
        
=============
campaign,code
=============
1,0.1304
10,0.0533
11,0.0678
12,0.024
13,0.0435
14,0.0145
15,0.0392
16,-0.0001
17,0.069
18,-0.0001
19,-0.0001
2,0.1146
20,-0.0001
21,-0.0001
22,-0.0001
23,0.0625
24,-0.0001
25,-0.0001
26,-0.0001
27,-0.0001
28,-0.0001
29,-0.0001
3,0.1075
30,-0.0001
31,-0.0001
32,-0.0001
33,-0.0001
34,-0.0001
35,-0.0001
37,-0.0001
39,-0.0001
4,0.0939
40,-0.0001
41,-0.0001
42,-0.0001
43,-0.0001
5,0.075
56,-0.0001
6,0.0766
7,0.0604
8,0.0425
9,0.0601
=============


        DROP TABLE if exists month_coding

        CREATE TABLE month_coding AS
        SELECT
        month,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        month,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by month
        ) as T1
        
==========
month,code
==========
apr,0.2048
aug,0.106
dec,0.489
jul,0.0905
jun,0.1051
mar,0.5055
may,0.0643
nov,0.1014
oct,0.4387
sep,0.4491
==========


        DROP TABLE if exists loan_coding

        CREATE TABLE loan_coding AS
        SELECT
        loan,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        loan,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by loan
        ) as T1
        
=========
loan,code
=========
no,0.1134
unknown,0.1081
yes,0.1093
=========


        DROP TABLE if exists day_of_week_coding

        CREATE TABLE day_of_week_coding AS
        SELECT
        day_of_week,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        day_of_week,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by day_of_week
        ) as T1
        
================
day_of_week,code
================
fri,0.1081
mon,0.0995
thu,0.1212
tue,0.1178
wed,0.1167
================


        DROP TABLE if exists poutcome_coding

        CREATE TABLE poutcome_coding AS
        SELECT
        poutcome,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        poutcome,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by poutcome
        ) as T1
        
=============
poutcome,code
=============
failure,0.1423
nonexistent,0.0883
success,0.6511
=============


        DROP TABLE if exists previous_coding

        CREATE TABLE previous_coding AS
        SELECT
        previous,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        previous,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by previous
        ) as T1
        
=============
previous,code
=============
0,0.0883
1,0.212
2,0.4642
3,0.5926
4,0.5429
5,0.7222
6,0.6
7,-0.0001
=============


        DROP TABLE if exists y_coding

        CREATE TABLE y_coding AS
        SELECT
        y,
        CASE 
        WHEN n_ppl_all*n_ppl_pass IS 0 THEN -0.0001
        ELSE round(n_ppl_pass/n_ppl_all,4) END code
        FROM
            (
        SELECT
        y,
        sum( CASE WHEN Y = 'yes' THEN 1. ELSE 0 END) n_ppl_pass,
        count(*) n_ppl_all
        from bank
        group by y
        ) as T1
        
======
y,code
======
no,-0.0001
yes,1.0
======


        drop table if exists age_coding

        create table age_coding as
        select
        age,
        ROUND(
        CASE
         WHEN 98.0 - 17.0 = 0.0 THEN -0.0001
         ELSE ( 1.*age - 17.0)/ (98.0 - 17.0)
        END,4) code
        from bank
        
========
age,code
========
56,0.4815
57,0.4938
37,0.2469
40,0.284
56,0.4815
45,0.3457
59,0.5185
41,0.2963
24,0.0864
25,0.0988
========


        drop table if exists duration_coding

        create table duration_coding as
        select
        duration,
        ROUND(
        CASE
         WHEN 4918.0 - 0.0 = 0.0 THEN -0.0001
         ELSE ( 1.*duration - 0.0)/ (4918.0 - 0.0)
        END,4) code
        from bank
        
=============
duration,code
=============
261,0.0531
149,0.0303
226,0.046
151,0.0307
307,0.0624
198,0.0403
139,0.0283
217,0.0441
380,0.0773
50,0.0102
=============


        drop table if exists pdays_coding

        create table pdays_coding as
        select
        pdays,
        ROUND(
        CASE
         WHEN 999.0 - 0.0 = 0.0 THEN -0.0001
         ELSE ( 1.*pdays - 0.0)/ (999.0 - 0.0)
        END,4) code
        from bank
        
==========
pdays,code
==========
999,1.0
999,1.0
999,1.0
999,1.0
999,1.0
999,1.0
999,1.0
999,1.0
999,1.0
999,1.0
==========


        drop table if exists emp_var_rate_coding

        create table emp_var_rate_coding as
        select
        emp_var_rate,
        ROUND(
        CASE
         WHEN 1.4 - -3.4 = 0.0 THEN -0.0001
         ELSE ( 1.*emp_var_rate - -3.4)/ (1.4 - -3.4)
        END,4) code
        from bank
        
=================
emp_var_rate,code
=================
1.1,0.9375
1.1,0.9375
1.1,0.9375
1.1,0.9375
1.1,0.9375
1.1,0.9375
1.1,0.9375
1.1,0.9375
1.1,0.9375
1.1,0.9375
=================


        drop table if exists cons_price_idx_coding

        create table cons_price_idx_coding as
        select
        cons_price_idx,
        ROUND(
        CASE
         WHEN 94.767 - 92.201 = 0.0 THEN -0.0001
         ELSE ( 1.*cons_price_idx - 92.201)/ (94.767 - 92.201)
        END,4) code
        from bank
        
===================
cons_price_idx,code
===================
93.994,0.6988
93.994,0.6988
93.994,0.6988
93.994,0.6988
93.994,0.6988
93.994,0.6988
93.994,0.6988
93.994,0.6988
93.994,0.6988
93.994,0.6988
===================


        drop table if exists cons_conf_idx_coding

        create table cons_conf_idx_coding as
        select
        cons_conf_idx,
        ROUND(
        CASE
         WHEN -26.9 - -50.8 = 0.0 THEN -0.0001
         ELSE ( 1.*cons_conf_idx - -50.8)/ (-26.9 - -50.8)
        END,4) code
        from bank
        
==================
cons_conf_idx,code
==================
-36.4,0.6025
-36.4,0.6025
-36.4,0.6025
-36.4,0.6025
-36.4,0.6025
-36.4,0.6025
-36.4,0.6025
-36.4,0.6025
-36.4,0.6025
-36.4,0.6025
==================


        drop table if exists euribor_m_coding

        create table euribor_m_coding as
        select
        euribor_m,
        ROUND(
        CASE
         WHEN 5.045 - 0.634 = 0.0 THEN -0.0001
         ELSE ( 1.*euribor_m - 0.634)/ (5.045 - 0.634)
        END,4) code
        from bank
        
==============
euribor_m,code
==============
4.857,0.9574
4.857,0.9574
4.857,0.9574
4.857,0.9574
4.857,0.9574
4.857,0.9574
4.857,0.9574
4.857,0.9574
4.857,0.9574
4.857,0.9574
==============


        drop table if exists nr_employed_coding

        create table nr_employed_coding as
        select
        nr_employed,
        ROUND(
        CASE
         WHEN 5228.1 - 4963.6 = 0.0 THEN -0.0001
         ELSE ( 1.*nr_employed - 4963.6)/ (5228.1 - 4963.6)
        END,4) code
        from bank
        
================
nr_employed,code
================
5191,0.8597
5191,0.8597
5191,0.8597
5191,0.8597
5191,0.8597
5191,0.8597
5191,0.8597
5191,0.8597
5191,0.8597
5191,0.8597
================

Back To Index