Data Modeling

Back to Index

3_data_modeling
In [1]:
###################################################
#  Filename : 3_data_modeling_1                #
#  Purpose : Create a predictive model for        #
#   conversion probability and store it.          #
#                                                 #
#  Author : Niel S.                               #
#  (c) The English Tea Company LLC                #
###################################################

import sys
# Make the project-local Euler module importable; this is a hard-coded,
# machine-specific Windows path and must be adjusted on other machines.
sys.path.append('C:\\Users\\singa72\\Desktop\\Euler\\')

import Euler as Eu
# NOTE(review): pyplot is not used by the code visible in this cell.
from matplotlib import pyplot as plt

# Folder holding the working data; the trained model is also written here.
data_folder = 'C:\\Users\\singa72\\Desktop\\Tutorial2\\bank-additional\\bank-additional\\'
# Database file handed to Eu.connection() in main()
# (presumably a SQLite database -- confirm against the Euler module).
data_work   = data_folder+'data_work.db'

from sklearn.linear_model import LogisticRegression
# NOTE(review): metrics is not referenced in the visible code, and svm is
# referenced only from commented-out lines inside main().
from sklearn import metrics
from sklearn import svm

def _confusion_counts(actual, predicted):
    """Tally the 2x2 confusion matrix for two parallel label sequences.

    A label equal to 0 is treated as negative; anything else as positive.

    Returns:
        A tuple ``(tt, tf, ft, ff)`` where the first letter refers to the
        actual label and the second to the predicted one (t = positive,
        f = negative).
    """
    tt = tf = ft = ff = 0
    for yi, yhi in zip(actual, predicted):
        if yi != 0 and yhi != 0:
            tt += 1
        elif yi != 0:
            tf += 1
        elif yhi != 0:
            ft += 1
        else:
            ff += 1
    return tt, tf, ft, ff


def _rounded_ratio(numerator, denominator):
    """Return numerator/denominator rounded to 2 places, or 0.0 if undefined."""
    if not denominator:
        # No positive predictions (precision) or no positive actuals
        # (recall): report 0.0 instead of raising ZeroDivisionError.
        return 0.0
    return round(float(numerator) / denominator, 2)


def main():
    """Train a logistic-regression model on bank_coded data and store it.

    Steps, all performed through the Euler helper on the ``data_work``
    database:

    1. Copy ``bank_coded`` into ``bank_coded_rndm`` in random order.
    2. Recode the target column ``y`` to 0/1.
    3. Use the first ~70% of rows (by rowid) for training and the
       remainder for testing.
    4. Fit ``LogisticRegression``, print the predicted probabilities for
       the test rows together with precision and recall.
    5. Dump the fitted model to ``logistic_regression.pkl`` inside
       ``data_folder``.

    Relies on the module-level names ``Eu``, ``data_work``,
    ``data_folder``, ``vvarsL`` and ``LogisticRegression``.
    """
    conn = Eu.connection(data_work)

    # Randomize the dataset
    Eu.execute('''
           drop table if exists bank_coded_rndm;
           create table bank_coded_rndm as
           SELECT * FROM bank_coded
           order by random(), random();
           ''',conn=conn)

    # It is easier to do the analysis when the
    # target variable is coded as 0 or 1.
    Eu.execute(
    '''
    UPDATE bank_coded_rndm
    SET y = (
    CASE WHEN 1*y < 0 THEN 0
    ELSE 1 END
    ) ''',conn )

    # S E T T I N G  U P   A  M O D E L
    # - - - - - - - - - - - - - - - - - - - - - - -

    # Step 1: split the historical dataset into a
    # training set and a testing set.
    # Total number of rows in the randomized table.
    sql = 'select count(*) from bank_coded_rndm'
    max_recs = Eu.get_ncols_bysql(sql,1,conn)[0]
    ntrain = int((70./100)*max_recs)

    # NOTE(review): the training slice is "rowid < ntrain"; if rowids
    # start at 1 that is ntrain-1 rows -- confirm this is intended.
    sql = '''
    SELECT
     ?var_list?
    from bank_coded_rndm
    where rowid < ?ntrain?
    order by rowid asc
    '''
    # Feature columns only: drop the target by name rather than relying
    # on 'y' being the last entry of vvarsL.
    x_var_list = ['1.00*'+v for v in vvarsL if v != 'y']
    x_var_list = ',\n     '.join(x_var_list)

    sql_x_train = sql.replace('?var_list?',x_var_list)
    sql_x_train = sql_x_train.replace('?ntrain?',str(ntrain))

    sql_y_train = sql.replace('?var_list?','1*y')
    sql_y_train = sql_y_train.replace('?ntrain?',str(ntrain))

    # The test queries are the complement of the training queries.
    sql_x_test = sql_x_train.replace('rowid <','rowid >=')
    sql_y_test = sql_y_train.replace('rowid <','rowid >=')

    # Load the training data into memory.
    train_x = Eu.get_bysql(query=sql_x_train,conn=conn)
    train_y = Eu.get_bysql(query=sql_y_train,conn=conn)

    # The model expects a flat sequence of labels, so take the first
    # element of every returned row.
    train_y = [y[0] for y in train_y]
    model = LogisticRegression()
    model.fit(train_x, train_y)

    # Load the test data into memory and flatten the labels the same way.
    test_x = Eu.get_bysql(query=sql_x_test,conn=conn)
    test_y = Eu.get_bysql(query=sql_y_test,conn=conn)
    test_y = [y[0] for y in test_y]

    # Predictions for the test set: hard labels and the probability of
    # the second class (column 1 of predict_proba).
    test_yhat = model.predict(test_x)
    test_yhat_proba = model.predict_proba(test_x)[:,1]
    print(test_yhat_proba)

    # Measure the model performance.
    tt, tf, ft, ff = _confusion_counts(test_y, test_yhat)
    model_prec = _rounded_ratio(tt, tt + ft)
    model_recall = _rounded_ratio(tt, tt + tf)

    print ('The Model Performance')
    print ('model_prec: ',model_prec)
    print ('model_recall: ',model_recall)

    # Finally, store the trained model.
    try:
        import joblib
    except ImportError:
        # Older scikit-learn releases shipped joblib under sklearn.externals.
        from sklearn.externals import joblib

    # The context manager closes the file even if dump() raises.
    with open(data_folder+'logistic_regression.pkl','wb') as f:
        joblib.dump(model,f)
    
    
    
# Columns of bank_coded_rndm that main() reads when building its queries.
# The target column 'y' is the last entry; main() leaves it out of the
# feature query.
vvarsL = [
    'marital',
    'job',
    'education',
    'deflt',
    'housing',
    'contact',
    'campaign',
    'month',
    'loan',
    'day_of_week',
    'poutcome',
    'previous',
    'age',
    'duration',
    'pdays',
    'emp_var_rate',
    'cons_price_idx',
    'cons_conf_idx',
    'euribor_m',
    'nr_employed',
    'y',
]
# NOTE: 'duration' does not seem like a variable that is known
# beforehand, yet it is currently part of the list above.

if __name__ == '__main__':
    main()
************************************************
*                    EULER                     *
*    A SQLITE POWERED DATA SCIENCE TOOLKIT     *
*          SINGH.AP79@GMAIL.NOSPAM.COM         *
************************************************


           drop table if exists bank_coded_rndm

           create table bank_coded_rndm as
           SELECT * FROM bank_coded
           order by random(), random()

    UPDATE bank_coded_rndm
    SET y = (
    CASE WHEN 1*y < 0 THEN 0
    ELSE 1 END
    ) 
[0.02098259 0.05550052 0.13899764 ... 0.0440347  0.01605026 0.39600126]
The Model Performance
model_prec:  0.68
model_recall:  0.36

Back To Index