# Data Modeling

Back to Index

3_data_modeling
In [1]:
###################################################
#  Filename : 3_data_modeling_1                #
#  Purpose : Build a predictive model for the     #
#   conversion probability and store it.          #
#                                                 #
#  Author : Niel S.                               #
#  (c) The English Tea Company LLC                #
###################################################

import sys
sys.path.append('C:\\Users\\singa72\\Desktop\\Euler\\')

import Euler as Eu
from matplotlib import pyplot as plt

# Path of the working database that main() opens through Eu.connection().
# NOTE(review): `data_folder` is neither defined nor imported anywhere in this
# listing -- presumably it is set by an earlier notebook cell; confirm, since
# running this file standalone would fail here with a NameError.
data_work   = data_folder+'data_work.db'

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import svm

def main():
    """Train, evaluate and store the conversion-probability model.

    Steps, in order:

    1. Copy ``bank_coded`` into ``bank_coded_rndm`` in random row order and
       recode the target column ``y`` to 0/1.
    2. Split the shuffled table by ``rowid``: rows with ``rowid`` below 70%
       of the row count form the training set, the remaining rows the
       test set.
    3. Fit a ``LogisticRegression`` on the training set.
    4. Predict on the test set, print the predicted probabilities of the
       positive class, then print precision and recall (rounded to two
       decimals; ``nan`` when the ratio is undefined).
    5. Dump the fitted model to ``data_folder + 'logistic_regression.pkl'``.

    Relies on the module-level names ``data_work``, ``data_folder`` and
    ``vvarsL``. Returns nothing.
    """
    # sklearn.externals.joblib was removed from recent scikit-learn
    # releases; prefer the standalone package and fall back for old installs.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    conn = Eu.connection(data_work)

    # Work on a randomly ordered copy so that a rowid-based split below is
    # a random split. SQL text is kept flush-left because it is passed
    # verbatim to Eu.execute.
    Eu.execute('''
drop table if exists bank_coded_rndm;
create table bank_coded_rndm as
SELECT * FROM bank_coded
order by random(), random();
''', conn=conn)

    # It is easier to do the analysis when the target variable is coded as
    # 0 or 1: rows whose numeric y is negative become 0, all others 1.
    Eu.execute(
        '''
UPDATE bank_coded_rndm
SET y = (
CASE WHEN 1*y < 0 THEN 0
ELSE 1 END
) ''', conn)

    # S E T T I N G  U P   A  M O D E L
    # - - - - - - - - - - - - - - - - - - - - - - -

    # Step 1: split the historical dataset into a training set and a
    # testing set. Start with the total number of rows in the data.
    sql = 'select count(*) from bank_coded_rndm'
    max_recs = Eu.get_ncols_bysql(sql, 1, conn)[0]
    ntrain = int((70. / 100) * max_recs)

    # Query template; ?var_list? and ?ntrain? are substituted below.
    sql = '''
SELECT
?var_list?
from bank_coded_rndm
where rowid < ?ntrain?
order by rowid asc
'''
    # Prepare the list of features to pick up. The last entry of vvarsL is
    # dropped here ([:-1]); the target is queried separately as '1*y'.
    x_var_list = ['1.00*' + v for v in vvarsL]
    x_var_list = ',\n     '.join(x_var_list[:-1])

    sql_x_train = sql.replace('?var_list?', x_var_list)
    sql_x_train = sql_x_train.replace('?ntrain?', str(ntrain))

    sql_y_train = sql.replace('?var_list?', '1*y')
    sql_y_train = sql_y_train.replace('?ntrain?', str(ntrain))

    # The test queries are the complement of the training queries.
    sql_x_test = sql_x_train.replace('rowid <', 'rowid >=')
    sql_y_test = sql_y_train.replace('rowid <', 'rowid >=')

    # Load the training data into memory.
    train_x = Eu.get_bysql(query=sql_x_train, conn=conn)
    train_y = Eu.get_bysql(query=sql_y_train, conn=conn)

    # The model expects a flat sequence of labels, not one-element rows.
    train_y = [y[0] for y in train_y]
    model = LogisticRegression()
    model.fit(train_x, train_y)

    # Disabled alternative model:
    #model2 = svm.SVC(kernel='linear', C=1, gamma=1)
    #model2 = model2.fit(train_x,train_y)

    # Load the test data into memory and flatten the labels the same way.
    test_x = Eu.get_bysql(query=sql_x_test, conn=conn)
    test_y = Eu.get_bysql(query=sql_y_test, conn=conn)
    test_y = [y[0] for y in test_y]

    # Obtain the predictions for the test set.
    test_yhat = model.predict(test_x)
    test_yhat_proba = model.predict_proba(test_x)[:, 1]
    print(test_yhat_proba)
    #test_yhat2 = model2.predict(test_x)

    # Measure the model performance with a hand-rolled confusion matrix.
    # Naming: first letter = actual label, second letter = predicted label
    # (t = non-zero, f = zero).
    tt, tf, ft, ff = 0., 0., 0., 0.

    for yi, yhi in zip(test_y, test_yhat):
        if yi == 0. and yhi == 0.:
            ff = ff + 1
        elif yi == 0. and yhi != 0.:
            ft = ft + 1
        elif yi != 0. and yhi == 0.:
            tf = tf + 1
        else:
            tt = tt + 1

    # Precision = tt / predicted positives; recall = tt / actual positives.
    # Either denominator can be zero (no positive predictions, or no
    # positive rows in the test set); report nan instead of crashing.
    undefined = float('nan')
    model_prec = round(tt / (tt + ft), 2) if (tt + ft) else undefined
    model_recall = round(tt / (tt + tf), 2) if (tt + tf) else undefined

    print('The Model Performance')
    print('model_prec: ', model_prec)
    print('model_recall: ', model_recall)

    # Finally, store the trained model. The context manager closes the
    # file even if the dump fails.
    with open(data_folder + 'logistic_regression.pkl', 'wb') as f:
        joblib.dump(model, f)

# Columns of bank_coded_rndm that main() reads. The final entry, 'y', is the
# target: main() drops it (via [:-1]) when building the feature list and
# queries it separately.
# NOTE: 'duration' does not seem like a variable that is known before-hand.
vvarsL = [
    'marital',
    'job',
    'education',
    'deflt',
    'housing',
    'contact',
    'campaign',
    'month',
    'loan',
    'day_of_week',
    'poutcome',
    'previous',
    'age',
    'duration',
    'pdays',
    'emp_var_rate',
    'cons_price_idx',
    'cons_conf_idx',
    'euribor_m',
    'nr_employed',
    'y',
]
# Script entry point: run the modeling pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    main()

************************************************
*                    EULER                     *
*    A SQLITE POWERED DATA SCIENCE TOOLKIT     *
*          SINGH.AP79@GMAIL.NOSPAM.COM         *
************************************************

drop table if exists bank_coded_rndm

create table bank_coded_rndm as
SELECT * FROM bank_coded
order by random(), random()

UPDATE bank_coded_rndm
SET y = (
CASE WHEN 1*y < 0 THEN 0
ELSE 1 END
)
[0.02098259 0.05550052 0.13899764 ... 0.0440347  0.01605026 0.39600126]
The Model Performance
model_prec:  0.68
model_recall:  0.36


Back To Index