This part is a work in progress. I need more time to properly explain how to transform your dataset so it can be fed as input to a neural network. The network is built with TensorFlow, and the cost function is defined as the root mean square error (RMSE).
The challenge is to turn the dataset into a matrix of 0s and 1s and, for each continuous variable, to decide whether or not to group its values. When we do group them, we use the k-means algorithm, as in the sketch below.
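As a minimal sketch of the grouping step (toy values, not the dataset used below):

import numpy as np
from sklearn.cluster import KMeans

# Toy continuous variable: made-up yearly values for 8 observations
x = np.array([5000., 5200., 12000., 12500., 30000., 31000., 5100., 29500.])
# Group the values into 3 clusters; each observation gets a cluster label
kmeans = KMeans(n_clusters=3).fit(x.reshape(-1, 1))
print(kmeans.labels_)  # e.g. [0 0 2 2 1 1 0 1]; the labels can then be one-hot encoded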
Here I used a different dataset from the Titanic one.
You will find the code below; I expect to provide the full explanation in about 2-3 weeks.
In [6]:
################################################
############# Variables Preparation ############
################################################
In [7]:
import numpy as np

# as_matrix() converts a pandas Series to a numpy array
# (deprecated in newer pandas; see the note below)
df_toPred = df["Benefice net annuel"].as_matrix()
df_toPred = df_toPred.reshape(df_toPred.shape[0], 1)
# We remove the variable to predict from the DataFrame
df_toUse = df.drop('Benefice net annuel', axis=1)
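Note: as_matrix() was removed in pandas 1.0; on a recent pandas version the equivalent would be:

df_toPred = df["Benefice net annuel"].to_numpy().reshape(-1, 1)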
In [9]:
# We want to transform the DataFrame into a matrix that contains only 0s and 1s.
# First, we create groups (10 at most) for every continuous variable,
# while keeping the option NOT to group the continuous variables we choose.
# For categorical variables, each label is replaced by an integer code.
from sklearn.cluster import KMeans

def matrix_Class(df_toUse, excluded, kmean_Size):
    xval = np.array([])
    excluded_index = np.array([])
    kmeans_models = {}
    for i, col in enumerate(df_toUse.columns):
        x = np.array(df_toUse[col])
        # Categorical variable: always encoded as integer labels,
        # even if it appears in the "excluded" list (^o^)
        if x.dtype == 'O':
            _, xval0 = np.unique(x, return_inverse=True)
            xval0 = xval0.reshape(1, xval0.shape[0])
        # Continuous variable
        else:
            # Excluded from grouping: kept continuous, normalized between 0 and 1
            if col in excluded:
                xval0 = x.reshape(1, x.shape[0])
                xval0 = (xval0 - np.min(xval0)) / (np.max(xval0) - np.min(xval0))
                excluded_index = np.append(excluded_index, [i])
            # Otherwise: group its values with k-means
            else:
                n_clusters = min(kmean_Size[i], np.unique(x).shape[0])
                kmeans_models[i] = KMeans(n_clusters=n_clusters).fit(x.reshape(-1, 1))
                xval0 = np.array([kmeans_models[i].labels_])
        if xval.shape[0] == 0:
            xval = xval0
        else:
            xval = np.concatenate((xval, xval0), axis=0)
    return xval, excluded_index.astype(int), kmeans_models

# Continuous variables we do NOT want to group
excluded = np.array(['Age', 'Coefficient bonus malus'])
#excluded = np.array([''])
# Maximum number of k-means groups, one entry per column
# (the value is ignored for categorical and excluded columns)
kmean_Size = np.array([0, 10, 10, 10, 0, 10, 10, 10, 10, 10, 15, 15])
xval, excluded_index, kmeans_models = matrix_Class(df_toUse, excluded, kmean_Size)
xval.shape
# Uncomment to inspect the variable at index 1
#print(kmeans_models[1].predict([[40.]]))
#xval[1]
#excluded_index
Out[9]:
In [10]:
# Each modality of a variable becomes a column: it is 1 if the observation
# has this modality and 0 otherwise (one-hot encoding).
df_nn = np.array([])
nb_var = xval.shape[0]

def matrix_Bin(nb_var, dt_nn, xval, excluded_index, name):
    print(name)
    for k in range(nb_var):
        if k not in excluded_index:
            # Grouped/categorical variable: one 0/1 column per modality
            for v in np.unique(xval[k]):
                dt_nn0 = np.where(xval[k] == v, 1., 0.)
                dt_nn0 = dt_nn0.reshape(1, dt_nn0.shape[0])
                if dt_nn.shape[0] == 0:
                    dt_nn = dt_nn0
                else:
                    dt_nn = np.concatenate((dt_nn, dt_nn0), axis=0)
        else:
            # Excluded continuous variable: kept as a single normalized column
            dt_nn0 = xval[k]
            dt_nn0 = dt_nn0.reshape(1, dt_nn0.shape[0])
            if dt_nn.shape[0] == 0:
                dt_nn = dt_nn0
            else:
                dt_nn = np.concatenate((dt_nn, dt_nn0), axis=0)
        print("#Variable : {0} & Nber SubVariable {1}".format(k, np.unique(xval[k]).shape[0]))
    dt_nn = dt_nn.transpose()
    print("Shape : {0}".format(dt_nn.shape))
    return dt_nn

df_nn = matrix_Bin(nb_var, df_nn, xval, excluded_index, "DATABASE")
df_nn.shape
# Check:
#df_nn[0]
Out[10]:
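For reference, pandas can produce the same kind of 0/1 expansion directly. A minimal sketch with pd.get_dummies on a toy frame (not the code used above):

import pandas as pd

toy = pd.DataFrame({'color': ['red', 'blue', 'red'], 'group': [0, 1, 1]})
# One column per modality: 1 if the observation has that modality, 0 otherwise
print(pd.get_dummies(toy, columns=['color', 'group']))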
In [11]:
# Creation of the train and test sets:
# a fraction x of the observations goes to the train set
from random import sample

def train_test_creation(x, data, toPred):
    indices = sample(range(data.shape[0]), int(x * data.shape[0]))
    indices = np.sort(indices, axis=None)
    index = np.arange(data.shape[0])
    reverse_index = np.delete(index, indices, 0)
    train_toUse = data[indices]
    train_toPred = toPred[indices]
    test_toUse = data[reverse_index]
    test_toPred = toPred[reverse_index]
    return train_toUse, train_toPred, test_toUse, test_toPred

df_train_toUse, df_train_toPred, df_test_toUse, df_test_toPred = train_test_creation(0.7, df_nn, df_toPred)
df_train_toPred.shape
Out[11]:
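The same split can also be obtained with scikit-learn, which is already a dependency here; a minimal equivalent sketch (train_size=0.7 matches the 70% above):

from sklearn.model_selection import train_test_split

df_train_toUse, df_test_toUse, df_train_toPred, df_test_toPred = train_test_split(
    df_nn, df_toPred, train_size=0.7)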
In [ ]:
################################################
################## TensorFlow ##################
################################################
In [12]:
import tensorflow as tf
# This notebook uses the TensorFlow 1.x API (placeholders and sessions);
# under TensorFlow 2, use tf.compat.v1 and disable eager execution.
learning_rate = 0.01
batch_size = 100
# Number of input columns (one per modality, plus the excluded continuous variables)
size_train_df = df_train_toUse.shape[1]
df_train_toUse.shape
Out[12]:
In [13]:
def new_weights(shape):
    # Random values drawn from a truncated normal distribution
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def new_biases(length):
    # Constant value 0.05
    return tf.Variable(tf.constant(0.05, shape=[length]))
In [14]:
def new_fc_layer(input,           # The previous layer.
                 num_inputs,      # Num. inputs from prev. layer.
                 num_outputs,     # Num. outputs.
                 use_relu=False): # Use Rectified Linear Unit (ReLU)?
    # Create new weights and biases.
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_biases(length=num_outputs)
    # Calculate the layer as the matrix multiplication of
    # the input and weights, then add the bias values.
    layer = tf.matmul(input, weights) + biases
    # Use ReLU?
    if use_relu:
        layer = tf.nn.relu(layer)
    return layer
In [15]:
x = tf.placeholder("float", [None, size_train_df], name='x')
y_true = tf.placeholder("float", [None, 1], name='y_true')
# Note: with use_relu=False on both layers the network stays purely linear;
# set use_relu=True on layer_1 to add non-linearity.
layer_1 = new_fc_layer(input=x,
                       num_inputs=size_train_df,
                       num_outputs=size_train_df,
                       use_relu=False)
layer_2 = new_fc_layer(input=layer_1,
                       num_inputs=size_train_df,
                       num_outputs=1,
                       use_relu=False)
In [16]:
y_pred = layer_2
# Cost function: root mean square error between predictions and true values
rmse = tf.sqrt(tf.reduce_mean(tf.squared_difference(y_pred, y_true)))
cost = rmse  # rmse is already a scalar; no further reduction is needed
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# The "accuracy" reported during training is simply the RMSE (lower is better)
accuracy = rmse
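As a sanity check, the graph computes the plain formula RMSE = sqrt(mean((y_pred - y_true)^2)); a small numpy verification on toy values:

import numpy as np

y_t = np.array([[1.0], [2.0], [3.0]])
y_p = np.array([[1.5], [2.0], [2.0]])
print(np.sqrt(np.mean((y_p - y_t) ** 2)))  # -> about 0.6455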
In [17]:
session = tf.Session()
def init_variables():
    session.run(tf.global_variables_initializer())
In [18]:
# Function next_batch
def next_batch(num, data, labels):
    '''
    Return a total of `num` random samples and labels.
    '''
    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[i] for i in idx]
    labels_shuffle = [labels[i] for i in idx]
    return np.asarray(data_shuffle), np.asarray(labels_shuffle)

# TEST
Xtr, Ytr = np.arange(0, 10), np.arange(0, 100).reshape(10, 10)
print(Xtr)
print(Ytr)
Xtr, Ytr = next_batch(5, Xtr, Ytr)
print('\n5 random samples')
print(Xtr)
print(Ytr)
In [19]:
batch_size_pred = 256

def predict_y(data, labels, cls_true):
    num_data = len(data)
    # Predictions are continuous values, so the buffer must be float
    cls_pred = np.zeros(shape=num_data, dtype=float)
    i = 0
    # Predict by batches to limit memory usage
    while i < num_data:
        j = min(i + batch_size_pred, num_data)
        feed_dict = {x: data[i:j, :],
                     y_true: labels[i:j, :]}
        cls_pred[i:j] = session.run(y_pred, feed_dict=feed_dict).flatten()
        i = j
    # Compare the predicted values with the true ones
    correct = (cls_true.flatten() == cls_pred)
    return correct, cls_pred
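A possible way to call it, assuming the version above (the notebook itself never invokes predict_y):

# Hypothetical call: predictions on the whole test set, batch by batch
correct, preds = predict_y(df_test_toUse, df_test_toPred, df_test_toPred)
print(preds[0:5])

Since the target is continuous, the exact-equality vector `correct` is mostly informative for discrete targets; `preds` holds the batched predictions.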
In [20]:
import time
from datetime import timedelta
In [21]:
def optimize(num_iterations, X):
    global total_iterations
    start_time = time.time()
    for i in range(num_iterations):
        total_iterations += 1
        # Get a batch of training examples:
        # x_batch holds a batch of observations and
        # y_true_batch the true values for those observations.
        x_batch, y_true_batch = next_batch(batch_size, df_train_toUse, df_train_toPred)
        # Put the batch into a dict with the proper names
        # for the placeholder variables in the TensorFlow graph.
        feed_dict_train = {x: x_batch,
                           y_true: y_true_batch}
        feed_dict_test = {x: df_test_toUse,
                          y_true: df_test_toPred}
        # Run the optimizer on this batch of training data.
        # TensorFlow assigns the feed_dict_train values to the
        # placeholder variables and then runs the optimizer.
        session.run(optimizer, feed_dict=feed_dict_train)
        # Print status every X iterations.
        if (total_iterations % X == 0) or (i == (num_iterations - 1)):
            # Compute the RMSE on the training and test sets
            # (here "accuracy" is the RMSE, so lower is better).
            acc_train = session.run(accuracy, feed_dict=feed_dict_train)
            acc_test = session.run(accuracy, feed_dict=feed_dict_test)
            msg = "Iteration: {0:>6}, Training RMSE: {1}, Test RMSE: {2}"
            print(msg.format(total_iterations, acc_train, acc_test))
    # Ending time.
    end_time = time.time()
    # Difference between start and end times.
    time_dif = end_time - start_time
    # Print the time usage.
    print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))
In [22]:
init_variables()
total_iterations = 0
In [23]:
optimize(num_iterations=5000, X=100)
In [24]:
optimize(num_iterations=100000, X=10000)
In [25]:
# Compare a few true values with the corresponding predictions on the test set
feed_dict_test = {x: df_test_toUse,
                  y_true: df_test_toPred}
vals_true = session.run(y_true, feed_dict=feed_dict_test)
vals_pred = session.run(y_pred, feed_dict=feed_dict_test)
print("True : {0}, Predicted : {1}".format(vals_true[0:5], vals_pred[0:5]))