[Kaggle] Titanic: Machine Learning from Disaster - (3) Training, Cross-Validation, Test and Kaggle Submission
Arc Lab. · 2017. 10. 31. 11:18 (updated 2017. 10. 31. 13:13)
The final source code is shown below, and it proceeds through the following steps.
Training is done with logistic regression, and feature scaling is applied to the input features.
The training data is further split into a training set, a cross-validation set, and a test set for evaluation.
In addition, the code is written so that the model can be tuned through hyperparameters.
1. Run pre-processing.
2. Load the CSV files into TensorFlow.
3. Train on the training data.
4. Run cross-validation on data held out from the training file: the training data is separated into a training set, a cross-validation set, and a test set (a sketch of such a split follows this list).
5. Predict on the Kaggle test data with the trained model.
6. Create a submission file for Kaggle.
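The split and scaling in steps 3-4 use helper functions from the asyncml library attached at the bottom of this post. Their implementation is not shown here, but a minimal NumPy sketch matching the call signatures used in the code below (ml.map_dataset(data, 0.6, 0.2, 0.2) and ml.feature_scaling(X)) might look roughly like this; it is a hypothetical equivalent, not the actual library code:

import numpy as np

def map_dataset(data, train_ratio, cv_ratio, test_ratio):
    # Hypothetical stand-in for asyncml's map_dataset: slice the rows of a
    # 2-D array into train / cross-validation / test partitions
    # (test_ratio is implied by the remainder).
    n = data.shape[0]
    n_train = int(n * train_ratio)
    n_cv = int(n * cv_ratio)
    return data[:n_train], data[n_train:n_train + n_cv], data[n_train + n_cv:]

def feature_scaling(X):
    # Hypothetical stand-in for asyncml's feature_scaling: standardize each
    # column and return mu/sigma so the identical transform can be reapplied
    # to the cross-validation and test sets.
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0   # avoid division by zero for constant columns
    return (X - mu) / sigma, mu, sigma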
Below is the score I received after submitting to Kaggle.
I made this first submission without any hyperparameter tuning; I plan to look into ways to improve the score later.
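When I do get around to tuning, the simplest option is a grid search over the knobs exposed at the top of the script (learning_rate, hyper_lambda, threshold). The sketch below assumes the training and cross-validation code has been wrapped in a hypothetical train_and_evaluate(learning_rate, hyper_lambda) function that returns cross-validation accuracy; it illustrates the idea and is not code from this post:

# Minimal grid-search sketch. `train_and_evaluate` is a hypothetical wrapper
# around the training/cross-validation code below; it is assumed to return
# the cross-validation accuracy for one hyperparameter setting.
best = (None, -1.0)
for lr in [0.01, 0.03, 0.1, 0.3]:
    for lam in [0.0, 0.01, 0.03, 0.1]:
        cv_accuracy = train_and_evaluate(learning_rate=lr, hyper_lambda=lam)
        if cv_accuracy > best[1]:
            best = ((lr, lam), cv_accuracy)
print("best (learning_rate, hyper_lambda):", best[0], "cv accuracy:", best[1])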
<Source Code>
import asyncml as ml
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import csv
feature_size = 7      # Pclass, Name (title code), Sex, Age, SibSp, Parch, Embarked
max_epochs = 400
train_batch = 10000
test_batch = 10000
shuffle = False
shuffle_size = 10000
learning_rate = 0.1
skip_size = 1         # skip the CSV header line
threshold = 0.5       # decision threshold on the sigmoid output
hyper_lambda = 0.01   # L2 regularization strength
training_cv_dataset_record_defaults = [
    tf.constant([0], dtype=tf.float32),   # PassengerId
    tf.constant([0], dtype=tf.float32),   # Survived (label)
    tf.constant([0], dtype=tf.float32),   # Pclass
    tf.constant([0], dtype=tf.float32),   # Name (title code after pre-processing)
    tf.constant([0], dtype=tf.float32),   # Sex
    tf.constant([0], dtype=tf.float32),   # Age (bucket)
    tf.constant([0], dtype=tf.float32),   # SibSp
    tf.constant([0], dtype=tf.float32),   # Parch
    tf.constant([""], dtype=tf.string),   # Ticket
    tf.constant([0], dtype=tf.float32),   # Fare
    tf.constant([""], dtype=tf.string),   # Cabin
    tf.constant([0], dtype=tf.float32)    # Embarked
]
test_dataset_record_defaults = [
    tf.constant([0], dtype=tf.float32),   # PassengerId
    tf.constant([0], dtype=tf.float32),   # Pclass
    tf.constant([0], dtype=tf.float32),   # Name (title code after pre-processing)
    tf.constant([0], dtype=tf.float32),   # Sex
    tf.constant([0], dtype=tf.float32),   # Age (bucket)
    tf.constant([0], dtype=tf.float32),   # SibSp
    tf.constant([0], dtype=tf.float32),   # Parch
    tf.constant([""], dtype=tf.string),   # Ticket
    tf.constant([0], dtype=tf.float32),   # Fare
    tf.constant([""], dtype=tf.string),   # Cabin
    tf.constant([0], dtype=tf.float32)    # Embarked
]
def decode_csv_for_training_cv_set(line):
    PassengerId, labels, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked = \
        tf.decode_csv(line, training_cv_dataset_record_defaults)
    # features = PassengerId, Pclass, Age, SibSp, Parch, Fare, labels
    features = Pclass, Name, Sex, Age, SibSp, Parch, Embarked, labels
    features = tf.reshape(features, [-1])
    return features
def decode_csv_for_test_set(line):
    PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked = \
        tf.decode_csv(line, test_dataset_record_defaults)
    # features = PassengerId, Pclass, Age, SibSp, Parch, Fare
    features = Pclass, Name, Sex, Age, SibSp, Parch, Embarked
    features = tf.reshape(features, [-1])
    return features
def line_pre_process_train(line):
    # Name: map the honorific in the name to an integer code
    Name = line[3]
    if "Mr." in Name:
        line[3] = 0
    elif "Mrs." in Name:
        line[3] = 1
    elif "Miss." in Name:
        line[3] = 2
    elif "Master." in Name:
        line[3] = 3
    elif "Rev." in Name:
        line[3] = 4
    elif "Dr." in Name:
        line[3] = 5
    elif "Mlle." in Name:
        line[3] = 6
    elif "Col." in Name:
        line[3] = 7
    elif "Lady." in Name:
        line[3] = 8
    elif "Don." in Name:
        line[3] = 9
    elif "Mme." in Name:
        line[3] = 10
    elif "Ms." in Name:
        line[3] = 11
    elif "Sir." in Name:
        line[3] = 12
    elif "Capt." in Name:
        line[3] = 13
    elif "the Countess." in Name:
        line[3] = 14
    elif "Jonkheer." in Name:
        line[3] = 15
    elif "Major." in Name:
        line[3] = 16
    else:
        line[3] = 17
    # Sex: male -> 0, female -> 1
    Sex = line[4]
    if Sex == "male":
        line[4] = 0
    elif Sex == "female":
        line[4] = 1
    # Age: bucket into decades (missing ages fall into bucket 0)
    Age = 0
    try:
        Age = float(line[5])
    except ValueError:
        pass
    if Age < 1:
        line[5] = 0
    elif 1 <= Age <= 9:
        line[5] = 1
    elif 10 <= Age <= 19:
        line[5] = 2
    elif 20 <= Age <= 29:
        line[5] = 3
    elif 30 <= Age <= 39:
        line[5] = 4
    elif 40 <= Age <= 49:
        line[5] = 5
    elif 50 <= Age <= 59:
        line[5] = 6
    elif 60 <= Age <= 69:
        line[5] = 7
    elif 70 <= Age <= 79:
        line[5] = 8
    elif 80 <= Age <= 89:
        line[5] = 9
    else:
        line[5] = 10
    # Embarked: S -> 0, C -> 1, Q -> 2, missing/other -> 3
    Embarked = line[11]
    if Embarked == "S":
        line[11] = 0
    elif Embarked == "C":
        line[11] = 1
    elif Embarked == "Q":
        line[11] = 2
    else:
        line[11] = 3
    return line
def line_pre_process_test(line):
    # Same encoding as the training file; column indices shift by one
    # because test.csv has no Survived column.
    # Name: map the honorific in the name to an integer code
    Name = line[2]
    if "Mr." in Name:
        line[2] = 0
    elif "Mrs." in Name:
        line[2] = 1
    elif "Miss." in Name:
        line[2] = 2
    elif "Master." in Name:
        line[2] = 3
    elif "Rev." in Name:
        line[2] = 4
    elif "Dr." in Name:
        line[2] = 5
    elif "Mlle." in Name:
        line[2] = 6
    elif "Col." in Name:
        line[2] = 7
    elif "Lady." in Name:
        line[2] = 8
    elif "Don." in Name:
        line[2] = 9
    elif "Mme." in Name:
        line[2] = 10
    elif "Ms." in Name:
        line[2] = 11
    elif "Sir." in Name:
        line[2] = 12
    elif "Capt." in Name:
        line[2] = 13
    elif "the Countess." in Name:
        line[2] = 14
    elif "Jonkheer." in Name:
        line[2] = 15
    elif "Major." in Name:
        line[2] = 16
    else:
        line[2] = 17
    # Sex: male -> 0, female -> 1
    Sex = line[3]
    if Sex == "male":
        line[3] = 0
    elif Sex == "female":
        line[3] = 1
    # Age: bucket into decades (missing ages fall into bucket 0)
    Age = 0
    try:
        Age = float(line[4])
    except ValueError:
        pass
    if Age < 1:
        line[4] = 0
    elif 1 <= Age <= 9:
        line[4] = 1
    elif 10 <= Age <= 19:
        line[4] = 2
    elif 20 <= Age <= 29:
        line[4] = 3
    elif 30 <= Age <= 39:
        line[4] = 4
    elif 40 <= Age <= 49:
        line[4] = 5
    elif 50 <= Age <= 59:
        line[4] = 6
    elif 60 <= Age <= 69:
        line[4] = 7
    elif 70 <= Age <= 79:
        line[4] = 8
    elif 80 <= Age <= 89:
        line[4] = 9
    else:
        line[4] = 10
    # Embarked: S -> 0, C -> 1, Q -> 2, missing/other -> 3
    Embarked = line[10]
    if Embarked == "S":
        line[10] = 0
    elif Embarked == "C":
        line[10] = 1
    elif Embarked == "Q":
        line[10] = 2
    else:
        line[10] = 3
    return line
ml.csv_pre_process('train.csv', 'train_pre_processed.csv', _func_line=line_pre_process_train)
ml.csv_pre_process('test.csv', 'test_pre_processed.csv', _func_line=line_pre_process_test)

filenames = tf.constant(["train_pre_processed.csv", "test_pre_processed.csv"])
training_cv_dataset = tf.contrib.data.TextLineDataset(filenames[0]) \
    .skip(skip_size).map(decode_csv_for_training_cv_set).batch(train_batch).repeat(max_epochs)
if shuffle:
    # reassign: Dataset transformations return a new dataset
    training_cv_dataset = training_cv_dataset.shuffle(shuffle_size)
training_cv_iterator = training_cv_dataset.make_initializable_iterator()
training_batch_features = training_cv_iterator.get_next()

test_dataset = tf.contrib.data.TextLineDataset(filenames[1]) \
    .skip(skip_size).map(decode_csv_for_test_set).batch(test_batch)
test_iterator = test_dataset.make_initializable_iterator()
test_batch_features = test_iterator.get_next()
# Define a model
X = tf.placeholder(tf.float32, [None, feature_size])
y = tf.placeholder(tf.float32, [None, 1])
Lambda = tf.placeholder(tf.float32)
W = tf.Variable(tf.zeros([feature_size, 1]), name='weight')
b = tf.Variable(tf.zeros([1]), name='bias')
logits = tf.matmul(X, W) + b
hypothesis = tf.sigmoid(logits)
#hypothesis = tf.nn.relu(logits)
cost_logits = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y)
#hypothesis = tf.nn.softmax(logits)
#cost_logits = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y)
cost_op = tf.reduce_mean(cost_logits) + Lambda * tf.reduce_mean(tf.square(W))
#cost_op = -tf.reduce_mean(y*tf.log(hypothesis) + (1-y)*tf.log(1-hypothesis)) + Lambda * tf.reduce_mean(tf.square(W))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost_op)
#optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate).minimize(cost_op)
#optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost_op)
predicted = tf.cast(hypothesis >= threshold, tf.float32)
#predicted = tf.argmax(hypothesis, 1)
accuracy_op = tf.metrics.accuracy(labels=y, predictions=predicted)
precision_op = tf.metrics.precision(labels=y, predictions=predicted)
recall_op = tf.metrics.recall(labels=y, predictions=predicted)
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
with tf.Session() as sess:
    sess.run(init_op)
    sess.run(training_cv_iterator.initializer)
    cv_data = 0
    test_data = 0
    iteration = 0
    mu = 0
    sigma = 0

    '''
    TRAINING SESSION
    '''
    while True:
        try:
            training_features = sess.run(training_batch_features)
            # Split the training file into train / cross-validation / test partitions
            train_data, cv_data, test_data = ml.map_dataset(training_features, 0.6, 0.2, 0.2)
            X_data = train_data[:, 0:-1]
            y_data = train_data[:, [-1]]
            # Standardize features; keep mu/sigma to apply the same scaling later
            X_data, mu, sigma = ml.feature_scaling(X_data)
            sess.run(optimizer, feed_dict={X: X_data, y: y_data, Lambda: hyper_lambda})
            cost = sess.run(cost_op, feed_dict={X: X_data, y: y_data, Lambda: hyper_lambda})
            accuracy, accuracy_update_op = sess.run(accuracy_op,
                                                    feed_dict={X: X_data, y: y_data, Lambda: hyper_lambda})
            precision, precision_update_op = sess.run(precision_op,
                                                      feed_dict={X: X_data, y: y_data, Lambda: hyper_lambda})
            recall, recall_update_op = sess.run(recall_op,
                                                feed_dict={X: X_data, y: y_data, Lambda: hyper_lambda})
            iteration += 1
            print("{} Training >> Loss: {:.3f}\tAccuracy: {:.2%}\tPrecision: {:.2%}\tRecall: {:.2%}".format(
                iteration, cost, accuracy, precision, recall))
        except tf.errors.OutOfRangeError:
            print("Training Session is done!\n")
            break
    '''
    CROSS-VALIDATION SESSION
    '''
    X_cv_data = cv_data[:, 0:-1]
    y_cv_data = cv_data[:, [-1]]
    # Apply the training-set scaling statistics to the cross-validation set
    for i in range(feature_size):
        X_cv_data[:, i] = (X_cv_data[:, i] - mu[i]) / sigma[i]
    cost = sess.run(cost_op, feed_dict={X: X_cv_data, y: y_cv_data, Lambda: 0.0})
    accuracy, accuracy_update_op = sess.run(accuracy_op,
                                            feed_dict={X: X_cv_data, y: y_cv_data, Lambda: 0.0})
    precision, precision_update_op = sess.run(precision_op,
                                              feed_dict={X: X_cv_data, y: y_cv_data, Lambda: 0.0})
    recall, recall_update_op = sess.run(recall_op,
                                        feed_dict={X: X_cv_data, y: y_cv_data, Lambda: 0.0})
    print("Cross-Validation >> Loss: {:.3f}\tAccuracy: {:.2%}\tPrecision: {:.2%}\tRecall: {:.2%}".format(
        cost, accuracy, precision, recall))
    print("Cross-Validation Session is done!\n")
    '''
    TEST SESSION
    '''
    sess.run(test_iterator.initializer)
    while True:
        try:
            test_features = sess.run(test_batch_features)
            # Held-out portion of the training file (still has labels)
            X_test_data = test_data[:, 0:-1]
            y_test_data = test_data[:, [-1]]
            # Scale both the held-out split and the Kaggle test features
            # with the training-set statistics
            for i in range(feature_size):
                test_features[:, i] = (test_features[:, i] - mu[i]) / sigma[i]
                X_test_data[:, i] = (X_test_data[:, i] - mu[i]) / sigma[i]
            cost = sess.run(cost_op, feed_dict={X: X_test_data, y: y_test_data, Lambda: 0.0})
            accuracy, accuracy_update_op = sess.run(accuracy_op,
                                                    feed_dict={X: X_test_data, y: y_test_data, Lambda: 0.0})
            precision, precision_update_op = sess.run(precision_op,
                                                      feed_dict={X: X_test_data, y: y_test_data, Lambda: 0.0})
            recall, recall_update_op = sess.run(recall_op,
                                                feed_dict={X: X_test_data, y: y_test_data, Lambda: 0.0})
            print("Test >> Loss: {:.3f}\tAccuracy: {:.2%}\tPrecision: {:.2%}\tRecall: {:.2%}".format(
                cost, accuracy, precision, recall))
            # Predict on the Kaggle test set and write the submission file
            _predicted = sess.run(predicted, feed_dict={X: test_features, Lambda: 0.0})
            num_of_rows = np.size(_predicted, 0)
            print("Test Session is done!\n")
            f = open('my_submission.csv', 'w', encoding='utf-8', newline='')
            wr = csv.writer(f)
            wr.writerow(["PassengerId", "Survived"])
            for i in range(num_of_rows):
                wr.writerow([i + 892, int(_predicted[i][0])])  # Kaggle test PassengerIds start at 892
            f.close()
        except tf.errors.OutOfRangeError:
            print("Kaggle submission data generation is done!\n")
            break
<Training>
1 Training >> Loss: 0.679    Accuracy: 0.00%    Precision: 0.00%    Recall: 0.00%
2 Training >> Loss: 0.666    Accuracy: 79.40%   Precision: 73.21%   Recall: 73.91%
3 Training >> Loss: 0.654    Accuracy: 79.40%   Precision: 73.21%   Recall: 73.91%
4 Training >> Loss: 0.643    Accuracy: 79.40%   Precision: 73.21%   Recall: 73.91%
5 Training >> Loss: 0.633    Accuracy: 79.40%   Precision: 73.21%   Recall: 73.91%
6 Training >> Loss: 0.623    Accuracy: 79.40%   Precision: 73.21%   Recall: 73.91%
7 Training >> Loss: 0.614    Accuracy: 79.43%   Precision: 73.26%   Recall: 73.91%
...
397 Training >> Loss: 0.474  Accuracy: 80.30%   Precision: 75.81%   Recall: 72.24%
398 Training >> Loss: 0.474  Accuracy: 80.30%   Precision: 75.81%   Recall: 72.23%
399 Training >> Loss: 0.474  Accuracy: 80.30%   Precision: 75.82%   Recall: 72.23%
400 Training >> Loss: 0.474  Accuracy: 80.30%   Precision: 75.82%   Recall: 72.23%
Training Session is done!

Cross-Validation >> Loss: 0.506  Accuracy: 80.31%  Precision: 75.82%  Recall: 72.23%
Cross-Validation Session is done!

Test >> Loss: 0.443  Accuracy: 80.30%  Precision: 75.82%  Recall: 72.22%
Test Session is done!
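Before uploading, the generated my_submission.csv can be sanity-checked against the format Kaggle expects: a PassengerId,Survived header followed by one 0/1 row for each of the 418 test passengers (PassengerIds 892 through 1309). A small check script:

import csv

# Read the submission back and verify the expected Kaggle format.
with open('my_submission.csv', newline='') as f:
    rows = list(csv.reader(f))

assert rows[0] == ["PassengerId", "Survived"]
assert rows[1][0] == "892"                        # test PassengerIds start at 892
assert all(r[1] in ("0", "1") for r in rows[1:])  # binary predictions only
print("rows (excluding header):", len(rows) - 1)  # should be 418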
<Attachment>
asyncml.zip