티스토리 뷰

[업데이트 2017.10.31 11:11]

 

이제 Training/Test Data에 대해 Pre-Processing 및 Feature Selection/Extraction을 통해 성능 향상에 도움을 줄 것으로 예상되는 Feature만 선별하는 작업을 해보도록 하겠습니다. 먼저 Kaggle에서 제공하는 Feature 정보는 다음과 같습니다. 



이중에서 실제로 성능을 올리는데 중요한 Feature가 어떤 것인지 선택하기 위해 각 Feature별 데이터 분포를 확인해보았습니다.

import asyncml as ml
import csv
import matplotlib.pyplot as plt
from collections import Counter
from pandas import Series

def line_pre_process(line):
    """Replace the passenger name in ``line[3]`` with an integer title code.

    The name is searched for honorific markers such as ``"Mr."`` or
    ``"Miss."``; the first marker found (in the order of the table below)
    decides the code. Codes 0-8 are the ones this function has always
    produced. Codes 9-16 and the fallback 17 follow the numbering used by
    ``line_pre_process_train`` so that every row ends up with an integer
    instead of some rows keeping the raw name string.

    Args:
        line: One CSV row as a mutable sequence; ``line[3]`` must be the
            name string.

    Returns:
        The same ``line`` object, with ``line[3]`` overwritten by the code.
    """
    # Order matters: the first marker contained in the name wins.
    title_codes = (
        ("Mr.", 0),
        ("Mrs.", 1),
        ("Miss.", 2),
        ("Master.", 3),
        ("Rev.", 4),
        ("Dr.", 5),
        ("Mlle.", 6),
        ("Col.", 7),
        ("Lady.", 8),
        ("Don.", 9),
        ("Mme.", 10),
        ("Ms.", 11),
        ("Sir.", 12),
        ("Capt.", 13),
        ("the Countess.", 14),
        ("Jonkheer.", 15),
        ("Major.", 16),
    )

    # Name
    Name = line[3]

    for marker, code in title_codes:
        if marker in Name:
            line[3] = code
            break
    else:
        # No known title: use a dedicated code rather than leaving the
        # raw string in an otherwise numeric column.
        line[3] = 17

    return line

#ml.csv_pre_process('train.csv', 'train_pre_processed.csv', _func_line=line_pre_process)

# Per-column value lists collected from train.csv (all values stay strings,
# exactly as read by the csv module).
name_list = []
male_female_list = []
age_list = []
pclass_list = []
survived_list = []
SibSp_list = []
Parch_list = []
Ticket_list = []
Fare_list = []
Cabin_list = []
Embarked_list = []

# The context manager closes the file even if a row fails to parse;
# newline='' is the mode the csv module documentation asks for.
with open('train.csv', 'r', encoding='utf-8', newline='') as read_file:
    rdr = csv.reader(read_file)

    # The first row is the header, not data.
    next(rdr, None)

    for line in rdr:
        if not line:
            # csv.reader yields [] for blank lines; nothing to collect.
            continue

        # Take the text between the first comma and the following period
        # of the name field and keep it as the title.
        temp = line[3].split(',')
        temp = temp[1].split('.')
        name_list.append(temp[0].strip())
        male_female_list.append(line[4])
        age_list.append(line[5])
        pclass_list.append(line[2])
        survived_list.append(line[1])
        SibSp_list.append(line[6])
        Parch_list.append(line[7])
        Ticket_list.append(line[8])
        Fare_list.append(line[9])
        Cabin_list.append(line[10])
        Embarked_list.append(line[11])

# Order fixes both the print order and the figure numbers (1..11):
# Name, Sex, Age, PClass, Survive, SibSp, Parch, Ticket, Fare, Cabin, Embarked
feature_columns = [
    name_list,
    male_female_list,
    age_list,
    pclass_list,
    survived_list,
    SibSp_list,
    Parch_list,
    Ticket_list,
    Fare_list,
    Cabin_list,
    Embarked_list,
]

for figure_no, values in enumerate(feature_columns, start=1):
    # Text summary of how often each distinct value occurs ...
    print(Counter(values))
    # ... and the same counts as a bar chart in its own figure.
    s = Series(values).value_counts()
    plt.figure(figure_no)
    s.plot(kind='bar')

plt.show()

 

아래와 같이 각 Feature의 데이터 속성에 대해 count 및 그래프를 통해 데이터 분석을 해보았습니다. collections, pandas module을 활용하였습니다.


Counter({'Mr': 517, 'Miss': 182, 'Mrs': 125, 'Master': 40, 'Dr': 7, 'Rev': 6, 'Major': 2, 'Mlle': 2, 'Col': 2, 'Don': 1, 'Mme': 1, 'Ms': 1, 'Lady': 1, 'Sir': 1, 'Capt': 1, 'the Countess': 1, 'Jonkheer': 1})
Counter({'male': 577, 'female': 314})
Counter({'': 177, '24': 30, '22': 27, '18': 26, '28': 25, '19': 25, '30': 25, '21': 24, '25': 23, '36': 22, '29': 20, '26': 18, '35': 18, '27': 18, '32': 18, '31': 17, '16': 17, '20': 15, '34': 15, '33': 15, '23': 15, '39': 14, '40': 13, '42': 13, '17': 13, '45': 12, '38': 11, '2': 10, '4': 10, '50': 10, '47': 9, '44': 9, '48': 9, '54': 8, '9': 8, '51': 7, '1': 7, '14': 6, '3': 6, '49': 6, '37': 6, '41': 6, '52': 6, '58': 5, '15': 5, '43': 5, '8': 4, '5': 4, '11': 4, '56': 4, '62': 4, '60': 4, '7': 3, '65': 3, '46': 3, '61': 3, '6': 3, '55': 2, '28.5': 2, '0.83': 2, '59': 2, '71': 2, '32.5': 2, '40.5': 2, '45.5': 2, '63': 2, '10': 2, '64': 2, '13': 2, '0.75': 2, '57': 2, '70': 2, '30.5': 2, '66': 1, '14.5': 1, '70.5': 1, '12': 1, '36.5': 1, '55.5': 1, '20.5': 1, '23.5': 1, '0.92': 1, '53': 1, '80': 1, '24.5': 1, '0.67': 1, '0.42': 1, '34.5': 1, '74': 1})
Counter({'3': 491, '1': 216, '2': 184})
Counter({'0': 549, '1': 342})
Counter({'0': 608, '1': 209, '2': 28, '4': 18, '3': 16, '8': 7, '5': 5})
Counter({'0': 678, '1': 118, '2': 80, '5': 5, '3': 5, '4': 4, '6': 1})
Counter({'347082': 7, '1601': 7, 'CA. 2343': 7, '3101295': 6, 'CA 2144': 6, '347088': 6, '382652': 5, 'S.O.C. 14879': 5, '349909': 4, '347077': 4, '19950': 4, 'W./C. 6608': 4, '4133': 4, 'LINE': 4, '113781': 4, '17421': 4, 'PC 17757': 4, '113760': 4, '2666': 4, '347742': 3, 'SC/Paris 2123': 3, 'PC 17572': 3, 'C.A. 34651': 3, '371110': 3, '230080': 3, '363291': 3, '35273': 3, 'C.A. 31921': 3, '110152': 3, 'PC 17755': 3, '110413': 3, 'PC 17582': 3, 'PC 17760': 3, '13502': 3, '239853': 3, 'F.C.C. 13529': 3, '29106': 3, '345773': 3, '248727': 3, '24160': 3, '113803': 2, '237736': 2, 'PP 9549': 2, '239865': 2, 'PC 17569': 2, 'PC 17604': 2, '113789': 2, '345764': 2, '2651': 2, '11668': 2, '349237': 2, '113572': 2, '36973': 2, '2661': 2, '248738': 2, '364516': 2, '3101278': 2, 'C.A. 2315': 2, '231919': 2, '244367': 2, '35281': 2, '110465': 2, '2665': 2, '2627': 2, 'PC 17558': 2, '2668': 2, '7534': 2, 'PC 17593': 2, '2678': 2, 'STON/O2. 3101279': 2, 'C.A. 33112': 2, '113776': 2, '113505': 2, '230136': 2, '370365': 2, '364849': 2, '347054': 2, '2699': 2, '243847': 2, '19943': 2, '367230': 2, '19928': 2, '250649': 2, '11751': 2, '244252': 2, 'A/5. 3336': 2, '370129': 2, '230433': 2, '113798': 2, '250644': 2, 'C.A. 2673': 2, '19877': 2, '11967': 2, '367226': 2, 'PC 17758': 2, 'P/PP 3381': 2, 'PC 17485': 2, '11767': 2, 'PC 17608': 2, '36928': 2, '16966': 2, '111361': 2, 'PC 17611': 2, 'C.A. 37671': 2, '2691': 2, 'PC 17477': 2, '2653': 2, '28403': 2, '347080': 2, '250655': 2, '376564': 2, '13507': 2, '17453': 2, '31027': 2, '36947': 2, '26360': 2, '12749': 2, 'PC 17761': 2, 'WE/P 5735': 2, '2908': 2, 'A/4 48871': 2, '358585': 2, '220845': 2, '2659': 2, '54636': 2, '19996': 2, '29750': 2, '17474': 2, '250647': 2, '113806': 2, '392096': 2, 'S.O./P.P. 3': 2, 'W./C. 6607': 2, 'S.C./PARIS 2079': 2, 'A/5 21171': 1, 'PC 17599': 1, 'STON/O2. 3101282': 1, '373450': 1, '330877': 1, '17463': 1, '113783': 1, 'A/5. 
2151': 1, '350406': 1, '248706': 1, '244373': 1, '345763': 1, '2649': 1, '248698': 1, '330923': 1, '113788': 1, '2631': 1, '330959': 1, '349216': 1, 'PC 17601': 1, '335677': 1, 'C.A. 24579': 1, '2677': 1, 'A./5. 2152': 1, '7546': 1, '349253': 1, '330958': 1, 'S.C./A.4. 23567': 1, '370371': 1, '14311': 1, '2662': 1, 'A/4. 39886': 1, '2926': 1, '113509': 1, '19947': 1, 'C.A. 31026': 1, '2697': 1, '2669': 1, 'PC 17605': 1, 'C.A. 29395': 1, 'S.P. 3464': 1, '3101281': 1, '315151': 1, 'C.A. 33111': 1, '2680': 1, '348123': 1, '349208': 1, '374746': 1, '345767': 1, '345779': 1, '330932': 1, '113059': 1, 'SO/C 14885': 1, 'SOTON/OQ 392086': 1, '343275': 1, '343276': 1, '347466': 1, 'W.E.P. 5734': 1, '364500': 1, '374910': 1, 'PC 17754': 1, 'PC 17759': 1, '349245': 1, '349215': 1, '7540': 1, '3101276': 1, '349207': 1, '343120': 1, '312991': 1, '349249': 1, '324669': 1, '4136': 1, 'STON/O 2. 3101294': 1, '370369': 1, 'A4. 54510': 1, '27267': 1, '370372': 1, 'C 17369': 1, '347061': 1, '349241': 1, 'SOTON/O.Q. 3101307': 1, 'A/5. 3337': 1, '228414': 1, 'C.A. 29178': 1, 'SC/PARIS 2133': 1, '11752': 1, '347081': 1, '365222': 1, '231945': 1, '350043': 1, '244310': 1, 'S.O.P. 1166': 1, 'A.5. 11206': 1, 'A/5. 851': 1, 'Fa 265302': 1, 'PC 17597': 1, '35851': 1, 'SOTON/OQ 392090': 1, '315037': 1, '371362': 1, 'C.A. 33595': 1, '347068': 1, '315093': 1, 'PC 17318': 1, '111240': 1, 'STON/O 2. 3101280': 1, '17764': 1, '350404': 1, 'PC 17595': 1, '250653': 1, 'SC/PARIS 2131': 1, '315153': 1, '113767': 1, '111428': 1, '349247': 1, '234604': 1, '28424': 1, '350046': 1, 'PC 17610': 1, '368703': 1, '4579': 1, '370370': 1, '248747': 1, '345770': 1, '3101264': 1, '2628': 1, 'A/5 3540': 1, '367231': 1, '112277': 1, 'SOTON/O.Q. 3101311': 1, 'F.C.C. 13528': 1, 'A/5 21174': 1, '250646': 1, '367229': 1, 'STON/O2. 3101283': 1, '11813': 1, 'W/C 14208': 1, 'SOTON/OQ 392089': 1, '220367': 1, '21440': 1, '349234': 1, 'PP 4348': 1, 'SW/PP 751': 1, 'A/5 21173': 1, '236171': 1, '347067': 1, '237442': 1, 'C.A. 
29566': 1, 'W./C. 6609': 1, '26707': 1, '28665': 1, 'SCO/W 1585': 1, 'W./C. 14263': 1, 'STON/O 2. 3101275': 1, '2694': 1, '347071': 1, '362316': 1, '113514': 1, '2650': 1, 'PC 17585': 1, '384461': 1, '112059': 1, '382649': 1, 'C.A. 17248': 1, '347083': 1, 'PC 17596': 1, '370375': 1, '347073': 1, '336439': 1, '347464': 1, '345778': 1, 'A/5. 10482': 1, '113056': 1, '349239': 1, '345774': 1, '349206': 1, '237798': 1, '370373': 1, 'SC/Paris 2163': 1, '349236': 1, '349233': 1, 'PC 17612': 1, '2693': 1, '19988': 1, '9234': 1, '226593': 1, 'A/5 2466': 1, '250651': 1, '349243': 1, '347470': 1, '29011': 1, 'A/5 21172': 1, '349219': 1, '234818': 1, '345364': 1, '28551': 1, '113043': 1, '349225': 1, '7598': 1, '113784': 1, '248740': 1, '244361': 1, '229236': 1, '248733': 1, '31418': 1, '386525': 1, '315088': 1, '7267': 1, '113510': 1, '2695': 1, '2647': 1, '345783': 1, '237671': 1, '330931': 1, '330980': 1, 'SC/PARIS 2167': 1, 'SOTON/O.Q. 3101310': 1, 'C 7076': 1, '110813': 1, '2626': 1, '14313': 1, '11765': 1, '3101267': 1, '323951': 1, 'C 7077': 1, '113503': 1, '2648': 1, '347069': 1, 'STON/O 2. 3101293': 1, '349227': 1, '27849': 1, '367655': 1, 'SC 1748': 1, '350034': 1, '3101277': 1, '350052': 1, '350407': 1, '244278': 1, '240929': 1, 'STON/O 2. 3101289': 1, '341826': 1, '4137': 1, '315096': 1, '28664': 1, '347064': 1, '312992': 1, '349222': 1, '394140': 1, 'STON/O 2. 3101269': 1, '343095': 1, '28220': 1, '250652': 1, '28228': 1, '349254': 1, 'A/5. 13032': 1, '315082': 1, 'A/4. 34244': 1, '2003': 1, '364851': 1, 'SOTON/O.Q. 392078': 1, '110564': 1, 'SC/AH 3085': 1, 'STON/O 2. 3101274': 1, 'C.A. 18723': 1, '345769': 1, '347076': 1, '230434': 1, '65306': 1, '33638': 1, '113794': 1, '113786': 1, '65303': 1, '113051': 1, 'A/5 2817': 1, '349240': 1, '13509': 1, '17464': 1, 'F.C.C. 13531': 1, '371060': 1, '19952': 1, '364506': 1, '111320': 1, '234360': 1, 'A/S 2816': 1, 'SOTON/O.Q. 
3101306': 1, '113792': 1, '36209': 1, '323592': 1, '315089': 1, 'SC/AH Basle 541': 1, '7553': 1, '3460': 1, '350060': 1, '3101298': 1, '239854': 1, 'A/5 3594': 1, '4134': 1, '11771': 1, 'A.5. 18509': 1, '65304': 1, 'SOTON/OQ 3101317': 1, '113787': 1, 'PC 17609': 1, 'A/4 45380': 1, 'C.A. 6212': 1, '350035': 1, '315086': 1, '364846': 1, '330909': 1, '4135': 1, '111427': 1, 'C 4001': 1, '382651': 1, 'SOTON/OQ 3101316': 1, 'PC 17473': 1, 'PC 17603': 1, '349209': 1, '36967': 1, 'C.A. 34260': 1, '226875': 1, '349242': 1, '349252': 1, '2624': 1, '2700': 1, '367232': 1, 'W./C. 14258': 1, 'PC 17483': 1, '3101296': 1, '29104': 1, '2641': 1, '2690': 1, '315084': 1, '113050': 1, '364498': 1, '13568': 1, '693': 1, 'SC/PARIS 2146': 1, '244358': 1, '330979': 1, '2620': 1, '347085': 1, '113807': 1, '11755': 1, '345572': 1, '372622': 1, '349251': 1, '218629': 1, 'SOTON/OQ 392082': 1, 'SOTON/O.Q. 392087': 1, '349205': 1, '2686': 1, '350417': 1, 'S.W./PP 752': 1, '11769': 1, 'PC 17474': 1, '14312': 1, 'A/4. 20589': 1, '243880': 1, '2689': 1, 'STON/O 2. 3101286': 1, '237789': 1, '13049': 1, '3411': 1, '237565': 1, '13567': 1, '14973': 1, 'A./5. 3235': 1, 'STON/O 2. 3101273': 1, 'A/5 3902': 1, '364848': 1, 'SC/AH 29037': 1, '2664': 1, '349214': 1, '113796': 1, '364511': 1, '111426': 1, '349910': 1, '349246': 1, '113804': 1, 'SOTON/O.Q. 3101305': 1, '370377': 1, '364512': 1, '31028': 1, '11753': 1, '350029': 1, '36963': 1, '219533': 1, '349224': 1, '334912': 1, '27042': 1, '347743': 1, '13214': 1, '112052': 1, '237668': 1, 'STON/O 2. 3101292': 1, '350050': 1, '349231': 1, '13213': 1, 'S.O./P.P. 751': 1, 'CA. 2314': 1, '349221': 1, '8475': 1, '330919': 1, '365226': 1, '349223': 1, '29751': 1, '2623': 1, '5727': 1, '349210': 1, 'STON/O 2. 3101285': 1, '234686': 1, '312993': 1, 'A/5 3536': 1, 'F.C. 12750': 1, 'C.A. 
24580': 1, '244270': 1, '239856': 1, '349912': 1, '342826': 1, '4138': 1, '330935': 1, '6563': 1, '349228': 1, '350036': 1, '349256': 1, '2672': 1, '113800': 1, '248731': 1, '363592': 1, '35852': 1, '348121': 1, 'PC 17475': 1, '36864': 1, '350025': 1, '223596': 1, 'PC 17476': 1, 'PC 17482': 1, '113028': 1, '7545': 1, '348124': 1, '34218': 1, '36568': 1, '347062': 1, '350048': 1, '12233': 1, '250643': 1, '315094': 1, '36866': 1, '236853': 1, 'STON/O2. 3101271': 1, '239855': 1, '28425': 1, '233639': 1, '349201': 1, '349218': 1, '16988': 1, '376566': 1, 'STON/O 2. 3101288': 1, '250648': 1, '113773': 1, '335097': 1, '29103': 1, '345780': 1, '349204': 1, '350042': 1, '29108': 1, '363294': 1, 'SOTON/O2 3101272': 1, '2663': 1, '347074': 1, '112379': 1, '364850': 1, '8471': 1, '345781': 1, '350047': 1, '2674': 1, '29105': 1, '347078': 1, '383121': 1, '36865': 1, '2687': 1, '113501': 1, 'SOTON/O.Q. 3101312': 1, '374887': 1, '3101265': 1, '12460': 1, 'PC 17600': 1, '349203': 1, '28213': 1, '17465': 1, '349244': 1, '2685': 1, '2625': 1, '347089': 1, '347063': 1, '112050': 1, '347087': 1, '248723': 1, '3474': 1, '28206': 1, '364499': 1, '112058': 1, 'STON/O2. 3101290': 1, 'C 7075': 1, '315098': 1, '19972': 1, '368323': 1, '367228': 1, '2671': 1, '347468': 1, '2223': 1, 'PC 17756': 1, '315097': 1, '392092': 1, '11774': 1, 'SOTON/O2 3101287': 1, '2683': 1, '315090': 1, 'C.A. 5547': 1, '349213': 1, '347060': 1, 'PC 17592': 1, '392091': 1, '113055': 1, '2629': 1, '350026': 1, '28134': 1, '17466': 1, '233866': 1, '236852': 1, 'SC/PARIS 2149': 1, 'PC 17590': 1, '345777': 1, '349248': 1, '695': 1, '345765': 1, '2667': 1, '349212': 1, '349217': 1, '349257': 1, '7552': 1, 'C.A./SOTON 34068': 1, 'SOTON/OQ 392076': 1, '211536': 1, '112053': 1, '111369': 1, '370376': 1})
Counter({'8.05': 43, '13': 42, '7.8958': 38, '7.75': 34, '26': 31, '10.5': 24, '7.925': 18, '7.775': 16, '26.55': 15, '7.2292': 15, '0': 15, '7.25': 13, '7.8542': 13, '8.6625': 13, '7.225': 12, '9.5': 9, '16.1': 9, '15.5': 8, '24.15': 8, '31.275': 7, '52': 7, '14.4542': 7, '56.4958': 7, '7.05': 7, '14.5': 7, '69.55': 7, '21': 6, '39.6875': 6, '46.9': 6, '27.9': 6, '7.7958': 6, '26.25': 6, '30': 6, '53.1': 5, '29.125': 5, '27.7208': 5, '15.2458': 5, '73.5': 5, '30.5': 5, '21.075': 4, '35.5': 4, '31.3875': 4, '263': 4, '7.8792': 4, '27.75': 4, '7.65': 4, '12.475': 4, '15.85': 4, '34.375': 4, '23': 4, '79.2': 4, '11.5': 4, '7.7333': 4, '25.4667': 4, '39': 4, '90': 4, '13.5': 4, '7.55': 4, '7.125': 4, '151.55': 4, '110.8833': 4, '227.525': 4, '120': 4, '19.2583': 4, '11.1333': 3, '18': 3, '41.5792': 3, '76.7292': 3, '14.4583': 3, '20.525': 3, '31': 3, '113.275': 3, '52.5542': 3, '86.5': 3, '512.3292': 3, '79.65': 3, '153.4625': 3, '135.6333': 3, '29.7': 3, '77.9583': 3, '12.35': 3, '83.1583': 3, '18.75': 3, '26.2875': 3, '7.4958': 3, '33': 3, '211.3375': 3, '51.8625': 2, '30.0708': 2, '16.7': 2, '146.5208': 2, '82.1708': 2, '11.2417': 2, '17.8': 2, '80': 2, '83.475': 2, '29': 2, '9': 2, '20.575': 2, '77.2875': 2, '9.825': 2, '247.5208': 2, '22.3583': 2, '6.975': 2, '6.75': 2, '36.75': 2, '66.6': 2, '55': 2, '30.6958': 2, '6.4958': 2, '10.4625': 2, '18.7875': 2, '27': 2, '9.35': 2, '20.2125': 2, '19.5': 2, '20.25': 2, '78.85': 2, '91.0792': 2, '23.25': 2, '108.9': 2, '24': 2, '56.9292': 2, '262.375': 2, '164.8667': 2, '134.5': 2, '57.9792': 2, '133.65': 2, '15.9': 2, '9.225': 2, '69.3': 2, '15.7417': 2, '14.4': 2, '55.9': 2, '19.9667': 2, '89.1042': 2, '9.5875': 2, '49.5042': 2, '78.2667': 2, '93.5': 2, '106.425': 2, '71': 2, '7.8292': 2, '39.6': 2, '65': 2, '7.0542': 2, '57': 2, '7.7375': 2, '23.45': 2, '25.9292': 2, '37.0042': 2, '71.2833': 1, '8.4583': 1, '16': 1, '8.0292': 1, '9.475': 1, '21.6792': 1, '7.8': 1, '61.9792': 1, '8.1583': 1, '7.7875': 1, '47.1': 1, 
'61.175': 1, '34.6542': 1, '63.3583': 1, '8.6542': 1, '7.1417': 1, '15.0458': 1, '26.2833': 1, '9.2167': 1, '12.525': 1, '7.3125': 1, '61.3792': 1, '15.75': 1, '25.925': 1, '33.5': 1, '28.7125': 1, '15.05': 1, '22.025': 1, '50': 1, '8.4042': 1, '76.2917': 1, '12.275': 1, '12.875': 1, '8.85': 1, '14': 1, '6.2375': 1, '28.5': 1, '35': 1, '75.25': 1, '55.4417': 1, '211.5': 1, '4.0125': 1, '7.7292': 1, '12': 1, '12.65': 1, '6.8583': 1, '32.5': 1, '7.875': 1, '8.1125': 1, '81.8583': 1, '38.5': 1, '7.725': 1, '13.7917': 1, '9.8375': 1, '7.0458': 1, '7.5208': 1, '12.2875': 1, '15.1': 1, '7.6292': 1, '22.525': 1, '59.4': 1, '34.0208': 1, '221.7792': 1, '49.5': 1, '13.8625': 1, '17.4': 1, '51.4792': 1, '26.3875': 1, '40.125': 1, '8.7125': 1, '15': 1, '42.4': 1, '15.55': 1, '32.3208': 1, '8.4333': 1, '25.5875': 1, '9.8417': 1, '8.1375': 1, '10.1708': 1, '13.4167': 1, '7.7417': 1, '9.4833': 1, '8.3625': 1, '8.6833': 1, '8.5167': 1, '7.8875': 1, '6.45': 1, '6.95': 1, '8.3': 1, '6.4375': 1, '39.4': 1, '14.1083': 1, '13.8583': 1, '50.4958': 1, '5': 1, '9.8458': 1, '10.5167': 1})
Counter({'': 687, 'G6': 4, 'C23 C25 C27': 4, 'B96 B98': 4, 'F33': 3, 'E101': 3, 'F2': 3, 'D': 3, 'C22 C26': 3, 'C123': 2, 'D33': 2, 'C52': 2, 'B28': 2, 'C83': 2, 'F G73': 2, 'D26': 2, 'B58 B60': 2, 'C2': 2, 'E33': 2, 'F4': 2, 'D36': 2, 'C93': 2, 'C78': 2, 'D35': 2, 'B77': 2, 'E67': 2, 'C125': 2, 'B49': 2, 'C65': 2, 'B57 B59 B63 B66': 2, 'B18': 2, 'C124': 2, 'B35': 2, 'E44': 2, 'C92': 2, 'D20': 2, 'E25': 2, 'B22': 2, 'C68': 2, 'C126': 2, 'B51 B53 B55': 2, 'B5': 2, 'B20': 2, 'E24': 2, 'E8': 2, 'E121': 2, 'D17': 2, 'C85': 1, 'E46': 1, 'C103': 1, 'D56': 1, 'A6': 1, 'B78': 1, 'B30': 1, 'E31': 1, 'A5': 1, 'D10 D12': 1, 'C110': 1, 'F E69': 1, 'D47': 1, 'B86': 1, 'B19': 1, 'A7': 1, 'C49': 1, 'A32': 1, 'B4': 1, 'B80': 1, 'A31': 1, 'D15': 1, 'C87': 1, 'B94': 1, 'C99': 1, 'C118': 1, 'D7': 1, 'A19': 1, 'C106': 1, 'E36': 1, 'C54': 1, 'C7': 1, 'E34': 1, 'C32': 1, 'C91': 1, 'E40': 1, 'T': 1, 'C128': 1, 'D37': 1, 'E50': 1, 'C82': 1, 'E10': 1, 'A34': 1, 'C104': 1, 'C111': 1, 'E38': 1, 'D21': 1, 'E12': 1, 'E63': 1, 'A14': 1, 'B37': 1, 'C30': 1, 'B79': 1, 'D46': 1, 'B73': 1, 'C95': 1, 'B38': 1, 'B39': 1, 'C86': 1, 'C70': 1, 'A16': 1, 'C101': 1, 'A10': 1, 'E68': 1, 'B41': 1, 'A20': 1, 'D19': 1, 'D50': 1, 'D9': 1, 'A23': 1, 'B50': 1, 'A26': 1, 'D48': 1, 'E58': 1, 'B71': 1, 'D49': 1, 'F G63': 1, 'C62 C64': 1, 'C90': 1, 'C45': 1, 'B101': 1, 'D45': 1, 'C46': 1, 'D30': 1, 'D11': 1, 'E77': 1, 'F38': 1, 'B3': 1, 'D6': 1, 'B82 B84': 1, 'A36': 1, 'B102': 1, 'B69': 1, 'E49': 1, 'C47': 1, 'D28': 1, 'E17': 1, 'A24': 1, 'C50': 1, 'B42': 1, 'C148': 1})
Counter({'S': 644, 'C': 168, 'Q': 77, '': 2})




위와 같이 어떤 Feature는 데이터 자체가 매우 다양한 값으로 구성되어 있었는데, 이런 경우는 값을 구간(range)으로 묶어 종류를 줄이거나 Feature Selection에서 제외하는 방향으로 적용해보려고 합니다.


아래의 feature를 선택하여 training을 진행하려고 합니다.


Pclass, Name, Sex, Age, SibSp, Parch, Embarked


아래와 같이 string을 integer 데이터 타입으로 변경하는 작업을 진행합니다.

Age의 경우는 나이대를 구간으로 나누어 discrete하게 만듭니다.

다음 포스팅에서 training, cross-validation, test 및 최종 Kaggle submission output file을 생성하는 코드를 올릴 예정입니다.


def line_pre_process_train(line):
    """Encode the Name, Sex, Age and Embarked columns of one row as integers.

    The row is modified in place:

    * ``line[3]`` (name) -> title code 0-16, or 17 when no known title
      marker is contained in the name.
    * ``line[4]`` (sex) -> 0 for ``"male"``, 1 for ``"female"``; any other
      value is left untouched.
    * ``line[5]`` (age) -> bucket 0-10: below 1 is 0, then one bucket per
      decade (``[1, 10)`` is 1, ``[10, 20)`` is 2, ... ``[80, 90)`` is 9),
      and 10 for everything from 90 upwards. An age that cannot be parsed
      as a number (e.g. the empty string) is treated as 0 and therefore
      lands in bucket 0.
    * ``line[11]`` (embarked) -> 0/1/2 for ``"S"``/``"C"``/``"Q"``, 3 for
      anything else.

    Args:
        line: One CSV row as a mutable sequence with at least 12 fields.

    Returns:
        The same ``line`` object after the in-place encoding.
    """
    # Name: the first marker contained in the name (in this order) wins.
    title_codes = (
        ("Mr.", 0),
        ("Mrs.", 1),
        ("Miss.", 2),
        ("Master.", 3),
        ("Rev.", 4),
        ("Dr.", 5),
        ("Mlle.", 6),
        ("Col.", 7),
        ("Lady.", 8),
        ("Don.", 9),
        ("Mme.", 10),
        ("Ms.", 11),
        ("Sir.", 12),
        ("Capt.", 13),
        ("the Countess.", 14),
        ("Jonkheer.", 15),
        ("Major.", 16),
    )
    Name = line[3]

    for marker, code in title_codes:
        if marker in Name:
            line[3] = code
            break
    else:
        line[3] = 17

    # Sex
    Sex = line[4]

    if Sex == "male":
        line[4] = 0
    elif Sex == "female":
        line[4] = 1

    # Age
    try:
        Age = float(line[5])
    except ValueError:
        # Missing/unparseable age keeps the historical default of 0,
        # which puts the row into bucket 0.
        Age = 0

    # Exclusive upper bound of buckets 0..9. Half-open intervals mean
    # fractional ages such as 9.5 or 19.5 stay in their own decade instead
    # of falling through to the last bucket.
    upper_bounds = (1, 10, 20, 30, 40, 50, 60, 70, 80, 90)

    for bucket, upper in enumerate(upper_bounds):
        if Age < upper:
            line[5] = bucket
            break
    else:
        line[5] = 10

    # Embarked
    embarked_codes = {"S": 0, "C": 1, "Q": 2}
    line[11] = embarked_codes.get(line[11], 3)

    return line


댓글
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
«   2024/04   »
1 2 3 4 5 6
7 8 9 10 11 12 13
14 15 16 17 18 19 20
21 22 23 24 25 26 27
28 29 30
글 보관함