Kaggle Titanic Case – Prediction Methods

Below, you will find a large code showing how to manipulate the data from the kaggle Titanic case. We will us pandas, seaborn, decision trees, random forest and xgboosting with also gridsearch method.

Have fun :

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [3]:
data_train = pd.read_csv('Data/train.csv')
data_test = pd.read_csv('Data/test.csv')
In [4]:
data_train.sample(3)
Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
51 52 0 3 Nosworthy, Mr. Richard Cater male 21.0 0 0 A/4. 39886 7.8000 NaN S
471 472 0 3 Cacic, Mr. Luka male 38.0 0 0 315089 8.6625 NaN S
693 694 0 3 Saad, Mr. Khalil male 25.0 0 0 2672 7.2250 NaN C
In [ ]:
####VISUALIZING DATA####
In [5]:
sns.barplot(x="Embarked", y="Survived", hue="Sex", data=data_train);
In [6]:
sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=data_train,
             palette={"male": "blue", "female" : "pink"},
             markers=["*","o"], linestyles=["-","--"]);
In [7]:
data_train.Age.describe()
Out[7]:
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64
In [50]:
compteur = 0
for i in range(len(data_train["Age"].isna())):
    if (data_train["Age"].isna()[i] == True):
        compteur = compteur + 1

compteur
Out[50]:
177
In [52]:
data_train.Fare.describe()
Out[52]:
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64
In [53]:
def simplify_ages(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories
    return df

def simplify_cabins(df):
    df.Cabin = df.Cabin.fillna('N')
    df.Cabin = df.Cabin.apply(lambda x: x[0])
    return df

def simplify_fares(df):
    df.Fare = df.Fare.fillna(-0.5)
    bins = (-1, 0, 8, 15, 31, 1000)
    group_names = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    categories = pd.cut(df.Fare, bins, labels=group_names)
    df.Fare = categories
    return df

def format_name(df):
    df['Lname'] = df.Name.apply(lambda x: x.split(' ')[0])
    df['NamePrefix'] = df.Name.apply(lambda x: x.split(' ')[1])
    return df    
    
def drop_features(df):
    return df.drop(['Ticket', 'Name', 'Embarked'], axis=1)

def transform_features(df):
    df = simplify_ages(df)
    df = simplify_cabins(df)
    df = simplify_fares(df)
    df = format_name(df)
    df = drop_features(df)
    return df

data_train = transform_features(data_train)
data_test = transform_features(data_test)
data_train.head()
Out[53]:
PassengerId Survived Pclass Sex Age SibSp Parch Fare Cabin Lname NamePrefix
0 1 0 3 male Student 1 0 1_quartile N Braund, Mr.
1 2 1 1 female Adult 1 0 4_quartile C Cumings, Mrs.
2 3 1 3 female Young Adult 0 0 1_quartile N Heikkinen, Miss.
3 4 1 1 female Young Adult 1 0 4_quartile C Futrelle, Mrs.
4 5 0 3 male Young Adult 0 0 2_quartile N Allen, Mr.
In [44]:
sns.barplot(x="Age", y="Survived", hue="Sex", data=data_train);
In [45]:
sns.barplot(x="Cabin", y="Survived", hue="Sex", data=data_train);
In [46]:
sns.barplot(x="Fare", y="Survived", hue="Sex", data=data_train);
In [54]:
from sklearn import preprocessing
def encode_features(df_train, df_test):
    features = ['Fare', 'Cabin', 'Age', 'Sex', 'Lname', 'NamePrefix']
    df_combined = pd.concat([df_train[features], df_test[features]])
    le_indic=['init']
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df_combined[feature])
        df_train[feature] = le.transform(df_train[feature])
        df_test[feature] = le.transform(df_test[feature])
        le_indic = [le_indic, le.classes_]
    return df_train, df_test, le_indic
    
data_train, data_test, le_indic = encode_features(data_train, data_test)
data_train.head()
le_indic
Out[54]:
[[[[[[['init'],
      array(['1_quartile', '2_quartile', '3_quartile', '4_quartile', 'Unknown'], dtype=object)],
     array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'N', 'T'], dtype=object)],
    array(['Adult', 'Baby', 'Child', 'Senior', 'Student', 'Teenager',
           'Unknown', 'Young Adult'], dtype=object)],
   array(['female', 'male'], dtype=object)],
  array(['Abbing,', 'Abbott,', 'Abelseth,', 'Abelson,', 'Abrahamsson,',
         'Abrahim,', 'Adahl,', 'Adams,', 'Ahlin,', 'Aks,', 'Albimona,',
         'Aldworth,', 'Alexander,', 'Alhomaki,', 'Ali,', 'Allen,',
         'Allison,', 'Allum,', 'Andersen,', 'Andersen-Jensen,', 'Anderson,',
         'Andersson,', 'Andreasson,', 'Andrew,', 'Andrews,', 'Angheloff,',
         'Angle,', 'Appleton,', 'Arnold-Franchi,', 'Aronsson,',
         'Artagaveytia,', 'Ashby,', 'Asim,', 'Asplund,', 'Assaf', 'Assaf,',
         'Assam,', 'Astor,', 'Attalah,', 'Aubart,', 'Augustsson,', 'Ayoub,',
         'Baccos,', 'Backstrom,', 'Baclini,', 'Badman,', 'Badt,', 'Bailey,',
         'Baimbrigge,', 'Balkic,', 'Ball,', 'Banfield,', 'Barah,',
         'Barbara,', 'Barber,', 'Barkworth,', 'Barry,', 'Barton,',
         'Bateman,', 'Baumann,', 'Baxter,', 'Bazzani,', 'Beane,', 'Beattie,',
         'Beauchamp,', 'Beavan,', 'Becker,', 'Beckwith,', 'Beesley,',
         'Behr,', 'Bengtsson,', 'Bentham,', 'Berglund,', 'Berriman,',
         'Betros,', 'Bidois,', 'Bing,', 'Bird,', 'Birkeland,', 'Birnbaum,',
         'Bishop,', 'Bissette,', 'Bjorklund,', 'Bjornstrom-Steffansson,',
         'Blackwell,', 'Blank,', 'Bonnell,', 'Borebank,', 'Bostandyeff,',
         'Botsford,', 'Boulos,', 'Bourke,', 'Bowen,', 'Bowenur,',
         'Bowerman,', 'Bracken,', 'Bradley,', 'Brady,', 'Braf,', 'Brandeis,',
         'Braund,', 'Brewe,', 'Brobeck,', 'Brocklebank,', 'Brown,', 'Bryhl,',
         'Buckley,', 'Bucknell,', 'Burke,', 'Burns,', 'Buss,', 'Butler,',
         'Butt,', 'Byles,', 'Bystrom,', 'Cacic,', 'Cairns,', 'Calderhead,',
         'Caldwell,', 'Calic,', 'Cameron,', 'Campbell,', 'Canavan,',
         'Candee,', 'Cann,', 'Caram,', 'Carbines,', 'Cardeza,', 'Carlsson,',
         'Carr,', 'Carrau,', 'Carter,', 'Carver,', 'Case,', 'Cassebeer,',
         'Cavendish,', 'Celotti,', 'Chaffee,', 'Chambers,', 'Chapman,',
         'Charters,', 'Chaudanson,', 'Cherry,', 'Chevre,', 'Chibnall,',
         'Chip,', 'Chisholm,', 'Christmann,', 'Christy,', 'Chronopoulos,',
         'Clark,', 'Clarke,', 'Cleaver,', 'Clifford,', 'Coelho,', 'Cohen,',
         'Colbert,', 'Coleff,', 'Coleridge,', 'Collander,', 'Collett,',
         'Colley,', 'Collyer,', 'Compton,', 'Conlon,', 'Connaghton,',
         'Connolly,', 'Connors,', 'Cook,', 'Cor,', 'Corbett,', 'Corey,',
         'Corn,', 'Cornell,', 'Cotterill,', 'Coutts,', 'Coxon,', 'Crafton,',
         'Crease,', 'Cribb,', 'Crosby,', 'Culumovic,', 'Cumings,',
         'Cunningham,', 'Daher,', 'Dahl,', 'Dahlberg,', 'Dakic,', 'Daly,',
         'Danbom,', 'Daniel,', 'Daniels,', 'Danoff,', 'Dantcheff,',
         'Davidson,', 'Davies,', 'Davis,', 'Davison,', 'Deacon,', 'Dean,',
         'Delalic,', 'Demetri,', 'Denbury,', 'Denkoff,', 'Dennis,',
         'Devaney,', 'Dibden,', 'Dick,', 'Dika,', 'Dimic,', 'Dintcheff,',
         'Dodge,', 'Doharr,', 'Doling,', 'Dooley,', 'Dorking,', 'Douglas,',
         'Dowdell,', 'Downton,', 'Doyle,', 'Drapkin,', 'Drazenoic,', 'Drew,',
         'Duane,', 'Duff', 'Dulles,', 'Duquemin,', 'Duran', 'Dyker,',
         'Earnshaw,', 'Edvardsson,', 'Eitemiller,', 'Eklund,', 'Ekstrom,',
         'Elias,', 'Elsbury,', 'Emanuel,', 'Emir,', 'Enander,', 'Endres,',
         'Eustis,', 'Evans,', 'Everett,', 'Fahlstrom,', 'Farrell,',
         'Farthing,', 'Faunthorpe,', 'Fillbrook,', 'Finoli,', 'Fischer,',
         'Flegenheim,', 'Fleming,', 'Flynn,', 'Foley,', 'Foo,', 'Ford,',
         'Foreman,', 'Fortune,', 'Fox,', 'Francatelli,', 'Franklin,',
         'Frauenthal,', 'Frolicher,', 'Frolicher-Stehli,', 'Frost,', 'Fry,',
         'Funk,', 'Futrelle,', 'Fynney,', 'Gale,', 'Gallagher,', 'Garfirth,',
         'Garside,', 'Gaskell,', 'Gavey,', 'Gee,', 'Geiger,', 'Gheorgheff,',
         'Gibson,', 'Giglio,', 'Gilbert,', 'Giles,', 'Gilinski,', 'Gill,',
         'Gillespie,', 'Gilnagh,', 'Givard,', 'Glynn,', 'Goldenberg,',
         'Goldschmidt,', 'Goldsmith,', 'Goncalves,', 'Goodwin,', 'Gracie,',
         'Graham,', 'Green,', 'Greenberg,', 'Greenfield,', 'Gronnestad,',
         'Guest,', 'Guggenheim,', 'Gustafsson,', 'Haas,', 'Hagardon,',
         'Hagland,', 'Hakkarainen,', 'Hale,', 'Hamalainen,', 'Hampe,',
         'Hanna,', 'Hansen,', 'Harbeck,', 'Harder,', 'Harknett,', 'Harmer,',
         'Harper,', 'Harrington,', 'Harris,', 'Harrison,', 'Hart,',
         'Hassab,', 'Hassan,', 'Hawksford,', 'Hays,', 'Head,', 'Healy,',
         'Hedman,', 'Hee,', 'Hegarty,', 'Heikkinen,', 'Heininen,',
         'Hellstrom,', 'Hendekovic,', 'Henriksson,', 'Henry,', 'Herman,',
         'Hewlett,', 'Hickman,', 'Hilliard,', 'Hiltunen,', 'Hipkins,',
         'Hippach,', 'Hirvonen,', 'Hocking,', 'Hodges,', 'Hogeboom,',
         'Hold,', 'Holm,', 'Holthen,', 'Holverson,', 'Homer,', 'Honkanen,',
         'Hood,', 'Horgan,', 'Hosono,', 'Howard,', 'Hoyt,', 'Humblen,',
         'Hunt,', 'Hyman,', 'Ibrahim', 'Icard,', 'Ilett,', 'Ilieff,',
         'Ilmakangas,', 'Isham,', 'Ismay,', 'Ivanoff,', 'Jacobsohn,',
         'Jalsevac,', 'Jansson,', 'Jardin,', 'Jarvis,', 'Jefferys,',
         'Jenkin,', 'Jensen,', 'Jermyn,', 'Jerwan,',
         'Johannesen-Bratthammer,', 'Johanson,', 'Johansson', 'Johansson,',
         'Johnson,', 'Johnston,', 'Jones,', 'Jonkoff,', 'Jonsson,',
         'Julian,', 'Jussila,', 'Kallio,', 'Kalvik,', 'Kantor,', 'Karaic,',
         'Karlsson,', 'Karnes,', 'Karun,', 'Kassem,', 'Katavelas,', 'Keane,',
         'Keefe,', 'Keeping,', 'Kelly,', 'Kennedy,', 'Kent,', 'Kenyon,',
         'Khalil,', 'Kiernan,', 'Kilgannon,', 'Kimball,', 'Kink,',
         'Kink-Heilmann,', 'Kirkland,', 'Klaber,', 'Klasen,', 'Knight,',
         'Kraeff,', 'Krekorian,', 'Kreuchen,', 'Kvillner,', 'Lahoud,',
         'Lahtinen,', 'Laitinen,', 'Laleff,', 'Lam,', 'Lamb,', 'Landergren,',
         'Lane,', 'Lang,', 'Laroche,', 'Larsson,', 'Larsson-Rondberg,',
         'LeRoy,', 'Leader,', 'Leeni,', 'Lefebre,', 'Lehmann,', 'Leinonen,',
         'Leitch,', 'Lemberopolous,', 'Lemore,', 'Lennon,', 'Leonard,',
         'Lester,', 'Lesurer,', 'Levy,', 'Lewy,', 'Leyson,', 'Lievens,',
         'Lindahl,', 'Lindblom,', 'Lindeberg-Lind,', 'Lindell,',
         'Lindqvist,', 'Lindstrom,', 'Linehan,', 'Lines,', 'Ling,',
         'Lingane,', 'Lithman,', 'Lobb,', 'Lockyer,', 'Long,', 'Longley,',
         'Loring,', 'Louch,', 'Lovell,', 'Lulic,', 'Lundahl,', 'Lundin,',
         'Lundstrom,', 'Lurette,', 'Lyntakoff,', 'MacKay,', 'Mack,',
         'Madigan,', 'Madill,', 'Madsen,', 'Maenpaa,', 'Maguire,', 'Mahon,',
         'Maioni,', 'Maisner,', 'Makinen,', 'Malachard,', 'Mallet,',
         'Mamee,', 'Mangan,', 'Mangiavacchi,', 'Mannion,', 'Mardirosian,',
         'Marechal,', 'Markoff,', 'Markun,', 'Marvin,', 'Masselmani,',
         'Matinoff,', 'Matthews,', 'Maybery,', 'Mayne,', 'McCaffry,',
         'McCarthy,', 'McCormack,', 'McCoy,', 'McCrae,', 'McCrie,',
         'McDermott,', 'McEvoy,', 'McGough,', 'McGovern,', 'McGowan,',
         'McKane,', 'McMahon,', 'McNamee,', 'McNeill,', 'Meanwell,', 'Meek,',
         'Mellinger,', 'Mellors,', 'Meo,', 'Mernagh,', 'Meyer,', 'Midtsjo,',
         'Miles,', 'Millet,', 'Milling,', 'Minahan,', 'Mineff,', 'Minkoff,',
         'Mionoff,', 'Mitchell,', 'Mitkoff,', 'Mock,', 'Mockler,', 'Moen,',
         'Molson,', 'Montvila,', 'Moor,', 'Moore,', 'Moran,', 'Moraweck,',
         'Morley,', 'Morrow,', 'Moss,', 'Moubarek,', 'Moussa,', 'Moutal,',
         'Mudd,', 'Mullens,', 'Mulvihill,', 'Murdlin,', 'Murphy,',
         'Myhrman,', 'Myles,', 'Naidenoff,', 'Najib,', 'Nakid,',
         'Nancarrow,', 'Nankoff,', 'Nasr,', 'Nasser,', 'Natsch,',
         'Naughton,', 'Navratil,', 'Nenkoff,', 'Nesson,', 'Newell,',
         'Newsom,', 'Nicholls,', 'Nicholson,', 'Nicola-Yarred,', 'Nieminen,',
         'Niklasson,', 'Nilsson,', 'Nirva,', 'Niskanen,', 'Norman,',
         'Nosworthy,', 'Nourney,', 'Novel,', 'Nye,', 'Nysten,', 'Nysveen,',
         "O'Brien,", "O'Connell,", "O'Connor,", "O'Donoghue,", "O'Driscoll,",
         "O'Dwyer,", "O'Keefe,", "O'Leary,", "O'Sullivan,", 'Odahl,',
         'Ohman,', 'Oliva', 'Olsen,', 'Olsson,', 'Olsvigen,', 'Omont,',
         'Oreskovic,', 'Osen,', 'Osman,', 'Ostby,', 'Otter,', 'Ovies',
         'Oxenham,', 'Padro', 'Pain,', 'Pallas', 'Palsson,', 'Panula,',
         'Parker,', 'Parkes,', 'Parr,', 'Parrish,', 'Partner,', 'Pasic,',
         'Patchett,', 'Paulner,', 'Pavlovic,', 'Payne,', 'Peacock,',
         'Pearce,', 'Pears,', 'Pedersen,', 'Peduzzi,', 'Pekoniemi,',
         'Peltomaki,', 'Penasco', 'Pengelly,', 'Perkin,', 'Pernot,',
         'Perreault,', 'Persson,', 'Peruschitz,', 'Peter,', 'Peters,',
         'Petersen,', 'Petranec,', 'Petroff,', 'Petterson,', 'Pettersson,',
         'Peuchen,', 'Phillips,', 'Pickard,', 'Pinsky,', 'Plotcharsky,',
         'Pokrnic,', 'Ponesell,', 'Portaluppi,', 'Porter,', 'Potter,',
         'Pulbaum,', 'Quick,', 'Radeff,', 'Rasmussen,', 'Razi,', 'Reed,',
         'Reeves,', 'Rekic,', 'Renouf,', 'Reuchlin,', 'Reynaldo,',
         'Reynolds,', 'Rheims,', 'Rice,', 'Richard,', 'Richards,',
         'Ridsdale,', 'Riihivouri,', 'Ringhini,', 'Rintamaki,', 'Riordan,',
         'Risien,', 'Robbins,', 'Robert,', 'Robins,', 'Roebling,', 'Rogers,',
         'Romaine,', 'Rommetvedt,', 'Rood,', 'Rosblom,', 'Rosenbaum,',
         'Rosenshine,', 'Ross,', 'Roth,', 'Rothes,', 'Rothschild,', 'Rouse,',
         'Rowe,', 'Rugg,', 'Rush,', 'Ryan,', 'Ryerson,', 'Saad,', 'Saade,',
         'Saalfeld,', 'Sadlier,', 'Sadowitz,', 'Saether,', 'Sage,',
         'Sagesser,', 'Salander,', 'Salkjelsvik,', 'Salomon,', 'Salonen,',
         'Samaan,', 'Sandstrom,', 'Sap,', 'Saundercock,', 'Sawyer,',
         'Scanlan,', 'Schabert,', 'Schmidt,', 'Sdycoff,', 'Sedgwick,',
         'Serepeca,', 'Seward,', 'Sharp,', 'Shaughnessy,', 'Sheerlinck,',
         'Shellard,', 'Shelley,', 'Shine,', 'Shorney,', 'Shutes,', 'Silven,',
         'Silverthorne,', 'Silvey,', 'Simmons,', 'Simonius-Blumer,',
         'Sincock,', 'Sinkkonen,', 'Sirayanian,', 'Sirota,', 'Sivic,',
         'Sivola,', 'Sjoblom,', 'Sjostedt,', 'Skoog,', 'Slabenoff,',
         'Slayter,', 'Slemen,', 'Slocovski,', 'Sloper,', 'Smart,',
         'Smiljanic,', 'Smith,', 'Smyth,', 'Snyder,', 'Sobey,', 'Soholt,',
         'Somerton,', 'Spector,', 'Spedden,', 'Spencer,', 'Spinner,',
         'Stahelin-Maeglin,', 'Staneff,', 'Stankovic,', 'Stanley,',
         'Stanton,', 'Stead,', 'Stengel,', 'Stephenson,', 'Stewart,',
         'Stokes,', 'Stone,', 'Storey,', 'Stoytcheff,', 'Strandberg,',
         'Stranden,', 'Straus,', 'Strilic,', 'Strom,', 'Sunderland,',
         'Sundman,', 'Sutehall,', 'Sutton,', 'Svensson,', 'Swane,', 'Sweet,',
         'Swift,', 'Taussig,', 'Taylor,', 'Tenglin,', 'Thayer,', 'Theobald,',
         'Thomas,', 'Thomson,', 'Thorne,', 'Thorneycroft,', 'Tikkanen,',
         'Tobin,', 'Todoroff,', 'Tomlin,', 'Toomey,', 'Torber,', 'Torfa,',
         'Tornquist,', 'Toufik,', 'Touma,', 'Troupiansky,', 'Trout,',
         'Troutt,', 'Tucker,', 'Turcin,', 'Turja,', 'Turkula,', 'Turpin,',
         'Uruchurtu,', 'Van', 'Vande', 'Vanden', 'Vander', 'Vartanian,',
         'Veal,', 'Vendel,', 'Vestrom,', 'Vovk,', 'Waelens,', 'Walcroft,',
         'Walker,', 'Ward,', 'Ware,', 'Warren,', 'Watson,', 'Watt,',
         'Webber,', 'Weir,', 'Weisz,', 'Wells,', 'Wenzel,', 'West,',
         'Whabee,', 'Wheadon,', 'Wheeler,', 'White,', 'Wick,', 'Widegren,',
         'Widener,', 'Wiklund,', 'Wilhelms,', 'Wilkes,', 'Willard,',
         'Willer,', 'Willey,', 'Williams,', 'Williams-Lambert,', 'Wilson,',
         'Windelov,', 'Wirz,', 'Wiseman,', 'Wittevrongel,', 'Woolner,',
         'Wright,', 'Yasbeck,', 'Young,', 'Youseff,', 'Yousif,', 'Yousseff,',
         'Yrois,', 'Zabour,', 'Zakarian,', 'Zimmerman,', 'de', 'del', 'van'], dtype=object)],
 array(['Billiard,', 'Brito,', 'Capt.', 'Carlo,', 'Col.', 'Cruyssen,',
        'Don.', 'Dr.', 'Gordon,', 'Impe,', 'Jonkheer.', 'Khalil,', 'Major.',
        'Master.', 'Melkebeke,', 'Messemaeker,', 'Miss.', 'Mlle.', 'Mme.',
        'Mr.', 'Mrs.', 'Ms.', 'Mulder,', 'Palmquist,', 'Pelsmaeker,',
        'Planke,', 'Rev.', 'Shawah,', 'Steen,', 'Velde,', 'Walle,', 'der',
        'the', 'y'], dtype=object)]
In [57]:
from sklearn.model_selection import train_test_split

X_all = data_train.drop(['Survived', 'PassengerId'], axis=1)
y_all = data_train['Survived']

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)
y_test.head()
Out[57]:
727    1
668    0
210    0
654    0
228    0
Name: Survived, dtype: int64
In [ ]:
####DECISION TREE####
In [ ]:
###CLASSIFICATION###
In [197]:
from sklearn import tree
import graphviz 
import tensorflow as tf

DTC = tree.DecisionTreeClassifier(max_leaf_nodes=10)
DTC = DTC.fit(X_train, y_train)
In [198]:
dot_data = tree.export_graphviz(DTC, out_file=None, 
                                feature_names=X_train.columns.values.tolist(),
                                class_names=['Survived','Died'], 
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data) 
graph.render("titanic") 
graph
Out[198]:




Tree 0 Sex ≤ 0.5 gini = 0.476 samples = 712 value = [434, 278] class = Survived 1 Pclass ≤ 2.5 gini = 0.379 samples = 256 value = [65, 191] class = Died 0->1 True 2 Cabin ≤ 6.0 gini = 0.309 samples = 456 value = [369, 87] class = Survived 0->2 False 3 gini = 0.095 samples = 140 value = [7, 133] class = Died 1->3 4 Fare ≤ 2.5 gini = 0.5 samples = 116 value = [58, 58] class = Survived 1->4 7 Lname ≤ 637.0 gini = 0.493 samples = 102 value = [45, 57] class = Died 4->7 8 gini = 0.133 samples = 14 value = [13, 1] class = Survived 4->8 17 gini = 0.471 samples = 79 value = [30, 49] class = Died 7->17 18 gini = 0.454 samples = 23 value = [15, 8] class = Survived 7->18 5 NamePrefix ≤ 16.0 gini = 0.488 samples = 85 value = [49, 36] class = Survived 2->5 6 NamePrefix ≤ 13.5 gini = 0.237 samples = 371 value = [320, 51] class = Survived 2->6 9 gini = 0.26 samples = 13 value = [2, 11] class = Died 5->9 10 gini = 0.453 samples = 72 value = [47, 25] class = Survived 5->10 11 Lname ≤ 587.5 gini = 0.478 samples = 33 value = [20, 13] class = Survived 6->11 12 gini = 0.2 samples = 338 value = [300, 38] class = Survived 6->12 13 SibSp ≤ 2.0 gini = 0.43 samples = 16 value = [5, 11] class = Died 11->13 14 gini = 0.208 samples = 17 value = [15, 2] class = Survived 11->14 15 gini = 0.165 samples = 11 value = [1, 10] class = Died 13->15 16 gini = 0.32 samples = 5 value = [4, 1] class = Survived 13->16

In [ ]:
###REGRESSION###
In [199]:
DTR = tree.DecisionTreeRegressor()
DTR = clf.fit(X_train, y_train)
In [200]:
y_pred = DTR.predict(X_test)
In [201]:
correct_prediction = tf.equal(y_pred, list(y_test))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

session = tf.Session()
session.run(accuracy)
Out[201]:
0.75418997
In [ ]:
####RANDOM FOREST####
In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
RFC = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10, 15], 
              'min_samples_split': [2, 3, 5, 10],
              'min_samples_leaf': [1,5,8, 15]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(RFC, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
RFC = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
RFC.fit(X_train, y_train)
Out[60]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=15, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=9, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [205]:
predictions = RFC.predict(X_test)
print(accuracy_score(y_test, predictions))
0.804469273743
In [ ]:
####XGBOOSTING####
In [61]:
import xgboost as xgb

gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X_train, y_train)
predictions = gbm.predict(X_test)

print(accuracy_score(y_test, predictions))
0.821229050279
In [64]:
gbm = xgb.XGBClassifier()

# Choose some parameter combinations to try
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05, 0.1], #so called `eta` value
              'max_depth': [3, 6],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5,20,50,100,300], #number of trees, change it to 1000 for better results
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(gbm, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
RFC = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
RFC.fit(X_train, y_train)
Out[64]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=20, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)
In [66]:
predictions = RFC.predict(X_test)
print(accuracy_score(y_test, predictions))
0.826815642458