########TRAIN
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("d:\\data\\train.csv")
pd.set_option('display.max_columns',15)
rdf = df.drop(['Cabin'],axis=1)
# 운임 이상치 제거
local_std = rdf.Fare.std()*5
rdf['Fare'] = rdf['Fare'][rdf['Fare']<local_std]
#rdf = rdf.dropna(subset=['Age'], how='any',axis=0)
# 나이 na값 31 로
most_freq =rdf['Age'].value_counts(dropna=True).idxmax()
rdf['Age'].fillna(31,inplace=True)
# embarked 최빈값으로
most_freq =rdf['Embarked'].value_counts(dropna=True).idxmax()
rdf['Embarked'].fillna(most_freq,inplace=True)
# 운임 최빈값으로
most_freq =rdf['Fare'].value_counts(dropna=True).idxmax()
rdf['Fare'].fillna(most_freq,inplace=True)
ndf= rdf[['Survived','Pclass','Sex','Age','SibSp','Parch','Embarked','Fare']]
# child 와 woman 파생변수 생성
mask = ( ndf.Age < 10) | (ndf.Sex == 'female')
mask2 = (ndf)
ndf['child_women'] = mask.astype(int)
### 혼자 여행온 사람
ndf['relatives'] = ndf['SibSp'] + ndf['Parch']
ndf.loc[ndf['relatives'] > 0, 'travelled_alone'] = 0
ndf.loc[ndf['relatives'] == 0, 'travelled_alone'] = 1
###FATHER 아빠는 가족들 챙기느라 죽었을것 같음
mask1 = ( ndf.relatives > 4) | (ndf.Sex == 'male')
ndf['Father'] = mask1.astype(int)
## 더미 생성
gender=pd.get_dummies(ndf['Sex'])
ndf=pd.concat([ndf,gender],axis=1)
onehot_embarked=pd.get_dummies(ndf['Embarked'],prefix='town')
ndf=pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['Sex','Embarked'],axis=1,inplace=True)
X=ndf[['Pclass','Father','travelled_alone', 'Age','Fare','female', 'male','Fare','town_C', 'town_Q', 'town_S','child_women']]
y=ndf['Survived']
######TEST#########
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df1 = pd.read_csv("d:\\data\\test.csv")
pd.set_option('display.max_columns',15)
rdf = df1.drop(['Cabin'],axis=1)
most_freq =rdf['Embarked'].value_counts(dropna=True).idxmax()
rdf['Embarked'].fillna(most_freq,inplace=True)
most_freq =rdf['Fare'].value_counts(dropna=True).idxmax()
rdf['Fare'].fillna(most_freq,inplace=True)
most_freq =rdf['Age'].value_counts(dropna=True).idxmax()
rdf['Age'].fillna(31,inplace=True)
ndf= rdf[['Pclass','Sex','Age','SibSp','Parch','Embarked','Fare']]
mask = ( ndf.Age < 10) | (ndf.Sex == 'female')
ndf['child_women'] = mask.astype(int)
#relatives
ndf['relatives'] = ndf['SibSp'] + ndf['Parch']
ndf.loc[ndf['relatives'] > 0, 'travelled_alone'] = 1
ndf.loc[ndf['relatives'] == 0, 'travelled_alone'] = 0
##FATHER
mask1 = ( ndf.relatives > 4) | (ndf.Sex == 'male')
ndf['Father'] = mask1.astype(int)
gender=pd.get_dummies(ndf['Sex'])
ndf=pd.concat([ndf,gender],axis=1)
onehot_embarked=pd.get_dummies(ndf['Embarked'],prefix='town')
ndf=pd.concat([ndf,onehot_embarked],axis=1)
ndf.drop(['Sex','Embarked'],axis=1,inplace=True)
test=ndf[['Pclass','SibSp', 'Parch', 'Age','travelled_alone','Fare','female', 'male','Fare','town_C', 'town_Q', 'town_S','child_women']]
from sklearn import preprocessing
test=preprocessing.StandardScaler().fit(test).transform(test)
from sklearn import preprocessing
X=preprocessing.StandardScaler().fit(X).transform(X)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=19)
print('train data 의 갯수:',X_train.shape) # (478,9)
print('test data의 갯수:',X_test.shape) # (205,9)
from sklearn.ensemble import RandomForestClassifier
tree_model = RandomForestClassifier( n_estimators=800,
oob_score=True,
random_state= 9 )
tree_model.fit( X, y )
# 7단계 테스트 데이터로 예측을 한다.
y_hat = tree_model.predict( test )
print ( tree_model.oob_score_)
0.8204264870931538
제가 뽑은 파생변수는 Father 와 trabelled alone 입니다 ^^
Pclass SibSp Father Parch Age travelled_alone Fare female male \
0 3 0 1 0 34.5 0.0 7.8292 0 1
1 3 1 0 0 47.0 1.0 7.0000 1 0
2 2 0 1 0 62.0 0.0 9.6875 0 1
3 3 0 1 0 27.0 0.0 8.6625 0 1
4 3 1 0 1 22.0 1.0 12.2875 1 0
town_C town_Q town_S child_women
0 0 1 0 0
1 0 0 1 1
2 0 1 0 0
3 0 0 1 0
4 0 0 1 1
제 캐글 점수는 0.75598 에요 ^^
뭐가 문제일까요?
첫댓글 수정아 테스트 데이터의 나이의 결측치를 21로 갱신하는 코드가 없구나
수정해도 0.75598 입니다 ㅠㅠ