🥞 BE
home

Titanic: 타이타닉 데이터셋

import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import numpy as np

# Relative path to the training set; the file is decoded as CP949.
file_path = 'titanic/train.csv'
titanic_data = pd.read_csv(file_path, encoding='CP949')
titanic_data
Python
복사
# Create backups: an in-memory copy plus a CSV snapshot on disk.
titanic_data_copy_backup = titanic_data.copy()

file_path = 'titanic/train_backup.csv'
# index=False keeps the RangeIndex out of the file; otherwise reading the
# backup back produces an extra "Unnamed: 0" column the original lacks.
titanic_data.to_csv(file_path, index=False)

# Read the snapshot back to confirm it round-trips.
titanic_data_csv_backup = pd.read_csv(file_path)
titanic_data_csv_backup
Python
복사
# Print each column's name, non-null count and dtype.
titanic_data.info()
Python
복사
컬럼별 데이터 구성 및 타입 확인
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
Plain Text
복사
# Show descriptive statistics for the frame.
titanic_data.describe()
Python
복사
# Encode Sex as integers: male -> 0, female -> 1.
# Series.map turns any value missing from the mapping into NaN, so running
# this a second time on the already-encoded column would blank it out.
sex_dict = dict(male=0, female=1)
titanic_data['Sex'] = titanic_data['Sex'].map(sex_dict)
Python
복사
형변환
# Per-port average of every numeric column.
# numeric_only=True skips the text columns (Name, Ticket, Cabin); without it
# pandas 2.x raises TypeError when it tries to average strings.
titanic_data.groupby(['Embarked']).mean(numeric_only=True)
Python
복사
평균값
# Encode the embarkation port as integers: C -> 0, Q -> 1, S -> 2.
# Missing ports stay NaN after the mapping.
Embarked_dict = {port: code for code, port in enumerate(['C', 'Q', 'S'])}
titanic_data['Embarked'] = titanic_data['Embarked'].map(Embarked_dict)
Python
복사
형변환
# Display the frame to inspect the encoded Sex and Embarked columns.
titanic_data
Python
복사
import missingno as msno

# Count the missing values in each column.
titanic_data.isnull().sum()
Python
복사
Age, Cabin, Embarked 컬럼에 결측치 확인
# Draw missingno's matrix view of the frame to see where values are missing.
msno.matrix(titanic_data)
Python
복사
# Remove the Ticket and Cabin columns from the working frame.
titanic_data = titanic_data.drop(columns=['Ticket', 'Cabin'])
Python
복사
# Fill the gaps: Age with the column median, Embarked with its most frequent value.
age_median = titanic_data['Age'].median()
embarked_mode = titanic_data['Embarked'].mode()[0]

titanic_data['Age'] = titanic_data['Age'].fillna(age_median)
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(embarked_mode)
Python
복사
Age 컬럼은 중간값, Embarked 컬럼은 최빈값으로 채우기
# Re-count missing values per column after the fill step.
titanic_data.isnull().sum()
Python
복사
# Cast Age and Embarked to 64-bit integers now that they contain no NaN.
# Casting floats to int discards any fractional part.
titanic_data = titanic_data.astype({'Age': np.int64, 'Embarked': np.int64})
Python
복사
# Print column names, non-null counts and dtypes again to confirm the cleanup.
titanic_data.info()
Python
복사
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null int64 5 Age 891 non-null int64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Fare 891 non-null float64 9 Embarked 891 non-null int64 dtypes: float64(1), int64(8), object(1) memory usage: 69.7+ KB
Plain Text
복사
# Lay six count plots out on a 3x2 grid inside one 15x15-inch figure.
figure, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(nrows=3, ncols=2)
figure.set_size_inches(15, 15)

# The column must be passed as x=...: in seaborn >= 0.12 the first positional
# parameter of countplot is `data`, so a positional column name combined with
# data=... raises TypeError (seaborn 0.11 only emitted a FutureWarning).
sns.countplot(x='Survived', data=titanic_data, ax=ax1)
sns.countplot(x='Pclass', data=titanic_data, ax=ax2)
sns.countplot(x='Sex', data=titanic_data, ax=ax3)
sns.countplot(x='SibSp', data=titanic_data, ax=ax4)
sns.countplot(x='Parch', data=titanic_data, ax=ax5)
sns.countplot(x='Embarked', data=titanic_data, ax=ax6)
Python
복사
def bar_chart(feature):
    """Plot a stacked bar chart of `feature` counts for survivors and non-survivors.

    Reads the module-level ``titanic_data`` frame. It produces one bar for
    rows with ``Survived == 1`` and one for ``Survived == 0``, each stacked by
    the distinct values of `feature`.

    Args:
        feature: Name of the column in ``titanic_data`` to break the counts down by.
    """
    is_survivor = titanic_data['Survived'] == 1
    is_dead = titanic_data['Survived'] == 0

    # One row of value counts per outcome; the columns are the feature's values.
    counts_by_outcome = pd.DataFrame([
        titanic_data.loc[is_survivor, feature].value_counts(),
        titanic_data.loc[is_dead, feature].value_counts(),
    ])
    counts_by_outcome.index = ['Survived', 'Dead']
    counts_by_outcome.plot(kind='bar', stacked=True, figsize=(10, 5))
Python
복사
# Survived vs. dead counts, stacked by Sex.
bar_chart('Sex')
Python
복사
# Survived vs. dead counts, stacked by Pclass.
bar_chart('Pclass')
Python
복사
# Survived vs. dead counts, stacked by SibSp.
bar_chart('SibSp')
Python
복사
# Survived vs. dead counts, stacked by Parch.
bar_chart('Parch')
Python
복사
# Survived vs. dead counts, stacked by Embarked.
bar_chart('Embarked')
Python
복사