csv 파일을 dataframe으로 읽어오기

In [2]:
import pandas as pd

df1 = pd.read_csv("./sungjuk.csv")

df1
Out[2]:
번호 이름 성별 국어 영어 수학
0 1 강민경 98 96 76
1 2 강순애 94 79 60
2 3 강영하 55 47 93
3 4 강혜정 99 76 78
4 5 권명숙 98 73 61
... ... ... ... ... ... ...
95 96 하혜연 96 96 71
96 97 한경규 96 94 95
97 98 한수정 93 97 77
98 99 한의병 93 59 63
99 100 한정희 93 78 52

100 rows × 6 columns

In [3]:
import pandas as pd
import seaborn as sns

df2 = sns.load_dataset('titanic')
df2
Out[3]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 0 3 male 22.0 1 0 7.2500 S Third man True NaN Southampton no False
1 1 1 female 38.0 1 0 71.2833 C First woman False C Cherbourg yes False
2 1 3 female 26.0 0 0 7.9250 S Third woman False NaN Southampton yes True
3 1 1 female 35.0 1 0 53.1000 S First woman False C Southampton yes False
4 0 3 male 35.0 0 0 8.0500 S Third man True NaN Southampton no True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 0 2 male 27.0 0 0 13.0000 S Second man True NaN Southampton no True
887 1 1 female 19.0 0 0 30.0000 S First woman False B Southampton yes True
888 0 3 female NaN 1 2 23.4500 S Third woman False NaN Southampton no False
889 1 1 male 26.0 0 0 30.0000 C First man True C Cherbourg yes True
890 0 3 male 32.0 0 0 7.7500 Q Third man True NaN Queenstown no True

891 rows × 15 columns

dataframe 내용보기

In [4]:
df1.head()
df1.tail()
Out[4]:
번호 이름 성별 국어 영어 수학
95 96 하혜연 96 96 71
96 97 한경규 96 94 95
97 98 한수정 93 97 77
98 99 한의병 93 59 63
99 100 한정희 93 78 52

데이터 요약 정보 확인

In [5]:
df1.shape
df1.info()
df2.count()
df2.value_counts()
df1['성별'].value_counts()
df2['pclass'].value_counts()
df2[['pclass','sex']].value_counts()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   번호      100 non-null    int64 
 1   이름      100 non-null    object
 2   성별      100 non-null    object
 3   국어      100 non-null    int64 
 4   영어      100 non-null    int64 
 5   수학      100 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 4.8+ KB
Out[5]:
pclass  sex   
3       male      347
        female    144
1       male      122
2       male      108
1       female     94
2       female     76
dtype: int64

통계함수

In [6]:
df1.mean()
df1['국어'].mean()
df1[['국어','영어']].mean()
df1.loc[10:50, ['국어','영어']].mean()
df1.iloc[10:50, 3:6].mean()

df1.median()
df1.max()
df1.min()
df1.std()
df1.var()
df1.corr()
df1[['국어','영어', '수학']].corr()
<ipython-input-6-849ec4cd70f4>:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df1.mean()
<ipython-input-6-849ec4cd70f4>:7: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df1.median()
<ipython-input-6-849ec4cd70f4>:10: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df1.std()
<ipython-input-6-849ec4cd70f4>:11: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df1.var()
Out[6]:
국어 영어 수학
국어 1.000000 0.311257 0.176915
영어 0.311257 1.000000 0.171293
수학 0.176915 0.171293 1.000000

내장 그래프

In [7]:
df3 = df1.copy()
df3.columns = ['no', 'name', 'gender', 'kor', 'eng', 'mat']
In [8]:
#df3.plot()
#df3['kor'].plot()
#df3['eng'].plot()
#df3[['kor','eng']].plot()

#df3['kor'].plot(kind='bar')

#df3['kor'].plot(kind='hist')

#df3.plot(x='kor', y='eng', kind='scatter')

#df3['kor'].plot(kind='box')
df3.plot(kind='box')
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd065ce27c0>

누락데이터 처리

In [12]:
df2.head()
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB

dropna=False 옵션 : NaN 도 개수를 세라

In [16]:
df2['deck'].value_counts()
df2['deck'].value_counts(dropna=False)
Out[16]:
NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: deck, dtype: int64

.isnull(), .notnull()

In [17]:
df2.head().isnull()
Out[17]:
survived pclass sex age sibsp parch fare embarked class who adult_male deck embark_town alive alone
0 False False False False False False False False False False False True False False False
1 False False False False False False False False False False False False False False False
2 False False False False False False False False False False False True False False False
3 False False False False False False False False False False False False False False False
4 False False False False False False False False False False False True False False False