#library
from os import chdir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, skew
from scipy import stats
from subprocess import check_output
from collections import Counter
# Load train.csv into `data`.
# NOTE(review): the working directory is a hard-coded, machine-specific path.
chdir('C:\\Users\\myyou\\Desktop\\house')
data = pd.read_csv('train.csv')  # read_csv already returns a DataFrame

# Quick look at the raw data (only displayed in an interactive session).
data.head()
data.describe()

# Data preprocessing: remove the MSSubClass column.
# `columns=` is used because passing the axis positionally (`drop(label, 1)`)
# is no longer accepted by current pandas releases.
data_delete_MSSubClass = data.drop(columns='MSSubClass')
# Missing data overview.
# Per-column flag of missing cells, reused for both the count and the ratio.
is_missing = data_delete_MSSubClass.isnull()

# Total number of missing values per column, largest first.
total = is_missing.sum().sort_values(ascending=False)

# Fraction of missing values per column (missing count / row count), largest first.
percent = is_missing.mean().sort_values(ascending=False)

# Table holding both the total and the fraction for every column.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Bar chart of the missing-value fraction per column.
f, ax = plt.subplots(figsize=(15, 12))
# The rotation must be numeric (or 'vertical'/'horizontal'); the string '90'
# is not a valid rotation value for matplotlib text.
plt.xticks(rotation=90)
sns.barplot(x=percent.index, y=percent)
plt.xlabel('variables', fontsize=20)
plt.ylabel('Percent of missing values', fontsize=20)
plt.title('Percent missing data', fontsize=30)
# Find the columns that contain missing values.
# Numeric columns: print the name of every column with at least one NaN.
nums = list(data_delete_MSSubClass.select_dtypes(include=[np.number]).columns)
for i in nums:
    if data_delete_MSSubClass[i].isnull().any():
        print(i)

# Non-numeric (categorical) columns: same check.
# This list used to be hard-coded as empty, so the loop below never ran and
# categorical columns with missing values were never reported.
cat = list(data_delete_MSSubClass.select_dtypes(exclude=[np.number]).columns)
for j in cat:
    if data_delete_MSSubClass[j].isnull().any():
        print(j)
# Imputing the missing values
def cat_imputation(column, value, frame=None):
    """Fill the missing entries of one column with ``value``, in place.

    Args:
        column: Label of the column to impute.
        value: Replacement written into every row where the column is null.
        frame: DataFrame to modify. When omitted, the module-level
            ``data_delete_MSSubClass`` frame is used, which is the behaviour
            existing two-argument callers rely on.

    Returns:
        None. The target frame is modified in place.
    """
    target = data_delete_MSSubClass if frame is None else frame
    # Boolean-mask assignment touches only the rows that are currently null.
    target.loc[target[column].isnull(), column] = value
# Impute every column listed in the missing-data table:
# object (string) columns get the label 'None', all others get the column mean.
cols = missing_data.index
for col in cols:  # distinct loop name so the iterable `cols` is not shadowed
    # The builtin `object` replaces the `np.object` alias, which has been
    # removed from NumPy.
    if data_delete_MSSubClass[col].dtype == object:
        cat_imputation(col, 'None')
    else:
        cat_imputation(col, data_delete_MSSubClass[col].mean())
# From here on data_delete_MSSubClass (MSSubClass removed) is used in place of data.
# Show the distinct dtypes present in data_delete_MSSubClass.
print(set(data_delete_MSSubClass.dtypes))

# Counter over the frame iterates its column labels; keep them as a list in `key`.
dataname = Counter(data_delete_MSSubClass)
key = list(dataname)

# Number of rows and columns.
row_count, col_count = np.shape(data_delete_MSSubClass)

# Check: remaining missing-value count for each column that still has nulls.
columns_with_nulls = data_delete_MSSubClass.columns[
    data_delete_MSSubClass.isnull().any()
]
check = data_delete_MSSubClass[columns_with_nulls].isnull().sum()
check
# drop all rows containing NaN -
#data = data.dropna()
#data.dropna(how='all') #drop observations, if ALL columns have missing value
#data.dropna() #drop all rows that have any NaN values

# Save the data with MSSubClass removed and missing values imputed.
# to_csv returns None when given a path, so `data_save` is always None.
# The row index is written as the first CSV column; later steps rely on that
# extra leading column, so it is kept.
data_save = data_delete_MSSubClass.to_csv("datasave.csv")

# Final preprocessed data.
# keep_default_na=False keeps the imputed 'None' labels as plain strings;
# newer pandas releases list "None" among the default NA markers and would
# otherwise turn them back into NaN on reload.
data = pd.read_csv('datasave.csv', keep_default_na=False)
print(data.head())
# Relations with SalePrice: scatter every int64 column against SalePrice.
# `data` was reloaded from datasave.csv, so column 0 is the saved row index
# and the feature columns start at position 1.
for i in range(col_count - 2):
    position = i + 1
    if data.dtypes.iloc[position] == np.int64:
        # Take the title from the plotted column itself. `key` holds the
        # labels of the frame *before* the reload (no leading index column),
        # so key[i + 1] named the column after the one being plotted.
        plt.title(data.columns[position])
        plt.plot(data.iloc[:, position], data['SalePrice'], 'r*')
        plt.show()
# Pearson correlation coefficients between the numeric variables in data.
# Only numeric columns are passed on: calling corr() on a frame that still
# contains string columns raises in current pandas releases.
corr = data.select_dtypes(include=[np.number]).corr(method='pearson')
print(corr)

# Heatmap of the correlation matrix.
plt.subplots(figsize=(18, 15))
sns.heatmap(corr, vmax=1, square=True)

# Plot (SqrtLotArea, LotFrontage).
data['SqrtLotArea'] = np.sqrt(data['LotArea'])
# Correlation between LotFrontage and SqrtLotArea (only displayed in an
# interactive session; the value is not stored).
data['LotFrontage'].corr(data['SqrtLotArea'])
sns.pairplot(data[['LotFrontage', 'SqrtLotArea']].dropna())
# Convert ordinal-scale variables to numeric values
# Rank correlation of the ordinal-scale variables (non-parametric statistics)
#ordinal_scale_vb1 = ['ExterQual','ExterCond','BsmtQual','BsmtCond',
# 'HeatingQC','KitchenQual','FireplaceQu','GarageQual',
# 'GarageCond','PoolQC']
#rank1 = {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
#
#yn_vb = 'CentralAir'
#rank_yn={
# 'N':0,
# 'Y':1
# }
#for j in range(row_count):
# data.loc[j,yn_vb]= list(rank_yn.keys()).index(data.loc[j,yn_vb])
#ordinal_scale_vb2 = 'LotShape'
#rank2={
# 'Reg':3,
# 'IR1':2,
# 'IR2':1,
# 'IR3':0
# }
#for j in range(row_count):
# data.loc[j,ordinal_scale_vb2]= list(rank2.keys()).index(data.loc[j,ordinal_scale_vb2])
#ordinal_scale_vb3 = 'LandSlope'
#rank3={
# 'Gtl':2,
# 'Mod':1,
# 'Sev':0,
# }
#for j in range(row_count):
# data.loc[j,ordinal_scale_vb3]= list(rank3.keys()).index(data.loc[j,ordinal_scale_vb3])
#ordinal_scale_vb4 = 'BsmtExposure'
#rank4 = {'None':0,
# 'No':1,
# 'Mn':2,
# 'Av':3,
# 'Gd':4,
# }
#for j in range(row_count):
# data.loc[j,ordinal_scale_vb4]= list(rank4.keys()).index(data.loc[j,ordinal_scale_vb4])
#data[ordinal_scale_vb4]
##############################################################################
#for i in ordinal_scale_vb1:
# index_vb = ordinal_scale_vb1.index(i)
# for j in range(row_count):
# data.loc[j,ordinal_scale_vb1[index_vb]]= list(rank1.keys()).index(data.loc[j,ordinal_scale_vb1[index_vb]])
##############################################################################
# Ordinal-scale columns.
ordinal_scale_vb2 = [
    'LotShape',
    'LandSlope',
    'BsmtExposure',
]

# Extract the nominal data.
plt.rcParams['axes.unicode_minus'] = False
norminal_vb = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtFinType1',
    'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'Functional',
    'GarageType', 'GarageFinish', 'PavedDrive', 'Fence', 'MiscFeature',
    'SaleType', 'SaleCondition',
]

# Nominal variables for frequency analysis / non-parametric statistics /
# cross-tabulation.
nominal_scale_vb = [
    'CentralAir',
    'PavedDrive',
]
# Frequency of every category in each nominal column, drawn as a bar chart.
# The font size does not change between iterations, so it is set once.
plt.rcParams.update({'font.size': 20})
for column_name in norminal_vb:
    # Missing values are folded into a single 'NA' category while counting.
    # pd.isna is used because NaN never compares equal to itself, so looking
    # it up with `in` / list.index only works when the very same NaN object
    # happens to be stored.
    cnt = Counter(
        'NA' if pd.isna(value) else value for value in data[column_name]
    )
    l_cnt = list(cnt.keys())
    plt.figure(figsize=(12, 8))
    # Pass a real list of heights rather than a dict view.
    plt.bar(l_cnt, list(cnt.values()))
    plt.title(column_name)
    plt.show()
# Plotting correlations (numerical features) with respect to SalePrice.
num_feat = data.columns[data.dtypes != object]
# Exclude by name instead of slicing [1:-1]: the slice assumed SalePrice was
# the last numeric column, but SqrtLotArea is appended after it, so SalePrice
# ended up correlated with itself and SqrtLotArea was dropped.
# 'Unnamed: 0' is the row index that was written to datasave.csv.
num_feat = num_feat.drop(['Unnamed: 0', 'SalePrice'], errors='ignore')

labels = list(num_feat)
# Pearson correlation of each feature with SalePrice.
values = [
    np.corrcoef(data[col].values, data.SalePrice.values)[0, 1]
    for col in num_feat
]

# Correlation coefficients w.r.t. Sale Price as a horizontal bar chart.
ind = np.arange(len(labels))
width = 0.9
fig, ax = plt.subplots(figsize=(6, 20))
rects = ax.barh(ind, np.array(values), height=width, color='red')
# Bars are centred on `ind`, so the ticks go there too; offsetting them by
# width / 2 put every label between two bars.
ax.set_yticks(ind)
ax.set_yticklabels(labels, rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation Coefficients w.r.t Sale Price")
##################################################################################
# Clean outliers
# Labels of every column whose dtype is not "object".
non_object_mask = data.dtypes != "object"
num = data.dtypes[non_object_mask].index
print(num)
# define plot function, and in this function, we will calculate the skew of X and take the log1p of y
def plot_outlier(x, y):
    """Print the skew of ``x`` and plot it next to a regression against log1p(``y``).

    Args:
        x: Feature values (a pandas Series; ``dropna`` is called on it).
        y: Target values; ``np.log1p`` is applied before plotting.

    Returns:
        None. Prints the sample length and skew, and draws a new figure with
        a box plot of ``x`` on the left and a regression plot of ``x`` against
        log1p(``y``) on the right.
    """
    tmp = x.dropna()
    # Skewness: the larger its absolute value, the more asymmetric the
    # distribution.
    skew_value = skew(tmp)
    y = np.log1p(y)
    print('sample lengh: %s and skew: %s' % (len(x), skew_value))
    fig, axs = plt.subplots(1, 2, figsize=(8, 3))
    # Data vectors are passed by keyword: current seaborn releases no longer
    # accept them positionally. `y=x` draws the same vertical box plot.
    sns.boxplot(y=x, ax=axs[0])
    sns.regplot(x=x, y=y, ax=axs[1])  # regression plot
    plt.tight_layout()  # keep the two subplots from overlapping
# Numerical features: draw the outlier plots for every non-object column.
sale_price = data.SalePrice
for feature in num:
    plot_outlier(data[feature], sale_price)
# --- Blog page footer (not part of the script) ---
# Other posts in the 'Kaggle > House Prices' category:
# Kaggle - House Prices prediction (0) | 2019.03.06
# Recent comments