반응형
1. 데이터 분석 시작 -> 상관관계분석
2. 'Series' object has no attribute 'reshape' 문제 해결
데이터 분석 프로젝트
1) 데이터 분석 20200918 12시 38분 기준
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Use a Hangul-capable font so the Korean labels and titles render correctly.
plt.rc('font', family='Malgun Gothic')
data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# Correlate only the numeric (and boolean) columns. Older pandas dropped
# non-numeric columns silently inside DataFrame.corr, but pandas >= 2.0 raises
# on them instead, so the selection is made explicit here.
# NOTE(review): the 1:13 label slice below suggests column 0 is non-numeric
# (e.g. a name/id column) — confirm against the CSV.
data_corr = data.select_dtypes(include=['number', 'bool']).corr(method='pearson')
data_corr  # shown only when this is the last expression of a notebook cell

# Heatmap of the correlation coefficients.
# A coefficient above 0.7 indicates a strong correlation.
plt.subplots(figsize=(18,15))
sns.heatmap(data_corr, vmax=1,square=True)

# Take row 12 of the correlation matrix and keep its first 12 entries.
# NOTE(review): presumably row 12 is the 접근지수 row (per the chart title) and
# the first 12 entries are its correlations with the other features — confirm.
y_corr = list(data_corr.iloc[12])
y_corr = y_corr[:12]

# Labels for the bars: data columns 1..12 (column 0 is skipped).
y_co = list(data.columns)
y_col = y_co[1:13]

# Horizontal bar chart, one bar per selected column.
ind = np.arange(len(y_corr))
width = 0.2
fig, ax = plt.subplots(figsize=(6,20))
rects = ax.barh(ind, np.array(y_corr), color='red')
ax.set_yticks(ind+((width)/2.))
ax.set_yticklabels(y_col, rotation='horizontal', size = 15)
ax.set_xlabel("상관계수", size = 20)
ax.set_title("접근지수와의 상관관계", size = 20)
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Use a Hangul-capable font so Korean text renders correctly.
plt.rc('font', family='Malgun Gothic')
data = pd.read_csv('C:/Users/User/Desktop/project/로컬위치최최종.csv', encoding="CP949")
data_lat = list(data['위도'])
data_lon = list(data['경도'])

# Draw every location as its own marker. One plot call per point is kept on
# purpose: each call advances matplotlib's colour cycle, exactly as before.
# NOTE(review): 위도 is passed as x and 경도 as y; swap the arguments if a
# map-like orientation is wanted.
for lat, lon in zip(data_lat, data_lon):
    plt.plot(lat, lon, 'o')
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# Use a Hangul-capable font so Korean column names render correctly.
plt.rc('font', family='Malgun Gothic')
data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# Columns 1..13 are the ones inspected; column 0 is skipped.
data_col = data.columns
data_col = data_col[1:14]
data_col  # shown only when this is the last expression of a notebook cell

# Check for outliers: one box plot per column, each in its own figure.
# Two columns with outliers found (인구, 면적).
for col in data_col:
    plt.figure(figsize=(10,3))
    sns.boxplot(x=data[col])

# Remove outliers (first pass).
data = data[data.면적 < 10000]
data = data[data.인구 < 30000]

# Re-draw the box plots on the filtered data.
for col in data_col:
    plt.figure(figsize=(10,3))
    sns.boxplot(x=data[col])

# Remove outliers (second pass).
data = data[data.방문객수 < 700000]
data = data[data.방문빈도수 < 150]
data = data[data.주변대형마트 < 10]
data = data[data.방문효율<30]
data = data[data.접근지수 < 75000]

# Re-draw the box plots once more after the second pass.
for col in data_col:
    plt.figure(figsize=(10,3))
    sns.boxplot(x=data[col])
왜 자꾸 이상치가 생기지? → 이상치를 제거하면 사분위수와 IQR이 다시 계산되어 박스플롯의 수염 범위(기본값 1.5×IQR)가 달라지므로, 이전에는 범위 안에 있던 값이 새로 이상치로 표시될 수 있다.
2020년 9월 18일 오후 2시 38분 기준 코드(선형회귀 그래프)
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Use a Hangul-capable font so the Korean labels and titles render correctly.
plt.rc('font', family='Malgun Gothic')
data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# Correlate only the numeric (and boolean) columns. Older pandas dropped
# non-numeric columns silently inside DataFrame.corr, but pandas >= 2.0 raises
# on them instead, so the selection is made explicit here.
# NOTE(review): the 1:13 label slice below suggests column 0 is non-numeric
# (e.g. a name/id column) — confirm against the CSV.
data_corr = data.select_dtypes(include=['number', 'bool']).corr(method='pearson')
data_corr  # shown only when this is the last expression of a notebook cell

# Heatmap of the correlation coefficients.
# A coefficient above 0.7 indicates a strong correlation.
plt.subplots(figsize=(18,15))
sns.heatmap(data_corr, vmax=1,square=True)

# Take row 12 of the correlation matrix and keep its first 12 entries.
# NOTE(review): presumably row 12 is the 접근지수 row (per the chart title) and
# the first 12 entries are its correlations with the other features — confirm.
y_corr = list(data_corr.iloc[12])
y_corr = y_corr[:12]

# Labels for the bars: data columns 1..12 (column 0 is skipped).
y_co = list(data.columns)
y_col = y_co[1:13]

# Horizontal bar chart, one bar per selected column.
ind = np.arange(len(y_corr))
width = 0.2
fig, ax = plt.subplots(figsize=(6,20))
rects = ax.barh(ind, np.array(y_corr), color='red')
ax.set_yticks(ind+((width)/2.))
ax.set_yticklabels(y_col, rotation='horizontal', size = 15)
ax.set_xlabel("상관계수", size = 20)
ax.set_title("접근지수와의 상관관계", size = 20)
from statsmodels.formula.api import ols
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Fit an ordinary-least-squares model of 접근지수 on 인구 with the formula API.
model = smf.ols(formula='접근지수 ~ 인구', data=data)
result = model.fit()
result.summary()  # shown only when this is the last expression of a notebook cell

# One scatter plot with a red regression line per feature, each against
# 접근지수, drawn in the order listed here.
for feature in (
    "면적",
    "방문객수",
    "인구",
    "유동인구수",
    "방문빈도수",
    "도로혼잡도",
    "차선수",
    "블로그수",
    "생산인구수",
    "주변대형마트",
    "차량수",
    "방문효율",
):
    sns.lmplot(x=feature, y="접근지수", data=data, line_kws={'color': "red"})
20200918 1528 Outlier 확인
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Use a Hangul-capable font so Korean text renders correctly.
plt.rc('font', family='Malgun Gothic')
data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# Columns 1..13 are the ones inspected; column 0 is skipped.
data_col = data.columns
data_col = data_col[1:14]

# One box plot per column. plt.show() is called inside the loop so each column
# gets its own figure instead of all boxes being drawn over one another.
# NOTE(review): the original indentation was lost; show() is assumed to belong
# to the loop body — confirm.
for col in data_col:
    plt.boxplot(data[col])
    plt.show()
20200918 1601
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Use a Hangul-capable font so the Korean x-axis labels render correctly.
plt.rc('font', family='Malgun Gothic')
data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# Columns 1..13 are the ones inspected; column 0 is skipped.
data_col = data.columns
data_col = data_col[1:14]

# Baseline: one labelled box plot per column, each shown as its own figure.
for col in data_col:
    plt.boxplot(data[col])
    plt.xlabel(col)
    plt.show()

# First pass: drop rows with extreme 면적 / 인구 values.
data = data[data['면적']<10000]
data = data[data['인구']<300000]
for col in data_col:
    plt.boxplot(data[col])
    plt.xlabel(col)
    plt.show()

# Second pass: tighten the 인구 threshold.
data = data[data['인구']<100000]
#data = data[data['접근지수']<250000]
for col in data_col:
    plt.boxplot(data[col])
    plt.xlabel(col)
    plt.show()

# NOTE(review): this repeats the previous 인구 filter, so it removes no further
# rows and the plots below match the previous set — a different threshold or
# column may have been intended.
data = data[data['인구']<100000]
for col in data_col:
    plt.boxplot(data[col])
    plt.xlabel(col)
    plt.show()
# One scatter plot with a red regression line per feature, each against
# 접근지수, drawn in the order listed here.
for feature in (
    "면적",
    "방문객수",
    "인구",
    "유동인구수",
    "방문빈도수",
    "도로혼잡도",
    "차선수",
    "블로그수",
    "생산인구수",
    "주변대형마트",
    "차량수",
    "방문효율",
):
    sns.lmplot(x=feature, y="접근지수", data=data, line_kws={'color': "red"})
'Series' object has no attribute 'reshape' 문제 해결
yongku.tistory.com/entry/Series-object-has-no-attribute-reshape
반응형
'자기개발 > TIL' 카테고리의 다른 글
TIL(Today I Learned) 20210114 (0) | 2021.01.14 |
---|---|
TIL(Today I Learned) 20210113 (0) | 2021.01.13 |
TIL(Today I Learned) 20200922 ~ 20200923 (0) | 2020.09.23 |
TIL(Today I Learned) 20200919 ~ 20200921 (0) | 2020.09.21 |
TIL(Today I Learned) 20200917 (0) | 2020.09.18 |
TIL(Today I Learned) 20200916 (0) | 2020.09.16 |
TIL(Today I Learned) 20200915 (0) | 2020.09.15 |
TIL(Today I Learned) 20200914 (0) | 2020.09.14 |
최근댓글