반응형

안녕하세요, 츄르 사려고 코딩하는 집사, 코집사입니다.


1. 데이터 분석 시작 -> 상관관계분석

2. 'Series' object has no attribute 'reshape' 문제 해결


데이터 분석 프로젝트


1) 데이터 분석 20200918 12시 38분 기준

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Set the default font to Malgun Gothic so the Hangul labels below can render.
plt.rc('font', family='Malgun Gothic')

data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# Correlate numeric columns only. Selecting them explicitly keeps this working
# on pandas >= 2.0, where DataFrame.corr() no longer drops non-numeric columns
# silently and raises on them instead.
numeric_data = data.select_dtypes(include=['number', 'bool'])
data_corr = numeric_data.corr(method='pearson')

# Bare expression: shows the matrix when it is the last line of a notebook
# cell; it has no effect when run as a script.
data_corr

# Heatmap of the correlation coefficients.
# A coefficient above 0.7 is treated as a strong correlation.
plt.subplots(figsize=(18, 15))
sns.heatmap(data_corr, vmax=1, square=True)

# Row 12 of the matrix is used as the target variable; the chart title below
# names it 접근지수 (TODO confirm against the CSV column order). Keep its
# correlations with the first 12 columns.
target_corr = data_corr.iloc[12].iloc[:12]
y_corr = list(target_corr)
# Take the labels from the same Series as the values so a label can never be
# paired with another column's coefficient.
y_col = list(target_corr.index)

ind = np.arange(len(y_corr))
fig, ax = plt.subplots(figsize=(6, 20))
rects = ax.barh(ind, np.array(y_corr), color='red')
# barh centres each bar on its y position by default, so the ticks go exactly
# at `ind`; adding an offset would shift the labels away from the bars.
ax.set_yticks(ind)
ax.set_yticklabels(y_col, rotation='horizontal', size=15)
ax.set_xlabel("상관계수", size=20)
ax.set_title("접근지수와의 상관관계", size=20)

 

 

 

 

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Set the default font to Malgun Gothic so Hangul text can render.
plt.rc('font', family='Malgun Gothic')

data = pd.read_csv('C:/Users/User/Desktop/project/로컬위치최최종.csv', encoding="CP949")

# Pull the two coordinate columns out as plain Python lists.
data_lat = data['위도'].tolist()
data_lon = data['경도'].tolist()

# One plot call per row: '위도' is passed as x and '경도' as y, and each point
# is drawn as its own circle marker.
for lat, lon in zip(data_lat, data_lon):
    plt.plot(lat, lon, 'o')
    
    

 

 

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# Set the default font to Malgun Gothic so Hangul text can render.
plt.rc('font', family='Malgun Gothic')

data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")
# The columns at positions 1..13 are the ones inspected below.
data_col = data.columns[1:14]
data_col


def _draw_boxplots(frame, columns):
    """Draw one seaborn boxplot per column, each in its own 10x3 figure."""
    for name in columns:
        plt.figure(figsize=(10, 3))
        sns.boxplot(x=frame[name])


# Inspect outliers.
# Two columns with outliers were identified (인구, 면적).
_draw_boxplots(data, data_col)

# Remove outliers: keep only rows strictly below each upper bound.
for column, upper in (('면적', 10000), ('인구', 30000)):
    data = data[data[column] < upper]

_draw_boxplots(data, data_col)

# Remove outliers again, this time on five more columns.
second_pass_limits = (
    ('방문객수', 700000),
    ('방문빈도수', 150),
    ('주변대형마트', 10),
    ('방문효율', 30),
    ('접근지수', 75000),
)
for column, upper in second_pass_limits:
    data = data[data[column] < upper]

_draw_boxplots(data, data_col)

왜 자꾸 이상치가 생기지? (이상치를 제거하면 사분위수와 IQR이 다시 계산되어 상자수염의 기준 범위가 좁아지므로, 이전에는 정상 범위였던 값이 새로 이상치로 표시된다.)

 

 

2020년 9월 18일 오후 2시 38분 기준 코드(선형회귀 그래프)

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Set the default font to Malgun Gothic so the Hangul labels below can render.
plt.rc('font', family='Malgun Gothic')

data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# Correlate numeric columns only. Selecting them explicitly keeps this working
# on pandas >= 2.0, where DataFrame.corr() no longer drops non-numeric columns
# silently and raises on them instead.
numeric_data = data.select_dtypes(include=['number', 'bool'])
data_corr = numeric_data.corr(method='pearson')

# Bare expression: shows the matrix when it is the last line of a notebook
# cell; it has no effect when run as a script.
data_corr

# Heatmap of the correlation coefficients.
# A coefficient above 0.7 is treated as a strong correlation.
plt.subplots(figsize=(18, 15))
sns.heatmap(data_corr, vmax=1, square=True)

# Row 12 of the matrix is used as the target variable; the chart title below
# names it 접근지수 (TODO confirm against the CSV column order). Keep its
# correlations with the first 12 columns.
target_corr = data_corr.iloc[12].iloc[:12]
y_corr = list(target_corr)
# Take the labels from the same Series as the values so a label can never be
# paired with another column's coefficient.
y_col = list(target_corr.index)

ind = np.arange(len(y_corr))
fig, ax = plt.subplots(figsize=(6, 20))
rects = ax.barh(ind, np.array(y_corr), color='red')
# barh centres each bar on its y position by default, so the ticks go exactly
# at `ind`; adding an offset would shift the labels away from the bars.
ax.set_yticks(ind)
ax.set_yticklabels(y_col, rotation='horizontal', size=15)
ax.set_xlabel("상관계수", size=20)
ax.set_title("접근지수와의 상관관계", size=20)

from statsmodels.formula.api import ols
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Ordinary least squares fit of 접근지수 on 인구 through the statsmodels
# formula interface.
model = smf.ols(formula='접근지수 ~ 인구', data=data)
result = model.fit()
# Bare expression: shows the summary table when it is the last line of a
# notebook cell.
result.summary()

# One regression plot per predictor against 접근지수, with the fitted line
# drawn in red. The order matches the order in which the figures are created.
predictors = [
    "면적",
    "방문객수",
    "인구",
    "유동인구수",
    "방문빈도수",
    "도로혼잡도",
    "차선수",
    "블로그수",
    "생산인구수",
    "주변대형마트",
    "차량수",
    "방문효율",
]
for predictor in predictors:
    sns.lmplot(x=predictor, y="접근지수", data=data, line_kws={'color': "red"})

 

20200918 1528 Outlier 확인

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Set the default font to Malgun Gothic so Hangul text can render.
plt.rc('font', family='Malgun Gothic')

data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# The columns at positions 1..13 are the ones inspected for outliers.
data_col = data.columns[1:14]

# Show each column's boxplot as a separate figure.
for column in data_col:
    plt.boxplot(data[column])
    plt.show()

 

20200918 1601

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
# Set the default font to Malgun Gothic so Hangul text can render.
plt.rc('font', family='Malgun Gothic')

data = pd.read_csv('C:/Users/User/Desktop/project/찐train.csv', encoding="CP949")

# The columns at positions 1..13 are the ones inspected for outliers.
data_col = data.columns[1:14]


def _show_boxplots(frame, columns):
    """Show one matplotlib boxplot per column, labelled with the column name."""
    for name in columns:
        plt.boxplot(frame[name])
        plt.xlabel(name)
        plt.show()


# Baseline: boxplots of the unfiltered data.
_show_boxplots(data, data_col)

# First cut: keep rows strictly below the 면적 and 인구 upper bounds.
for column, upper in (('면적', 10000), ('인구', 300000)):
    data = data[data[column] < upper]

_show_boxplots(data, data_col)

# Second cut: tighten the 인구 bound.
data = data[data['인구'] < 100000]
# Disabled in the original: data = data[data['접근지수'] < 250000]

_show_boxplots(data, data_col)

# The same 인구 bound is applied once more; no further rows can be removed by
# it, so the boxplots that follow repeat the previous pass.
data = data[data['인구'] < 100000]

_show_boxplots(data, data_col)

# One regression plot per predictor against 접근지수, with the fitted line
# drawn in red. The order matches the order in which the figures are created.
predictors = [
    "면적",
    "방문객수",
    "인구",
    "유동인구수",
    "방문빈도수",
    "도로혼잡도",
    "차선수",
    "블로그수",
    "생산인구수",
    "주변대형마트",
    "차량수",
    "방문효율",
]
for predictor in predictors:
    sns.lmplot(x=predictor, y="접근지수", data=data, line_kws={'color': "red"})

 

 


'Series' object has no attribute 'reshape' 문제 해결


yongku.tistory.com/entry/Series-object-has-no-attribute-reshape

 

'Series' object has no attribute 'reshape'

'Series' object has no attribute 'reshape' 회귀분석을 할 때, 아래에 관한 문제가 발생하곤 한다. 1차원 배열을 2차원 배열로 만들어 주면 된다. X 데이터를 아래의 명령어를 사용하면 된다. X.values.re..

yongku.tistory.com

 

반응형

'자기개발 > TIL' 카테고리의 다른 글

TIL(Today I Learned) 20210114  (0) 2021.01.14
TIL(Today I Learned) 20210113  (0) 2021.01.13
TIL(Today I Learned) 20200922 ~ 20200923  (0) 2020.09.23
TIL(Today I Learned) 20200919 ~ 20200921  (0) 2020.09.21
TIL(Today I Learned) 20200917  (0) 2020.09.18
TIL(Today I Learned) 20200916  (0) 2020.09.16
TIL(Today I Learned) 20200915  (0) 2020.09.15
TIL(Today I Learned) 20200914  (0) 2020.09.14
  • 네이버 블로그 공유하기
  • 네이버 밴드에 공유하기
  • 페이스북 공유하기
  • 카카오스토리 공유하기