TIL(Today I Learned) 20200905 - 20200906 - 츄르 사려고 코딩하는 집사

1. R 기본 문법 실습

2. R 통계 실습

R 기본 문법 실습

# Variable naming rules
# 1. Names are case-sensitive.
# 2. Underscores and periods are allowed, but a name cannot start with an
#    underscore.
# 3. Special characters and reserved words cannot be used as names.

a <- 1
A <- 1 # a separate variable from `a`, because names are case-sensitive
# `_a <- 1` is intentionally kept as a comment only: a leading underscore is a
# syntax error, so leaving it as code would stop the whole script from parsing.
a_ <- 1
a. <- 1

# By default a factor keeps its levels in ascending (sorted) order.
# For example, blood types get the level order A, AB, B, O.

# Functions that inspect the type of a value
x <- 1

class(x)
typeof(x)
is.integer(x)
is.numeric(x)
is.complex(x)
is.character(x)
is.na(x)

# Functions that convert a value to another type
as.factor(x)
as.integer(x)
as.numeric(x)
as.character(x)
as.matrix(x)
as.array(x)

# Arithmetic operators
a <- 5
b <- 2

# Exponentiation
a^b

# Remainder of the division
a %% b

# Integer quotient of the division
a %/% b

# Creating vectors
1:7

c(1:5)

# Odd numbers from 1 up to 10 (step of 2)
seq(1, 10, by = 2)

# 11 evenly spaced values from 0 to 1
seq(0, 1, length.out = 11)

# Repetition
rep(1:3, 2)

# times: repeat the whole vector three times
rep(1:10, times = 3)

# each: repeat every element three times before moving to the next
rep(1:10, each = 3)

# Element-wise arithmetic between vectors.
# x has length 4 while y and z have length 3; the lengths are not multiples of
# each other, so the shorter vector is recycled and its first element is added
# to the last element of x (R also emits a warning).
x <- c(1, 2, 3, 4)
y <- c(3, 6, 9)
z <- rep(2, 3)
x + y
x + z


x <- 1:10
x > 5 # TRUE where the element is greater than 5, FALSE otherwise

# Union
# (z is defined here but not used by the calls in this section.)
x <- c(1, 2, 3)
y <- c(3, 4, 5)
z <- c(3, 1, 2)

union(x, y) # union() combines exactly two vectors; a third cannot be passed

# Intersection
intersect(x, y)

# Set difference
setdiff(x, y)

# Check whether the two vectors contain the same set of elements
setequal(x, y)

# Creating arrays: array() builds an N-dimensional array.
# matrix() builds a 2-dimensional array by arranging a vector.

# 1:5 is shorter than the 2 x 4 shape, so its values are recycled to fill it.
x <- array(1:5, dim = c(2, 4))
x
# Row names and column names for the 2 x 4 array
dimnamex <- list(
  c("1st", "2nd"),
  c("1st", "2nd", "3rd", "4th")
)
x <- array(1:5, dim = c(2, 4), dimnames = dimnamex)
x

# Using matrix()
# byrow = TRUE fills row by row; byrow = FALSE fills column by column.
x <- 1:10
m <- matrix(x, nrow = 2, byrow = FALSE)
m
n <- matrix(x, ncol = 2, byrow = FALSE)
n

# cbind() and rbind()
# cbind() places each vector as a column, so its values run downward.
# rbind() places each vector as a row, so its values run to the right.
v1 <- 1:3
v2 <- 4:6
v3 <- 7:9
cbind(v1, v2, v3)
rbind(v1, v2, v3)


# Arrays built with an explicit dim argument
x <- array(1:4, dim = c(2, 2))
x
y <- array(5:8, dim = c(2, 2))
y

x <- array(1:12, dim = c(3, 4))
x
# In apply(), MARGIN 1 means rows and MARGIN 2 means columns
apply(x, MARGIN = 1, FUN = mean)
apply(x, MARGIN = 2, FUN = mean)

# Random draw: returns the elements of x in a random order
sample(x)

# Data frames
head(cars)
# attach() exposes the columns as variables, so speed and dist from cars can
# be referenced directly.
attach(cars)
# detach() removes them from the search path again.
detach(cars)

# Mean
mean(cars$speed)

# Maximum
max(cars$speed)

# Minimum
min(cars$speed)

# Median
median(cars$speed)

# subset(): extract part of the data.
# From cars, keep the rows with speed above 20 and show only the dist column.
subset(cars, speed > 20, select = dist)


# Data frame example
member <- data.frame(
  no = c("1", "2", "3", "4", "5"),
  name = c("KIM", "LEE", "KANG", "PARK", "CHOI"),
  gender = c("M", "M", "F", "M", "F"),
  height = c(170, 179, 163, 174, 157)
)

# Rows whose gender is male (all columns, then only the `no` column)
subset(member, gender == "M")
subset(member, gender == "M", select = no)

# Names of the women whose height is 160 or less
subset(member, gender == "F" & height <= 160, select = name)

# Only the gender and height columns
subset(member, select = c(gender, height))

# Read a CSV without converting character columns to factors.
# NOTE(review): the file path is an empty string here — presumably it was
# removed before publishing; supply the real CSV path before running.
students <- read.csv(file = "", stringsAsFactors = FALSE)


test <- c(15, 20, 30, NA, 45)
# Elements below 40; the NA element comes back as NA because NA < 40 is NA.
test[test < 40]
# Only the missing elements
test[is.na(test)]

# Print the 2-times table (2 X 1 through 2 X 9) with a while loop

i <- 1
while (i <= 9) {
  print(sprintf("2 X %d = %d", i, 2 * i))
  i <- i + 1
}

# Print the full multiplication table (1 X 1 through 9 X 9) with nested loops
i <- 1
j <- 1
for (i in seq_len(9)) {
  for (j in seq_len(9)) {
    print(sprintf("%d X %d = %d", i, j, i * j))
  }
}


# Factorial by repeated multiplication.
#
# x: the number to start from. The running product is multiplied by x, x - 1,
#    ... for as long as the current value is greater than 1.
# Returns the product; for x of 1 or less nothing is multiplied and 1 is
# returned.
fact <- function(x) {
  product <- 1
  repeat {
    if (!(x > 1)) {
      break
    }
    product <- product * x
    x <- x - 1
  }
  product
}

fact(5)

# Sum the numbers from 1 to 100 that are multiples of 3 but not multiples of 4.
# The result is stored in `total` rather than `sum`, so the base sum() function
# is not masked by a global variable, and the filter is written as a vectorised
# logical mask instead of a scalar loop.

candidates <- 1:100
total <- sum(candidates[candidates %% 3 == 0 & candidates %% 4 != 0])
print(total)


# Print a sentence reporting the sum of the multiples of `x` among the
# integers from 1 to `y`, and return that sentence (the value of print()).
#
# x: non-zero number whose multiples are summed.
# y: upper bound of the range; a value below 1 gives an empty range (sum 0).
ga <- function(x, y) {
  stopifnot(
    is.numeric(x), length(x) == 1L, !is.na(x), x != 0,
    is.numeric(y), length(y) == 1L, !is.na(y)
  )
  # 1:y would count downwards (1, 0, -1, ...) when y < 1, so the range is
  # built explicitly and left empty in that case.
  candidates <- if (y >= 1) seq_len(floor(y)) else integer(0)
  # as.numeric() keeps the total a double, as the accumulating loop did, and
  # avoids integer overflow in sum().
  total <- sum(as.numeric(candidates[candidates %% x == 0]))
  print(paste("1부터", y, "까지의 수 중에서", x, "의 배수의 합은", total, "입니다."))
}

ga(3, 100)

# Handling missing values (NA)
# Note: na.rm = T makes a function drop the NAs before it computes.
# (na.rm is not used below; the rows with NAs are removed with na.omit().)
str(airquality)

# Count the non-missing (FALSE) and missing (TRUE) cells of the data frame
table(is.na(airquality))


# Drop every row that contains at least one NA, then show the result
air <- na.omit(airquality)
air

# Count again on the cleaned data
table(is.na(air))


# NA handling, part 2: treat out-of-range Ozone readings as missing.
air <- airquality
table(is.na(air))


# Replace Ozone values below 1 or above 122 with NA. The result is written
# back to the Ozone column so that `air` stays a data frame; assigning it to
# `air` itself would replace the whole data frame with a bare vector and make
# the two-dimensional row filter below fail.
air$Ozone <- ifelse(air$Ozone < 1 | air$Ozone > 122, NA, air$Ozone)

# Keep only the rows whose Ozone value is not missing.
air2 <- air[!is.na(air$Ozone), ]

R 통계 실습

library(gapminder)
library(dplyr)
glimpse(gapminder)
View(gapminder)

# Life expectancy and population for rows whose country is Croatia and whose
# year is after 1990
gapminder[
  gapminder$country == "Croatia" & gapminder$year > 1990,
  c("lifeExp", "pop")
]

# The same kind of query with dplyr verbs: filter(), select(), summarise()

gapminder %>%
  filter(country == "Croatia") %>%
  select(country, year, lifeExp) %>%
  summarise(lifeExp_avg = mean(lifeExp))

# Variance
score <- c(85, 90, 93, 86, 82)

var(score)

# Standard deviation
sd(score)

# Normal distribution
# rnorm(count, mean, standard deviation): draws normally distributed random
#   numbers
# hist(): shows how often values occur, as bars
# breaks: number of bars
# probability: plot relative frequency (density) instead of counts
height <- rnorm(n = 1000000, mean = 168, sd = 7)

hist(height, breaks = 100, probability = TRUE)


# p-value > significance level: keep the null hypothesis
# p-value < significance level: adopt the alternative hypothesis

# t-test; normality is checked first with shapiro.test()
score1 <- read.csv("tdata.csv")

score1


# Author's note: the p-value here was above 0.05, so the data were taken to be
# normally distributed and a t-test is applicable (depends on tdata.csv, which
# is not shown here).
shapiro.test(score1$성적)

# `alternative` states whether the mean is greater than, less than, or
# different from the population mean:
#   greater: "greater", less: "less", different: "two.sided"
# `mu` is the assumed value of the population mean

# Author's note: the p-value was above the 0.05 significance level, so the null
# hypothesis was kept and the alternative rejected (depends on tdata.csv).
result <- t.test(score1$성적, alternative = "greater", mu = 75)
result

# t-test example
x <- c(15, 10, 13, 7, 9, 8, 21, 9, 14, 8)
y <- c(15, 14, 12, 8, 14, 7, 16, 10, 15, 12)

# Author's note: both p-values were above the 0.05 significance level, so x and
# y were taken to be normally distributed.
shapiro.test(x)
shapiro.test((y))

# Author's note: the t-test p-value was above 0.05, so the null hypothesis was
# kept.
t.test(x, y)

#-----------------------------------------------------------------
# t-test example 2 (paired samples)

x <- c(52, 60, 63, 43, 46, 56, 62, 50)
y <- c(58, 62, 62, 48, 50, 55, 68, 57)

# Author's note: both p-values were above the 0.05 significance level, so x and
# y were taken to be normally distributed.
# Null hypothesis: there is no difference between before and after training.
# Alternative hypothesis: there is a difference between before and after
# training.
shapiro.test(x)
shapiro.test(y)

# Author's note: the p-value was below the 0.05 significance level, so the
# alternative hypothesis was adopted: before and after training differ.
t.test(x, y, paired = TRUE)

#---------------------------------------------------------------------
# Analysis of variance with oneway.test()
# oneway.test() is used here to compare three groups.

x <- c(1.09, 2.12, 2.92, 4.06, 4.90)
y <- c(1, 2, 3, 4, 5)
z <- c(1.10, 1.96, 3.98, 4.09, 4.92)

# Author's note: the p-values of x, y and z were above the 0.05 significance
# level, so each was taken to be normally distributed.
shapiro.test(x)
shapiro.test(y)
shapiro.test(z)


# All observations in one long vector ...
mydata <- c(x, y, z)

# ... and a parallel vector giving the group (1, 2 or 3) of each observation.
group <- rep(c(1, 2, 3), each = 5)
group

# The argument is spelled out as var.equal (not the partial name `var`) and
# uses TRUE rather than the reassignable shorthand T.
oneway.test(mydata ~ group, var.equal = TRUE)

# oneway.test() takes the data as a single vector; `group` marks which
# observations belong together by giving them the same number.

#----------------------------------------------------------------------
# Frequency analysis

fruits1 <- read.csv("love_fruits.csv", header = TRUE)


fruits1


# prop.table() turns the counts from table() into proportions

prop.table(table(fruits1$선호과일))

# round() rounds the percentages to 2 decimal places

round(prop.table(table(fruits1$선호과일)) * 100, 2)


# Frequency data prepared for a bar plot
# NOTE(review): no barplot() call appears in this section, and `cc` is
# computed but not used afterwards here — presumably the plotting step is
# missing; confirm against the original notes.
table(fruits1$선호과일)
count <- c(table(fruits1$선호과일))
count
cc <- c(round(prop.table(table(fruits1$선호과일)) * 100, 2))
# fruits1 is overwritten with a one-column data frame holding the counts.
fruits1 <- data.frame(건수 = count)

#-------------------------------------------------------------------------
# Correlation analysis: cor.test(x, y)
# Regression analysis: predicting one outcome from another
#   dependent variable: y
#   independent variable: x

#--------------------------------------------------------------------------
# Regression example
x <- c(110, 120, 130, 140, 150)
y <- c(100, 105, 128, 115, 142)

# Fit a simple linear model of y on x
lm(y ~ x)

# Scatter plot of the two variables
plot(x, y)

저작자표시

'자기개발 > TIL' 카테고리의 다른 글

TIL(Today I Learned) 20200910 (0)	2020.09.10
TIL(Today I Learned) 20200909 (0)	2020.09.09
TIL(Today I Learned) 20200908 (0)	2020.09.08
TIL(Today I Learned) 20200907 (0)	2020.09.07
TIL(Today I Learned) 20200904 (0)	2020.09.04
TIL(Today I Learned) 20200903 (0)	2020.09.03
TIL(Today I Learned) 20200902 (0)	2020.09.02
TIL(Today I Learned) 20200901 (0)	2020.09.01