더보기
▣ 주요 키워드 ▣
- dplyr
- summarise(n()), count,add_count, inner_join, left_join, right_join, full_join, semi_join, anti_join
- sqldf
⊙부서별 급여를 기준으로 내림차순 순위를 구하세요.(연이은순위)
1.ddply
df <- plyr::ddply(employees[,c('EMPLOYEE_ID','LAST_NAME','SALARY','DEPARTMENT_ID')],'DEPARTMENT_ID',transform,
순위=dplyr::dense_rank(desc(SALARY)),
순위2=dplyr::min_rank(desc(SALARY))) #순위라는 새 컬럼에 dense_rank(연이은순위)함수의 결과를 저장, min_rank는 비교해보기
library(doBy)
doBy::orderBy(~DEPARTMENT_ID+순위,df) # department_id,순위를 기준으로 정렬
2.dplyr
options(tibble.print_max=Inf)
employees%>%
dplyr::select(EMPLOYEE_ID,LAST_NAME,SALARY,DEPARTMENT_ID)%>% # EMPLOYEE_ID,LAST_NAME,SALARY,DEPARTMENT_ID만 추출
dplyr::group_by(DEPARTMENT_ID)%>% #department_id로 그룹핑
dplyr::mutate(순위=dplyr::dense_rank(desc(SALARY)), #dense_rank(연이은 순위)값이 들어갈순위 컬럼을 만들기
순위2=dplyr::min_rank(desc(SALARY)))%>%
dplyr::arrange(DEPARTMENT_ID,순위) # department_id,순위컬럼을 기준으로 정렬
#sql
select employee_id,last_name, salary, department_id,
rank() over(partition by department_id order by salary desc) 순위2,
dense_rank() over(partition by department_id order by salary desc)순위
from employees
order by 4,6;
⊙JOB_ID별 급여를 많이 받는 사원 1등만 추출해 주세요.
1.dplyr
employees%>%
dplyr::group_by(JOB_ID)%>%
dplyr::filter(SALARY == max(SALARY))
#job_id별로 salary값이 가장높은 값과 같은 값을 출력
2.ddply
ddply(employees,'JOB_ID',subset,SALARY==max(SALARY))
▶dplyr 개수 함수
employees%>%
dplyr::summarise(n()) #데이터프레임에 있는 행의 개수
employees%>%
dplyr::filter(DEPARTMENT_ID==50)%>%
dplyr::summarise(n()) #50번부서인 사원의 수
employees%>%
dplyr::count() # 데이터프레임의 행의 수
employees%>%
dplyr::filter(DEPARTMENT_ID==50)%>%
dplyr::count()
employees%>%
dplyr::group_by(DEPARTMENT_ID)%>%
dplyr::summarise(n())# 부서아이디별로 개수
employees%>%
dplyr::group_by(DEPARTMENT_ID)%>%
dplyr::count() # 부서아이디별로 개수
employees%>%
dplyr::count(DEPARTMENT_ID)# 부서아이디별로 개수
employees%>%
dplyr::count(DEPARTMENT_ID,JOB_ID) #DEPARTMENT_ID, JOB_ID별로 개수
employees%>%
dplyr::count(DEPARTMENT_ID,JOB_ID, sort = T) #DEPARTMENT_ID, JOB_ID별로 개수, 정렬까지(내림차순)
employees%>%
dplyr::add_count(DEPARTMENT_ID) #행단위로 DEPARTMENT_ID별로 개수 출력
employees%>%
dplyr::group_by(DEPARTMENT_ID,JOB_ID)%>%
dplyr::summarise(n())
employees%>%
dplyr::group_by(DEPARTMENT_ID)%>%
dplyr::summarise(부서인원수 = n(),
부서내JOB_ID수 = n_distinct(JOB_ID)) # 한 부서에 몇명의 인원과 job_id가 몇개인지
● 조인
emp <- employees[,c('EMPLOYEE_ID','LAST_NAME','DEPARTMENT_ID')]
dept <- departments[,c('DEPARTMENT_ID','DEPARTMENT_NAME')]
# 등가조인, equi join, inner join, simple join
merge(emp,dept,by='DEPARTMENT_ID') #1. emp,dept데이터프레임을 DEPARTMENT_ID로 조인하기
dplyr::inner_join(emp,dept,by='DEPARTMENT_ID') #2. emp,dept데이터프레임을 DEPARTMENT_ID로 조인하기
#left outer join
merge(emp,dept,by='DEPARTMENT_ID',all.x = T) #left outer join, 키값에 일치하지 않는 x축에 있는 데이터 다 출력
dplyr::left_join(emp,dept,by='DEPARTMENT_ID') #left outer join, 키값에 일치하지 않는 x축에 있는 데이터 다 출력
#right outer join
merge(emp,dept,by='DEPARTMENT_ID',all.y = T) #right outer join, 키값에 일치하지 않는 y축에 있는 데이터 다 출력
dplyr::right_join(emp,dept,by='DEPARTMENT_ID') #right outer join, 키값에 일치하지 않는 y축에 있는 데이터 다 출력
#full outer join
merge(emp,dept,by='DEPARTMENT_ID',all = T) #full outer join, 키값에 일치하지 않는 x,y축에 있는 데이터 다 출력
dplyr::full_join(emp,dept,by='DEPARTMENT_ID') #full outer join, 키값에 일치하지 않는 x,y축에 있는 데이터 다 출력
⊙소속 사원이 있는 부서 정보 출력
● sql풀이
(1) in
select *
from departments
where department_id in (select department_id
from employees)
#문제점 where department_id in (null,10,20,20,20,30,30......)
#where department_id =null
# or department_id = 10
# ... 107개를 다 비교함, distinct를 사용하면 내부적으로 sort(oracle server에서)를 하기 때문에 그것도 문제임
select *
from departments
where department_id in (select distinct department_id # 현장에서 사용하실 때 주의 !! 내부적으로 정렬작업이 발생, 데이터가 적으면 괜찮 벗 많으면 하루종일 걸림
from employees)
(2)inner join으로 해결하겠다 하는 순간 현장에서 문제가 발생한다.
1쪽 집합으로 만들기 위해서 중복을 제거하는 작업을 수행하는 순간 정렬을 수행해야 하는 문제점 발생
select d.*
from (select distinct department_id
from employees) e, departments d
where e.department_id = d.department_id
(3) exists를 사용하면 semi기법이 들어간다.
select *
from departments d
where exists (select 'x'
from employees
where department_id = d.department_id)
#semi join 기법 - department_id가 d.department_id와 맞으면 더이상 찾지 않고 다음 것으로 간다(oracle에서는 위에 in으로 사용해도 exists로 변경해서 수행함)
● R 풀이
(1)in(sql에서 in연산자를 r로 표현)
departments[departments$DEPARTMENT_ID %in% employees$DEPARTMENT_ID,]
departments[departments$DEPARTMENT_ID %in% unique(employees$DEPARTMENT_ID),]
(2) inner join
dplyr::inner_join(departments,employees,by='DEPARTMENT_ID')
employees$DEPARTMENT_ID를 1쪽 집합처럼 나들어서 수행하려면
unique(employees$DEPARTMENT_ID)를 수행해서 하면 된다. 하지만 문제점은 유일한 값을 만들기 위해서 내부적으로 정렬 작업을 수행해야한다는 문제점.
dplyr::inner_join(departments,
data.frame(DEPARTMENT_ID = unique(employees$DEPARTMENT_ID)),by='DEPARTMENT_ID')
이런 문제점을 개선해서 성능이 좋아지도록 하는 방법이 semi join 기법이다.
(3)semi join (sql에서 exists연산자)
departments%>%
dplyr::semi_join(employees,by='DEPARTMENT_ID')
⊙소속사원이 없는 부서 정보를 출력
# sql
(1) anti join
select *
from departments d
where not exists (select 'x'
from employees
where department_id = d.department_id) #내부적으로 anti join 이 수행된다.
#R
(1) anti join
departments%>%
dplyr::anti_join(employees,by='DEPARTMENT_ID')
⊙관리자 사원들의 정보를 출력하세요.
●sql
(1) in
select *
from employees e
where employee_id in (select manager_id
from employees
where manager_id = e.employee_id);
(2) exists
select *
from employees e
where exists(select *
from employees
where manager_id = e.employee_id);
#R
(1) in
employees[employees$EMPLOYEE_ID %in% employees$MANAGER_ID,]
(2) semi_join
employees%>%
dplyr::semi_join(employees,by=c('EMPLOYEE_ID'='MANAGER_ID'))
⊙관리자가 아닌 사원들의 정보를 출력하세요.
●sql
(1) in
select *
from employees e
where employee_id not in (select manager_id
from employees
where manager_id is not null); #not in은 null값이 있으면 수행이 안됨
(2) exists
select *
from employees e
where not exists(select *
from employees
where manager_id = e.employee_id);
#R
(1) in
employees[!employees$EMPLOYEE_ID %in% employees$MANAGER_ID,]
(2) semi_join
employees%>%
dplyr::anti_join(employees,by=c('EMPLOYEE_ID'='MANAGER_ID'))
★ sqldf
- sql을 이용해서 데이터를 처리
# 패키지 설치
install.packages("sqldf")
library(sqldf)
sqldf("select * from employees")
sqldf("select employee_id, last_name from employees")
sqldf("SELECT EMPLOYEE_ID, LAST_NAME, FROM employees") # 테이블명만큼은 데이터프레임 이름과 동일하게 대소문자 구분한다.
sqldf("select distinct department_id from employees")
unique(employees$DEPARTMENT_ID)
# SQL문을 R에서 약간의 제한을 가지고 마음대로 사용할 수 있다.
● 관리자가 아닌 사원 추출
sqldf("
select *
from employees e
where employee_id not in (select manager_id
from employees
where manager_id is not null)") # not in, null이 있으면 수행불가
sqldf("select *
from employees e
where not exists(select *
from employees
where manager_id = e.employee_id)") # not exists
sqldf("select employee_id, last_name, salary*12
from employees
order by salary * 12") # salary * 12 연산된 컬럼을 기준으로 정렬
sqldf("select employee_id, last_name, salary*12 ann_sal
from employees
order by salary * 12") # 별칭 사용 가능
sqldf("select employee_id, last_name, salary*12 ann_sal
from employees
order by ann_sal desc") # 별칭으로 내림차순
sqldf("select employee_id, last_name, salary*12 ann_sal
from employees
order by 3 desc") # 컬럼 순서 번호로 내림차순 정렬
● 대소문자 만들기
sqldf("select last_name,upper(last_name), lower(last_Name)
from employees")
● 대소문자, 맨 앞자리 대문자 나머지 소문자
sqldf("select last_name,upper(last_name), lower(last_Name),
upper(substr(last_name,1,1)) || lower(substr(last_name,2)) as 'inicap'
from employees") #별칭넣을 때 안에 큰따옴표면 별칭은 작은 따옴표, 작은따옴표면 큰따옴표로 표현
● 앞에서 두글자 뒤에서 두글자 추출(leftstr,rightstr)
sqldf("select last_name, length(last_Name),
substr(last_name,1,2), leftstr(last_name,2), substr(last_name,-2,2),rightstr(last_name,2),
reverse(last_name)
from employees") # 왼쪽의 두글자만 뽑기, 오른쪽 2글자 뽑기, reverse 반대로 뽑기
● 테이블이 없는 표현식을 만들 때 from 절 없이 표현(mysql)
sqldf("select 1+2")
● sqldf의 round, ceil,floor
sqldf("select round(45.926), round(45.926,0), round(45.926,1),round(45.926,2),
round(45.926,-1), round(45.926,-3)")
# 반올림함수에서 '-'(sql에서 10의 자리, 100의 자리..)는 의미가 없다
sqldf("select ceil(45.926),floor(45.926)")
사용가능한 sql연산자들
● 부서id가 10
sqldf("select * from employees where department_id = 10")
● 부서id가 null
sqldf("select * from employees where department_id is null")
● 부서 id가 null이 아닌 값
sqldf("select * from employees where department_id is not null")
● 부서id가 10 or 20
sqldf("select * from employees where department_id = 10 or department_id = 20")
sqldf("select * from employees where department_id in (10,20)")
● 급여가 10000 ~ 20000 and연산자
sqldf('select *
from employees
where salary >= 10000 and salary <= 20000')
● 급여 10000 ~ 20000 between연산자
sqldf('select *
from employees
where salary between 10000 and 20000')
● not between
sqldf('select *
from employees
where salary not between 10000 and 20000')
● like 연산자
sqldf("select *
from employees
where last_name like 'K%'")
● not like 연산자
sqldf("select *
from employees
where last_name not like 'K%'")
● like '_'옵션
sqldf("select *
from employees
where last_name like '_i%'")
■sqldf조인
●inner join
sqldf("select e.last_name,d.department_name
from employees e, departments d
where e.department_id = d.department_id")
●inner join
sqldf("select e.last_name,d.department_name
from employees e join departments d
on e.department_id = d.department_id")
●using
sqldf("select e.last_name,d.department_name
from employees e, departments d
using(department_id)")
● natural join
sqldf("select e.last_name,d.department_name
from employees e natural join departments d")
● natural join(=)
sqldf("select e.last_name,d.department_name
from employees e join departments d
on e.department_id = d.department_id
and e.manager_id = d.manager_id")
● left outer join
sqldf("select e.last_name,d.department_name
from employees e left outer join departments d
on e.department_id = d.department_id")
● right outer join (기능 x)
sqldf("select e.last_name,d.department_name
from employees e right outer join departments d
on e.department_id = d.department_id") # right outer join기능은 없다. 테이블의 위치만 바꾸면 되기 때문인듯하다
●full outer join (기능 x)
sqldf("select e.last_name,d.department_name
from employees e full outer join departments d
on e.department_id = d.department_id") # full outer join 기능도 없다.
● 합집합 (full outer join없는 대신 합집합을 이용해서 표현하기, 중복제거) # 동명이인은 안나오는 문제
sqldf("select e.last_name,d.department_name
from employees e left outer join departments d
on e.department_id = d.department_id
union
select e.last_name,d.department_name
from departments d left outer join employees e
on e.department_id = d.department_id")
● 합집합 (full outer join없는 대신 합집합을 이용해서 표현하기, 중복제거) # 동명이인이 있을 수 있으니 이렇게 사용
sqldf("select e.employee_id, e.last_name,d.department_name
from employees e left outer join departments d
on e.department_id = d.department_id
union
select e.employee_id, e.last_name,d.department_name
from departments d left outer join employees e
on e.department_id = d.department_id")
● 합집합(중복제거 x)
sqldf("select e.last_name,d.department_name
from employees e left outer join departments d
on e.department_id = d.department_id
union all
select e.last_name,d.department_name
from departments d left outer join employees e
on e.department_id = d.department_id")
union -> union all not exists
rm(list=ls())
ls()
employees <- read.csv('c:/data/employees.csv',header = T)
departments <- read.csv('c:/data/departments.csv',header = T)
● 합집합(중복제거 대신 중복되지 않는 조건으로 합집합만들기)
sqldf("select e.employee_id, e.last_name,d.department_name
from employees e left outer join departments d
on e.department_id = d.department_id
union all
select NULL,NULL, department_name
from departments d
where not exists (select 'x'
from employees
where department_id = d.department_id)")
● departments만 있고 employees에는 없는 값들
sqldf("select department_name
from departments d
where not exists (select 'x'
from employees
where department_id = d.department_id)")
● intersect 교집합 연산자
sqldf("select e.employee_id,e.last_name,d.department_name
from employees e left outer join departments d
on e.department_id = d.department_id
intersect
select e.employee_id,e.last_name,d.department_name
from departments d left outer join employees e
on e.department_id = d.department_id")
● except 차집합 연산자
sqldf("select e.employee_id,e.last_name,d.department_name
from employees e left outer join departments d
on e.department_id = d.department_id
except
select e.employee_id,e.last_name,d.department_name
from departments d left outer join employees e
on e.department_id = d.department_id")
● 그룹함수
sqldf("select sum(salary), avg(salary),min(salary),max(salary), variance(salary) 분산 ,stdev(salary) 표준편차
from employees")
● 그룹함수,그룹조건
sqldf("select department_id, sum(salary)
from employees
group by department_id
having sum(salary) >= 10000
order by 2 desc")
●자신의 부서 평균 급여보다 더 많이 받는 사원들의 정보를 출력해주세요.
sqldf("select e2.*
from (select department_id, avg(salary) avg_sal
from employees
group by department_id) e1, employees2 e2
where e1.department_id = e2.department_id
and e2.salary > e1.avg_sal")
● nvl, nvl2 함수는 R에서 사용불가(직접 쿼리문 작성)
# nvl2(commission_pct,(salary*12)+(salary*12*commission_pcy),salary * 12)
sqldf("select last_name, salary , commission_pct,
case
when commission_pct is null then salary * 12
else
(salary * 12) + (salary * 12 * commission_pct)
end ann_sal
from employees")
● rank함수
sqldf("select last_name, salary, rank() over(order by salary desc)
from employees")
● dense_rank함수(연이은순위)
sqldf("select last_name, salary, dense_rank() over(order by salary desc)
from employees")
⊙fruits_sales.csv file 읽어 들인 후 과일 이름별 판매량, 판매 합계를 구하세요.(sqldf를 이용하세요)
sales <- read.csv('c:/data/fruits_sales.csv',header = T)
sales
sqldf("select name, sum(qty),sum(price)
from sales
group by name")
⊙fruits_sales.csv file 읽어 들인 후 연도별로 판매량 중에 가장 많은 판매를 한 년도를 출력해주세요.(sqldf를 이용하세요)
1.
x <- plyr::ddply(sales,'year',summarise,qty=sum(qty))
x[x$qty == max(x$qty),'year']
2.
sales%>%
dplyr::group_by(year)%>%
dplyr::summarise(sum_qty=sum(qty))%>%
dplyr::filter(sum_qty == max(sum_qty))%>%
dplyr::select(year)
3.
sqldf("select year
from sales
group by year
having sum(qty) = (select max(sumqty)
from(select sum(qty) sumqty
from sales
group by year))")
# sql에서는 되는데 r에서는 안되는 풀이
select year, sum(qty) qt
from sales
group by year
having sum(qty) = (select max(sum(qty))
from sales
group by year)
⊙부서별 급여를 기준으로 내림차순 순위를 구하세요.(연이은 순위).(sqldf를 이용하세요)
sqldf("select last_name, department_id, salary, dense_rank() over(partition by department_id order by salary desc) 순위
from employees
order by 3,4")
⊙부서별 최소 급여자들의 정보를 출력해주세요. (sqldf를 이용하세요)
1.in사용
sqldf("select *
from employees e2
where (department_id, salary) in (select department_id, min(salary) over(partition by department_id)
from employees)")
2. min사용
sqldf("select *
from (select employee_id, last_name, salary, department_id,
min(salary) over(partition by department_id) min_sal,
case when salary = min(salary) over(partition by department_id) then 1 end case_min
from employees)
where case_min = 1")
3.dense_rank 사용
sqldf("select *
from (select *,
dense_rank() over(partition by department_id order by salary) 순위
from employees)
where 순위 = 1")
⊙동일한 날짜에 입사한 사원들의 정보를 출력
# 본인을 제외하고 나머지 hire_date랑 같은 사원들의 정보 출력
sqldf("select *
from employees e
where exists(select 'x'
from employees
where hire_date = e.hire_date
and employee_id != e.employee_id)
order by 6")
'R' 카테고리의 다른 글
R - reshape2,cut,히스토그램,상자그림 (0) | 2022.01.26 |
---|---|
R barplot, 산점도 (0) | 2022.01.25 |
R dplyr, rank 함수 (0) | 2022.01.20 |
R subset,ddply 함수 (0) | 2022.01.19 |
R merge (0) | 2022.01.17 |