데브코스/TIL

2024-01-02 데브코스 3주차 - 5 TIL

Hynnjnn 2024. 1. 2. 18:44

Indian Restaurant Dataset

SELECT * FROM `dev-course`.indian_restaurants;

mySQL에서 여러 스키마 있을 경우 스키마.테이블명으로 접근해야 해서 불편하다. 이럴 경우 USE문을 한번 실행해준 후 사용하면 된다.

USE `dev-course`;
SELECT * FROM indian_restaurants;

USE `dev-course`;

-- restaurant name 중복 확인
SELECT count(1) as cnt, count(distinct restaurant_name) as cnt_distinct
from restaurants;

-- cnt 1 이상인 restaurant name 찾기
SELECT restaurant_name, count(1) as cnt
FROM restaurants
GROUP BY 1
HAVING cnt > 1;

-- 몇개 뽑아서 location이 다른지, location과 name이 같은 경우가 있는지 확인
SELECT *
FROM restaurants
WHERE restaurant_name in ('7th Heaven', '1441 Pizzeria', '1944 -The HOCCO Kitchen')
ORDER BY restaurant_name, location;

SELECT count(1) as cnt, count(distinct restaurant_name, location, fast_food_or_not) as cnt_distinct
FROM restaurants;
-- 결론: 행을 구분할 수 있는 유니크 키가 존재하지 않는다.

-- 레스토랑 이름별로 평점 확인
SELECT restaurant_name, count(1) as cnt, avg(rating) as avg_rating, avg(average_price) as avg_price, avg(average_delivery_time) as avg_delivery_time
FROM restaurants
GROUP BY 1
ORDER BY 3 DESC;
-- cnt 1인 레스토랑이 평점이 높다?

-- 실제로 cnt 1인 레스토랑이 평점이 높은지 확인
WITH counts as (
    SELECT restaurant_name, count(1) as cnt
    FROM restaurants
    GROUP BY 1
)
SELECT CASE cnt
       WHEN 1 THEN 'cnt-1'
       WHEN 2 THEN 'cnt-2'
       ELSE 'cnt-ov3'
       END AS cnt_group, avg(rating) as avg_rating
FROM counts INNER JOIN restaurants ON counts.restaurant_name = restaurants.restaurant_name
GROUP BY 1;
-- 그렇진 않다. 오히려 레시피 등이 정형화된 여러 지점이 있는 레스토랑이 평균 평점이 높다. 개인 레스토랑은 편차가 클 것.

WITH counts as (
    SELECT restaurant_name, count(1) as cnt
    FROM restaurants
    GROUP BY 1
)
SELECT counts.restaurant_name, cnt, rating
FROM counts INNER JOIN restaurants ON counts.restaurant_name = restaurants.restaurant_name;
-- export 해서 파이썬으로 지점 개수에 따른 히스토그램

-- Pearson 상관계수? -1 ~ 1

-- 큰 도시 작은도시 구분? -> location 별 레스토랑 개수로 판단
WITH cnts as (
    SELECT location, count(1) as cnt
    FROM restaurants
    GROUP BY 1
)
SELECT location, cnt
FROM cnts
ORDER BY 2 DESC
LIMIT 30;

-- percent 사용해서 cnt 개수로 랭크
WITH cnts as (
    SELECT location, count(1) as cnt
    FROM restaurants
    GROUP BY 1
)
SELECT location, percent_rank() over (ORDER BY cnt) as cnt_rank
FROM cnts
ORDER BY 2 DESC
LIMIT 30;

-- 상위 5퍼 location
WITH cnts as (
    SELECT location, count(1) as cnt
    FROM restaurants
    GROUP BY 1
),
res as (
    SELECT location, percent_rank() over (ORDER BY cnt) as cnt_rank
    FROM cnts
)
SELECT location
FROM res
WHERE cnt_rank > 0.95;

-- 하위 15퍼 location
WITH cnts as (
    SELECT location, count(1) as cnt
    FROM restaurants
    GROUP BY 1
),
res as (
    SELECT location, percent_rank() over (ORDER BY cnt) as cnt_rank
    FROM cnts
)
SELECT location
FROM res
WHERE cnt_rank < 0.15;


-- binary 값에 따른 평균 가격
SELECT IF(south_indian_or_not = 0, 'south-0', 'south-1') as group_name, AVG(average_price) as avg_price
FROM restaurants
GROUP BY 1

UNION

SELECT IF(north_indian_or_not = 0, 'north-0', 'north-1') as group_name, AVG(average_price) as avg_price
FROM restaurants
GROUP BY 1

UNION

SELECT IF(fast_food_or_not = 0, 'fast-0', 'fast-1') as group_name, AVG(average_price) as avg_price
FROM restaurants
GROUP BY 1

UNION

SELECT IF(street_food = 0, 'street-0', 'street-1') as group_name, AVG(average_price) as avg_price
FROM restaurants
GROUP BY 1

UNION

SELECT IF(north_indian_or_not = 0, 'north-0', 'north-1') as group_name, AVG(average_price) as avg_price
FROM restaurants
GROUP BY 1

UNION

SELECT IF(biryani_or_not = 0, 'biryani-0', 'biryani-1') as group_name, AVG(average_price) as avg_price
FROM restaurants
GROUP BY 1

UNION

SELECT IF(bakery_or_not = 0, 'bakery-0', 'bakery-1') as group_name, AVG(average_price) as avg_price
FROM restaurants
GROUP BY 1;
-- 차이가 있는 것을 확인


-- Rishikesh=가장 저렴한 레스토랑, Shimla=가장 비싼 레스토랑 비교
WITH base as (
    SELECT location, south_indian_or_not, north_indian_or_not, fast_food_or_not, street_food, biryani_or_not, bakery_or_not
    FROM restaurants
    WHERE location in ('Rishikesh', 'Shimla')
)
SELECT location, count(1) as tot_cnt, SUM(south_indian_or_not) as south_cnt,
SUM(north_indian_or_not) as north_cnt, SUM(fast_food_or_not) as fast_cnt,
SUM(street_food) as street_cnt, SUM(biryani_or_not) as biryani_cnt,
SUM(bakery_or_not) as bakery_cnt
FROM base
GROUP BY 1;


-- binary값에 따른 평균 평점, 평균 배달시간도 확인
SELECT IF(south_indian_or_not = 0, 'south-0', 'south-1') as group_name, AVG(average_price) as avg_price,
AVG(average_delivery_time) as avg_time, ROUND(AVG(rating), 3) as avg_rating
FROM restaurants
GROUP BY 1

UNION

SELECT IF(north_indian_or_not = 0, 'north-0', 'north-1') as group_name, AVG(average_price) as avg_price,
AVG(average_delivery_time) as avg_time, ROUND(AVG(rating), 3) as avg_rating
FROM restaurants
GROUP BY 1

UNION

SELECT IF(fast_food_or_not = 0, 'fast-0', 'fast-1') as group_name, AVG(average_price) as avg_price,
AVG(average_delivery_time) as avg_time, ROUND(AVG(rating), 3) as avg_rating
FROM restaurants
GROUP BY 1

UNION

SELECT IF(street_food = 0, 'street-0', 'street-1') as group_name, AVG(average_price) as avg_price,
AVG(average_delivery_time) as avg_time, ROUND(AVG(rating), 3) as avg_rating
FROM restaurants
GROUP BY 1

UNION

SELECT IF(north_indian_or_not = 0, 'north-0', 'north-1') as group_name, AVG(average_price) as avg_price,
AVG(average_delivery_time) as avg_time, ROUND(AVG(rating), 3) as avg_rating
FROM restaurants
GROUP BY 1

UNION

SELECT IF(biryani_or_not = 0, 'biryani-0', 'biryani-1') as group_name, AVG(average_price) as avg_price,
AVG(average_delivery_time) as avg_time, ROUND(AVG(rating), 3) as avg_rating
FROM restaurants
GROUP BY 1

UNION

SELECT IF(bakery_or_not = 0, 'bakery-0', 'bakery-1') as group_name, AVG(average_price) as avg_price,
AVG(average_delivery_time) as avg_time, ROUND(AVG(rating), 3) as avg_rating
FROM restaurants
GROUP BY 1;

-- 평점 가장 낮은/높은 지역(DESC)
SELECT location, AVG(rating) as avg_rating, count(1) as cnt
FROM restaurants
GROUP BY 1
ORDER BY 2
LIMIT 10;

WITH base as (
    SELECT location, south_indian_or_not, north_indian_or_not, fast_food_or_not, street_food, biryani_or_not, bakery_or_not
    FROM restaurants
    WHERE location in ('Junagadh', 'Motihari')
)
SELECT location, count(1) as tot_cnt, SUM(south_indian_or_not) as south_cnt,
SUM(north_indian_or_not) as north_cnt, SUM(fast_food_or_not) as fast_cnt,
SUM(street_food) as street_cnt, SUM(biryani_or_not) as biryani_cnt,
SUM(bakery_or_not) as bakery_cnt
FROM base
GROUP BY 1;

Global AI, ML, Data Science Salary

USE `dev-course`;

SELECT work_year, count(1) as cnt
FROM salary
GROUP BY 1;

-- 데이터가 많은 2022, 2023년만 비교
SELECT work_year, AVG(salary_in_usd) as usd_salary
FROM salary
WHERE work_year in ('2022', '2023')
GROUP BY 1
ORDER BY 1;

-- 연봉이 왜 올랐나?

-- 숙련도에 따른 연봉 차이
SELECT experience_level, AVG(salary_in_usd) as usd_salary
FROM salary
GROUP BY 1
ORDER BY 2;

-- 직무에 따른 연봉 차이
SELECT job_title, AVG(salary_in_usd) as usd_salary
FROM salary
GROUP BY 1
ORDER BY 2;
-- 직무명에 직급이 포함된 경우가 많음 -> 나중에 비슷한 직무끼리 그룹핑 ㄱㄱ

-- 국가
SELECT employee_residence, AVG(salary_in_usd) as usd_salary
FROM salary
GROUP BY 1
ORDER BY 2;

-- 원격 출근
SELECT remote_ratio, AVG(salary_in_usd) as usd_salary
FROM salary
GROUP BY 1
ORDER BY 2;

-- 회사 규모
SELECT company_size, AVG(salary_in_usd) as usd_salary
FROM salary
GROUP BY 1
ORDER BY 2;

-- 2022 > 2023 숙련도별 직원 수 차이
WITH bef as (
    SELECT work_year, experience_level, count(1) as cnt_2022
    FROM salary
    WHERE work_year = '2022'
    GROUP BY 1, 2
),
aft as (
    SELECT work_year, experience_level, count(1) as cnt_2023
    FROM salary
    WHERE work_year = '2023'
    GROUP BY 1, 2
)
SELECT bef.*, aft.cnt_2023
FROM bef INNER JOIN aft ON bef.experience_level = aft.experience_level;
-- export 해서 비율로
-- 고연봉 직급의 비율이 늘어서 평균 연봉이 높아졌다.


WITH bef as (
    SELECT work_year, company_size, count(1) as cnt_2022
    FROM salary
    WHERE work_year = '2022'
    GROUP BY 1, 2
),
aft as (
    SELECT work_year, company_size, count(1) as cnt_2023
    FROM salary
    WHERE work_year = '2023'
    GROUP BY 1, 2
)
SELECT bef.*, aft.cnt_2023
FROM bef INNER JOIN aft ON bef.company_size = aft.company_size;
-- 평균 연봉이 높은 중간 규모 회사의 비율이 늘었음
-- 결론: 22년에 비해 23년의 평균 연봉이 증가한 이유는 시니어 비중이 높아지고, 중간 규모 회사의 비중이 높아졌기 때문이다.


-- 연도별 원격 근무 비중
SELECT work_year, AVG(remote_ratio) as avg_remote_ratio
FROM salary
WHERE work_year in ('2022', '2023')
GROUP BY 1
ORDER BY 1;

-- 미국내 미국 외 근무 비율
WITH bef as (
    SELECT work_year, IF(employee_residence = 'US', 'In-US', 'Out-US') as residence_group, count(1) as cnt_2022
    FROM salary
    WHERE work_year = '2022'
    GROUP BY 1, 2
),
aft as (
    SELECT work_year, IF(employee_residence = 'US', 'In-US', 'Out-US') as residence_group, count(1) as cnt_2023
    FROM salary
    WHERE work_year = '2023'
    GROUP BY 1, 2
)
SELECT bef.*, aft.cnt_2023
FROM bef INNER JOIN aft ON bef.residence_group = aft.residence_group;
-- 해외 근무 비율이 줄어듦


-- 직무별 평균 연봉, 원격 근무 비중
SELECT CASE
       WHEN job_title LIKE '%Scientist%' THEN 'S'
       WHEN job_title LIKE '%Director%' THEN 'D'
       WHEN job_title LIKE '%Engineer%' THEN 'E'
       WHEN job_title LIKE '%Analyst%' THEN 'A'
       WHEN job_title LIKE '%Architect%' THEN 'AC'
       WHEN job_title LIKE '%Consultant%' THEN 'C'
       WHEN job_title LIKE '%Manager%' THEN 'M'
       WHEN job_title LIKE '%Specialist%' THEN 'SP'
       WHEN job_title LIKE '%Practitioner%' THEN 'P'
       ELSE 'OTHER'
       END AS job_group, AVG(salary_in_usd) as avg_salary, AVG(remote_ratio) as avg_remote_ratio, count(1) as cnt
FROM salary
GROUP BY 1
ORDER BY 3;

-- 직무별 미국 내외 근무 비중
WITH base as (
SELECT CASE
       WHEN job_title LIKE '%Scientist%' THEN 'S'
       WHEN job_title LIKE '%Director%' THEN 'D'
       WHEN job_title LIKE '%Engineer%' THEN 'E'
       WHEN job_title LIKE '%Analyst%' THEN 'A'
       WHEN job_title LIKE '%Architect%' THEN 'AC'
       WHEN job_title LIKE '%Consultant%' THEN 'C'
       WHEN job_title LIKE '%Manager%' THEN 'M'
       WHEN job_title LIKE '%Specialist%' THEN 'SP'
       WHEN job_title LIKE '%Practitioner%' THEN 'P'
       ELSE 'OTHER'
       END AS job_group, IF(employee_residence = 'US', 'In-US', 'Out-US') as residence_group
FROM salary
),
g_1 as (
    SELECT job_group, residence_group, count(1) as group_cnt
    FROM base
    GROUP BY 1, 2
),
g_2 as (
    SELECT job_group, count(1) as tot_cnt
    FROM base
    GROUP BY 1
)
SELECT g_1.job_group, residence_group, group_cnt, group_cnt / tot_cnt as ratio
FROM g_1 INNER JOIN g_2 ON g_1.job_group = g_2.job_group;

-- 숙련도(categorical) 연봉(숫자)간의 상관계수? -> Point-Biserial

NBA Players

USE `dev-course`;

-- 시즌별 데이터 분포?
SELECT season, count(1) as cnt
FROM nba
GROUP BY 1
ORDER BY 1;

-- player_name으로 구분 가능한지
SELECT count(1) as cnt, count(distinct season, player_name) as d_cnt
FROM nba;

-- 동명이인 확인
SELECT player_name, season, count(1) as cnt
FROM nba
GROUP BY 1, 2
HAVING cnt > 1;


-- 중복 선수 확인
WITH dups as (
    SELECT player_name, season, count(1) as cnt
    FROM nba
    GROUP BY 1, 2
    HAVING cnt > 1
),
tot as (
    SELECT *
    FROM nba
)
SELECT tot.*
FROM tot INNER JOIN dups ON tot.player_name = dups.player_name and tot.season = dups.season;

-- 선수 구분하기 위해서는 세 개의 컬럼이 필요 (선수명, 시즌, 팀 or 대학)
SELECT count(1) as cnt, count(distinct player_name, season, team_abbreviation) as d_cnt, count(distinct player_name, season, college) as d_cnt_2
FROM nba;

-- 시즌별 신장, 체중, 경기 수, 득점, 리바운드, 어시스트
SELECT season, AVG(player_height) as height, AVG(player_weight) as weight, AVG(gp) as gp,
AVG(pts) as pts, AVG(reb) as reb, AVG(ast) as ast
FROM nba
GROUP BY 1;


-- 상위(드래프트 1라운드, 10순위 이내) 드래프트 선수들의 평균 신장 체중 나이
SELECT draft_year, AVG(player_height), AVG(player_weight), AVG(age)
FROM nba
WHERE draft_round = 1
AND draft_number <= 10
GROUP BY 1;


-- 시즌별 득점왕?
WITH max_pts as (
    SELECT season, MAX(pts) as max_pt
    FROM nba
    GROUP BY 1
)
SELECT m.season, player_name, age, player_height, player_weight, max_pt, gp
FROM max_pts m INNER JOIN nba n ON m.season = n.season AND m.max_pt = n.pts
ORDER BY 1;


-- 시즌별 각 선수 랭크
SELECT season, player_name, rank() over (PARTITION BY season ORDER BY pts DESC) as pts_rank, pts
FROM nba;


-- 시즌별 상위 10명의 득점 수 
WITH base as (
    SELECT season, player_name, rank() over (PARTITION BY season ORDER BY pts DESC) as pts_rank, pts
    FROM nba
)
SELECT season, AVG(pts) as pts
FROM base
WHERE pts_rank <= 10
GROUP BY 1
ORDER BY 1;