[R] 데이터 전처리 및 시각화 (4)

choiwonjin 2024. 2. 23. 14:30

Kaggle에 있는 Netflix Movies and TV Shows 데이터 셋으로 전처리 및 시각화를 해보겠습니다.

EDA 주제는 다음과 같습니다.

① 컨텐츠 타입 비율
② 영상물 등급 분포
③ 제작 국가 분포
④ 연도별 발매량 분포
⑤ 핵심 주제 텍스트

https://www.kaggle.com/datasets/shivamb/netflix-shows/code

Netflix Movies and TV Shows

Listings of movies and tv shows on Netflix - Regularly Updated

www.kaggle.com

먼저 데이터를 살펴보겠습니다.

# Load the Netflix titles CSV downloaded from Kaggle.
# The directory is stored once and the file is read through an explicit path,
# so the session's working directory does not have to be changed with setwd().
data_dir <- 'C:\\Users\\32217778\\Downloads\\netflix_titles'
ds <- read.csv(file.path(data_dir, 'netflix_titles.csv'))

# Inspect the column types and the first few rows.
str(ds)
head(ds)

데이터를 보면 작품명, 감독, 출연진, 영상물 등급, 개요 등 영화와 TV 쇼의 구체적인 정보를 담고 있습니다.

# Number of NA cells in each column.
colSums(is.na(ds))

director, cast 등의 칼럼에는 결측치가 NA가 아닌 빈 문자열('')로 저장되어 있어 is.na 함수로는 결측치를 파악할 수 없습니다. 따라서 각 칼럼에서 빈 문자열의 개수를 세어 결측치 여부를 확인합니다.

# Number of empty-string cells in each column.
# na.rm = TRUE keeps the count usable for columns that also contain NA:
# comparing NA with "" yields NA, which would otherwise make the column sum NA.
colSums(ds == "", na.rm = TRUE)

감독, 출연진, 나라, 등록일 칼럼 등에 결측치가 존재하지만, 해당 행을 제거하면 데이터의 양이 크게 감소하고, 결측치가 이번 분석에 큰 영향을 주지는 않는다고 판단했기 때문에 제거하지 않겠습니다.

① 컨텐츠 타입 비율

# Preprocessing

library(dplyr)

# Frequency table of the content types as a data frame (columns Var1, Freq).
cts_type <- data.frame(table(ds$type))

# Give the category column a readable name.
names(cts_type)[names(cts_type) == "Var1"] <- "Type"

# Share of each type in percent, rounded to one decimal place.
cts_type$ratio <- round(cts_type$Freq / sum(cts_type$Freq) * 100, 1)

# Visualization

library(ggplot2)

# Pie chart of the content-type shares: a single stacked bar is wrapped into
# polar coordinates along y, and each slice is labelled with its percentage
# at the middle of the slice.
ggplot(cts_type, aes(x = '', y = Freq, fill = Type)) +
  geom_col(width = 5) +                     # same as geom_bar(stat = "identity")
  coord_polar("y") +
  theme_void() +
  geom_text(aes(label = paste0(ratio, "%")),
            position = position_stack(vjust = 0.5)) +
  labs(title = "Ratio of Contents Type")    # title typo ("Constents") fixed

② 영상물 등급 분포

# Preprocessing

library(tidyverse)

# Count titles per (type, rating) pair.
# Only the 5 most frequent ratings keep their own level; all remaining
# ratings are lumped into a single "Other" level.
tot_rating <- ds %>%
  select(type, rating) %>%
  mutate(rating = fct_lump(rating, n = 5)) %>%
  group_by(type, rating) %>%
  # .groups = "drop" returns an ungrouped result; without it the output stays
  # grouped by `type` and summarise() prints a regrouping message.
  summarise(count = n(), .groups = "drop")

# Visualization
# Interactive graph (a graph the user can interact with)

library(plotly)

# Bar chart of title counts per content type, one coloured bar per rating.
# Column references are written as formulas (leading `~`) so that plotly
# evaluates x, y, color, text and hovertext against `tot_rating`.
plot_ly(
  data = tot_rating,
  type = "bar",
  x = ~type,
  y = ~count,
  color = ~rating,
  # Count printed outside each bar.
  text = ~count,
  textposition = 'outside',
  textfont = list(color = "black", size = 12),
  # Tooltip content; hoverinfo = 'text' shows only this text on hover.
  hovertext = ~paste0("<Type> : ", type, "<br>",
                      "<Rating> : ", rating, "<br>",
                      "<Count> : ", count),
  hoverinfo = 'text'
) %>%
  layout(
    title = "Rating by Type",
    xaxis = list(title = ""),
    yaxis = list(title = "")
  )

TV-MA(17세 이상), TV-14(14세 이상), TV-PG(보호자 지도 권장), R(17세 이상), PG-13(13세 이상) 순서로 상위 5개 등급임을 확인할 수 있습니다.

③ 제작 국가 분포

# Preprocessing

# Title counts per country value, largest first.
# NOTE(review): if a `country` cell can list several countries, each distinct
# combination is counted as its own category here - confirm against the data.
country <- ds %>%
  filter(country != '') %>%                      # drop rows with a blank country
  select(country) %>%
  mutate(country = fct_lump(country, n = 15)) %>% # keep the 15 most frequent values, lump the rest
  group_by(country) %>%
  summarise(count = n()) %>%                     # number of rows per country value
  arrange(desc(count)) %>%
  data.frame()                                   # return a plain data frame

# Visualization
# Interactive graph

# Horizontal bar chart of the number of titles per country.
plot_ly(country,
        x = ~count,
        y = ~country,
        type = "bar",
        orientation = 'h') %>%
  layout(title = "Top Country with Highest Number of Movie & TV Show",
         yaxis = list(title = "",
                      # Order the bars by their value, largest bar on top.
                      # categoryarray expects category names, so passing the
                      # count column does not produce a count-based ordering.
                      categoryorder = "total ascending"))

④ 연도별 발매량 분포

# Preprocessing

# Number of titles per content type and release year, restricted to
# release years from 2000 onwards.
year <- ds %>%
  select(type, release_year) %>%
  filter(release_year >= 2000) %>%
  group_by(type, release_year) %>%
  # .groups = "drop" returns an ungrouped result; without it the output stays
  # grouped by `type` and summarise() prints a regrouping message.
  summarise(count = n(), .groups = "drop")

# Visualization

# One line per content type showing the number of titles by release year.
ggplot(year, aes(x = release_year, y = count)) +
  geom_line(aes(color = type), lwd = 1) +
  labs(x = 'Release Year', y = '',
       title = 'Distribution of Contents Released')   # title typo ("Realesed") fixed

⑤ 핵심 주제 텍스트

library(RColorBrewer)
library(tidytext)

# Tokenize the descriptions of one content type into one word per row and
# remove stop words such as "the", "and", "is".
#
# data:         data frame with `type` and `description` columns.
# content_type: value of `type` to keep, e.g. "Movie" or "TV Show".
# Returns one row per remaining word, with columns `type` and `word`.
extract_description_words <- function(data, content_type) {
  data %>%
    select(type, description) %>%
    filter(type == content_type) %>%
    unnest_tokens(word, description) %>%   # split the raw text into a `word` column
    # Explicit join key, so the join does not have to guess the common column
    # and no "Joining with ..." message is printed.
    anti_join(stop_words, by = "word")
}

# Movie

movie_word <- extract_description_words(ds, "Movie")

# Word frequencies, most frequent first.
cnt_movie_word <- movie_word %>%
  count(word, sort = TRUE)


# TV Show

tv_word <- extract_description_words(ds, "TV Show")

# Word frequencies, most frequent first.
cnt_tv_word <- tv_word %>%
  count(word, sort = TRUE)


# Data preview: first six rows of each token table and its frequency table.

head(movie_word, n = 6L)
head(cnt_movie_word, n = 6L)
head(tv_word, n = 6L)
head(cnt_tv_word, n = 6L)

두 타입 모두 빈도가 높은 단어 순으로 정렬했습니다.

# Visualization

library(wordcloud)

# Word cloud of movie descriptions.
# The set of plotted words is controlled by min.freq alone. max.words is a cap
# on the NUMBER of words, so it is left at its default instead of being set to
# the highest word frequency, which is an unrelated quantity.
wordcloud(words = cnt_movie_word$word,
          freq = cnt_movie_word$n,
          min.freq = 70,                       # plot only words occurring at least 70 times
          random.order = FALSE,                # most frequent words are placed in the centre
          rot.per = 0.1,                       # share of words drawn rotated by 90 degrees
          colors = brewer.pal(8, "Dark2"))     # colours assigned by frequency


# Word cloud of TV show descriptions (same settings, lower frequency cut-off).
wordcloud(words = cnt_tv_word$word,
          freq = cnt_tv_word$n,
          min.freq = 30,                       # plot only words occurring at least 30 times
          random.order = FALSE,
          rot.per = 0.1,
          colors = brewer.pal(8, "Dark2"))

두 타입 모두 공통적으로 Life, Family, Friends, Love 등의 단어들이 많이 보이는 것을 알 수 있습니다.

데이터 제공

Shivam Bansal in Kaggle (@shivamb)