Tidy Tuesday Exercise

# Option 1: tidytuesdayR package 
#install.packages("tidytuesdayR")
library(tidytuesdayR)
Warning: package 'tidytuesdayR' was built under R version 4.3.3
tuesdata <- tidytuesdayR::tt_load(2024, week = 30)
--- Compiling #TidyTuesday Information for 2024-07-23 ----
--- There are 6 files available ---
--- Starting Download ---

    Downloading file 1 of 6: `auditions.csv`
    Downloading file 2 of 6: `eliminations.csv`
    Downloading file 3 of 6: `finalists.csv`
    Downloading file 4 of 6: `ratings.csv`
    Downloading file 5 of 6: `seasons.csv`
    Downloading file 6 of 6: `songs.csv`
--- Download complete ---
auditions <- tuesdata$auditions
eliminations <- tuesdata$eliminations
finalists <- tuesdata$finalists
ratings <- tuesdata$ratings
seasons <- tuesdata$seasons
songs <- tuesdata$songs
head(auditions)
# A tibble: 6 × 12
  season audition_date_start audition_date_end audition_city      audition_venue
   <dbl> <date>              <date>            <chr>              <chr>         
1      1 2002-04-20          2002-04-22        Los Angeles, Cali… Westin Bonave…
2      1 2002-04-23          2002-04-25        Seattle, Washingt… Hyatt Regency…
3      1 2002-04-26          2002-04-28        Chicago, Illinois  Congress Plaz…
4      1 2002-04-29          2002-05-01        New York City, Ne… Millenium Hil…
5      1 2002-05-03          2002-05-05        Atlanta, Georgia   AmericasMart/…
6      1 2002-05-05          2002-05-07        Dallas, Texas      Wyndham Anato…
# ℹ 7 more variables: episodes <chr>, episode_air_date <chr>,
#   callback_venue <chr>, callback_date_start <date>, callback_date_end <date>,
#   tickets_to_hollywood <dbl>, guest_judge <chr>
head(eliminations)
# A tibble: 6 × 46
  season place gender contestant        top_36 top_36_2 top_36_3 top_36_4 top_32
   <dbl> <chr> <chr>  <chr>             <chr>  <chr>    <chr>    <chr>    <chr> 
1      1 1     Female Kelly Clarkson    <NA>   <NA>     <NA>     <NA>     <NA>  
2      1 2     Male   Justin Guarini    <NA>   <NA>     <NA>     <NA>     <NA>  
3      1 3     Female Nikki McKibbin    <NA>   <NA>     <NA>     <NA>     <NA>  
4      1 4     Female Tamyra Gray       <NA>   <NA>     <NA>     <NA>     <NA>  
5      1 5     Male   R. J. Helton      <NA>   <NA>     <NA>     <NA>     <NA>  
6      1 6     Female Christina Christ… <NA>   <NA>     <NA>     <NA>     <NA>  
# ℹ 37 more variables: top_32_2 <chr>, top_32_3 <chr>, top_32_4 <chr>,
#   top_30 <chr>, top_30_2 <chr>, top_30_3 <chr>, top_25 <chr>, top_25_2 <chr>,
#   top_25_3 <chr>, top_24 <chr>, top_24_2 <chr>, top_24_3 <chr>, top_20 <chr>,
#   top_20_2 <chr>, top_16 <chr>, top_14 <chr>, top_13 <chr>, top_12 <chr>,
#   top_11 <chr>, top_11_2 <chr>, wildcard <chr>, comeback <lgl>, top_10 <chr>,
#   top_9 <chr>, top_9_2 <chr>, top_8 <chr>, top_8_2 <chr>, top_7 <chr>,
#   top_7_2 <chr>, top_6 <chr>, top_6_2 <chr>, top_5 <chr>, top_5_2 <chr>, …
head(finalists)
# A tibble: 6 × 6
  Contestant          Birthday  Birthplace           Hometown Description Season
  <chr>               <chr>     <chr>                <chr>    <chr>        <dbl>
1 Kelly Clarkson      24-Apr-82 Fort Worth, Texas    Burleso… "She perfo…      1
2 Justin Guarini      28-Oct-78 Columbus, Georgia    Doylest… "He perfor…      1
3 Nikki McKibbin      28-Sep-78 Grand Prairie, Texas <NA>     "She had p…      1
4 Tamyra Gray         26-Jul-79 Takoma Park, Maryla… Atlanta… "She had a…      1
5 R. J. Helton        17-May-81 Pasadena, Texas      Cumming… "J. Helton…      1
6 Christina Christian 21-Jun-81 Brooklyn, New York   <NA>     ".Christin…      1
head(ratings)
# A tibble: 6 × 17
  season show_number episode    airdate `18_49_rating_share` viewers_in_millions
   <dbl>       <dbl> <chr>      <chr>   <chr>                              <dbl>
1      1           1 Auditions  June 1… 4.8                                 9.85
2      1           2 Hollywood… June 1… 5.2                                11.2 
3      1           3 Top 30: G… June 1… 5.2                                10.3 
4      1           4 Top 30: G… June 1… 4.7                                 9.47
5      1           5 Top 30: G… June 2… 4.5                                 9.08
6      1           6 Top 30: G… June 2… 4.2                                 8.53
# ℹ 11 more variables: timeslot_et <chr>, dvr_18_49 <chr>,
#   dvr_viewers_millions <chr>, total_18_49 <chr>,
#   total_viewers_millions <chr>, weekrank <chr>, ref <lgl>, share <chr>,
#   nightlyrank <dbl>, rating_share_households <chr>, rating_share <chr>
head(seasons)
# A tibble: 6 × 10
  season winner     runner_up original_release original_network hosted_by judges
   <dbl> <chr>      <chr>     <chr>            <chr>            <chr>     <chr> 
1      1 Kelly Cla… Justin G… June 11 (2002-0… Fox              Ryan Sea… Paula…
2      2 Ruben Stu… Clay Aik… January 21 (200… Fox              Ryan Sea… Paula…
3      3 Fantasia … Diana De… January 19 (200… Fox              Ryan Sea… Paula…
4      4 Carrie Un… Bo Bice   January 18 (200… Fox              Ryan Sea… Paula…
5      5 Taylor Hi… Katharin… January 17 (200… Fox              Ryan Sea… Paula…
6      6 Jordin Sp… Blake Le… January 16 (200… Fox              Ryan Sea… Paula…
# ℹ 3 more variables: no_of_episodes <dbl>, finals_venue <chr>, mentor <chr>
head(songs)
# A tibble: 6 × 8
  season    week                 order contestant song  artist song_theme result
  <chr>     <chr>                <dbl> <chr>      <chr> <chr>  <chr>      <chr> 
1 Season_01 20020618_top_30_gro…     1 Tamyra Gr… And … Jenni… <NA>       Advan…
2 Season_01 20020618_top_30_gro…     2 Jim Verra… When… Doris… <NA>       Advan…
3 Season_01 20020618_top_30_gro…     3 Adriel He… I'll… Edwin… <NA>       Elimi…
4 Season_01 20020618_top_30_gro…     4 Rodesia E… Dayd… The M… <NA>       Elimi…
5 Season_01 20020618_top_30_gro…     5 Natalie B… Crazy Patsy… <NA>       Elimi…
6 Season_01 20020618_top_30_gro…     6 Brad Estr… Just… James… <NA>       Elimi…
library(tidyverse)
clean_songs = songs %>% 
  mutate(artist = as.factor(artist)) %>% 
  mutate(result = as.factor(result))

summary(clean_songs)
    season              week               order         contestant       
 Length:2429        Length:2429        Min.   : 1.000   Length:2429       
 Class :character   Class :character   1st Qu.: 3.000   Class :character  
 Mode  :character   Mode  :character   Median : 5.000   Mode  :character  
                                       Mean   : 5.931                     
                                       3rd Qu.: 8.000                     
                                       Max.   :40.000                     
                                                                          
     song                       artist      song_theme       
 Length:2429        Stevie Wonder  :  56   Length:2429       
 Class :character   The Beatles    :  51   Class :character  
 Mode  :character   Elton John     :  48   Mode  :character  
                    Queen          :  34                     
                    Whitney Houston:  32                     
                    Billy Joel     :  26                     
                    (Other)        :2182                     
          result    
 Safe        :1155  
 Eliminated  : 476  
 Advanced    : 217  
 Bottom three:  96  
 Wild Card   :  59  
 (Other)     : 396  
 NA's        :  30  
plot(clean_songs$artist)
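
The artist factor has far too many levels for a legible base plot(). A small sketch of lumping everything outside the ten most frequent artists before plotting (top_artists is just an illustrative name; forcats and ggplot2 come in with the tidyverse loaded above):

top_artists = clean_songs %>% 
  mutate(artist = fct_lump(artist, 10)) %>%   # keep the 10 most frequent artists, lump the rest into "Other"
  filter(artist != "Other")

ggplot(top_artists, aes(x = fct_infreq(artist))) +
  geom_bar() +
  coord_flip()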

Splitting audition_city into city and state, and converting both to factors

clean_auditions = auditions %>% 
  separate(audition_city, c('city', 'state'), sep = ",") %>% 
    mutate(city = as.factor(city)) %>% 
  mutate(state = as.factor(state))
Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [98,
102].
summary(clean_auditions)
     season      audition_date_start  audition_date_end               city    
 Min.   : 1.00   Min.   :2002-04-20   Min.   :2002-04-22   Los Angeles  : 10  
 1st Qu.: 6.00   1st Qu.:2006-08-11   1st Qu.:2006-08-11   Nashville    :  7  
 Median :10.00   Median :2010-09-05   Median :2010-09-05   New York City:  7  
 Mean   :10.37   Mean   :2011-04-14   Mean   :2011-04-14   San Francisco:  7  
 3rd Qu.:15.00   3rd Qu.:2015-09-05   3rd Qu.:2015-09-05   Atlanta      :  6  
 Max.   :18.00   Max.   :2019-09-21   Max.   :2019-09-21   Austin       :  5  
                                                           (Other)      :100  
         state    audition_venue       episodes         episode_air_date  
  California:22   Length:142         Length:142         Length:142        
  Texas     :13   Class :character   Class :character   Class :character  
  Georgia   : 9   Mode  :character   Mode  :character   Mode  :character  
  New York  : 9                                                           
  Tennessee : 9                                                           
 (Other)    :78                                                           
 NA's       : 2                                                           
 callback_venue     callback_date_start  callback_date_end   
 Length:142         Min.   :2002-02-06   Min.   :2002-02-06  
 Class :character   1st Qu.:2006-10-02   1st Qu.:2006-10-03  
 Mode  :character   Median :2010-11-09   Median :2010-11-10  
                    Mean   :2011-06-11   Mean   :2011-06-12  
                    3rd Qu.:2015-09-13   3rd Qu.:2015-09-14  
                    Max.   :2019-09-21   Max.   :2019-09-21  
                    NA's   :13           NA's   :13          
 tickets_to_hollywood guest_judge       
 Min.   :  6.0        Length:142        
 1st Qu.: 20.0        Class :character  
 Median : 29.0        Mode  :character  
 Mean   : 41.8                          
 3rd Qu.: 37.0                          
 Max.   :561.0                          
 NA's   :48                             

Creating a state variable for the auditions, keeping the 15 most common states and lumping the rest into "Other"

clean_auditions$state =  fct_lump(clean_auditions$state, 15, other_level = "Other")


ggplot(clean_auditions, aes(x=reorder(state, state, function(x)-length(x))))+
  geom_bar()+
  coord_flip()
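
The same descending-frequency ordering can be written more directly with forcats::fct_infreq, for example:

ggplot(clean_auditions, aes(x = fct_infreq(state))) +
  geom_bar() +
  coord_flip()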

Mean number of tickets to Hollywood per state

clean_auditions %>% 
  group_by(state) %>% 
  summarise(mean = mean(tickets_to_hollywood)) %>% arrange(desc(mean))
# A tibble: 22 × 2
   state             mean
   <fct>            <dbl>
 1 " Missouri"       35  
 2 " Massachusetts"  28.3
 3 " Alabama"        NA  
 4 " California"     NA  
 5 " Colorado"       NA  
 6 " Florida"        NA  
 7 " Georgia"        NA  
 8 " Illinois"       NA  
 9 " Louisiana"      NA  
10 " Michigan"       NA  
# ℹ 12 more rows
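
Most states come back NA because mean() returns NA whenever any tickets_to_hollywood value in the group is missing. A sketch of the same summary with missing values dropped via na.rm = TRUE (mean_tickets is just an illustrative column name):

clean_auditions %>% 
  group_by(state) %>% 
  summarise(mean_tickets = mean(tickets_to_hollywood, na.rm = TRUE)) %>% 
  arrange(desc(mean_tickets))
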
Eliminations

clean_elim = eliminations %>% 
  mutate(gender = as.factor(gender)) %>% 
  mutate(place = as.factor(place))
library(stringr)

bin_elim = eliminations

# For placements given as ranges (e.g. "9–10"), keep only the leading one or two characters
bin_elim = bin_elim %>% 
  select(place, gender) %>% 
  mutate(place = ifelse(str_length(place) > 2, substr(place, start = 1, stop = 2), place)) 


bin_elim
# A tibble: 456 × 2
   place gender
   <chr> <chr> 
 1 1     Female
 2 2     Male  
 3 3     Female
 4 4     Female
 5 5     Male  
 6 6     Female
 7 7     Female
 8 8     Male  
 9 9–    Male  
10 9–    Male  
# ℹ 446 more rows

Attempting to bin placements into 4 categories

bin_elim2 = bin_elim %>% 
  mutate(place = recode(place,
                        `1` = '1-5',  `2` = '1-5',  `3` = '1-5',  `4` = '1-5',  `5` = '1-5',
                        `6` = '6-10', `7` = '6-10', `8` = '6-10', `9` = '6-10', `10` = '6-10',
                        `11` = '10-15', `12` = '10-15', `13` = '10-15', `14` = '10-15', `15` = '10-15',
                        `16` = '16+', `17` = '16+', `18` = '16+', `21` = '16+')) %>% 
  mutate(place = as.factor(place))


bin_elim2$place = fct_lump(bin_elim2$place, 4, other_level = "Other")

Practically there; the last step is to plot the gender breakdown within each placement bin.

ggplot(bin_elim2, aes(x=gender))+
  geom_bar()+
  facet_wrap(~place)
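
An alternative to the long recode chain (and to the truncation above, which leaves entries such as "9–"): pull the leading number out of each place string and cut it into bins. A sketch assuming readr::parse_number(), which extracts the first number it finds (so "9–10" becomes 9, and any row without a number becomes NA); bin_elim3, place_num, and place_bin are illustrative names, and the bin labels are non-overlapping here (11-15 rather than 10-15):

bin_elim3 = eliminations %>% 
  select(place, gender) %>% 
  mutate(place_num = readr::parse_number(place),
         place_bin = cut(place_num,
                         breaks = c(0, 5, 10, 15, Inf),
                         labels = c("1-5", "6-10", "11-15", "16+")))

ggplot(bin_elim3, aes(x = gender)) +
  geom_bar() +
  facet_wrap(~place_bin)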

Creating the Year variable

substrRight <- function(x, n){
  substr(x, nchar(x)-n+1, nchar(x))
}


year = substrRight(ratings$airdate, 4)
ratings$year = year


ratings1 = ratings %>% 
    filter(str_length(airdate) > 11)


viewers_by_year = ratings1 %>% 
  mutate(year = substrRight(airdate, 4)) %>% 
  group_by(year) %>% 
  summarise(mean = mean(viewers_in_millions)) %>% arrange(desc(mean))

viewers_by_year
# A tibble: 16 × 2
   year   mean
   <chr> <dbl>
 1 2006  29.5 
 2 2007  29.1 
 3 2008  26.4 
 4 2009  24.6 
 5 2011  22.8 
 6 2010  22.2 
 7 2012  17.3 
 8 2013  13.3 
 9 2002  11.9 
10 2016   9.09
11 2018   7.84
12 2019   7.16
13 2020   6.88
14 2003  NA   
15 2004  NA   
16 2005  NA   

Yikes. The NA means for 2003-2005 appear because mean() returns NA for any year with missing viewers_in_millions values; adding na.rm = TRUE, as in the tickets-to-Hollywood sketch above, would drop the missing episodes from those averages.

plot(viewers_by_year)

Hypothesis/Question

While there are not many obvious modeling questions that I can see, I think an interesting one is trying to predict the number of viewers from some of the other variables in the ratings table.

clean_ratings = ratings1 %>% 
  select(season, show_number, weekrank, year, viewers_in_millions) %>% 
  mutate(year = as.numeric(year)) %>% 
  mutate(weekrank = as.factor(weekrank)) %>% 
  drop_na()


summary(clean_ratings)
     season        show_number       weekrank        year     
 Min.   : 1.000   Min.   : 1.00   2      :114   Min.   :2002  
 1st Qu.: 4.000   1st Qu.: 9.00   1      :100   1st Qu.:2005  
 Median : 6.000   Median :18.00   3      : 58   Median :2007  
 Mean   : 6.815   Mean   :19.06   4      : 52   Mean   :2008  
 3rd Qu.:10.000   3rd Qu.:28.00   5      : 23   3rd Qu.:2011  
 Max.   :15.000   Max.   :44.00   6      : 19   Max.   :2016  
                                  (Other): 61                 
 viewers_in_millions
 Min.   : 5.98      
 1st Qu.:17.34      
 Median :23.38      
 Mean   :22.09      
 3rd Qu.:26.90      
 Max.   :38.10      
                    
clean_ratings$weekrank =  fct_lump(clean_ratings$weekrank, 10, other_level = NA)

ggplot(clean_ratings, aes(weekrank))+
  geom_bar()

clean_ratings$weekrank = as.numeric(clean_ratings$weekrank)
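
One caveat with the line above: as.numeric() on a factor returns the underlying integer level codes (and character levels sort as "1", "10", "11", "2", ...), not the original rank values, so the summary below reflects those codes. If the actual week ranks were wanted, the conversion would go through character first — a sketch of that alternative, not what was run for the results below:

# Convert via character so "10" stays 10 instead of becoming a level code; lumped NA values stay NA
clean_ratings$weekrank = as.numeric(as.character(clean_ratings$weekrank))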

summary(clean_ratings)
     season        show_number       weekrank           year     
 Min.   : 1.000   Min.   : 1.00   Min.   : 1.000   Min.   :2002  
 1st Qu.: 4.000   1st Qu.: 9.00   1st Qu.: 3.000   1st Qu.:2005  
 Median : 6.000   Median :18.00   Median : 3.000   Median :2007  
 Mean   : 6.815   Mean   :19.06   Mean   : 4.094   Mean   :2008  
 3rd Qu.:10.000   3rd Qu.:28.00   3rd Qu.: 5.000   3rd Qu.:2011  
 Max.   :15.000   Max.   :44.00   Max.   :11.000   Max.   :2016  
 viewers_in_millions
 Min.   : 5.98      
 1st Qu.:17.34      
 Median :23.38      
 Mean   :22.09      
 3rd Qu.:26.90      
 Max.   :38.10      
hist(clean_ratings$season)

hist(clean_ratings$show_number)

hist(clean_ratings$weekrank)

hist(clean_ratings$year)

Splitting the Data

library(caret)

clean_ratings = as.data.frame(clean_ratings)

set.seed(20)

split1 = sample(c(rep(0, 0.8 * nrow(clean_ratings)), rep(1,0.2*nrow(clean_ratings))))


train_data = clean_ratings[split1 == 0, ]
test_data = clean_ratings[split1 == 1, ]


train_y = train_data$viewers_in_millions
test_y = test_data$viewers_in_millions


train_x = train_data %>% 
  select(!viewers_in_millions)

test_x = test_data %>% 
  select(!viewers_in_millions)
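
A note on the split: rep(0, 0.8 * nrow(...)) truncates when 0.8 * nrow() is not a whole number, so the indicator vector can end up shorter than the data and get silently recycled during subsetting. A sketch of an equivalent 80/20 split using caret's createDataPartition (train_idx, train_data2, and test_data2 are illustrative names; the results below use the manual split above):

set.seed(20)
train_idx   = createDataPartition(clean_ratings$viewers_in_millions, p = 0.8, list = FALSE)
train_data2 = clean_ratings[train_idx, ]
test_data2  = clean_ratings[-train_idx, ]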

Preprocessing Training and Test Predictors

TrainXPP = preProcess(train_x, method = c("scale", "center", "BoxCox"))
TrainXTrans = predict(TrainXPP, train_x)
# Apply the preprocessing estimated on the training predictors to the test predictors,
# rather than estimating a separate preProcess object on the test set
TestXTrans = predict(TrainXPP, test_x)

Fitting 3 models

Linear Regression Model

library(tidymodels)
Warning: package 'tidymodels' was built under R version 4.3.3
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.5      ✔ rsample      1.2.1 
✔ dials        1.2.1      ✔ tune         1.2.1 
✔ infer        1.0.7      ✔ workflows    1.1.4 
✔ modeldata    1.4.0      ✔ workflowsets 1.1.0 
✔ parsnip      1.2.1      ✔ yardstick    1.3.1 
✔ recipes      1.0.10     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard()        masks purrr::discard()
✖ dplyr::filter()          masks stats::filter()
✖ recipes::fixed()         masks stringr::fixed()
✖ dplyr::lag()             masks stats::lag()
✖ caret::lift()            masks purrr::lift()
✖ yardstick::precision()   masks caret::precision()
✖ yardstick::recall()      masks caret::recall()
✖ yardstick::sensitivity() masks caret::sensitivity()
✖ yardstick::spec()        masks readr::spec()
✖ yardstick::specificity() masks caret::specificity()
✖ recipes::step()          masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages
library(broom)

lmmodel = linear_reg() %>% 
  set_mode("regression") %>% 
  set_engine("lm")

Random Forest

rf_model = rand_forest() %>% 
  set_engine("randomForest", imporance = TRUE) %>% 
  set_mode("regression")

Decision Tree

dt_model = decision_tree() %>% 
  set_engine("rpart") %>% 
  set_mode("regression")

Results

Setting options

set.seed(556)
folds = vfold_cv(train_data, v = 5)
formula = viewers_in_millions ~.
lm_wf = workflow() %>% 
  add_model(lmmodel) %>% 
  add_formula(formula)


set.seed(45)

lm_fit_rs =
  lm_wf %>% 
fit_resamples(folds)
→ A | warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
There were issues with some computations   A: x1
There were issues with some computations   A: x5
lm_fit_rs
# Resampling results
# 5-fold cross-validation 
# A tibble: 5 × 4
  splits           id    .metrics         .notes          
  <list>           <chr> <list>           <list>          
1 <split [273/69]> Fold1 <tibble [2 × 4]> <tibble [1 × 3]>
2 <split [273/69]> Fold2 <tibble [2 × 4]> <tibble [1 × 3]>
3 <split [274/68]> Fold3 <tibble [2 × 4]> <tibble [1 × 3]>
4 <split [274/68]> Fold4 <tibble [2 × 4]> <tibble [1 × 3]>
5 <split [274/68]> Fold5 <tibble [2 × 4]> <tibble [1 × 3]>

There were issues with some computations:

  - Warning(s) x5: prediction from rank-deficient fit; consider predict(., rankdefic...

Run `show_notes(.Last.tune.result)` for more information.
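
The rank-deficiency warning most likely reflects that year is an (almost) exact linear function of season in this table, so the linear model cannot separate the two predictors. A quick check of their correlation:

cor(train_data$season, train_data$year)
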
rf_wf = workflow() %>% 
  add_model(rf_model) %>% 
  add_formula(formula)


set.seed(45)

rf_fit_rs =
  rf_wf %>% 
fit_resamples(folds)
Warning: package 'randomForest' was built under R version 4.3.3
rf_fit_rs
# Resampling results
# 5-fold cross-validation 
# A tibble: 5 × 4
  splits           id    .metrics         .notes          
  <list>           <chr> <list>           <list>          
1 <split [273/69]> Fold1 <tibble [2 × 4]> <tibble [0 × 3]>
2 <split [273/69]> Fold2 <tibble [2 × 4]> <tibble [0 × 3]>
3 <split [274/68]> Fold3 <tibble [2 × 4]> <tibble [0 × 3]>
4 <split [274/68]> Fold4 <tibble [2 × 4]> <tibble [0 × 3]>
5 <split [274/68]> Fold5 <tibble [2 × 4]> <tibble [0 × 3]>
dt_wf = workflow() %>% 
  add_model(dt_model) %>% 
  add_formula(formula)

set.seed(45)

dt_fit_rs =
  dt_wf %>% 
fit_resamples(folds)

dt_fit_rs
# Resampling results
# 5-fold cross-validation 
# A tibble: 5 × 4
  splits           id    .metrics         .notes          
  <list>           <chr> <list>           <list>          
1 <split [273/69]> Fold1 <tibble [2 × 4]> <tibble [0 × 3]>
2 <split [273/69]> Fold2 <tibble [2 × 4]> <tibble [0 × 3]>
3 <split [274/68]> Fold3 <tibble [2 × 4]> <tibble [0 × 3]>
4 <split [274/68]> Fold4 <tibble [2 × 4]> <tibble [0 × 3]>
5 <split [274/68]> Fold5 <tibble [2 × 4]> <tibble [0 × 3]>

Metrics

Metrics = rbind(collect_metrics(lm_fit_rs),collect_metrics(rf_fit_rs),collect_metrics(dt_fit_rs))

cbind(model = c("LM","LM", "RF","RF","DT","DT"), Metrics)
  model .metric .estimator      mean n    std_err              .config
1    LM    rmse   standard 4.8040789 5 0.12140591 Preprocessor1_Model1
2    LM     rsq   standard 0.4799109 5 0.03524461 Preprocessor1_Model1
3    RF    rmse   standard 2.2305172 5 0.16620793 Preprocessor1_Model1
4    RF     rsq   standard 0.8967623 5 0.01307662 Preprocessor1_Model1
5    DT    rmse   standard 3.1343732 5 0.23193623 Preprocessor1_Model1
6    DT     rsq   standard 0.7760581 5 0.02892788 Preprocessor1_Model1

From these results we can see that the model with the best R^2 value is the random forest, which also has the lowest mean RMSE. Given this performance, we will use this model on the test data.

Determining the most important variables

library(vip)
rf_recipe <- 
  recipe(viewers_in_millions ~ ., data = train_data) 

rf_workflow <- 
  workflow() %>%
  add_model(rf_model) %>% 
  add_recipe(rf_recipe)


rf_workflow %>% 
  fit(train_data) %>% 
  extract_fit_parsnip() %>% 
  vip(num_features = 10)

Predictions on the Test Data

library(MLmetrics)
Warning: package 'MLmetrics' was built under R version 4.3.3

Attaching package: 'MLmetrics'
The following objects are masked from 'package:caret':

    MAE, RMSE
The following object is masked from 'package:base':

    Recall
set.seed(24)
rf_fit =
  rf_model %>% 
  fit(viewers_in_millions ~., data = train_data)



rf_testing_pred <- 
  predict(rf_fit, test_data, type = "numeric") %>% 
  bind_cols(test_data %>% select(viewers_in_millions))

head(rf_testing_pred)
# A tibble: 6 × 2
  .pred viewers_in_millions
  <dbl>               <dbl>
1  12.9               10.3 
2  11.8                9.47
3  12.1               10.3 
4  17.3               16.9 
5  24.4               26   
6  19.7               20.1 
rf_testing_pred = as.data.frame(rf_testing_pred)


preds = rf_testing_pred$.pred

obs = test_data$viewers_in_millions

rbind(RMSE = sqrt(mean((obs - preds)^2)),
      RSQ  = cor(obs, preds)^2)
          [,1]
RMSE 1.7716773
RSQ  0.9453627
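
For reference, yardstick (loaded with tidymodels) computes RMSE and R^2 (plus MAE) directly from the prediction tibble:

rf_testing_pred %>% 
  metrics(truth = viewers_in_millions, estimate = .pred)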

The R^2 value on the test set is actually higher than the cross-validated training estimate, and the RMSE is lower as well.

Conclusions

  1. I cleaned many different variables across several very messy datasets.

  2. I explored several variables, and while I struggled to find a good combination of predictors and responses, I did settle on a hypothesis.

  3. My question/objective was to predict the number of viewers from the numeric variables in the ratings table.

  4. I split and preprocessed the data and then ran three different models on the training data. The random forest performed best, and the most important predictors were season and year.

  5. I evaluated the chosen model (the random forest) on the test data and obtained favorable results.

I learned a tremendous amount about handling very messy data, as well as how to use tidymodels and recipes. I had absolutely no experience with the latter, and I now see why they are such a staple for many data scientists.