library(readtext) #For import and Handling for Plain and Formatted Text Files.library(rvest) #For easily Harvest (Scrape) Web Pages.library(xml2) #For working with XML files using a simple, consistent interface.library(polite) #For be responsible when scraping data from websites.library(httr) #Package for working with HTTP organised by HTTP verbs library(tidyverse) #Opinionated collection of R packages designed for data science.library(tidytext) #Functions and supporting data sets to allow conversion of text.library(quanteda) #OUR PACKAGE for text analysis. library(quanteda.textstats) #OUR SUBPACKAGE for text statistics. library(quanteda.textplots) #OUR SUBPACKAGE for text plots. library(stringr) #Consistent Wrappers for Common String Operations.library(spacyr) #NLP package that comes from Python that help us classify words.library(ggsci) #Collection of high-quality color palettes.library(ggrepel) # ggrepel provides geoms for ggplot2 to repel overlapping text labelslibrary(RColorBrewer) #Beautifull color palettes.library(cowplot) #Package to put images in our plots.library(magick) #Package for save images in our environmentlibrary(gghighlight) #gghighlight() adds direct labels for some geoms.#Set imageobj_img <-image_read(path ="https://bit.ly/3twmH2Y")
Packages for the analysis
Tidyverse: set of pacakges that will help us to wrangle our objetcs, dataframes, plots, etc. (Amazing tool)
Quanteda: set of pacakges that will help us to wrangle our objetcs, dataframes, plots, etc. (Amazing tool)
Rvest: set of pacakges that will help us to wrangle our objetcs, dataframes, plots, etc. (Amazing tool)
stringr: set of pacakges that will help us to wrangle our objetcs, dataframes, plots, etc. (Amazing tool)
spacyr: set of pacakges that will help us to wrangle our objetcs, dataframes, plots, etc. (Amazing tool)
Letโs start
Agenda for review
1
2
3
Web scraping ๐๏ธ
Web scraping ๐๏ธ
v_tv_show <-"how-i-met-your-mother"v_url_web <-"http://www.springfieldspringfield.co.uk/"#Remember to be polite and know if we can web scrap the webpagesession_information <-bow(v_url_web) #Do a bow with the polite packagesession_informationv_url <-paste(v_url_web,"episode_scripts.php?tv-show=", v_tv_show, sep="")#Identify yourselfrvest_himym <-session(v_url, add_headers(`From`="jurjoo@gmail.com", `UserAgent`=R.Version()$version.string))#Start web scraphtml_url_scrape <- rvest_himym %>%read_html(v_url)node_selector <-".season-episode-title"directory_path <-paste("texts/how-i-met-your-mother/", v_tv_show, sep ="")
First step: we must download the TV show scripts. For that, we have multiple options, but one efficient way to do it is by applying some web scrap techniques to obtain our texts and other relevant information.
This chunk of code shows how we can retrieve data from the internet. For our purpose, we will use Sprigfield webpage. Here, you can download the original TV scripts from multiple shows; in our case, we will download the How I Met Your Mother scripts.
html_url_all_seasons <-html_nodes(html_url_scrape, node_selector) %>%html_attr("href")### One loop for all our URL's----------------------------------------for (x in html_url_all_seasons) { read_ur <-read_html(paste(v_url_web, x, sep="/"))Sys.sleep(runif(1, 0, 1)) #Be polite# Element node that was checked and that contain the place of the scripts. selector <-".scrolling-script-container"# Scrape the text text_html <-html_nodes(read_ur, selector) %>%html_text()# Last five characters of html_url_all_seasons for saving this to separate text files (This is our pattern). sub_data <-function(x, n) {substr(x, nchar(x) - n +1, nchar(x)) } seasons_final <-sub_data(x, 5)# Write each text filewrite.csv(text_html, file =paste(directory_path, "_", seasons_final, ".txt", sep=""), row.names =FALSE)}
๐ชก Webscrapp TV Show tables
url_himym <-"https://en.wikipedia.org/wiki/List_of_How_I_Met_Your_Mother_episodes"rvest_himym_table <-session(url_himym, add_headers(`From`="jurjoo@gmail.com", `UserAgent`=R.Version()$version.string))l_tables_himym <- rvest_himym_table %>%read_html() %>%html_nodes("table") %>%html_table(fill =TRUE)#This generates a list with all the tables that contain the page. In our case, #we want the table from the second element till the 10th. l_tables_himym <- l_tables_himym[c(2:10)]
Data cleaning to wrangle html tables (Characters of the TV show)
Code
#Reduce the list in one data frame since all of the tables share the same structure df_himym <-data.frame(Reduce(bind_rows, l_tables_himym)) #We do the same for the characters of HIMYMurl_himym_characters <-"https://en.wikipedia.org/wiki/List_of_How_I_Met_Your_Mother_characters"rvest_himym_table_2 <-session(url_himym_characters, add_headers(`From`="jurjoo@gmail.com", `UserAgent`=R.Version()$version.string))l_tables_himym_characters <- rvest_himym_table_2 %>%read_html() %>%html_nodes("table") %>%html_table(fill =TRUE)df_characters <-as.data.frame(l_tables_himym_characters[[1]]) %>%select(Character)df_characters_w <- df_characters %>%filter(!stringr::str_starts(Character, "Futu"),!(Character %in%c("Character", "Main Characters", "Supporting Characters"))) %>%mutate(name =str_extract(Character,"([^ ]+)"),name =replace(name, name =="Dr.", "Sonya"))rmarkdown::paged_table(df_characters_w)
Data cleaning to wrangle html tables (Information of the TV Show)
#We bind the tables with bind_rowsdf_himym <-data.frame(Reduce(bind_rows, l_tables_himym)) df_himym_filt <- df_himym %>%filter(str_length(No.overall) <4)df_himym_filt_dupl <- df_himym %>%filter(str_length(No.overall) >4)#We are doing this particular wrangling to format in the best possible way our tables. #Note that we are using stringr to manipulate our characters.df_himym_filt_dupl_1 <- df_himym_filt_dupl %>%mutate(No.overall =as.numeric(replace(No.overall, str_length(No.overall) >4, substr(No.overall, 1, 3))),No..inseason =as.numeric(replace(No..inseason, str_length(No..inseason) >3, substr(No..inseason, 1, 2))),Prod.code =replace (Prod.code, str_length(Prod.code) >3, substr(Prod.code, 1, 6)))df_himym_filt_dupl_2 <- df_himym_filt_dupl %>%mutate(No.overall =as.numeric(replace(No.overall, str_length(No.overall) >4, substr(No.overall, 4, 6))),No..inseason =as.numeric(replace(No..inseason, str_length(No..inseason) >3, substr(No..inseason, 3, 4))),Title =replace(Title, Title =="\"The Magician's Code\"", "\"The Magician's Code Part 2\""),Title =replace(Title, Title =="\"The Final Page\"", "\"The Final Page Part 2\""),Title =replace(Title, Title =="\"Last Forever\"" , "\"Last Forever Part 2\"" ),Prod.code =replace(Prod.code, str_length(Prod.code) >3, substr(Prod.code, 7, 12)))df_himym_final <-bind_rows(df_himym_filt, df_himym_filt_dupl_1, df_himym_filt_dupl_2) %>%arrange(No.overall, No..inseason) %>%mutate(year =str_extract(Original.air.date, '[0-9]{4}+'),Season =as.numeric(stringr::str_extract(Prod.code, "^.{1}"))) %>%rename(Chapter = No..inseason)df_himym_final$US.viewers.millions. <-as.numeric(str_replace_all(df_himym_final$US.viewers.millions., "\\[[0-9]+\\]", ""))
Look our corpus, itโs divided into types, tokens and even sentences.
corp_himym <-corpus(df_himym_final_doc) #Build a new corpus from the textsdocnames(corp_himym) <- df_himym_final_doc$Titlesummary(corp_himym, n =15)
Corpus consisting of 208 documents, showing 15 documents:
Text Types Tokens Sentences Season Chapter No.overall
"Pilot" 879 3901 400 1 1 1
"Purple Giraffe" 789 3836 358 1 2 2
"Sweet Taste of Liberty" 838 3610 380 1 3 3
"Return of the Shirt" 899 3971 403 1 4 4
"Okay Awesome" 784 3014 309 1 5 5
"Slutty Pumpkin" 909 3652 430 1 6 6
"Matchmaker" 930 3841 396 1 7 7
"The Duel" 909 3965 444 1 8 8
"Belly Full of Turkey" 923 3811 352 1 9 9
"The Pineapple Incident" 783 3824 451 1 10 10
"The Limo" 775 3821 365 1 11 11
"The Wedding" 875 4042 451 1 12 12
"Drumroll, Please" 812 3603 401 1 13 13
"Zip, Zip, Zip" 854 3496 386 1 14 14
"Game Night" 883 3383 350 1 15 15
Title Directed.by Written.by
"Pilot" Pamela Fryman Carter Bays & Craig Thomas
"Purple Giraffe" Pamela Fryman Carter Bays & Craig Thomas
"Sweet Taste of Liberty" Pamela Fryman Phil Lord & Chris Miller
"Return of the Shirt" Pamela Fryman Kourtney Kang
"Okay Awesome" Pamela Fryman Chris Harris
"Slutty Pumpkin" Pamela Fryman Brenda Hsueh
"Matchmaker" Pamela Fryman Chris Marcil & Sam Johnson
"The Duel" Pamela Fryman Gloria Calderon Kellett
"Belly Full of Turkey" Pamela Fryman Phil Lord & Chris Miller
"The Pineapple Incident" Pamela Fryman Carter Bays & Craig Thomas
"The Limo" Pamela Fryman Sam Johnson & Chris Marcil
"The Wedding" Pamela Fryman Kourtney Kang
"Drumroll, Please" Pamela Fryman Gloria Calderon Kellett
"Zip, Zip, Zip" Pamela Fryman Brenda Hsueh
"Game Night" Pamela Fryman Chris Harris
Original.air.date Prod.code US.viewers.millions. year Season_w
September 19, 2005 (2005-09-19) 1ALH79 10.94 2005 Season 1
September 26, 2005 (2005-09-26) 1ALH01 10.40 2005 Season 1
October 3, 2005 (2005-10-03) 1ALH02 10.44 2005 Season 1
October 10, 2005 (2005-10-10) 1ALH03 9.84 2005 Season 1
October 17, 2005 (2005-10-17) 1ALH04 10.14 2005 Season 1
October 24, 2005 (2005-10-24) 1ALH05 10.89 2005 Season 1
November 7, 2005 (2005-11-07) 1ALH07 10.55 2005 Season 1
November 14, 2005 (2005-11-14) 1ALH06 10.35 2005 Season 1
November 21, 2005 (2005-11-21) 1ALH09 10.29 2005 Season 1
November 28, 2005 (2005-11-28) 1ALH08 12.27 2005 Season 1
December 19, 2005 (2005-12-19) 1ALH10 10.36 2005 Season 1
January 9, 2006 (2006-01-09) 1ALH11 11.49 2006 Season 1
January 23, 2006 (2006-01-23) 1ALH12 10.82 2006 Season 1
February 6, 2006 (2006-02-06) 1ALH13 10.94 2006 Season 1
February 27, 2006 (2006-02-27) 1ALH14 9.82 2006 Season 1
Title_season
"Pilot" S1 EP1
"Purple Giraffe" S1 EP2
"Sweet Taste of Liberty" S1 EP3
"Return of the Shirt" S1 EP4
"Okay Awesome" S1 EP5
"Slutty Pumpkin" S1 EP6
"Matchmaker" S1 EP7
"The Duel" S1 EP8
"Belly Full of Turkey" S1 EP9
"The Pineapple Incident" S1 EP10
"The Limo" S1 EP11
"The Wedding" S1 EP12
"Drumroll, Please" S1 EP13
"Zip, Zip, Zip" S1 EP14
"Game Night" S1 EP15
๐ฅฝ Second step: Convert corpus into tokens and wrangle it. Look our tokenization, we separate our text into words. Amazing!
corp_himym_stat <- corp_himymdocnames(corp_himym_stat) <- df_himym_final_doc$Title_seasoncorp_himym_s1_simil <-corpus_subset(corp_himym_stat, Season ==1) #We want to analyze just the first seasontoks_himym_s1 <-tokens(corp_himym_s1_simil, #corpus from all the episodes from the first seasonremove_punct =TRUE, #Remove punctuation of our textsremove_separators =TRUE, #Remove separators of our textsremove_numbers =TRUE, #Remove numbers of our textsremove_symbols =TRUE) %>%#Remove symbols of our textstokens_remove(stopwords("english")) #Remove stop words of our textstoks_himym_s1