LAB 8: Temporal data and string manipulation plus Web scraping!

BIO3782: Biologist's Toolkit (Dalhousie University)


Setup of workspace

Make sure the required files are in the working directory:

As in previous labs, we'll try to simulate "real-life" coding by using the tags below to indicate when to use RStudio's script editor and when to use the console:







Let's load up some packages that we'll need.
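Something like the following, assuming dplyr, stringr, rvest and ggplot2, which are all used later in this lab:

    library(dplyr)     # data manipulation (mutate(), etc.)
    library(stringr)   # string helpers (str_extract())
    library(rvest)     # web scraping
    library(ggplot2)   # plotting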


Temporal data

Time is a surprisingly difficult thing to get right. The problem is that people store time in many different ways. French Canada uses 24-hour notation, while English-speaking Canada tends to use 12-hour notation. In the US, there is the habit of using month-day-year rather than day-month-year. Time zones are a pain, and many countries observe daylight saving time for part of the year. And that is just the start of the problems. So, let's begin with some notation.

Date coding notation in R

There are several functions to manipulate dates, times, or both. It is generally recommended to stick to the simplest level you need: use dates if you just have dates, and times only if you have times. The date-only function is as.Date().

Because there are so many ways to keep track of time, you often need to specify the specific date-time notation associated with your data, so that R can handle and convert dates and times into a standardized format. In as.Date(), the notation is:

Code Value
%d Day of the month (decimal number)
%m Month (decimal number)
%b Month (abbreviated)
%B Month (full name)
%y Year (2 digit)
%Y Year (4 digit)

For any date you might have, a date object can be created by passing a string into the as.Date() function and specifying what it looks like:
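For example, using the notation described in the note below:

    as.Date('15/1/2021', format='%d/%m/%Y', tz='AST')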

Note that in the format= argument you have to specify the date-time notation that matches your data, including any characters used to separate the elements of your date (e.g. -, /, _, etc.). The specific notation in this case is %d for "day", then a /, then %m for "month", then another /, and finally %Y for year. Also, in this case, the time zone tz= is AST for Atlantic Standard Time.

Date objects

Let's make the following two variables:
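    date1 <- '15/1/2021'                                         # a plain character string
    date2 <- as.Date('15/1/2021', format='%d/%m/%Y', tz='AST')   # a Date object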


Now let's print the two variables to screen by doing...
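    date1
    # [1] "15/1/2021"
    date2
    # [1] "2021-01-15"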


As you can see, both return something that looks like a date. Our brain quickly deduces that '15/1/2021' and 2021-01-15 are both dates and, in fact, the same date in different formats. However, R does not see date1 and date2 the same way your brain does. R sees date1 as simply a string of characters, like 'hello world', while R understands that date2 is a date object that contains temporal information. This is evident if we query the class of those variables:
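    class(date1)
    # [1] "character"
    class(date2)
    # [1] "Date"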


Dates are weird because they are a mixture of units of uneven size: 12 months containing different numbers of days, which themselves vary by year. Let's say you want to know how many days there are between a series of sampling dates. Calculating this by hand would be a nightmare, unless you cleverly stored your data in a date object, which allows you to do all kinds of "temporal math". We can determine the length of time between two dates by using the diff() function.

First let's create a vector of dates called sample_dates.
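A sketch with illustrative dates (the lab's original dates may differ):

    sample_dates <- as.Date(c('2021-01-15', '2021-02-03', '2021-03-21', '2021-05-02'))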


Next, let's compute the difference between each date.
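    diff(sample_dates)
    # Time differences in days
    # [1] 19 46 42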


It's a good idea to specify the time zone where the time was collected, as this anchors time to a universal standard; things can get ambiguous quickly (try sampling reefs on a trip across the Pacific and keeping track of what time zone you're in, or even what hemisphere, once you come back and look at the data). Time zones are a common pitfall, as names we use locally may not apply universally (everyone wants Eastern Standard Time, it seems).

The full list of time zones is long and can also be printed in R. Below is how to get the first 6 time zones:
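Base R keeps the list in OlsonNames():

    head(OlsonNames())   # first 6 of the ~600 recognized time zone names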




How many days passed between each of the assassinations of JFK (November 22, 1963), Malcolm X (February 21, 1965), and Martin Luther King Jr.(April 4, 1968) in the 1960's?

We can also create a column/vector of dates. We'll use the function seq(), which creates a sequence of things.
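Based on the description below:

    my_dates <- seq(date2, length = 20, by = 'week')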


We have just created a vector of dates for 20 weeks starting from January 15, 2021. Let's check to see if the dates are separated weekly (by 7 days), by computing the time difference between each pair of elements in my_dates:
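    diff(my_dates)   # should print 7 days between each pair of dates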


In most date systems, dates are really stored as integers, with some specific day in history being day zero. Excel famously uses 1 January 1900 as its origin, and wrongly treats 1900 as a leap year so as to maintain the Microsoft obsession with backward compatibility. In R, 1 January 1970 is day zero, following the tradition of Unix.

To look at the integer day format for a datetime from the my_dates vector we created, we can use the following functions:
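Two equivalent ways in base R (the lab's original code may differ):

    as.numeric(my_dates)   # days since 1970-01-01
    unclass(my_dates)      # same numbers, exposed by stripping the Date class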




How many days would pass between the first and last dates if we ran this string of code seq(date2, length=20, by='day')?

In addition to turning dates into numbers, R can also turn dates into words, via a few convenience functions:
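For example, applied to the date2 object from earlier (all base R):

    weekdays(date2)    # "Friday"
    months(date2)      # "January"
    quarters(date2)    # "Q1"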




There is also the julian() function, which returns the number of days since time zero. 'Julian' is a nod to the Julian calendar declared by Julius Caesar in 46 BC; astronomers adopted a related day count that begins on Monday, January 1, 4713 BC (a date that precedes recorded history, chosen because three astronomical cycles coincided then), and the name stuck.
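    julian(date2)   # days since R's default origin, 1970-01-01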


It returns integers similar to the integer date format we talked about earlier.


Datetime objects

If you also have times in your data (datetime data), you can create a POSIX object. The name POSIX is an acronym for Portable Operating System Interface, which is a set of standards for maintaining compatibility between computer systems. POSIX notation adds additional codes to how things are specified:

Code Meaning
%a Abbreviated weekday
%A Full weekday
%b Abbreviated month
%B Full month
%c Locale-specific date and time
%d Decimal day of the month
%H Decimal hours (24 hour)
%I Decimal hours (12 hour)
%j Decimal day of the year
%m Decimal month
%M Decimal minute
%p Locale-specific AM/PM
%S Decimal second
%U Decimal week of the year (starting on Sunday)
%w Decimal weekday (0=Sunday)
%W Decimal week of the year (starting on Monday)
%x Locale-specific date
%X Locale-specific time
%y 2-digit year
%Y 4-digit year
%z Offset from GMT
%Z Time zone (character)



These codes reflect the various time components and conventions that people use globally.

In R, the POSIX conversions for datetime objects are handled by two functions:

  1. as.POSIXct() creates an atomic object of the number of seconds since time zero (ct = calendar time)
  2. as.POSIXlt() creates a list of time attributes (lt = list time)

Let's take a look at the difference between the two by creating two objects using similar dates.
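A sketch with illustrative timestamps (the lab's original values may differ):

    time1 <- as.POSIXct('2021-01-15 10:30:00', tz = 'America/Halifax')
    time2 <- as.POSIXlt('2021-01-16 10:30:00', tz = 'America/Halifax')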


Here is the object we created with POSIXct
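    unclass(time1)   # a single number: seconds since 1970-01-01 (plus a tzone attribute)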


Here is the object we created with POSIXlt
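    unclass(time2)   # a list of parts: $sec, $min, $hour, $mday, $mon, $year, ...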


Because lists carry more computational overhead, unless you need the separate time components, the best course is to stick with as.POSIXct(), where all the conversions are handled behind the scenes. POSIXct objects also work a little more intuitively than as.Date objects.

Let's take a look at the difference between the two time objects we created (time1,time2).
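Using the illustrative objects above:

    time2 - time1
    # Time difference of 1 days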


The fact that these work in seconds means you can add to them coherently, provided you convert your increments to seconds first.
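For example, adding one day's worth of seconds:

    time1 + 60 * 60 * 24   # 24 hours (86400 seconds) later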




Use POSIXct to determine the number of seconds Apollo 11 took between takeoff (July 16, 1969, 13:32:00) and landing on the Moon (July 20, 1969, 20:17:40).

POSIXct objects will keep track of daylight saving time, which is applied "willy-nilly" among provinces, states, and countries.


strptime()

Finally, there is the strptime() function, an internal workhorse that takes a string and converts it into a time data type.

Let's create a dataframe called events.
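A sketch with illustrative events; tibble() (which loads with dplyr) gives the column-type printout described below:

    events <- tibble(
      event = c('sample A', 'sample B', 'sample C'),
      time  = c('15/1/2021 10:00', '15/1/2021 14:30', '16/1/2021 09:15')
    )
    events   # note the <chr> under the time column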


Notice that the time variable is a character instead of a date (see the <chr> below the time column title?). We could use mutate() and as.Date() to change it into the right format, or we could use the strptime() function instead.
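A sketch, using the format string that matches the illustrative data above; wrapping strptime() in as.POSIXct() keeps the column atomic:

    events <- mutate(events, time = as.POSIXct(strptime(time, format = '%d/%m/%Y %H:%M')))
    events   # time now shows <dttm>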


You can see now it shows <dttm> below the time column title.

The problem with strptime is that it makes some assumptions that might mess things up for you if they go undetected. For example...


and...


... returns the results in different units (first in seconds, then in hours). This might mess you up if you're scripting to extract times like we do below.
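A sketch of the gotcha, with hypothetical timestamps: the difference between two strptime() results picks its own units depending on the size of the gap:

    t0 <- strptime('15/1/2021 10:00:00', format = '%d/%m/%Y %H:%M:%S')
    t0 - strptime('15/1/2021 09:59:30', format = '%d/%m/%Y %H:%M:%S')
    # Time difference of 30 secs
    t0 - strptime('15/1/2021 05:00:00', format = '%d/%m/%Y %H:%M:%S')
    # Time difference of 5 hours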





Most often, we won't create dates and times by hand. Instead we will import them from a flat file. Here we can download daily wind and rainfall data for London.

For your convenience, we have provided the data from 2017 in the Weather_Data_2017.csv file.

Using the Date.and.Time column timestamp, calculate the average length of time (in hours) between gale force (i.e. >34 knots) maximum gust records.
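A sketch of one approach. The gust column name (Max.Gust) and the timestamp format are assumptions; check names(weather) and the raw file against your data:

    weather <- read.csv('Weather_Data_2017.csv')
    weather$Date.and.Time <- as.POSIXct(weather$Date.and.Time, format = '%Y-%m-%d %H:%M')  # format assumed
    gale <- weather[weather$Max.Gust > 34, ]                      # gale-force records (>34 knots)
    mean(as.numeric(diff(gale$Date.and.Time), units = 'hours'))  # average gap, forced into hours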






How many observations from Weather_Data_2017.csv have hourly maximum gusts > 34 knots?



What is the highest recorded hourly maximum gust?



What is the length of time (in hours) between gale force (i.e. >34 kts) maximum gust records?




Using Weather_Data_2017.csv, plot hourly maximum gust through time.
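A minimal ggplot sketch, with the same assumed column names as above:

    ggplot(weather, aes(x = Date.and.Time, y = Max.Gust)) +
      geom_line() +
      labs(x = 'Date', y = 'Hourly maximum gust (knots)')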




Your plot should look a bit like the one below:

String Manipulation

A surprising amount of what people do with computers involves text: searching for and manipulating strings within a programming language. In biology, the area with the major lock on text manipulation is bioinformatics. As the name implies, bioinformatics deals with biological information, especially the analysis of DNA, RNA and protein sequences. The challenges, and the scientific opportunities, of analyzing this information are incredible. In its simplest form, we can represent DNA/RNA and protein as text -- either nucleic acids or amino acids. Each base or amino acid is represented as a single letter (e.g. A/C/G/T for DNA). Stored in the sequence of nucleic and amino acids are all the instructions to create life. So strings are important.

Among the simplest but most crucial attributes of a string is its length. We'll use the nchar() function for that.
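For example, with a made-up DNA snippet:

    dna <- 'ATGGCGTATTTAAGC'
    nchar(dna)
    # [1] 15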


We can also take a slice of a string. To grab a section of a string by the positions of its letters, R has the substr() function.
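    substr(dna, 1, 3)   # characters 1 through 3
    # [1] "ATG"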


We can also split a string using the strsplit() function. We will use the character "a" to separate the strings.
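An illustrative example (the lab's original string may differ):

    strsplit('banana', 'a')
    # [[1]]
    # [1] "b" "n" "n"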


Notice here that the 'a' has disappeared. If we want to keep that 'a', we need to take a slice at the 'a' position. We can use the gregexpr() function to find the position of 'a'.
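    gregexpr('a', 'banana')   # positions 2, 4 and 6, wrapped in a list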


For reasons known only to the original R programmers, this returns a list object, with a number at the beginning, followed by the position we're looking for. So to use this to get that index number, we need to index the list.
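    gregexpr('a', 'banana')[[1]][1]   # position of the first 'a'
    # [1] 2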


Let's explore string manipulation using the mlb2017_pitching.txt dataset. First let's load the data.
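Assuming the file is comma-separated (check the raw file to confirm the delimiter):

    mlb_pitching <- read.csv('mlb2017_pitching.txt')
    head(mlb_pitching)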


Next, let's clean it up a little by extracting only the first and last names of each of the players. For that, we need the help of regular expressions.

Regular expressions

A Regular expression (or regex) is a sequence of characters that specifies a "search pattern" to be used against a dataset made of characters. Regular expressions are really useful for sifting through and subsetting large textual datasets (like genetic sequences). Using them can impart superhero-like qualities:

The power of regular expressions lies in the ability to include "wildcards" as part of the search pattern. The most common are shown below:

Special character (or wildcard) What does it do?
. matches any single character
* matches the preceding character 0 or more times
+ matches the preceding character 1 or more times
? matches the preceding character 0 or 1 times
\ suppresses special meanings. Use this if you want to search for one of the special characters in your string
^ matches the beginning of the string
$ matches the end of the string
[] matches any of the characters inside the square brackets
[^] matches any characters except those inside the brackets
{n} matches the preceding character exactly n times
{n,m} matches the preceding character between n and m times
\n new line
\t tab

Below are some of the things you can do with regular expressions:

Find values in strings/vectors that match your desired pattern or sequence

Let's go back to our example with baseball players. We were about to clean up the "Name" column by extracting only the first and last names of each of the players. We can use the str_extract() function for that. str_extract() extracts matching patterns from a string.
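A sketch; the pattern (two runs of letters separated by a space) is an assumption about how the names are formatted:

    str_extract(mlb_pitching$Name, '[A-Za-z]+ [A-Za-z]+')   # shown without overwriting the column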


We can also find all instances of a pattern. For instance, we can find all the players with the name "Jim". We will use grep() for this. grep() returns the position of each instance of the search string, so we can also use them inside an indexing statement to find other values.

Note that we will be using the ^ character to indicate the beginning of the string (i.e. we do not want names that merely contain "Jim" somewhere inside, like "Jimenez").
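    grep('^Jim', mlb_pitching$Name)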


Notice it returned the row indices of the players named Jim. To return the actual entries, we will have to combine grep() with some indexing.
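    mlb_pitching$Name[grep('^Jim', mlb_pitching$Name)]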


More powerful than just finding the "Jims" is figuring out quantities. For example, what proportion of players are in their 30s?
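One way to do it (see the note below about what grep() matches here):

    length(grep('^3', mlb_pitching$Age)) / nrow(mlb_pitching)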



grep finds and matches the strings in the Age column that begin with "3".



How many distinct pitchers are there in mlb_pitching?



What proportion of players are over 40yrs old?



Who are the players over 40yrs old?

Return logical vectors that match your pattern

The grepl() function will return TRUE if conditions are satisfied, and FALSE if not. Here you have to use the standard escape character \ to stop * from acting as a special character:
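For example, searching for a literal asterisk (in an R string, the backslash itself must be doubled):

    grepl('\\*', mlb_pitching$Name)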


With this, you can then do typical boolean indexing.

Find and replace values that match your pattern

If you or your data provider has a spelling problem, you can correct them on the fly with the gsub() find and replace function. For example, we can change the name of all the players called "Tyler" to "Superman".
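    gsub('Tyler', 'Superman', mlb_pitching$Name)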


Probably a more useful thing for this data is filtering out the parts of the string we don't want. Let's get rid of all the text after the backslashes.
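The following removes the backslash and everything after it (a literal backslash is written \\\\ in an R regex):

    mlb_pitching$Name <- gsub('\\\\.*', '', mlb_pitching$Name)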




Why are there three "Al Alburquerque" entries in the MLB pitching data?

Web scraping

Among the more functional and powerful things R can do is pull down information from the web and process it for use. The library to do this is rvest, created (again) by Hadley Wickham. This is a deep topic that requires some insight into HTML, the tag-driven markup language that powers most of the web.

What is Web scraping?

Web scraping is a technique for converting data presented in an unstructured format on the web (HTML tags) into a structured format that can easily be accessed and used. Almost all major programming languages provide ways of performing web scraping.



Ways to scrape data

There are several ways of scraping data from the web. Some of the popular ways are:

  1. Human Copy-Paste: This is a slow but effective way of scraping data from the web, in which humans themselves analyze and copy the data to local storage.
  2. Text pattern matching: Another simple yet powerful approach to extract information from the web is by using regular expression matching facilities of programming languages (we learned regular expressions in R in a section above).
  3. API Interface: Many websites like Facebook, Twitter, LinkedIn, etc. provide public and/or private APIs which can be called using standard code to retrieve data in a prescribed format.
  4. DOM Parsing: By using web browsers, programs can retrieve the dynamic content generated by client-side scripts. It is also possible to parse web pages into a DOM tree, based on which programs can retrieve parts of these pages.

We'll use the DOM parsing approach during the course of this tutorial and rely on the CSS selectors of the webpage to find the relevant fields which contain the desired information. But before we begin, there are a few prerequisites one needs in order to proficiently scrape data from any website.

Understanding a web page

Before we can start learning how to scrape a web page, we need to understand how a web page itself is structured.

From a user perspective, a web page has text, images and links all organized in a way that is aesthetically pleasing and easy to read. But the web page itself is written in specific coding languages that are then interpreted by our web browsers. When we're web scraping, we’ll need to deal with the actual contents of the web page itself: the code before it’s interpreted by the browser.

If you want to see "the code" of this website (i.e. Lab 8), simply press Ctrl + u, or Command + u on a Mac (this should work in most modern browsers).

The main languages used to build web pages are called Hypertext Markup Language (HTML), Cascading Style Sheets (CSS) and Javascript. HTML gives a web page its actual structure and content. CSS gives a web page its style and look, including details like fonts and colors. Javascript gives a webpage functionality.

In this tutorial, we’ll focus mostly on how to use R web scraping to read the HTML and CSS that make up a web page.

HTML

Unlike R, HTML is not a programming language. Instead, it’s called a markup language — it describes the content and structure of a web page. HTML is organized using tags, which are surrounded by <> symbols. Different tags perform different functions. Together, many tags will form and contain the content of a web page.

The text below is a legitimate HTML document.
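A minimal reconstruction of such a document, based on the tags discussed below:

    <html>
        <head>
        </head>
        <body>
            <p>Here's a paragraph of text!</p>
            <p>Here's a second paragraph of text!</p>
        </body>
    </html>

If we were to save this as a .html file and open it using a web browser, we would see a page saying: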

Here's a paragraph of text!
Here's a second paragraph of text!

Notice that each of the tags is "paired", in the sense that each one is accompanied by another with a similar name. That is to say, the opening <html> tag is paired with the closing tag </html>, and together they indicate the beginning and end of the HTML document. The same applies to <body> and <p>.

The <html> </html> tags specify the beginning and end of the HTML content. The <head> </head> and <body> </body> tags add more structure to the document, specifying the beginning and end of the header and the main body of the file, respectively. The <p> </p> tags are what we use in HTML to designate paragraphs.

There are many, many tags in HTML, but we won’t be able to cover all of them in this tutorial. If interested, you can check out this site. The important takeaway is to know that tags have particular names (html, body, p, etc.) to make them identifiable in an HTML document.

Having opening and closing tags (e.g. <p> </p>) is important, because it allows tags to be nested within each other. The <body> and <head> tags are nested within <html>, and <p> is nested within <body>. This nesting gives HTML a "tree-like" structure:

This tree-like structure will inform how we look for certain tags when we're using R for web scraping, so it’s important to keep it in mind. If a tag has other tags nested within it, we would refer to the containing tag as the parent and each of the tags within it as the “children”. If there is more than one child in a parent, the child tags are collectively referred to as “siblings”. These notions of parent, child and siblings give us an idea of the hierarchy of the tags.

CSS

Whereas HTML provides the content and structure of a web page, CSS provides information about how a web page should be styled. Without CSS, a web page is dreadfully plain. Here's a simple HTML document without CSS that demonstrates this.

When we say styling, we are referring to a wide, wide range of things. Styling can refer to the attributes (e.g. color, size, position, font, alignment, etc.) of particular HTML elements. Like HTML, the scope of CSS material is so large that we can’t cover every possible concept in the language. If you’re interested, you can learn more here.

Two concepts we do need to learn before we delve into the R web scraping code are classes and ids.

First, let's talk about classes. If we were making a website, there would often be times when we'd want similar elements of a website to look the same. For example, we might want a number of items in a list to all appear in the same color, red.

We could accomplish that by directly inserting some CSS that contains the color information into each line of text's HTML tag, like so:
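For example (reconstructed from the description below):

    <p style="color:red">Here's a paragraph of text!</p>
    <p style="color:red">Here's a second paragraph of text!</p>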

The style text indicates that we are trying to apply CSS to the <p> tags. Inside the quotes, we see a key-value pair “color:red”. color refers to the color of the text in the <p> tags, while red describes what the color should be.

If we wanted to change the color of that text, we'd have to change each line one by one.

Instead of repeating this style text in all of these <p> tags, we can replace it with a class selector:
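    <p class="red-text">Here's a paragraph of text!</p>
    <p class="red-text">Here's a second paragraph of text!</p>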

With the class selector, we can better indicate that these <p> tags are related in some way. In a separate CSS file, we can create the red-text class and define how it looks by writing:
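    .red-text {
        color: red;
    }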

Combining these two elements into a single web page will produce the same effect as the first set of red <p> tags, but it allows us to make quick changes more easily.

In this tutorial, of course, we're interested in web scraping, not building a web page. But when we're web scraping, we'll often need to select a specific class of HTML tags, so we need to understand the basics of how CSS classes work.

Similarly, we may often want to scrape specific data that's identified using an id. CSS ids are used to give a single element an identifiable name, much like how a class helps define a class of elements.

If an id is attached to an HTML tag, it makes it easier for us to identify this tag when we are performing our actual web scraping with R.
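For example, with a hypothetical id:

    <p id="special-paragraph">Here's a paragraph of text!</p>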

Don't worry if you don't quite understand classes and ids yet; it'll become clearer when we start manipulating the code.

Web scraping the IMDb website

There are several R libraries designed to take HTML and CSS and be able to traverse them to look for particular tags. The library we’ll use is rvest.

In this tutorial, we’ll use R for scraping the data for the most popular feature films of 2019 from the IMDb website.

We'll get a number of features for each of the 100 most popular feature films released in 2019. We'll also look at the most common problems one might face while scraping data from the internet, owing to the lack of consistency in website code, and at how to solve these problems.

If you don't have rvest installed yet, install it first:
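    install.packages('rvest')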


Then, load the library:
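    library(rvest)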


Let's specify a url for the desired website, one that loads the first 100 titles of 2019.

Now, as new films are added to the imdb website, the content returned by the url query may vary over time. We purposely chose a year in the past (2019) to avoid this, but changes can still happen (in fact, changes happened just last week). Therefore, I downloaded a copy of the imdb page to my github account, and we will use this copy for the web scraping exercises of this lab. Below, as a comment, you can see the actual url you could use if you want to perform the web scraping on the live imdb site.
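A sketch; the 2019 snapshot filename here is an assumption, by analogy with the 2016 copy linked in the task section:

    # Snapshot copy (filename assumed, by analogy with the 2016 copy used in the task below)
    url <- 'https://raw.githubusercontent.com/Diego-Ibarra/biol3782/main/week8/imdb_100titles_2019.html'

    # Live site (results will change over time):
    # url <- 'http://www.imdb.com/search/title?count=100&release_date=2019,2019&title_type=feature'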


Next, let's read the html code from the website.
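    webpage <- read_html(url)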


Now, we’ll be scraping the following data from this website.

Here’s a screenshot that contains how all these fields are arranged.

Rank

Now, we will start by scraping the Rank field. For that, we'll use the SelectorGadget browser extension to get the specific CSS selectors that enclose the rankings. You can click on the extension in your browser and select the rankings field with the cursor.

To see the html code in Google Chrome, you can go to Options -> More tools -> Developer tools, or hit Ctrl + Shift + I (on Windows).

First, let's select the ranking. Highlight the "1." beside "Captain Marvel" and hit Ctrl + Shift + I, or right-click and select "Inspect". This should take you to the rank CSS on the source page.

Once you are sure that you have made the right selections, you need to copy the corresponding CSS selector. In our case, it's text-primary.

Once you know the CSS selector that contains the rankings, you can use this simple R code to get all the rankings.

We will use the html_nodes() function to extract pieces of data out of the HTML document using a CSS selector. You can use the help section (?html_nodes) to take a look at the syntax.
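    rank_data_html <- html_nodes(webpage, '.text-primary')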


Next, we will use html_text() to extract the text content from these html nodes.
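    rank_data <- html_text(rank_data_html)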


Now let's see if it pulled out the rankings.
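    head(rank_data)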


Once you have the data, make sure it is in the desired format. In our case, we should convert the text to numeric.
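    rank_data <- as.numeric(rank_data)
    head(rank_data)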


Now we can select all the titles. You can visually inspect that all the titles are selected.

Title

Let's scrape all the titles using the lister-item-header a CSS tag.
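    title_data_html <- html_nodes(webpage, '.lister-item-header a')
    title_data <- html_text(title_data_html)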


Let's have a look at the first 6 titles.
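    head(title_data)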


In the following code, we will do the same thing to scrape the Description, Runtime, Genre, Rating, Metascore, Votes, Gross_Earning_in_Mil, Director and Actor data.

Notice that the web scraping code is relatively similar across the fields, but the CSS tags are different.

Description


Let's scrape film Description data from the webpage.
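A sketch; the selector here is an assumption, so confirm it with SelectorGadget on your copy of the page:

    description_data <- html_text(html_nodes(webpage, '.ratings-bar+ .text-muted'))   # selector assumed
    head(description_data)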


Notice that each entry begins with "\n". We will need to remove that using the gsub function.
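    description_data <- gsub('\n', '', description_data)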

We will do similar things to the next set of data.

Runtime


Let's scrape the website data for Runtime.
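Again, the CSS selector is an assumption to confirm with SelectorGadget:

    runtime_data <- html_text(html_nodes(webpage, '.text-muted .runtime'))   # selector assumed
    head(runtime_data)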


To make it easier to deal with later, let's remove "min" and convert the data to numeric.
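    runtime_data <- as.numeric(gsub(' min', '', runtime_data))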


Genre


Let's scrape the website data for Genre.
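As above, the selector is an assumption:

    genre_data <- html_text(html_nodes(webpage, '.genre'))   # selector assumed
    head(genre_data)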


Notice we have "\n" and excess spaces. Let's remove those to clean up the data.
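    genre_data <- gsub('\n', '', genre_data)   # drop newlines
    genre_data <- gsub(' ', '', genre_data)    # drop excess spaces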


Now let's only take the first genre of each movie and convert the data from characters to factors.
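    genre_data <- gsub(',.*', '', genre_data)   # keep only the first genre listed
    genre_data <- as.factor(genre_data)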


Rating


Let's scrape the website data for Rating.
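Selector assumed, as above:

    rating_data <- html_text(html_nodes(webpage, '.ratings-imdb-rating strong'))   # selector assumed
    head(rating_data)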


Notice that the rating data comes in as characters (because that's what html_text() returns!). Let's change it to numeric data.
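    rating_data <- as.numeric(rating_data)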


Votes


Let's scrape the website data for Votes. We will apply similar processes as above. First we'll read in the data, remove extra characters then convert it to numeric.
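A sketch; the selector is an assumption:

    votes_data <- html_text(html_nodes(webpage, '.sort-num_votes-visible span:nth-child(2)'))   # selector assumed
    votes_data <- gsub(',', '', votes_data)   # drop thousands separators
    votes_data <- as.numeric(votes_data)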


Director


Let's scrape the website data for data on Directors. First we'll read in the data then convert it from characters to factors.
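A sketch; the selector is an assumption:

    directors_data <- html_text(html_nodes(webpage, '.text-muted+ p a:nth-child(1)'))   # selector assumed
    directors_data <- as.factor(directors_data)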


Actor


Let's scrape the website data for names of Actors. First we'll read in the data then convert it from characters to factors.
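A sketch; the selector is an assumption:

    actors_data <- html_text(html_nodes(webpage, '.lister-item-content .ghost+ a'))   # selector assumed
    actors_data <- as.factor(actors_data)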


Metascore


Let's read in and examine the data.
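A sketch; the selector is an assumption:

    metascore_data <- html_text(html_nodes(webpage, '.metascore'))   # selector assumed
    head(metascore_data)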


This is great, but because metascore_data is made up of characters, we cannot do math or calculate statistics with this data. See what happens when we try to compute descriptive statistics:
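    summary(metascore_data)   # only Length/Class/Mode for character data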


As you can see, the summary() function cannot return the min, max, median, mean, etc., because these metrics cannot be computed on characters (i.e. "letters").

We should convert the characters to numeric!
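    metascore_data <- as.numeric(metascore_data)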


Let's take a look at the summary of metascore_data again...
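    summary(metascore_data)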


Great! We computed descriptive statistics!


Gross


Here there is the problem that not all entries have a reported gross earning. We want those missing values to be filled with NaN. We will use the html_node() function for this (see ?html_nodes for the difference between html_node() and html_nodes()). Note that instead of using webpage as input, we will use the output of html_nodes(webpage, '.lister-item-content') as input:
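A sketch; the inner selector is an assumption:

    gross_data <- html_nodes(webpage, '.lister-item-content') %>%
        html_node('.ghost~ .text-muted+ span') %>%   # inner selector assumed; missing nodes come back as NA
        html_text()
    length(gross_data)   # 100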


Note that we got 100 elements, with NaN where gross earnings are missing.

Let's clean up the data by eliminating $ and M, and converting to numeric values.
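    gross_data <- as.numeric(gsub('[$M]', '', gross_data))   # '[$M]' matches either character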


Now let's check the statistics summary:
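    summary(gross_data)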


Now that we have successfully scraped all 11 features for the 100 most popular feature films released in 2019, let's combine them to create a data frame and inspect its structure.
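A sketch of the combination; the column names here are mine:

    movies_df <- data.frame(
        Rank = rank_data, Title = title_data, Description = description_data,
        Runtime = runtime_data, Genre = genre_data, Rating = rating_data,
        Metascore = metascore_data, Votes = votes_data,
        Gross_Earning_in_Mil = gross_data,
        Director = directors_data, Actor = actors_data
    )
    str(movies_df)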


Analyzing scraped data from the web

Once you have the data, you can perform several tasks, like analyzing the data, drawing inferences from it, training machine learning models over it, etc. I have gone on to create some interesting visualizations out of the data we have just scraped.

Let's take a look at the distribution of movies by runtime and genre.
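A ggplot2 sketch, assuming the movies_df built above:

    ggplot(movies_df, aes(x = Runtime, fill = Genre)) +
        geom_histogram(bins = 30)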


What about runtime vs rating?
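    ggplot(movies_df, aes(x = Runtime, y = Rating, size = Votes, colour = Genre)) +
        geom_point()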


What about runtime vs earnings?
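    ggplot(movies_df, aes(x = Runtime, y = Gross_Earning_in_Mil, size = Rating, colour = Genre)) +
        geom_point()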


Now you have a fair idea of the problems you might come across when working with times, strings and web scraping, and how you can work your way around them. As most of the data on the web is present in an unstructured format, web scraping is a really handy skill for any data scientist.

Now let's get you to play around with data on your own!

Your lab task: Analyzing film data from 2016!




Scrape data from IMDB on the top 100 movies released in 2016 and answer the questions that follow. Use the url posted below.

As in the example above, to ensure that the content does not change, the url is a copy of the 2016 imdb results saved in my github:

https://raw.githubusercontent.com/Diego-Ibarra/biol3782/main/week8/imdb_100titles_2016.html



If you are curious and want to run your code again on the live imdb site, use http://www.imdb.com/search/title?count=100&release_date=2016,2016&title_type=feature However, you will get different results depending on the updates and changes that imdb makes to its database. DO NOT use this url to compute the answers for your Brightspace quiz; you will get the wrong answers if you do!



HINTS

For this task you will need...




Remember to check your datasets! You can use str() and dim() or just look at your raw data to make sure your cleaned data is in the format you want.






What was the highest ranked film by popularity of 2016?



What is the CSS selector we use to scrape ranking data?



What is the title of the sixth highest ranked film of 2016?



What is the title of the lowest ranked film of 2016?



How many observations are there in the webscraped title data object you created?



In the 19th most popular film of 2016, who was the story about?



Where was the 17th most popular film of 2016 set?



What is the CSS selector we use to scrape runtime data?



How many films have no metascore data?



How many films are missing gross data values?



After combining all the scraped datasets, (forming movies_df in our example), how many observations are there?



After combining all the scraped datasets, (forming movies_df in our example), how many variables are there?



What is the runtime of the 56th film in the combined dataset (i.e. movies_df)?



What is the genre of the 73rd film in the combined dataset (i.e. movies_df)?



How many unique directors are there in the films list?



Which of the folks in the choices below directed more than one film?



How many text characters does the description of the 50th most popular film contain?
HINT: You can use the string functions we learned earlier



How many directors have first names beginning with the letter "J"?
HINT: You can use the string functions we learned earlier



How many directors have first names whose 2nd letter is "e"?
HINT: You can use the string functions we learned earlier



How many movies on this list did Lily James appear in?



Which of the films Lily James appeared was the highest rated?



Which movie from which Genre had the 3rd longest runtime?



In the Runtime of 130-160 mins, which genre has the highest votes?



In the Runtime of 130-160 mins, who directed the film with the highest votes?



Across all genres, which genre has the highest total gross earnings (combination of all the films) in runtime 100 to 120 mins?



In the runtime 100 to 120 mins, how much did Horror movies earn?



How many Crime films with runtimes between 100 to 120 mins earned >$100M?



What was the most profitable film genre in 2016?



Was the film with the highest number of Votes also the most profitable?



Which genre made large profits but didn't necessarily garner the most votes?



If you were to make a movie, what genre would you choose to make the most profit?