LAB 6: SQL databases and R

BIO3782: Biologist's Toolkit (Dalhousie University)


Setup of workspace

Make sure the required files are in the working directory:

As in previous labs, we'll try to simulate "real-life" coding by using the tags below to indicate when to use RStudio's Console and when to use the Editor:







What are databases?

A database is an organized collection of structured information, or data, typically stored electronically in a computer system. A database is usually controlled by a database management system (DBMS). Together, the data and the DBMS, along with the applications that are associated with them, are referred to as a database system, often shortened to just a database.

Data within the most common types of databases in operation today is typically modeled in rows and columns in a series of tables, making processing and querying efficient with minimal duplication of information. Below is a diagram of a typical database, where each box represents a table (similar to an MS-Excel spreadsheet) and the arrows show how the different tables are linked to each other:

The data within databases can then be easily accessed, managed, modified, updated, controlled, and organized, all with minimal duplication of information. Most databases (e.g., Oracle, Salesforce, MySQL, IBM DB2; see full list HERE) use structured query language (SQL) for writing and querying data.

Check out this brief history of databases on Wikipedia.

What to do about very large datasets?

So far, we have dealt with small datasets that easily fit into your computer's memory. But what about datasets that are too large for your computer's memory to handle as a whole? In this case, you need to store your data in a format that can be opened in small pieces, which includes databases (note that there are other options, like NetCDF files and ERDDAP servers). Connecting to the database allows you to retrieve only the chunks needed for the current analysis. Even better, many large datasets are already available in public or private databases. You can query them without having to download the data first.

R can connect to almost any existing database type. Most common database types have R packages that allow you to connect to them (e.g., RSQLite, RMySQL, etc.). Furthermore, the dplyr package, in conjunction with dbplyr, supports connecting to the widely-used open source databases sqlite, mysql and postgresql, as well as Google’s bigquery, and it can also be extended to other database types.

The dplyr package now has a generalized SQL back end for talking to databases, and the new dbplyr package translates R code into database-specific variants. SQL variants are supported for the following databases: Oracle, Microsoft SQL Server, PostgreSQL, Amazon Redshift, Apache Hive, and Apache Impala. Interfacing with databases using dplyr focuses on retrieving and analyzing datasets by generating SELECT SQL statements, but it doesn't modify the database itself. dplyr does not offer functions to UPDATE or DELETE entries. If you need these functionalities, you will need to use additional R packages (e.g., RSQLite).

What is Structured Query Language (SQL)?

SQL is a programming language used by nearly all relational databases to query, manipulate, and define data, and to provide access control. SQL was first developed at IBM in the 1970s, with Oracle as a major contributor, which led to the implementation of the SQL ANSI standard. SQL has since spurred many extensions from companies such as IBM, Oracle, and Microsoft. Although SQL is still widely used today, new programming languages are beginning to appear.

SQLite is the most widely deployed database in the world. It is included on Android, iPhone and iOS devices and in the Firefox, Chrome and Safari web browsers. Apple and Microsoft include it in their macOS and Windows 10 operating systems, respectively, and many other products include SQLite as well. It is extremely easy to use and can be of great value to developers who need a database available but want to avoid the overhead often associated with installing and configuring an external database. In this demonstration, we will download and install the RSQLite package, which integrates SQLite into R running in RStudio.

Here we will demonstrate how to interact with a database using dplyr, using both dplyr's verb syntax and SQL syntax.

Connecting to databases

We can point R to a database using the dplyr, dbplyr and RSQLite packages. We will create a new, empty SQLite database where we can store the mammals data. SQLite has a rather simple data storage mechanism: all data for a database is stored within a single file. The name of this file must be specified when the database is created, and a connection to this database is returned and used in subsequent commands to access and manipulate the data and data structures within the database.

Let's install the dbplyr and RSQLite packages:
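For example:

install.packages("dbplyr")
install.packages("RSQLite")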


Then, load the required packages....
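For example:

library(dplyr)
library(dbplyr)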


Then, connect to the database.
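A sketch of this step (assuming the portal_mammals.sqlite file is in your working directory, as set up above):

mammals <- DBI::dbConnect(RSQLite::SQLite(), "portal_mammals.sqlite")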


This command uses two packages that help dbplyr and dplyr talk to the SQLite database. DBI is not something that you'll use directly as a user. It allows R to send commands to databases irrespective of the database management system used. The RSQLite package allows R to interface with SQLite databases.

This command does not load the data into the R session (as the read_csv() function did). Instead, it merely instructs R to connect to the SQLite database contained in the portal_mammals.sqlite file.

Using a similar approach, you could connect to many other database management systems that are supported by R including MySQL, PostgreSQL, BigQuery, etc.

Let's take a closer look at the mammals database we just connected to.
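One way to do this is with the src_dbi() function (referred to in the questions below):

src_dbi(mammals)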


Just like a spreadsheet with multiple worksheets, a SQLite database can contain multiple tables. In this case, three of them are listed in the tbls row in the output above: plots, species, and surveys.

Let's take a look to see what is inside the surveys table:
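For example:

tbl(mammals, "surveys")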

As you can see, the surveys table has ?? rows and 9 columns. We do not know the number of rows (i.e., the ??) because tbl() does not read the entire dataset; it only looks at a small sample to give you an idea of what is inside. However, tbl() gives you enough information to figure out that each row is probably an observation, including the date (i.e., year, month and day), IDs for record, plot and species, and 3 columns of data: sex, hindfoot_length and weight.

Let's take a look at the species table:
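For example:

tbl(mammals, "species")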


Here we have 4 columns and an unknown number of rows (??). However, note that the species table has species_id, which is also found in the surveys table. This is a clever way to minimize duplication of information. In the main table, surveys, you only need to enter ONE value per observation (i.e., species_id) to be able to identify the genus, species and taxa of the observation (after linking the surveys table with the species table).




Take a look inside the plots table.
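For example:

tbl(mammals, "plots")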







What type of file is portal_mammals?



What does the src: from the src_dbi function tell you?



How many columns does the plots table have?

Now that we know we can connect to the database, let's explore how to get the data from its tables into R.

Querying with SQL syntax

To connect to tables within a database, you can use the tbl() function from dplyr. This function can also be used to send SQL queries to the database. That is, between quotes " ", you send a command in the SQL programming language.

To demonstrate this functionality, let's select the columns "year", "species_id", and "plot_id" from the surveys table.
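This is the query referred to in the questions below:

tbl(mammals, sql("SELECT year, species_id, plot_id FROM surveys"))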



What class of object does tbl() return?


How many columns are in the table created using: tbl(mammals, sql("SELECT year, species_id, plot_id FROM surveys"))

Querying with dplyr syntax

We can do something similar using the dplyr package. First, we select the table on which to do the operations by creating the surveys object, and then we use the standard dplyr syntax as if it were a data frame.
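A sketch of the dplyr version (surveys becomes a lazy reference to the database table):

surveys <- tbl(mammals, "surveys")
surveys %>%
  select(year, species_id, plot_id)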



What class of object is surveys?

Let's take a look at how SQL handles instructions from dplyr using the show_query() function.
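For example, appending show_query() to the pipeline above:

surveys %>%
  select(year, species_id, plot_id) %>%
  show_query()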


The surveys object behaves like a data frame. Several functions that can be used with data frames can also be used on tables from a database. The output looks just like a regular data frame. However, the columns plot_type, taxa, genus, and species are missing. These are now located in the tables plots and species which we will join together in a moment.

Some dplyr functions won't work the way we expect them to. For example, let's check to see how many rows surveys has with the nrow() function.
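For example:

nrow(surveys)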


nrow() returns NA. You may have also noticed that the surveys output included ?? indicating the number of rows was unknown. The reason for this behavior highlights a key difference between using dplyr on datasets in memory (e.g. loaded into your R session via read_csv()) and those provided by a database.

To understand it, we will take a closer look at how dplyr communicates with our SQLite database.

SQL translation

Relational databases typically use the special-purpose language, Structured Query Language (SQL), to manage and query data.

For example, the following SQL query returns the first 10 rows from the surveys table.

SELECT *
FROM `surveys`
LIMIT 10

We will use dplyr's show_query() function to show which SQL commands are actually sent to the database.
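For example, asking for the first 10 rows of surveys:

show_query(head(surveys, n = 10))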


The output shows the actual SQL query sent to the database; it matches our manually constructed SELECT statement above.


What does show_query do?

Behind the scenes, dplyr:

  1. translates your R code into SQL
  2. submits it to the database
  3. translates the database's response into an R data frame

Instead of having to formulate the SQL query ourselves - and having to mentally switch back and forth between R and SQL syntax - we can delegate this translation to dplyr, which in turn doesn't do the real work of subsetting the table, either. Instead, it merely sends the query to the database, waits for its response and returns it to us.

R connects to the database and downloads a bare minimum of information on fields, data types, etc. — enough to allow manipulation of the object without physical download of the data. R never gets to see the full surveys table - and that's why it could not tell us how many rows it contains.

Most filtering, mutating, or summarizing operations only perform simple mathematical operations. These operations are very similar between R and SQL, so they are easy for dplyr's SQL translation system to translate. To see for yourself what's happening, you can use translate_sql(). The function translate_sql() is built on top of R’s parsing engine and has been carefully designed to generate correct SQL. It also protects you against SQL injection attacks by correctly escaping the strings and variable names needed by the database that you’re connecting to.

Let's take a look at some differences between R and SQL.
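For example (a sketch; depending on your dbplyr version, you may need to supply con = simulate_dbi() explicitly):

translate_sql(3)    # translated as 3.0: R treats plain numbers as floating point (reals)
translate_sql(3L)   # translated as 3: the L suffix produces an SQL integer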

dplyr can translate many different query types into SQL allowing us to select() specific columns, filter() rows, or join tables. You can now manipulate surveys in the same way as you would manipulate other tables in R.

To see this in action, let's compose a few queries with dplyr.


In SQL, is 3 an integer or real number?


What does translate_sql do?

Simple database queries

First, let's only request rows of the surveys table in which weight is less than 5 and keep only the species_id, sex, and weight columns.
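A sketch of this query:

surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)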


Executing this command will return a table showing 10 rows and the requested species_id, sex and weight columns. But wait, what does the last line mean?

This indicates that R is only showing us the first 10 rows that match our criterion. It does not retrieve the whole dataset.

It does not return a dataframe object! The str() call returns a list of tables instead.


When working with databases, dplyr tries to be as lazy as possible.


Why does the object simple only return 10 rows and not the whole dataset?

When you construct a dplyr query, you can connect multiple verbs into a single pipeline. For example, we combined the filter() and select() verbs using the %>% pipe.
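For example, storing the combined pipeline in an object (here called data_subset):

data_subset <- surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)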


Now let's take a look at the structure of data_subset
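For example:

str(data_subset)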


Notice that the str() call returns a list instead of a dataframe. The select(species_id, sex, weight) command wasn't executed by R but was sent to the database instead. R doesn't retrieve the full set of results - instead it only retrieves the first 10 results from the database by default.


Why does the object data_subset only return 10 rows?


What class of object is data_subset?

To instruct R to retrieve all of the query results from the database, we add the collect() command to our pipe. It indicates that our database query is finished: get the final results and load them into the R session.
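For example:

data_subset <- surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight) %>%
  collect()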


Let's take a look at data_subset again.
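For example:

str(data_subset)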


Notice the str() call now returns the structure of a dataframe. Now we have all 32208 rows that match our query in a dataframe and can continue to work with them exclusively in R, without communicating with the database.

Let's look at a histogram of species_id separated by sex.
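A sketch using ggplot2 (counting observations of each species_id, with one panel per sex, is one reasonable reading of this step):

library(ggplot2)
ggplot(data_subset, aes(x = species_id)) +
  geom_bar() +
  facet_wrap(~ sex)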



How many observations does data_subset have?


What class of variable is sex?


TRUE or FALSE: collect() retrieves all queries from our database?

Complex database queries

dplyr enables database queries across one or multiple database tables, using the same single- and multiple-table verbs you encountered previously. This means you can use the same commands regardless of whether you interact with a remote database or local dataset.

This is a really useful feature if you work with large datasets.

Being able to use SQL queries directly can be useful if your collaborators have already put together complex queries to prepare the dataset that you need for your analysis.

To illustrate how to use dplyr with these complex queries, we are going to join the plots and surveys tables. The plots table in the database contains information about the different plots surveyed by the researchers. To access it, we point the tbl() command to it.
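For example:

plots <- tbl(mammals, "plots")
plots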



What class of object is plots?


TRUE or FALSE: Using this line of code:

tbl(mammals, "plots")

You will be able to see all the observations of the dataset?

The plot_id column also features in the surveys table.


Because plot_id is listed in both tables, we can use it to look up matching records, and join the two tables.

Remember from the previous lab, if we have two tables named x and y with a common column called "ID", we can join them using 'join' functions, two of which are described and illustrated below.

In both forms of join, if there are multiple matches between x and y, all combinations of the matches are returned.

We want to join the two tables plots and surveys.

To extract all surveys for the first plot, which has plot_id = 1, we can do:
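A sketch of this query, joining on the shared plot_id column:

plots %>%
  filter(plot_id == 1) %>%
  inner_join(surveys, by = "plot_id")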


If we want the full set of 1,995 observations, we can add the function collect() instead. This also converts the table into a data frame without needing to use the as.data.frame() function.
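For example:

full_plots <- plots %>%
  filter(plot_id == 1) %>%
  inner_join(surveys, by = "plot_id") %>%
  collect()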


Let's examine full_plots.
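For example:

str(full_plots)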


Now we can treat full_plots like we would any dataframe object in R. Let's plot hindfoot_length vs weight over time by species.
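A sketch using ggplot2 (colouring points by species_id is one way to separate species):

ggplot(full_plots, aes(x = weight, y = hindfoot_length, color = species_id)) +
  geom_point()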


Now let's plot hindfoot_length vs weight over time by year and sex for data between the years 1977 - 1980.
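A sketch (filtering to 1977-1980 and faceting by sex and year):

full_plots %>%
  filter(year %in% 1977:1980) %>%
  ggplot(aes(x = weight, y = hindfoot_length, color = species_id)) +
  geom_point() +
  facet_grid(sex ~ year)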


We can also use other dplyr functions like mutate() on database objects. Let's use the table object joined_plots and create a column of hindfoot_length/weight. First we'll remove rows where weight is NA using filter(!is.na(weight)). Next we'll add a new column called "ratio" and convert the table into a dataframe using as.data.frame(). Finally, we'll use head() to display the first 6 rows of the data frame.
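A sketch of this pipeline (assuming joined_plots is the lazy table created when plots and surveys were joined above):

joined_plots %>%
  filter(!is.na(weight)) %>%             # drop rows with missing weight
  mutate(ratio = hindfoot_length / weight) %>%
  as.data.frame() %>%                    # collects the results into a dataframe
  head()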


All these manipulations occur without physical download of the data, by translating your code into SQL in the background. Since data download is often the most time consuming step, this allows you to think about how much work you want to get done on the server before you pull the data. When you are ready to pull the data, you just use collect(). This will send the background compiled SQL query to the database and execute it.


What does the function head() do?


inner_join returns:


TRUE or FALSE: left_join will remove all entries without observations (no data).



What would the code that returns the number of rodents observed in each plot in each year look like in dplyr syntax? Hint: Connect to the species table and write a query that joins the species and survey tables together to exclude all non-rodents. The query should return counts of rodents by year.



What would the code that returns the total number of rodents in each genus caught in the different plot types look like in dplyr syntax?

Hint: Write a query that joins the species, plot, and survey tables together. The query should return counts of genus by plot type.

Diving deeper into databases

Now that we have a basic idea of how R and SQL talk to each other, let's take a look at connecting to different databases.

Example 1: Treebase

Let's take a look at the TreeBASE database. We will use the treebase package described in Boettiger & Lang, 2012. The treebase package queries TreeBASE's API directly rather than the treebase.org website. TreeBASE is a repository of phylogenetic information, specifically user-submitted phylogenetic trees and the data used to generate them.

TreeBASE provides two APIs to query the database, one which searches by the metadata associated with different publications (called OAI-PMH), and another which queries the phylogenies directly (called Phylo-ws). They have somewhat redundant functions, but for our purposes the second one returns the actual data, while the first returns metadata.

Let's install the treebase package directly from github.

First, you may need to install the devtools library (unless you already have it installed):
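For example:

install.packages("devtools")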


Then, use devtools to install the treebase package directly from GitHub:
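For example (assuming the package lives in the ropensci GitHub organization):

devtools::install_github("ropensci/treebase")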


Then let's load the required packages into our workspace.
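For example (ape is loaded here on the assumption that we will need its tree functions later in this example):

library(treebase)
library(ape)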


We start with some queries of the metadata directly without downloading any trees. Let’s search the database for data from the author Huelsenbeck.

Queries are made with the search_treebase() function. The first argument is the keyword used in the query, such as an author's name, and the second argument indicates the type of query (i.e., "author").
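For example (storing the results in an object, here called hb_trees):

hb_trees <- search_treebase("Huelsenbeck", by = "author")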


Data discovery involves searching for existing data that meets certain desired characteristics. The Web repository uses separate interfaces (APIs) to access metadata describing the publications associated with the data entered (i.e. publisher, year of publication, etc.) and a different interface to describe the metadata associated with an individual phylogeny (i.e. the number of taxa or the kind of tree). The treebase package can query these individual sources of metadata separately.

We can also look at all the available metadata from a certain date range in Treebase using the download_metadata() function.

Let's get all submissions from 2009 - present.
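A sketch following the treebase vignette (check ?download_metadata for the exact meaning of the by argument):

meta <- download_metadata("2009-01-01", by = "until")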


Let's look at trends in the growth of the database over time.
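A sketch (assuming each metadata record carries a date field, as in the treebase vignette):

dates <- sapply(meta, function(x) as.numeric(x$date))
hist(dates, main = "Growth of TreeBASE", xlab = "Year")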


We can also show authors with the most submissions in that date range.
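A sketch (assuming each record carries a creator field listing its authors):

authors <- unlist(sapply(meta, function(x) x$creator))
head(sort(table(authors), decreasing = TRUE))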


What if we look at journal submissions by volume? First, let's extract the publisher metadata from the meta object.
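A sketch (assuming the journal name is stored in each record's publisher field):

journals <- sapply(meta, function(x) x$publisher)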


Then let's "unpack" the list of journals and sort them, keeping only the last 5 journals.


Finally, let's plot the object J as a barplot.
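For example:

barplot(as.numeric(J), names.arg = names(J), las = 2, cex.names = 0.7)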


Tests across many phylogenies

A standard test of the constant rate of diversification is the gamma statistic of Pybus & Harvey (2000), which tests the null hypothesis that the rates of speciation and extinction are constant. Under the null hypothesis, the gamma statistic is normally distributed about 0; values larger than 0 indicate that internal nodes are closer to the tip than expected, while values smaller than 0 indicate nodes farther from the tip than expected.

We will collect all phylogenetic trees from TreeBASE and select those with branch length data that we can time‐calibrate using tools available in R. We can then calculate the distribution of this statistic for all available trees and compare these results with those from the analyses mentioned above.

For certain applications, we may wish to download all the available phylogenies from TreeBASE. Using the cache_treebase() function allows a user to download a local copy of all trees. Because direct database dumps are not currently available from treebase.org, this function has intentional delays to avoid overtaxing the TreeBASE servers and will take a full day to run.

treebase <- cache_treebase()

Once run, the cache is saved compactly in memory where it can be easily and quickly restored. For convenience, we will load the treebase.Rdata object which contains a copy of all the data already cached, which can be loaded into memory.
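For example (assuming treebase.Rdata is in your working directory):

load("treebase.Rdata")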


Note that the newly loaded treebase is a list (i.e., not a data frame). Therefore, common functions for data frames, like ncol() and nrow(), do not work. You need to use length() or str() instead.



How many elements/rows does treebase have?



What does treebase contain?



What is a cache of the database?



Why would you want to create a cache of the database?

We will only be able to use those phylogenies that include branch length data, which we can identify with the have_branchlength() function in the treebase package. We drop those that do not have branch lengths from the data set.

For simplicity, we will use the first 200 entries of the database only. Let's load the cached data.
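A sketch of this step:

treebase200 <- treebase[1:200]                                # first 200 phylogenies only
branchlengths <- treebase200[have_branchlength(treebase200)]  # keep trees with branch lengths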




What type/class of object is branchlengths?



How many elements/rows does branchlengths have?

This analysis will require ultrametric trees (branch lengths proportional to time, rather than to the nucleotide substitution rate). As most of these phylogenies are calibrated with branch length proportional to mutational step, we must time‐calibrate each of them first. The following function drops trees that cannot meet the assumptions of the time‐calibration function.
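A sketch of this function, following the treebase vignette (chronoMPL() and multi2di() come from the ape package; drop_nontrees() comes from treebase):

timetree <- function(tree) {
  check.na <- try(sum(is.na(tree$edge.length)) > 0)
  if (is(check.na, "try-error") || check.na)
    NULL                                   # drop trees with missing branch lengths
  else
    try(chronoMPL(multi2di(tree)))         # time-calibrate the tree
}
tt <- drop_nontrees(sapply(branchlengths, timetree))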


Let's take a look at our cleaned data tt.
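For example:

str(tt)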




What does the "11" in "List of 11" represent?



What type of variable is type?

At this point, we have 199 time‐calibrated phylogenies over which we will apply the diversification rate analysis.
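For example, using gammaStat() from the ape package:

gammas <- sapply(tt, gammaStat)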


Let's take a look at gammas.
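For example:

head(gammas)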




Why does gammas have 199 rows?

Now let's see what the gamma distribution looks like.
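For example:

hist(gammas, main = "Distribution of the gamma statistic", xlab = "gamma")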


The overall distribution appears slightly skewed towards positive values. This could indicate an increasing rate of speciation or constant extinction rates. While differences in sampling may account for much of the spread observed, the position and identity of outlier phylogenies could suggest new hypotheses and potential directions for further exploration.

Example 2: Fishbase

Let's take a look at the FishBase database. We will use the rfishbase package described in Boettiger et al. (2012)

The rfishbase package queries this API directly rather than the FishBase.org website. This reduces load on the FishBase web servers and increases both the performance and the breadth of data available. rfishbase functions are primarily aimed at facilitating queries for specific data across a given list of many species. This is a task that is common to much scientific research and tedious to perform on very large datasets.

Let's install the rfishbase package directly from github.


Then let's use the install_github() function to download and install the rfishbase package.
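For example (assuming the package lives in the ropensci GitHub organization):

devtools::install_github("ropensci/rfishbase")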


.....then load the package.
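For example:

library(rfishbase)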


Accessing specific versions of the database

rfishbase relies on periodic cache releases, but we can specify which version of the database to access. We will set the version of the database to "19.04" by setting an environment variable.
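For example:

Sys.setenv(FISHBASE_VERSION = "19.04")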


Building a species list

Let's assemble a good list of species we are interested in. Almost all functions in rfishbase take a list (character vector) of species scientific names. You can also read in a list of names from any existing data you are working with. When providing your own species list, you should always begin by validating the names. Taxonomy is a moving target, and this will help align the scientific names you are using with the names used by FishBase, and alert you to any potential issues.
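For example (a hypothetical two-species list; "Salmo trutta" is the brown trout, misspelled in the question below):

fish <- validate_names(c("Oreochromis niloticus", "Salmo trutta"))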




What does validate_names return when you misspell a name? (e.g., validate_names(c("Salbo Frutta")))

We can also collect information about all species in a particular taxonomic group, such as a Genus, Family or Order using the species_list() function.
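For example (Labroides is the genus used in the rfishbase documentation; any genus works):

fish <- species_list(Genus = "Labroides")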




How many entries/values does the previous command return?

rfishbase also recognizes common names. When a common name refers to multiple species, all matching species are returned. We can use the function common_to_sci to see all records matching the common name.
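For example (the common name "trout" is a hypothetical choice):

fish <- common_to_sci("trout")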




How many observations does fish have?



How many distinct Species does fish have?
HINT: use the function n_distinct()



How many distinct common names does fish have?
HINT: use the function n_distinct()

Getting data

With a species list in place, we are ready to query fishbase for data. Note that if you have a very long list of species, it is always a good idea to try out your intended functions with a subset of that list first to make sure everything is working.

The species() function returns a table containing some of the information found on the summary or homepage for a species on fishbase.org. rfishbase functions always return tidy data tables: rows are observations (e.g., a species, or individual samples from a species) and columns are variables (fields).
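A sketch (assuming fish is the table returned by common_to_sci() above, with a Species column):

species(fish$Species)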


Discovering data

Because rfishbase accesses the back-end database, it does not always line up with the web display. Frequently, rfishbase functions will return more information than is available on the web versions of these tables. Some information found on the summary homepage for a species is not available from the species summary function, but must be extracted from a different table. For instance, the species' Resilience information is not one of the fields in the species summary table, despite appearing on the species homepage of fishbase.org. To discover which table this information is in, we can use the rfishbase function list_fields, which will list all tables with a field matching the query string:
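For example:

list_fields("Resilience")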


This shows us that this information appears on the stocks table. Let's create an object resil with Resilience information.
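A sketch (the exact fields requested are an assumption):

resil <- stocks(fish$Species, fields = c("SpecCode", "Resilience"))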




What is the value of the 10th row in the resil table?



How many distinct types/levels of Resilience are there in resil?
HINT: Use the unique() function



What are the distinct types/levels of Resilience in resil?
HINT: Use the unique() function

We may only be interested in the PriceCateg (price category) and the Vulnerability of the species. Let's create the object dat by querying for our full species list, asking for only these fields to be returned:
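A sketch (keeping SpecCode and Species as identifier columns is an assumption):

dat <- species(fish$Species, fields = c("SpecCode", "Species", "PriceCateg", "Vulnerability"))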




What is the price category (PriceCateg) of the last observation in dat?



How many observations does dat have?

Working in R, it is easy to query additional tables and combine the results with the data we have collected so far.
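For example (a sketch; joining on the shared SpecCode column is an assumption based on the questions below):

combined_data <- left_join(dat, resil, by = "SpecCode")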


Let's take a look at combined_data
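For example:

str(combined_data)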



What type of object is combined_data?



What type of variable is PriceCateg?



How many distinct entries are there in the SpecCode variable of combined_data?



What is the SpecCode of the 8th entry in combined_data?



How many observations in combined_data have an unknown PriceCateg? HINT: You'll need to use the functions group_by and summarize



How many observations would you have in combined_data if you only want species with Medium resilience? HINT: Use nrow() to count the number of observations.



How many distinct SpecCode entries would you have in combined_data if you only want species with Medium resilience?

Using databases to improve performance and decision-making

With massive data collection from the Internet of Things transforming life and industry across the globe, we have access to more data than ever before. Forward-thinking organizations can now use databases to go beyond basic data storage and transactions to analyze vast quantities of data from multiple systems. Using database and other computing and business intelligence tools, organizations can now leverage the data they collect to run more efficiently, enable better decision-making, and become more agile and scalable. By having direct control over the ability to create and use databases, users gain control and autonomy while still maintaining important security standards.

This is the end of the lab.


Code below is for formatting of this lab. Do not alter!