Using raquamaps

This vignette is a long form documentation included in the raquamaps R package with the intention to describe some common usage scenarios, for example:

For a given species, calculate and generate a distribution map and output in csv format
For a list of species, generate several maps and output in raster format

Some useful packages to use with raquamaps

First some recommended R packages that are useful when working with raquamaps, in order to facilitate plotting, data manipulation and data retrieval:

# plotting
require("classInt")
require("ggplot2")
require("raster")
require("RColorBrewer")

# data manipulation
require("plyr")
require("dplyr")
require("reshape2")

# data modelling
require("rgbif")
require("raquamaps")

# we set our preference for how 
# we choose to interpret string data 
# throughout this tutorial
options(stringsAsFactors = FALSE)

How to access and use the bundled reference data

This package comes with some bundled reference data that can be used for demonstrational purposes and in off-line situations. For real use cases, it is recommended to use on-line services or tools for getting up-to-date reference data, such as the rgbif R package. How to get data from rgbif for use in raquamaps will be covered later in this tutorial.

Before showing some convenience functions that simplify accessing reference data, here is how you can access bundled reference datasets directly:

# Inspect documentation for reference datasets
# that are bundled into the package with this ...

# data(package = "raquamaps")

# Get and print names of bundled datasets 
# in the raquamaps package
datasets <- data(package = "raquamaps")$results[ ,"Item"]
print(datasets)

## [1] "aquamaps_galemys_pyrenaicus"   "aquamaps_hc"                  
## [3] "aquamaps_hcaf_eu"              "aquamaps_hcaf_world"          
## [5] "aquamaps_presence_basins"      "aquamaps_presence_occurrences"

# Load and unload one of the bundled datasets 
# into your current  environment
data("aquamaps_galemys_pyrenaicus")  # load
rm("aquamaps_galemys_pyrenaicus")  # unload

# If you want to load a dataset into your own variable you can ...
# Define a helper function to load reference data from 
# an R package as a data frame that displays nicely
get_data <- function(x) 
  tbl_df(get(data(list = x)))

# ... and use it to load a dataset into your own named variable
hcaf_eu <- get_data("aquamaps_hcaf_eu")
galemys <- get_data("aquamaps_galemys_pyrenaicus")

# Here we load the European cell authority fields, ie half degree cell
# data including geometry and an explicitly defined set of bioclimate 
# variables 
clim_vars <- c(
  "Elevation", 
  "TempMonthM", 
  "NPP", 
  "SoilpH", 
  "SoilMoistu", 
  "PrecipAnMe", 
  "CTI_Max", 
  "SoilCarbon")

hcaf <- get_data("aquamaps_hcaf_eu") %>%
  # discard irrelevant fields
  # and rename the cell identifier column name to
  # harmonize with the other datasets
  select(loiczid = LOICZID, one_of(clim_vars)) 

# Define a cleanup function, intended for use across columns
replace_9999 <- function(x) 
  ifelse(round(x) == -9999, NA, x)

# We apply the above cleanup function across columns, dplyr style
hcaf_eu <- 
  hcaf %>%
  # recode values across climate variable columns 
  # substitute -9999, use NA instead, exclude the "loiczid" column
  mutate_each(funs(replace_9999), -loiczid)

hcaf_eu

## Source: local data frame [8,593 x 9]
## 
##    loiczid Elevation TempMonthM   NPP SoilpH SoilMoistu PrecipAnMe CTI_Max
## 1   103418    475.45         31 0.228  7.652     0.1005       4.73    1777
## 2   102697    479.77         31 0.228  7.686     0.1505       4.19    2008
## 3   102698    432.49         31 0.228  7.792     0.1000       4.39    1814
## 4   103419    441.11         31 0.207  7.643     0.1000       4.99    1864
## 5   103420    411.25         30 0.207  7.725     0.1000       5.44    1897
## 6   102699    407.43         31 0.207  7.797     0.1000       4.60    1774
## 7   102700    385.83         31 0.207  7.665     0.1000       4.92    1564
## 8   101971    882.62         30 0.001  7.670     0.7485       5.76    1666
## 9   101972    781.76         30 0.001  7.765     0.8145       4.93    1556
## 10  101251    766.98         30 0.001  7.770     0.7845       5.03    1737
## ..     ...       ...        ...   ...    ...        ...        ...     ...
## Variables not shown: SoilCarbon (dbl)

There are some convenient helper functions that simplifies access and provides some relevant defaults for use with the demonstrational reference data that is bundled in the package:

# Suggested set of bioclimate variables
clim_vars <- default_clim_vars()

# One specific species name - the default species - Galemys pyrenaicus
species <- default_species()

# Several species names available in the reference data
species_list <- default_species_list()

# European half degree cell reference data for some relevant bioclimate variables
hcaf_eu <- default_hcaf()

# Presence data for all species in the bundled
# reference data, including grid cell identifiers (loiczids)
presence_occs <- default_presence()

# Showing a "dplyr inspired way" to load presence data
# which demonstrates using dplyr API with a
# "pipe" operator that chains operations together sequentially, 
# each step sending its output as input to the next step, 
# going from left to right
presence_galemys <- 
  default_presence() %>% 
  filter(lname == default_species())

# Yet another way to display grid cell identifiers (loiczids) 
# for the default species 
presence_loiczids <- 
  presence_occs %>%
  filter(lname == default_species()) %>%
  group_by(loiczid) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Another variant on the same thing
presence_galemys_pyrenaicus <- 
  presence_occs %>%
  # retain only records for a specific latin name
  filter(lname == "galemys pyrenaicus") %>%
  # in case of duplicate cell ids, retrieve only distinct ids
  distinct(loiczid) %>%
  #  group_by(loiczid) %>%
  select(loiczid)

# Get a list of all latin names  for all species in the 
# bundled occurrence dataset ordered with the species with 
# the most dispersed distribution (ie highest number of distinct 
# different cells) at the top
presence_occs %>%
  group_by(lname) %>%
  summarise(count = n_distinct(loiczid)) %>%
  arrange(desc(count)) %>%
  select(species = lname, count)

## Source: local data frame [110 x 2]
## 
##                species count
## 1        ardea cinerea  1793
## 2  gallinago gallinago  1450
## 3      riparia riparia  1444
## 4      aythya fuligula  1388
## 5  phalacrocorax carbo  1386
## 6     numenius arquata  1245
## 7      cinclus cinclus  1209
## 8   bucephala clangula  1204
## 9  gallinula chloropus  1162
## 10    mergus merganser  1158
## ..                 ...   ...

Typical workflow

Now that we can retrieve presence data for one species using the bundled reference data, we can show the workflow for calculating the environmental envelope or spread values.

# We find bioclimate variable data from half degree cells
# where a specific species has been present
hcaf_galemys <- hcaf_by_species("galemys pyrenaicus")

# We determine the spreads across these particular grid cells
# ie calculate the environmental envelope for the species
spreads_galemys <- calc_spreads(hcaf_galemys)

# We can also use this wrapper function for convenience
calc_spreads_by_species("galemys pyrenaicus")

## Source: local data frame [8 x 13]
## 
##      Measure  n n_distinct n_NA  min  max   q1   q3   d1   d9 range iqr
## 1  Elevation 76         76    0  229 1767  650 1068  478 1177  1538 418
## 2 TempMonthM 76         10    0   10   19   14   16   13   17     9   2
## 3        NPP 76         28    0    0    1    0    1    0    1     1   0
## 4     SoilpH 76         66    0    5    8    5    7    5    8     3   2
## 5 SoilMoistu 76         76    0   53  142   82  120   70  127    89  38
## 6 PrecipAnMe 76         76    0   35  114   47   87   41  100    79  41
## 7    CTI_Max 76         71    0 1103 1878 1341 1528 1269 1618   775 187
## 8 SoilCarbon 76         68    0    4   10    5    8    5    9     6   3
## Variables not shown: idr (dbl)

Now, across various other half degree grid cells, where we have similar bioclimate variable information available, we can calculate relative probabilites for presence, assuming that similar spreads in the species’ environmental bioclimate variables would indicate a suitable habitat:

# Calculate probabilites across European grid cells, 
# given this particular species preference for environmental envelopes
hcaf_eu <- default_hcaf()

# You can inspect the model used by inspecting the calc_prob and calc_probs 
# functions, ie uncomment lines below and run in R to inspect the functions:
# calc_prob
# calc_probs
# You can see how prod_p is calculated in the calc_probs function, ie:
# prod_p = Elevation * TempMonthM * NPP * SoilpH * 
#        SoilMoistu * PrecipAnMe * CTI_Max * SoilCarbon
# where those factors are individual p values determined by calc_prob using
# the spread or environmental envelope preference
probs <- calc_probs(hcaf_eu, spreads_galemys)

# Define custom plot style to use in ggplot2 plots
theme_raquamaps <- function() {
  theme <- theme_bw() + theme(
      axis.text.x = element_text(size = 10),
      axis.text.y = element_text(size = 10),
      panel.background = element_blank(), 
      plot.background = element_blank(),
      panel.grid.minor = element_blank(), 
      panel.grid.major = element_blank())
}

# Plot a histogram of the probabilities  
ggplot(probs, aes(x = probs$p)) + theme_raquamaps() +
  geom_bar(fill = "darkgreen", colour = "darkgray")

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Using presence data from gbif

Instead of using the prepackaged reference data, you will probably want to get data from GBIF. Here is an illustration of how to do this conveniently by using the rgbif R package:

# Get presence data for Great White Shark from rgbif
# jagged_tooth <- "Carcharodon carcharias"
#gws <- presence_rgbif(jagged_tooth)
#gws <- get_data("rgbif_great_white_shark")
gws <- get_data("rgbif_galemys_pyrenaicus")

## Warning in data(list = x): data set 'rgbif_galemys_pyrenaicus' not found

# TODO: add presence dataset from gbif to package
# to avoid problems to build if rgbif service is unavailable

# Plot the point data, with a world map to provide context
world <- map_data("world")
ggplot(world, aes(x = long, y = lat, group = group)) +
  # paint country border
  geom_path(color = "gray50", alpha = 0.4) +
  # add general styling
  theme_raquamaps() + coord_cartesian() +
  # zoom in to Europe
  coord_cartesian(xlim = c(-20, 10), ylim = c(35, 50)) +
  # add point data
  geom_point(data = gws, 
     aes(x = gws$decimalLongitude, 
        y = gws$decimalLatitude, group = 0),
     alpha = 0.1, color = "darkgreen")

## Warning: Removed 293 rows containing missing values (geom_point).

# We need at least 5 observations per half degree cell, 
# and need to find those grid cells where the species is present
r <- rasterize_presence(gws)
loiczids <- which_cells_in_raster(r, 5)

hcaf_gws <- 
  # use European half degree cell authority file
  default_hcaf() %>%
  # return only cells where there was presence
  filter(loiczid %in% loiczids)

# Now we know where this species is likely to be present 
# and we determine the corresponding bioclimate variable parameter 
# spread for the cells with known presence

spread_gws <- calc_spreads(hcaf_gws)

# Finally we calculate relative probabilities for presence in 
# other cells, assuming that cells with similar "environmental envelope" 
# is more likely to be a suitable habitat for this species 

hcaf_eu <- default_hcaf()
p <- calc_probs(hcaf_eu, spread_gws)

# Using half degree cell grid data for the whole world
# TODO: unfortunately we miss bioclimate variable data for the full
# world grid, this should be added
hdc_world <- 
  tbl_df(get(data("aquamaps_hc"))) %>%
  mutate(loiczid = LOICZID) %>%
  select(-LOICZID)
rownames(hdc_world) <- hdc_world$loiczid

# TODO: Ask Sven K for reasonable habitat

Exporting results in raster format

You can now proceed to visualize this data in raster format like so:

# Put the probabilities into a raster layer
r <- raster(ncol = 720, nrow = 360)
r[p$loiczid] <- p$prod_p

# Convert raster layer data to a data frame
# to simpligy plotting with ggplot2
cx <- coordinates(r)[ ,1]
cy <- coordinates(r)[ ,2]
df <- data.frame(x = cx, y = cy, p = values(r))

# Shows how to use Fisher-Jenks style class intervals
# for coloring in the plot
fisher_intervals <- classIntervals(df$p, n = 5, style = "fisher")

## Warning in classIntervals(df$p, n = 5, style = "fisher"): var has missing
## values, omitted in finding classes

p_steps <- cut(df$p, breaks = fisher_intervals$brks)

# For now we just go with a standard five-stepped
# sequential color scheme
breakpoints <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
colors <- brewer.pal(5, "YlOrRd")
p_steps <- cut(df$p, breaks = breakpoints)

# Plot raster data along with the world map
map_gws <- 
  ggplot(world, aes(x = long, y = lat, group = group)) +
  # zoom in to Europe
  coord_cartesian(xlim = c(-50, 50), ylim = c(20, 75)) +
  # use specific styling
  theme_bw() + theme(
      axis.text.x = element_text(size = 10),
      axis.text.y = element_text(size = 10),
      panel.background = element_blank(), 
      plot.background = element_blank(),
      panel.grid.minor = element_line(colour = "gray95"),
      panel.grid.major = element_line(colour = "gray95"),
      axis.text = element_blank(),
      axis.ticks = element_blank(),
      axis.line = element_blank(),
      axis.title = element_blank(),
      panel.border = element_blank()) +  
  # add the raster data that was converted to a data frame earlier
  geom_raster(data = df, aes(x, y, fill = p_steps, group = NULL)) +
  # this determines colors used, inspired by aquamaps.org
  scale_fill_brewer(type = "seq", palette = "YlOrRd", name = "Probabilities") +
  # outline country border with a light gray color
  geom_path(color = "gray70", alpha = 0.4) +
  # add grid lines to the map
  scale_y_continuous(breaks=(-2:2) * 30) +
  scale_x_continuous(breaks=(-4:4) * 45)

# We now display this plot
map_gws

# To export this map, ggsave can be used
#ggsave(filename = "map.png", plot = map_gws, 
#       width = 5, height = 4, units = "cm")

# To export the raster data one can save the raster
# or export the corresponding data frame to csv, like so:
# write.csv2(tbl_df(df), file = "rasterdata.csv")

Batch calculation of spreads and probabilites

# Calculate spreads for several species at once
# dplyr style, limit calculation to top 10 species
species_list %>% 
  filter(count > 100) %>%
  head(10) %>%
  group_by(species) %>%
  do(calc_spreads_by_species(.$species))

## Source: local data frame [80 x 14]
## Groups: species
## 
##            species    Measure    n n_distinct n_NA   min  max   q1   q3
## 1    ardea cinerea  Elevation 1541       1518    0     1 2323  104  484
## 2    ardea cinerea TempMonthM 1541         32    0     0   35   12   16
## 3    ardea cinerea        NPP 1541        361    0 -9999    1    0    1
## 4    ardea cinerea     SoilpH 1541       1010    0 -9999    8    5    6
## 5    ardea cinerea SoilMoistu 1541       1523    0     0  150   75  114
## 6    ardea cinerea PrecipAnMe 1541       1326    0     0  229   48   69
## 7    ardea cinerea    CTI_Max 1541        761    0   597 2272 1383 1686
## 8    ardea cinerea SoilCarbon 1541       1191    0 -9999   19    5   10
## 9  aythya fuligula  Elevation 1208       1199    0 -9999 1933  101  429
## 10 aythya fuligula TempMonthM 1208         29    0     0   34   10   14
## ..             ...        ...  ...        ...  ...   ...  ...  ...  ...
## Variables not shown: d1 (dbl), d9 (dbl), range (dbl), iqr (dbl), idr (dbl)

# Create a function to wrap several steps into one
# first calculate environmental envelope for a species
# then calculate probabilities across European grid

calc_probs_species <- function(.latinname) {
#  hcaf_species <- hcaf_by_species(.latinname)
  spreads_species <- calc_spreads_by_species(.latinname)
  hcaf_eu <- default_hcaf()
  probs <- tbl_df(calc_probs(hcaf_eu, spreads_species))
  return (probs)
}

# Now we use the function above to calculate probabilities
# for just one species
calc_probs_species(species_list$species[2])

## Source: local data frame [8,593 x 11]
## 
##    loiczid Elevation TempMonthM NPP SoilpH SoilMoistu PrecipAnMe
## 1    11997         1      0.000   0      0  1.0000000    0.39575
## 2    11998         1      0.000   0      0  0.2948841    0.39850
## 3    11999         1      0.000   0      0  0.2778778    0.39450
## 4    12712         1      0.000   0      0  1.0000000    0.46900
## 5    12713         1      0.000   0      0  1.0000000    0.47350
## 6    12714         1      0.000   0      0  1.0000000    0.48150
## 7    13415         1      0.125   0      0  1.0000000    0.51050
## 8    13417         1      0.000   0      0  1.0000000    0.52025
## 9    13419         1      0.000   0      0  0.7864846    0.50975
## 10   13420         1      0.000   0      0  1.0000000    0.51325
## ..     ...       ...        ... ...    ...        ...        ...
## Variables not shown: CTI_Max (dbl), SoilCarbon (dbl), prod_p (dbl),
##   geomprod_p (dbl)

# However, we want to use the function above now 
# to calculate probabilities for several species,
# Here is how we do that, dplyr style, including
# how to limit to 10 species in one batch calculation
batch_of_ten <- 
  species_list %>%
  head(10) %>%
  group_by(species) %>%
  do(calc_probs_species(.$species))

Using world map and ggmap

# https://stackoverflow.com/questions/11201997/world-map-with-ggmap

TODO: Show how to use a bayesian / monte carlo model

TODO: Restrict output to areas with known presence

TODO: More illustrations

# How to merge several species into one
# say we have two synonyms, use this technique
# which_cells(c("galemys pyrenaicus", "mergus merganser"))