https://github.com/vsbuffalo/paradox_variation
Raw File
Tip revision: 8fa6b5834f6536319b1e5cd9722ca02d317183df authored by Vince Buffalo on 12 August 2021, 01:04:21 UTC
updated ms
Tip revision: 8fa6b58
species_ranges.r
library(rgbif)
library(rnaturalearth)
library(sf)
library(tidyverse)
library(readxl)
library(igraph)
library(units)

geomean <- function(x) prod(x)^(1/length(x))
source('../R/range_funcs.r')


#### Load the data in 

# we don't load in Corbett-Detig data, which has ranges though some are off.
# This is merged in combined_dataset.r. I make one correction here though, to
# Equus ferus przewalskii.

## Leffler data
message('loading Leffler et al data...')
dl <- read_tsv('./leffler_et_al_2012_updated.tsv')
lsp <- sanitize_species(dl$species)
names(lsp) <- dl$species

# get Leffler species names
uniq_lsp_names <- unique(names(lsp))
uniq_lsp <- lsp[uniq_lsp_names]

## Stapley data 
message('loading Stapley et al data...')
ds <- read_tsv('./stapley_et_al_2017_updated.tsv')
# Stapley is all unique species, double check
stopifnot(length(unique(ds$species)) == nrow(ds))

## Romiguier data 
# there are some multiple observations, but not combined like Leffler data
# (hence no need for key)
message('loading Romiguier et al data...')
drs <- read_tsv('./romiguier_et_al_2014_updated.tsv')
drs_sps <- unique(drs$species)

## Combine all data
# flatten the Leffler data
species_df <- bind_rows(map2(lsp, names(lsp), function(x, n) {
                 tibble(key=rep(n, length(x)), species=unlist(x),
                        dataset='Leffler')
}))

species_df <- bind_rows(species_df, tibble(key=ds$species, 
                        species=ds$species, dataset='Stapley'),
                        tibble(key=drs_sps, species=drs_sps,
                               dataset='Romiguier'))

uniq_species_df <- species_df %>% distinct(.keep_all=TRUE)
nrow(uniq_species_df)

#### Species keys and occurence data from GBIF
safe_occ_search <- safely(occ_search)

OCC_FILE <- 'gbif_species_occ.Rdata'
if (!file.exists(OCC_FILE)) {
  message('cached GBIF occurence data et al not found -- downloading from GBIF')
  message('getting suggested GBIF species names...')
  uniq_species_df <- uniq_species_df %>% 
                       mutate(taxon_key = map(species, 
                                        ~ name_suggest(., rank='species')))

  message('downloading GBIF species occurrences...')
  n <- nrow(uniq_species_df)
  uniq_species_df <- uniq_species_df %>% 
                       mutate(occs = map2(taxon_key, 1:nrow(uniq_species_df),
                         function(x, i) {
                           keep_keys <- x$key[!is.na(x$canonicalName)]
                           map(keep_keys, function(k) {
                             msg <-"    species '%s' %d/%d\n"
                             sps <- species_df$species[i]
                             message(sprintf(msg, sps, i, n))
                             out <- safe_occ_search(k, 
                                                    fields='minimal', 
                                                    limit = 30000)
                             return(out$result)
                         })
                        }))

  save(uniq_species_df, file=OCC_FILE)
} else {
  message('loading cached GBIF occurence data...   ', appendLF=FALSE)
  load(OCC_FILE)
}
message('done.')


## for citation (grr, GBIF makes this hard)
# keys <- unlist(map(uniq_species_df$taxon_key, 'key'))
# dwld <- occ_download(
#   pred_in("taxonKey", keys),
#   format = "SIMPLE_CSV",
#   user="vsbuffalo", pwd="hidden", email="vsbuffalo@gmail.com")


# combine in alphas
DEFAULT_ALPHA <- c(8, 70)
RANGE_DIR <- 'total_range_plots'
RAW_RANGES <- 'gbif_inferred_ranges_raw.Rdata'
if (!file.exists(RAW_RANGES)) {
  ### Infer terrestriality
  # filter entries without lat/long and infer terrestriality 
  ranges <- uniq_species_df %>% 
              #filter(species == 'Anoplopoma fimbria') %>%
              #filter(species == 'Aptenodytes patagonicus') %>%
              mutate(valid_geo = map_lgl(occs, valid_occurs)) %>%
              filter(valid_geo) %>%
              distinct(species, .keep_all=TRUE) %>%
              mutate(is_terrestrial = map_lgl(occs, get_terrestriality))

  ## Manually fix things
  # merge in the manually fixed data and fix the terrestriality flags
  source('./species_range_fixes.r')
  ranges <- ranges %>% 
    mutate(genus = extract_genus(species)) %>%
    mutate(is_terrestrial_fixed =
           pmap_lgl(list(genus, species, is_terrestrial),
                    function(g, s, it) {
                     case_when((g %in% marine_gnr) || 
                               (s %in% marine_sps) ~ FALSE,
                               (g %in% terrestrial_gnr) ||
                               (s %in% terrestrial_sps) ~ TRUE,
                               TRUE ~ it)
                    })) %>%
    mutate(is_terrestrial_orig = is_terrestrial, 
           is_terrestrial = is_terrestrial_fixed) %>%
    select(-is_terrestrial_fixed)

  # how many terrestriality fixes?
  table(ranges$is_terrestrial != ranges$is_terrestrial_orig)

  ### Infer Range
  ranges <- ranges %>%  
   mutate(range_area=pmap(list(occs, species, genus, is_terrestrial),
                    function(x, sp, gn, t) {
                      clean_name <- gsub(' +', '_', gsub('/', '_', sp))
                      pdf(sprintf('%s/%s.pdf', RANGE_DIR, clean_name))
                      if (length(x) == 0) {
                        # another degenerate case: just plot an empty plot 
                        maps::map('world', lwd=0.3, col='gray10')
                        dev.off()
                        return(NA)
                      }

                      alpha <- DEFAULT_ALPHA
                      if (sp %in% names(custom_alphas_sps))
                        alpha <- custom_alphas_sps[[sp]]
                      if (gn %in% names(custom_alphas_gnr))
                        alpha <- custom_alphas_gnr[[gn]]
                      # if any alphas are NA, set to default
                      alpha[is.na(alpha)] <- DEFAULT_ALPHA[is.na(alpha)]
                      if (gn %in% no_contrain_gnr || sp %in% no_contrain_sps)
                        constrain <- FALSE
                      else
                        constrain <- TRUE
                      areas <- map_dbl(x, infer_range, alpha=alpha, 
                                      is_terrestrial=t, species=sp, 
                                      constrain=constrain)
                      dev.off()
                      return(areas)
                    }
                    )) %>%
  mutate(n_occ = map_int(occs, count_occs))

  save(ranges, file=RAW_RANGES)
} else {
  message('loading raw range data...   ', appendLF=FALSE)
  load(RAW_RANGES)
}
message('done.')

ranges_df <- ranges %>% select(-occs, -taxon_key, -valid_geo) %>%
               mutate(range = map_dbl(range_area, max, na.rm=TRUE)) %>%
               select(-range_area)

## Manual fixes
# to ensure we are assining *over*, or correcting a value, 
# not creating a new entry, we use this
is_entry <- function(species) stopifnot(any(ranges_df$species == species))

# Human range size is just wrong. Occurrences weren't logged everywhere.
earth_land_area <- 510.1e6 # km^2
antartica_land_area <- 13.21e6 # km^2
human_range <- earth_land_area - antartica_land_area
is_entry('Homo sapiens')
ranges_df[ranges_df$species == 'Homo sapiens', ]$range <- human_range

# Nasonia vitripennis 
# see: https://www.nature.com/articles/hdy2009160/figures/1
us_land_area <- 9.834e6 # km^2
europe_land_area <- 10.18e6  # km^2
nv_range <- us_land_area + europe_land_area
is_entry('Nasonia vitripennis')
ranges_df[ranges_df$species == 'Nasonia vitripennis', ]$range <- nv_range

# Caenorhabditis
# https://www.wolframalpha.com/input/?i=area+of+california+%2B+area+of+washington+state+%2B+area+of+oregon+
# based on Frezel and Felix (2015)
western_us <- 863400 # km^2 
celegans_range <- western_us + europe_land_area
is_entry('Caenorhabditis elegans')
ranges_df[ranges_df$species == 'Caenorhabditis elegans', ]$range <- celegans_range

# Pinus balfouriana
# this was done manually in photoshop from the image 
# https://en.wikipedia.org/wiki/Pinus_balfouriana#/media/File:Pinus_balfouriana_range_map_1.png
ca_area <- 423971 # km^2
pinus_bal_sel <- (174016 - 172459) / 174016 
is_entry('Pinus balfouriana')
ranges_df[ranges_df$species == 'Pinus balfouriana', ]$range <- ca_area* pinus_bal_sel

# Pinus_massoniana
# This is an old map -- I base the area off korea
korea_rel_area <- 9758
pinus_mas_rel_area <- 62666
korea_area <- 100210 + 120540
pinus_mas_area <- korea_area * pinus_mas_rel_area / korea_rel_area
is_entry('Pinus massoniana')
ranges_df[ranges_df$species == 'Pinus massoniana', ]$range <- pinus_mas_area

# Pinus pinaster
pinus_pins_rel_area <- 20880
germany_rel_area <- 101984
germany_area <- 357000 # km^2
pinus_pins_area <- pinus_pins_rel_area / germany_rel_area * germany_area
is_entry('Pinus pinaster')
ranges_df[ranges_df$species == 'Pinus pinaster', ]$range <- pinus_pins_area

# Crassostrea gigas's distribution is coastal, which alpha hull 
# measures have trouble with. 
# 
crass_rel_area <- 15909
australia_rel_area <- 3429
australia_area <- 7.741e6
crass_range <- crass_rel_area / australia_rel_area * australia_area
is_entry('Crassostrea gigas')
ranges_df[ranges_df$species == 'Crassostrea gigas', ]$range <- crass_range

# Cystodytes dellechiajei's automated range is way off. 
# This is because it is a coastal species and the occurence data
# is being overfit. 
# I have used the map here: https://www.sealifebase.ca/summary/Cystodytes-dellechiajei.html#
# and used photoshop to estimate the red areas
cyst_rel_area <- 9538 
australia_rel_area <- 4094
cyst_range <- cyst_rel_area / australia_rel_area * australia_area
is_entry('Cystodytes dellechiajei')
ranges_df[ranges_df$species == 'Cystodytes dellechiajei', ]$range <- cyst_range


# Ciona intestinalis's automated range is way off. 
# https://www.aquamaps.org/receive.php?type_of_map=regular
# note: this isn't that far off from the inferred range from occurrence data, which is reassuring
ciona_i_rel_area <- 2442
australia_rel_area <- 4139
ciona_i_range <- ciona_i_rel_area / australia_rel_area * australia_area
is_entry('Ciona intestinalis')
ranges_df[ranges_df$species == 'Ciona intestinalis', ]$range <- ciona_i_range

# Hippocampus Kuda is off beacuse it's coastal
# https://www.aquamaps.org/receive.php?type_of_map=regular
# note: this isn't that far off from the inferred range from occurrence data, which is reassuring
# note, this leads to N ~ 10^9
# this seems high but searching around, I've found
# reports of 1e7 dead seahorses being poached
# so it's not unreasonable.
# https://abcnews.go.com/Technology/wireStory/peruvian-authorities-123-million-dried-seahorses-seized-66066372
hk_rel_area <- 3644
australia_rel_area <- 4147  # roughly same map source as above
hk_range <- hk_rel_area / australia_rel_area * australia_area
is_entry('Hippocampus kuda')
ranges_df[ranges_df$species == 'Hippocampus kuda', ]$range <- hk_range


# from https://www.iucnredlist.org/species/2815/123789863
is_entry('Bison bison')
ranges_df[ranges_df$species == 'Bison bison', ]$range <- 143253

# Some ranges were very poorly inferred, due mostly do low 
# occurrence data
ranges_df[ranges_df$genus == "Chlamydomonas", ]$range <- NA
ranges_df[ranges_df$genus == "Plasmodium", ]$range <- NA
ranges_df[ranges_df$genus == "Neurospora", ]$range <- NA
ranges_df[ranges_df$genus == "Phytophthora", ]$range <- NA
ranges_df[ranges_df$genus == "Paracoccidioides", ]$range <- NA
ranges_df[ranges_df$genus == "Histoplasma", ]$range <- NA
ranges_df[ranges_df$genus == "Saccharomyces", ]$range <- NA

# Let's talk about Chlorocebus aethiops.
# Looking at the inferred range, it's clear to me that 
# these are generally Chlorocebus sps. labeled as Chlorocebus.
# see: https://www.researchgate.net/figure/Distribution-of-African-green-monkeys-Chlorocebus-species-in-Africa-Chlorocebus_fig5_304030985
# and https://en.wikipedia.org/wiki/Grivet#/media/File:Grivet_area.png
# this is manually fixed using the Wikipedia range.
grivet_rel_area <- 1787
saudia_arabia_rel_area <- 3136
saudia_arabia_area <- 1.961e6 # km^2 -- wolfram alpha
grivet_range <- grivet_rel_area / saudia_arabia_rel_area * saudia_arabia_area
is_entry('Chlorocebus aethiops')
ranges_df[ranges_df$species == 'Chlorocebus aethiops', ]$range <- grivet_range 


# Like Chlorocebus, Callithrix jacchus occurrences seem to 
# generally include all Callithrix species.
# Compare: https://commons.wikimedia.org/wiki/File:Distribution_Callithrix.jpg
# with https://commons.wikimedia.org/wiki/File:Callithrix_jacchus_distribution.svg
common_marmoset_rel_area <- 1630
brazil_rel_area <- 12730 + common_marmoset_rel_area
brazil_area <- 8.515e6 # km^2 wolfram alpha
common_marmoset_area <- common_marmoset_rel_area / brazil_rel_area * brazil_area 
is_entry('Callithrix jacchus')
ranges_df[ranges_df$species == 'Callithrix jacchus', ]$range <- common_marmoset_area


# Pongo species -- Pongo abelii and P. pygmaeus
# (in the Leffler data they are combined) 
# Pongo species are way off -- here alpha hull approach is too liberal for
# these endagered species. Their ranges are (sadly) much more fragmented. IUCN
# redlist data does a better job:
# https://www.iucnredlist.org/species/121097935/123797627#population
# 
# I use the data on that page, from Wich et al. (2016)
is_entry('Pongo abelii')
ranges_df[ranges_df$species == 'Pongo abelii', ]$range <- 16775 # km^2
# https://www.iucnredlist.org/species/17975/123809220#text-fields
is_entry('Pongo pygmaeus')
ranges_df[ranges_df$species == 'Pongo pygmaeus', ]$range <- 97716 #km^2


## Termites
# the occurrence data is bad for these two species. Works well for flavipes
#https://www.researchgate.net/figure/Geographic-distribution-of-Reticulitermes-species-Isoptera-Rhinotermitidae-in-western_fig1_226108638
is_entry('Reticulitermes grassei')
is_entry('Reticulitermes lucifugus')
ret_grassei_rel <- 24005
ret_lucif_rel <- 10064
austria_area <- 32386
austria_rel_area <- 4385
ranges_df[ranges_df$species == 'Reticulitermes grassei', ]$range <- ret_grassei_rel / austria_rel_area * austria_area
ranges_df[ranges_df$species == 'Reticulitermes lucifugus', ]$range <- ret_lucif_rel / austria_rel_area * austria_area


## Daphnia
# Daphnia is hard -- clearly, though, terrestrial inference is way off since
# these live in freshwater. As a *very* rough approximation we say that 4% of
# land is lakes (this is true globally:
# https://blog.nationalgeographic.org/2014/09/15/117-million-lakes-found-in-latest-world-count/) 
PERCENT_LAKE <- 0.037 
daphnia_ranges <- ranges_df[ranges_df$genus == 'Daphnia', ]$range 
ranges_df[ranges_df$genus == 'Daphnia', ]$range  <- PERCENT_LAKE * daphnia_ranges


## Shrimp
# There were poorly inferred from GBIF data for a known and expected 
# reason: they're primarily coastal
# The manual fixes are based off a map (downloaded
# from http://en.aquaculture.ifremer.fr/Sectors/Crustacean-sector/Discoveries/Shrimps
# and in range_maps)
# 
# first image in the range map pair
australia_rel_area <- 5307
mars_rel_area <- 3102
mars_area <- mars_rel_area  / australia_rel_area * australia_area
is_entry('Marsupenaeus japonicus')
ranges_df[ranges_df$species == 'Marsupenaeus japonicus', ]$range <- mars_area

# second image in the range map pair
lito_rel_area <- 6638
australia_rel_area2 <- 5203 # 2, for the second map in this range map pair
lito_area <- lito_rel_area / australia_rel_area2 * australia_area
is_entry('Litopenaeus vannamei')
ranges_df[ranges_df$species == 'Litopenaeus vannamei', ]$range <- lito_area

panaeus_rel_area <- 7398
australia_rel_area2 <- 5203 # 2, for the second map in this range map pair
panaeus_area <- panaeus_rel_area / australia_rel_area2 * australia_area
is_entry('Penaeus monodon')
ranges_df[ranges_df$species == 'Penaeus monodon', ]$range <- panaeus_area

## Canis lupus
# the occurrence dataset has a bunch of odd entries. I instead 
# use the IUCN redlist range maps
australia_rel_area <- 1632
canisl_rel_area <- 31281
canisl_area <- canisl_rel_area  / australia_rel_area * australia_area
is_entry('Canis lupus')
ranges_df[ranges_df$species == 'Canis lupus', ]$range <- canisl_area


## Argopecten irradians -- bay scallop
# like shrimp, these are coastal and their range is poorly inferred by 
# GBIF occurrence data using an alpha hull
# Despite a few occurrences across the globe, this is an Atlantic coast species
# as far as I can tell.
# I use the range map from here: 
# https://www.sciencedirect.com/topics/agricultural-and-biological-sciences/argopecten-irradians
# The range map was too roughly colored, so I've very rougly traced it
# in bright green (again, on log scale this noise is fine)
argo_rel_area <- 6864
bahamas_rel_area <- 369
bahamas_area <- 13880 # km2, wolfram alpha
argo_area <- argo_rel_area / bahamas_rel_area * bahamas_area
is_entry('Argopecten irradians')
ranges_df[ranges_df$species == 'Argopecten irradians', ]$range <- argo_area

## bird range fixes
# Aquila clanga/pomarina -- from http://datazone.birdlife.org/species/factsheet/lesser-spotted-eagle-clanga-pomarina -- geometric mean of both species, since their diversity data is averaged
is_entry('Aquila clanga')
ranges_df[ranges_df$species == 'Aquila clanga', ]$range <- 5340000
is_entry('Aquila pomarina')
ranges_df[ranges_df$species == 'Aquila pomarina', ]$range <- 18100000

# not a huge difference, but I'll use the birdlife value
is_entry('Ficedula albicollis')
ranges_df[ranges_df$species == 'Ficedula albicollis', ]$range <- 4430000
# this range is muuuch smaller according to birdlife.org
is_entry('Taeniopygia guttata')
ranges_df[ranges_df$species == 'Taeniopygia guttata', ]$range <- 308000
# again, not a huge difference but I'll use the birdlife.org values
is_entry('Zonotrichia albicollis')
ranges_df[ranges_df$species == 'Zonotrichia albicollis', ]$range <- 8040000


## Equus ferus przewalskii
# from: https://ielc.libguides.com/sdzg/factsheets/przewalskishorse/behavior
# Hustai National Park: 120-2,400 ha (King and Gurnell 2005)
# Great Gobi B Strictly Protected Area: 150-825 kmĀ² (Kaczensky et al. 2008).
# note that the Corbett-Detig dataset lists this range as 10^7.13, which is around
# what regular horsies have... and this just seems wrong.
# 
# since this is a correction to Corbett Detig's data, which hasn't been 
# merged in yet, I add this row first.
eqfp <- tibble(key = "Equus ferus przewalskii", 
               species = "Equus ferus przewalskii",
               dataset = "Corbett-Detig", is_terrestrial = TRUE, 
               genus="Equus", 
               is_terrestrial_orig = TRUE, n_occ = NA, range = NA)
ranges_df <- rbind(ranges_df, eqfp)
is_entry('Equus ferus przewalskii')
ranges_df[ranges_df$species == 'Equus ferus przewalskii', ]$range <- geomean(0.01 * c(120, 2400)) + geomean(c(150, 825))


## Metriaclima zebra, an endemic cichlid
# Issue here is it's endemc in Lake Malawi, with few occurrences logged.
# I just put the whole Lake Malawi area
lake_malawi_area <- 29600 # km2
is_entry('Metriaclima zebra')
ranges_df[ranges_df$species == 'Metriaclima zebra', ]$range <- lake_malawi_area

## Fenneropenaeus chinensis, Chinese white shrimp
# this is an obvoius outlier in the data, and has too few occurrences to
# accurately infer the range of
is_entry('Fenneropenaeus chinensis')
ranges_df[ranges_df$species == 'Fenneropenaeus chinensis', ]$range <- NA

# Let's fix the Drosophilids. 
# The data is from 
# https://web.archive.org/web/20170710053101/http://www.drosophila-speciation-patterns.com/
# where I've found a spreadsheet of the absolute range sizes
drosoph <- read_xlsx('./RAW_dataset_Jan_06_2012.xlsx')[, c(1, 27)] %>%
             rename(species=`Sp 1`, 
                    range=`absolute range (sp.1) * in square km`) %>%
             mutate(species = paste('Drosophila', species)) %>%
             mutate(range = as.numeric(range)) %>%
             filter(!is.na(range)) %>% distinct()


ranges_df <- ranges_df %>% left_join(drosoph, by='species') %>%
                 mutate(range = pmax(range.x, range.y, na.rm=TRUE))  %>%
                 select(-range.y, -range.x)

write_tsv(ranges_df, "gbif_ranges.tsv")

## some diagnostic images

# leffd %>%
#   group_by(species) %>% summarize_at(c('size', 'range', 'diversity'), mean, na.rm=TRUE) %>%
#   ggplot(aes(size, range, color=genus)) + scale_x_log10() +  geom_point() +
#     scale_y_log10() + #geom_smooth(method='lm') + 
#     geom_text_repel(aes(size, range, label=species), size=1.8) + 
#     theme(legend.position="none")

# leffd %>%
#   ggplot(aes(range / size, diversity)) + scale_x_log10() + 
#     scale_y_log10() + geom_smooth(method='lm') + 
#     geom_text_repel(aes(range/size, diversity, label=species), size=1.8) 

back to top