# ************************************************* #
#
#   Analytical Paleobiology Workshop 2022
#
#   Module 2: Paleodiversity analyses
#   Day 4: Exploring fossil occurrence data
#          and fossil record biases
#
#   Thursday, August 25th 2022
#   Emma Dunne (emma.dunne@fau.de)
# _________________________________________________
#
#   01: Data preparation
#
# *************************************************

## Package(s) used in this script:
library(tidyverse) # for data manipulation functions and ggplot

## If you've been using a lot of different packages, some function names
## might be duplicated. This step ensures that the function 'select' is
## coming from the dplyr package (part of tidyverse)
select <- dplyr::select


# Getting PBDB data -------------------------------------------------------

## Choose a taxonomic group and time interval
## (Note that the scripts for today are set up for the interval from the
## Late Triassic - Early Jurassic, so you might prefer to choose a taxon
## group in this interval)
taxon_group <- "Pseudosuchia" # Taxon group
start_interval <- "Carnian" # Interval to start at
stop_interval <- "Toarcian" # Interval to stop at

## Create an API request from the Paleobiology Database and store this URL
## as an object.
## A list of API options can be found here: https://paleobiodb.org/data1.2/
## Each input is percent-encoded so that names containing spaces or other
## reserved characters (e.g. "Late Triassic") still produce a valid URL;
## plain names such as the defaults above are left unchanged.
URL <- paste0(
  "https://paleobiodb.org/data1.2/occs/list.csv?base_name=", # occurrence data, as a .csv
  utils::URLencode(taxon_group, reserved = TRUE),
  "&interval=",
  utils::URLencode(start_interval, reserved = TRUE),
  ",",
  utils::URLencode(stop_interval, reserved = TRUE), # use our inputs from above
  "&show=full&pres=regular" # any additional columns we want
)

## Then use this to load the data into R:
occ_data_raw <- as_tibble(
  read.csv(URL, header = TRUE, stringsAsFactors = FALSE)
)

## Take a peep:
glimpse(occ_data_raw) # view columns
if (interactive()) {
  View(occ_data_raw) # open as new tab (only makes sense in an interactive session)
}


# Cleaning the data -------------------------------------------------------

## Before we do any plotting or analyses, let's clean our data up a bit

## Remove 'super-generic' identifications, so that we only retain
## occurrences to species- and genus-level
occ_data_raw2 <- filter(
  occ_data_raw,
  identified_rank %in% c("species", "genus")
)

## Remove occurrences with uncertain or informal identifications, i.e. names
## containing "cf.", “aff.”, "?", “ex. gr.”, “sensu lato”, “informal”,
## or quotation marks
occ_data_raw3 <- occ_data_raw2 %>%
  filter(!grepl("cf\\.|aff\\.|\\?|ex\\. gr\\.|sensu lato|informal|\\\"", identified_name))

## The API query already asks for pres=regular, but just in case any trace
## fossils crept through...
## Remove entries marked as 'trace' or 'soft', and those with no genus name.
## filter() with explicit NA handling is used instead of `[` subsetting:
## with `[`, a condition that evaluates to NA (e.g. a missing pres_mode or
## genus) can turn the row into an all-NA row rather than keeping or
## dropping it cleanly.
occ_data_raw4 <- filter(occ_data_raw3, !(pres_mode %in% "trace")) # trace taxa
occ_data_raw5 <- filter(occ_data_raw4, !grepl("soft", pres_mode)) # 'soft' preservation
occ_data_raw6 <- filter(occ_data_raw5, !is.na(genus), genus != "") # missing genus name

## Finally, keep only one row per accepted taxon name within each collection,
## i.e. drop rows that repeat the same accepted_name + collection_no
## combination (the first such row is kept, with all of its columns):
occ_data <- distinct(occ_data_raw6, accepted_name, collection_no, .keep_all = TRUE)

## Take a look:
## How much has our data been reduced by?
glimpse(occ_data)

## Sometimes, with PBDB data, you'll find some annoying trace terms still
## remain after you've cleaned your data through these steps. Some
## researchers (myself included) like to keep lists
## of pesky taxa so we can remove them more explicitly.
## You'll find code for this here:
## https://github.com/emmadunne/pbdb_cleaning

## Save a copy as a .csv file for safe keeping
## Both .csv files written by this script go into ./data, so make sure that
## folder exists first (nothing happens if it is already there)
dir.create("./data", showWarnings = FALSE)
write_csv(occ_data, "./data/occs_cleaned_pseudo.csv")


# Set up time interval info -----------------------------------------------

## Next, let's grab information for time intervals from the PBDB so
## that their ages in Ma match the same ages as the occurrences

## Download names and ages of time intervals from the PBDB:
## NOTE(review): this request uses http and API version data1.1, whereas the
## occurrence download uses https and data1.2 - confirm this is intentional
## before changing it, since column names may differ between API versions.
intervals_all <- read.csv("http://paleobiodb.org/data1.1/intervals/list.txt?scale=all&limit=all")

## Make a vector of stage names that we will specifically be looking at
## (we'll be using this vector later for plotting too)
interval_names <- c(
  "Carnian", "Norian", "Rhaetian", # Late Triassic
  "Hettangian", "Sinemurian", "Pliensbachian", "Toarcian" # Early Jurassic
)

## Select these intervals from the full PBDB intervals table:
intervals <- filter(intervals_all, interval_name %in% interval_names)

## Pare this down to just the columns we'll need:
intervals <- dplyr::select(intervals, interval_name, early_age, late_age)

## For ease of use later, let's rename the age columns to match the
## occurrence data:
intervals <- rename(intervals, "max_ma" = "early_age", "min_ma" = "late_age")

## And finally, calculate the midpoint for each interval
intervals$mid_ma <- (intervals$min_ma + intervals$max_ma) / 2

## Take a peep:
if (interactive()) {
  View(intervals) # open as new tab (only makes sense in an interactive session)
}

## Save a copy as a .csv file
write_csv(intervals, "./data/intervals_Car_Tor.csv")