# Code from the second day of the 2022 APW
# Erlangen, 2022-08-23

# 0. Read in the Palmer penguin dataset!
# setwd("C:/Users/Adam/Desktop/penguins")

# either from the hard drive
# penguins <- read.csv("data/penguins.csv")

# or from the internet
penguins <- read.csv("https://fau-paleo.github.io/apw_2022/data/1_toolset/penguins.csv")

### NOTE: This is also published as a package:
## install.packages("palmerpenguins")
## library(palmerpenguins)
## data(penguins)

# 1. Select the subset that corresponds to the Gentoo and the Adelie penguins!

# Method 1: separate subsets and rejoin
gentoo <- subset(penguins, species == "Gentoo")
adelie <- subset(penguins, species == "Adelie")
both <- rbind(gentoo, adelie)
### Note: this orders your data. Records from Gentoo will come first
### and then will come those of Adelie.

# using the subset function and the match operator
Gen_Ad <- subset(penguins, species %in% c("Adelie", "Gentoo"))

# simplified example for the match operator
x <- c("a", "b", "r", "f")
lookup <- c("a", "b")

# result is a logical vector, the length is the same as the length of x
x %in% lookup
# result c(TRUE, TRUE, FALSE, FALSE)

# how to measure performance:
# the first argument is a block of code
system.time({
  for (i in 1:1000) {
    mean(1:1000)
  }
})

# using the subset operators and the logical OR operator
Gen_Ad2 <- penguins[penguins$species == "Adelie" | penguins$species == "Gentoo", ]

# The same with which
# this is better for defense against missing values
Gen_Ad2 <- penguins[which(penguins$species == "Adelie" | penguins$species == "Gentoo"), ]

# using the dplyr package (provides filter() and re-exports the %>% pipe)
library(dplyr)
Data.Peng1 <- penguins %>% filter(species == "Gentoo" | species == "Adelie")

# Operator objects can be accessed with the backticks
`%>%`
?`%in%`

# NA at the end does not change this result
which(c(penguins$species == "Adelie" | penguins$species == "Gentoo", NA))

########################################----------------------------------------
# 2. With a loop for every species:
# a.
#    subset the data by species
# b. create a directory for it
# c. write it into the new directory in rds and csv format!
# every file contains data that belong to one species.

# Two equally good ways to do this.
sp <- unique(penguins$species)
# sp <- levels(factor(penguins$species))

# seq_along() is used instead of 1:length(sp): if sp were empty,
# 1:length(sp) would iterate over c(1, 0) instead of not running at all.
for (i in seq_along(sp)) {
  # in every iteration: the current species
  # (as.character() keeps this working if species was read in as a factor)
  currentSpecies <- as.character(sp[i])

  # create a new directory for the species
  # (skipped when it already exists, so re-running the script does not warn)
  if (!dir.exists(currentSpecies)) {
    dir.create(currentSpecies)
  }

  # get the subset that contains all data of the species
  speciesData <- penguins[which(penguins$species == currentSpecies), ]

  # write it into the directories
  # write.csv
  write.csv(
    speciesData,
    file = file.path(currentSpecies, paste0(currentSpecies, ".csv")),
    row.names = FALSE
  )
  # "Chinstrap/data.csv"
  # use this "Chinstrap/Chinstrap.csv"

  # saveRDS
  saveRDS(
    speciesData,
    file = file.path(currentSpecies, paste0(currentSpecies, ".rds"))
  )
}

# Why we use which?
# Create a different version of the penguins dataset,
# where we have a missing value in the species column
pen <- penguins
pen$species[1] <- NA

# compare the subsetting with which():
# which() returns only the positions that are TRUE, so the NA is dropped
withWhich <- pen[which(pen$species == currentSpecies), ]

# compare the subsetting without which():
# the NA in the logical index produces an extra row filled with NAs
withoutWhich <- pen[pen$species == currentSpecies, ]

# Contrasting .rds and the .RData
# reading in the rds file
adelie <- readRDS("Adelie/Adelie.rds")

# .RData: keeps the information of the names of the object
# saving as an RData file
save(adelie, file = "Adelie/Adelie.RData")

# remove the object from the workspace
rm(adelie)

# it is no longer there
ls()

# loading
load(file = "Adelie/Adelie.RData")

# the object is back!
ls()

# not to do: Only gives you the names of the data
# o <- load(file="Adelie/Adelie.RData")

#######################################----------------------------------------
# 3. Change the previous code to save the subsets in a list.
# Use the species names as the names of the list!
# List can be heterogeneous, can contain different kinds of data
li <- list(a = 1:10, b = c("a", "b", "c"))

# how many elements are there
length(li)

# accessing elements:
# this is still a list
li[1]

# this is the element only: a vector
li[[1]]
li$a

# saving the results in an object.
# This is a container: a list with one (still empty) slot per species,
# allocated up front so it does not have to grow inside the loop
container <- vector("list", length(sp))

# iterate through every species
# (seq_along() does not run the loop at all if sp is empty)
for (i in seq_along(sp)) {
  # The current species
  currentSpecies <- sp[i]

  # get the subset that contains all data of the species
  speciesData <- penguins[which(penguins$species == currentSpecies), ]

  # store it in the i-th slot of the container
  container[[i]] <- speciesData
}

# the structure of the final list
str(container)

# good idea to indicate which element pertains to what species
names(container) <- sp

# accessing items in this list:
# the dollar operator
container$Chinstrap

# is essentially the same as this:
container[["Chinstrap"]]

# you can get deeper subsets of this: the first row of the data.frame
container[["Chinstrap"]][1, ]

# 4. Write a function that returns parts of the dataset based on column name and value,
# either greater or equal/lower or lower or equal to the value.
function(x, column, value, greater)

#' @param x The data frame to subset
#' @param column Which column you are basing the subsetting on. (Numeric column)
#' @param value A numeric value, that separates the subset that you want to select from what you don't want.
#' @param greater Should a subset be selected that correponds to rows where the values are
#' greater than \code{value} (TRUE), or lower tan equal (FALSE).