Introduction

This tutorial shows how probability of activity curves can be extracted from the VHF signal data after classification of 1 min intervals into active or passive states. I’m using Hierarchical Generalized Additive Models following the Pedersen et al. 2021 article. The data needed for the execution of this tutorial can be downloaded here

Packages required

Before you start, make sure that you have the following packages installed: tidyverse, data.table,lubridate, hms, mgcv, gratia, itsadug, mgcViz, ggthemes, viridis

Additionally we will use functionalities from the tRackIT R package that is not hosted on cran yet. Please check the README in the tRackIT GitHub repository for installation instructions

Load the packages

1. Data importation and wrangling

The tRackIT-package GitHub page shows the complete workflow to get from raw signals to activity classifications with a 1 Minute resolution. We will skip that part here and start with the classified data of bats that have been tracket from 2018 to 2021 (all_bats_aggregated). There is one file per individual, that contains the timestamp, the activity class per timestamp and the ID. We will need some more info such as timing in relation to sunset and sunrise as well as species, sex etc. To do so we will use two functionalities of the tRackIT-package. The time_of_day() function claculates timedifference tu sunset and sunrise etc, the add.ID.info() function adds meta-info for the individual to the data. This section of the tutorial can only be executed if the ecological_case_study_bat_activity directory has been downloaded. It has >270Gb and can´t be directly downloaded via the user interface. Check the README of the repository for instructions. However, you can skip that part and continue with the next chunk of code using already processed data.

#set coordinates for sunset/sunrise calculation

Lat<-50.844868
Lon<-8.663649

#Loop through years 
for(y in c(2018, 2019, 2020, 2021)){
#print(y)
#get projectfile per year
pr<-getProject(projroot=paste0("bats_for_activity/",y, "/"))

#remove ids with no data at all
pr$tags<-pr$tags[pr$tags$ID!="Mbec180518_2",]
pr$tags<-pr$tags[pr$tags$ID!="mbec150155_m",]

#loop through individuals
for(id in pr$tags$ID){
  
  #print(id)
  
  anml<-getAnimal(projList = pr, animalID = id)
  #get activity classification aggregated to 1 Minute
  activity_files<-list.files(anml$path$classification,pattern="agg", full.names = TRUE)
  
  if(length(activity_files)>0){
  
  activity_1Min<-data.table::fread(fls[1])
  
  #calculate daytime infos
 activity_1Min<-time_of_day(data = activity_1Min, Lat=Lat, Lon = Lon, tcol = "timestamp", tz = "CET", activity_period = "nocturnal")
  
  #add id info
activity_1Min<-add.ID.Info( data=activity_1Min, animal=anml)

#number of tracking days
activity_1Min$n_days<-as.numeric(abs(difftime(as.Date(activity_1Min$start_datetime), as.Date(activity_1Min$stop_datetime))))+1

#binary activity classification
activity_1Min$activity<-ifelse(activity_1Min$prediction=="active", 1,0)  

#correction of typo
activity_1Min$species[activity_1Min$species=="mbec"]<-"Mbec"
  
 data.table::fwrite(activity_1Min, paste0("all_bats_aggregated/",anml$meta$animalID, "_",as.character(y), "_aggregated_1_Minute.csv"))
    
      }
}}


#merge all data 
all_aggregated_files<-list.files("all_bats_aggregated/", full.names = TRUE)

all_bats<-plyr::ldply(fls, function(x){data.table::fread(x)})

all_bats<-all_bats[all_bats$species=="Mbec" | all_bats$species=="Nlei",]



#get info which ids should be excluded
check_ids<-data.table::fread("bats_inspect_id.csv")
excld<-check_ids[check_ids$exclude_individual=="Y",]

#exclude ids from data
all_bats<-all_bats[!(all_bats$ID %in% excld$ID),]

#account for rep state transition

df_rep<-read.csv("df_rep_state_transition.csv")
for (id in unique(df_rep$ID)){
  
  if (df_rep$rep2[df_rep$ID==id]=="pregnant"){
    
    all_bats$rep.state[all_bats$ID==id & all_bats$year==df_rep$year[df_rep$ID==id] & all_bats$timestamp>=df_rep$start_rep1[df_rep$ID==id]]<-"lactating"}
  
  if (df_rep$rep2[df_rep$ID==id]=="post-lactating"){
    
  all_bats$rep.state[all_bats$ID==id & all_bats$year==df_rep$year[df_rep$ID==id] & all_bats$timestamp>=df_rep$start_rep2[df_rep$ID==id]]<-"post-lactating"}
  
}



#save data
data.table::fwrite(all_bats,"all_bats_aggregated.csv")

1.1 Data importation

## Rows: 818,398
## Columns: 24
## $ timestamp      <dttm> 2020-05-15 20:14:00, 2020-05-15 20:15:00, 2020-05-15 2~
## $ prediction     <fct> active, active, active, active, active, active, active,~
## $ ID             <fct> h146474, h146474, h146474, h146474, h146474, h146474, h~
## $ date           <date> 2020-05-15, 2020-05-15, 2020-05-15, 2020-05-15, 2020-0~
## $ sunset_date    <date> 2020-05-15, 2020-05-15, 2020-05-15, 2020-05-15, 2020-0~
## $ hour           <int> 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,~
## $ sunrise_date   <date> 2020-05-16, 2020-05-16, 2020-05-16, 2020-05-16, 2020-0~
## $ sunset         <dttm> 2020-05-15 19:08:35, 2020-05-15 19:08:35, 2020-05-15 1~
## $ sunrise        <dttm> 2020-05-16 03:34:14, 2020-05-16 03:34:14, 2020-05-16 0~
## $ time_to_rise   <dbl> -7.337226, -7.320559, -7.303893, -7.287226, -7.270559, ~
## $ time_to_set    <dbl> 1.090028, 1.106695, 1.123362, 1.140028, 1.156695, 1.173~
## $ start_datetime <dttm> 2020-05-15 20:14:00, 2020-05-15 20:14:00, 2020-05-15 2~
## $ stop_datetime  <dttm> 2020-05-29 21:58:00, 2020-05-29 21:58:00, 2020-05-29 2~
## $ year           <int> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2~
## $ ydate          <int> 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, ~
## $ month          <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5~
## $ t              <dbl> 0, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600, 660~
## $ species        <fct> Mbec, Mbec, Mbec, Mbec, Mbec, Mbec, Mbec, Mbec, Mbec, M~
## $ sex            <fct> w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w, w~
## $ age            <fct> ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad, ad,~
## $ weight         <dbl> 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, 8.2, ~
## $ rep.state      <fct> pregnant, pregnant, pregnant, pregnant, pregnant, pregn~
## $ n_days         <int> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,~
## $ activity       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1~

1.2 Data wrangling

Exclude retagged individuals and extract sample sizes

## # A tibble: 2 x 4
##   species         nID   nObs meanDays
##   <fct>         <int>  <int>    <dbl>
## 1 M.bechsteinii    52 577977     18.4
## 2 N.leisleri       20 204443     21.3
##   nID   nObs meanDays
## 1  72 782420 19.15002

NOTE Some individuals were tagged twice within the same year. We want to avoid these situations and reduce the sampling period to the first tagging event. The full details of this analysis can be found here.

1.3 Visual inspection

We can plot visually inspect the data by presenting the probability of activity over time of the day. Here, it is easier to calculate this probability by time intervals of 15 minutes for easier viuslization

We can also inspect each tagged individual in the same way

2. HGAM for species comparison of daily activity patterns

We fit a Hierarchical Generalized Additive Model to compare whether Bechstein’s and Leisler’s bats differ significantly in their daily activity patterns. We assume that the probability of activity is a non-linear function of time of the day, here centered around time of sunset (t = 0). We use a binomial error term since our activity column is a string of 0 and 1 (i.e the bat is either marked as passive for that minute or active).

2.1 Fit models

We use circular spline functions to constrain the activity probability to be equal at 00:00 and 23:59 (argument bs = "cc"). We also need to account for the fact that individuals were measured repeatedely but in differnet years of monitoring. The simplest way to do that is to include individuals and date as random intercepts with the argument bs = "re". Note that there are many flavors for specifying random effects with HGAMs which apply more or less penalty to the random effects and allow them to deviate from population level trends. Here, we are mainly interested in species differences and assume that there is a general time of activity function that is species specific. Note that more complex random effect structures gave functionally similar results and did not affect conclusions. Pedersen et al. 2021 provides a great and exhasutive tutorials on the different ways to approach random effects within the GAM framework. There are two other arguments that require ou attention. First, we need to account for the fact that there observations are likely to be highly autocorrelated because they are taken at 1 min intervals. This value has to be set manually and we show our procedure for investigating autocorrelation in our analysis R script (see filename, section 1.1). Next, we need to select the degree of complexity of our smoothing terms: k. After inspection with the k.check function, setting k to 120 was the highest complexity we could set without overfitting the data (see for details).

To test for species difference in activity patterns, we fit two models. Model M0 assumes that both species have the same global activity patterns while model M1 allows both species to follow their own trend (argment by = species within the spline function)

Set autocorrelation (r1) and basis dimension (k) of the smooth term

Fit model M0

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## activity ~ s(time_to_set, bs = c("cc"), k = k) + s(ID, bs = "re") + 
##     s(date_f, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.7383     0.1339  -12.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df  Chi.sq p-value    
## s(time_to_set)  51.84    118 4061286  <2e-16 ***
## s(ID)           67.07     71 1976322  <2e-16 ***
## s(date_f)      280.26    306 3084457  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.45   Deviance explained = 38.8%
## fREML = 8.5139e+05  Scale est. = 1         n = 782420

Fit model M1

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## activity ~ s(time_to_set, bs = c("cc"), k = k, by = species) + 
##     s(ID, bs = "re") + s(date_f, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.7399     0.1309  -13.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                        edf Ref.df  Chi.sq p-value    
## s(time_to_set):speciesM.bechsteinii  47.23    118 3571851  <2e-16 ***
## s(time_to_set):speciesN.leisleri     45.82    118  188033  <2e-16 ***
## s(ID)                                66.32     71 1824189  <2e-16 ***
## s(date_f)                           282.01    306 2421828  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.468   Deviance explained = 40.4%
## fREML = 8.4389e+05  Scale est. = 1         n = 782420

Compare M0 and M1

## Analysis of Deviance Table
## 
## Model 1: activity ~ s(time_to_set, bs = c("cc"), k = k) + s(ID, bs = "re") + 
##     s(date_f, bs = "re")
## Model 2: activity ~ s(time_to_set, bs = c("cc"), k = k, by = species) + 
##     s(ID, bs = "re") + s(date_f, bs = "re")
##   Resid. Df Resid. Dev    Df Deviance
## 1    781994     578790               
## 2    781942     563613 52.41    15177
##                  df      AIC delta.AIC
## fit.gam.M1 443.2209 248309.1      0.00
## fit.gam.M0 400.6479 263401.1  15092.01

2.2 Plot models

2.3 Extract timing of activity values

The following section shows how different aspects of the dialy activity patterns can be extracted from the HGAM output. We focus here on determining the timinig for onset and end of the daily activity period, the timing of peak activity and the intensity of activity during the night.

## # A tibble: 2 x 3
##   species       a.onset       a.end        
##   <fct>         <time>        <time>       
## 1 M.bechsteinii 00'30.879867" 07'10.972640"
## 2 N.leisleri    00'12.125519" 07'23.475539"
## # A tibble: 2 x 4
##   species       peak.a peak.a.low peak.a.up
##   <fct>          <dbl>      <dbl>     <dbl>
## 1 M.bechsteinii  0.770      0.751     0.788
## 2 N.leisleri     0.822      0.798     0.845
## # A tibble: 2 x 2
##   species       t.peak       
##   <fct>         <time>       
## 1 M.bechsteinii 01'33.394363"
## 2 N.leisleri    00'37.131317"
## # A tibble: 2 x 4
##   species         auc auc.low auc.up
##   <fct>         <dbl>   <dbl>  <dbl>
## 1 M.bechsteinii  4.70    4.56   4.84
## 2 N.leisleri     3.42    3.23   3.62

3. HGAM for comparing activity patterns reproductive periods

We can also use HGAM to compare the timing of daily activity depending on the reproductive status of individuals within each species. The models used here are very similar to those used in section 2. with the main difference that we are now comparing a model that assumes a common activity pattern for all statuses (model 0) and one that allows reproductive statuses to have differing smoothing functions.

Exclude individuals of unknown reproductive status

## # A tibble: 2 x 3
##   species         nID   nObs
##   <fct>         <int>  <int>
## 1 M.bechsteinii    35 384536
## 2 N.leisleri       19 203261
## # A tibble: 8 x 4
## # Groups:   species [2]
##   species       rep.state        nID   nObs
##   <fct>         <fct>          <int>  <int>
## 1 M.bechsteinii lactating         13 102076
## 2 M.bechsteinii non.repro          4  68685
## 3 M.bechsteinii post-lactating    12 135582
## 4 M.bechsteinii pregnant          13  78193
## 5 N.leisleri    lactating         11  71560
## 6 N.leisleri    non.repro          2  23296
## 7 N.leisleri    post-lactating     7  58889
## 8 N.leisleri    pregnant           6  49516

3.1 Bechstein’s bats

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## activity ~ s(time_to_set, bs = c("cc"), k = k) + s(ID, bs = "re") + 
##     s(date_f, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.6686     0.1146  -14.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df Chi.sq  p-value    
## s(time_to_set)  45.50    118 573547  < 2e-16 ***
## s(ID)           28.39     35  41969  0.00503 ** 
## s(date_f)      226.32    252  63199 3.32e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.519   Deviance explained = 44.5%
## fREML = 4.0871e+05  Scale est. = 1         n = 384536
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## activity ~ rep.state + s(time_to_set, bs = c("cc"), k = k, by = rep.state) + 
##     s(ID, bs = "re") + s(date_f, bs = "re")
## 
## Parametric coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.2606     0.1477  -8.532  < 2e-16 ***
## rep.statenon.repro       -0.7385     0.2799  -2.639  0.00832 ** 
## rep.statepost-lactating  -0.8867     0.1325  -6.692  2.2e-11 ***
## rep.statepregnant        -0.1065     0.1996  -0.534  0.59357    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                           edf Ref.df Chi.sq  p-value    
## s(time_to_set):rep.statelactating       33.82    118  77911  < 2e-16 ***
## s(time_to_set):rep.statenon.repro       30.96    118 159447  < 2e-16 ***
## s(time_to_set):rep.statepost-lactating  38.59    118 159446  < 2e-16 ***
## s(time_to_set):rep.statepregnant        31.39    118  58687  < 2e-16 ***
## s(ID)                                   26.75     34  17596   0.0664 .  
## s(date_f)                              224.82    252  44573 5.29e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.526   Deviance explained = 45.3%
## fREML = 4.07e+05  Scale est. = 1         n = 384536
##                    df      AIC delta.AIC
## fit.gam.Mb.1 392.2189 106210.7     0.000
## fit.gam.Mb.0 301.9440 109867.9  3657.224

3.2 Leisler’ bats

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## activity ~ s(time_to_set, bs = c("cc"), k = k) + s(ID, bs = "re") + 
##     s(date_f, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.7416     0.3772  -4.617 3.89e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df  Chi.sq p-value    
## s(time_to_set)  45.71    118  401373  <2e-16 ***
## s(ID)           16.94     18 2697157  <2e-16 ***
## s(date_f)      114.41    126 4172289  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.35   Deviance explained = 32.8%
## fREML = 2.1573e+05  Scale est. = 1         n = 203261
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## activity ~ rep.state + s(time_to_set, bs = c("cc"), k = k, by = rep.state) + 
##     s(ID, bs = "re") + s(date_f, bs = "re")
## 
## Parametric coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.3364     0.5211  -2.564 0.010336 *  
## rep.statenon.repro        1.0943     1.3571   0.806 0.420023    
## rep.statepost-lactating   0.7711     0.2588   2.980 0.002887 ** 
## rep.statepregnant        -2.6379     0.6794  -3.883 0.000103 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                           edf Ref.df Chi.sq  p-value    
## s(time_to_set):rep.statelactating       36.84    118 146915  < 2e-16 ***
## s(time_to_set):rep.statenon.repro       28.29    118  17017  < 2e-16 ***
## s(time_to_set):rep.statepost-lactating  36.58    118  46074  < 2e-16 ***
## s(time_to_set):rep.statepregnant        36.46    118  58619  < 2e-16 ***
## s(ID)                                   15.94     17 904539   0.0185 *  
## s(date_f)                              112.07    126 764998 3.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.367   Deviance explained = 34.3%
## fREML = 2.1435e+05  Scale est. = 1         n = 203261
##                    df      AIC delta.AIC
## fit.gam.Nl.1 272.7888 54313.97     0.000
## fit.gam.Nl.0 178.7082 57287.66  2973.691