Process Data to GBIF format

Raphaël Nussbaumer

Introduction

This script reads the Birdlasser files collected on the ground and produces the events.csv and occurances.csv files in Darwin format for the publication of the Bird Survey Around Mida Creek on GBIF.

Load surveys metadata

A session comprises around 90 surveys gathered during a month, each of them covering a different transect. Sessions are repeated every month. A survey is a ~1 hour (non-stop) recording of all species encountered.

For each session (usually 4 surveys performed from 6am to 10am), a Birdlasser csv file with all the species recorded is exported by the observer and sent to Raphaël Nussbaumer. All the files received are stored in the /birdlasser_files/ folder and additional information on each survey is recorded in the metadata.xlsx file:

  • filename: Birdlasser file name containing the survey data and stored in /birdlasser_files/
  • observer: Letter for each observer L (Kirao Lennox), J (Juma Badi), D (Daniel Kazungu), K (Kibwana Ali), S (Saddam Kailo) and M (Mohammed Ali)
  • sessionID: Session identifier (monthly grouping of surveys covering all the transects), YYYYMM with the month where most surveys were done.
  • Date: Date of the survey
  • start_time: Time of survey start
  • end_time: Time of survey end
  • first_sighting: DEPRECATED: before unidentified was used to specify the start and end time, we used this.
  • last_sighting: DEPRECATED
  • protocol_ok: Whether the data is properly recorded (TRUE or FALSE).
  • gap_min: Duration of the survey where birds were not recorded (often because of rain) in MM:HH.
  • notes: Additional notes with respect to the survey.

Here, we read this file as a table and add computed variables:

  • start_hour: (start time rounded to the closest hour) so 6,7,8,9,10 or 11
  • duration: (end-start-gaps) and create an survey
  • surveyID: a unique identifier per survey
all_surveys <- read_excel(paste(project.root,"data/metadata.xlsx",sep="/"), sheet=2) %>% 
  filter(ifelse(is.na(protocol_ok), TRUE, FALSE)) %>% 
  mutate(
    filename = filename,
    date = force_tz(date,tzone = "Africa/Nairobi"),
    start_time = parse_time(format(start_time,'%H:%M')),
    #first_sighting = parse_time(format(first_sighting,'%H:%M')),
    #last_sighting = parse_time(format(last_sighting,'%H:%M')),
    end_time = parse_time(format(end_time,'%H:%M')),
    gap_min = ifelse( is.na(gap_min), 0, as.integer(format(gap_min,'%M'))),
    start_hour = parse_time(format(round(as.POSIXct(start_time, format="%H:%M:%S"), units="hours"),'%H:%M')),
    surveyID = paste( sessionID, format(date, "%Y%m%d"), str_replace(start_hour,':00:00','') , observer, sep = '_' ),
    notes = str_replace_all(notes, "[\r\n]" , ""),
    observer = observer,
    .keep = "used"
    )

We replace the observer code (first letter) by the full name of the observer.

matchName <- function(x) {
  if (x=="L"){
    "Kirao Lennox"
  } else if(x=="J"){
    "Juma Badi"
  } else if(x=="D"){
    "Daniel Kazungu"
  } else if(x=="K"){
    "Kibwana Ali"
  } else if(x=="S"){
    "Saddam Kailo"
  } else if(x=="M"){
    "Mohammed Ali"
  }
}

all_surveys <- all_surveys %>% mutate(
  recordedBy =  unlist(map(str_extract_all(observer,''), ~str_c(unlist(map_chr(.x,matchName)), collapse = "|"))),
)

Below is an extract (first 5 rows) of the dataset of surveys:

datatable(all_surveys %>% head(), filter = "top", extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('csv')), rownames = FALSE, class = "compact")

Load Sightings data

Based on the metadata of each survey loaded in the previous section, we can now create the database including all sightings (i.e. individual observations) and associated survey information.

draw <- all_surveys %>%  
  pmap_dfr(function(...) {
  s <- tibble(...)
  s
  read_csv(paste(project.root,'/data/birdlasser_files/', s$filename,'.csv',sep=""),
           col_types = cols_only(
             `Species primary name` = col_character(),
             `BirdMAP ID` = col_double(),
             Date = col_character(),
             Time = col_time(format = ""),
             Latitude = col_double(),
             Longitude = col_double(),
             Notes = col_character()
             )
           ) %>% 
    mutate(Date = parse_date_time(Date,c("%Y-%m-%d", "%d/%m/%Y"), exact=TRUE, tz = "Africa/Nairobi" )) %>% # date are formated differently, this line allow to read both format
    filter(Date == s$date, s$start_time <= Time, s$end_time >= Time ) %>% # filter data for the survey
    mutate(
      specieName = `Species primary name`,
      specieID = `BirdMAP ID`,
      date = Date+Time,
      lat = Latitude,
      lng = Longitude,
      note = str_replace_all(Notes, "[\r\n]" , ""),
      # From Survey information
      surveyID = s$surveyID,
      sessionID = s$sessionID,
      # filename = s$filename,
      recordedBy = s$recordedBy,
      #  start_hour = s$start_hour,
      #  duration = s$duration,
      #  start_time = s$start_time,
      #  end_time = s$end_time,
      #  gap_min = s$gap_min,
      #  notes_survey = s$notes,
      .keep = "none"
   ) %>% 
    arrange(date) %>% 
    group_by(surveyID,specieID) %>% 
    mutate(recordID = paste(surveyID,specieID,row_number(),sep="_")) %>% 
    ungroup()
})


datatable(draw %>% head(), filter = "top", extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('csv')), rownames = FALSE, class = "compact")

Surveys cleaning

In this section, we clean the survey data from erroneous locations and species.

dCleaned <- draw

Cleaning location

We remove and interpolate the wrong locations of individual sightings which were determined by visual checking (see map below).

errorLocList = c(
  '202006_20200528_06_JD_0','202006_20200528_06_JD_545','202006_20200610_06_JD_0','202006_20200616_06_JD_0','202006_20200616_06_JD_517','202006_20200701_06_JD_0','202006_20200701_06_JD_989','202006_20200701_06_JD_784','202006_20200624_09_JD_797','202006_20200610_08_JD_0','202006_20200610_07_JD_0','202006_20200610_07_JD',
  '202007_20200713_06_JM_517','202007_20200721_07_JM_521','202007_20200713_06_JM_314',
  '202008_20200817_06_JM_0','202008_20200818_06_JM_0','202008_20200819_06_JM_989','202008_20200819_06_JM_0','202008_20200821_07_JM_551','202008_20200817_08_JM_989',
  '202010_20201105_06_JM_0','202010_20201108_07_JM_3834',
  '202012_20201215_06_JK_0','202012_20201212_07_JK_3024','202012_20201215_08_JK_0','202012_20201212_07_JK_859','202012_20201210_09_JK_580','202012_20201215_07_JK_0',
  '202101_20210111_08_JM_685','202101_20210113_06_JM_649','202101_20210119_07_JM_3834','202101_20210121_08_JM_144','202101_20210125_06_JM_321',
  '202103_20210309_09_SK_712','202103_20210310_09_SK_772','202103_20210325_07_JM_772','202103_20210315_06_JM_390','202103_20210315_07_JM_715','202103_20210315_07_JM_627','202103_20210319_06_JM_3024','202103_20210316_07_JM_422','202103_20210317_08_JM_385','202103_20210313_06_SK_387','202103_20210313_06_SK_712','202103_20210313_06_SK_654','202103_20210310_09_JM_772','202103_20210309_09_JM_712','202103_20210315_06_JM_723','202103_20210325_07_JM_3032','202103_20210315_06_JM_627','202103_20210315_06_JM_0','202103_20210313_06_JM_654','202103_20210313_06_JM_712','202103_20210313_06_JM_387',
  '202104_20210406_09_JM_3032','202104_20210407_09_JM_385','202104_20210406_08_JM_715','202104_20210417_08_JM_401','202104_20210417_07_JM_517','202104_20210417_09_JM_413','202104_20210416_09_JM_3032','202104_20210416_08_JM_756','202104_20210406_08_JM_493','202104_20210407_08_JM_656','202104_20210408_07_JM_76','202104_20210406_09_JM_517','202104_20210407_08_JM_682','202104_20210406_09_JM_0','202104_20210416_09_JM_387','202104_20210417_09_JM_517','202104_20210417_09_JM_413','202104_20210417_08_SK_723',
  '202105_20210522_06_JM_0','202105_20210521_08_JM_436','202105_20210510_07_SK_729','202105_20210513_06_JM_521','202105_20210513_06_JM_989','202104_20210413_09_JM_3834','202104_20210406_08_JM_0','202104_20210413_07_JM_412', '202105_20210515_09_JM_3974','202105_20210511_09_JM_390','202105_20210512_09_JM_736','202105_20210522_06_JM_400','202105_20210522_06_JM_0','202105_20210518_08_JM_144','202105_20210518_08_JM_740','202105_20210517_08_JM_989','202105_20210517_06_JM_0','202105_20210517_07_JM_0','202105_20210511_08_JM_712'
  )

dCleaned <- dCleaned %>% mutate(
     georeferenceVerificationStatus = ifelse(recordID %in% errorLocList, "corrected with linear interpolation by RN","verified visually by RN"),
     georeferenceRemarks = ifelse(recordID %in% errorLocList, paste0('Initial coordinate: ', lng, ', ',lat),'')
  )

dCleaned[dCleaned$recordID %in% errorLocList,]$lng=NA
dCleaned[dCleaned$recordID %in% errorLocList,]$lat=NA

for (s in unique(dCleaned$surveyID)){
  id = dCleaned$surveyID==s
  dCleaned[id,]$lat = na_interpolation(dCleaned[id,]$lat)
  dCleaned[id,]$lng = na_interpolation(dCleaned[id,]$lng)
}

We visualized all sightings on a map to identify the erroneous locations. We selected specific sessions for ease of cleaning.

# all_surveys$sessionID %>% unique()

ds = dCleaned %>% filter(sessionID=='202008')

m <- leaflet(width = "100%") %>%
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.satellite",
    accessToken = 'pk.eyJ1IjoicmFmbnVzcyIsImEiOiIzMVE1dnc0In0.3FNMKIlQ_afYktqki-6m0g')) %>% 
  addFullscreenControl()

factpal <- colorFactor(brewer.pal(12, 'Paired'), as.factor(ds$surveyID))

for (s in unique(ds$surveyID)){
  m <- addPolylines(m, data=ds %>% 
                      filter(surveyID==s) %>% 
                      mutate(      
                        day = format(date,"%Y%m%d"),
                        label = paste0(
                         "<b>Date:</b> ", date,'<br>',
                         "<b>surveyID</b>: ", surveyID,'<br>'
                      )),
                    lng = ~lng, lat = ~lat , 
                    group = ~day, 
                    color=~factpal(surveyID), popup = ~label)
}

m <- addCircleMarkers(m, data = ds %>%
                  mutate(
                    day = format(date,"%Y%m%d"),
                    label = paste0(
                       "<b>recordID</b>: ", recordID,'<br>',
                       "<b>Species:</b> ", specieName,'<br>',
                       "<b>Date:</b> ", date, '<br>',
                       "<b>surveyID</b>: ", surveyID,'<br>'
                  )),
                lng = ~lng, lat = ~lat, popup = ~label,
                radius = ~ifelse(specieID==400, 10, 6),
                stroke = FALSE,
                fillOpacity = ~ifelse(specieID==400, 1, 0.8),
                color = ~factpal(surveyID),
                group = ~day)

m <- m %>% addLayersControl(
    overlayGroups =  format(ds$date,"%Y%m%d"),
    options = layersControlOptions(collapsed = FALSE)
  )

m

Cleaning Species

The cleaning of the species is based on an external dataset where all species reported were manually checked (validated or flagged as error). Here, we load this spreadsheet and apply the corrections (remove or replace with equivalent specieID).

species.list.validation <- read_xlsx(paste0(project.root, '/data/species_lists_validation.xlsx'), range = "A1:F288") %>% 
  mutate(`Equivalent specieID` = ifelse(is.na(`Equivalent specieID`) & grepl("Error",Status) ,0,`Equivalent specieID`))

dCleaned <- dCleaned %>% 
  right_join(species.list.validation, by="specieID") %>% 
  mutate(
    identificationVerificationStatus = Status,
    identificationRemarks = paste0(ifelse(is.na(Comment),'',Comment), ifelse(!is.na(Comment)&!is.na(`Equivalent specieID`),' | ',''), ifelse(is.na(`Equivalent specieID`),'',paste0('Original entry: ', specieName.x,' (',specieID,')'))),
  ) %>%
  mutate(specieID = ifelse(is.na(`Equivalent specieID`),specieID,`Equivalent specieID`)) %>% 
  select(-c('specieName.x','specieName.y','Comment','Equivalent specieID','Status','NumberObs'))

We added external information (specie name, family, etc…) based on the Species list from the 2019 Checklist of the Birds of Kenya 5th Edition

species.list <- read_xlsx('C:/Users/rnussba1/Documents/GitHub/Birds-of-Kenya/Checklist of the Birds of Kenya 5th Edition (2019)/2019 Checklist of the Birds of Kenya 5th Edition (2019).xlsx', sheet = "main")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in A1182 / R1182C1: got 'Total'
dCleaned <- dCleaned %>% left_join(species.list, by=c("specieID" = "ADU"))

# Check for no match
dCleaned %>% filter(is.na(family_english )) %>% .$specieID %>% unique()
## [1] 0
dsp = dCleaned %>% 
  group_by(specieID) %>% 
  summarise(
    common_name = first(common_name),
    scientific_name = first(scientific_name),
    family_english = first(family_english),
    NumberObs = n(),
    ) %>% 
  arrange(NumberObs)
  
datatable(dsp, filter = "top", extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('csv')), rownames = FALSE, class = "compact")

Add location information to survey

As location information have to be added to the event data (i.e., survey), we add the transect name and footprintWKT to each survey (in all_surveys).

f <- paste0(project.root,"/data/transects.geojson")
geojson <- fromJSON(f)
track = readOGR(f,require_geomType = "wkbLineString",verbose=F) %>% spTransform( CRS("+init=epsg:4326"))
## NOTE: keeping only 23 wkbLineString of 23 features
mtrack <- matrix(unlist(lapply(coordinates(track), function(x) apply(x[[1]],2,mean))),nrow=2) 

dLineStr <- dCleaned %>%
  mutate(
    lat2=lat,
    lng2=lng,
  ) %>% 
  st_as_sf(coords = c("lng","lat")) %>%
  group_by(surveyID) %>% 
  summarize(
    mlat = mean(lat2),
    mlng = mean(lng2),
    transectName = geojson$features$properties$Name[unlist(map2(mlng,mlat,function(mlng,mlat) which.min((mlng-mtrack[1,])^2 + (mlat-mtrack[2,])^2 )))]
  ) %>% 
  st_cast("LINESTRING") %>% 
  mutate(footprintWKT=st_as_text(geometry))

all_surveys <- all_surveys %>% right_join(dLineStr,by = "surveyID")

Visualization of the transects

factpal <- colorFactor(brewer.pal(12, 'Paired'), as.factor(dLineStr$transectName))

leaflet(width = "100%") %>%
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.satellite",
    accessToken = 'pk.eyJ1IjoicmFmbnVzcyIsImEiOiIzMVE1dnc0In0.3FNMKIlQ_afYktqki-6m0g')) %>% 
  addFullscreenControl() %>% 
  addPolylines(data=dLineStr, group = ~transectName, color=~factpal(transectName)) %>% 
  addPolylines(data=track, popup = geojson$features$properties$Name, color = ~factpal(geojson$features$properties$Name),opacity = 1,group = geojson$features$properties$Name) %>% 
  addLayersControl(
    overlayGroups =  dLineStr$transectName,
    options = layersControlOptions(collapsed = FALSE)
  )

Export in Darwin format

Export Event table

events <- all_surveys %>% 
  transmute(
    type = "Event",
    language = "en",
    license = "http://creativecommons.org/publicdomain/zero/1.0/legalcode",
    rightsHolder = "A Rocha Kenya",
    ownerInstitutionCode = "ARK",
    eventID = surveyID,
    parentEventID = sessionID,
    samplingProtocol = 'Water Bird Count',
    sampleSizeValue = difftime(end_time,start_time, units="mins")-gap_min,
    sampleSizeUnit = "minutes",
    # samplingEffort = coverage,
    eventDate = format(date,"%Y-%m-%d"),
    eventTime = paste0(format(start_time,"%H:%M"),"/",format(end_time,"%H:%M")),
    eventRemarks = notes,
    # Locations
    # locationID = transectID,
    continent = "Africa",
    country = "Kenya",
    countryCode = "KE",
    # stateProvince =" ",
    county = "Kilifi",
    # municipality = if_else(site=='Sabaki', "Sabaki", "Mida"),
    locality = transectName,
    # locationRemarks = description,
    # decimalLatitude = latitude,
    # decimalLongitude = longitude,
    geodeticDatum ="WGS84",
    footprintWKT = footprintWKT,
    #georeferencedBy = "Raphaël Nussbaumer",
    #georeferencedDate = "03/06/2020",
    #georeferenceSources = "https://www.geonames.org/ | https://www.google.co.ke/maps/",
    #georeferenceVerificationStatus = "verified by curator",
    #georeferenceRemarks = "",
    dynamicProperties = paste0("{",
      'gap: "',gap_min,'", ',
      "}"
    ),
    )

Export occurrence table

occurrences <- dCleaned %>% 
  transmute(
    # Occurrence
    occurrenceID = recordID,
    basisOfRecord = "HumanObservation",
    occurrenceRemarks = note,
    occurrenceStatus = "present",
    # Location
    decimalLatitude = lat,
    decimalLongitude = lng,
    georeferenceVerificationStatus = georeferenceVerificationStatus,
    georeferenceRemarks = georeferenceRemarks,
    # Taxon
    taxonID = `Clements--code`,
    taxonRank = `Clements--rank`,
    kingdom = "Animalia",
    phylum = "Chordata",
    class = "Aves",
    family = family_scientific,
    # scientificNameAuthorship = "",
    scientificName = scientific_name,
    vernacularName = common_name,
    identificationVerificationStatus = identificationVerificationStatus,
    identificationRemarks = identificationRemarks,
    # Others
    eventID = surveyID,
    # parentEventID = sessionID,
    recordedBy = recordedBy,
      )

Write to Excel

write.csv(events, file = paste(project.root,"data/events.csv",sep="/"),  na = "", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(occurrences, file = paste(project.root,"data/occurrences.csv",sep="/"), na = "", row.names = FALSE, fileEncoding = "UTF-8")