Introduction
This script reads the Birdlasser files collected in the field and produces the events.csv and occurrences.csv files in Darwin Core format for the publication of the Bird Survey Around Mida Creek on GBIF.
Load survey metadata
A session comprises around 90 surveys gathered during a month, each covering a different transect. Sessions are repeated every month. A survey is a ~1 hour (non-stop) recording of all species encountered.
For each day of surveying (usually 4 surveys performed from 6am to 10am), a Birdlasser csv file with all the species recorded is exported by the observer and sent to Raphaël Nussbaumer. All the files received are stored in the /birdlasser_files/ folder, and additional information on each survey is recorded in the metadata.xlsx file:
filename
: Birdlasser file name containing the survey data, stored in /birdlasser_files/.
observer
: Letter for each observer: L (Kirao Lennox), J (Juma Badi), D (Daniel Kazungu), K (Kibwana Ali), S (Saddam Kailo) and M (Mohammed Ali).
sessionID
: Session identifier (monthly grouping of surveys covering all the transects), YYYYMM with the month where most surveys were done.
date
: Date of the survey.
start_time
: Time of survey start.
end_time
: Time of survey end.
first_sighting
: DEPRECATED. Before unidentified entries were used to specify the start and end times, we used this field.
last_sighting
: DEPRECATED.
protocol_ok
: Whether the data is properly recorded (TRUE or FALSE).
gap_min
: Duration of the survey during which birds were not recorded (often because of rain), in HH:MM.
notes
: Additional notes with respect to the survey.
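For illustration, a single row of metadata.xlsx could look like this (all values are hypothetical, not actual survey data):
# Hypothetical metadata row, for illustration only
tibble::tibble(
  filename = "birdlasser_export_01", # file stored in /birdlasser_files/
  observer = "L",                    # Kirao Lennox
  sessionID = "202006",              # June 2020 session
  date = as.Date("2020-06-28"),
  start_time = "06:05",
  end_time = "07:02",
  protocol_ok = NA,                  # not flagged
  gap_min = "00:10",                 # 10 minutes without recording
  notes = "light rain"
)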
Here, we read this file as a table and add computed variables:
start_hour
: start time rounded to the closest hour, i.e. 6, 7, 8, 9, 10 or 11.
duration
: end_time - start_time - gap_min.
surveyID
: a unique identifier per survey.
all_surveys <- read_excel(paste(project.root, "data/metadata.xlsx", sep = "/"), sheet = 2) %>%
  filter(ifelse(is.na(protocol_ok), TRUE, FALSE)) %>% # keep only surveys with no protocol_ok flag
  mutate(
    filename = filename,
    date = force_tz(date, tzone = "Africa/Nairobi"),
    start_time = parse_time(format(start_time, '%H:%M')),
    # first_sighting = parse_time(format(first_sighting, '%H:%M')),
    # last_sighting = parse_time(format(last_sighting, '%H:%M')),
    end_time = parse_time(format(end_time, '%H:%M')),
    gap_min = ifelse(is.na(gap_min), 0, as.integer(format(gap_min, '%M'))),
    start_hour = parse_time(format(round(as.POSIXct(start_time, format = "%H:%M:%S"), units = "hours"), '%H:%M')),
    surveyID = paste(sessionID, format(date, "%Y%m%d"), str_replace(start_hour, ':00:00', ''), observer, sep = '_'),
    notes = str_replace_all(notes, "[\r\n]", ""),
    observer = observer,
    .keep = "used"
  )
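As a worked example of the resulting surveyID (hypothetical values): a survey from session 202006, done on 28 June 2020 and started around 06:00 by observer L, gets:
paste("202006", "20200628", "06", "L", sep = "_")
## [1] "202006_20200628_06_L"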
We replace the observer code (one letter per observer) with the full name of the observer.
matchName <- function(x) {
  if (x == "L") {
    "Kirao Lennox"
  } else if (x == "J") {
    "Juma Badi"
  } else if (x == "D") {
    "Daniel Kazungu"
  } else if (x == "K") {
    "Kibwana Ali"
  } else if (x == "S") {
    "Saddam Kailo"
  } else if (x == "M") {
    "Mohammed Ali"
  }
}
all_surveys <- all_surveys %>% mutate(
  # split the observer code into single letters and map each to a full name
  recordedBy = unlist(map(str_extract_all(observer, ''), ~ str_c(unlist(map_chr(.x, matchName)), collapse = "|")))
)
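For instance, matchName() maps a single observer letter to a full name; a two-letter code (hypothetical example) would be expanded to both names joined by a pipe:
matchName("D")
## [1] "Daniel Kazungu"
# an observer code of "JD" would yield recordedBy = "Juma Badi|Daniel Kazungu"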
Below is an extract (first 6 rows) of the dataset of surveys:
datatable(all_surveys %>% head(), filter = "top", extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('csv')), rownames = FALSE, class = "compact")
Load Sightings data
Based on the metadata of each survey loaded in the previous section, we can now create the database including all sightings (i.e. individual observations) and associated survey information.
draw <- all_surveys %>%
  pmap_dfr(function(...) {
    s <- tibble(...)
    read_csv(paste(project.root, '/data/birdlasser_files/', s$filename, '.csv', sep = ""),
      col_types = cols_only(
        `Species primary name` = col_character(),
        `BirdMAP ID` = col_double(),
        Date = col_character(),
        Time = col_time(format = ""),
        Latitude = col_double(),
        Longitude = col_double(),
        Notes = col_character()
      )
    ) %>%
      mutate(Date = parse_date_time(Date, c("%Y-%m-%d", "%d/%m/%Y"), exact = TRUE, tz = "Africa/Nairobi")) %>% # dates are formatted differently across files; this reads both formats
      filter(Date == s$date, s$start_time <= Time, s$end_time >= Time) %>% # keep only sightings within the survey window
      mutate(
        specieName = `Species primary name`,
        specieID = `BirdMAP ID`,
        date = Date + Time,
        lat = Latitude,
        lng = Longitude,
        note = str_replace_all(Notes, "[\r\n]", ""),
        # From survey information
        surveyID = s$surveyID,
        sessionID = s$sessionID,
        # filename = s$filename,
        recordedBy = s$recordedBy,
        # start_hour = s$start_hour,
        # duration = s$duration,
        # start_time = s$start_time,
        # end_time = s$end_time,
        # gap_min = s$gap_min,
        # notes_survey = s$notes,
        .keep = "none"
      ) %>%
      arrange(date) %>%
      group_by(surveyID, specieID) %>%
      mutate(recordID = paste(surveyID, specieID, row_number(), sep = "_")) %>%
      ungroup()
  })
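Each sighting receives a recordID built from the surveyID, the specieID (BirdMAP ID) and a per-survey, per-species counter, for example (hypothetical values):
paste("202006_20200628_06_L", 400, 1, sep = "_")
## [1] "202006_20200628_06_L_400_1"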
datatable(draw %>% head(), filter = "top", extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('csv')), rownames = FALSE, class = "compact")
Surveys cleaning
In this section, we clean the survey data by removing or correcting erroneous locations and species.
dCleaned <- draw
Cleaning location
We remove the erroneous locations of individual sightings, identified by visual checking (see map below), and replace them by linear interpolation (a small sketch of the interpolation follows the code below).
errorLocList <- c(
'202006_20200528_06_JD_0','202006_20200528_06_JD_545','202006_20200610_06_JD_0','202006_20200616_06_JD_0','202006_20200616_06_JD_517','202006_20200701_06_JD_0','202006_20200701_06_JD_989','202006_20200701_06_JD_784','202006_20200624_09_JD_797','202006_20200610_08_JD_0','202006_20200610_07_JD_0','202006_20200610_07_JD',
'202007_20200713_06_JM_517','202007_20200721_07_JM_521','202007_20200713_06_JM_314',
'202008_20200817_06_JM_0','202008_20200818_06_JM_0','202008_20200819_06_JM_989','202008_20200819_06_JM_0','202008_20200821_07_JM_551','202008_20200817_08_JM_989',
'202010_20201105_06_JM_0','202010_20201108_07_JM_3834',
'202012_20201215_06_JK_0','202012_20201212_07_JK_3024','202012_20201215_08_JK_0','202012_20201212_07_JK_859','202012_20201210_09_JK_580','202012_20201215_07_JK_0',
'202101_20210111_08_JM_685','202101_20210113_06_JM_649','202101_20210119_07_JM_3834','202101_20210121_08_JM_144','202101_20210125_06_JM_321',
'202103_20210309_09_SK_712','202103_20210310_09_SK_772','202103_20210325_07_JM_772','202103_20210315_06_JM_390','202103_20210315_07_JM_715','202103_20210315_07_JM_627','202103_20210319_06_JM_3024','202103_20210316_07_JM_422','202103_20210317_08_JM_385','202103_20210313_06_SK_387','202103_20210313_06_SK_712','202103_20210313_06_SK_654','202103_20210310_09_JM_772','202103_20210309_09_JM_712','202103_20210315_06_JM_723','202103_20210325_07_JM_3032','202103_20210315_06_JM_627','202103_20210315_06_JM_0','202103_20210313_06_JM_654','202103_20210313_06_JM_712','202103_20210313_06_JM_387',
'202104_20210406_09_JM_3032','202104_20210407_09_JM_385','202104_20210406_08_JM_715','202104_20210417_08_JM_401','202104_20210417_07_JM_517','202104_20210417_09_JM_413','202104_20210416_09_JM_3032','202104_20210416_08_JM_756','202104_20210406_08_JM_493','202104_20210407_08_JM_656','202104_20210408_07_JM_76','202104_20210406_09_JM_517','202104_20210407_08_JM_682','202104_20210406_09_JM_0','202104_20210416_09_JM_387','202104_20210417_09_JM_517','202104_20210417_09_JM_413','202104_20210417_08_SK_723',
'202105_20210522_06_JM_0','202105_20210521_08_JM_436','202105_20210510_07_SK_729','202105_20210513_06_JM_521','202105_20210513_06_JM_989','202104_20210413_09_JM_3834','202104_20210406_08_JM_0','202104_20210413_07_JM_412', '202105_20210515_09_JM_3974','202105_20210511_09_JM_390','202105_20210512_09_JM_736','202105_20210522_06_JM_400','202105_20210522_06_JM_0','202105_20210518_08_JM_144','202105_20210518_08_JM_740','202105_20210517_08_JM_989','202105_20210517_06_JM_0','202105_20210517_07_JM_0','202105_20210511_08_JM_712'
)
dCleaned <- dCleaned %>% mutate(
  georeferenceVerificationStatus = ifelse(recordID %in% errorLocList, "corrected with linear interpolation by RN", "verified visually by RN"),
  georeferenceRemarks = ifelse(recordID %in% errorLocList, paste0('Initial coordinate: ', lng, ', ', lat), '')
)

# blank out the erroneous coordinates...
dCleaned[dCleaned$recordID %in% errorLocList, ]$lng <- NA
dCleaned[dCleaned$recordID %in% errorLocList, ]$lat <- NA

# ...and fill them by linear interpolation within each survey
for (s in unique(dCleaned$surveyID)) {
  id <- dCleaned$surveyID == s
  dCleaned[id, ]$lat <- na_interpolation(dCleaned[id, ]$lat)
  dCleaned[id, ]$lng <- na_interpolation(dCleaned[id, ]$lng)
}
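As a small sketch of the interpolation above: na_interpolation() from the imputeTS package fills the missing coordinates linearly between their neighbours in the time-ordered sightings (made-up longitudes):
library(imputeTS)
na_interpolation(c(39.98, NA, NA, 40.01))
## [1] 39.98 39.99 40.00 40.01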
To identify the erroneous locations, we visualized all sightings on a map, selecting one session at a time for ease of cleaning.
# all_surveys$sessionID %>% unique()
ds <- dCleaned %>% filter(sessionID == '202008')

m <- leaflet(width = "100%") %>%
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.satellite",
    accessToken = 'pk.eyJ1IjoicmFmbnVzcyIsImEiOiIzMVE1dnc0In0.3FNMKIlQ_afYktqki-6m0g')) %>%
  addFullscreenControl()

factpal <- colorFactor(brewer.pal(12, 'Paired'), as.factor(ds$surveyID))
for (s in unique(ds$surveyID)) {
  m <- addPolylines(m,
    data = ds %>%
      filter(surveyID == s) %>%
      mutate(
        day = format(date, "%Y%m%d"),
        label = paste0(
          "<b>Date:</b> ", date, '<br>',
          "<b>surveyID</b>: ", surveyID, '<br>'
        )
      ),
    lng = ~lng, lat = ~lat,
    group = ~day,
    color = ~factpal(surveyID), popup = ~label)
}

m <- addCircleMarkers(m,
  data = ds %>%
    mutate(
      day = format(date, "%Y%m%d"),
      label = paste0(
        "<b>recordID</b>: ", recordID, '<br>',
        "<b>Species:</b> ", specieName, '<br>',
        "<b>Date:</b> ", date, '<br>',
        "<b>surveyID</b>: ", surveyID, '<br>'
      )
    ),
  lng = ~lng, lat = ~lat, popup = ~label,
  radius = ~ifelse(specieID == 400, 10, 6), # larger, fully opaque markers for specieID 400
  stroke = FALSE,
  fillOpacity = ~ifelse(specieID == 400, 1, 0.8),
  color = ~factpal(surveyID),
  group = ~day)

m <- m %>% addLayersControl(
  overlayGroups = format(ds$date, "%Y%m%d"),
  options = layersControlOptions(collapsed = FALSE)
)

m
Cleaning Species
The cleaning of the species is based on an external spreadsheet in which all reported species were manually checked (validated or flagged as errors). Here, we load this spreadsheet and apply the corrections (remove the record or replace it with an equivalent specieID).
species.list.validation <- read_xlsx(paste0(project.root, '/data/species_lists_validation.xlsx'), range = "A1:F288") %>%
  # entries flagged as "Error" without an equivalent species get specieID 0
  mutate(`Equivalent specieID` = ifelse(is.na(`Equivalent specieID`) & grepl("Error", Status), 0, `Equivalent specieID`))

dCleaned <- dCleaned %>%
  right_join(species.list.validation, by = "specieID") %>%
  mutate(
    identificationVerificationStatus = Status,
    identificationRemarks = paste0(
      ifelse(is.na(Comment), '', Comment),
      ifelse(!is.na(Comment) & !is.na(`Equivalent specieID`), ' | ', ''),
      ifelse(is.na(`Equivalent specieID`), '', paste0('Original entry: ', specieName.x, ' (', specieID, ')'))
    )
  ) %>%
  mutate(specieID = ifelse(is.na(`Equivalent specieID`), specieID, `Equivalent specieID`)) %>%
  select(-c('specieName.x', 'specieName.y', 'Comment', 'Equivalent specieID', 'Status', 'NumberObs'))
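As a toy illustration of the remove-or-replace rule applied above (both rows are hypothetical):
tibble(Status = c("Error", "Error"), `Equivalent specieID` = c(NA, 205)) %>%
  mutate(`Equivalent specieID` = ifelse(is.na(`Equivalent specieID`) & grepl("Error", Status), 0, `Equivalent specieID`))
# the first entry (no equivalent) gets specieID 0, marking it for removal;
# the second is remapped to the equivalent specieID 205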
We add external information (species name, family, etc.) based on the species list from the 2019 Checklist of the Birds of Kenya, 5th Edition.
species.list <- read_xlsx('C:/Users/rnussba1/Documents/GitHub/Birds-of-Kenya/Checklist of the Birds of Kenya 5th Edition (2019)/2019 Checklist of the Birds of Kenya 5th Edition (2019).xlsx', sheet = "main")
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in A1182 / R1182C1: got 'Total'
dCleaned <- dCleaned %>% left_join(species.list, by = c("specieID" = "ADU"))

# Check for unmatched species (only the removed entries, specieID 0, should remain)
dCleaned %>% filter(is.na(family_english)) %>% .$specieID %>% unique()
## [1] 0
dsp <- dCleaned %>%
  group_by(specieID) %>%
  summarise(
    common_name = first(common_name),
    scientific_name = first(scientific_name),
    family_english = first(family_english),
    NumberObs = n()
  ) %>%
  arrange(NumberObs)
datatable(dsp, filter = "top", extensions = 'Buttons', options = list(dom = 'Bfrtip', buttons = c('csv')), rownames = FALSE, class = "compact")
Add location information to survey
As location information has to be added to the event data (i.e., the surveys), we add the transect name and footprintWKT to each survey (in all_surveys).
f <- paste0(project.root, "/data/transects.geojson")
geojson <- fromJSON(f)
track <- readOGR(f, require_geomType = "wkbLineString", verbose = F) %>%
  spTransform(CRS("+init=epsg:4326"))
## NOTE: keeping only 23 wkbLineString of 23 features
# mean coordinate (centroid) of each transect, one column per transect
mtrack <- matrix(unlist(lapply(coordinates(track), function(x) apply(x[[1]], 2, mean))), nrow = 2)
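The next chunk assigns each survey to the transect whose centroid is nearest to the survey's mean position. A minimal sketch of that nearest-centroid rule, with made-up coordinates:
centroids <- matrix(c(39.95, -3.30, 39.98, -3.35), nrow = 2) # one column per transect
which.min((39.97 - centroids[1, ])^2 + (-3.34 - centroids[2, ])^2)
## [1] 2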
dLineStr <- dCleaned %>%
  mutate(
    lat2 = lat,
    lng2 = lng
  ) %>%
  st_as_sf(coords = c("lng", "lat")) %>%
  group_by(surveyID) %>%
  summarize(
    mlat = mean(lat2),
    mlng = mean(lng2),
    # assign the transect whose centroid is nearest to the survey's mean position
    transectName = geojson$features$properties$Name[unlist(map2(mlng, mlat, function(mlng, mlat) which.min((mlng - mtrack[1, ])^2 + (mlat - mtrack[2, ])^2)))]
  ) %>%
  st_cast("LINESTRING") %>%
  mutate(footprintWKT = st_as_text(geometry))

all_surveys <- all_surveys %>% right_join(dLineStr, by = "surveyID")
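For reference, here is a minimal sketch (toy coordinates) of the WKT string that st_as_text() produces for a survey's footprint:
library(sf)
st_as_text(st_linestring(matrix(c(39.97, -3.34, 39.98, -3.35), ncol = 2, byrow = TRUE)))
## [1] "LINESTRING (39.97 -3.34, 39.98 -3.35)"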
Visualization of the transects
factpal <- colorFactor(brewer.pal(12, 'Paired'), as.factor(dLineStr$transectName))
leaflet(width = "100%") %>%
addProviderTiles("MapBox", options = providerTileOptions(
id = "mapbox.satellite",
accessToken = 'pk.eyJ1IjoicmFmbnVzcyIsImEiOiIzMVE1dnc0In0.3FNMKIlQ_afYktqki-6m0g')) %>%
addFullscreenControl() %>%
addPolylines(data=dLineStr, group = ~transectName, color=~factpal(transectName)) %>%
addPolylines(data=track, popup = geojson$features$properties$Name, color = ~factpal(geojson$features$properties$Name),opacity = 1,group = geojson$features$properties$Name) %>%
addLayersControl(
overlayGroups = dLineStr$transectName,
options = layersControlOptions(collapsed = FALSE)
)
Export in Darwin format
Export Event table
events <- all_surveys %>%
  transmute(
    type = "Event",
    language = "en",
    license = "http://creativecommons.org/publicdomain/zero/1.0/legalcode",
    rightsHolder = "A Rocha Kenya",
    ownerInstitutionCode = "ARK",
    eventID = surveyID,
    parentEventID = sessionID,
    samplingProtocol = 'Water Bird Count',
    sampleSizeValue = difftime(end_time, start_time, units = "mins") - gap_min,
    sampleSizeUnit = "minutes",
    # samplingEffort = coverage,
    eventDate = format(date, "%Y-%m-%d"),
    eventTime = paste0(format(start_time, "%H:%M"), "/", format(end_time, "%H:%M")),
    eventRemarks = notes,
    # Locations
    # locationID = transectID,
    continent = "Africa",
    country = "Kenya",
    countryCode = "KE",
    # stateProvince = "",
    county = "Kilifi",
    # municipality = if_else(site == 'Sabaki', "Sabaki", "Mida"),
    locality = transectName,
    # locationRemarks = description,
    # decimalLatitude = latitude,
    # decimalLongitude = longitude,
    geodeticDatum = "WGS84",
    footprintWKT = footprintWKT,
    # georeferencedBy = "Raphaël Nussbaumer",
    # georeferencedDate = "03/06/2020",
    # georeferenceSources = "https://www.geonames.org/ | https://www.google.co.ke/maps/",
    # georeferenceVerificationStatus = "verified by curator",
    # georeferenceRemarks = "",
    dynamicProperties = paste0('{"gap": "', gap_min, '"}') # recording gap as a valid JSON object
  )
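As a worked example of sampleSizeValue, mirroring the difftime() call above with made-up times and a 10-minute gap (57 minutes elapsed minus 10 minutes of gap gives 47 minutes of effective survey effort):
difftime(parse_time("07:02"), parse_time("06:05"), units = "mins") - 10
## Time difference of 47 mins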
Export occurrence table
occurrences <- dCleaned %>%
  transmute(
# Occurrence
occurrenceID = recordID,
basisOfRecord = "HumanObservation",
occurrenceRemarks = note,
occurrenceStatus = "present",
# Location
decimalLatitude = lat,
decimalLongitude = lng,
georeferenceVerificationStatus = georeferenceVerificationStatus,
georeferenceRemarks = georeferenceRemarks,
# Taxon
taxonID = `Clements--code`,
taxonRank = `Clements--rank`,
kingdom = "Animalia",
phylum = "Chordata",
class = "Aves",
family = family_scientific,
# scientificNameAuthorship = "",
scientificName = scientific_name,
vernacularName = common_name,
identificationVerificationStatus = identificationVerificationStatus,
identificationRemarks = identificationRemarks,
# Others
eventID = surveyID,
# parentEventID = sessionID,
recordedBy = recordedBy,
)
Write to CSV
write.csv(events, file = paste(project.root,"data/events.csv",sep="/"), na = "", row.names = FALSE, fileEncoding = "UTF-8")
write.csv(occurrences, file = paste(project.root,"data/occurrences.csv",sep="/"), na = "", row.names = FALSE, fileEncoding = "UTF-8")