voiceThis is a vignette presenting the voice
approach to Kaggle
BirdCLEF 2023 competition.
birdclef2023
contains summarized data from Kaggle
BirdCLEF 2023 competition audios. The summarization was made using
voice,
an R library with functions to easily deal with audio.
# install packs
install.packages('voice', dep = TRUE)
devtools::install_github('filipezabala/birdclef2023')The libraries are called, geti and gaussian
functions are defined.
# packs
library(voice)
library(tidyverse)
library(birdclef2023)
# geti function
geti <- function(x,i){x[i]}
# gaussian function
gaussian <- function(x, mu = 0, sigma = 1){
exp(-((x-mu)/sigma)^2)
}In this section is presented how to download, extract and summarize
the bird’s audio features. As the results are available at birdclef2023,
this section may be ommited in a first reading.
The data may be downloaded by the official Kaggle API. The zip file occupies around 5.3GB on disk.
url <- 'https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/44224/5188730/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1685615147&Signature=QeSfA1%2Bumjv4ND27r%2F7ln3XzHk%2F52GDgNq3EDtphEAKxQUkypPQHaIVCxlLFAnhfqtTM3V4mny0KtttbCXgseNaweq%2Bx1f1TjtY6DEjP%2FksG%2B%2BX0TiaLgl06xPhh1SZ%2FiZcCaEtESL3SKAZ9Nq5%2FbSoNKTzghgPP2ET4ncUxc5UeaqR6%2BVTtSWLDCEe%2FEzPHOT64gaF9w4gAnqZjkl7GFthY034mXaBSULWl1Ul6K55vX%2FJfXbaxDzrJbVB1HlU0eax3MZcnyy9msHSwWEsH8N08pSYtWJZA4GmPoGUn5YrNRr5%2Bc7YYVKWKKPxmH%2FG7%2Fw7Y3gj7vd7etLLBzcjRBA%3D%3D&response-content-disposition=attachment%3B+filename%3Dbirdclef-2023.zip'
options(timeout = 9999)
download.file(url = url, '~/Downloads/birdclef-2023.zip', mode = 'wd')Taking a look.
head(unzip('~/Downloads/birdclef-2023.zip', list = TRUE))
#> Name Length Date
#> 1 eBird_Taxonomy_v2021.csv 4294967295 2023-03-06 18:35:00
#> 2 sample_submission.csv 4294967295 2023-03-06 18:35:00
#> 3 test_soundscapes/soundscape_29201.ogg 4294967295 2023-03-06 18:35:00
#> 4 train_audio/abethr1/XC128013.ogg 4294967295 2023-03-06 18:35:00
#> 5 train_audio/abethr1/XC363501.ogg 4294967295 2023-03-06 18:35:00
#> 6 train_audio/abethr1/XC363502.ogg 4294967295 2023-03-06 18:35:00
nrow(unzip('~/Downloads/birdclef-2023.zip', list = TRUE))
#> [1] 16945Unzipping.
unzip('~/Downloads/birdclef-2023.zip',
exdir = tempdir(),
unzip = '/usr/bin/unzip')The .ogg files are converted using ffmpeg, “[a] complete,
cross-platform solution to record, convert and stream audio and video”.
The tagging procedure was performed by voice::tag using
parallel processing. In order to save memory, the ogg to wav conversion,
tagging and the wav files deletion are executed in the same routine. The
processing took around 6 hours on a local machine with 12 CPUs, and the
output is available in birdclef2023::E_train.
# ogg files
oggFiles <- list.files(paste0(tempdir(), '/train_audio'),
pattern = '.[Oo][Gg][Gg]$',
full.names = TRUE, recursive = TRUE)
length(oggFiles)
# ogg directories
oggDirs <- unique(dirname(oggFiles))
length(oggDirs)
# wav files
new_pth <- paste0(tempdir(), '/wav/')
old_pth <- paste0(tempdir(), '/train_audio/')
wavFiles <- sub(old_pth, new_pth, oggFiles)
wavFiles <- sub('.ogg$', '.wav', wavFiles)
# wav directories
wavDirs <- unique(dirname(wavFiles))
# extended dataset
n <- length(oggDirs)
E_list <- vector('list', n)
names(E_list) <- basename(oggDirs)
# converting ogg to wav, tagging grouping by wav_path, deleting wav
for(i in 1:n){
# garbage collector
gc()
# creating directories
dir.create((wavDirs[i]), recursive = TRUE)
# filtering by species_code
oggTemp <- dir(oggDirs[i], full.names = TRUE)
fltr <- grep(basename(oggDirs[i]), wavFiles)
wavTemp <- wavFiles[fltr]
# converting ogg to wav
for(j in 1:length(oggTemp)){
cmd <- paste0("ffmpeg -i ", oggTemp[j], " -ac 1 ", wavTemp[j])
system(cmd)
}
# Extended dataset
E <- dplyr::tibble(species_code = basename(oggDirs[i]),
wav_path = wavTemp)
# tagging
E_list[[i]] <- voice::tag(E, groupBy = 'species_code',
features = c('f0', 'fmt', 'rf', 'rpf', 'rcf',
'mfcc', 'zcr', 'rms', 'gain', 'rfc'))
# binding
E <- dplyr::bind_rows(E_list)
write.csv(E, row.names = FALSE,
file = gzfile(paste0(tempdir(), '/E_train.csv.gz')))
# deleting wav files
unlink(wavDirs[i], recursive = TRUE)
# progress
print(i/n)
}Converting the test file from ogg to wav format.
# converting test data
oggFile <- paste0(tempdir(), '/test_soundscapes/soundscape_29201.ogg')
wavFile <- paste0(tempdir(), '/test_soundscapes/soundscape_29201.wav')
cmd <- paste0("ffmpeg -y -i ", oggFile, " -ac 1 ", wavFile)
system(cmd)Splitting the 10 minutes file using voice::splitw. The
procedure took around 25 seconds.
# rttm
n <- 10*60/5
rttm <- dplyr::tibble(type = rep('SPEAKER', n),
file = rep(NA, n),
chnl = rep(1, n),
tbeg = seq(0,595,5),
tdur = rep(5,n),
ortho = rep(NA, n),
stype = rep(NA, n),
name = rep('A',n),
conf = rep(NA,n),
slat = rep(NA,n))
write.table(rttm, paste0(tempdir(), '/split5sec.rttm'),
quote = FALSE, row.names = FALSE, col.names = FALSE)
# splitting
splitDir <- paste0(tempdir(), '/split')
dir.create(splitDir)
voice::splitw(wavFile,
fromRttm = paste0(tempdir(), '/split5sec.rttm'),
toSplit = splitDir)
# renaming
fr <- dir(splitDir, full.names = TRUE)
row_id_temp <- paste0('soundscape_29201_', seq(5,600,5), '.wav')
to <- paste0(dirname(fr), '/', row_id_temp)
file.rename(fr,to)Finally the tagging is applied to the test data. The procedure took
less than 1 minute to run, and the output is available in
birdclef2023::E_test.
# Extended dataset
row_id_raw <- strsplit(row_id_temp, '.wav$')
row_id_raw <- sapply(row_id_raw, geti, 1)
E <- dplyr::tibble(row_id = row_id_raw,
wav_path = to)
# tagging grouping by row_id
tst <- voice::tag(E, groupBy = 'row_id',
features = c('f0', 'fmt', 'rf', 'rpf', 'rcf',
'mfcc', 'zcr', 'rms', 'gain', 'rfc'),
sortByGroupBy = FALSE,
mc.cores = parallel::detectCores())
# writing
write.csv(E, row.names = FALSE,
file = gzfile(paste0(tempdir(), '/E_test.csv.gz')))The birdclef2023::E_train object contains 264 rows and
486 columns extracted via voice::tag from Kaggle
BirdCLEF2023 train_audio directory.
birdclef2023::E_train
#> # A tibble: 264 × 487
#> species_code f0_tag_mean f1_tag_mean f2_tag_mean f3_tag_mean f4_tag_mean
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 abethr1 80.0 1125. 1860. 2701. 3564.
#> 2 abhori1 122. 860. 1686. 2627. 3543.
#> 3 abythr1 90.3 1085. 1818. 2624. 3462.
#> 4 afbfly1 104. 1159. 1866. 2753. 3621.
#> 5 afdfly1 108. 1289. 2091. 2942. 3791.
#> 6 afecuc1 115. 978. 1801. 2654. 3559.
#> 7 affeag1 169. 934. 1588. 2566. 3486.
#> 8 afgfly1 95.8 869. 1736. 2677. 3550.
#> 9 afghor1 133. 1003. 1909. 2602. 3549.
#> 10 afmdov1 397. 636. 1577. 2558. 3492.
#> # ℹ 254 more rows
#> # ℹ 481 more variables: f5_tag_mean <dbl>, f6_tag_mean <dbl>,
#> # f7_tag_mean <dbl>, f8_tag_mean <dbl>, zcr1_tag_mean <dbl>,
#> # rms_tag_mean <dbl>, gain_tag_mean <dbl>, rfc1_tag_mean <dbl>,
#> # rfc2_tag_mean <dbl>, rfc3_tag_mean <dbl>, rfc4_tag_mean <dbl>,
#> # rfc5_tag_mean <dbl>, rfc6_tag_mean <dbl>, rfc7_tag_mean <dbl>,
#> # rfc8_tag_mean <dbl>, rfc9_tag_mean <dbl>, rfc10_tag_mean <dbl>, …The birdclef2023::E_test object contains 120 rows and
486 columns extracted via voice::tag from Kaggle
BirdCLEF2023 test_soundscapes directory. Each row is
associated with a 5 seconds section of soundscape_29201.ogg
file.
birdclef2023::E_test
#> # A tibble: 120 × 487
#> row_id f0_tag_mean f1_tag_mean f2_tag_mean f3_tag_mean f4_tag_mean
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 soundscape_29201… 75.6 1488. 1855. 2868. 3833.
#> 2 soundscape_29201… NA 1466. 1809. 2817. 3800.
#> 3 soundscape_29201… 65.3 1464. 1889. 2927. 3878.
#> 4 soundscape_29201… 71.8 1332. 1768. 2797. 3759.
#> 5 soundscape_29201… 56.9 1349. 1787. 2806. 3796.
#> 6 soundscape_29201… 79.0 1452. 1877. 2742. 3766.
#> 7 soundscape_29201… NA 1466. 1816. 2819. 3807.
#> 8 soundscape_29201… 78.4 1351. 1824. 2765. 3741.
#> 9 soundscape_29201… 74.8 1421. 1813. 2819. 3773.
#> 10 soundscape_29201… 66.6 1376. 1798. 2830. 3781.
#> # ℹ 110 more rows
#> # ℹ 481 more variables: f5_tag_mean <dbl>, f6_tag_mean <dbl>,
#> # f7_tag_mean <dbl>, f8_tag_mean <dbl>, zcr1_tag_mean <dbl>,
#> # rms_tag_mean <dbl>, gain_tag_mean <dbl>, rfc1_tag_mean <dbl>,
#> # rfc2_tag_mean <dbl>, rfc3_tag_mean <dbl>, rfc4_tag_mean <dbl>,
#> # rfc5_tag_mean <dbl>, rfc6_tag_mean <dbl>, rfc7_tag_mean <dbl>,
#> # rfc8_tag_mean <dbl>, rfc9_tag_mean <dbl>, rfc10_tag_mean <dbl>, …The basic idea is to compare each row from E_test (test
sample) with each row from E_train (reference values). The
comparison is made through a metric derived from the Euclidean distance,
taking the average of the valid values (not NA). We call
Average Length and it is given by \[\begin{equation}
AL=\sqrt{\frac{\sum_{i=1}^{n-n_{NA}} (x_i-y_i)^2}{n-n_{NA}}}
\end{equation}\]
# distances
D <- matrix(NA, nrow = nrow(birdclef2023::E_test),
ncol = nrow(birdclef2023::E_train))
rownames(D) <- birdclef2023::E_test$row_id
colnames(D) <- birdclef2023::E_train$species_codeThe following code is completly not optimized, designed to be
intuitive and run below the 2 hours limit defined in the Kaggle
BirdCLEF 2023 competition rules. Takes around 30 minutes to run on a
local machine, and the output is available in
birdclef2023::D.
# calculating distances
k <- 0
for(i in 1:nrow(birdclef2023::E_train)){
for(j in 1:nrow(birdclef2023::E_test)){
k <- k+1
d2 <- (birdclef2023::E_test[i,-1] - birdclef2023::E_train[j,-1])^2
n <- length(d2)
nNA <- sum(is.na(d2))
D[i,j] <- sqrt(sum(d2, na.rm = TRUE)/(n-nNA))
print(k/(nrow(birdclef2023::E_test)*nrow(birdclef2023::E_train)))
}
}The probabilities are calculated considering the Gaussian function given by \[\begin{equation} g(x)=e^{-\left(\frac{x-\mu}{\sigma}\right)^2} \end{equation}\] \(x\) represents the scaled distances, and the parameters used were \(\mu=0\) and \(\sigma=0.001\). The idea is to give more weight to distance values very close to zero, and quickly reduce the weight to distances that move away from zero, reason why the standard deviation is small (adjusted to 0.001).
# probabilities matrix
P <- matrix(NA, nrow = nrow(birdclef2023::E_test),
ncol = nrow(birdclef2023::E_train))
rownames(P) <- birdclef2023::E_test$row_id
colnames(P) <- birdclef2023::E_train$species_code
# calculating probabilities
for(i in 1:nrow(P)){
w <- gaussian(scale(birdclef2023::D[i,]),
mu = 0, sigma = 0.001)
P[i,] <- w/sum(w)
}After that, there is some cleaning and tidying up.
block <- strsplit(rownames(P), '_')
block <- as.numeric(sapply(block, geti, 3))
submission <- tibble(block = block,
row_id = rownames(P),
as_tibble(P))
submission <- submission %>%
arrange(block) %>%
select(-block)
submission
#> # A tibble: 120 × 265
#> row_id abethr1 abhori1 abythr1 afbfly1 afdfly1 afecuc1 affeag1 afgfly1
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 soundscape… 0 0 0 0 0 0 0 0
#> 2 soundscape… 0 0 0 0 0 0 0 0
#> 3 soundscape… 0 0 0 0 0 0 0 0
#> 4 soundscape… 0 0 0 0 0 0 2.96e-187 0
#> 5 soundscape… 0 0 0 0 0 0 0 0
#> 6 soundscape… 0 0 0 0 0 0 0 0
#> 7 soundscape… 0 0 0 0 0 0 0 0
#> 8 soundscape… 0 0 0 0 0 0 0 0
#> 9 soundscape… 0 0 0 0 0 0 0 0
#> 10 soundscape… 0 0 0 0 0 0 0 0
#> # ℹ 110 more rows
#> # ℹ 256 more variables: afghor1 <dbl>, afmdov1 <dbl>, afpfly1 <dbl>,
#> # afpkin1 <dbl>, afpwag1 <dbl>, afrgos1 <dbl>, afrgrp1 <dbl>, afrjac1 <dbl>,
#> # afrthr1 <dbl>, amesun2 <dbl>, augbuz1 <dbl>, bagwea1 <dbl>, barswa <dbl>,
#> # bawhor2 <dbl>, bawman1 <dbl>, bcbeat1 <dbl>, beasun2 <dbl>, bkctch1 <dbl>,
#> # bkfruw1 <dbl>, blacra1 <dbl>, blacuc1 <dbl>, blakit1 <dbl>, blaplo1 <dbl>,
#> # blbpuf2 <dbl>, blcapa2 <dbl>, blfbus1 <dbl>, blhgon1 <dbl>, …Finally, the submission.csv file is written in the
desired directory.
write.csv(submission,
file = paste0(tempdir(), '/submission.csv'))