voice
This is a vignette presenting the voice
approach to Kaggle
BirdCLEF 2023 competition.
birdclef2023
contains summarized data from Kaggle
BirdCLEF 2023 competition audios. The summarization was made using
voice
,
an R library with functions to easily deal with audio.
# install packs
install.packages('voice', dep = TRUE)
::install_github('filipezabala/birdclef2023') devtools
The libraries are called, geti
and gaussian
functions are defined.
# packs
library(voice)
library(tidyverse)
library(birdclef2023)
# geti function
<- function(x,i){x[i]}
geti
# gaussian function
<- function(x, mu = 0, sigma = 1){
gaussian exp(-((x-mu)/sigma)^2)
}
In this section is presented how to download, extract and summarize
the bird’s audio features. As the results are available at birdclef2023
,
this section may be ommited in a first reading.
The data may be downloaded by the official Kaggle API. The zip file occupies around 5.3GB on disk.
<- 'https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/44224/5188730/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1685615147&Signature=QeSfA1%2Bumjv4ND27r%2F7ln3XzHk%2F52GDgNq3EDtphEAKxQUkypPQHaIVCxlLFAnhfqtTM3V4mny0KtttbCXgseNaweq%2Bx1f1TjtY6DEjP%2FksG%2B%2BX0TiaLgl06xPhh1SZ%2FiZcCaEtESL3SKAZ9Nq5%2FbSoNKTzghgPP2ET4ncUxc5UeaqR6%2BVTtSWLDCEe%2FEzPHOT64gaF9w4gAnqZjkl7GFthY034mXaBSULWl1Ul6K55vX%2FJfXbaxDzrJbVB1HlU0eax3MZcnyy9msHSwWEsH8N08pSYtWJZA4GmPoGUn5YrNRr5%2Bc7YYVKWKKPxmH%2FG7%2Fw7Y3gj7vd7etLLBzcjRBA%3D%3D&response-content-disposition=attachment%3B+filename%3Dbirdclef-2023.zip'
url options(timeout = 9999)
download.file(url = url, '~/Downloads/birdclef-2023.zip', mode = 'wd')
Taking a look.
head(unzip('~/Downloads/birdclef-2023.zip', list = TRUE))
#> Name Length Date
#> 1 eBird_Taxonomy_v2021.csv 4294967295 2023-03-06 18:35:00
#> 2 sample_submission.csv 4294967295 2023-03-06 18:35:00
#> 3 test_soundscapes/soundscape_29201.ogg 4294967295 2023-03-06 18:35:00
#> 4 train_audio/abethr1/XC128013.ogg 4294967295 2023-03-06 18:35:00
#> 5 train_audio/abethr1/XC363501.ogg 4294967295 2023-03-06 18:35:00
#> 6 train_audio/abethr1/XC363502.ogg 4294967295 2023-03-06 18:35:00
nrow(unzip('~/Downloads/birdclef-2023.zip', list = TRUE))
#> [1] 16945
Unzipping.
unzip('~/Downloads/birdclef-2023.zip',
exdir = tempdir(),
unzip = '/usr/bin/unzip')
The .ogg files are converted using ffmpeg
, “[a] complete,
cross-platform solution to record, convert and stream audio and video”.
The tagging procedure was performed by voice::tag
using
parallel processing. In order to save memory, the ogg to wav conversion,
tagging and the wav files deletion are executed in the same routine. The
processing took around 6 hours on a local machine with 12 CPUs, and the
output is available in birdclef2023::E_train
.
# ogg files
<- list.files(paste0(tempdir(), '/train_audio'),
oggFiles pattern = '.[Oo][Gg][Gg]$',
full.names = TRUE, recursive = TRUE)
length(oggFiles)
# ogg directories
<- unique(dirname(oggFiles))
oggDirs length(oggDirs)
# wav files
<- paste0(tempdir(), '/wav/')
new_pth <- paste0(tempdir(), '/train_audio/')
old_pth <- sub(old_pth, new_pth, oggFiles)
wavFiles <- sub('.ogg$', '.wav', wavFiles)
wavFiles
# wav directories
<- unique(dirname(wavFiles))
wavDirs
# extended dataset
<- length(oggDirs)
n <- vector('list', n)
E_list names(E_list) <- basename(oggDirs)
# converting ogg to wav, tagging grouping by wav_path, deleting wav
for(i in 1:n){
# garbage collector
gc()
# creating directories
dir.create((wavDirs[i]), recursive = TRUE)
# filtering by species_code
<- dir(oggDirs[i], full.names = TRUE)
oggTemp <- grep(basename(oggDirs[i]), wavFiles)
fltr <- wavFiles[fltr]
wavTemp
# converting ogg to wav
for(j in 1:length(oggTemp)){
<- paste0("ffmpeg -i ", oggTemp[j], " -ac 1 ", wavTemp[j])
cmd system(cmd)
}
# Extended dataset
<- dplyr::tibble(species_code = basename(oggDirs[i]),
E wav_path = wavTemp)
# tagging
<- voice::tag(E, groupBy = 'species_code',
E_list[[i]] features = c('f0', 'fmt', 'rf', 'rpf', 'rcf',
'mfcc', 'zcr', 'rms', 'gain', 'rfc'))
# binding
<- dplyr::bind_rows(E_list)
E write.csv(E, row.names = FALSE,
file = gzfile(paste0(tempdir(), '/E_train.csv.gz')))
# deleting wav files
unlink(wavDirs[i], recursive = TRUE)
# progress
print(i/n)
}
Converting the test file from ogg to wav format.
# converting test data
<- paste0(tempdir(), '/test_soundscapes/soundscape_29201.ogg')
oggFile <- paste0(tempdir(), '/test_soundscapes/soundscape_29201.wav')
wavFile <- paste0("ffmpeg -y -i ", oggFile, " -ac 1 ", wavFile)
cmd system(cmd)
Splitting the 10 minutes file using voice::splitw
. The
procedure took around 25 seconds.
# rttm
<- 10*60/5
n <- dplyr::tibble(type = rep('SPEAKER', n),
rttm file = rep(NA, n),
chnl = rep(1, n),
tbeg = seq(0,595,5),
tdur = rep(5,n),
ortho = rep(NA, n),
stype = rep(NA, n),
name = rep('A',n),
conf = rep(NA,n),
slat = rep(NA,n))
write.table(rttm, paste0(tempdir(), '/split5sec.rttm'),
quote = FALSE, row.names = FALSE, col.names = FALSE)
# splitting
<- paste0(tempdir(), '/split')
splitDir dir.create(splitDir)
::splitw(wavFile,
voicefromRttm = paste0(tempdir(), '/split5sec.rttm'),
toSplit = splitDir)
# renaming
<- dir(splitDir, full.names = TRUE)
fr <- paste0('soundscape_29201_', seq(5,600,5), '.wav')
row_id_temp <- paste0(dirname(fr), '/', row_id_temp)
to file.rename(fr,to)
Finally the tagging is applied to the test data. The procedure took
less than 1 minute to run, and the output is available in
birdclef2023::E_test
.
# Extended dataset
<- strsplit(row_id_temp, '.wav$')
row_id_raw <- sapply(row_id_raw, geti, 1)
row_id_raw <- dplyr::tibble(row_id = row_id_raw,
E wav_path = to)
# tagging grouping by row_id
<- voice::tag(E, groupBy = 'row_id',
tst features = c('f0', 'fmt', 'rf', 'rpf', 'rcf',
'mfcc', 'zcr', 'rms', 'gain', 'rfc'),
sortByGroupBy = FALSE,
mc.cores = parallel::detectCores())
# writing
write.csv(E, row.names = FALSE,
file = gzfile(paste0(tempdir(), '/E_test.csv.gz')))
The birdclef2023::E_train
object contains 264 rows and
486 columns extracted via voice::tag
from Kaggle
BirdCLEF2023 train_audio
directory.
::E_train
birdclef2023#> # A tibble: 264 × 487
#> species_code f0_tag_mean f1_tag_mean f2_tag_mean f3_tag_mean f4_tag_mean
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 abethr1 80.0 1125. 1860. 2701. 3564.
#> 2 abhori1 122. 860. 1686. 2627. 3543.
#> 3 abythr1 90.3 1085. 1818. 2624. 3462.
#> 4 afbfly1 104. 1159. 1866. 2753. 3621.
#> 5 afdfly1 108. 1289. 2091. 2942. 3791.
#> 6 afecuc1 115. 978. 1801. 2654. 3559.
#> 7 affeag1 169. 934. 1588. 2566. 3486.
#> 8 afgfly1 95.8 869. 1736. 2677. 3550.
#> 9 afghor1 133. 1003. 1909. 2602. 3549.
#> 10 afmdov1 397. 636. 1577. 2558. 3492.
#> # ℹ 254 more rows
#> # ℹ 481 more variables: f5_tag_mean <dbl>, f6_tag_mean <dbl>,
#> # f7_tag_mean <dbl>, f8_tag_mean <dbl>, zcr1_tag_mean <dbl>,
#> # rms_tag_mean <dbl>, gain_tag_mean <dbl>, rfc1_tag_mean <dbl>,
#> # rfc2_tag_mean <dbl>, rfc3_tag_mean <dbl>, rfc4_tag_mean <dbl>,
#> # rfc5_tag_mean <dbl>, rfc6_tag_mean <dbl>, rfc7_tag_mean <dbl>,
#> # rfc8_tag_mean <dbl>, rfc9_tag_mean <dbl>, rfc10_tag_mean <dbl>, …
The birdclef2023::E_test
object contains 120 rows and
486 columns extracted via voice::tag
from Kaggle
BirdCLEF2023 test_soundscapes
directory. Each row is
associated with a 5 seconds section of soundscape_29201.ogg
file.
::E_test
birdclef2023#> # A tibble: 120 × 487
#> row_id f0_tag_mean f1_tag_mean f2_tag_mean f3_tag_mean f4_tag_mean
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 soundscape_29201… 75.6 1488. 1855. 2868. 3833.
#> 2 soundscape_29201… NA 1466. 1809. 2817. 3800.
#> 3 soundscape_29201… 65.3 1464. 1889. 2927. 3878.
#> 4 soundscape_29201… 71.8 1332. 1768. 2797. 3759.
#> 5 soundscape_29201… 56.9 1349. 1787. 2806. 3796.
#> 6 soundscape_29201… 79.0 1452. 1877. 2742. 3766.
#> 7 soundscape_29201… NA 1466. 1816. 2819. 3807.
#> 8 soundscape_29201… 78.4 1351. 1824. 2765. 3741.
#> 9 soundscape_29201… 74.8 1421. 1813. 2819. 3773.
#> 10 soundscape_29201… 66.6 1376. 1798. 2830. 3781.
#> # ℹ 110 more rows
#> # ℹ 481 more variables: f5_tag_mean <dbl>, f6_tag_mean <dbl>,
#> # f7_tag_mean <dbl>, f8_tag_mean <dbl>, zcr1_tag_mean <dbl>,
#> # rms_tag_mean <dbl>, gain_tag_mean <dbl>, rfc1_tag_mean <dbl>,
#> # rfc2_tag_mean <dbl>, rfc3_tag_mean <dbl>, rfc4_tag_mean <dbl>,
#> # rfc5_tag_mean <dbl>, rfc6_tag_mean <dbl>, rfc7_tag_mean <dbl>,
#> # rfc8_tag_mean <dbl>, rfc9_tag_mean <dbl>, rfc10_tag_mean <dbl>, …
The basic idea is to compare each row from E_test
(test
sample) with each row from E_train
(reference values). The
comparison is made through a metric derived from the Euclidean distance,
taking the average of the valid values (not NA). We call
Average Length
and it is given by \[\begin{equation}
AL=\sqrt{\frac{\sum_{i=1}^{n-n_{NA}} (x_i-y_i)^2}{n-n_{NA}}}
\end{equation}\]
# distances
<- matrix(NA, nrow = nrow(birdclef2023::E_test),
D ncol = nrow(birdclef2023::E_train))
rownames(D) <- birdclef2023::E_test$row_id
colnames(D) <- birdclef2023::E_train$species_code
The following code is completly not optimized, designed to be
intuitive and run below the 2 hours limit defined in the Kaggle
BirdCLEF 2023 competition rules. Takes around 30 minutes to run on a
local machine, and the output is available in
birdclef2023::D
.
# calculating distances
<- 0
k for(i in 1:nrow(birdclef2023::E_train)){
for(j in 1:nrow(birdclef2023::E_test)){
<- k+1
k <- (birdclef2023::E_test[i,-1] - birdclef2023::E_train[j,-1])^2
d2 <- length(d2)
n <- sum(is.na(d2))
nNA <- sqrt(sum(d2, na.rm = TRUE)/(n-nNA))
D[i,j] print(k/(nrow(birdclef2023::E_test)*nrow(birdclef2023::E_train)))
} }
The probabilities are calculated considering the Gaussian function given by \[\begin{equation} g(x)=e^{-\left(\frac{x-\mu}{\sigma}\right)^2} \end{equation}\] \(x\) represents the scaled distances, and the parameters used were \(\mu=0\) and \(\sigma=0.001\). The idea is to give more weight to distance values very close to zero, and quickly reduce the weight to distances that move away from zero, reason why the standard deviation is small (adjusted to 0.001).
# probabilities matrix
<- matrix(NA, nrow = nrow(birdclef2023::E_test),
P ncol = nrow(birdclef2023::E_train))
rownames(P) <- birdclef2023::E_test$row_id
colnames(P) <- birdclef2023::E_train$species_code
# calculating probabilities
for(i in 1:nrow(P)){
<- gaussian(scale(birdclef2023::D[i,]),
w mu = 0, sigma = 0.001)
<- w/sum(w)
P[i,] }
After that, there is some cleaning and tidying up.
<- strsplit(rownames(P), '_')
block <- as.numeric(sapply(block, geti, 3))
block <- tibble(block = block,
submission row_id = rownames(P),
as_tibble(P))
<- submission %>%
submission arrange(block) %>%
select(-block)
submission#> # A tibble: 120 × 265
#> row_id abethr1 abhori1 abythr1 afbfly1 afdfly1 afecuc1 affeag1 afgfly1
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 soundscape… 0 0 0 0 0 0 0 0
#> 2 soundscape… 0 0 0 0 0 0 0 0
#> 3 soundscape… 0 0 0 0 0 0 0 0
#> 4 soundscape… 0 0 0 0 0 0 2.96e-187 0
#> 5 soundscape… 0 0 0 0 0 0 0 0
#> 6 soundscape… 0 0 0 0 0 0 0 0
#> 7 soundscape… 0 0 0 0 0 0 0 0
#> 8 soundscape… 0 0 0 0 0 0 0 0
#> 9 soundscape… 0 0 0 0 0 0 0 0
#> 10 soundscape… 0 0 0 0 0 0 0 0
#> # ℹ 110 more rows
#> # ℹ 256 more variables: afghor1 <dbl>, afmdov1 <dbl>, afpfly1 <dbl>,
#> # afpkin1 <dbl>, afpwag1 <dbl>, afrgos1 <dbl>, afrgrp1 <dbl>, afrjac1 <dbl>,
#> # afrthr1 <dbl>, amesun2 <dbl>, augbuz1 <dbl>, bagwea1 <dbl>, barswa <dbl>,
#> # bawhor2 <dbl>, bawman1 <dbl>, bcbeat1 <dbl>, beasun2 <dbl>, bkctch1 <dbl>,
#> # bkfruw1 <dbl>, blacra1 <dbl>, blacuc1 <dbl>, blakit1 <dbl>, blaplo1 <dbl>,
#> # blbpuf2 <dbl>, blcapa2 <dbl>, blfbus1 <dbl>, blhgon1 <dbl>, …
Finally, the submission.csv
file is written in the
desired directory.
write.csv(submission,
file = paste0(tempdir(), '/submission.csv'))