BirdCLEF 2023 via voice

Filipe J. Zabala

2023-05-29 15:35:19

0. Context

This is a vignette presenting the voice approach to Kaggle BirdCLEF 2023 competition.

1. Libraries and functions

1.1 Installing

birdclef2023 contains summarized data from Kaggle BirdCLEF 2023 competition audios. The summarization was made using voice, an R library with functions to easily deal with audio.

# install packs
install.packages('voice', dep = TRUE)
devtools::install_github('filipezabala/birdclef2023')

1.2 Calling

The libraries are called, geti and gaussian functions are defined.

# packs
library(voice)
library(tidyverse)
library(birdclef2023)

# geti function
geti <- function(x,i){x[i]}

# gaussian function
gaussian <- function(x, mu = 0, sigma = 1){
  exp(-((x-mu)/sigma)^2)
}

2. Data

In this section is presented how to download, extract and summarize the bird’s audio features. As the results are available at birdclef2023, this section may be ommited in a first reading.

2.1 Downloading

The data may be downloaded by the official Kaggle API. The zip file occupies around 5.3GB on disk.

url <- 'https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/44224/5188730/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1685615147&Signature=QeSfA1%2Bumjv4ND27r%2F7ln3XzHk%2F52GDgNq3EDtphEAKxQUkypPQHaIVCxlLFAnhfqtTM3V4mny0KtttbCXgseNaweq%2Bx1f1TjtY6DEjP%2FksG%2B%2BX0TiaLgl06xPhh1SZ%2FiZcCaEtESL3SKAZ9Nq5%2FbSoNKTzghgPP2ET4ncUxc5UeaqR6%2BVTtSWLDCEe%2FEzPHOT64gaF9w4gAnqZjkl7GFthY034mXaBSULWl1Ul6K55vX%2FJfXbaxDzrJbVB1HlU0eax3MZcnyy9msHSwWEsH8N08pSYtWJZA4GmPoGUn5YrNRr5%2Bc7YYVKWKKPxmH%2FG7%2Fw7Y3gj7vd7etLLBzcjRBA%3D%3D&response-content-disposition=attachment%3B+filename%3Dbirdclef-2023.zip'
options(timeout = 9999)
download.file(url = url, '~/Downloads/birdclef-2023.zip', mode = 'wd')

2.2 Extracting

Taking a look.

head(unzip('~/Downloads/birdclef-2023.zip', list = TRUE))
#>                                    Name     Length                Date
#> 1              eBird_Taxonomy_v2021.csv 4294967295 2023-03-06 18:35:00
#> 2                 sample_submission.csv 4294967295 2023-03-06 18:35:00
#> 3 test_soundscapes/soundscape_29201.ogg 4294967295 2023-03-06 18:35:00
#> 4      train_audio/abethr1/XC128013.ogg 4294967295 2023-03-06 18:35:00
#> 5      train_audio/abethr1/XC363501.ogg 4294967295 2023-03-06 18:35:00
#> 6      train_audio/abethr1/XC363502.ogg 4294967295 2023-03-06 18:35:00
nrow(unzip('~/Downloads/birdclef-2023.zip', list = TRUE))
#> [1] 16945

Unzipping.

unzip('~/Downloads/birdclef-2023.zip',
      exdir = tempdir(),
      unzip = '/usr/bin/unzip')

2.3 Train

The .ogg files are converted using ffmpeg, “[a] complete, cross-platform solution to record, convert and stream audio and video”. The tagging procedure was performed by voice::tag using parallel processing. In order to save memory, the ogg to wav conversion, tagging and the wav files deletion are executed in the same routine. The processing took around 6 hours on a local machine with 12 CPUs, and the output is available in birdclef2023::E_train.

# ogg files
oggFiles <- list.files(paste0(tempdir(), '/train_audio'), 
                       pattern = '.[Oo][Gg][Gg]$', 
                       full.names = TRUE, recursive = TRUE)
length(oggFiles)

# ogg directories
oggDirs <- unique(dirname(oggFiles))
length(oggDirs)

# wav files
new_pth <- paste0(tempdir(), '/wav/')
old_pth <- paste0(tempdir(), '/train_audio/')
wavFiles <- sub(old_pth, new_pth, oggFiles)
wavFiles <- sub('.ogg$', '.wav', wavFiles)

# wav directories
wavDirs <- unique(dirname(wavFiles))

# extended dataset
n <- length(oggDirs)
E_list <- vector('list', n)
names(E_list) <- basename(oggDirs)

# converting ogg to wav, tagging grouping by wav_path, deleting wav
for(i in 1:n){
  # garbage collector
  gc()
  
  # creating directories
  dir.create((wavDirs[i]), recursive = TRUE)
  
  # filtering by species_code
  oggTemp <- dir(oggDirs[i], full.names = TRUE)
  fltr <- grep(basename(oggDirs[i]), wavFiles)
  wavTemp <- wavFiles[fltr]
  
  # converting ogg to wav
  for(j in 1:length(oggTemp)){
    cmd <- paste0("ffmpeg -i ", oggTemp[j], " -ac 1 ", wavTemp[j])
    system(cmd)
  }
  
  # Extended dataset
  E <- dplyr::tibble(species_code = basename(oggDirs[i]),
                     wav_path = wavTemp)  
  
  # tagging
  E_list[[i]] <- voice::tag(E, groupBy = 'species_code',
                            features = c('f0', 'fmt', 'rf', 'rpf', 'rcf',
                                         'mfcc', 'zcr', 'rms', 'gain', 'rfc'))
  
  # binding
  E <- dplyr::bind_rows(E_list)
  write.csv(E, row.names = FALSE,
            file = gzfile(paste0(tempdir(), '/E_train.csv.gz')))
  
  # deleting wav files
  unlink(wavDirs[i], recursive = TRUE)
  
  # progress
  print(i/n)
}

2.4 Test

Converting the test file from ogg to wav format.

# converting test data
oggFile <- paste0(tempdir(), '/test_soundscapes/soundscape_29201.ogg')
wavFile <- paste0(tempdir(), '/test_soundscapes/soundscape_29201.wav')
cmd <- paste0("ffmpeg -y -i ", oggFile, " -ac 1 ", wavFile)
system(cmd)

Splitting the 10 minutes file using voice::splitw. The procedure took around 25 seconds.

# rttm
n <- 10*60/5
rttm <- dplyr::tibble(type = rep('SPEAKER', n),
                      file = rep(NA, n),
                      chnl = rep(1, n),
                      tbeg = seq(0,595,5),
                      tdur = rep(5,n),
                      ortho = rep(NA, n),
                      stype = rep(NA, n),
                      name = rep('A',n),
                      conf = rep(NA,n),
                      slat = rep(NA,n))
write.table(rttm, paste0(tempdir(), '/split5sec.rttm'),
            quote = FALSE, row.names = FALSE, col.names = FALSE)

# splitting
splitDir <- paste0(tempdir(), '/split')
dir.create(splitDir)
voice::splitw(wavFile, 
              fromRttm = paste0(tempdir(), '/split5sec.rttm'),
              toSplit = splitDir)

# renaming
fr <- dir(splitDir, full.names = TRUE)
row_id_temp <- paste0('soundscape_29201_', seq(5,600,5), '.wav')
to <- paste0(dirname(fr), '/', row_id_temp)
file.rename(fr,to)

Finally the tagging is applied to the test data. The procedure took less than 1 minute to run, and the output is available in birdclef2023::E_test.

# Extended dataset
row_id_raw <- strsplit(row_id_temp, '.wav$')
row_id_raw <- sapply(row_id_raw, geti, 1)
E <- dplyr::tibble(row_id = row_id_raw,
                   wav_path = to)

# tagging grouping by row_id
tst <- voice::tag(E, groupBy = 'row_id',
                  features = c('f0', 'fmt', 'rf', 'rpf', 'rcf',
                               'mfcc', 'zcr', 'rms', 'gain', 'rfc'),
                  sortByGroupBy = FALSE,
                  mc.cores = parallel::detectCores())

# writing
write.csv(E, row.names = FALSE,
          file = gzfile(paste0(tempdir(), '/E_test.csv.gz')))

3. Modelling

3.1 Objects

The birdclef2023::E_train object contains 264 rows and 486 columns extracted via voice::tag from Kaggle BirdCLEF2023 train_audio directory.

birdclef2023::E_train
#> # A tibble: 264 × 487
#>    species_code f0_tag_mean f1_tag_mean f2_tag_mean f3_tag_mean f4_tag_mean
#>    <chr>              <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
#>  1 abethr1             80.0       1125.       1860.       2701.       3564.
#>  2 abhori1            122.         860.       1686.       2627.       3543.
#>  3 abythr1             90.3       1085.       1818.       2624.       3462.
#>  4 afbfly1            104.        1159.       1866.       2753.       3621.
#>  5 afdfly1            108.        1289.       2091.       2942.       3791.
#>  6 afecuc1            115.         978.       1801.       2654.       3559.
#>  7 affeag1            169.         934.       1588.       2566.       3486.
#>  8 afgfly1             95.8        869.       1736.       2677.       3550.
#>  9 afghor1            133.        1003.       1909.       2602.       3549.
#> 10 afmdov1            397.         636.       1577.       2558.       3492.
#> # ℹ 254 more rows
#> # ℹ 481 more variables: f5_tag_mean <dbl>, f6_tag_mean <dbl>,
#> #   f7_tag_mean <dbl>, f8_tag_mean <dbl>, zcr1_tag_mean <dbl>,
#> #   rms_tag_mean <dbl>, gain_tag_mean <dbl>, rfc1_tag_mean <dbl>,
#> #   rfc2_tag_mean <dbl>, rfc3_tag_mean <dbl>, rfc4_tag_mean <dbl>,
#> #   rfc5_tag_mean <dbl>, rfc6_tag_mean <dbl>, rfc7_tag_mean <dbl>,
#> #   rfc8_tag_mean <dbl>, rfc9_tag_mean <dbl>, rfc10_tag_mean <dbl>, …

The birdclef2023::E_test object contains 120 rows and 486 columns extracted via voice::tag from Kaggle BirdCLEF2023 test_soundscapes directory. Each row is associated with a 5 seconds section of soundscape_29201.ogg file.

birdclef2023::E_test
#> # A tibble: 120 × 487
#>    row_id            f0_tag_mean f1_tag_mean f2_tag_mean f3_tag_mean f4_tag_mean
#>    <chr>                   <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
#>  1 soundscape_29201…        75.6       1488.       1855.       2868.       3833.
#>  2 soundscape_29201…        NA         1466.       1809.       2817.       3800.
#>  3 soundscape_29201…        65.3       1464.       1889.       2927.       3878.
#>  4 soundscape_29201…        71.8       1332.       1768.       2797.       3759.
#>  5 soundscape_29201…        56.9       1349.       1787.       2806.       3796.
#>  6 soundscape_29201…        79.0       1452.       1877.       2742.       3766.
#>  7 soundscape_29201…        NA         1466.       1816.       2819.       3807.
#>  8 soundscape_29201…        78.4       1351.       1824.       2765.       3741.
#>  9 soundscape_29201…        74.8       1421.       1813.       2819.       3773.
#> 10 soundscape_29201…        66.6       1376.       1798.       2830.       3781.
#> # ℹ 110 more rows
#> # ℹ 481 more variables: f5_tag_mean <dbl>, f6_tag_mean <dbl>,
#> #   f7_tag_mean <dbl>, f8_tag_mean <dbl>, zcr1_tag_mean <dbl>,
#> #   rms_tag_mean <dbl>, gain_tag_mean <dbl>, rfc1_tag_mean <dbl>,
#> #   rfc2_tag_mean <dbl>, rfc3_tag_mean <dbl>, rfc4_tag_mean <dbl>,
#> #   rfc5_tag_mean <dbl>, rfc6_tag_mean <dbl>, rfc7_tag_mean <dbl>,
#> #   rfc8_tag_mean <dbl>, rfc9_tag_mean <dbl>, rfc10_tag_mean <dbl>, …

3.2 Distances

The basic idea is to compare each row from E_test (test sample) with each row from E_train (reference values). The comparison is made through a metric derived from the Euclidean distance, taking the average of the valid values (not NA). We call Average Length and it is given by \[\begin{equation} AL=\sqrt{\frac{\sum_{i=1}^{n-n_{NA}} (x_i-y_i)^2}{n-n_{NA}}} \end{equation}\]

# distances
D <- matrix(NA, nrow = nrow(birdclef2023::E_test), 
            ncol = nrow(birdclef2023::E_train))
rownames(D) <- birdclef2023::E_test$row_id
colnames(D) <- birdclef2023::E_train$species_code

The following code is completly not optimized, designed to be intuitive and run below the 2 hours limit defined in the Kaggle BirdCLEF 2023 competition rules. Takes around 30 minutes to run on a local machine, and the output is available in birdclef2023::D.

# calculating distances
k <- 0
for(i in 1:nrow(birdclef2023::E_train)){
  for(j in 1:nrow(birdclef2023::E_test)){
    k <- k+1
    d2 <- (birdclef2023::E_test[i,-1] - birdclef2023::E_train[j,-1])^2
    n <- length(d2)
    nNA <- sum(is.na(d2))
    D[i,j] <- sqrt(sum(d2, na.rm = TRUE)/(n-nNA))
    print(k/(nrow(birdclef2023::E_test)*nrow(birdclef2023::E_train)))
  }
}

The probabilities are calculated considering the Gaussian function given by \[\begin{equation} g(x)=e^{-\left(\frac{x-\mu}{\sigma}\right)^2} \end{equation}\] \(x\) represents the scaled distances, and the parameters used were \(\mu=0\) and \(\sigma=0.001\). The idea is to give more weight to distance values very close to zero, and quickly reduce the weight to distances that move away from zero, reason why the standard deviation is small (adjusted to 0.001).

# probabilities matrix
P <- matrix(NA, nrow = nrow(birdclef2023::E_test), 
            ncol = nrow(birdclef2023::E_train))
rownames(P) <- birdclef2023::E_test$row_id
colnames(P) <- birdclef2023::E_train$species_code

# calculating probabilities
for(i in 1:nrow(P)){
  w <- gaussian(scale(birdclef2023::D[i,]), 
                mu = 0, sigma = 0.001)
  P[i,] <- w/sum(w)
}

After that, there is some cleaning and tidying up.

block <- strsplit(rownames(P), '_')
block <- as.numeric(sapply(block, geti, 3))
submission <- tibble(block = block, 
                     row_id = rownames(P), 
                     as_tibble(P))

submission <- submission %>% 
  arrange(block) %>% 
  select(-block)

submission
#> # A tibble: 120 × 265
#>    row_id      abethr1 abhori1 abythr1 afbfly1 afdfly1 afecuc1   affeag1 afgfly1
#>    <chr>         <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>     <dbl>   <dbl>
#>  1 soundscape…       0       0       0       0       0       0 0               0
#>  2 soundscape…       0       0       0       0       0       0 0               0
#>  3 soundscape…       0       0       0       0       0       0 0               0
#>  4 soundscape…       0       0       0       0       0       0 2.96e-187       0
#>  5 soundscape…       0       0       0       0       0       0 0               0
#>  6 soundscape…       0       0       0       0       0       0 0               0
#>  7 soundscape…       0       0       0       0       0       0 0               0
#>  8 soundscape…       0       0       0       0       0       0 0               0
#>  9 soundscape…       0       0       0       0       0       0 0               0
#> 10 soundscape…       0       0       0       0       0       0 0               0
#> # ℹ 110 more rows
#> # ℹ 256 more variables: afghor1 <dbl>, afmdov1 <dbl>, afpfly1 <dbl>,
#> #   afpkin1 <dbl>, afpwag1 <dbl>, afrgos1 <dbl>, afrgrp1 <dbl>, afrjac1 <dbl>,
#> #   afrthr1 <dbl>, amesun2 <dbl>, augbuz1 <dbl>, bagwea1 <dbl>, barswa <dbl>,
#> #   bawhor2 <dbl>, bawman1 <dbl>, bcbeat1 <dbl>, beasun2 <dbl>, bkctch1 <dbl>,
#> #   bkfruw1 <dbl>, blacra1 <dbl>, blacuc1 <dbl>, blakit1 <dbl>, blaplo1 <dbl>,
#> #   blbpuf2 <dbl>, blcapa2 <dbl>, blfbus1 <dbl>, blhgon1 <dbl>, …

Finally, the submission.csv file is written in the desired directory.

write.csv(submission,
          file = paste0(tempdir(), '/submission.csv'))