libs/qm-dsp/dsp/tempotracking/DownBeat.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308

/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */

/*
    QM DSP Library

    Centre for Digital Music, Queen Mary, University of London.
    This file copyright 2008-2009 Matthew Davies and QMUL.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.  See the file
    COPYING included with this distribution for more information.
*/

#include "DownBeat.h"

#include "maths/MathAliases.h"
#include "maths/MathUtilities.h"
#include "maths/KLDivergence.h"
#include "dsp/transforms/FFT.h"

#include <iostream>
#include <cstdlib>
#include <new>

// Construct a downbeat tracker.
//
//   originalSampleRate - sample rate of the audio before decimation
//   decimationFactor   - factor by which pushed audio is decimated
//   dfIncrement        - number of input samples in each pushed block
//
// The decimators and the audio buffer start out null and are created
// on demand elsewhere; only the beat frame and FFT workspace are
// allocated here.
DownBeat::DownBeat(float originalSampleRate,
                   size_t decimationFactor,
                   size_t dfIncrement) :
    m_bpb(0),
    m_rate(originalSampleRate),
    m_factor(decimationFactor),
    m_increment(dfIncrement),
    m_decimator1(0),
    m_decimator2(0),
    m_buffer(0),
    m_decbuf(0),
    m_bufsiz(0),
    m_buffill(0),
    m_beatframesize(0),
    m_beatframe(0)
{
    // The beat frame must hold 1.3 seconds of audio at the
    // downsampled rate, rounded up to a power of two for the FFT.
    // (This comes out as 4096 for 44100 or 48000 Hz input at 16x
    // decimation, which is the expected normal situation.)
    const int minimumFrameLength =
        static_cast<int>((m_rate / decimationFactor) * 1.3);
    m_beatframesize = MathUtilities::nextPowerOfTwo(minimumFrameLength);

    // Time-domain frame plus the real and imaginary FFT outputs,
    // all of the same length
    m_beatframe = new double[m_beatframesize];
    m_fftRealOut = new double[m_beatframesize];
    m_fftImagOut = new double[m_beatframesize];

    m_fft = new FFTReal(m_beatframesize);
}

// Release everything the tracker owns.
DownBeat::~DownBeat()
{
    // Decimation stages and their intermediate buffer (any of these
    // may still be null; delete on a null pointer is harmless)
    delete m_decimator1;
    delete m_decimator2;
    delete[] m_decbuf;

    // The audio buffer is obtained with malloc/realloc, so it must be
    // returned with free(); free() ignores a null pointer
    free(m_buffer);

    // Beat frame and FFT workspace allocated by the constructor
    delete[] m_beatframe;
    delete[] m_fftRealOut;
    delete[] m_fftImagOut;
    delete m_fft;
}

void
DownBeat::setBeatsPerBar(int bpb)
{
    m_bpb = bpb;
}

void
DownBeat::makeDecimators()
{
//    std::cerr << "m_factor = " << m_factor << std::endl;
    if (m_factor < 2) return;
    size_t highest = Decimator::getHighestSupportedFactor();
    if (m_factor <= highest) {
        m_decimator1 = new Decimator(m_increment, m_factor);
//        std::cerr << "DownBeat: decimator 1 factor " << m_factor << ", size " << m_increment << std::endl;
        return;
    }
    m_decimator1 = new Decimator(m_increment, highest);
//    std::cerr << "DownBeat: decimator 1 factor " << highest << ", size " << m_increment << std::endl;
    m_decimator2 = new Decimator(m_increment / highest, m_factor / highest);
//    std::cerr << "DownBeat: decimator 2 factor " << m_factor / highest << ", size " << m_increment / highest << std::endl;
    m_decbuf = new float[m_increment / highest];
}

// Decimate one block of input audio and append the result to the
// internal audio buffer, growing that buffer when it is too small.
//
//   audio - the input block; when no decimation is applied,
//           m_increment samples are read from it
//
// Each call adds m_increment / m_factor samples to the buffer.
// Throws std::bad_alloc if the buffer cannot be grown; in that case
// the audio buffered so far is left untouched.
void
DownBeat::pushAudioBlock(const float *audio)
{
    // Number of samples this block contributes after decimation
    const size_t produced = m_increment / m_factor;

    if (m_buffill + produced > m_bufsiz) {
        // Start with room for 16 undecimated blocks, then double
        const size_t grownSize =
            (m_bufsiz == 0) ? m_increment * 16 : m_bufsiz * 2;

        // realloc() on a null pointer behaves like malloc(), so one
        // call covers both the first allocation and later growth.
        // The result goes through a temporary: assigning it straight
        // to m_buffer would lose (leak) the existing buffer and leave
        // a null pointer to be written through if the call failed.
        float *grown = static_cast<float *>
            (realloc(m_buffer, grownSize * sizeof(float)));
        if (!grown) {
            throw std::bad_alloc();
        }
        m_buffer = grown;
        m_bufsiz = grownSize;
    }

    // Decimators are created on first use
    if (!m_decimator1 && m_factor > 1) makeDecimators();

    float *const dest = m_buffer + m_buffill;

    if (m_decimator2) {
        // Two-stage decimation via the intermediate buffer
        m_decimator1->process(audio, m_decbuf);
        m_decimator2->process(m_decbuf, dest);
    } else if (m_decimator1) {
        m_decimator1->process(audio, dest);
    } else {
        // just copy across (m_factor is presumably 1)
        for (size_t i = 0; i < m_increment; ++i) {
            dest[i] = audio[i];
        }
    }

    m_buffill += produced;
}

// Give access to the decimated audio accumulated so far.
//
//   length - set to the number of valid samples in the buffer
//
// Returns the buffer itself, which remains owned by this object.
// The pointer is null when no buffer is currently allocated.
const float *
DownBeat::getBufferedAudio(size_t &length) const
{
    const float *const buffered = m_buffer;
    length = m_buffill;
    return buffered;
}

// Discard all buffered audio and release the buffer, returning the
// buffer state to what it was immediately after construction.
void
DownBeat::resetAudioBuffer()
{
    // free() does nothing when handed a null pointer, so no check is
    // needed before releasing
    free(m_buffer);

    m_buffer = 0;
    m_bufsiz = 0;
    m_buffill = 0;
}

// Estimate which of the given beats are downbeats.
//
//   audio       - decimated audio, as accumulated by pushAudioBlock()
//   audioLength - number of samples available in audio
//   beats       - beat positions; each is scaled by
//                 m_increment / m_factor to obtain an index into audio
//   downbeats   - receives, appended to its existing contents, the
//                 indices into beats of the estimated downbeats
//
// The audio is partitioned into beat segments (normally downsampled
// by a factor of 16, fs ~= 2700Hz), a windowed magnitude spectrum is
// taken for each, and the Jensen-Shannon divergence between
// consecutive beat spectra is stored in m_beatsd.  The beat position
// within the bar showing the greatest average spectral change is
// taken as the downbeat.
//
// Implementation (mostly) follows:
//   Davies and Plumbley, "A spectral difference approach to
//   extracting downbeats in musical audio", EUSIPCO 2006, Florence,
//   Italy
//
// If audioLength is 0, m_beatsd is cleared and nothing else happens.
void
DownBeat::findDownBeats(const float *audio,
                        size_t audioLength,
                        const d_vec_t &beats,
                        i_vec_t &downbeats)
{
    const size_t specSize = m_beatframesize / 2;

    d_vec_t newspec(specSize); // magnitude spectrum of current beat
    d_vec_t oldspec(specSize); // magnitude spectrum of previous beat

    m_beatsd.clear();

    if (audioLength == 0) return;

    for (size_t i = 0; i + 1 < beats.size(); ++i) {

        // Work out the extents of the current beat within the
        // downsampled audio, clamped to the audio actually available

        size_t beatstart = (beats[i] * m_increment) / m_factor;
        size_t beatend = (beats[i+1] * m_increment) / m_factor;
        if (beatend >= audioLength) beatend = audioLength - 1;
        if (beatend < beatstart) beatend = beatstart;
        size_t beatlen = beatend - beatstart;

        // Copy the beat into the beat frame buffer, applying a
        // Hanning window sized to the beat extents rather than the
        // frame size.  (Because the size varies, it's easier to do
        // this by hand than use our Window abstraction.)  A beat
        // longer than the frame is truncated.

        size_t copylen = beatlen;
        if (copylen > m_beatframesize) copylen = m_beatframesize;

        for (size_t j = 0; j < copylen; ++j) {
            double mul = 0.5 * (1.0 - cos(TWO_PI * (double(j) / double(beatlen))));
            m_beatframe[j] = audio[beatstart + j] * mul;
        }

        // Zero-pad the rest of the frame

        for (size_t j = copylen; j < m_beatframesize; ++j) {
            m_beatframe[j] = 0.0;
        }

        // Now FFT beat frame

        m_fft->process(false, m_beatframe, m_fftRealOut, m_fftImagOut);

        // Calculate magnitudes

        for (size_t j = 0; j < specSize; ++j) {
            newspec[j] = sqrt(m_fftRealOut[j] * m_fftRealOut[j] +
                              m_fftImagOut[j] * m_fftImagOut[j]);
        }

        // Preserve peaks by applying adaptive threshold

        MathUtilities::adaptiveThreshold(newspec);

        // Calculate JS divergence between new and old spectral frames

        if (i > 0) { // otherwise we have no previous frame
            m_beatsd.push_back(measureSpecDiff(oldspec, newspec));
        }

        // The current spectrum becomes the previous one for the next beat

        for (size_t j = 0; j < specSize; ++j) {
            oldspec[j] = newspec[j];
        }
    }

    // We now have all spectral difference measures in m_beatsd

    // Fall back to 4 beats per bar when none has been set.  Negative
    // values are treated the same way: used as a vector size they
    // would wrap to an enormous unsigned count.
    int timesig = m_bpb;
    if (timesig <= 0) timesig = 4;

    d_vec_t dbcand(timesig); // downbeat candidates

    for (int beat = 0; beat < timesig; ++beat) {
        dbcand[beat] = 0;
    }

    // look for beat transition which leads to greatest spectral change
    for (int beat = 0; beat < timesig; ++beat) {
        int count = 0;
        // m_beatsd[k] describes the change from beat k to beat k+1, so
        // the transitions into this bar position start at beat-1
        for (int example = beat-1; example < (int)m_beatsd.size(); example += timesig) {
            if (example < 0) continue;
            dbcand[beat] += (m_beatsd[example]) / timesig;
            ++count;
        }
        if (count > 0) dbcand[beat] /= count;
    }

    // first downbeat is beat at index of maximum value of dbcand
    int dbind = MathUtilities::getMax(dbcand);

    // remaining downbeats are at timesig intervals from the first
    for (int i = dbind; i < (int)beats.size(); i += timesig) {
        downbeats.push_back(i);
    }
}

// Jensen-Shannon divergence between two spectral frames.
//
// Only the lowest bins take part: at most 512, and never more than a
// quarter of the length of oldspec.  Both spectra are offset by EPS
// and normalised to sum to one over those bins before the divergence
// is accumulated.  The arguments are taken by value, so the caller's
// spectra are not altered by the EPS offset applied here.
double
DownBeat::measureSpecDiff(d_vec_t oldspec, d_vec_t newspec)
{
    unsigned int bins = 512;
    if (oldspec.size()/4 < bins) {
        bins = oldspec.size()/4;
    }

    // First pass: offset every bin by EPS and total each spectrum

    double totalNew = 0.;
    double totalOld = 0.;

    for (unsigned int k = 0; k < bins; ++k) {
        newspec[k] += EPS;
        oldspec[k] += EPS;

        totalNew += newspec[k];
        totalOld += oldspec[k];
    }

    // Second pass: normalise and accumulate the divergence

    double divergence = 0.;

    for (unsigned int k = 0; k < bins; ++k) {

        double q = newspec[k] / totalNew;
        double p = oldspec[k] / totalOld;

        // If any spectral values are 0 (there shouldn't be any!) use
        // 1 instead, so that the logarithms below stay finite
        if (q == 0) q = 1.;
        if (p == 0) p = 1.;

        // Jensen-Shannon calculation, with the terms added in a fixed
        // left-to-right order
        const double mix = 0.5*p + 0.5*q;
        divergence = divergence + (-mix*log(mix)) + (0.5*(p*log(p))) + (0.5*(q*log(q)));
    }

    return divergence;
}

// Append the per-beat spectral difference values (m_beatsd) to
// beatsd.  Anything already in beatsd is kept.
void
DownBeat::getBeatSD(vector<double> &beatsd) const
{
    beatsd.insert(beatsd.end(), m_beatsd.begin(), m_beatsd.end());
}