summaryrefslogtreecommitdiff
path: root/libs/vamp-pyin/LocalCandidatePYIN.cpp
diff options
context:
space:
mode:
author Robin Gareus <robin@gareus.org> 2019-09-02 03:12:22 +0200
committer Robin Gareus <robin@gareus.org> 2019-09-02 03:12:22 +0200
commit 63994f3b820c8f0754ff59d0d09585405d87ae0e (patch)
tree 4138d2f4b5d7e7c4ab0f371c08615b5d8fcc7538 /libs/vamp-pyin/LocalCandidatePYIN.cpp
parent 1c8b6e1b4296b4fbabc258f9f94635390a319522 (diff)
Include vamp-pyin
In preparation for captainMorgan's pitch analysis script.
Diffstat (limited to 'libs/vamp-pyin/LocalCandidatePYIN.cpp')
-rw-r--r-- libs/vamp-pyin/LocalCandidatePYIN.cpp 499
1 files changed, 499 insertions, 0 deletions
diff --git a/libs/vamp-pyin/LocalCandidatePYIN.cpp b/libs/vamp-pyin/LocalCandidatePYIN.cpp
new file mode 100644
index 0000000000..3d33a969fa
--- /dev/null
+++ b/libs/vamp-pyin/LocalCandidatePYIN.cpp
@@ -0,0 +1,499 @@
+/* -*- c-basic-offset: 4 indent-tabs-mode: nil -*- vi:set ts=8 sts=4 sw=4: */
+
+/*
+ pYIN - A fundamental frequency estimator for monophonic audio
+ Centre for Digital Music, Queen Mary, University of London.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version. See the file
+ COPYING included with this distribution for more information.
+*/
+
+#include "LocalCandidatePYIN.h"
+#include "MonoPitch.h"
+#include "YinUtil.h"
+
+#include "vamp-sdk/FFT.h"
+
+#include <vector>
+#include <algorithm>
+
+#include <cstdio>
+#include <sstream>
+// #include <iostream>
+#include <cmath>
+#include <complex>
+#include <map>
+
+#include <boost/math/distributions.hpp>
+
+using std::string;
+using std::vector;
+using std::map;
+using Vamp::RealTime;
+
+
+/**
+ * Create the plugin for audio at the given sample rate.
+ *
+ * All settings start at their defaults: step size 256, block size 2048,
+ * threshold distribution 2, unvoiced output off, precise timing off and
+ * 13 pitch-track candidates. No frames are stored yet.
+ *
+ * @param inputSampleRate  sample rate of the audio, forwarded to Plugin
+ */
+LocalCandidatePYIN::LocalCandidatePYIN(float inputSampleRate)
+    : Plugin(inputSampleRate)
+    , m_channels(0)
+    , m_stepSize(256)
+    , m_blockSize(2048)
+    , m_fmin(40)
+    , m_fmax(700)
+    , m_oPitchTrackCandidates(0)
+    , m_threshDistr(2.0f)
+    , m_outputUnvoiced(0.0f)
+    , m_preciseTime(0.0f)
+    , m_pitchProb()   // empty: no frames analysed yet
+    , m_timestamp()   // empty: no frames analysed yet
+    , m_nCandidate(13)
+{
+}
+
+/**
+ * Destructor. Nothing to release explicitly here.
+ */
+LocalCandidatePYIN::~LocalCandidatePYIN() {}
+
+/**
+ * @return the machine-readable identifier of this plugin
+ */
+string
+LocalCandidatePYIN::getIdentifier() const
+{
+    return string("localcandidatepyin");
+}
+
+/**
+ * @return the human-readable name of this plugin
+ */
+string
+LocalCandidatePYIN::getName() const
+{
+    return string("Local Candidate PYIN");
+}
+
+/**
+ * @return a one-sentence description of what the plugin does
+ */
+string
+LocalCandidatePYIN::getDescription() const
+{
+    return string("Monophonic pitch and note tracking based on a probabilistic Yin extension.");
+}
+
+/**
+ * @return the name of the plugin's author
+ */
+string
+LocalCandidatePYIN::getMaker() const
+{
+    return string("Matthias Mauch");
+}
+
+/**
+ * @return the plugin version number
+ */
+int
+LocalCandidatePYIN::getPluginVersion() const
+{
+    // Bump this whenever a release changes the plugin's behaviour
+    // compared with the previous release.
+    const int version = 2;
+    return version;
+}
+
+/**
+ * @return the licence string reported for this plugin
+ */
+string
+LocalCandidatePYIN::getCopyright() const
+{
+    return string("GPL");
+}
+
+/**
+ * @return TimeDomain: the plugin asks for time-domain input
+ */
+LocalCandidatePYIN::InputDomain
+LocalCandidatePYIN::getInputDomain() const
+{
+    const InputDomain domain = TimeDomain;
+    return domain;
+}
+
+/**
+ * @return the block size, in samples, the plugin would like to be given
+ */
+size_t
+LocalCandidatePYIN::getPreferredBlockSize() const
+{
+    const size_t preferredBlockSize = 2048;
+    return preferredBlockSize;
+}
+
+/**
+ * @return the step size, in samples, the plugin would like to be given
+ */
+size_t
+LocalCandidatePYIN::getPreferredStepSize() const
+{
+    const size_t preferredStepSize = 256;
+    return preferredStepSize;
+}
+
+/**
+ * @return the smallest number of input channels accepted by initialise()
+ */
+size_t
+LocalCandidatePYIN::getMinChannelCount() const
+{
+    const size_t minChannels = 1;
+    return minChannels;
+}
+
+/**
+ * @return the largest number of input channels accepted by initialise()
+ */
+size_t
+LocalCandidatePYIN::getMaxChannelCount() const
+{
+    const size_t maxChannels = 1;
+    return maxChannels;
+}
+
+/**
+ * Describe the three adjustable parameters of this plugin:
+ * "threshdistr", "outputunvoiced" and "precisetime".
+ *
+ * All three are quantized in steps of 1. The first two also carry a
+ * display name for each step; "precisetime" has none.
+ *
+ * @return the parameter descriptors, in the order listed above
+ */
+LocalCandidatePYIN::ParameterList
+LocalCandidatePYIN::getParameterDescriptors() const
+{
+    ParameterList params;
+
+    // Choice of Yin threshold distribution (default 2).
+    {
+        static const char *const names[] = {
+            "Uniform",
+            "Beta (mean 0.10)",
+            "Beta (mean 0.15)",
+            "Beta (mean 0.20)",
+            "Beta (mean 0.30)",
+            "Single Value 0.10",
+            "Single Value 0.15",
+            "Single Value 0.20"
+        };
+
+        ParameterDescriptor threshDistr;
+        threshDistr.identifier = "threshdistr";
+        threshDistr.name = "Yin threshold distribution";
+        threshDistr.description = ".";
+        threshDistr.unit = "";
+        threshDistr.minValue = 0.0f;
+        threshDistr.maxValue = 7.0f;
+        threshDistr.defaultValue = 2.0f;
+        threshDistr.isQuantized = true;
+        threshDistr.quantizeStep = 1.0f;
+        threshDistr.valueNames.assign(names, names + sizeof(names) / sizeof(names[0]));
+        params.push_back(threshDistr);
+    }
+
+    // Whether and how to report estimates classified as unvoiced.
+    {
+        static const char *const names[] = {
+            "No",
+            "Yes",
+            "Yes, as negative frequencies"
+        };
+
+        ParameterDescriptor outputUnvoiced;
+        outputUnvoiced.identifier = "outputunvoiced";
+        outputUnvoiced.name = "Output estimates classified as unvoiced?";
+        outputUnvoiced.description = ".";
+        outputUnvoiced.unit = "";
+        outputUnvoiced.minValue = 0.0f;
+        outputUnvoiced.maxValue = 2.0f;
+        outputUnvoiced.defaultValue = 0.0f;
+        outputUnvoiced.isQuantized = true;
+        outputUnvoiced.quantizeStep = 1.0f;
+        outputUnvoiced.valueNames.assign(names, names + sizeof(names) / sizeof(names[0]));
+        params.push_back(outputUnvoiced);
+    }
+
+    // On/off switch for the slower, precise YIN timing; no value names.
+    {
+        ParameterDescriptor preciseTime;
+        preciseTime.identifier = "precisetime";
+        preciseTime.name = "Use non-standard precise YIN timing (slow).";
+        preciseTime.description = ".";
+        preciseTime.unit = "";
+        preciseTime.minValue = 0.0f;
+        preciseTime.maxValue = 1.0f;
+        preciseTime.defaultValue = 0.0f;
+        preciseTime.isQuantized = true;
+        preciseTime.quantizeStep = 1.0f;
+        params.push_back(preciseTime);
+    }
+
+    return params;
+}
+
+/**
+ * Look up the current value of a parameter.
+ *
+ * @param identifier  one of "threshdistr", "outputunvoiced", "precisetime"
+ * @return the stored value, or 0 for an unrecognised identifier
+ */
+float
+LocalCandidatePYIN::getParameter(string identifier) const
+{
+    float value = 0.f; // result for unknown identifiers
+
+    if (identifier == "threshdistr") {
+        value = m_threshDistr;
+    } else if (identifier == "outputunvoiced") {
+        value = m_outputUnvoiced;
+    } else if (identifier == "precisetime") {
+        value = m_preciseTime;
+    }
+
+    return value;
+}
+
+/**
+ * Store a new value for a parameter. The value is stored as given, with
+ * no range check; unrecognised identifiers are ignored.
+ *
+ * @param identifier  one of "threshdistr", "outputunvoiced", "precisetime"
+ * @param value       the new value
+ */
+void
+LocalCandidatePYIN::setParameter(string identifier, float value)
+{
+    if (identifier == "threshdistr") {
+        m_threshDistr = value;
+    } else if (identifier == "outputunvoiced") {
+        m_outputUnvoiced = value;
+    } else if (identifier == "precisetime") {
+        m_preciseTime = value;
+    }
+}
+
+/**
+ * @return an empty list: this plugin defines no programs
+ */
+LocalCandidatePYIN::ProgramList
+LocalCandidatePYIN::getPrograms() const
+{
+    return ProgramList();
+}
+
+/**
+ * @return an empty string: this plugin defines no programs
+ */
+string
+LocalCandidatePYIN::getCurrentProgram() const
+{
+    return string();
+}
+
+/**
+ * Does nothing: this plugin defines no programs to select.
+ *
+ * @param name  ignored
+ */
+void
+LocalCandidatePYIN::selectProgram(string name)
+{
+    (void) name; // intentionally unused
+}
+
+/**
+ * Describe the plugin's single output, "pitchtrackcandidates".
+ *
+ * The output holds a variable number of values per feature, in Hz, at
+ * a fixed rate of one feature per processing step.
+ *
+ * @return a list containing that one output descriptor
+ */
+LocalCandidatePYIN::OutputList
+LocalCandidatePYIN::getOutputDescriptors() const
+{
+    OutputDescriptor candidates;
+
+    candidates.identifier = "pitchtrackcandidates";
+    candidates.name = "Pitch track candidates";
+    candidates.description = "Multiple candidate pitch tracks.";
+    candidates.unit = "Hz";
+
+    // The number of values per feature is not fixed.
+    candidates.hasFixedBinCount = false;
+    candidates.isQuantized = false;
+    candidates.hasDuration = false;
+
+    candidates.hasKnownExtents = true;
+    candidates.minValue = m_fmin;
+    // NOTE(review): hard-coded upper extent, flagged "!!!???" by the
+    // original author; it does not use m_fmax. Confirm the intended value.
+    candidates.maxValue = 500;
+
+    // One feature per step of the input.
+    candidates.sampleType = OutputDescriptor::FixedSampleRate;
+    candidates.sampleRate = (m_inputSampleRate / m_stepSize);
+
+    OutputList outputs;
+    outputs.push_back(candidates);
+    return outputs;
+}
+
+/**
+ * Prepare the plugin for processing with the host's chosen layout.
+ *
+ * @param channels   number of input channels; must lie within
+ *                   [getMinChannelCount(), getMaxChannelCount()]
+ * @param stepSize   hop between successive blocks, in samples
+ * @param blockSize  number of samples per block
+ * @return false if the channel count is unsupported (nothing is
+ *         changed in that case), true otherwise
+ */
+bool
+LocalCandidatePYIN::initialise(size_t channels, size_t stepSize, size_t blockSize)
+{
+    const bool channelCountOk =
+        channels >= getMinChannelCount() &&
+        channels <= getMaxChannelCount();
+    if (!channelCountOk) {
+        return false;
+    }
+
+    m_channels = channels;
+    m_stepSize = stepSize;
+    m_blockSize = blockSize;
+
+    // Discard any frames left over from an earlier run.
+    reset();
+
+    return true;
+}
+
+/**
+ * Forget all frames collected so far: the stored per-frame pitch
+ * candidates and their timestamps. Parameter values are left untouched.
+ */
+void
+LocalCandidatePYIN::reset()
+{
+    m_timestamp.clear();
+    m_pitchProb.clear();
+}
+
+/**
+ * Analyse one block of audio and remember its pitch candidates.
+ *
+ * The block's YIN difference function is computed, turned into
+ * per-lag probabilities by YinUtil::yinProb(), and every lag with a
+ * positive probability is stored as a (MIDI pitch, probability) pair.
+ * The candidates and the adjusted timestamp are appended to
+ * m_pitchProb and m_timestamp. No features are emitted here; they are
+ * all produced by getRemainingFeatures().
+ *
+ * @param inputBuffers  per-channel sample arrays; only inputBuffers[0]
+ *                      is read, for m_blockSize samples
+ * @param timestamp     timestamp of the block as supplied by the host
+ * @return always an empty FeatureSet
+ */
+LocalCandidatePYIN::FeatureSet
+LocalCandidatePYIN::process(const float *const *inputBuffers, RealTime timestamp)
+{
+    typedef std::pair<double, double> PitchProb; // (MIDI pitch, probability)
+
+    // One flag drives both the timestamp offset and the choice of
+    // difference function, so the two cannot disagree when the parameter
+    // holds a value other than exactly 0 or 1.
+    const bool preciseTime = (m_preciseTime != 0.0f);
+
+    const int offset = preciseTime ? m_blockSize/2 : m_blockSize/4;
+    timestamp = timestamp + Vamp::RealTime::frame2RealTime(offset, lrintf(m_inputSampleRate));
+
+    const size_t yinBufferSize = m_blockSize/2;
+    vector<PitchProb> framePitchProb;
+
+    // With a block too short to analyse, the frame is still recorded
+    // (with no candidates) so m_pitchProb and m_timestamp stay aligned.
+    if (yinBufferSize > 0)
+    {
+        // Vectors own the scratch buffers, so nothing leaks if one of
+        // the calls below throws.
+        vector<double> dInputBuffers(inputBuffers[0], inputBuffers[0] + m_blockSize);
+        vector<double> yinBuffer(yinBufferSize);
+        double *const yin = &yinBuffer[0];
+
+        if (preciseTime) {
+            YinUtil::slowDifference(&dInputBuffers[0], yin, yinBufferSize);
+        } else {
+            YinUtil::fastDifference(&dInputBuffers[0], yin, yinBufferSize);
+        }
+
+        YinUtil::cumulativeDifference(yin, yinBufferSize);
+
+        // Frequency range searched for candidates, passed to yinProb()
+        // as a range of lags in samples.
+        const float minFrequency = 60;
+        const float maxFrequency = 900;
+        const vector<double> peakProbability =
+            YinUtil::yinProb(yin,
+                             m_threshDistr,
+                             yinBufferSize,
+                             m_inputSampleRate/maxFrequency,
+                             m_inputSampleRate/minFrequency);
+
+        // Never index past what yinProb() actually returned.
+        const size_t nProb = std::min(peakProbability.size(), yinBufferSize);
+        for (size_t iBuf = 0; iBuf < nProb; ++iBuf)
+        {
+            if (peakProbability[iBuf] > 0)
+            {
+                const double currentF0 =
+                    m_inputSampleRate * (1.0 /
+                    YinUtil::parabolicInterpolation(yin, iBuf, yinBufferSize));
+                // Hz -> MIDI note number (A4 = 440 Hz = note 69).
+                const double tempPitch = 12 * std::log(currentF0/440)/std::log(2.) + 69;
+                framePitchProb.push_back(PitchProb(tempPitch, peakProbability[iBuf]));
+            }
+        }
+    }
+
+    m_pitchProb.push_back(framePitchProb);
+    m_timestamp.push_back(timestamp);
+
+    return FeatureSet();
+}
+
+/**
+ * Tell whether two pitch tracks agree at one frame: either both hold 0,
+ * or the first is within 1% of the second.
+ */
+static bool
+pitchesNearlyEqual(float pitchA, float pitchB)
+{
+    if (pitchA == 0 && pitchB == 0) return true;
+    if (pitchB == 0) return false; // never divide by zero
+    return std::fabs(pitchA / pitchB - 1) < 0.01;
+}
+
+/**
+ * Turn the frames collected by process() into candidate pitch tracks.
+ *
+ * For each of m_nCandidate centre pitches (45, 48, 51, ... in MIDI
+ * note numbers) the stored probabilities are re-weighted by a normal
+ * curve around that centre and decoded by MonoPitch. A track is kept
+ * if it is not a near-duplicate of another track and has a positive
+ * value in more than half of the frames.
+ *
+ * @return one feature per stored frame on output 0, each holding one
+ *         value per surviving track (0 where that track has no pitch);
+ *         an empty FeatureSet if no frames were stored
+ */
+LocalCandidatePYIN::FeatureSet
+LocalCandidatePYIN::getRemainingFeatures()
+{
+    typedef std::pair<double, double> PitchProb; // (MIDI pitch, probability)
+
+    FeatureSet fs;
+
+    if (m_pitchProb.empty()) {
+        return fs;
+    }
+
+    // MONO-PITCH STUFF
+    MonoPitch mp;
+    // process() appends to m_timestamp and m_pitchProb together, so this
+    // is also the number of stored candidate lists.
+    const size_t nFrame = m_timestamp.size();
+
+    // pitchTracks[c][f]: decoded value of candidate c at frame f; stays 0
+    // wherever the decoder output was not positive.
+    vector<vector<float> > pitchTracks(m_nCandidate, vector<float>(nFrame));
+    // Frames with a positive decoded value, per candidate. An integer
+    // count stays exact for any number of frames.
+    vector<size_t> voicedCount(m_nCandidate, 0);
+
+    boost::math::normal normalDist(0, 8); // semitones sd
+    const float maxNormalDist = boost::math::pdf(normalDist, 0);
+
+    // Viterbi-decode multiple times with different frequencies emphasised
+    for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
+    {
+        const float centrePitch = 45 + 3 * iCandidate;
+        vector<vector<PitchProb> > weightedPitchProb(nFrame);
+
+        for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
+        {
+            vector<PitchProb> &weighted = weightedPitchProb[iFrame];
+            const size_t nProb = m_pitchProb[iFrame].size();
+            weighted.reserve(nProb);
+
+            float sumProb = 0;
+            for (size_t iProb = 0; iProb < nProb; ++iProb)
+            {
+                const float pitch = m_pitchProb[iFrame][iProb].first;
+                const float prob = m_pitchProb[iFrame][iProb].second *
+                    boost::math::pdf(normalDist, pitch-centrePitch) /
+                    maxNormalDist * 2;
+                sumProb += prob;
+                weighted.push_back(PitchProb(pitch, prob));
+            }
+
+            // Normalise so the frame sums to 1. If the sum is zero the
+            // values are left alone rather than turned into NaN.
+            if (sumProb > 0)
+            {
+                for (size_t iProb = 0; iProb < nProb; ++iProb)
+                {
+                    weighted[iProb].second /= sumProb;
+                }
+            }
+        }
+
+        const vector<float> mpOut = mp.process(weightedPitchProb);
+        // Never read past what the decoder actually returned.
+        const size_t nOut = std::min(nFrame, mpOut.size());
+        for (size_t iFrame = 0; iFrame < nOut; ++iFrame)
+        {
+            if (mpOut[iFrame] > 0) {
+                pitchTracks[iCandidate][iFrame] = mpOut[iFrame];
+                ++voicedCount[iCandidate];
+            }
+        }
+    }
+
+    // find near duplicate pitch tracks: two tracks agreeing in more than
+    // 80% of frames are duplicates, and the one with fewer voiced frames
+    // (on a tie, the earlier candidate) is marked for removal.
+    vector<char> isDuplicate(m_nCandidate, 0);
+    for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate) {
+        for (size_t jCandidate = iCandidate+1; jCandidate < m_nCandidate; ++jCandidate) {
+            size_t countEqual = 0;
+            for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
+            {
+                if (pitchesNearlyEqual(pitchTracks[iCandidate][iFrame],
+                                       pitchTracks[jCandidate][iFrame]))
+                    countEqual++;
+            }
+            if (countEqual * 1.0 / nFrame > 0.8) {
+                if (voicedCount[iCandidate] > voicedCount[jCandidate]) {
+                    isDuplicate[jCandidate] = 1;
+                } else {
+                    isDuplicate[iCandidate] = 1;
+                }
+            }
+        }
+    }
+
+    // now collect the non-duplicate pitch tracks that are voiced in more
+    // than half of the frames, as a stack of values per frame
+    vector<vector<float> > outputFrequencies(nFrame);
+    for (size_t iCandidate = 0; iCandidate < m_nCandidate; ++iCandidate)
+    {
+        if (isDuplicate[iCandidate]) continue;
+        if (!(voicedCount[iCandidate] > 0.5*nFrame)) continue;
+
+        for (size_t iFrame = 0; iFrame < nFrame; ++iFrame)
+        {
+            // Entries are either a positive decoded value or 0, so the
+            // stored value can be emitted as it is.
+            outputFrequencies[iFrame].push_back(pitchTracks[iCandidate][iFrame]);
+        }
+    }
+
+    // return one feature per frame, each carrying that frame's stack of
+    // candidate values
+    for (size_t iFrame = 0; iFrame < nFrame; ++iFrame) {
+        Feature f;
+        f.hasTimestamp = true;
+        f.timestamp = m_timestamp[iFrame];
+        f.values = outputFrequencies[iFrame];
+        fs[0].push_back(f); // output 0 is the plugin's only output
+    }
+
+    return fs;
+}