From 75d2f51193f6fd25881a9c766db9078f3b68d80e Mon Sep 17 00:00:00 2001 From: Sampo Savolainen Date: Tue, 13 Mar 2007 22:42:34 +0000 Subject: Added a xmmintrin.h based SSE function find_peaks(). Needs polishing as this commit breaks the build system for i386 builds with dynamic SSE enabled. git-svn-id: svn://localhost/ardour2/trunk@1586 d708f5d6-7413-0410-9779-e7cbd77b26cf --- libs/ardour/SConscript | 3 ++ libs/ardour/ardour/mix.h | 18 +++++--- libs/ardour/ardour/session.h | 6 ++- libs/ardour/globals.cc | 5 ++- libs/ardour/mix.cc | 39 ++++++++++++++++- libs/ardour/session.cc | 3 +- libs/ardour/sse_functions_xmm.cc | 93 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 156 insertions(+), 11 deletions(-) create mode 100644 libs/ardour/sse_functions_xmm.cc diff --git a/libs/ardour/SConscript b/libs/ardour/SConscript index 3772246f11..47015f4d07 100644 --- a/libs/ardour/SConscript +++ b/libs/ardour/SConscript @@ -287,10 +287,13 @@ env['BUILDERS']['SharedAsmObject'] = Builder (action = '$CXX -c -fPIC $SOURCE -o if env['FPU_OPTIMIZATION']: if env['DIST_TARGET'] == "i386": arch_specific_objects = env.SharedAsmObject('sse_functions.os', 'sse_functions.s') + ardour_files += ['sse_functions_xmm.cc'] if env['DIST_TARGET'] == "i686": arch_specific_objects = env.SharedAsmObject('sse_functions.os', 'sse_functions.s') + ardour_files += ['sse_functions_xmm.cc'] if env['DIST_TARGET'] == "x86_64": arch_specific_objects = env.SharedAsmObject('sse_functions_64bit.os', 'sse_functions_64bit.s') + ardour_files += ['sse_functions_xmm.cc'] libardour = ardour.SharedLibrary('ardour', ardour_files + extra_sources + arch_specific_objects) diff --git a/libs/ardour/ardour/mix.h b/libs/ardour/ardour/mix.h index 495ea74132..7515401a2a 100644 --- a/libs/ardour/ardour/mix.h +++ b/libs/ardour/ardour/mix.h @@ -27,7 +27,7 @@ extern "C" { /* SSE functions */ - float x86_sse_compute_peak (ARDOUR::Sample *buf, nframes_t nsamples, float current); + float x86_sse_compute_peak (ARDOUR::Sample *buf, 
nframes_t nsamples, float current); void x86_sse_apply_gain_to_buffer (ARDOUR::Sample *buf, nframes_t nframes, float gain); @@ -36,9 +36,11 @@ extern "C" { void x86_sse_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, nframes_t nframes); } +float x86_sse_find_peaks (ARDOUR::Sample *buf, nframes_t nsamples, float *min, float *max); + /* debug wrappers for SSE functions */ -float debug_compute_peak (ARDOUR::Sample *buf, nframes_t nsamples, float current); +float debug_compute_peak (ARDOUR::Sample *buf, nframes_t nsamples, float current); void debug_apply_gain_to_buffer (ARDOUR::Sample *buf, nframes_t nframes, float gain); @@ -52,6 +54,8 @@ void debug_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, nfra float veclib_compute_peak (ARDOUR::Sample *buf, nframes_t nsamples, float current); +float veclib_find_peaks (ARDOUR::Sample *buf, nframes_t nsamples, float *min, float *max); + void veclib_apply_gain_to_buffer (ARDOUR::Sample *buf, nframes_t nframes, float gain); void veclib_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, nframes_t nframes, float gain); @@ -62,12 +66,14 @@ void veclib_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src /* non-optimized functions */ -float compute_peak (ARDOUR::Sample *buf, nframes_t nsamples, float current); +float compute_peak (ARDOUR::Sample *buf, nframes_t nsamples, float current); + +float find_peaks (ARDOUR::Sample *buf, nframes_t nsamples, float *min, float *max); -void apply_gain_to_buffer (ARDOUR::Sample *buf, nframes_t nframes, float gain); +void apply_gain_to_buffer (ARDOUR::Sample *buf, nframes_t nframes, float gain); -void mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, nframes_t nframes, float gain); +void mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, nframes_t nframes, float gain); -void mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, nframes_t nframes); +void mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample 
*src, nframes_t nframes); #endif /* __ardour_mix_h__ */ diff --git a/libs/ardour/ardour/session.h b/libs/ardour/ardour/session.h index 3380fc03e0..7794dc95c8 100644 --- a/libs/ardour/ardour/session.h +++ b/libs/ardour/ardour/session.h @@ -905,12 +905,14 @@ class Session : public PBD::StatefulDestructible void* ptr, float opt); - typedef float (*compute_peak_t) (Sample *, nframes_t, float); + typedef float (*compute_peak_t) (Sample *, nframes_t, float); + typedef float (*find_peaks_t) (Sample *, nframes_t, float *, float*); typedef void (*apply_gain_to_buffer_t) (Sample *, nframes_t, float); typedef void (*mix_buffers_with_gain_t) (Sample *, Sample *, nframes_t, float); typedef void (*mix_buffers_no_gain_t) (Sample *, Sample *, nframes_t); - static compute_peak_t compute_peak; + static compute_peak_t compute_peak; + static find_peaks_t find_peaks; static apply_gain_to_buffer_t apply_gain_to_buffer; static mix_buffers_with_gain_t mix_buffers_with_gain; static mix_buffers_no_gain_t mix_buffers_no_gain; diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc index 05b7f3e8f0..406f21832c 100644 --- a/libs/ardour/globals.cc +++ b/libs/ardour/globals.cc @@ -233,6 +233,7 @@ setup_hardware_optimization (bool try_optimization) // SSE SET Session::compute_peak = x86_sse_compute_peak; + Session::find_peaks = x86_sse_find_peaks; Session::apply_gain_to_buffer = x86_sse_apply_gain_to_buffer; Session::mix_buffers_with_gain = x86_sse_mix_buffers_with_gain; Session::mix_buffers_no_gain = x86_sse_mix_buffers_no_gain; @@ -249,6 +250,7 @@ setup_hardware_optimization (bool try_optimization) if (sysVersion >= 0x00001040) { // Tiger at least Session::compute_peak = veclib_compute_peak; + Session::find_peaks = veclib_find_peaks; Session::apply_gain_to_buffer = veclib_apply_gain_to_buffer; Session::mix_buffers_with_gain = veclib_mix_buffers_with_gain; Session::mix_buffers_no_gain = veclib_mix_buffers_no_gain; @@ -262,7 +264,8 @@ setup_hardware_optimization (bool try_optimization) if 
(generic_mix_functions) { - Session::compute_peak = compute_peak; + Session::compute_peak = compute_peak; + Session::find_peaks = find_peaks; Session::apply_gain_to_buffer = apply_gain_to_buffer; Session::mix_buffers_with_gain = mix_buffers_with_gain; Session::mix_buffers_no_gain = mix_buffers_no_gain; diff --git a/libs/ardour/mix.cc b/libs/ardour/mix.cc index 63ccc8b7ea..e2096178dd 100644 --- a/libs/ardour/mix.cc +++ b/libs/ardour/mix.cc @@ -24,7 +24,6 @@ #include #if defined (ARCH_X86) && defined (BUILD_SSE_OPTIMIZATIONS) - // Debug wrappers float @@ -90,6 +89,25 @@ compute_peak (ARDOUR::Sample *buf, nframes_t nsamples, float current) return current; }
+/** Widen the range [*min, *max] so it covers buf[0..nframes); returns the updated *max. */
+float
+find_peaks (ARDOUR::Sample *buf, nframes_t nframes, float *min, float *max)
+{
+	float lo = *min;
+	float hi = *max;
+
+	/* index with nframes_t so counter and bound share a type; fmaxf/fminf are the float forms of fmax/fmin */
+	for (nframes_t i = 0; i < nframes; ++i) {
+		hi = fmaxf (buf[i], hi);
+		lo = fminf (buf[i], lo);
+	}
+
+	*max = hi;
+	*min = lo;
+	/* declared float: flowing off the end without a value is undefined behaviour, so return the new max */
+	return hi;
+}
+
 void apply_gain_to_buffer (ARDOUR::Sample *buf, nframes_t nframes, float gain) { @@ -124,6 +142,25 @@ veclib_compute_peak (ARDOUR::Sample *buf, nframes_t nsamples, float current) return f_max(current, tmpmax); }
+/** Scalar stand-in until a vecLib version exists: widens [*min, *max] over buf[0..nframes), returns the updated *max. */
+float
+veclib_find_peaks (ARDOUR::Sample *buf, nframes_t nframes, float *min, float *max)
+{
+	// TODO: someone with veclib skills needs to write this one
+	float lo = *min;
+	float hi = *max;
+
+	/* index with nframes_t so counter and bound share a type; fmaxf/fminf are the float forms of fmax/fmin */
+	for (nframes_t i = 0; i < nframes; ++i) {
+		hi = fmaxf (buf[i], hi);
+		lo = fminf (buf[i], lo);
+	}
+
+	*max = hi;
+	*min = lo;
+	/* declared float: flowing off the end without a value is undefined behaviour, so return the new max */
+	return hi;
+}
 void veclib_apply_gain_to_buffer (ARDOUR::Sample *buf, nframes_t nframes, float gain) { diff --git a/libs/ardour/session.cc b/libs/ardour/session.cc index 8feee3e16b..d1531e4c96 100644 --- a/libs/ardour/session.cc +++ b/libs/ardour/session.cc @@ -88,7 +88,8 @@ const char* Session::dead_sound_dir_name = X_("dead_sounds"); const char* Session::interchange_dir_name = X_("interchange"); const char* Session::export_dir_name = X_("export"); -Session::compute_peak_t Session::compute_peak =
0; +Session::find_peaks_t Session::find_peaks = 0; Session::apply_gain_to_buffer_t Session::apply_gain_to_buffer = 0; Session::mix_buffers_with_gain_t Session::mix_buffers_with_gain = 0; Session::mix_buffers_no_gain_t Session::mix_buffers_no_gain = 0; diff --git a/libs/ardour/sse_functions_xmm.cc b/libs/ardour/sse_functions_xmm.cc new file mode 100644 index 0000000000..7b5ea143ec --- /dev/null +++ b/libs/ardour/sse_functions_xmm.cc @@ -0,0 +1,93 @@ +/* + Copyright (C) 2007 Paul Davis + Written by Sampo Savolainen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+
+*/
+
+#include <stdint.h>
+#include <xmmintrin.h>
+#include <ardour/types.h>
+
+/**
+ * Widen the running range [*min, *max] so it covers buf[0..nframes), using
+ * SSE packed min/max on four samples at a time where the buffer is aligned.
+ *
+ * @param buf     samples to scan; any alignment is accepted
+ * @param nframes number of samples in buf (0 leaves *min and *max unchanged)
+ * @param min     in: current minimum, out: updated minimum
+ * @param max     in: current maximum, out: updated maximum
+ * @return the updated maximum. mix.h declares this function as returning
+ *         float, so the definition must match and return a value.
+ *         NOTE(review): the intended meaning of the return value is not documented.
+ */
+float
+x86_sse_find_peaks(float *buf, nframes_t nframes, float *min, float *max)
+{
+	__m128 current_max, current_min, work;
+
+	// Load max and min values into all four slots of the XMM registers
+	current_min = _mm_set1_ps(*min);
+	current_max = _mm_set1_ps(*max);
+
+	// Work input one sample at a time until "buf" reaches 16 byte alignment.
+	// uintptr_t is pointer-sized on every ABI; unsigned long is not guaranteed to be.
+	while (((uintptr_t)buf) % 16 != 0 && nframes > 0) {
+		work = _mm_set1_ps(*buf);
+		current_min = _mm_min_ps(current_min, work);
+		current_max = _mm_max_ps(current_max, work);
+		buf++;
+		nframes--;
+	}
+
+	// work through aligned buffers, four samples per iteration
+	while (nframes >= 4) {
+		work = _mm_load_ps(buf); // aligned load, valid because of the loop above
+		current_min = _mm_min_ps(current_min, work);
+		current_max = _mm_max_ps(current_max, work);
+		buf += 4;
+		nframes -= 4;
+	}
+
+	// work through the rest < 4 samples
+	while (nframes > 0) {
+		work = _mm_set1_ps(*buf);
+		current_min = _mm_min_ps(current_min, work);
+		current_max = _mm_max_ps(current_max, work);
+		buf++;
+		nframes--;
+	}
+
+	// Horizontal reduction: swap neighbouring slots, then swap the two halves,
+	// so every slot ends up holding the min (resp. max) of all four slots.
+	work = _mm_shuffle_ps(current_min, current_min, _MM_SHUFFLE(2, 3, 0, 1));
+	current_min = _mm_min_ps(work, current_min);
+	work = _mm_shuffle_ps(current_min, current_min, _MM_SHUFFLE(1, 0, 3, 2));
+	work = _mm_min_ps(work, current_min);
+	_mm_store_ss(min, work);
+
+	work = _mm_shuffle_ps(current_max, current_max, _MM_SHUFFLE(2, 3, 0, 1));
+	current_max = _mm_max_ps(work, current_max);
+	work = _mm_shuffle_ps(current_max, current_max, _MM_SHUFFLE(1, 0, 3, 2));
+	work = _mm_max_ps(work, current_max);
+	_mm_store_ss(max, work);
+
+	// *max was just written from the reduced register
+	return *max;
+}
+
+
+
-- 
cgit v1.2.3