diff options
author | Paul Davis <paul@linuxaudiosystems.com> | 2008-06-02 21:41:35 +0000 |
---|---|---|
committer | Paul Davis <paul@linuxaudiosystems.com> | 2008-06-02 21:41:35 +0000 |
commit | 449aab3c465bbbf66d221fac3d7ea559f1720357 (patch) | |
tree | 6843cc40c88250a132acac701271f1504cd2df04 /libs/soundtouch/mmx_win.cpp | |
parent | 9c0d7d72d70082a54f823cd44c0ccda5da64bb6f (diff) |
rollback to 3428, before the mysterious removal of libs/* at 3431/3432
git-svn-id: svn://localhost/ardour2/branches/3.0@3435 d708f5d6-7413-0410-9779-e7cbd77b26cf
Diffstat (limited to 'libs/soundtouch/mmx_win.cpp')
-rw-r--r-- | libs/soundtouch/mmx_win.cpp | 487 |
1 files changed, 487 insertions, 0 deletions
diff --git a/libs/soundtouch/mmx_win.cpp b/libs/soundtouch/mmx_win.cpp new file mode 100644 index 0000000000..ec4ac9d88b --- /dev/null +++ b/libs/soundtouch/mmx_win.cpp @@ -0,0 +1,487 @@ +//////////////////////////////////////////////////////////////////////////////// +/// +/// Win32 version of the MMX optimized routines. All MMX optimized functions +/// have been gathered into this single source code file, regardless to their +/// class or original source code file, in order to ease porting the library +/// to other compiler and processor platforms. +/// +/// This file is to be compiled in Windows platform with Microsoft Visual C++ +/// Compiler. Please see 'mmx_gcc.cpp' for the gcc compiler version for all +/// GNU platforms. +/// +/// Author : Copyright (c) Olli Parviainen +/// Author e-mail : oparviai @ iki.fi +/// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch +/// +//////////////////////////////////////////////////////////////////////////////// +// +// Last changed : $Date$ +// File revision : $Revision$ +// +// $Id$ +// +//////////////////////////////////////////////////////////////////////////////// +// +// License : +// +// SoundTouch audio processing library +// Copyright (c) Olli Parviainen +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +//////////////////////////////////////////////////////////////////////////////// + +#include "STTypes.h" + +#ifndef WIN32 +#error "wrong platform - this source code file is exclusively for Win32 platform" +#endif + +using namespace soundtouch; + +#ifdef ALLOW_MMX +// MMX routines available only with integer sample type + +////////////////////////////////////////////////////////////////////////////// +// +// implementation of MMX optimized functions of class 'TDStretchMMX' +// +////////////////////////////////////////////////////////////////////////////// + +#include "TDStretch.h" +#include <limits.h> + +// these are declared in 'TDStretch.cpp' +extern int scanOffsets[4][24]; + +// Calculates cross correlation of two buffers +long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const +{ + long corr; + uint local_overlapLength = overlapLength; + uint local_overlapDividerBits = overlapDividerBits; + + _asm + { + ; Calculate cross-correlation between the tempOffset and tmpbid_buffer. + ; + ; Process 4 parallel batches of 2 * stereo samples each during one + ; round to improve CPU-level parallellization. + ; + ; load address of sloped pV2 buffer to eax + ; load address of mixing point of the sample data buffer to ebx + ; load counter to ecx = overlapLength / 8 - 1 + ; empty the mm0 + ; + ; prepare to the first round by loading + ; load mm1 = eax[0] + ; load mm2 = eax[1]; + + mov eax, dword ptr pV1 + mov ebx, dword ptr pV2 + + movq mm1, qword ptr [eax] + mov ecx, local_overlapLength + + movq mm2, qword ptr [eax+8] + shr ecx, 3 + + pxor mm0, mm0 + sub ecx, 1 + + movd mm5, local_overlapDividerBits + + loop1: + ; multiply-add mm1 = mm1 * ebx[0] + ; multiply-add mm2 = mm2 * ebx[1] + ; + ; add mm2 += mm1 + ; mm2 >>= mm5 (=overlapDividerBits) + ; add mm0 += mm2 + ; + ; load mm3 = eax[2] + ; multiply-add mm3 = mm3 * ebx[2] + ; + ; load mm4 = eax[3] + ; multiply-add mm4 = mm4 * ebx[3] + ; + ; add mm3 += mm4 + ; mm3 >>= mm5 (=overlapDividerBits) + ; add mm0 += mm3 + ; + ; add eax += 4; + ; add ebx += 4 + ; load mm1 = eax[0] (~eax[4]) + ; load mm2 = eax[1] (~eax[5]) + ; + ; loop + + pmaddwd mm1, qword ptr [ebx] + movq mm3, qword ptr [eax+16] + + pmaddwd mm2, qword ptr [ebx+8] + movq mm4, qword ptr [eax+24] + + pmaddwd mm3, qword ptr [ebx+16] + paddd mm2, mm1 + + pmaddwd mm4, qword ptr [ebx+24] + movq mm1, qword ptr [eax+32] + + psrad mm2, mm5 + add eax, 32 + + paddd mm3, mm4 + paddd mm0, mm2 + + movq mm2, qword ptr [eax+8] + psrad mm3, mm5 + + add ebx, 32 + paddd mm0, mm3 + + dec ecx + jnz loop1 + + ; Finalize the last partial loop: + + movq mm3, qword ptr [eax+16] + pmaddwd mm1, qword ptr [ebx] + + movq mm4, qword ptr [eax+24] + pmaddwd mm2, qword ptr [ebx+8] + + pmaddwd mm3, qword ptr [ebx+16] + paddd mm2, mm1 + + pmaddwd mm4, qword ptr [ebx+24] + psrad mm2, mm5 + + paddd mm3, mm4 + paddd mm0, mm2 + + psrad mm3, mm5 + paddd mm0, mm3 + + ; copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 + ; and finally store the result into the variable "corr" + + movq mm1, mm0 + psrlq mm1, 32 + paddd mm0, mm1 + movd corr, mm0 + } + return corr; + + // Note: Warning about the missing EMMS instruction is harmless + // as it'll be called elsewhere. +} + + + +void TDStretchMMX::clearCrossCorrState() +{ + _asm EMMS; +} + + + + + +// MMX-optimized version of the function overlapStereo +void TDStretchMMX::overlapStereo(short *output, const short *input) const +{ + short *local_midBuffer = pMidBuffer; + uint local_overlapLength = overlapLength; + uint local_overlapDividerBits = overlapDividerBits; + + _asm + { + ; load sliding mixing value counter to mm6 and mm7 + ; load counter value to ecx = overlapLength / 4 + ; load divider-shifter value to esi + ; load mixing value adder to mm5 + ; load address of midBuffer to eax + ; load address of inputBuffer added with ovlOffset to ebx + ; load address of end of the outputBuffer to edx + + mov eax, local_overlapLength ; ecx = 0x0000 OVL_ + mov edi, 0x0002fffe ; ecx = 0x0002 fffe + + mov esi, local_overlapDividerBits + movd mm6, eax ; mm6 = 0x0000 0000 0000 OVL_ + + mov ecx, eax; + sub eax, 1 + + punpckldq mm6, mm6 ; mm6 = 0x0000 OVL_ 0000 OVL_ + mov edx, output + + or eax, 0x00010000 ; eax = 0x0001 overlapLength-1 + mov ebx, dword ptr input + + movd mm5, edi ; mm5 = 0x0000 0000 0002 fffe + movd mm7, eax ; mm7 = 0x0000 0000 0001 01ff + + mov eax, dword ptr local_midBuffer + punpckldq mm5, mm5 ; mm5 = 0x0002 fffe 0002 fffe + + shr ecx, 2 ; ecx = overlapLength / 2 + punpckldq mm7, mm7 ; mm7 = 0x0001 01ff 0001 01ff + + loop1: + ; Process two parallel batches of 2+2 stereo samples during each round + ; to improve CPU-level parallellization. + ; + ; Load [eax] into mm0 and mm1 + ; Load [ebx] into mm3 + ; unpack words of mm0, mm1 and mm3 into mm0 and mm1 + ; multiply-add mm0*mm6 and mm1*mm7, store results into mm0 and mm1 + ; divide mm0 and mm1 by 512 (=right-shift by overlapDividerBits) + ; pack the result into mm0 and store into [edx] + ; + ; Load [eax+8] into mm2 and mm3 + ; Load [ebx+8] into mm4 + ; unpack words of mm2, mm3 and mm4 into mm2 and mm3 + ; multiply-add mm2*mm6 and mm3*mm7, store results into mm2 and mm3 + ; divide mm2 and mm3 by 512 (=right-shift by overlapDividerBits) + ; pack the result into mm2 and store into [edx+8] + + + movq mm0, qword ptr [eax] ; mm0 = m1l m1r m0l m0r + add edx, 16 + + movq mm3, qword ptr [ebx] ; mm3 = i1l i1r i0l i0r + movq mm1, mm0 ; mm1 = m1l m1r m0l m0r + + movq mm2, qword ptr [eax+8] ; mm2 = m3l m3r m2l m2r + punpcklwd mm0, mm3 ; mm0 = i0l m0l i0r m0r + + movq mm4, qword ptr [ebx+8] ; mm4 = i3l i3r i2l i2r + punpckhwd mm1, mm3 ; mm1 = i1l m1l i1r m1r + + movq mm3, mm2 ; mm3 = m3l m3r m2l m2r + punpcklwd mm2, mm4 ; mm2 = i2l m2l i2r m2r + + pmaddwd mm0, mm6 ; mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60 + punpckhwd mm3, mm4 ; mm3 = i3l m3l i3r m3r + + movd mm4, esi ; mm4 = overlapDividerBits + + pmaddwd mm1, mm7 ; mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70 + paddw mm6, mm5 + + paddw mm7, mm5 + psrad mm0, mm4 ; mmo >>= overlapDividerBits + + pmaddwd mm2, mm6 ; mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60 + psrad mm1, mm4 ; mm1 >>= overlapDividerBits + + pmaddwd mm3, mm7 ; mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70 + psrad mm2, mm4 ; mm2 >>= overlapDividerBits + + packssdw mm0, mm1 ; mm0 = mm1h mm1l mm0h mm0l + psrad mm3, mm4 ; mm3 >>= overlapDividerBits + + add eax, 16 + paddw mm6, mm5 + + packssdw mm2, mm3 ; mm2 = mm2h mm2l mm3h mm3l + paddw mm7, mm5 + + movq qword ptr [edx-16], mm0 + add ebx, 16 + + movq qword ptr [edx-8], mm2 + dec ecx + + jnz loop1 + + emms + } +} + + +////////////////////////////////////////////////////////////////////////////// +// +// implementation of MMX optimized functions of class 'FIRFilter' +// +////////////////////////////////////////////////////////////////////////////// + +#include "FIRFilter.h" + + +FIRFilterMMX::FIRFilterMMX() : FIRFilter() +{ + filterCoeffsUnalign = NULL; +} + + +FIRFilterMMX::~FIRFilterMMX() +{ + delete[] filterCoeffsUnalign; +} + + +// (overloaded) Calculates filter coefficients for MMX routine +void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor) +{ + uint i; + FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor); + + // Ensure that filter coeffs array is aligned to 16-byte boundary + delete[] filterCoeffsUnalign; + filterCoeffsUnalign = new short[2 * newLength + 8]; + filterCoeffsAlign = (short *)(((uint)filterCoeffsUnalign + 15) & -16); + + // rearrange the filter coefficients for mmx routines + for (i = 0;i < length; i += 4) + { + filterCoeffsAlign[2 * i + 0] = coeffs[i + 0]; + filterCoeffsAlign[2 * i + 1] = coeffs[i + 2]; + filterCoeffsAlign[2 * i + 2] = coeffs[i + 0]; + filterCoeffsAlign[2 * i + 3] = coeffs[i + 2]; + + filterCoeffsAlign[2 * i + 4] = coeffs[i + 1]; + filterCoeffsAlign[2 * i + 5] = coeffs[i + 3]; + filterCoeffsAlign[2 * i + 6] = coeffs[i + 1]; + filterCoeffsAlign[2 * i + 7] = coeffs[i + 3]; + } +} + + + +// mmx-optimized version of the filter routine for stereo sound +uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const +{ + // Create stack copies of the needed member variables for asm routines : + uint local_length = length; + uint local_lengthDiv8 = lengthDiv8; + uint local_resultDivider = resultDivFactor; + short *local_filterCoeffs = (short*)filterCoeffsAlign; + + if (local_length < 2) return 0; + + _asm + { + ; Load (num_samples-aa_filter_length)/2 to edi as a i + ; Load a pointer to samples to esi + ; Load a pointer to destination to edx + + mov edi, numSamples + mov esi, dword ptr src + sub edi, local_length + mov edx, dword ptr dest + sar edi, 1 + + ; Load filter length/8 to ecx + ; Load pointer to samples from esi to ebx + ; Load counter from edi to ecx + ; Load [ebx] to mm3 + ; Load pointer to filter coefficients to eax +loop1: + mov ebx, esi + pxor mm0, mm0 + + mov ecx, local_lengthDiv8 + pxor mm7, mm7 + + movq mm1, [ebx] ; mm1 = l1 r1 l0 r0 + mov eax, local_filterCoeffs +loop2: + + movq mm2, [ebx+8] ; mm2 = l3 r3 l2 r2 + movq mm4, mm1 ; mm4 = l1 r1 l0 r0 + + movq mm3, [ebx+16] ; mm3 = l5 r5 l4 r4 + punpckhwd mm1, mm2 ; mm1 = l3 l1 r3 r1 + + movq mm6, mm2 ; mm6 = l3 r3 l2 r2 + punpcklwd mm4, mm2 ; mm4 = l2 l0 r2 r0 + + movq mm2, qword ptr [eax] ; mm2 = f2 f0 f2 f0 + movq mm5, mm1 ; mm5 = l3 l1 r3 r1 + + punpcklwd mm6, mm3 ; mm6 = l4 l2 r4 r2 + pmaddwd mm4, mm2 ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0 + + pmaddwd mm5, mm2 ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0 + movq mm2, qword ptr [eax+8] ; mm2 = f3 f1 f3 f1 + + paddd mm0, mm4 ; mm0 += s02*f02 + movq mm4, mm3 ; mm4 = l1 r1 l0 r0 + + pmaddwd mm1, mm2 ; mm1 = l3*f3+l1*f1 r3*f3+l1*f1 + paddd mm7, mm5 ; mm7 += s13*f02 + + pmaddwd mm6, mm2 ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1 + movq mm2, [ebx+24] ; mm2 = l3 r3 l2 r2 + + paddd mm0, mm1 ; mm0 += s31*f31 + movq mm1, [ebx+32] ; mm1 = l5 r5 l4 r4 + + paddd mm7, mm6 ; mm7 += s42*f31 + punpckhwd mm3, mm2 ; mm3 = l3 l1 r3 r1 + + movq mm6, mm2 ; mm6 = l3 r3 l2 r2 + punpcklwd mm4, mm2 ; mm4 = l2 l0 r2 r0 + + movq mm2, qword ptr [eax+16] ; mm2 = f2 f0 f2 f0 + movq mm5, mm3 ; mm5 = l3 l1 r3 r1 + + punpcklwd mm6, mm1 ; mm6 = l4 l2 r4 r2 + add eax, 32 + + pmaddwd mm4, mm2 ; mm4 = l2*f2+l0*f0 r2*f2+r0*f0 + add ebx, 32 + + pmaddwd mm5, mm2 ; mm5 = l3*f2+l1*f0 r3*f2+l1*f0 + movq mm2, qword ptr [eax-8] ; mm2 = f3 f1 f3 f1 + + paddd mm0, mm4 ; mm0 += s02*f02 + pmaddwd mm3, mm2 ; mm3 = l3*f3+l1*f1 r3*f3+l1*f1 + + paddd mm7, mm5 ; mm7 += s13*f02 + pmaddwd mm6, mm2 ; mm6 = l4*f3+l2*f1 r4*f3+f4*f1 + + paddd mm0, mm3 ; mm0 += s31*f31 + paddd mm7, mm6 ; mm7 += s42*f31 + + dec ecx + jnz loop2 + + ; Divide mm0 and mm7 by 8192 (= right-shift by 13), + ; pack and store to [edx] + movd mm4, local_resultDivider; + + psrad mm0, mm4 ; divider the result + + add edx, 8 + psrad mm7, mm4 ; divider the result + + add esi, 8 + packssdw mm0, mm7 + + movq qword ptr [edx-8], mm0 + dec edi + + jnz loop1 + + emms + } + return (numSamples & 0xfffffffe) - local_length; +} + +#endif // ALLOW_MMX |