libs/soundtouch/sse_win.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367

////////////////////////////////////////////////////////////////////////////////
///
/// Win32 version of the SSE optimized routines for Pentium-III, Athlon-XP and
/// later. All SSE optimized functions have been gathered into this single source 
/// code file, regardless of their class or original source code file, in order 
/// to ease porting the library to other compilers and processor platforms.
///
/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
/// 6.0 processor pack" update to support SSE instruction set. The update is 
/// available for download at Microsoft Developers Network, see here:
/// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
///
/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and 
/// perform a search with keywords "processor pack".
///
/// This file is to be compiled in Windows platform with Microsoft Visual C++ 
/// Compiler. Please see 'sse_gcc.cpp' for the gcc compiler version for all
/// GNU platforms (if file supplied).
///
/// Author        : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai @ iki.fi
/// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
///
////////////////////////////////////////////////////////////////////////////////
//
// Last changed  : $Date$
// File revision : $Revision$
//
// $Id$
//
////////////////////////////////////////////////////////////////////////////////
//
// License :
//
//  SoundTouch audio processing library
//  Copyright (c) Olli Parviainen
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2.1 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
////////////////////////////////////////////////////////////////////////////////

#include "cpu_detect.h"
#include "STTypes.h"

#ifndef WIN32
#error "wrong platform - this source code file is exclusively for Win32 platform"
#endif

using namespace soundtouch;

#ifdef ALLOW_SSE
// SSE routines available only with float sample type    

//////////////////////////////////////////////////////////////////////////////
//
// implementation of SSE optimized functions of class 'TDStretchSSE'
//
//////////////////////////////////////////////////////////////////////////////

#include "TDStretch.h"
#include <limits.h>

// these are declared in 'TDStretch.cpp'
extern int scanOffsets[4][24];

// Calculates cross correlation of two buffers
double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
{
    uint overlapLengthLocal = overlapLength;
    float corr;

    /*
    double corr;
    uint i;

    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
    corr = 0.0;
    for (i = 0; i < overlapLength / 8; i ++) 
    {
        corr += pV1[0] * pV2[0] +
                pV1[1] * pV2[1] +
                pV1[2] * pV2[2] +
                pV1[3] * pV2[3] +
                pV1[4] * pV2[4] +
                pV1[5] * pV2[5] +
                pV1[6] * pV2[6] +
                pV1[7] * pV2[7] +
                pV1[8] * pV2[8] +
                pV1[9] * pV2[9] +
                pV1[10] * pV2[10] +
                pV1[11] * pV2[11] +
                pV1[12] * pV2[12] +
                pV1[13] * pV2[13] +
                pV1[14] * pV2[14] +
                pV1[15] * pV2[15];

        pV1 += 16;
        pV2 += 16;
    }
    */

    _asm 
    {
        // Very important note: data in 'pV2' _must_ be aligned to 
        // 16-byte boundary!

        // give prefetch hints to CPU of what data are to be needed soonish
        // give more aggressive hints on pV1 as that changes while pV2 stays
        // same between runs
        prefetcht0 [pV1]
        prefetcht0 [pV2]
        prefetcht0 [pV1 + 32]

        mov     eax, dword ptr pV1
        mov     ebx, dword ptr pV2

        xorps   xmm0, xmm0

        mov     ecx, overlapLengthLocal
        shr     ecx, 3  // div by eight

    loop1:
        prefetcht0 [eax + 64]     // give a prefetch hint to CPU what data are to be needed soonish
        prefetcht0 [ebx + 32]     // give a prefetch hint to CPU what data are to be needed soonish
        movups  xmm1, [eax]
        mulps   xmm1, [ebx]
        addps   xmm0, xmm1

        movups  xmm2, [eax + 16]
        mulps   xmm2, [ebx + 16]
        addps   xmm0, xmm2

        prefetcht0 [eax + 96]     // give a prefetch hint to CPU what data are to be needed soonish
        prefetcht0 [ebx + 64]     // give a prefetch hint to CPU what data are to be needed soonish

        movups  xmm3, [eax + 32]
        mulps   xmm3, [ebx + 32]
        addps   xmm0, xmm3

        movups  xmm4, [eax + 48]
        mulps   xmm4, [ebx + 48]
        addps   xmm0, xmm4

        add     eax, 64
        add     ebx, 64

        dec     ecx
        jnz     loop1

        // add the four floats of xmm0 together and return the result. 

        movhlps xmm1, xmm0          // move 3 & 4 of xmm0 to 1 & 2 of xmm1
        addps   xmm1, xmm0
        movaps  xmm2, xmm1
        shufps  xmm2, xmm2, 0x01    // move 2 of xmm2 as 1 of xmm2
        addss   xmm2, xmm1
        movss   corr, xmm2
    }

    return (double)corr;
}


//////////////////////////////////////////////////////////////////////////////
//
// implementation of SSE optimized functions of class 'FIRFilter'
//
//////////////////////////////////////////////////////////////////////////////

#include "FIRFilter.h"

FIRFilterSSE::FIRFilterSSE() : FIRFilter()
{
    filterCoeffsUnalign = NULL;
}


FIRFilterSSE::~FIRFilterSSE()
{
    delete[] filterCoeffsUnalign;
}


// (overloaded) Calculates filter coefficients for SSE routine
void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
{
    uint i;
    float fDivider;

    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);

    // Scale the filter coefficients so that it won't be necessary to scale the filtering result
    // also rearrange coefficients suitably for 3DNow!
    // Ensure that filter coeffs array is aligned to 16-byte boundary
    delete[] filterCoeffsUnalign;
    filterCoeffsUnalign = new float[2 * newLength + 4];
    filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & -16);

    fDivider = (float)resultDivider;

    // rearrange the filter coefficients for mmx routines 
    for (i = 0; i < newLength; i ++)
    {
        filterCoeffsAlign[2 * i + 0] =
        filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
    }
}


// SSE-optimized version of the filter routine for stereo sound
uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
{
    int count = (numSamples - length) & -2;
    uint lengthLocal = length / 8;
    float *filterCoeffsLocal = filterCoeffsAlign;

    assert(count % 2 == 0);

    if (count < 2) return 0;

    /*
    double suml1, suml2;
    double sumr1, sumr2;
    uint i, j;

    for (j = 0; j < count; j += 2)
    {
        const float *ptr;
        const float *pFil;

        suml1 = sumr1 = 0.0;
        suml2 = sumr2 = 0.0;
        ptr = src;
        pFil = filterCoeffs;
        for (i = 0; i < lengthLocal; i ++) 
        {
            // unroll loop for efficiency.

            suml1 += ptr[0] * pFil[0] + 
                     ptr[2] * pFil[2] +
                     ptr[4] * pFil[4] +
                     ptr[6] * pFil[6];

            sumr1 += ptr[1] * pFil[1] + 
                     ptr[3] * pFil[3] +
                     ptr[5] * pFil[5] +
                     ptr[7] * pFil[7];

            suml2 += ptr[8] * pFil[0] + 
                     ptr[10] * pFil[2] +
                     ptr[12] * pFil[4] +
                     ptr[14] * pFil[6];

            sumr2 += ptr[9] * pFil[1] + 
                     ptr[11] * pFil[3] +
                     ptr[13] * pFil[5] +
                     ptr[15] * pFil[7];

            ptr += 16;
            pFil += 8;
        }
        dest[0] = (float)suml1;
        dest[1] = (float)sumr1;
        dest[2] = (float)suml2;
        dest[3] = (float)sumr2;

        src += 4;
        dest += 4;
    }
    */

    _asm
    {
        // Very important note: data in 'src' _must_ be aligned to 
        // 16-byte boundary!
        mov     edx, count
        mov     ebx, dword ptr src
        mov     eax, dword ptr dest
        shr     edx, 1

    loop1:
        // "outer loop" : during each round 2*2 output samples are calculated

        // give prefetch hints to CPU of what data are to be needed soonish
        prefetcht0 [ebx]
        prefetcht0 [filterCoeffsLocal]

        mov     esi, ebx
        mov     edi, filterCoeffsLocal
        xorps   xmm0, xmm0
        xorps   xmm1, xmm1
        mov     ecx, lengthLocal

    loop2:
        // "inner loop" : during each round eight FIR filter taps are evaluated for 2*2 samples
        prefetcht0 [esi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
        prefetcht0 [edi + 32]     // give a prefetch hint to CPU what data are to be needed soonish

        movups  xmm2, [esi]         // possibly unaligned load
        movups  xmm3, [esi + 8]     // possibly unaligned load
        mulps   xmm2, [edi]
        mulps   xmm3, [edi]
        addps   xmm0, xmm2
        addps   xmm1, xmm3

        movups  xmm4, [esi + 16]    // possibly unaligned load
        movups  xmm5, [esi + 24]    // possibly unaligned load
        mulps   xmm4, [edi + 16]
        mulps   xmm5, [edi + 16]
        addps   xmm0, xmm4
        addps   xmm1, xmm5

        prefetcht0 [esi + 64]     // give a prefetch hint to CPU what data are to be needed soonish
        prefetcht0 [edi + 64]     // give a prefetch hint to CPU what data are to be needed soonish

        movups  xmm6, [esi + 32]    // possibly unaligned load
        movups  xmm7, [esi + 40]    // possibly unaligned load
        mulps   xmm6, [edi + 32]
        mulps   xmm7, [edi + 32]
        addps   xmm0, xmm6
        addps   xmm1, xmm7

        movups  xmm4, [esi + 48]    // possibly unaligned load
        movups  xmm5, [esi + 56]    // possibly unaligned load
        mulps   xmm4, [edi + 48]
        mulps   xmm5, [edi + 48]
        addps   xmm0, xmm4
        addps   xmm1, xmm5

        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loop2

        // Now xmm0 and xmm1 both have a filtered 2-channel sample each, but we still need
        // to sum the two hi- and lo-floats of these registers together.

        movhlps xmm2, xmm0          // xmm2 = xmm2_3 xmm2_2 xmm0_3 xmm0_2
        movlhps xmm2, xmm1          // xmm2 = xmm1_1 xmm1_0 xmm0_3 xmm0_2
        shufps  xmm0, xmm1, 0xe4    // xmm0 = xmm1_3 xmm1_2 xmm0_1 xmm0_0
        addps   xmm0, xmm2

        movaps  [eax], xmm0
        add     ebx, 16
        add     eax, 16

        dec     edx
        jnz     loop1
    }

    return (uint)count;
}

#endif  // ALLOW_SSE