// libs/soundtouch/3dnow_win.cpp

////////////////////////////////////////////////////////////////////////////////
///
/// Win32 version of the AMD 3DNow! optimized routines for AMD K6-2/Athlon 
/// processors. All 3DNow! optimized functions have been gathered into this
/// single source code file, regardless of their class or original source code 
/// file, in order to ease porting the library to other compilers and processor 
/// platforms.
///
/// By the way, the performance gain depends heavily on the CPU generation: on 
/// K6-2 these routines provided a speed-up of as much as 2.4 times, while on 
/// Athlon the difference from the original routines stayed at an unremarkable 
/// 8%! Such a small improvement on Athlon is because 3DNow! can perform only 
/// two operations in parallel, and evidently the Athlon FPU also does a very 
/// good job with the standard C floating point routines! Here these routines 
/// are anyway, although it might not be worth the effort to convert them to 
/// the GCC platform, for Athlon CPUs at least. The situation is different 
/// regarding the SSE optimizations though, thanks to the four parallel 
/// operations of SSE that already make a difference.
/// 
/// This file is to be compiled on the Windows platform with the Microsoft 
/// Visual C++ compiler. Please see '3dnow_gcc.cpp' for the gcc compiler 
/// version for all GNU platforms (if that file is supplied).
///
/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
/// 6.0 processor pack" update to support 3DNow! instruction set. The update is 
/// available for download at Microsoft Developers Network, see here:
/// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
///
/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and 
/// perform a search with keywords "processor pack".
///
/// Author        : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai @ iki.fi
/// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
///
////////////////////////////////////////////////////////////////////////////////
//
// Last changed  : $Date$
// File revision : $Revision$
//
// $Id$
//
////////////////////////////////////////////////////////////////////////////////
//
// License :
//
//  SoundTouch audio processing library
//  Copyright (c) Olli Parviainen
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU Lesser General Public
//  License as published by the Free Software Foundation; either
//  version 2.1 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  Lesser General Public License for more details.
//
//  You should have received a copy of the GNU Lesser General Public
//  License along with this library; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
////////////////////////////////////////////////////////////////////////////////

#include "cpu_detect.h"
#include "STTypes.h"

#ifndef WIN32
#error "wrong platform - this source code file is exclusively for Win32 platform"
#endif

using namespace soundtouch;

#ifdef ALLOW_3DNOW
// 3DNow! routines available only with float sample type    

//////////////////////////////////////////////////////////////////////////////
//
// implementation of 3DNow! optimized functions of class 'TDStretch3DNow'
//
//////////////////////////////////////////////////////////////////////////////

#include "TDStretch.h"
#include <limits.h>

// External offset table; according to the original note it is defined in
// 'TDStretch.cpp'. It is only declared here and is not referenced by the
// routines in this section of the file.
extern int scanOffsets[4][24];


// Calculates cross correlation of two buffers
double TDStretch3DNow::calcCrossCorrStereo(const float *pV1, const float *pV2) const
{
    uint overlapLengthLocal = overlapLength;
    float corr;

    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
    /*
    c-pseudocode:

        corr = 0;
        for (i = 0; i < overlapLength / 4; i ++)
        {
            corr += pV1[0] * pV2[0];
                    pV1[1] * pV2[1];
                    pV1[2] * pV2[2];
                    pV1[3] * pV2[3];
                    pV1[4] * pV2[4];
                    pV1[5] * pV2[5];
                    pV1[6] * pV2[6];
                    pV1[7] * pV2[7];

            pV1 += 8;
            pV2 += 8;
        }
    */

    _asm 
    {
        // give prefetch hints to CPU of what data are to be needed soonish.
        // give more aggressive hints on pV1 as that changes more between different calls 
        // while pV2 stays the same.
        prefetch [pV1]
        prefetch [pV2]
        prefetch [pV1 + 32]

        mov     eax, dword ptr pV2
        mov     ebx, dword ptr pV1

        pxor    mm0, mm0

        mov     ecx, overlapLengthLocal
        shr     ecx, 2  // div by four

    loop1:
        movq    mm1, [eax]
        prefetch [eax + 32]     // give a prefetch hint to CPU what data are to be needed soonish
        pfmul   mm1, [ebx]
        prefetch [ebx + 64]     // give a prefetch hint to CPU what data are to be needed soonish

        movq    mm2, [eax + 8]
        pfadd   mm0, mm1
        pfmul   mm2, [ebx + 8]

        movq    mm3, [eax + 16]
        pfadd   mm0, mm2
        pfmul   mm3, [ebx + 16]

        movq    mm4, [eax + 24]
        pfadd   mm0, mm3
        pfmul   mm4, [ebx + 24]

        add     eax, 32
        pfadd   mm0, mm4
        add     ebx, 32

        dec     ecx
        jnz     loop1

        // add halfs of mm0 together and return the result. 
        // note: mm1 is used as a dummy parameter only, we actually don't care about it's value
        pfacc   mm0, mm1
        movd    corr, mm0
        femms
    }

    return corr;
}


//////////////////////////////////////////////////////////////////////////////
//
// implementation of 3DNow! optimized functions of class 'FIRFilter'
//
//////////////////////////////////////////////////////////////////////////////

#include "FIRFilter.h"

// Constructs the filter with no coefficient storage allocated yet.
//
// Both coefficient pointers start out as NULL: 'filterCoeffsUnalign' so that
// the destructor and 'setCoefficients' can safely 'delete[]' it, and
// 'filterCoeffsAlign' so that it never holds an indeterminate value before
// 'setCoefficients' has been called.
FIRFilter3DNow::FIRFilter3DNow() : FIRFilter()
{
    filterCoeffsUnalign = NULL;
    filterCoeffsAlign = NULL;
}


// Releases the coefficient storage owned through 'filterCoeffsUnalign'.
FIRFilter3DNow::~FIRFilter3DNow()
{
    // 'delete[]' on a null pointer is a no-op, so no explicit check is needed
    // for a filter whose storage was never allocated.
    delete[] this->filterCoeffsUnalign;
}


// (overloaded) Calculates filter coefficients for 3DNow! routine
//
// Forwards the arguments to FIRFilter::setCoefficients first, then builds a
// private copy of the coefficients for the 3DNow! filter routine: every
// coefficient is divided by 'resultDivider' and stored twice in a row, so the
// copy holds '2 * newLength' floats starting at a 16-byte aligned address.
//
// coeffs           : source coefficients, at least 'newLength' floats
// newLength        : number of coefficients in 'coeffs'
// uResultDivFactor : passed through unchanged to FIRFilter::setCoefficients
void FIRFilter3DNow::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
{
    uint i;
    float fDivider;

    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);

    // Release the previous storage and clear both pointers right away, so that
    // the object never holds dangling pointers if the allocation below throws
    // (the destructor would otherwise 'delete[]' the same block a second time).
    delete[] filterCoeffsUnalign;
    filterCoeffsUnalign = NULL;
    filterCoeffsAlign = NULL;

    // Scale the filter coefficients so that it won't be necessary to scale the filtering result
    // also rearrange coefficients suitably for 3DNow!
    // Ensure that filter coeffs array is aligned to 16-byte boundary: the 4 extra
    // floats (16 bytes) leave room for rounding the start address upwards.
    filterCoeffsUnalign = new float[2 * newLength + 4];
    // The pointer is converted through 'uint', as in the rest of this 32-bit-only file.
    filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & -16);

    fDivider = (float)resultDivider;

    // rearrange the filter coefficients for the 3DNow! routine: each scaled 
    // coefficient is written to two adjacent slots
    for (i = 0; i < newLength; i ++)
    {
        const float scaled = coeffs[i] / fDivider;

        filterCoeffsAlign[2 * i + 0] = scaled;
        filterCoeffsAlign[2 * i + 1] = scaled;
    }
}


// 3DNow!-optimized version of the filter routine for stereo sound
//
// Filters interleaved two-channel samples from 'src' into 'dest' using the
// duplicated, pre-scaled coefficient array that 'setCoefficients' stored in 
// 'filterCoeffsAlign'.
//
// dest       : output buffer, receives 2 floats per produced sample pair
// src        : input buffer of interleaved sample pairs
// numSamples : number of sample pairs available in 'src'
//
// Returns the number of sample pairs written to 'dest', which is
// 'numSamples - length' rounded down to an even number. Returns 0 without
// touching the buffers when no complete round of work can be done (fewer
// than 'length' input sample pairs, an output count that rounds down to zero,
// or a filter shorter than 4 taps).
uint FIRFilter3DNow::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
{
    float *filterCoeffsLocal = filterCoeffsAlign;
    uint count;
    uint lengthLocal = length / 4;

    assert(length != 0);

    // Both assembly loops below are bottom-tested ('dec' / 'jnz'), so entering
    // either of them with a zero trip count would wrap the counter around and 
    // run far beyond the buffers. 'numSamples - length' would likewise wrap
    // around if there were fewer input samples than filter taps.
    if (lengthLocal == 0 || numSamples < length) return 0;

    count = (numSamples - length) & -2;
    if (count == 0) return 0;

    assert(count % 2 == 0);

    /* c-pseudocode of the assembly routine below
       (the sums are accumulated in single precision):

    float suml1, suml2;
    float sumr1, sumr2;
    uint i, j;

    for (j = 0; j < count; j += 2)
    {
        const float *ptr;
        const float *coef;

        suml1 = sumr1 = 0;
        suml2 = sumr2 = 0;
        ptr = src;
        coef = filterCoeffsLocal;
        for (i = 0; i < lengthLocal; i ++) 
        {
            // four filter taps per round, for two consecutive output samples

            suml1 += ptr[0] * coef[0] + 
                     ptr[2] * coef[2] +
                     ptr[4] * coef[4] +
                     ptr[6] * coef[6];

            sumr1 += ptr[1] * coef[1] + 
                     ptr[3] * coef[3] +
                     ptr[5] * coef[5] +
                     ptr[7] * coef[7];

            suml2 += ptr[2] * coef[0] + 
                     ptr[4] * coef[2] +
                     ptr[6] * coef[4] +
                     ptr[8] * coef[6];

            sumr2 += ptr[3] * coef[1] + 
                     ptr[5] * coef[3] +
                     ptr[7] * coef[5] +
                     ptr[9] * coef[7];

            ptr += 8;
            coef += 8;
        }
        dest[0] = suml1;
        dest[1] = sumr1;
        dest[2] = suml2;
        dest[3] = sumr2;

        src += 4;
        dest += 4;
    }

    */
    _asm
    {
        // eax = dest, ebx = src (32-bit pointers), edx = count / 2 outer rounds
        mov     eax, dword ptr dest
        mov     ebx, dword ptr src
        mov     edx, count
        shr     edx, 1

    loop1:
        // "outer loop" : during each round 2*2 output samples are calculated
        // NOTE(review): 'filterCoeffsLocal' names the pointer variable itself here, so 
        // the second hint appears to target the variable's own storage rather than the 
        // coefficient array - confirm against the MSVC inline assembler operand rules.
        prefetch  [ebx]                 // give a prefetch hint to CPU what data are to be needed soonish
        prefetch  [filterCoeffsLocal]   // give a prefetch hint to CPU what data are to be needed soonish

        // esi = running input pointer, edi = running coefficient pointer,
        // mm0 / mm1 = accumulators for the first / second output sample pair
        mov     esi, ebx
        mov     edi, filterCoeffsLocal
        pxor    mm0, mm0
        pxor    mm1, mm1
        mov     ecx, lengthLocal

    loop2:
        // "inner loop" : during each round four FIR filter taps are evaluated for 2*2 output samples
        movq    mm2, [edi]
        movq    mm3, mm2
        prefetch  [edi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
        pfmul   mm2, [esi]
        prefetch  [esi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
        pfmul   mm3, [esi + 8]

        movq    mm4, [edi + 8]
        movq    mm5, mm4
        pfadd   mm0, mm2
        pfmul   mm4, [esi + 8]
        pfadd   mm1, mm3
        pfmul   mm5, [esi + 16]

        movq    mm2, [edi + 16]
        movq    mm6, mm2
        pfadd   mm0, mm4
        pfmul   mm2, [esi + 16]
        pfadd   mm1, mm5
        pfmul   mm6, [esi + 24]

        movq    mm3, [edi + 24]
        movq    mm7, mm3
        pfadd   mm0, mm2
        pfmul   mm3, [esi + 24]
        pfadd   mm1, mm6
        pfmul   mm7, [esi + 32]
        add     esi, 32
        pfadd   mm0, mm3
        add     edi, 32
        pfadd   mm1, mm7

        dec     ecx
        jnz     loop2

        // store the two finished output sample pairs and step both buffers 
        // forward by 16 bytes (4 floats)
        movq    [eax], mm0
        add     ebx, 16
        movq    [eax + 8], mm1
        add     eax, 16

        dec     edx
        jnz     loop1

        femms
    }

    return count;
}


#endif  // ALLOW_3DNOW