/*
    Copyright (C) 2005-2006 Paul Davis, John Rigg

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

    Author: Sampo Savolainen
    64-bit conversion: John Rigg

    $Id$
*/

#; Microsoft version of AVX sample processing functions

#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);

.globl x86_sse_avx_mix_buffers_with_gain
	.def	x86_sse_avx_mix_buffers_with_gain; .scl 2; .type 32; .endef

x86_sse_avx_mix_buffers_with_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes
#; %xmm3 float gain

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved

	#; move the gain to %xmm0 for convenience
	movss %xmm3, %xmm0

	#; if nframes == 0, go to end
	cmp $0, %r8
	je .MBWG_END

	#; Check for alignment
	movq %rcx, %rax
	andq $28, %rax #; mask alignment offset

	movq %rdx, %rbx
	andq $28, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .MBWG_NONALIGN #; if the buffers are not aligned with each other, calculate manually

	#; if we are aligned
	cmp $0, %rbx
	jz .MBWG_AVX

	#; Pre-loop, we need to run 1-7 frames "manually" without
	#; AVX instructions

.MBWG_PRELOOP:

	#; gain is already in %xmm0
	movss (%rdx), %xmm1
	mulss %xmm0, %xmm1
	addss (%rcx), %xmm1
	movss %xmm1, (%rcx)

	addq $4, %rcx #; dst++
	addq $4, %rdx #; src++
	decq %r8      #; nframes--
	jz .MBWG_END

	addq $4, %rbx

	cmp $32, %rbx #; test if we've reached 32 byte alignment
	jne .MBWG_PRELOOP

.MBWG_AVX:

	cmp $8, %r8       #; we know it's not zero, but if it's not >= 8,
	jl .MBWG_NONALIGN #; we jump straight to the "normal" code

	#; set up the gain buffer (gain is already in %xmm0)
	vshufps $0x00, %ymm0, %ymm0, %ymm0    #; spread single float value to the first 128 bits of ymm0 register
	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits

.MBWG_AVXLOOP:

	vmovaps (%rdx), %ymm1        #; source => ymm1
	vmulps  %ymm0,  %ymm1, %ymm2 #; apply gain to source
	vaddps  (%rcx), %ymm2, %ymm1 #; mix with destination
	vmovaps %ymm1,  (%rcx)       #; copy result to destination

	addq $32, %rcx #; dst+=8
	addq $32, %rdx #; src+=8

	subq $8, %r8 #; nframes-=8

	cmp $8, %r8
	jge .MBWG_AVXLOOP

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	cmp $0, %r8
	je .MBWG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the remaining 1-7 frames.

.MBWG_NONALIGN:
	#; not aligned!

	#; gain is already in %xmm0

.MBWG_NONALIGNLOOP:

	movss (%rdx), %xmm1
	mulss %xmm0, %xmm1
	addss (%rcx), %xmm1
	movss %xmm1, (%rcx)

	addq $4, %rcx
	addq $4, %rdx

	decq %r8
	jnz .MBWG_NONALIGNLOOP

.MBWG_END:

	popq %rbx

	#; return
	leave
	ret
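#; For reference, the routine above is equivalent to the following hedged C
#; sketch (illustrative only, not assembled; mix_buffers_with_gain_ref is a
#; made-up name for this comment):
#;
#;   void mix_buffers_with_gain_ref (float *dst, const float *src,
#;                                   unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++)
#;           dst[i] += src[i] * gain;  /* mix src into dst with gain */
#;   }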
#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);

.globl x86_sse_avx_mix_buffers_no_gain
	.def	x86_sse_avx_mix_buffers_no_gain; .scl 2; .type 32; .endef

x86_sse_avx_mix_buffers_no_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved

	#; the real function

	#; if nframes == 0, go to end
	cmp $0, %r8
	je .MBNG_END

	#; Check for alignment
	movq %rcx, %rax
	andq $28, %rax #; mask alignment offset

	movq %rdx, %rbx
	andq $28, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .MBNG_NONALIGN #; if the buffers are not aligned with each other, calculate manually

	cmp $0, %rbx
	je .MBNG_AVX #; aligned at 32 bytes, proceed to AVX

	#; Pre-loop, we need to run 1-7 frames "manually" without
	#; AVX instructions

.MBNG_PRELOOP:

	movss (%rdx), %xmm0
	addss (%rcx), %xmm0
	movss %xmm0, (%rcx)

	addq $4, %rcx #; dst++
	addq $4, %rdx #; src++
	decq %r8      #; nframes--
	jz .MBNG_END

	addq $4, %rbx #; one non-aligned float less
	cmp $32, %rbx #; test if we've reached 32 byte alignment
	jne .MBNG_PRELOOP

.MBNG_AVX:

	cmp $8, %r8       #; if there are frames left, but less than 8,
	jl .MBNG_NONALIGN #; we can't run AVX

.MBNG_AVXLOOP:

	vmovaps (%rdx), %ymm0        #; source => ymm0
	vaddps  (%rcx), %ymm0, %ymm1 #; mix with destination
	vmovaps %ymm1,  (%rcx)       #; copy result to destination

	addq $32, %rcx #; dst+=8
	addq $32, %rdx #; src+=8

	subq $8, %r8 #; nframes-=8

	cmp $8, %r8
	jge .MBNG_AVXLOOP

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	cmp $0, %r8
	je .MBNG_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the remaining 1-7 frames.

.MBNG_NONALIGN:
	#; not aligned!

	movss (%rdx), %xmm0 #; src => xmm0
	addss (%rcx), %xmm0 #; xmm0 += dst
	movss %xmm0, (%rcx) #; xmm0 => dst

	addq $4, %rcx
	addq $4, %rdx

	decq %r8
	jnz .MBNG_NONALIGN

.MBNG_END:

	popq %rbx

	#; return
	leave
	ret
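#; For reference, the routine above is equivalent to the following hedged C
#; sketch (illustrative only, not assembled; mix_buffers_no_gain_ref is a
#; made-up name):
#;
#;   void mix_buffers_no_gain_ref (float *dst, const float *src, unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++)
#;           dst[i] += src[i];  /* accumulate src into dst */
#;   }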
#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);

.globl x86_sse_avx_copy_vector
	.def	x86_sse_avx_copy_vector; .scl 2; .type 32; .endef

x86_sse_avx_copy_vector:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

	pushq %rbp
	movq %rsp, %rbp

	#; save the registers
	pushq %rbx #; must be preserved

	#; the real function

	#; if nframes == 0, go to end
	cmp $0, %r8
	je .CB_END

	#; Check for alignment
	movq %rcx, %rax
	andq $28, %rax #; mask alignment offset

	movq %rdx, %rbx
	andq $28, %rbx #; mask alignment offset

	cmp %rax, %rbx
	jne .CB_NONALIGN #; if the buffers are not aligned with each other, calculate manually

	cmp $0, %rbx
	je .CB_AVX #; aligned at 32 bytes, proceed to AVX

	#; Pre-loop, we need to run 1-7 frames "manually" without
	#; AVX instructions

.CB_PRELOOP:

	movss (%rdx), %xmm0
	movss %xmm0, (%rcx)

	addq $4, %rcx #; dst++
	addq $4, %rdx #; src++
	decq %r8      #; nframes--
	jz .CB_END

	addq $4, %rbx #; one non-aligned float less
	cmp $32, %rbx #; test if we've reached 32 byte alignment
	jne .CB_PRELOOP

.CB_AVX:

	cmp $8, %r8     #; if there are frames left, but less than 8,
	jl .CB_NONALIGN #; we can't run AVX

.CB_AVXLOOP:

	vmovaps (%rdx), %ymm0  #; source => ymm0
	vmovaps %ymm0,  (%rcx) #; copy result to destination

	addq $32, %rcx #; dst+=8
	addq $32, %rdx #; src+=8

	subq $8, %r8 #; nframes-=8

	cmp $8, %r8
	jge .CB_AVXLOOP

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	cmp $0, %r8
	je .CB_END

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the remaining 1-7 frames.

.CB_NONALIGN:
	#; not aligned!

	movss (%rdx), %xmm0 #; src => xmm0
	movss %xmm0, (%rcx) #; xmm0 => dst

	addq $4, %rcx
	addq $4, %rdx

	decq %r8
	jnz .CB_NONALIGN

.CB_END:

	popq %rbx

	#; return
	leave
	ret
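#; For reference, the routine above is equivalent to the following hedged C
#; sketch (illustrative only, not assembled; copy_vector_ref is a made-up
#; name):
#;
#;   void copy_vector_ref (float *dst, const float *src, unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++)
#;           dst[i] = src[i];  /* plain copy, 8 floats per AVX iteration */
#;   }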
#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);

.globl x86_sse_avx_apply_gain_to_buffer
	.def	x86_sse_avx_apply_gain_to_buffer; .scl 2; .type 32; .endef

x86_sse_avx_apply_gain_to_buffer:

#; due to Microsoft calling convention
#; %rcx float *buf
#; %rdx unsigned int nframes
#; %xmm2 float gain

	pushq %rbp
	movq %rsp, %rbp

	#; move the gain to %xmm0 for convenience
	movss %xmm2, %xmm0

	#; the real function

	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .AG_END

	#; Check for alignment
	movq %rcx, %r8 #; buf => %r8
	andq $28, %r8  #; check alignment with mask 11100
	jz .AG_AVX     #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-7 times, doing scalar single-precision multiplies,
	#; so we reach a 32 byte aligned "buf" (=%rcx) value

.AGLP_START:

	#; Load the next value from the buffer into %xmm1
	movss (%rcx), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rcx)

	#; increment buffer, decrement counter
	addq $4, %rcx #; buf++;

	decq %rdx  #; nframes--
	jz .AG_END #; if we run out of frames, we go to the end

	addq $4, %r8 #; one non-aligned float less
	cmp $32, %r8 #; test if we've reached 32 byte alignment
	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over

.AG_AVX:

	#; We have reached the 32 byte aligned "buf" (%rcx) value
	#; use AVX instructions

	#; Figure out how many loops we should do
	movq %rdx, %rax #; copy remaining nframes to %rax for division

	shr $3, %rax #; unsigned divide by 8
	#; %rax = AVX iterations
	cmp $0, %rax
	je .AGPOST_START

	#; set up the gain buffer (gain is already in %xmm0)
	vshufps $0x00, %ymm0, %ymm0, %ymm0    #; spread single float value to the first 128 bits of ymm0 register
	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits

.AGLP_AVX:

	vmovaps (%rcx), %ymm1
	vmulps %ymm0, %ymm1, %ymm2
	vmovaps %ymm2, (%rcx)

	addq $32, %rcx #; buf+=8
	subq $8, %rdx  #; nframes-=8

	decq %rax
	jnz .AGLP_AVX

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rdx

	cmpq $0, %rdx
	jz .AG_END

.AGPOST_START:

	movss (%rcx), %xmm1
	mulss %xmm0, %xmm1
	movss %xmm1, (%rcx)

	#; increment buffer, decrement counter
	addq $4, %rcx #; buf++;

	decq %rdx #; nframes--
	jnz .AGPOST_START #; loop while frames remain

.AG_END:

	#; return
	leave
	ret

#; end proc
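#; For reference, the routine above is equivalent to the following hedged C
#; sketch (illustrative only, not assembled; apply_gain_to_buffer_ref is a
#; made-up name):
#;
#;   void apply_gain_to_buffer_ref (float *buf, unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++)
#;           buf[i] *= gain;  /* scale each sample in place */
#;   }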
#; float x86_sse_avx_compute_peak(float *buf, long nframes, float current);

.globl x86_sse_avx_compute_peak
	.def	x86_sse_avx_compute_peak; .scl 2; .type 32; .endef

x86_sse_avx_compute_peak:

#; due to Microsoft calling convention
#; %rcx float *buf
#; %rdx unsigned int nframes
#; %xmm2 float current

	pushq %rbp
	movq %rsp, %rbp

	#; move current max to %xmm0 for convenience
	movss %xmm2, %xmm0

	#; if nframes == 0, go to end
	cmp $0, %rdx
	je .CP_END

	#; create the "abs" mask in %xmm3
	#; it will be used to discard the sign bit
	pushq $2147483647
	movss (%rsp), %xmm3
	addq $8, %rsp

	#; Check for alignment
	movq %rcx, %r8 #; buf => %r8
	andq $28, %r8  #; mask alignment offset
	jz .CP_AVX     #; if buffer IS aligned

	#; PRE-LOOP
	#; we iterate 1-7 times, doing scalar single-precision comparisons,
	#; so we reach a 32 byte aligned "buf" (=%rcx) value

.LP_START:

	#; Load the next value from the buffer
	movss (%rcx), %xmm1
	andps %xmm3, %xmm1 #; mask out sign bit
	maxss %xmm1, %xmm0

	#; increment buffer, decrement counter
	addq $4, %rcx #; buf++;

	decq %rdx  #; nframes--
	jz .CP_END #; if we run out of frames, we go to the end

	addq $4, %r8 #; one non-aligned float less
	cmp $32, %r8
	jne .LP_START #; if more non-aligned frames exist, we do a do-over

.CP_AVX:

	#; We have reached the 32 byte aligned "buf" (%rcx) value

	#; Figure out how many loops we should do
	movq %rdx, %rax #; copy remaining nframes to %rax for division
	shr $3, %rax    #; unsigned divide by 8
	jz .POST_START

	#; %rax = AVX iterations

	#; current maximum is in %xmm0, but we need to broadcast it to the whole ymm0 register..
	vshufps $0x00, %ymm0, %ymm0, %ymm0    #; spread single float value to all 128 bits of the xmm0 register
	vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; extend the first 128 bits of ymm0 register to higher 128 bits

	#; broadcast the sign mask to the whole ymm3 register
	vshufps $0x00, %ymm3, %ymm3, %ymm3    #; spread single float value to all 128 bits of the xmm3 register
	vperm2f128 $0x00, %ymm3, %ymm3, %ymm3 #; extend the first 128 bits of ymm3 register to higher 128 bits

.LP_AVX:

	vmovaps (%rcx), %ymm1
	vandps %ymm3, %ymm1, %ymm1 #; mask out sign bit
	vmaxps %ymm1, %ymm0, %ymm0

	addq $32, %rcx #; buf+=8
	subq $8, %rdx  #; nframes-=8

	decq %rax
	jnz .LP_AVX

	#; Calculate the maximum value contained in the 8 floats in %ymm0
	vshufps $0x4e, %ymm0, %ymm0, %ymm1    #; shuffle left & right pairs (1234 => 3412) in each 128 bit half
	vmaxps %ymm1, %ymm0, %ymm0            #; pairwise maximum: at most 4 unique values remain
	vshufps $0xb1, %ymm0, %ymm0, %ymm1    #; shuffle the floats inside pairs (1234 => 2143) in each 128 bit half
	vmaxps %ymm1, %ymm0, %ymm0            #; pairwise maximum: at most 2 unique values remain
	vperm2f128 $0x01, %ymm0, %ymm0, %ymm1 #; swap the 128 bit halves
	vmaxps %ymm1, %ymm0, %ymm0            #; now all 8 elements hold the maximum

	#; now every float in %ymm0 is the same value, the current maximum value

	#; Next we need to post-process all remaining frames
	#; the remaining frame count is in %rdx

	#; zero upper 128 bits of all ymm registers to proceed with SSE operations without penalties
	vzeroupper

	#; if no remaining frames, jump to the end
	cmp $0, %rdx
	je .CP_END

.POST_START:

	movss (%rcx), %xmm1
	andps %xmm3, %xmm1 #; mask out sign bit
	maxss %xmm1, %xmm0

	addq $4, %rcx #; buf++;
	decq %rdx     #; nframes--;
	jnz .POST_START

.CP_END:

	#; return value is in xmm0

	#; return
	leave
	ret

#; end proc
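#; For reference, the routine above is equivalent to the following hedged C
#; sketch (illustrative only, not assembled; compute_peak_ref is a made-up
#; name, and fabsf comes from <math.h>):
#;
#;   float compute_peak_ref (const float *buf, unsigned int nframes, float current)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           float a = fabsf (buf[i]);  /* same effect as masking out the sign bit */
#;           if (a > current)
#;               current = a;
#;       }
#;       return current;  /* returned in %xmm0 */
#;   }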