/*
 Copyright (C) 2005-2006 Paul Davis, John Rigg

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 Author: Sampo Savolainen
 64-bit conversion: John Rigg

 $Id$
*/

#; Microsoft version of SSE sample processing functions

#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);

.globl x86_sse_mix_buffers_with_gain
        .def    x86_sse_mix_buffers_with_gain; .scl 2; .type 32; .endef

x86_sse_mix_buffers_with_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes
#; %xmm3 float gain

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *dst
#; %rsi float *src
#; %rdx unsigned int nframes
#; %xmm0 float gain

        pushq %rbp
        movq %rsp, %rbp

        #; save the registers
        pushq %rbx #; must be preserved
        pushq %rcx
        pushq %rdx
        pushq %rdi #; must be preserved
        pushq %rsi #; must be preserved

        #; to keep algorithms universal - move input params into Linux specific registers
        movq %rcx, %rdi
        movq %rdx, %rsi
        movq %r8, %rdx
        movss %xmm3, %xmm0

        #; if nframes == 0, go to end
        cmp $0, %rdx
        je .MBWG_END

        #; Check for alignment

        movq %rdi, %rax
        andq $12, %rax #; mask alignment offset

        movq %rsi, %rbx
        andq $12, %rbx #; mask alignment offset

        cmp %rax, %rbx
        jne .MBWG_NONALIGN #; if not aligned, calculate manually

        #; if we are aligned
        cmp $0, %rbx
        jz .MBWG_SSE

        #; Pre-loop, we need to run 1-3 frames "manually" without
        #; SSE instructions

.MBWG_PRELOOP:

        #; gain is already in %xmm0
        movss (%rsi), %xmm1
        mulss %xmm0, %xmm1
        addss (%rdi), %xmm1
        movss %xmm1, (%rdi)

        addq $4, %rdi #; dst++
        addq $4, %rsi #; src++
        decq %rdx     #; nframes--
        jz .MBWG_END

        addq $4, %rbx

        cmp $16, %rbx #; test if we've reached 16 byte alignment
        jne .MBWG_PRELOOP

.MBWG_SSE:

        cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
        jnge .MBWG_NONALIGN #; we jump straight to the "normal" code

        #; gain is already in %xmm0
        shufps $0x00, %xmm0, %xmm0

.MBWG_SSELOOP:

        movaps (%rsi), %xmm1 #; source => xmm1
        mulps  %xmm0, %xmm1  #; apply gain to source
        addps  (%rdi), %xmm1 #; mix with destination
        movaps %xmm1, (%rdi) #; copy result to destination

        addq $16, %rdi #; dst+=4
        addq $16, %rsi #; src+=4

        subq $4, %rdx #; nframes-=4

        cmp $4, %rdx
        jge .MBWG_SSELOOP

        cmp $0, %rdx
        je .MBWG_END

        #; if there are remaining frames, the nonalign code will do nicely
        #; for the rest 1-3 frames.

.MBWG_NONALIGN:
        #; not aligned!

        #; gain is already in %xmm0

.MBWG_NONALIGNLOOP:

        movss (%rsi), %xmm1
        mulss %xmm0, %xmm1
        addss (%rdi), %xmm1
        movss %xmm1, (%rdi)

        addq $4, %rdi
        addq $4, %rsi

        decq %rdx
        jnz .MBWG_NONALIGNLOOP

.MBWG_END:

        popq %rsi
        popq %rdi
        popq %rdx
        popq %rcx
        popq %rbx

        #; return
        leave
        ret
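
#; Reference sketch (comments only, not part of the build): the routine above
#; is intended to behave like the following scalar C loop. The name
#; mix_buffers_with_gain_ref is purely illustrative.
#;
#;   void mix_buffers_with_gain_ref (float *dst, const float *src,
#;                                   unsigned int nframes, float gain)
#;   {
#;           for (unsigned int i = 0; i < nframes; i++) {
#;                   dst[i] += src[i] * gain;
#;           }
#;   }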

#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);

.globl x86_sse_mix_buffers_no_gain
        .def    x86_sse_mix_buffers_no_gain; .scl 2; .type 32; .endef

x86_sse_mix_buffers_no_gain:

#; due to Microsoft calling convention
#; %rcx float *dst
#; %rdx float *src
#; %r8 unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *dst
#; %rsi float *src
#; %rdx unsigned int nframes

        pushq %rbp
        movq %rsp, %rbp

        #; save the registers
        pushq %rbx #; must be preserved
        pushq %rcx
        pushq %rdx
        pushq %rdi #; must be preserved
        pushq %rsi #; must be preserved

        #; to keep algorithms universal - move input params into Linux specific registers
        movq %rcx, %rdi
        movq %rdx, %rsi
        movq %r8, %rdx

        #; the real function

        #; if nframes == 0, go to end
        cmp $0, %r8
        je .MBNG_END

        #; Check for alignment

        movq %rdi, %rax
        andq $12, %rax #; mask alignment offset

        movq %rsi, %rbx
        andq $12, %rbx #; mask alignment offset

        cmp %rax, %rbx
        jne .MBNG_NONALIGN #; if not aligned, calculate manually

        cmp $0, %rbx
        je .MBNG_SSE

        #; Pre-loop, we need to run 1-3 frames "manually" without
        #; SSE instructions

.MBNG_PRELOOP:

        movss (%rsi), %xmm0
        addss (%rdi), %xmm0
        movss %xmm0, (%rdi)

        addq $4, %rdi #; dst++
        addq $4, %rsi #; src++
        decq %rdx     #; nframes--
        jz .MBNG_END

        addq $4, %rbx

        cmp $16, %rbx #; test if we've reached 16 byte alignment
        jne .MBNG_PRELOOP

.MBNG_SSE:

        cmp $4, %rdx #; if there are frames left, but less than 4
        jnge .MBNG_NONALIGN #; we can't run SSE

.MBNG_SSELOOP:

        movaps (%rsi), %xmm0 #; source => xmm0
        addps  (%rdi), %xmm0 #; mix with destination
        movaps %xmm0, (%rdi) #; copy result to destination

        addq $16, %rdi #; dst+=4
        addq $16, %rsi #; src+=4

        subq $4, %rdx #; nframes-=4

        cmp $4, %rdx
        jge .MBNG_SSELOOP

        cmp $0, %rdx
        je .MBNG_END

        #; if there are remaining frames, the nonalign code will do nicely
        #; for the rest 1-3 frames.

.MBNG_NONALIGN:
        #; not aligned!

        movss (%rsi), %xmm0 #; src => xmm0
        addss (%rdi), %xmm0 #; xmm0 += dst
        movss %xmm0, (%rdi) #; xmm0 => dst

        addq $4, %rdi
        addq $4, %rsi

        decq %rdx
        jnz .MBNG_NONALIGN

.MBNG_END:

        popq %rsi
        popq %rdi
        popq %rdx
        popq %rcx
        popq %rbx

        #; return
        leave
        ret
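
#; Reference sketch (comments only, not part of the build): the routine above
#; should be equivalent to this scalar C loop; mix_buffers_no_gain_ref is
#; purely illustrative.
#;
#;   void mix_buffers_no_gain_ref (float *dst, const float *src,
#;                                 unsigned int nframes)
#;   {
#;           for (unsigned int i = 0; i < nframes; i++) {
#;                   dst[i] += src[i];
#;           }
#;   }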

#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);

.globl x86_sse_apply_gain_to_buffer
        .def    x86_sse_apply_gain_to_buffer; .scl 2; .type 32; .endef

x86_sse_apply_gain_to_buffer:

#; due to Microsoft calling convention
#; %rcx  float        *buf    32(%rbp)
#; %rdx  unsigned int nframes
#; %xmm2 float        gain
#; %xmm1 float        buf[0]

#; due to System V AMD64 (Linux) calling convention
#; %rdi  float        *buf    32(%rbp)
#; %rsi  unsigned int nframes
#; %xmm0 float        gain
#; %xmm1 float        buf[0]

        pushq %rbp
        movq %rsp, %rbp

        #; save the registers
        pushq %rcx
        pushq %rdi #; must be preserved
        pushq %rsi #; must be preserved

        #; to keep algorithms universal - move input params into Linux specific registers
        movq %rcx, %rdi
        movq %rdx, %rsi
        movss %xmm2, %xmm0

        #; the real function

        #; if nframes == 0, go to end
        movq %rsi, %rcx #; nframes
        cmp $0, %rcx
        je .AG_END

        #; set up the gain buffer (gain is already in %xmm0)
        shufps $0x00, %xmm0, %xmm0

        #; Check for alignment

        movq %rdi, %rdx #; buf => %rdx
        andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
        jz .AG_SSE #; if buffer IS aligned

        #; PRE-LOOP
        #; we iterate 1-3 times, doing a scalar float multiply per frame
        #; so we reach a 16 byte aligned "buf" (=%rdi) value

.AGLP_START:

        #; Load next value from the buffer into %xmm1
        movss (%rdi), %xmm1
        mulss %xmm0, %xmm1
        movss %xmm1, (%rdi)

        #; increment buffer, decrement counter
        addq $4, %rdi #; buf++;

        decq %rcx  #; nframes--
        jz .AG_END #; if we run out of frames, we go to the end

        addq $4, %rdx #; one non-aligned dword less
        cmp $16, %rdx
        jne .AGLP_START #; if more non-aligned frames exist, we do a do-over

.AG_SSE:

        #; We have reached the 16 byte aligned "buf" ("rdi") value

        #; Figure out how many loops we should do
        movq %rcx, %rax #; copy remaining nframes to %rax for division

        shr $2, %rax #; unsigned divide by 4
        #; %rax = SSE iterations
        cmp $0, %rax
        je .AGPOST_START

.AGLP_SSE:

        movaps (%rdi), %xmm1
        mulps %xmm0, %xmm1
        movaps %xmm1, (%rdi)

        addq $16, %rdi #; buf + 4
        subq $4, %rcx  #; nframes-=4

        decq %rax
        jnz .AGLP_SSE

        #; Next we need to post-process all remaining frames
        #; the remaining frame count is in %rcx

        andq $3, %rcx #; nframes % 4
        jz .AG_END

.AGPOST_START:

        movss (%rdi), %xmm1
        mulss %xmm0, %xmm1
        movss %xmm1, (%rdi)

        #; increment buffer, decrement counter
        addq $4, %rdi #; buf++;

        decq %rcx #; nframes--
        jnz .AGPOST_START #; if we run out of frames, we go to the end

.AG_END:

        popq %rsi
        popq %rdi
        popq %rcx

        #; return
        leave
        ret

#; end proc
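
#; Reference sketch (comments only, not part of the build): the routine above
#; is meant to match this scalar C loop; apply_gain_to_buffer_ref is purely
#; illustrative.
#;
#;   void apply_gain_to_buffer_ref (float *buf, unsigned int nframes, float gain)
#;   {
#;           for (unsigned int i = 0; i < nframes; i++) {
#;                   buf[i] *= gain;
#;           }
#;   }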

#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)

.globl x86_sse_apply_gain_vector
        .def    x86_sse_apply_gain_vector; .scl 2; .type 32; .endef

x86_sse_apply_gain_vector:

#; due to Microsoft calling convention
#; %rcx float *buf
#; %rdx float *gain_vector
#; %r8  unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *buf
#; %rsi float *gain_vector
#; %rdx unsigned int nframes

        pushq %rbp
        movq %rsp, %rbp

        #; save the registers
        pushq %rbx #; must be preserved
        pushq %rcx
        pushq %rdx
        pushq %rdi #; must be preserved
        pushq %rsi #; must be preserved

        #; to keep algorithms universal - move input params into Linux specific registers
        movq %rcx, %rdi
        movq %rdx, %rsi
        movq %r8, %rdx

        #; if nframes == 0 go to end
        cmp $0, %rdx
        je .AGA_END

        #; Check alignment
        movq %rdi, %rax
        andq $12, %rax

        movq %rsi, %rbx
        andq $12, %rbx

        cmp %rax, %rbx
        jne .AGA_ENDLOOP

        cmp $0, %rax
        jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop

#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
.AGA_ALIGNLOOP:

        movss (%rdi), %xmm0 #; buf => xmm0
        movss (%rsi), %xmm1 #; gain value => xmm1
        mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
        movss %xmm0, (%rdi) #; signal with gain => buf

        decq %rdx
        jz .AGA_END

        addq $4, %rdi #; buf++
        addq $4, %rsi #; gain_vector++

        addq $4, %rax
        cmp $16, %rax
        jne .AGA_ALIGNLOOP

#; There are frames left for sure, as that is checked in the beginning
#; and within the previous loop. BUT, there might be less than 4 frames
#; to process

.AGA_SSE:
        movq %rdx, %rax #; nframes => %rax
        shr $2, %rax #; unsigned divide by 4

        cmp $0, %rax
        je .AGA_ENDLOOP

.AGA_SSELOOP:
        movaps (%rdi), %xmm0
        movaps (%rsi), %xmm1
        mulps %xmm1, %xmm0
        movaps %xmm0, (%rdi)

        addq $16, %rdi
        addq $16, %rsi

        decq %rax
        jnz .AGA_SSELOOP

        andq $3, %rdx #; Remaining frames are nframes & 3
        jz .AGA_END

#; Inside this loop, we know there are frames left to process
#; but because either there are < 4 frames left, or the buffers
#; are not aligned, we can't use the parallel SSE ops
.AGA_ENDLOOP:
        movss (%rdi), %xmm0 #; buf => xmm0
        movss (%rsi), %xmm1 #; gain value => xmm1
        mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
        movss %xmm0, (%rdi) #; signal with gain => buf

        addq $4, %rdi
        addq $4, %rsi
        decq %rdx #; nframes--
        jnz .AGA_ENDLOOP

.AGA_END:

        popq %rsi
        popq %rdi
        popq %rdx
        popq %rcx
        popq %rbx

        leave
        ret

#; end proc
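
#; Reference sketch (comments only, not part of the build): the routine above
#; should be equivalent to this scalar C loop; apply_gain_vector_ref is purely
#; illustrative.
#;
#;   void apply_gain_vector_ref (float *buf, const float *gain_vector,
#;                               unsigned int nframes)
#;   {
#;           for (unsigned int i = 0; i < nframes; i++) {
#;                   buf[i] *= gain_vector[i];
#;           }
#;   }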

#; float x86_sse_compute_peak(float *buf, long nframes, float current);

.globl x86_sse_compute_peak
        .def    x86_sse_compute_peak; .scl 2; .type 32; .endef

x86_sse_compute_peak:

#; due to Microsoft calling convention
#; %rcx  float*       buf     32(%rbp)
#; %rdx  unsigned int nframes
#; %xmm2 float        current
#; %xmm1 float        buf[0]

#; due to System V AMD64 (Linux) calling convention
#; %rdi  float*       buf     32(%rbp)
#; %rsi  unsigned int nframes
#; %xmm0 float        current
#; %xmm1 float        buf[0]

        pushq %rbp
        movq %rsp, %rbp

        #; save registers
        pushq %rcx
        pushq %rdi #; must be preserved
        pushq %rsi #; must be preserved

        #; to keep algorithms universal - move input params into Linux specific registers
        movq %rcx, %rdi
        movq %rdx, %rsi
        movss %xmm2, %xmm0

        #; if nframes == 0, go to end
        movq %rsi, %rcx #; nframes
        cmp $0, %rcx
        je .CP_END

        #; create the "abs" mask in %xmm2
        pushq $2147483647
        movss (%rsp), %xmm2
        addq $8, %rsp
        shufps $0x00, %xmm2, %xmm2

        #; Check for alignment

        #;movq 8(%rbp), %rdi #; buf
        movq %rdi, %rdx #; buf => %rdx
        andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
        jz .CP_SSE #; if buffer IS aligned

        #; PRE-LOOP
        #; we iterate 1-3 times, doing normal x87 float comparison
        #; so we reach a 16 byte aligned "buf" (=%rdi) value

.LP_START:

        #; Load next value from the buffer
        movss (%rdi), %xmm1
        andps %xmm2, %xmm1
        maxss %xmm1, %xmm0

        #; increment buffer, decrement counter
        addq $4, %rdi #; buf++;

        decq %rcx  #; nframes--
        jz .CP_END #; if we run out of frames, we go to the end

        addq $4, %rdx #; one non-aligned byte less
        cmp $16, %rdx
        jne .LP_START #; if more non-aligned frames exist, we do a do-over

.CP_SSE:

        #; We have reached the 16 byte aligned "buf" ("rdi") value

        #; Figure out how many loops we should do
        movq %rcx, %rax #; copy remaining nframes to %rax for division
        shr $2, %rax #; unsigned divide by 4
        jz .POST_START

        #; %rax = SSE iterations

        #; current maximum is at %xmm0, but we need to ..
        shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's

        #;prefetcht0 16(%rdi)

.LP_SSE:

        movaps (%rdi), %xmm1
        andps %xmm2, %xmm1
        maxps %xmm1, %xmm0

        addq $16, %rdi

        subq $4, %rcx #; nframes-=4

        decq %rax
        jnz .LP_SSE

        #; Calculate the maximum value contained in the 4 FP's in %xmm0
        movaps %xmm0, %xmm1
        shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
        maxps %xmm1, %xmm0 #; maximums of the two pairs
        movaps %xmm0, %xmm1
        shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
        maxps %xmm1, %xmm0

        #; now every float in %xmm0 is the same value, current maximum value

        #; Next we need to post-process all remaining frames
        #; the remaining frame count is in %rcx

        #; if no remaining frames, jump to the end

        andq $3, %rcx #; nframes % 4
        jz .CP_END

.POST_START:

        movss (%rdi), %xmm1
        andps %xmm2, %xmm1
        maxss %xmm1, %xmm0

        addq $4, %rdi #; buf++;

        decq %rcx #; nframes--;
        jnz .POST_START

.CP_END:

        #; restore registers
        popq %rsi
        popq %rdi
        popq %rcx

        #; return value is in xmm0

        #; return
        leave
        ret

#; end proc
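
#; Reference sketch (comments only, not part of the build): the peak routine
#; above is meant to match this scalar C loop, using fabsf/fmaxf from <math.h>
#; in place of the andps mask and maxss/maxps; compute_peak_ref is purely
#; illustrative.
#;
#;   float compute_peak_ref (const float *buf, long nframes, float current)
#;   {
#;           for (long i = 0; i < nframes; i++) {
#;                   current = fmaxf (current, fabsf (buf[i]));
#;           }
#;           return current;
#;   }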