Yeah, the prefetch instructions are a bit of a mystery, aren't they?
Re: unrolling, I tend to just unroll until every available XMM register is filled.
The best all-round tradeoff I've found so far is to use negative addressing offsets for the output, like this:
// Signal-vector multiply: out[i] = in1[i] * in2[i] for `sampleframes` samples,
// using hand-unrolled SSE2 inline assembly (8 doubles = 64 bytes per iteration).
// NOTE(review): pointers are held in 32-bit registers (esi/edi/ebx), so this
// assumes a 32-bit build — confirm before using in a 64-bit target.
// Requires: sampleframes a multiple of 8, and all three buffers 16-byte
// aligned (movapd/movntpd fault on unaligned addresses).
void fastmul_perform64(t_fastmul *x,
t_object *dsp64,
double **ins,
long numins,
double **outs,
long numouts,
long sampleframes,
long flags,
void *userparam) {
t_double *in1 = ins[0]; // first input signal
t_double *in2 = ins[1]; // second input signal
t_double *out = outs[0]; // output signal
int n = sampleframes; // sample count for this vector
asm {
mov ecx, n // ecx = sample count
mov esi, in1 // esi = input 1 read pointer
mov edi, in2 // edi = input 2 read pointer
mov ebx, out // NOTE(review): ebx is callee-saved in common 32-bit ABIs — verify the compiler saves/restores it around this asm block
shr ecx, 3 // ecx = n / 8 iterations; any remainder (n % 8 samples) is silently not processed
loopStart:
// Load 8 doubles from each input. movapd requires 16-byte-aligned addresses.
movapd xmm0, [esi]
movapd xmm1, [esi + 16]
movapd xmm2, [esi + 32]
movapd xmm3, [esi + 48]
movapd xmm4, [edi]
movapd xmm5, [edi + 16]
movapd xmm6, [edi + 32]
movapd xmm7, [edi + 48]
// Multiplies interleaved with pointer bumps so the adds hide in the mulpd latency.
mulpd xmm0, xmm4
add ebx, 64 // advance output pointer first; stores below use negative offsets
mulpd xmm1, xmm5
add esi, 64
mulpd xmm2, xmm6
add edi, 64
mulpd xmm3, xmm7
// Non-temporal (cache-bypassing) stores of the 8 products.
// NOTE(review): movntpd is weakly ordered — an sfence after the loop is the
// customary fence before other code reads this buffer; confirm none is needed here.
movntpd [ebx - 64], xmm0
movntpd [ebx - 48], xmm1
movntpd [ebx - 32], xmm2
movntpd [ebx - 16], xmm3
sub ecx, 1 // one fewer 8-sample chunk remaining
jnz loopStart
}
}
(This code assumes the signal vector size is at least 8 and a multiple of 8 — the `shr ecx, 3` discards any remainder — and that all three buffers are 16-byte aligned, since movapd/movntpd fault on unaligned addresses.)