I realise this is getting rather off-topic, seeing as we don't have a [gen~] SDK, but in case it helps you optimise your patch in the MSP domain, here's an idea of how to implement some bitwise operations on packed doubles using SSE2 inline assembly:
float *in1 = (t_float *)(ins[0]);
float *in2 = (t_float *)(ins[1]);
float *out = (t_float *)(outs[0]);
int n = sampleframes;
asm {
mov esi, in1 // esi = &in1
mov edi, in2 // edi = &in2
mov ebx, out // ebx = &out
mov ecx, n // ecx = n
shr ecx, 3 // ecx >>= 3 (we process 8 doubles per loop)
loopStart:
// fill the first 4 xmm registers with in1[0] to in1[7]
movapd xmm0, [esi]
movapd xmm1, [esi + 16]
movapd xmm2, [esi + 32]
movapd xmm3, [esi + 48]
// fill the last 4 xmm registers with in2[0] to in2[7]
movapd xmm4, [edi]
movapd xmm5, [edi + 16]
movapd xmm6, [edi + 32]
movapd xmm7, [edi + 48]
add esi, 64 // a bit of pointer arithmetic
// There now follow a few little blocks demonstrating
// bitwise operations on packed double-precision values.
// Uncomment ONE block only to perform the desired operation.
///////////////////////
// in1 = in1 AND in2 //
///////////////////////
// andpd xmm0, xmm4
// andpd xmm1, xmm5
// andpd xmm2, xmm6
// andpd xmm3, xmm7
//////////////////////
// in1 = in1 OR in2 //
//////////////////////
// orpd xmm0, xmm4
// orpd xmm1, xmm5
// orpd xmm2, xmm6
// orpd xmm3, xmm7
///////////////////////
// in1 = in1 XOR in2 //
///////////////////////
// xorpd xmm0, xmm4
// xorpd xmm1, xmm5
// xorpd xmm2, xmm6
// xorpd xmm3, xmm7
/////////////////////////////
// in1 = in1 AND (NOT in2) //
/////////////////////////////
// andnpd xmm0, xmm4
// andnpd xmm1, xmm5
// andnpd xmm2, xmm6
// andnpd xmm3, xmm7
add edi, 64 // a bit more pointer arithmetic
// save the results in out[0] to out[7]
movntpd [ebx], xmm0
movntpd [ebx + 16], xmm1
movntpd [ebx + 32], xmm2
movntpd [ebx + 48], xmm3
add ebx, 64 // last bit of pointer arithmetic
sub ecx, 1 // decrement loop counter
jnz loopStart // jump if not zero (loop)
}