;// compilation :
;// "as toto.s -o toto.o
;// "ld toto.o -o toto
;// The _fill function is only there to give predictable results when debugging with gdb in Eclipse,
;// so that the operations and the resulting pixel values can be verified in memory.
// Pixel buffer. It is zero-initialised, so it is placed in .bss where it
// takes no space in the object file or in the executable.
.section .bss
.align 16 ; // movdqa on image requires a 16-byte aligned address
image: .space 256*256*4; // 256*256 pixels * 4 bytes/pixel, byte order in memory : R,G,B,A,R,G,B,A,...
// If image cannot be kept 128-bit aligned, replace every "movdqa" that touches it with "movdqu"
// ("movdqa" = move double quadword aligned, "movdqu" = move double quadword unaligned)

// Initialised parameters read by the code.
.section .data
.align 4 ;
imagelen: .long 256*256*4; // size of image in bytes; keep it a multiple of 16 (4 pixels are handled per pass)
charmaskinlong: .long 0x000000FF; // mask that keeps the low byte of a 32-bit value
// Luminance weights, expressed in 256ths (fixed point with 8 fractional bits)
coefR: .long 77; // 0.30 * 256
coefG: .long 151; // 0.59 * 256
coefB: .long 28; // 0.11 * 256
// 77+151+28 = 256, so a weighted sum of 8-bit components divided by 256 cannot exceed 255
.section .text
.global _start
// ----------------------------------------------------------------------------
// _start : program entry point.
//   1. calls _fill to write a known byte pattern into image
//   2. converts image in place to grey, 4 pixels (16 bytes) per pass:
//        grey = (R*coefR + G*coefG + B*coefB) >> 8
//      R, G and B of each pixel are replaced by grey, A is left unchanged
//   3. leaves through int $0x80 with EAX=1 (exit) and EBX=0 (return code)
//
// Each pixel is read as one little-endian dword 0xAABBGGRR (memory order R,G,B,A).
// Only imagelen/16 whole groups of 4 pixels are processed; trailing bytes are
// left untouched, and nothing is processed when imagelen < 16.
// image must be 16-byte aligned (movdqa is used for the load and the store).
//
// Register roles inside the loop:
//   EAX  = address of the current group of 4 pixels
//   ECX  = number of groups still to process (always > 0 inside the loop)
//   XMM7 = [000000FF x4]      low-byte mask
//   XMM6 = words [cG cR x4]   multipliers for the (G,R) word pairs
//   XMM5 = words [0  cB x4]   multipliers for the (0,B) word pairs
//   XMM4 = [FF000000 x4]      alpha mask
//   XMM0, XMM1, XMM2 = scratch
// ----------------------------------------------------------------------------
_start:
    call _fill; // fill image with a predictable pattern (debug aid)

    // ---- loop-invariant constants -----------------------------------------
    movd charmaskinlong,%xmm7; // XMM7.D=[0 0 0 000000FF]
    pshufd $0,%xmm7,%xmm7; // XMM7.D=[000000FF 000000FF 000000FF 000000FF]

    movd coefR,%xmm6; // XMM6.D=[0 0 0 cR]
    pshufd $0,%xmm6,%xmm6; // XMM6.D=[cR cR cR cR], XMM6.W=[0 cR 0 cR 0 cR 0 cR]
    movd coefG,%xmm5; // XMM5.D=[0 0 0 cG]
    pshufd $0,%xmm5,%xmm5; // XMM5.D=[cG cG cG cG]
    pslld $16,%xmm5; // XMM5.W=[cG 0 cG 0 cG 0 cG 0]
    por %xmm5,%xmm6; // XMM6.W=[cG cR cG cR cG cR cG cR]

    movd coefB,%xmm5; // XMM5.D=[0 0 0 cB]
    pshufd $0,%xmm5,%xmm5; // XMM5.D=[cB cB cB cB], XMM5.W=[0 cB 0 cB 0 cB 0 cB]

    movdqa %xmm7,%xmm4; // XMM4.D=[000000FF x4]
    pslld $24,%xmm4; // XMM4.D=[FF000000 x4] : selects the alpha byte

    // ---- loop set-up -------------------------------------------------------
    movl $image,%eax; // EAX = address of the first pixel
    movl imagelen,%ecx; // ECX = size in bytes
    shr $4,%ecx; // ECX = number of 16-byte groups (unsigned divide by 16)
    jz _end; // no complete group : skip the loop, dec/jnz would wrap around

_loop4:
    movdqa (%eax),%xmm0; // XMM0.D=[AABBGGRR x4] : load 4 pixels

    // R in the low word and G in the high word of each dword
    movdqa %xmm0,%xmm1;
    pand %xmm7,%xmm1; // XMM1.D=[000000RR x4]
    movdqa %xmm0,%xmm2;
    psrld $8,%xmm2; // XMM2.D=[00AABBGG x4]
    pand %xmm7,%xmm2; // XMM2.D=[000000GG x4]
    pslld $16,%xmm2; // XMM2.D=[00GG0000 x4]
    por %xmm2,%xmm1; // XMM1.W=[G R G R G R G R]
    pmaddwd %xmm6,%xmm1; // XMM1.D=[G*cG+R*cR x4]

    // B in the low word, high word zero
    movdqa %xmm0,%xmm2;
    psrld $16,%xmm2; // XMM2.D=[0000AABB x4]
    pand %xmm7,%xmm2; // XMM2.D=[000000BB x4], XMM2.W=[0 B 0 B 0 B 0 B]
    pmaddwd %xmm5,%xmm2; // XMM2.D=[0*0+B*cB x4]

    // full-precision sum, then a single division by 256
    paddd %xmm2,%xmm1; // XMM1.D=[R*cR+G*cG+B*cB x4], at most 255*256 with the default weights
    psrld $8,%xmm1; // XMM1.D=[000000Gr x4], Gr in 0..255 when the weights sum to 256

    // rebuild the pixels : keep A, put Gr in the R, G and B bytes
    pand %xmm4,%xmm0; // XMM0.D=[AA000000 x4]
    por %xmm1,%xmm0; // XMM0.D=[AA0000Gr x4]
    pslld $8,%xmm1; // XMM1.D=[0000Gr00 x4]
    por %xmm1,%xmm0; // XMM0.D=[AA00GrGr x4]
    pslld $8,%xmm1; // XMM1.D=[00Gr0000 x4]
    por %xmm1,%xmm0; // XMM0.D=[AAGrGrGr x4]
    movdqa %xmm0,(%eax); // store the 4 converted pixels

    add $16,%eax; // next group is 4 pixels * 4 bytes = 16 bytes further
    dec %ecx; // one group less to process
    jnz _loop4; // loop until every group is done

_end:
    mov $1,%eax; // 1 => exit
    mov $0,%ebx; // 0 return code
    int $0x80; // leave the program
// ----------------------------------------------------------------------------
// _fill : debug helper that fills the picture with a repeating byte ramp.
//   In    : nothing in registers; reads the symbols image and imagelen
//   Out   : byte i of image = i modulo 256, for 0 <= i < imagelen
//   Clobb : EAX, EBX, ECX, flags
// Does nothing when imagelen is 0.
// ----------------------------------------------------------------------------
_fill:
    movl $image,%eax; // EAX = address of the next byte to write
    xor %ebx,%ebx; // BL = value to write, starts at 0
    movl imagelen,%ecx; // ECX = number of bytes still to write
    test %ecx,%ecx;
    jz 1f; // empty picture : dec/jnz below would wrap around
_loop2:
    movb %bl,(%eax); // store the counter in the current pixel component
    inc %bl; // next value, wraps from 255 back to 0
    inc %eax; // next pixel component
    dec %ecx; // one byte less to write
    jnz _loop2; // continue until every byte is written
1:
    ret; // back to the caller