// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by michael are under GNU GPL

#include <inttypes.h>
#include "../config.h"
#include "swscale.h"

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP
int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode

/*
NOTES

known BUGS with known cause (no bug reports please!, but patches are welcome :) )
horizontal MMX2 scaler reads 17 samples too much (might cause a sig11)

Supported output formats: BGR15 BGR16 BGR24 BGR32 (15, 24 are untested)
BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
*/

#define ABS(a) ((a) > 0 ? (a) : (-(a)))


#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif


#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
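
/* Illustration (added, not in the original source): MOVNTQ(a,b) expands to
   either "movntq" (MMX2: a non-temporal store that goes around the cache,
   which suits pixel data that is written once and not read back) or to a
   plain "movq".  E.g. MOVNTQ(%%mm0, (%4, %%eax, 4)) becomes
       "movntq %%mm0, (%4, %%eax, 4) \n\t"
   on MMX2 and the movq form otherwise. */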



#ifdef HAVE_MMX
static uint64_t __attribute__((aligned(8))) yCoeff=  0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) ubCoeff= 0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) vrCoeff= 0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) ugCoeff= 0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) vgCoeff= 0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) w400=    0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=     0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=     0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL;
static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL;

static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif
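
/* Sketch (added): read byte-wise, the way paddusb applies them, the dither
   constants form a small 2x2 ordered-dither pattern that is added before
   the channels are truncated to 5 or 6 bits:
       blue/red rows: 4 0 4 0 ...   and   2 6 2 6 ...
       green rows:    2 0 2 0 ...   and   1 3 1 3 ...
   b16Dither/g16Dither are rotated through the two row patterns once per
   slice (see the end of SwScale_YV12slice), so consecutive lines are
   rounded differently. */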


// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
#ifdef HAVE_MMX
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
#else
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];
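
/* A plausible initialization (assumption - the init code is outside this
   section): the table is indexed by value+256 with a 256-entry saturation
   band on each side, so slightly out-of-range YUV->RGB sums clamp to 0/255
   instead of wrapping:

       int i;
       for(i=0; i<768; i++)
           clip_table[i]= i<256 ? 0 : i>511 ? 255 : i-256;
*/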

...  ...  
static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
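
/* Note (added): these buffers receive horizontal-scaler machine code that
   is generated at run time for the current scale factor and entered with
   "call funnyYCode" / "call funnyUVCode".  The generator below patches the
   pshufw immediates and terminates each unrolled fragment with a near
   return, roughly:
       funnyYCode[ofs + imm8OfPShufW2]= a | (b<<2) | (c<<4) | (d<<6);
       funnyYCode[lastOfs]= RET; // 0xC3
   (ofs/lastOfs are illustrative names.) */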

#define FULL_YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xorl %%eax, %%eax \n\t"\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
"psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
"pmulhw yCoeff, %%mm1 \n\t"\
\
\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw ubCoeff, %%mm3 \n\t"\
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"pmulhw ugCoeff, %%mm2 \n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw w400, %%mm0 \n\t" /* (V-128)8*/\
\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw vrCoeff, %%mm0 \n\t"\
"pmulhw vgCoeff, %%mm4 \n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm1, %%mm1 \n\t"


#define YSCALEYUV2RGB \
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movq %%mm6, asm_yalpha1 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"movq %%mm5, asm_uvalpha1 \n\t"\
"xorl %%eax, %%eax \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq asm_uvalpha1, %%mm0 \n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw w400, %%mm3 \n\t" /* (U-128)8*/\
"psubw w400, %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw ugCoeff, %%mm3 \n\t"\
"pmulhw vgCoeff, %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"pmulhw ubCoeff, %%mm2 \n\t"\
"pmulhw vrCoeff, %%mm5 \n\t"\
"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
"psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw yCoeff, %%mm1 \n\t"\
"pmulhw yCoeff, %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"


#define YSCALEYUV2RGB1 \
"xorl %%eax, %%eax \n\t"\
"1: \n\t"\
"movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw w400, %%mm3 \n\t" /* (U-128)8*/\
"psubw w400, %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw ugCoeff, %%mm3 \n\t"\
"pmulhw vgCoeff, %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw ubCoeff, %%mm2 \n\t"\
"pmulhw vrCoeff, %%mm5 \n\t"\
"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
"psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw yCoeff, %%mm1 \n\t"\
"pmulhw yCoeff, %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
"pxor %%mm7, %%mm7 \n\t"


#define WRITEBGR32 \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
MOVNTQ(%%mm0, (%4, %%eax, 4))\
MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"


#define WRITEBGR16 \
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm4, %%mm3 \n\t" /* G */\
"movq %%mm5, %%mm6 \n\t" /* R */\
\
"punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\
"punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\
\
"psrlw $3, %%mm2 \n\t"\
"psllw $3, %%mm3 \n\t"\
"psllw $8, %%mm5 \n\t"\
\
"pand g16Mask, %%mm3 \n\t"\
"pand r16Mask, %%mm5 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm5, %%mm2 \n\t"\
\
"punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\
"punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\
\
"psrlw $3, %%mm1 \n\t"\
"psllw $3, %%mm4 \n\t"\
"psllw $8, %%mm6 \n\t"\
\
"pand g16Mask, %%mm4 \n\t"\
"pand r16Mask, %%mm6 \n\t"\
\
"por %%mm4, %%mm1 \n\t"\
"por %%mm6, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (%4, %%eax, 2))\
MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"


#define WRITEBGR15 \
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm4, %%mm3 \n\t" /* G */\
"movq %%mm5, %%mm6 \n\t" /* R */\
\
"punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\
"punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\
\
"psrlw $3, %%mm2 \n\t"\
"psllw $2, %%mm3 \n\t"\
"psllw $7, %%mm5 \n\t"\
\
"pand g15Mask, %%mm3 \n\t"\
"pand r15Mask, %%mm5 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm5, %%mm2 \n\t"\
\
"punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\
"punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\
\
"psrlw $3, %%mm1 \n\t"\
"psllw $2, %%mm4 \n\t"\
"psllw $7, %%mm6 \n\t"\
\
"pand g15Mask, %%mm4 \n\t"\
"pand r15Mask, %%mm6 \n\t"\
\
"por %%mm4, %%mm1 \n\t"\
"por %%mm6, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (%4, %%eax, 2))\
MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"


// FIXME find a faster way to shuffle it to BGR24
#define WRITEBGR24 \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklbw %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhbw %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklbw %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhbw %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
"pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
"pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
"pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
"pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
"pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
"pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
"pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
"leal (%%eax, %%eax, 2), %%ebx \n\t"\
MOVNTQ(%%mm0, (%4, %%ebx))\
MOVNTQ(%%mm2, 8(%4, %%ebx))\
MOVNTQ(%%mm3, 16(%4, %%ebx))\
\
"addl $8, %%eax \n\t"\
"cmpl %5, %%eax \n\t"\
" jb 1b \n\t"



/**
 * vertical scale YV12 to RGB
 */
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
int yalpha1=yalpha^4095;
int uvalpha1=uvalpha^4095;
int i;

if(fullUVIpol)
{
#ifdef HAVE_MMX
if(dstbpp == 32)
{
asm volatile(

FULL_YSCALEYUV2RGB
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

MOVNTQ(%%mm3, (%4, %%eax, 4))
MOVNTQ(%%mm1, 8(%4, %%eax, 4))

"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"

:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
else if(dstbpp==24)
{
asm volatile(

FULL_YSCALEYUV2RGB

// lsb ... msb
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
"psrlq $8, %%mm3 \n\t" // GR0BGR00
"pand bm00000111, %%mm2 \n\t" // BGR00000
"pand bm11111000, %%mm3 \n\t" // 000BGR00
"por %%mm2, %%mm3 \n\t" // BGRBGR00
"movq %%mm1, %%mm2 \n\t"
"psllq $48, %%mm1 \n\t" // 000000BG
"por %%mm1, %%mm3 \n\t" // BGRBGRBG

"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
"psrld $16, %%mm2 \n\t" // R000R000
"psrlq $24, %%mm1 \n\t" // 0BGR0000
"por %%mm2, %%mm1 \n\t" // RBGRR000

"movl %4, %%ebx \n\t"
"addl %%eax, %%ebx \n\t"

#ifdef HAVE_MMX2
//FIXME Alignment
"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
"movd %%mm3, (%%ebx, %%eax, 2) \n\t"
"psrlq $32, %%mm3 \n\t"
"movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
"movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"

:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
}
else if(dstbpp==15)
{
asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb b16Dither, %%mm1 \n\t"
"paddusb b16Dither, %%mm0 \n\t"
"paddusb b16Dither, %%mm3 \n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

"psrlw $3, %%mm3 \n\t"
"psllw $2, %%mm1 \n\t"
"psllw $7, %%mm0 \n\t"
"pand g15Mask, %%mm1 \n\t"
"pand r15Mask, %%mm0 \n\t"

"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"

MOVNTQ(%%mm0, (%4, %%eax, 2))

"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"

:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
else if(dstbpp==16)
{
asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
"paddusb g16Dither, %%mm1 \n\t"
"paddusb b16Dither, %%mm0 \n\t"
"paddusb b16Dither, %%mm3 \n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

"psrlw $3, %%mm3 \n\t"
"psllw $3, %%mm1 \n\t"
"psllw $8, %%mm0 \n\t"
"pand g16Mask, %%mm1 \n\t"
"pand r16Mask, %%mm0 \n\t"

"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"

MOVNTQ(%%mm0, (%4, %%eax, 2))

"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"

:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
#else
if(dstbpp==32 || dstbpp==24)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
dest+=dstbpp>>3;
}
}
else if(dstbpp==16)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

((uint16_t*)dest)[0] =
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
dest+=2;
}
}
else if(dstbpp==15)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

((uint16_t*)dest)[0] =
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
dest+=2;
}
}
#endif
}//FULL_UV_IPOL
else
{
#ifdef HAVE_MMX
if(dstbpp == 32)
{
asm volatile(
YSCALEYUV2RGB
WRITEBGR32

:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
else if(dstbpp==24)
{
asm volatile(
YSCALEYUV2RGB
WRITEBGR24

:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
}
else if(dstbpp==15)
{
asm volatile(
YSCALEYUV2RGB
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb b16Dither, %%mm2 \n\t"
"paddusb b16Dither, %%mm4 \n\t"
"paddusb b16Dither, %%mm5 \n\t"
#endif

WRITEBGR15

:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
else if(dstbpp==16)
{
asm volatile(
YSCALEYUV2RGB
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb g16Dither, %%mm2 \n\t"
"paddusb b16Dither, %%mm4 \n\t"
"paddusb b16Dither, %%mm5 \n\t"
#endif

WRITEBGR16

:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
#else
//FIXME unroll C loop and don't recalculate UV
if(dstbpp==32 || dstbpp==24)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
dest+=dstbpp>>3;
}
}
else if(dstbpp==16)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

((uint16_t*)dest)[0] =
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
dest+=2;
}
}
else if(dstbpp==15)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

((uint16_t*)dest)[0] =
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
dest+=2;
}
}
#endif
} //!FULL_UV_IPOL
}
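
/* Worked example (added): the C paths blend in fixed point.  yalpha is
   0..4095 and yalpha1 = yalpha^4095 == 4095-yalpha, and the line buffers
   hold samples pre-scaled by ~128 (see the horizontal scaler below), so
       Y = (buf0[i]*yalpha1 + buf1[i]*yalpha) >> 19
   is buf0*(1-a) + buf1*a reduced back to the 8-bit range expected by the
   yuvtab_*/clip_table lookups; U and V are formed the same way with
   uvalpha. */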


/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
int yalpha1=yalpha^4095;
int uvalpha1=uvalpha^4095;
int i;
if(fullUVIpol || allwaysIpol)
{
yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
return;
}
#ifdef HAVE_MMX
if(dstbpp == 32)
{
asm volatile(
YSCALEYUV2RGB1
WRITEBGR32
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
else if(dstbpp==24)
{
asm volatile(
YSCALEYUV2RGB1
WRITEBGR24
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax", "%ebx"
);
}
else if(dstbpp==15)
{
asm volatile(
YSCALEYUV2RGB1
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb b16Dither, %%mm2 \n\t"
"paddusb b16Dither, %%mm4 \n\t"
"paddusb b16Dither, %%mm5 \n\t"
#endif
WRITEBGR15
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
else if(dstbpp==16)
{
asm volatile(
YSCALEYUV2RGB1
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb g16Dither, %%mm2 \n\t"
"paddusb b16Dither, %%mm4 \n\t"
"paddusb b16Dither, %%mm5 \n\t"
#endif

WRITEBGR16
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
#else
//FIXME unroll C loop and don't recalculate UV
if(dstbpp==32 || dstbpp==24)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[buf0[i]>>7];
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
dest+=dstbpp>>3;
}
}
else if(dstbpp==16)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[buf0[i]>>7];
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

((uint16_t*)dest)[0] =
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
dest+=2;
}
}
else if(dstbpp==15)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[buf0[i]>>7];
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

((uint16_t*)dest)[0] =
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
dest+=2;
}
}
#endif
}


// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0

...  ...  
// used to detect a horizontal size change
static int old_dstw= -1;
static int old_s_xinc= -1;
#endif

int canMMX2BeUsed=0;
int srcWidth= (dstw*s_xinc + 0x8000)>>16;
int dstUVw= fullUVIpol ? dstw : dstw/2;

#ifdef HAVE_MMX2
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
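// Added note: s_xinc is the 16.16 source step per destination pixel (see
// srcWidth above), so s_xinc <= 0x10000 limits the MMX2 path to 1:1 or
// upscaling, and dstw/srcWidth must be multiples of 32/16; e.g. 320->640
// (s_xinc ~ 0x8000) qualifies, 640->320 (s_xinc ~ 0x20000) does not.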

...  ...  
// first and last pixel
if(canMMX2BeUsed) s_xinc+= 20;
else s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;

if(fullUVIpol) s_xinc2= s_xinc>>1;
else s_xinc2= s_xinc;
// force calculation of the horizontal interpolation of the first line
s_last_ypos=-99;
s_last_y1pos=-99;

...  ...  
funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
	a | (b<<2) | (c<<4) | (d<<6);

// if we don't need to read 8 bytes then don't :), reduces the chance of
// crossing a cache line
if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;

funnyYCode[fragmentLength*(i+4)/4]= RET;
}
xpos+=s_xinc;
}

xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples

for(i=0; i<dstUVw/8; i++)
{
int xx=xpos>>16;

...  ...  
funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
	a | (b<<2) | (c<<4) | (d<<6);

// if we don't need to read 8 bytes then don't :), reduces the chance of
// crossing a cache line
if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;

funnyUVCode[fragmentLength*(i+4)/4]= RET;
}
xpos+=s_xinc2;

...  ...  
// points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
int srcuvpos= s_srcypos + s_yinc/2 - 0x8000;
int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
int yalpha=((s_srcypos-1)&0xFFFF)>>4;
int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
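// Added example: s_srcypos is a 16.16 vertical source position, so
// (s_srcypos-1)&0xFFFF is its fractional part and >>4 turns it into the
// 12-bit blend factor (0..4095) used by yuv2rgbX/yuv2rgb1; the chroma
// position carries a 17-bit fraction (half vertical resolution), hence
// the &0x1FFFF and >>5.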

uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice
uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice
uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice

...  ...  
320 
1130 
"xorl %%ecx, %%ecx \n\t"

321 
1131 
"xorl %%ebx, %%ebx \n\t"

322 
1132 
"movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF

323 

// "int $3\n\t"

324 

"call funnyYCode \n\t"

325 

"movq temp0, %%mm2 \n\t"

326 

"xorl %%ecx, %%ecx \n\t"

327 

"call funnyYCode \n\t"

328 

"movq temp0, %%mm2 \n\t"

329 

"xorl %%ecx, %%ecx \n\t"

330 

"call funnyYCode \n\t"

331 

"movq temp0, %%mm2 \n\t"

332 

"xorl %%ecx, %%ecx \n\t"

333 

"call funnyYCode \n\t"

334 

"movq temp0, %%mm2 \n\t"

335 

"xorl %%ecx, %%ecx \n\t"

336 

"call funnyYCode \n\t"

337 

"movq temp0, %%mm2 \n\t"

338 

"xorl %%ecx, %%ecx \n\t"

339 

"call funnyYCode \n\t"

340 

"movq temp0, %%mm2 \n\t"


1133 
#ifdef HAVE_MMX2


1134 
#define FUNNY_Y_CODE \


1135 
"prefetchnta 1024(%%esi) \n\t"\


1136 
"prefetchnta 1056(%%esi) \n\t"\


1137 
"prefetchnta 1088(%%esi) \n\t"\


1138 
"call funnyYCode \n\t"\


1139 
"movq temp0, %%mm2 \n\t"\

341 
1140 
"xorl %%ecx, %%ecx \n\t"

342 

"call funnyYCode \n\t"

343 

"movq temp0, %%mm2 \n\t"


1141 
#else


1142 
#define FUNNY_Y_CODE \


1143 
"call funnyYCode \n\t"\


1144 
"movq temp0, %%mm2 \n\t"\

344 
1145 
"xorl %%ecx, %%ecx \n\t"

345 

"call funnyYCode \n\t"


1146 
#endif


1147 
FUNNY_Y_CODE


1148 
FUNNY_Y_CODE


1149 
FUNNY_Y_CODE


1150 
FUNNY_Y_CODE


1151 
FUNNY_Y_CODE


1152 
FUNNY_Y_CODE


1153 
FUNNY_Y_CODE


1154 
FUNNY_Y_CODE


1155 

346 
1156 
:: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16),

347 
1157 
"m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF)

348 
1158 
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"

...  ...  
else
{
#endif
//NO MMX just normal asm ...
asm volatile(
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx

...  ...  
"xorl %%ebx, %%ebx \n\t"
"movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF

#ifdef HAVE_MMX2
#define FUNNYUVCODE \
"prefetchnta 1024(%%esi) \n\t"\
"prefetchnta 1056(%%esi) \n\t"\
"prefetchnta 1088(%%esi) \n\t"\
"call funnyUVCode \n\t"\
"movq temp0, %%mm2 \n\t"\
"xorl %%ecx, %%ecx \n\t"
#else
#define FUNNYUVCODE \
"call funnyUVCode \n\t"\
"movq temp0, %%mm2 \n\t"\
"xorl %%ecx, %%ecx \n\t"
#endif

FUNNYUVCODE
FUNNYUVCODE

...  ...  
FUNNYUVCODE


"xorl %%eax, %%eax \n\t" // i
"movl %6, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1

...  ...  
FUNNYUVCODE
FUNNYUVCODE

:: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" ((s_xinc2*4)>>16),
"m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
for(i=dstUVw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--)
{
uvbuf1[i] = src1[srcWidth/2-1]*128;
uvbuf1[i+2048] = src2[srcWidth/2-1]*128;

...  ...  
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"

:: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
"r" (src2)
: "%eax", "%ebx", "%ecx", "%edi", "%esi"
);

...  ...  
} //if MMX2 can't be used
#endif
#else
for(i=0;i<dstUVw;i++){
register unsigned int xx=xpos>>16;
register unsigned int xalpha=(xpos&0xFFFF)>>9;
uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);

...  ...  
}
}

if(ABS(s_yinc - 0x10000) < 10)
	yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
else
	yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);

#ifdef HAVE_MMX
b16Dither= b16Dither1;
b16Dither1= b16Dither2;
b16Dither2= b16Dither;

g16Dither= g16Dither1;
g16Dither1= g16Dither2;
g16Dither2= g16Dither;
#endif

}

#ifdef HAVE_3DNOW
