c++ - Bitonic sort for key/value array -
i'm trying modify the intel's bitonic sorting algorithm sorts array of cl_int
s, sort array of cl_int2
s (based on key – i.e. cl_int2.x
).
the intel's example consists of simple host code , 1 opencl kernel called multiple times during 1 sorting operation (multipass). kernel loads 4 array items @ once cl_int4
, operates on them.
i didn't modify host code algorithm, device code. list of changes in kernel function:
- modify first kernel's parameter type
int4*
int8*
(to load 4 key-value pairs) - use
.even
components ofthearray
's elements compare values (<
) - create "
pseudomask
" (int4
) , based on that, createmask
pseudomask.xxyyzzww
(to capture values)
although output of modified kernel sorted cl_int2
array first component (cl_int2.x
), values (cl_int2.y
) incorrect – value of 1 item repeated next 4 or 8 items , new value used , repeated...
i'm sure there's trivial mistake, i'm unable find it.
diff of original intel code , modified version.
edit: cl_int2
array sorted flawlessly when each key (cl_int2.x
) unique.
example input: http://pastebin.com/92qb1cst
example output: http://pastebin.com/dsu97npn
(properly sorted array: http://pastebin.com/nb56buqk)
the modified kernel code (commented):
// copyright (c) 2009-2011 intel corporation // https://software.intel.com/en-us/articles/bitonic-sorting // modified sort int2 key-value array __kernel void bitonicsort(__global int8* thearray, const uint stage, const uint passofstage, const uint dir) { size_t = get_global_id(0); int8 srcleft, srcright, mask; int4 pseudomask; int4 imask10 = (int4)(0, 0, -1, -1); int4 imask11 = (int4)(0, -1, 0, -1); if(stage > 0) { if(passofstage > 0) // upper level pass, exchange between 2 fours, { size_t r = 1 << (passofstage - 1); size_t lmask = r - 1; size_t left = ((i>>(passofstage-1)) << passofstage) + (i & lmask); size_t right = left + r; srcleft = thearray[left]; srcright = thearray[right]; pseudomask = srcleft.even < srcright.even; mask = pseudomask.xxyyzzww; int8 imin = (srcleft & mask) | (srcright & ~mask); int8 imax = (srcleft & ~mask) | (srcright & mask); if( ((i>>(stage-1)) & 1) ^ dir ) { thearray[left] = imin; thearray[right] = imax; } else { thearray[right] = imin; thearray[left] = imax; } } else // last pass, sort inside 1 4 { srcleft = thearray[i]; srcright = srcleft.s45670123; pseudomask = (srcleft.even < srcright.even) ^ imask10; mask = pseudomask.xxyyzzww; if(((i >> stage) & 1) ^ dir) { srcleft = (srcleft & mask) | (srcright & ~mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxyyzzww; thearray[i] = (srcleft & mask) | (srcright & ~mask); } else { srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxyyzzww; thearray[i] = (srcleft & ~mask) | (srcright & mask); } } } else // first stage, sort inside 1 4 { /* * convert code int2 sorter, this: * 1. instead of loading int4, load int8 (key,value, key,value, ...) * 2. when there vector swizzling, replace component index 2 consecutive indices: * srcleft.yxwz -> srcleft.s23016745 * use rewrite rule: * x y z w * 01 23 45 67 * 3. replace comparison operands keys swizzled: * mask = srcleft < srcright; -> pseudomask = srcleft.even < srcright.even; mask = pseudomask.xxyyzzww; */ // make bitonic sequence out of 4. int4 imask0 = (int4)(0, -1, -1, 0); // -1 in comparison = true (all bits set - two's complement) srcleft = thearray[i]; srcright = srcleft.s23016745; /* * xor mask flips bits, in `mask` following * results (remember srcright srcleft swapped component pairs): * * [ left.x<left.y, left.x<left.y, left.w<left.z, left.w<left.z ] * or: [ left.x<left.y, left.x<left.y, left.z>left.w, left.z>left.w ] */ pseudomask = (srcleft.even < srcright.even) ^ imask0; mask = pseudomask.xxyyzzww; if( dir ) srcleft = (srcleft & mask) | (srcright & ~mask); // make sure numbers sorted this: else srcleft = (srcleft & ~mask) | (srcright & mask); /* * pairs of numbers in `srcleft` sorted according specified `dir`ection. * if dir == true, * components `x` , `y` swapped `x` < `y`. `z` , `w` swapped `z` > `w`. resembles up-hill: /\ * else * components `x` , `y` swapped `x` > `y`. `z` , `w` swapped `z` < `w`. resembles down-hill: \/ * * swapping achieved creating `srcleft`, in normal order, , `srcright`, has component pairs switched (xyzw -> yxwz). * `mask` created. mask bits redundant because applies vector component pairs (so in order implement key-value sorting, * have increase length of masks!). * * non-ordered component pairs in `srcleft` masked out `mask` while inverted `mask` applied (pair-wise switched) `srcright`. * * (the previous) first flipping makes 4-bitonic sequence. */ /* * second step sorts bitonic sequence */ srcright = srcleft.s45670123; // inverts bitonic sequence // [ left.a<left.c, left.b<left.d, left.a<left.c, left.b<left.d ] pseudomask = (srcleft.even < srcright.even) ^ imask10; // imask10 = (noflip, noflip, flip, flip) mask = pseudomask.xxyyzzww; // or odd (the output of thread sorted monotonic sequence. monotonicity changes , preparing bitonic sequence next pass.). if((i & 1) ^ dir) { // sorts bitonic sequence, hence splitting srcleft = (srcleft & mask) | (srcright & ~mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxyyzzww; thearray[i] = (srcleft & mask) | (srcright & ~mask); } else { srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxyyzzww; thearray[i] = (srcleft & ~mask) | (srcright & mask); } } }
the host-side code:
void executesortkernel(cl_kernel kernel, cl_command_queue queue, cl_mem cl_input_buffer, cl_int arraysize, cl_uint sortascending) { cl_int numstages = 0; cl_int stage; cl_int passofstage; (cl_int temp = arraysize; temp > 2; temp >>= 1) numstages++; clsetkernelarg(kernel, 0, sizeof(cl_mem), (void *) &cl_input_buffer); clsetkernelarg(kernel, 3, sizeof(cl_uint), (void *) &sortascending); (stage = 0; stage < numstages; stage++) { clsetkernelarg(kernel, 1, sizeof(cl_uint), (void *) &stage); (passofstage = stage; passofstage >= 0; passofstage--) { clsetkernelarg(kernel, 2, sizeof(cl_uint), (void *) &passofstage); // set work-item dimensions size_t gsz = arraysize / (2*4); size_t global_work_size[1] = { passofstage ? gsz : gsz << 1 }; //number of quad items in input array // execute kernel clenqueuendrangekernel(queue, kernel, 1, null, global_work_size, null, 0, null, null); } } }
i've resolved problem!
the tricky part in way original intel code handled equal values of adjacent pairs inside loaded 4-tuple — it didn't explicitly handle it!
the bugs present in first stage
and in last passofstage
(i.e. passofstage = 0
) of every other stage
s. these parts of code interchanging individual 2-tuples inside 1 4-tuple (represented cl_int8
array thearray
).
let's consider excerpt example (it doesn't function equal adjacent 2-tuples in 4-tuple):
imask0 = (int4)(0, -1, -1, 0); srcleft = thearray[i]; // int8 srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask0; mask = pseudomask.xxyyzzww; result = (srcleft & mask) | (srcright & ~mask);
imagine happen when we'd use (unfixed) code , srcleft.even = (int4)(7,7, 5,5)
. operation srcleft.even < srcright.even
result yield (int4)(0,0,0,0)
, we'd mask result imask0
, we'd … pseudomask = (int4)(0,-1,-1,0)
– i.e. imask itself. is, however, wrong.
the pseudomask
's value required form pattern: (int4)(a,a, b,b)
(where a
, b
can either 0
or -1
). means sufficient do following comparison form correct mask
: quasimask = srcleft.s07 < srcright.s07
. correct mask created mask = quasimask.xxxxyyyy
. first 2 x
es mask first key-value pair in first 2-tuple of 4-tuple (4-tuple = 1 element in thearray
). , since want bitmask corresponding 2-tuples (which specified imask0
0
–-1
pairs), add xx
. bitmask analogously second 2-tuple in 4-tuple, leaves yyyy
.
visual example bitshifting imask11
srcleft: x y z w < < < < srcright [relative srcleft]: y x w z ^ imask0: 0 -1 0 1 ------------------------------------------ (srcleft<srcright)^imask0: x x z z
the fixed, functioning version (i've commented fixed parts):
__kernel void bitonicsort(__global int8* thearray, const uint stage, const uint passofstage, const uint dir) { size_t = get_global_id(0); int8 srcleft, srcright, mask; int4 pseudomask; int4 imask10 = (int4)(0, 0, -1, -1); int4 imask11 = (int4)(0, -1, 0, -1); if(stage > 0) { if(passofstage > 0) // upper level pass, exchange between 2 fours { size_t r = 1 << (passofstage - 1); size_t lmask = r - 1; size_t left = ((i>>(passofstage-1)) << passofstage) + (i & lmask); size_t right = left + r; srcleft = thearray[left]; srcright = thearray[right]; pseudomask = srcleft.even < srcright.even; mask = pseudomask.xxyyzzww; // here interchange individual components, no mask applied , hence no 2 pairs must contain same bit-pattern int8 imin = (srcleft & mask) | (srcright & ~mask); int8 imax = (srcleft & ~mask) | (srcright & mask); if( ((i>>(stage-1)) & 1) ^ dir ) { thearray[left] = imin; thearray[right] = imax; } else { thearray[right] = imin; thearray[left] = imax; } } else // last pass, sort inside 1 4 { srcleft = thearray[i]; srcright = srcleft.s45670123; pseudomask = (srcleft.even < srcright.even) ^ imask10; mask = pseudomask.xxyyxxyy; if(((i >> stage) & 1) ^ dir) { srcleft = (srcleft & mask) | (srcright & ~mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxxxzzzz; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) thearray[i] = (srcleft & mask) | (srcright & ~mask); } else { srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxxxzzzz; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) thearray[i] = (srcleft & ~mask) | (srcright & mask); } } } else // first stage, sort inside 1 4 { int4 imask0 = (int4)(0, -1, -1, 0); srcleft = thearray[i]; srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask0; mask = pseudomask.xxxxwwww; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) if( dir ) srcleft = (srcleft & mask) | (srcright & ~mask); else srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s45670123; pseudomask = (srcleft.even < srcright.even) ^ imask10; mask = pseudomask.xxyyxxyy; // 0th , 2nd elements must contain exact same value (as 1st , 3rd) if((i & 1) ^ dir) { srcleft = (srcleft & mask) | (srcright & ~mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxxxzzzz; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) thearray[i] = (srcleft & mask) | (srcright & ~mask); } else { srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxxxzzzz; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) thearray[i] = (srcleft & ~mask) | (srcright & mask); } } }
Comments
Post a Comment