c++ - Bitonic sort for key/value array -
i'm trying modify the intel's bitonic sorting algorithm sorts array of cl_ints, sort array of cl_int2s (based on key – i.e. cl_int2.x).
the intel's example consists of simple host code , 1 opencl kernel called multiple times during 1 sorting operation (multipass). kernel loads 4 array items @ once cl_int4 , operates on them.
i didn't modify host code algorithm, device code. list of changes in kernel function:
- modify first kernel's parameter type
int4*int8*(to load 4 key-value pairs) - use
.evencomponents ofthearray's elements compare values (<) - create "
pseudomask" (int4) , based on that, createmaskpseudomask.xxyyzzww(to capture values)
although output of modified kernel sorted cl_int2 array first component (cl_int2.x), values (cl_int2.y) incorrect – value of 1 item repeated next 4 or 8 items , new value used , repeated...
i'm sure there's trivial mistake, i'm unable find it.
diff of original intel code , modified version.
edit: cl_int2 array sorted flawlessly when each key (cl_int2.x) unique.
example input: http://pastebin.com/92qb1cst
example output: http://pastebin.com/dsu97npn
(properly sorted array: http://pastebin.com/nb56buqk)
the modified kernel code (commented):
// copyright (c) 2009-2011 intel corporation // https://software.intel.com/en-us/articles/bitonic-sorting // modified sort int2 key-value array __kernel void bitonicsort(__global int8* thearray, const uint stage, const uint passofstage, const uint dir) { size_t = get_global_id(0); int8 srcleft, srcright, mask; int4 pseudomask; int4 imask10 = (int4)(0, 0, -1, -1); int4 imask11 = (int4)(0, -1, 0, -1); if(stage > 0) { if(passofstage > 0) // upper level pass, exchange between 2 fours, { size_t r = 1 << (passofstage - 1); size_t lmask = r - 1; size_t left = ((i>>(passofstage-1)) << passofstage) + (i & lmask); size_t right = left + r; srcleft = thearray[left]; srcright = thearray[right]; pseudomask = srcleft.even < srcright.even; mask = pseudomask.xxyyzzww; int8 imin = (srcleft & mask) | (srcright & ~mask); int8 imax = (srcleft & ~mask) | (srcright & mask); if( ((i>>(stage-1)) & 1) ^ dir ) { thearray[left] = imin; thearray[right] = imax; } else { thearray[right] = imin; thearray[left] = imax; } } else // last pass, sort inside 1 4 { srcleft = thearray[i]; srcright = srcleft.s45670123; pseudomask = (srcleft.even < srcright.even) ^ imask10; mask = pseudomask.xxyyzzww; if(((i >> stage) & 1) ^ dir) { srcleft = (srcleft & mask) | (srcright & ~mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxyyzzww; thearray[i] = (srcleft & mask) | (srcright & ~mask); } else { srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxyyzzww; thearray[i] = (srcleft & ~mask) | (srcright & mask); } } } else // first stage, sort inside 1 4 { /* * convert code int2 sorter, this: * 1. instead of loading int4, load int8 (key,value, key,value, ...) * 2. when there vector swizzling, replace component index 2 consecutive indices: * srcleft.yxwz -> srcleft.s23016745 * use rewrite rule: * x y z w * 01 23 45 67 * 3. replace comparison operands keys swizzled: * mask = srcleft < srcright; -> pseudomask = srcleft.even < srcright.even; mask = pseudomask.xxyyzzww; */ // make bitonic sequence out of 4. int4 imask0 = (int4)(0, -1, -1, 0); // -1 in comparison = true (all bits set - two's complement) srcleft = thearray[i]; srcright = srcleft.s23016745; /* * xor mask flips bits, in `mask` following * results (remember srcright srcleft swapped component pairs): * * [ left.x<left.y, left.x<left.y, left.w<left.z, left.w<left.z ] * or: [ left.x<left.y, left.x<left.y, left.z>left.w, left.z>left.w ] */ pseudomask = (srcleft.even < srcright.even) ^ imask0; mask = pseudomask.xxyyzzww; if( dir ) srcleft = (srcleft & mask) | (srcright & ~mask); // make sure numbers sorted this: else srcleft = (srcleft & ~mask) | (srcright & mask); /* * pairs of numbers in `srcleft` sorted according specified `dir`ection. * if dir == true, * components `x` , `y` swapped `x` < `y`. `z` , `w` swapped `z` > `w`. resembles up-hill: /\ * else * components `x` , `y` swapped `x` > `y`. `z` , `w` swapped `z` < `w`. resembles down-hill: \/ * * swapping achieved creating `srcleft`, in normal order, , `srcright`, has component pairs switched (xyzw -> yxwz). * `mask` created. mask bits redundant because applies vector component pairs (so in order implement key-value sorting, * have increase length of masks!). * * non-ordered component pairs in `srcleft` masked out `mask` while inverted `mask` applied (pair-wise switched) `srcright`. * * (the previous) first flipping makes 4-bitonic sequence. */ /* * second step sorts bitonic sequence */ srcright = srcleft.s45670123; // inverts bitonic sequence // [ left.a<left.c, left.b<left.d, left.a<left.c, left.b<left.d ] pseudomask = (srcleft.even < srcright.even) ^ imask10; // imask10 = (noflip, noflip, flip, flip) mask = pseudomask.xxyyzzww; // or odd (the output of thread sorted monotonic sequence. monotonicity changes , preparing bitonic sequence next pass.). if((i & 1) ^ dir) { // sorts bitonic sequence, hence splitting srcleft = (srcleft & mask) | (srcright & ~mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxyyzzww; thearray[i] = (srcleft & mask) | (srcright & ~mask); } else { srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxyyzzww; thearray[i] = (srcleft & ~mask) | (srcright & mask); } } } the host-side code:
void executesortkernel(cl_kernel kernel, cl_command_queue queue, cl_mem cl_input_buffer, cl_int arraysize, cl_uint sortascending) { cl_int numstages = 0; cl_int stage; cl_int passofstage; (cl_int temp = arraysize; temp > 2; temp >>= 1) numstages++; clsetkernelarg(kernel, 0, sizeof(cl_mem), (void *) &cl_input_buffer); clsetkernelarg(kernel, 3, sizeof(cl_uint), (void *) &sortascending); (stage = 0; stage < numstages; stage++) { clsetkernelarg(kernel, 1, sizeof(cl_uint), (void *) &stage); (passofstage = stage; passofstage >= 0; passofstage--) { clsetkernelarg(kernel, 2, sizeof(cl_uint), (void *) &passofstage); // set work-item dimensions size_t gsz = arraysize / (2*4); size_t global_work_size[1] = { passofstage ? gsz : gsz << 1 }; //number of quad items in input array // execute kernel clenqueuendrangekernel(queue, kernel, 1, null, global_work_size, null, 0, null, null); } } }
i've resolved problem!
the tricky part in way original intel code handled equal values of adjacent pairs inside loaded 4-tuple — it didn't explicitly handle it!
the bugs present in first stage and in last passofstage (i.e. passofstage = 0) of every other stages. these parts of code interchanging individual 2-tuples inside 1 4-tuple (represented cl_int8 array thearray).
let's consider excerpt example (it doesn't function equal adjacent 2-tuples in 4-tuple):
imask0 = (int4)(0, -1, -1, 0); srcleft = thearray[i]; // int8 srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask0; mask = pseudomask.xxyyzzww; result = (srcleft & mask) | (srcright & ~mask); imagine happen when we'd use (unfixed) code , srcleft.even = (int4)(7,7, 5,5). operation srcleft.even < srcright.even result yield (int4)(0,0,0,0), we'd mask result imask0 , we'd … pseudomask = (int4)(0,-1,-1,0) – i.e. imask itself. is, however, wrong.
the pseudomask's value required form pattern: (int4)(a,a, b,b) (where a , b can either 0 or -1). means sufficient do following comparison form correct mask: quasimask = srcleft.s07 < srcright.s07. correct mask created mask = quasimask.xxxxyyyy. first 2 xes mask first key-value pair in first 2-tuple of 4-tuple (4-tuple = 1 element in thearray). , since want bitmask corresponding 2-tuples (which specified imask0 0–-1 pairs), add xx. bitmask analogously second 2-tuple in 4-tuple, leaves yyyy.
visual example bitshifting imask11
srcleft: x y z w < < < < srcright [relative srcleft]: y x w z ^ imask0: 0 -1 0 1 ------------------------------------------ (srcleft<srcright)^imask0: x x z z the fixed, functioning version (i've commented fixed parts):
__kernel void bitonicsort(__global int8* thearray, const uint stage, const uint passofstage, const uint dir) { size_t = get_global_id(0); int8 srcleft, srcright, mask; int4 pseudomask; int4 imask10 = (int4)(0, 0, -1, -1); int4 imask11 = (int4)(0, -1, 0, -1); if(stage > 0) { if(passofstage > 0) // upper level pass, exchange between 2 fours { size_t r = 1 << (passofstage - 1); size_t lmask = r - 1; size_t left = ((i>>(passofstage-1)) << passofstage) + (i & lmask); size_t right = left + r; srcleft = thearray[left]; srcright = thearray[right]; pseudomask = srcleft.even < srcright.even; mask = pseudomask.xxyyzzww; // here interchange individual components, no mask applied , hence no 2 pairs must contain same bit-pattern int8 imin = (srcleft & mask) | (srcright & ~mask); int8 imax = (srcleft & ~mask) | (srcright & mask); if( ((i>>(stage-1)) & 1) ^ dir ) { thearray[left] = imin; thearray[right] = imax; } else { thearray[right] = imin; thearray[left] = imax; } } else // last pass, sort inside 1 4 { srcleft = thearray[i]; srcright = srcleft.s45670123; pseudomask = (srcleft.even < srcright.even) ^ imask10; mask = pseudomask.xxyyxxyy; if(((i >> stage) & 1) ^ dir) { srcleft = (srcleft & mask) | (srcright & ~mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxxxzzzz; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) thearray[i] = (srcleft & mask) | (srcright & ~mask); } else { srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxxxzzzz; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) thearray[i] = (srcleft & ~mask) | (srcright & mask); } } } else // first stage, sort inside 1 4 { int4 imask0 = (int4)(0, -1, -1, 0); srcleft = thearray[i]; srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask0; mask = pseudomask.xxxxwwww; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) if( dir ) srcleft = (srcleft & mask) | (srcright & ~mask); else srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s45670123; pseudomask = (srcleft.even < srcright.even) ^ imask10; mask = pseudomask.xxyyxxyy; // 0th , 2nd elements must contain exact same value (as 1st , 3rd) if((i & 1) ^ dir) { srcleft = (srcleft & mask) | (srcright & ~mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxxxzzzz; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) thearray[i] = (srcleft & mask) | (srcright & ~mask); } else { srcleft = (srcleft & ~mask) | (srcright & mask); srcright = srcleft.s23016745; pseudomask = (srcleft.even < srcright.even) ^ imask11; mask = pseudomask.xxxxzzzz; // 0th , 1st elements must contain exact same value (as 2nd , 3rd) thearray[i] = (srcleft & ~mask) | (srcright & mask); } } }
Comments
Post a Comment