Implement the pair coordination
See the main coordination implementation, the code there is uses as a starting point for this variant.
This should be the easiest implementation: If each kernel will run only on a couple it could be written like this:
template <bool usePBC, typename calculateFloat>
__global__ void
getCoord (const unsigned couples,
const PLMD::GPU::rationalSwitchParameters<calculateFloat>
switchingParameters,
const calculateFloat *coordinates,
const unsigned *trueIndexes,
calculateFloat *ncoordOut,
calculateFloat *devOut,
calculateFloat *virialOut) {
// blockDIm are the number of threads in your block
const unsigned i = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned j = i + couples;
// blocks are initializated with 'ceil (couples/threads)'
if (i >= couples) {
return;
}
const unsigned idx = trueIndexes[i];
const unsigned jdx = trueIndexes[j];
if (idx == jdx) {
return;
}
// local calculation aid
calculateFloat d_0, d_1, d_2;
calculateFloat dfunc;
d_0 = coordinates[X (j)] - coordinates[X (i)];
d_1 = coordinates[Y (j)] - coordinates[Y (i)];
d_2 = coordinates[Z (j)] - coordinates[Z (i)];
dfunc = 0.;
ncoordOut[i] = calculateSqr (
d_0 * d_0 + d_1 * d_1 + d_2 * d_2, switchingParameters, dfunc);
mydevX = dfunc * d_0;
mydevY = dfunc * d_1;
mydevZ = dfunc * d_2;
devOut[X (i)] = -mydevX;
devOut[Y (i)] = -mydevY;
devOut[Z (i)] = -mydevZ;
devOut[X (j)] = mydevX;
devOut[Y (j)] = mydevY;
devOut[Z (j)] = mydevZ;
virialOut[couples * 0 + i] = -dfunc * d[0] * d[0];
virialOut[couples * 1 + i] = -dfunc * d[0] * d[1];
virialOut[couples * 2 + i] = -dfunc * d[0] * d[2];
virialOut[couples * 3 + i] = -dfunc * d[1] * d[0];
virialOut[couples * 4 + i] = -dfunc * d[1] * d[1];
virialOut[couples * 5 + i] = -dfunc * d[1] * d[2];
virialOut[couples * 6 + i] = -dfunc * d[2] * d[0];
virialOut[couples * 7 + i] = -dfunc * d[2] * d[1];
virialOut[couples * 8 + i] = -dfunc * d[2] * d[2];
}
The efficiency of this approach is not high, because we lose time in spinning up ncouples
of threads, each one that does a single small task, the coordination of a single pair.
An attempt to make it quicker could be to slightly change the kernel with a for loop on some pairs.
template <bool usePBC, typename calculateFloat>
__global__ void
getCoord (const unsigned couples,
const unsigned couplesPerThread,
const PLMD::GPU::rationalSwitchParameters<calculateFloat>
switchingParameters,
const calculateFloat *coordinates,
const unsigned *trueIndexes,
calculateFloat *ncoordOut,
calculateFloat *devOut,
calculateFloat *virialOut) {
// blockDIm are the number of threads in your block
const unsigned tid = threadIdx.x + blockIdx.x * blockDim.x;
for(unsigned k=0;k<couplesPerThread;++k){
const unsigned i = couplesPerThread*tid + k;
const unsigned j = i + couples;
const unsigned idx = trueIndexes[i];
const unsigned jdx = trueIndexes[j];
if (i >= couples) {
break;
}
if (idx == jdx) {
continue;
}
// local calculation aid
calculateFloat d_0, d_1, d_2;
calculateFloat dfunc;
d_0 = coordinates[X (j)] - coordinates[X (i)];
d_1 = coordinates[Y (j)] - coordinates[Y (i)];
d_2 = coordinates[Z (j)] - coordinates[Z (i)];
dfunc = 0.;
ncoordOut[i] += calculateSqr (
d_0 * d_0 + d_1 * d_1 + d_2 * d_2, switchingParameters, dfunc);
mydevX = dfunc * d_0;
mydevY = dfunc * d_1;
mydevZ = dfunc * d_2;
devOut[X (i)] = -mydevX;
devOut[Y (i)] = -mydevY;
devOut[Z (i)] = -mydevZ;
devOut[X (j)] = mydevX;
devOut[Y (j)] = mydevY;
devOut[Z (j)] = mydevZ;
}
virialOut[couples * 0 + tid] = -dfunc * d[0] * d[0];
virialOut[couples * 1 + tid] = -dfunc * d[0] * d[1];
virialOut[couples * 2 + tid] = -dfunc * d[0] * d[2];
virialOut[couples * 3 + tid] = -dfunc * d[1] * d[0];
virialOut[couples * 4 + tid] = -dfunc * d[1] * d[1];
virialOut[couples * 5 + tid] = -dfunc * d[1] * d[2];
virialOut[couples * 6 + tid] = -dfunc * d[2] * d[0];
virialOut[couples * 7 + tid] = -dfunc * d[2] * d[1];
virialOut[couples * 8 + tid] = -dfunc * d[2] * d[2];
}
This kernel works on couplesPerThread
per thread. The problem with this prototype is that nearly all the operations are done in the global memory.