QuEST_gpu.cu
1 // Distributed under MIT licence. See https://github.com/QuEST-Kit/QuEST/blob/master/LICENCE.txt for details
2 
10 # include "QuEST.h"
11 # include "QuEST_precision.h"
12 # include "QuEST_internal.h" // purely to resolve getQuESTDefaultSeedKey
13 # include "mt19937ar.h"
14 
15 # include <stdlib.h>
16 # include <stdio.h>
17 # include <math.h>
18 
19 # define REDUCE_SHARED_SIZE 512
20 # define DEBUG 0
21 
22 
23 
24 /*
25  * struct types for concisely passing unitaries to kernels
26  */
27 
28  // hide these from doxygen
30 
31  typedef struct ArgMatrix2 {
32  Complex r0c0, r0c1;
33  Complex r1c0, r1c1;
34  } ArgMatrix2;
35 
36  typedef struct ArgMatrix4
37  {
38  Complex r0c0, r0c1, r0c2, r0c3;
39  Complex r1c0, r1c1, r1c2, r1c3;
40  Complex r2c0, r2c1, r2c2, r2c3;
41  Complex r3c0, r3c1, r3c2, r3c3;
42  } ArgMatrix4;
43 
44 ArgMatrix2 argifyMatrix2(ComplexMatrix2 m) {
45  ArgMatrix2 a;
46  a.r0c0.real=m.real[0][0]; a.r0c0.imag=m.imag[0][0];
47  a.r0c1.real=m.real[0][1]; a.r0c1.imag=m.imag[0][1];
48  a.r1c0.real=m.real[1][0]; a.r1c0.imag=m.imag[1][0];
49  a.r1c1.real=m.real[1][1]; a.r1c1.imag=m.imag[1][1];
50  return a;
51  }
52 
53 ArgMatrix4 argifyMatrix4(ComplexMatrix4 m) {
54  ArgMatrix4 a;
55  a.r0c0.real=m.real[0][0]; a.r0c0.imag=m.imag[0][0];
56  a.r0c1.real=m.real[0][1]; a.r0c1.imag=m.imag[0][1];
57  a.r0c2.real=m.real[0][2]; a.r0c2.imag=m.imag[0][2];
58  a.r0c3.real=m.real[0][3]; a.r0c3.imag=m.imag[0][3];
59  a.r1c0.real=m.real[1][0]; a.r1c0.imag=m.imag[1][0];
60  a.r1c1.real=m.real[1][1]; a.r1c1.imag=m.imag[1][1];
61  a.r1c2.real=m.real[1][2]; a.r1c2.imag=m.imag[1][2];
62  a.r1c3.real=m.real[1][3]; a.r1c3.imag=m.imag[1][3];
63  a.r2c0.real=m.real[2][0]; a.r2c0.imag=m.imag[2][0];
64  a.r2c1.real=m.real[2][1]; a.r2c1.imag=m.imag[2][1];
65  a.r2c2.real=m.real[2][2]; a.r2c2.imag=m.imag[2][2];
66  a.r2c3.real=m.real[2][3]; a.r2c3.imag=m.imag[2][3];
67  a.r3c0.real=m.real[3][0]; a.r3c0.imag=m.imag[3][0];
68  a.r3c1.real=m.real[3][1]; a.r3c1.imag=m.imag[3][1];
69  a.r3c2.real=m.real[3][2]; a.r3c2.imag=m.imag[3][2];
70  a.r3c3.real=m.real[3][3]; a.r3c3.imag=m.imag[3][3];
71  return a;
72  }
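
// Editorial note (not part of the QuEST source): argifyMatrix2/argifyMatrix4 repack a
// ComplexMatrix2/ComplexMatrix4 into a flat struct of Complex fields so that the whole
// matrix can travel by value in a kernel's launch arguments, with no separate
// cudaMalloc/cudaMemcpy for the matrix. A sketch of the pattern used later in this file:
//
//   ComplexMatrix2 u;   // filled in by the caller
//   statevec_unitaryKernel<<<blocks, threads>>>(qureg, targetQubit, argifyMatrix2(u));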
73 
75 
76 
77 
78 /*
79  * in-kernel bit twiddling functions
80  */
81 
82 __forceinline__ __device__ int extractBit (const int locationOfBitFromRight, const long long int theEncodedNumber) {
83  return (theEncodedNumber & ( 1LL << locationOfBitFromRight )) >> locationOfBitFromRight;
84 }
85 
86 __forceinline__ __device__ int getBitMaskParity(long long int mask) {
87  int parity = 0;
88  while (mask) {
89  parity = !parity;
90  mask = mask & (mask-1);
91  }
92  return parity;
93 }
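
// Editorial note (not part of the QuEST source): the loop above uses the classic
// mask &= (mask - 1) trick, which clears the lowest set bit each iteration, so the
// parity flag is toggled exactly once per set bit. E.g. getBitMaskParity(0b1101)
// runs three iterations and returns 1 (an odd number of set bits).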
94 
95 __forceinline__ __device__ long long int flipBit(const long long int number, const int bitInd) {
96  return (number ^ (1LL << bitInd));
97 }
98 
99 __forceinline__ __device__ long long int insertZeroBit(const long long int number, const int index) {
100  long long int left, right;
101  left = (number >> index) << index;
102  right = number - left;
103  return (left << 1) ^ right;
104 }
105 
106 __forceinline__ __device__ long long int insertTwoZeroBits(const long long int number, const int bit1, const int bit2) {
107  int small = (bit1 < bit2)? bit1 : bit2;
108  int big = (bit1 < bit2)? bit2 : bit1;
109  return insertZeroBit(insertZeroBit(number, small), big);
110 }
111 
112 __forceinline__ __device__ long long int insertZeroBits(long long int number, int* inds, const int numInds) {
113  /* Inserted bit inds must strictly increase, so that their final indices are correct.
114  * In lieu of sorting (avoided since there are no C++ variable-size arrays, and since we're
115  * already memory bottle-necked, so this overhead is absorbed), we find the next-smallest
116  * index at each insert. Recall every element of inds (a non-negative number) is unique.
117  * This function won't appear in the CPU code, which can use C99 variable-size arrays and
118  * ought to make a sorted array before threading.
119  */
120  int curMin = inds[0];
121  int prevMin = -1;
122  for (int n=0; n < numInds; n++) {
123 
124  // find next min
125  for (int t=0; t < numInds; t++)
126  if (inds[t]>prevMin && inds[t]<curMin)
127  curMin = inds[t];
128 
129  number = insertZeroBit(number, curMin);
130 
131  // set curMin to an arbitrary non-visited elem
132  prevMin = curMin;
133  for (int t=0; t < numInds; t++)
134  if (inds[t] > curMin) {
135  curMin = inds[t];
136  break;
137  }
138  }
139  return number;
140 }
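
// Editorial note (not part of the QuEST source): a worked example of the bit-insertion
// helpers above, which expand a compact task index into the amplitude index whose
// inserted qubits are all 0. insertZeroBit(n, i) shifts the bits of n at positions >= i
// up by one and leaves a 0 at position i:
//
//   insertZeroBit(0b1011, 2)        == 0b10011    (a zero placed at bit 2)
//   insertTwoZeroBits(0b1011, 1, 3) == 0b100101   (zeros at bits 1 and 3)
//   flipBit(0b10011, 2)             == 0b10111    (partner amplitude with that bit set)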
141 
142 
143 
144 /*
145  * state vector and density matrix operations
146  */
147 
148 #ifdef __cplusplus
149 extern "C" {
150 #endif
151 
152 
153 void statevec_setAmps(Qureg qureg, long long int startInd, qreal* reals, qreal* imags, long long int numAmps) {
154 
155  cudaDeviceSynchronize();
156  cudaMemcpy(
157  qureg.deviceStateVec.real + startInd,
158  reals,
159  numAmps * sizeof(*(qureg.deviceStateVec.real)),
160  cudaMemcpyHostToDevice);
161  cudaMemcpy(
162  qureg.deviceStateVec.imag + startInd,
163  imags,
164  numAmps * sizeof(*(qureg.deviceStateVec.imag)),
165  cudaMemcpyHostToDevice);
166 }
167 
168 
170 void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg) {
171 
172  // copy copyQureg's GPU statevec to targetQureg's GPU statevec
173  cudaDeviceSynchronize();
174  cudaMemcpy(
175  targetQureg.deviceStateVec.real,
176  copyQureg.deviceStateVec.real,
177  targetQureg.numAmpsPerChunk*sizeof(*(targetQureg.deviceStateVec.real)),
178  cudaMemcpyDeviceToDevice);
179  cudaMemcpy(
180  targetQureg.deviceStateVec.imag,
181  copyQureg.deviceStateVec.imag,
182  targetQureg.numAmpsPerChunk*sizeof(*(targetQureg.deviceStateVec.imag)),
183  cudaMemcpyDeviceToDevice);
184 }
185 
186 __global__ void densmatr_initPureStateKernel(
187  long long int numPureAmps,
188  qreal *targetVecReal, qreal *targetVecImag,
189  qreal *copyVecReal, qreal *copyVecImag)
190 {
191  // this is a particular index of the pure copyQureg
192  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
193  if (index>=numPureAmps) return;
194 
195  qreal realRow = copyVecReal[index];
196  qreal imagRow = copyVecImag[index];
197  for (long long int col=0; col < numPureAmps; col++) {
198  qreal realCol = copyVecReal[col];
199  qreal imagCol = - copyVecImag[col]; // minus for conjugation
200  targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol;
201  targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol;
202  }
203 }
204 
205 void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
206 {
207  int threadsPerCUDABlock, CUDABlocks;
208  threadsPerCUDABlock = 128;
209  CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock);
210  densmatr_initPureStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
211  copyQureg.numAmpsPerChunk,
212  targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag,
213  copyQureg.deviceStateVec.real, copyQureg.deviceStateVec.imag);
214 }
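
// Editorial note (not part of the QuEST source): the kernel above writes the outer
// product rho = |psi><psi| into the flat array of the density-matrix Qureg, i.e. for
// N = numPureAmps,
//
//   targetVec[col*N + index] = copyVec[index] * conj(copyVec[col])
//
// where the negated imagCol performs the conjugation; each thread owns one value of
// `index` and sweeps over every column.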
215 
216 __global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag){
217  long long int index;
218 
219  index = blockIdx.x*blockDim.x + threadIdx.x;
220  if (index>=stateVecSize) return;
221 
222  stateVecReal[index] = probFactor;
223  stateVecImag[index] = 0.0;
224 }
225 
226 void densmatr_initPlusState(Qureg qureg)
227 {
228  qreal probFactor = 1.0/((qreal) (1LL << qureg.numQubitsRepresented));
229  int threadsPerCUDABlock, CUDABlocks;
230  threadsPerCUDABlock = 128;
231  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
232  densmatr_initPlusStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
233  qureg.numAmpsPerChunk,
234  probFactor,
235  qureg.deviceStateVec.real,
236  qureg.deviceStateVec.imag);
237 }
238 
239 __global__ void densmatr_initClassicalStateKernel(
240  long long int densityNumElems,
241  qreal *densityReal, qreal *densityImag,
242  long long int densityInd)
243 {
244  // initialise the state to all zeros
245  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
246  if (index >= densityNumElems) return;
247 
248  densityReal[index] = 0.0;
249  densityImag[index] = 0.0;
250 
251  if (index==densityInd){
252  // classical state has probability 1
253  densityReal[densityInd] = 1.0;
254  densityImag[densityInd] = 0.0;
255  }
256 }
257 
258 void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
259 {
260  int threadsPerCUDABlock, CUDABlocks;
261  threadsPerCUDABlock = 128;
262  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
263 
264  // index of the desired state in the flat density matrix
265  long long int densityDim = 1LL << qureg.numQubitsRepresented;
266  long long int densityInd = (densityDim + 1)*stateInd;
267 
268  // identical to pure version
269  densmatr_initClassicalStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
270  qureg.numAmpsPerChunk,
271  qureg.deviceStateVec.real,
272  qureg.deviceStateVec.imag, densityInd);
273 }
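
// Editorial note (not part of the QuEST source): with dim = 2^numQubitsRepresented, the
// diagonal element (stateInd, stateInd) of the density matrix sits at flat index
//
//   stateInd*dim + stateInd = (dim + 1)*stateInd
//
// which is the densityInd computed above; the kernel zeroes every other element.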
274 
275 void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env)
276 {
277  // allocate CPU memory
278  long long int numAmps = 1LL << numQubits;
279  long long int numAmpsPerRank = numAmps/env.numRanks;
280  qureg->stateVec.real = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->stateVec.real)));
281  qureg->stateVec.imag = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->stateVec.imag)));
282  if (env.numRanks>1){
283  qureg->pairStateVec.real = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->pairStateVec.real)));
284  qureg->pairStateVec.imag = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->pairStateVec.imag)));
285  }
286 
287  // check cpu memory allocation was successful
288  if ( (!(qureg->stateVec.real) || !(qureg->stateVec.imag))
289  && numAmpsPerRank ) {
290  printf("Could not allocate memory!\n");
291  exit (EXIT_FAILURE);
292  }
293  if ( env.numRanks>1 && (!(qureg->pairStateVec.real) || !(qureg->pairStateVec.imag))
294  && numAmpsPerRank ) {
295  printf("Could not allocate memory!\n");
296  exit (EXIT_FAILURE);
297  }
298 
299  qureg->numQubitsInStateVec = numQubits;
300  qureg->numAmpsPerChunk = numAmpsPerRank;
301  qureg->numAmpsTotal = numAmps;
302  qureg->chunkId = env.rank;
303  qureg->numChunks = env.numRanks;
304  qureg->isDensityMatrix = 0;
305 
306  // allocate GPU memory
307  cudaMalloc(&(qureg->deviceStateVec.real), qureg->numAmpsPerChunk*sizeof(*(qureg->deviceStateVec.real)));
308  cudaMalloc(&(qureg->deviceStateVec.imag), qureg->numAmpsPerChunk*sizeof(*(qureg->deviceStateVec.imag)));
309  cudaMalloc(&(qureg->firstLevelReduction), ceil(qureg->numAmpsPerChunk/(qreal)REDUCE_SHARED_SIZE)*sizeof(qreal));
310  cudaMalloc(&(qureg->secondLevelReduction), ceil(qureg->numAmpsPerChunk/(qreal)(REDUCE_SHARED_SIZE*REDUCE_SHARED_SIZE))*
311  sizeof(qreal));
312 
313  // check gpu memory allocation was successful
314  if (!(qureg->deviceStateVec.real) || !(qureg->deviceStateVec.imag)){
315  printf("Could not allocate memory on GPU!\n");
316  exit (EXIT_FAILURE);
317  }
318 
319 }
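
// Editorial note (not part of the QuEST source): the two reduction buffers allocated
// above are scratch space for two-level sum reductions (one partial sum per block of
// REDUCE_SHARED_SIZE values, then a second pass over those). With REDUCE_SHARED_SIZE = 512,
//
//   firstLevelReduction  : ceil(numAmpsPerChunk / 512)       qreals
//   secondLevelReduction : ceil(numAmpsPerChunk / (512*512)) qreals
//
// e.g. 131072 and 256 qreals for a 26-qubit (2^26-amplitude) register.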
320 
321 void statevec_destroyQureg(Qureg qureg, QuESTEnv env)
322 {
323  // Free CPU memory
324  free(qureg.stateVec.real);
325  free(qureg.stateVec.imag);
326  if (env.numRanks>1){
327  free(qureg.pairStateVec.real);
328  free(qureg.pairStateVec.imag);
329  }
330 
331  // Free GPU memory
332  cudaFree(qureg.deviceStateVec.real);
333  cudaFree(qureg.deviceStateVec.imag);
334  cudaFree(qureg.firstLevelReduction);
335  cudaFree(qureg.secondLevelReduction);
336 }
337 
338 DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env) {
339 
340  DiagonalOp op;
341  op.numQubits = numQubits;
342  op.numElemsPerChunk = (1LL << numQubits) / env.numRanks;
343  op.chunkId = env.rank;
344  op.numChunks = env.numRanks;
345 
346  // allocate CPU memory (initialised to zero)
347  op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
348  op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
349  // @TODO no handling of rank>1 allocation (no distributed GPU)
350 
351  // check cpu memory allocation was successful
352  if ( !op.real || !op.imag ) {
353  printf("Could not allocate memory!\n");
354  exit(EXIT_FAILURE);
355  }
356 
357  // allocate GPU memory
358  size_t arrSize = op.numElemsPerChunk * sizeof(qreal);
359  cudaMalloc(&(op.deviceOperator.real), arrSize);
360  cudaMalloc(&(op.deviceOperator.imag), arrSize);
361 
362  // check gpu memory allocation was successful
363  if (!op.deviceOperator.real || !op.deviceOperator.imag) {
364  printf("Could not allocate memory on GPU!\n");
365  exit(EXIT_FAILURE);
366  }
367 
368  // initialise GPU memory to zero
369  cudaMemset(op.deviceOperator.real, 0, arrSize);
370  cudaMemset(op.deviceOperator.imag, 0, arrSize);
371 
372  return op;
373 }
374 
375 void agnostic_destroyDiagonalOp(DiagonalOp op) {
376  free(op.real);
377  free(op.imag);
378  cudaFree(op.deviceOperator.real);
379  cudaFree(op.deviceOperator.imag);
380 }
381 
382 void agnostic_syncDiagonalOp(DiagonalOp op) {
383  cudaDeviceSynchronize();
384  size_t mem_elems = op.numElemsPerChunk * sizeof *op.real;
385  cudaMemcpy(op.deviceOperator.real, op.real, mem_elems, cudaMemcpyHostToDevice);
386  cudaMemcpy(op.deviceOperator.imag, op.imag, mem_elems, cudaMemcpyHostToDevice);
387 }
388 
389 __global__ void agnostic_initDiagonalOpFromPauliHamilKernel(
390  DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms
391 ) {
392  // each thread processes one diagonal element
393  long long int elemInd = blockIdx.x*blockDim.x + threadIdx.x;
394  if (elemInd >= op.numElemsPerChunk)
395  return;
396 
397  qreal elem = 0;
398 
399  // elem is (+-) every coefficient, with sign determined by parity
400  for (int t=0; t<numSumTerms; t++) {
401 
402  // determine the parity of the Z-targeted qubits in the element's corresponding state
403  int isOddNumOnes = 0;
404  for (int q=0; q<op.numQubits; q++)
405  if (pauliCodes[q + t*op.numQubits] == PAULI_Z)
406  if (extractBit(q, elemInd))
407  isOddNumOnes = !isOddNumOnes;
408 
409  // avoid warp divergence
410  int sign = 1 - 2*isOddNumOnes; // (-1 if isOddNumOnes, else +1)
411  elem += termCoeffs[t] * sign;
412  }
413 
414  op.deviceOperator.real[elemInd] = elem;
415  op.deviceOperator.imag[elemInd] = 0;
416 }
417 
418 void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil) {
419 
420  // copy args into GPU memory
421  enum pauliOpType* d_pauliCodes;
422  size_t mem_pauliCodes = hamil.numSumTerms * op.numQubits * sizeof *d_pauliCodes;
423  cudaMalloc(&d_pauliCodes, mem_pauliCodes);
424  cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);
425 
426  qreal* d_termCoeffs;
427  size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
428  cudaMalloc(&d_termCoeffs, mem_termCoeffs);
429  cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);
430 
431  int numThreadsPerBlock = 128;
432  int numBlocks = ceil(op.numElemsPerChunk / (qreal) numThreadsPerBlock);
433  agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
434  op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);
435 
436  // copy the populated operator back into RAM
437  cudaDeviceSynchronize();
438  size_t mem_elems = op.numElemsPerChunk * sizeof *op.real;
439  cudaMemcpy(op.real, op.deviceOperator.real, mem_elems, cudaMemcpyDeviceToHost);
440  cudaMemcpy(op.imag, op.deviceOperator.imag, mem_elems, cudaMemcpyDeviceToHost);
441 
442  cudaFree(d_pauliCodes);
443  cudaFree(d_termCoeffs);
444 }
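
// Editorial note (not part of the QuEST source): each diagonal element produced by the
// kernel above is
//
//   elem[j] = sum_t termCoeffs[t] * (-1)^(number of Z-targeted bits of j set in term t)
//
// For example, the 2-qubit Hamiltonian 0.5*Z0 + 0.3*Z0 Z1 yields the diagonal
// (0.8, -0.8, 0.2, -0.2) for basis states j = 0..3 (qubit 0 = least significant bit).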
445 
446 int GPUExists(void){
447  int deviceCount, device;
448  int gpuDeviceCount = 0;
449  struct cudaDeviceProp properties;
450  cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount);
451  if (cudaResultCode != cudaSuccess) deviceCount = 0;
452  /* machines with no GPUs can still report one emulation device */
453  for (device = 0; device < deviceCount; ++device) {
454  cudaGetDeviceProperties(&properties, device);
455  if (properties.major != 9999) { /* 9999 means emulation only */
456  ++gpuDeviceCount;
457  }
458  }
459  if (gpuDeviceCount) return 1;
460  else return 0;
461 }
462 
463 QuESTEnv createQuESTEnv(void) {
464 
465  if (!GPUExists()){
466  printf("Trying to run GPU code with no GPU available\n");
467  exit(EXIT_FAILURE);
468  }
469 
470  QuESTEnv env;
471  env.rank=0;
472  env.numRanks=1;
473 
474  env.seeds = NULL;
475  env.numSeeds = 0;
476  seedQuESTDefault(env);
477 
478  return env;
479 }
480 
481 void syncQuESTEnv(QuESTEnv env){
482  cudaDeviceSynchronize();
483 }
484 
485 int syncQuESTSuccess(int successCode){
486  return successCode;
487 }
488 
489 void destroyQuESTEnv(QuESTEnv env){
490  free(env.seeds);
491 }
492 
493 void reportQuESTEnv(QuESTEnv env){
494  printf("EXECUTION ENVIRONMENT:\n");
495  printf("Running locally on one node with GPU\n");
496  printf("Number of ranks is %d\n", env.numRanks);
497 # ifdef _OPENMP
498  printf("OpenMP enabled\n");
499  printf("Number of threads available is %d\n", omp_get_max_threads());
500 # else
501  printf("OpenMP disabled\n");
502 # endif
503 }
504 
505 void getEnvironmentString(QuESTEnv env, char str[200]){
506 
507  // OpenMP can be hybridised with GPU in future, so this check is safe and worthwhile
508  int ompStatus=0;
509  int numThreads=1;
510 # ifdef _OPENMP
511  ompStatus=1;
512  numThreads=omp_get_max_threads();
513 # endif
514 
515  // there is no reporting of CUDA cores/threads/blocks currently (since non-trivial)
516  sprintf(str, "CUDA=1 OpenMP=%d MPI=0 threads=%d ranks=1", ompStatus, numThreads);
517 }
518 
519 void copyStateToGPU(Qureg qureg)
520 {
521  if (DEBUG) printf("Copying data to GPU\n");
522  cudaMemcpy(qureg.deviceStateVec.real, qureg.stateVec.real,
523  qureg.numAmpsPerChunk*sizeof(*(qureg.deviceStateVec.real)), cudaMemcpyHostToDevice);
524  cudaMemcpy(qureg.deviceStateVec.imag, qureg.stateVec.imag,
525  qureg.numAmpsPerChunk*sizeof(*(qureg.deviceStateVec.imag)), cudaMemcpyHostToDevice);
526  if (DEBUG) printf("Finished copying data to GPU\n");
527 }
528 
529 void copyStateFromGPU(Qureg qureg)
530 {
531  cudaDeviceSynchronize();
532  if (DEBUG) printf("Copying data from GPU\n");
533  cudaMemcpy(qureg.stateVec.real, qureg.deviceStateVec.real,
534  qureg.numAmpsPerChunk*sizeof(*(qureg.deviceStateVec.real)), cudaMemcpyDeviceToHost);
535  cudaMemcpy(qureg.stateVec.imag, qureg.deviceStateVec.imag,
536  qureg.numAmpsPerChunk*sizeof(*(qureg.deviceStateVec.imag)), cudaMemcpyDeviceToHost);
537  if (DEBUG) printf("Finished copying data from GPU\n");
538 }
539 
543 void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank){
544  long long int index;
545  int rank;
546  copyStateFromGPU(qureg);
547  if (qureg.numQubitsInStateVec<=5){
548  for (rank=0; rank<qureg.numChunks; rank++){
549  if (qureg.chunkId==rank){
550  if (reportRank) {
551  printf("Reporting state from rank %d [\n", qureg.chunkId);
552  //printf("\trank, index, real, imag\n");
553  printf("real, imag\n");
554  } else if (rank==0) {
555  printf("Reporting state [\n");
556  printf("real, imag\n");
557  }
558 
559  for(index=0; index<qureg.numAmpsPerChunk; index++){
560  printf(REAL_STRING_FORMAT ", " REAL_STRING_FORMAT "\n", qureg.stateVec.real[index], qureg.stateVec.imag[index]);
561  }
562  if (reportRank || rank==qureg.numChunks-1) printf("]\n");
563  }
564  syncQuESTEnv(env);
565  }
566  }
567 }
568 
569 qreal statevec_getRealAmp(Qureg qureg, long long int index){
570  qreal el=0;
571  cudaMemcpy(&el, &(qureg.deviceStateVec.real[index]),
572  sizeof(*(qureg.deviceStateVec.real)), cudaMemcpyDeviceToHost);
573  return el;
574 }
575 
576 qreal statevec_getImagAmp(Qureg qureg, long long int index){
577  qreal el=0;
578  cudaMemcpy(&el, &(qureg.deviceStateVec.imag[index]),
579  sizeof(*(qureg.deviceStateVec.imag)), cudaMemcpyDeviceToHost);
580  return el;
581 }
582 
583 __global__ void statevec_initBlankStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag){
584  long long int index;
585 
586  // initialise the statevector to be all-zeros
587  index = blockIdx.x*blockDim.x + threadIdx.x;
588  if (index>=stateVecSize) return;
589  stateVecReal[index] = 0.0;
590  stateVecImag[index] = 0.0;
591 }
592 
593 void statevec_initBlankState(Qureg qureg)
594 {
595  int threadsPerCUDABlock, CUDABlocks;
596  threadsPerCUDABlock = 128;
597  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
598  statevec_initBlankStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
599  qureg.numAmpsPerChunk,
600  qureg.deviceStateVec.real,
601  qureg.deviceStateVec.imag);
602 }
603 
604 __global__ void statevec_initZeroStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag){
605  long long int index;
606 
607  // initialise the state to |0000..0000>
608  index = blockIdx.x*blockDim.x + threadIdx.x;
609  if (index>=stateVecSize) return;
610  stateVecReal[index] = 0.0;
611  stateVecImag[index] = 0.0;
612 
613  if (index==0){
614  // zero state |0000..0000> has probability 1
615  stateVecReal[0] = 1.0;
616  stateVecImag[0] = 0.0;
617  }
618 }
619 
620 void statevec_initZeroState(Qureg qureg)
621 {
622  int threadsPerCUDABlock, CUDABlocks;
623  threadsPerCUDABlock = 128;
624  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
625  statevec_initZeroStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
626  qureg.numAmpsPerChunk,
627  qureg.deviceStateVec.real,
628  qureg.deviceStateVec.imag);
629 }
630 
631 __global__ void statevec_initPlusStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag){
632  long long int index;
633 
634  index = blockIdx.x*blockDim.x + threadIdx.x;
635  if (index>=stateVecSize) return;
636 
637  qreal normFactor = 1.0/sqrt((qreal)stateVecSize);
638  stateVecReal[index] = normFactor;
639  stateVecImag[index] = 0.0;
640 }
641 
642 void statevec_initPlusState(Qureg qureg)
643 {
644  int threadsPerCUDABlock, CUDABlocks;
645  threadsPerCUDABlock = 128;
646  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
647  statevec_initPlusStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
648  qureg.numAmpsPerChunk,
649  qureg.deviceStateVec.real,
650  qureg.deviceStateVec.imag);
651 }
652 
653 __global__ void statevec_initClassicalStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, long long int stateInd){
654  long long int index;
655 
656  // initialise the state to |stateInd>
657  index = blockIdx.x*blockDim.x + threadIdx.x;
658  if (index>=stateVecSize) return;
659  stateVecReal[index] = 0.0;
660  stateVecImag[index] = 0.0;
661 
662  if (index==stateInd){
663  // classical state has probability 1
664  stateVecReal[stateInd] = 1.0;
665  stateVecImag[stateInd] = 0.0;
666  }
667 }
668 void statevec_initClassicalState(Qureg qureg, long long int stateInd)
669 {
670  int threadsPerCUDABlock, CUDABlocks;
671  threadsPerCUDABlock = 128;
672  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
673  statevec_initClassicalStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
674  qureg.numAmpsPerChunk,
675  qureg.deviceStateVec.real,
676  qureg.deviceStateVec.imag, stateInd);
677 }
678 
679 __global__ void statevec_initDebugStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag){
680  long long int index;
681 
682  index = blockIdx.x*blockDim.x + threadIdx.x;
683  if (index>=stateVecSize) return;
684 
685  stateVecReal[index] = (index*2.0)/10.0;
686  stateVecImag[index] = (index*2.0+1.0)/10.0;
687 }
688 
689 void statevec_initDebugState(Qureg qureg)
690 {
691  int threadsPerCUDABlock, CUDABlocks;
692  threadsPerCUDABlock = 128;
693  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
694  statevec_initDebugStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
695  qureg.numAmpsPerChunk,
696  qureg.deviceStateVec.real,
697  qureg.deviceStateVec.imag);
698 }
699 
700 __global__ void statevec_initStateOfSingleQubitKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, int qubitId, int outcome){
701  long long int index;
702  int bit;
703 
704  index = blockIdx.x*blockDim.x + threadIdx.x;
705  if (index>=stateVecSize) return;
706 
707  qreal normFactor = 1.0/sqrt((qreal)stateVecSize/2);
708  bit = extractBit(qubitId, index);
709  if (bit==outcome) {
710  stateVecReal[index] = normFactor;
711  stateVecImag[index] = 0.0;
712  } else {
713  stateVecReal[index] = 0.0;
714  stateVecImag[index] = 0.0;
715  }
716 }
717 
718 void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome)
719 {
720  int threadsPerCUDABlock, CUDABlocks;
721  threadsPerCUDABlock = 128;
722  CUDABlocks = ceil((qreal)(qureg->numAmpsPerChunk)/threadsPerCUDABlock);
723  statevec_initStateOfSingleQubitKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg->numAmpsPerChunk, qureg->deviceStateVec.real, qureg->deviceStateVec.imag, qubitId, outcome);
724 }
725 
726 // returns 1 if successful, else 0
727 int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env){
728  long long int chunkSize, stateVecSize;
729  long long int indexInChunk, totalIndex;
730 
731  chunkSize = qureg->numAmpsPerChunk;
732  stateVecSize = chunkSize*qureg->numChunks;
733 
734  qreal *stateVecReal = qureg->stateVec.real;
735  qreal *stateVecImag = qureg->stateVec.imag;
736 
737  FILE *fp;
738  char line[200];
739 
740  fp = fopen(filename, "r");
741  if (fp == NULL)
742  return 0;
743 
744  indexInChunk = 0; totalIndex = 0;
745  while (fgets(line, sizeof(char)*200, fp) != NULL && totalIndex<stateVecSize){
746  if (line[0]!='#'){
747  int chunkId = totalIndex/chunkSize;
748  if (chunkId==qureg->chunkId){
749  # if QuEST_PREC==1
750  sscanf(line, "%f, %f", &(stateVecReal[indexInChunk]),
751  &(stateVecImag[indexInChunk]));
752  # elif QuEST_PREC==2
753  sscanf(line, "%lf, %lf", &(stateVecReal[indexInChunk]),
754  &(stateVecImag[indexInChunk]));
755  # elif QuEST_PREC==4
756  sscanf(line, "%Lf, %Lf", &(stateVecReal[indexInChunk]),
757  &(stateVecImag[indexInChunk]));
758  # endif
759  indexInChunk += 1;
760  }
761  totalIndex += 1;
762  }
763  }
764  fclose(fp);
765  copyStateToGPU(*qureg);
766 
767  // indicate success
768  return 1;
769 }
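
// Editorial note (not part of the QuEST source): the loader above expects one
// "real, imag" pair per line, in order of increasing basis-state index, with lines
// beginning '#' ignored. A hypothetical 1-qubit input file for (|0> + |1>)/sqrt(2):
//
//   # real, imag
//   0.70710678, 0.0
//   0.70710678, 0.0
//
// The parsed chunk is pushed to the GPU by copyStateToGPU once the file is read.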
770 
771 int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision){
772  qreal diff;
773  int chunkSize = mq1.numAmpsPerChunk;
774 
775  copyStateFromGPU(mq1);
776  copyStateFromGPU(mq2);
777 
778  for (int i=0; i<chunkSize; i++){
779  diff = mq1.stateVec.real[i] - mq2.stateVec.real[i];
780  if (diff<0) diff *= -1;
781  if (diff>precision) return 0;
782  diff = mq1.stateVec.imag[i] - mq2.stateVec.imag[i];
783  if (diff<0) diff *= -1;
784  if (diff>precision) return 0;
785  }
786  return 1;
787 }
788 
789 __global__ void statevec_compactUnitaryKernel (Qureg qureg, int rotQubit, Complex alpha, Complex beta){
790  // ----- sizes
791  long long int sizeBlock, // size of blocks
792  sizeHalfBlock; // size of blocks halved
793  // ----- indices
794  long long int thisBlock, // current block
795  indexUp,indexLo; // current index and corresponding index in lower half block
796 
797  // ----- temp variables
798  qreal stateRealUp,stateRealLo, // storage for previous state values
799  stateImagUp,stateImagLo; // (used in updates)
800  // ----- temp variables
801  long long int thisTask; // task-based approach to expose the loop with small granularity
802  long long int numTasks=qureg.numAmpsPerChunk>>1;
803 
804  sizeHalfBlock = 1LL << rotQubit; // size of blocks halved
805  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
806 
807  // ---------------------------------------------------------------- //
808  // rotate //
809  // ---------------------------------------------------------------- //
810 
812  qreal *stateVecReal = qureg.deviceStateVec.real;
813  qreal *stateVecImag = qureg.deviceStateVec.imag;
814  qreal alphaImag=alpha.imag, alphaReal=alpha.real;
815  qreal betaImag=beta.imag, betaReal=beta.real;
816 
817  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
818  if (thisTask>=numTasks) return;
819 
820  thisBlock = thisTask / sizeHalfBlock;
821  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
822  indexLo = indexUp + sizeHalfBlock;
823 
824  // store current state vector values in temp variables
825  stateRealUp = stateVecReal[indexUp];
826  stateImagUp = stateVecImag[indexUp];
827 
828  stateRealLo = stateVecReal[indexLo];
829  stateImagLo = stateVecImag[indexLo];
830 
831  // state[indexUp] = alpha * state[indexUp] - conj(beta) * state[indexLo]
832  stateVecReal[indexUp] = alphaReal*stateRealUp - alphaImag*stateImagUp
833  - betaReal*stateRealLo - betaImag*stateImagLo;
834  stateVecImag[indexUp] = alphaReal*stateImagUp + alphaImag*stateRealUp
835  - betaReal*stateImagLo + betaImag*stateRealLo;
836 
837  // state[indexLo] = beta * state[indexUp] + conj(alpha) * state[indexLo]
838  stateVecReal[indexLo] = betaReal*stateRealUp - betaImag*stateImagUp
839  + alphaReal*stateRealLo + alphaImag*stateImagLo;
840  stateVecImag[indexLo] = betaReal*stateImagUp + betaImag*stateRealUp
841  + alphaReal*stateImagLo - alphaImag*stateRealLo;
842 }
843 
844 void statevec_compactUnitary(Qureg qureg, int targetQubit, Complex alpha, Complex beta)
845 {
846  int threadsPerCUDABlock, CUDABlocks;
847  threadsPerCUDABlock = 128;
848  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
849  statevec_compactUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, alpha, beta);
850 }
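
// Editorial note (not part of the QuEST source): how the single-target kernels above pair
// amplitudes. With sizeHalfBlock = 2^targetQubit and sizeBlock = 2*sizeHalfBlock, task t maps to
//
//   indexUp = (t / sizeHalfBlock)*sizeBlock + (t % sizeHalfBlock)   (target bit = 0)
//   indexLo = indexUp + sizeHalfBlock                               (target bit = 1)
//
// e.g. for targetQubit = 1, tasks 0..3 update the amplitude pairs (0,2), (1,3), (4,6), (5,7).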
851 
852 __global__ void statevec_controlledCompactUnitaryKernel (Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta){
853  // ----- sizes
854  long long int sizeBlock, // size of blocks
855  sizeHalfBlock; // size of blocks halved
856  // ----- indices
857  long long int thisBlock, // current block
858  indexUp,indexLo; // current index and corresponding index in lower half block
859 
860  // ----- temp variables
861  qreal stateRealUp,stateRealLo, // storage for previous state values
862  stateImagUp,stateImagLo; // (used in updates)
863  // ----- temp variables
864  long long int thisTask; // task-based approach to expose the loop with small granularity
865  long long int numTasks=qureg.numAmpsPerChunk>>1;
866  int controlBit;
867 
868  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
869  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
870 
871  // ---------------------------------------------------------------- //
872  // rotate //
873  // ---------------------------------------------------------------- //
874 
876  qreal *stateVecReal = qureg.deviceStateVec.real;
877  qreal *stateVecImag = qureg.deviceStateVec.imag;
878  qreal alphaImag=alpha.imag, alphaReal=alpha.real;
879  qreal betaImag=beta.imag, betaReal=beta.real;
880 
881  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
882  if (thisTask>=numTasks) return;
883 
884  thisBlock = thisTask / sizeHalfBlock;
885  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
886  indexLo = indexUp + sizeHalfBlock;
887 
888  controlBit = extractBit(controlQubit, indexUp);
889  if (controlBit){
890  // store current state vector values in temp variables
891  stateRealUp = stateVecReal[indexUp];
892  stateImagUp = stateVecImag[indexUp];
893 
894  stateRealLo = stateVecReal[indexLo];
895  stateImagLo = stateVecImag[indexLo];
896 
897  // state[indexUp] = alpha * state[indexUp] - conj(beta) * state[indexLo]
898  stateVecReal[indexUp] = alphaReal*stateRealUp - alphaImag*stateImagUp
899  - betaReal*stateRealLo - betaImag*stateImagLo;
900  stateVecImag[indexUp] = alphaReal*stateImagUp + alphaImag*stateRealUp
901  - betaReal*stateImagLo + betaImag*stateRealLo;
902 
903  // state[indexLo] = beta * state[indexUp] + conj(alpha) * state[indexLo]
904  stateVecReal[indexLo] = betaReal*stateRealUp - betaImag*stateImagUp
905  + alphaReal*stateRealLo + alphaImag*stateImagLo;
906  stateVecImag[indexLo] = betaReal*stateImagUp + betaImag*stateRealUp
907  + alphaReal*stateImagLo - alphaImag*stateRealLo;
908  }
909 }
910 
911 void statevec_controlledCompactUnitary(Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
912 {
913  int threadsPerCUDABlock, CUDABlocks;
914  threadsPerCUDABlock = 128;
915  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
916  statevec_controlledCompactUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, alpha, beta);
917 }
918 
919 __global__ void statevec_unitaryKernel(Qureg qureg, int targetQubit, ArgMatrix2 u){
920  // ----- sizes
921  long long int sizeBlock, // size of blocks
922  sizeHalfBlock; // size of blocks halved
923  // ----- indices
924  long long int thisBlock, // current block
925  indexUp,indexLo; // current index and corresponding index in lower half block
926 
927  // ----- temp variables
928  qreal stateRealUp,stateRealLo, // storage for previous state values
929  stateImagUp,stateImagLo; // (used in updates)
930  // ----- temp variables
931  long long int thisTask; // task-based approach to expose the loop with small granularity
932  long long int numTasks=qureg.numAmpsPerChunk>>1;
933 
934  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
935  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
936 
937  // ---------------------------------------------------------------- //
938  // rotate //
939  // ---------------------------------------------------------------- //
940 
942  qreal *stateVecReal = qureg.deviceStateVec.real;
943  qreal *stateVecImag = qureg.deviceStateVec.imag;
944 
945  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
946  if (thisTask>=numTasks) return;
947 
948  thisBlock = thisTask / sizeHalfBlock;
949  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
950  indexLo = indexUp + sizeHalfBlock;
951 
952  // store current state vector values in temp variables
953  stateRealUp = stateVecReal[indexUp];
954  stateImagUp = stateVecImag[indexUp];
955 
956  stateRealLo = stateVecReal[indexLo];
957  stateImagLo = stateVecImag[indexLo];
958 
959  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
960  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
961  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
962  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
963  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
964 
965  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
966  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
967  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
968  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
969  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
970 }
971 
972 void statevec_unitary(Qureg qureg, int targetQubit, ComplexMatrix2 u)
973 {
974  int threadsPerCUDABlock, CUDABlocks;
975  threadsPerCUDABlock = 128;
976  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
977  statevec_unitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, argifyMatrix2(u));
978 }
979 
980 __global__ void statevec_multiControlledMultiQubitUnitaryKernel(
981  Qureg qureg, long long int ctrlMask, int* targs, int numTargs,
982  qreal* uRe, qreal* uIm, long long int* ampInds, qreal* reAmps, qreal* imAmps, long long int numTargAmps)
983 {
984 
985  // decide the amplitudes this thread will modify
986  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
987  long long int numTasks = qureg.numAmpsPerChunk >> numTargs; // kernel called on every 1 in 2^numTargs amplitudes
988  if (thisTask>=numTasks) return;
989 
990  // find this task's start index (where all targs are 0)
991  long long int ind00 = insertZeroBits(thisTask, targs, numTargs);
992 
993  // this task only modifies amplitudes if control qubits are 1 for this state
994  if (ctrlMask && (ctrlMask&ind00) != ctrlMask)
995  return;
996 
997  qreal *reVec = qureg.deviceStateVec.real;
998  qreal *imVec = qureg.deviceStateVec.imag;
999 
1000  /*
1001  each thread needs:
1002  long long int ampInds[numAmps];
1003  qreal reAmps[numAmps];
1004  qreal imAmps[numAmps];
1005  but instead has access to shared arrays, with below stride and offset
1006  */
1007  size_t stride = gridDim.x*blockDim.x;
1008  size_t offset = blockIdx.x*blockDim.x + threadIdx.x;
1009 
1010  // determine the indices and record values of target amps
1011  long long int ind;
1012  for (int i=0; i < numTargAmps; i++) {
1013 
1014  // get global index of current target qubit assignment
1015  ind = ind00;
1016  for (int t=0; t < numTargs; t++)
1017  if (extractBit(t, i))
1018  ind = flipBit(ind, targs[t]);
1019 
1020  ampInds[i*stride+offset] = ind;
1021  reAmps [i*stride+offset] = reVec[ind];
1022  imAmps [i*stride+offset] = imVec[ind];
1023  }
1024 
1025  // update the amplitudes
1026  for (int r=0; r < numTargAmps; r++) {
1027  ind = ampInds[r*stride+offset];
1028  reVec[ind] = 0;
1029  imVec[ind] = 0;
1030  for (int c=0; c < numTargAmps; c++) {
1031  qreal uReElem = uRe[c + r*numTargAmps];
1032  qreal uImElem = uIm[c + r*numTargAmps];
1033  reVec[ind] += reAmps[c*stride+offset]*uReElem - imAmps[c*stride+offset]*uImElem;
1034  imVec[ind] += reAmps[c*stride+offset]*uImElem + imAmps[c*stride+offset]*uReElem;
1035  }
1036  }
1037 }
1038 
1039 void statevec_multiControlledMultiQubitUnitary(Qureg qureg, long long int ctrlMask, int* targs, int numTargs, ComplexMatrixN u)
1040 {
1041  int threadsPerCUDABlock = 128;
1042  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>numTargs)/threadsPerCUDABlock);
1043 
1044  // allocate device space for global {targs} (length: numTargs) and populate
1045  int *d_targs;
1046  size_t targMemSize = numTargs * sizeof *d_targs;
1047  cudaMalloc(&d_targs, targMemSize);
1048  cudaMemcpy(d_targs, targs, targMemSize, cudaMemcpyHostToDevice);
1049 
1050  // flatten out the u.real and u.imag lists
1051  int uNumRows = (1 << u.numQubits);
1052  qreal* uReFlat = (qreal*) malloc(uNumRows*uNumRows * sizeof *uReFlat);
1053  qreal* uImFlat = (qreal*) malloc(uNumRows*uNumRows * sizeof *uImFlat);
1054  long long int i = 0;
1055  for (int r=0; r < uNumRows; r++)
1056  for (int c=0; c < uNumRows; c++) {
1057  uReFlat[i] = u.real[r][c];
1058  uImFlat[i] = u.imag[r][c];
1059  i++;
1060  }
1061 
1062  // allocate device space for global u.real and u.imag (flatten by concatenating rows) and populate
1063  qreal* d_uRe;
1064  qreal* d_uIm;
1065  size_t uMemSize = uNumRows*uNumRows * sizeof *d_uRe; // size of each of d_uRe and d_uIm
1066  cudaMalloc(&d_uRe, uMemSize);
1067  cudaMalloc(&d_uIm, uMemSize);
1068  cudaMemcpy(d_uRe, uReFlat, uMemSize, cudaMemcpyHostToDevice);
1069  cudaMemcpy(d_uIm, uImFlat, uMemSize, cudaMemcpyHostToDevice);
1070 
1071  // allocate device workspace for thread-local {ampInds}, {reAmps}, {imAmps} (length: 1<<numTargs)
1072  long long int *d_ampInds;
1073  qreal *d_reAmps;
1074  qreal *d_imAmps;
1075  size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks;
1076  int numTargAmps = uNumRows;
1077  cudaMalloc(&d_ampInds, numTargAmps*gridSize * sizeof *d_ampInds);
1078  cudaMalloc(&d_reAmps, numTargAmps*gridSize * sizeof *d_reAmps);
1079  cudaMalloc(&d_imAmps, numTargAmps*gridSize * sizeof *d_imAmps);
1080 
1081  // call kernel
1082  statevec_multiControlledMultiQubitUnitaryKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
1083  qureg, ctrlMask, d_targs, numTargs, d_uRe, d_uIm, d_ampInds, d_reAmps, d_imAmps, numTargAmps);
1084 
1085  // free kernel memory
1086  free(uReFlat);
1087  free(uImFlat);
1088  cudaFree(d_targs);
1089  cudaFree(d_uRe);
1090  cudaFree(d_uIm);
1091  cudaFree(d_ampInds);
1092  cudaFree(d_reAmps);
1093  cudaFree(d_imAmps);
1094 }
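
// Editorial note (not part of the QuEST source): because numTargs is only known at runtime,
// each thread's 2^numTargs amplitude indices and values cannot live in fixed-size local
// arrays. The host therefore allocates global-memory workspace (d_ampInds, d_reAmps,
// d_imAmps), each holding numTargAmps entries per thread of the whole grid, and the kernel
// addresses its slice as
//
//   element i  ->  i*stride + offset,   stride = gridDim.x*blockDim.x,  offset = global thread id.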
1095 
1096 __global__ void statevec_multiControlledTwoQubitUnitaryKernel(Qureg qureg, long long int ctrlMask, int q1, int q2, ArgMatrix4 u){
1097 
1098  // decide the 4 amplitudes this thread will modify
1099  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1100  long long int numTasks = qureg.numAmpsPerChunk >> 2; // kernel called on every 1 in 4 amplitudes
1101  if (thisTask>=numTasks) return;
1102 
1103  qreal *reVec = qureg.deviceStateVec.real;
1104  qreal *imVec = qureg.deviceStateVec.imag;
1105 
1106  // find indices of amplitudes to modify (treat q1 as the least significant bit)
1107  long long int ind00, ind01, ind10, ind11;
1108  ind00 = insertTwoZeroBits(thisTask, q1, q2);
1109 
1110  // modify only if control qubits are 1 for this state
1111  if (ctrlMask && (ctrlMask&ind00) != ctrlMask)
1112  return;
1113 
1114  ind01 = flipBit(ind00, q1);
1115  ind10 = flipBit(ind00, q2);
1116  ind11 = flipBit(ind01, q2);
1117 
1118  // extract statevec amplitudes
1119  qreal re00, re01, re10, re11;
1120  qreal im00, im01, im10, im11;
1121  re00 = reVec[ind00]; im00 = imVec[ind00];
1122  re01 = reVec[ind01]; im01 = imVec[ind01];
1123  re10 = reVec[ind10]; im10 = imVec[ind10];
1124  re11 = reVec[ind11]; im11 = imVec[ind11];
1125 
1126  // apply u * {amp00, amp01, amp10, amp11}
1127  reVec[ind00] =
1128  u.r0c0.real*re00 - u.r0c0.imag*im00 +
1129  u.r0c1.real*re01 - u.r0c1.imag*im01 +
1130  u.r0c2.real*re10 - u.r0c2.imag*im10 +
1131  u.r0c3.real*re11 - u.r0c3.imag*im11;
1132  imVec[ind00] =
1133  u.r0c0.imag*re00 + u.r0c0.real*im00 +
1134  u.r0c1.imag*re01 + u.r0c1.real*im01 +
1135  u.r0c2.imag*re10 + u.r0c2.real*im10 +
1136  u.r0c3.imag*re11 + u.r0c3.real*im11;
1137 
1138  reVec[ind01] =
1139  u.r1c0.real*re00 - u.r1c0.imag*im00 +
1140  u.r1c1.real*re01 - u.r1c1.imag*im01 +
1141  u.r1c2.real*re10 - u.r1c2.imag*im10 +
1142  u.r1c3.real*re11 - u.r1c3.imag*im11;
1143  imVec[ind01] =
1144  u.r1c0.imag*re00 + u.r1c0.real*im00 +
1145  u.r1c1.imag*re01 + u.r1c1.real*im01 +
1146  u.r1c2.imag*re10 + u.r1c2.real*im10 +
1147  u.r1c3.imag*re11 + u.r1c3.real*im11;
1148 
1149  reVec[ind10] =
1150  u.r2c0.real*re00 - u.r2c0.imag*im00 +
1151  u.r2c1.real*re01 - u.r2c1.imag*im01 +
1152  u.r2c2.real*re10 - u.r2c2.imag*im10 +
1153  u.r2c3.real*re11 - u.r2c3.imag*im11;
1154  imVec[ind10] =
1155  u.r2c0.imag*re00 + u.r2c0.real*im00 +
1156  u.r2c1.imag*re01 + u.r2c1.real*im01 +
1157  u.r2c2.imag*re10 + u.r2c2.real*im10 +
1158  u.r2c3.imag*re11 + u.r2c3.real*im11;
1159 
1160  reVec[ind11] =
1161  u.r3c0.real*re00 - u.r3c0.imag*im00 +
1162  u.r3c1.real*re01 - u.r3c1.imag*im01 +
1163  u.r3c2.real*re10 - u.r3c2.imag*im10 +
1164  u.r3c3.real*re11 - u.r3c3.imag*im11;
1165  imVec[ind11] =
1166  u.r3c0.imag*re00 + u.r3c0.real*im00 +
1167  u.r3c1.imag*re01 + u.r3c1.real*im01 +
1168  u.r3c2.imag*re10 + u.r3c2.real*im10 +
1169  u.r3c3.imag*re11 + u.r3c3.real*im11;
1170 }
1171 
1172 void statevec_multiControlledTwoQubitUnitary(Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u)
1173 {
1174  int threadsPerCUDABlock = 128;
1175  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>2)/threadsPerCUDABlock); // one kernel eval for every 4 amplitudes
1176  statevec_multiControlledTwoQubitUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, ctrlMask, q1, q2, argifyMatrix4(u));
1177 }
1178 
1179 __global__ void statevec_controlledUnitaryKernel(Qureg qureg, int controlQubit, int targetQubit, ArgMatrix2 u){
1180  // ----- sizes
1181  long long int sizeBlock, // size of blocks
1182  sizeHalfBlock; // size of blocks halved
1183  // ----- indices
1184  long long int thisBlock, // current block
1185  indexUp,indexLo; // current index and corresponding index in lower half block
1186 
1187  // ----- temp variables
1188  qreal stateRealUp,stateRealLo, // storage for previous state values
1189  stateImagUp,stateImagLo; // (used in updates)
1190  // ----- temp variables
1191  long long int thisTask; // task-based approach to expose the loop with small granularity
1192  long long int numTasks=qureg.numAmpsPerChunk>>1;
1193 
1194  int controlBit;
1195 
1196  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1197  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1198 
1199  // ---------------------------------------------------------------- //
1200  // rotate //
1201  // ---------------------------------------------------------------- //
1202 
1204  qreal *stateVecReal = qureg.deviceStateVec.real;
1205  qreal *stateVecImag = qureg.deviceStateVec.imag;
1206 
1207  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1208  if (thisTask>=numTasks) return;
1209 
1210  thisBlock = thisTask / sizeHalfBlock;
1211  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1212  indexLo = indexUp + sizeHalfBlock;
1213 
1214  // store current state vector values in temp variables
1215  stateRealUp = stateVecReal[indexUp];
1216  stateImagUp = stateVecImag[indexUp];
1217 
1218  stateRealLo = stateVecReal[indexLo];
1219  stateImagLo = stateVecImag[indexLo];
1220 
1221  controlBit = extractBit(controlQubit, indexUp);
1222  if (controlBit){
1223  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
1224  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
1225  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
1226  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
1227  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
1228 
1229  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
1230  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
1231  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
1232  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
1233  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
1234  }
1235 }
1236 
1237 void statevec_controlledUnitary(Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u)
1238 {
1239  int threadsPerCUDABlock, CUDABlocks;
1240  threadsPerCUDABlock = 128;
1241  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1242  statevec_controlledUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, argifyMatrix2(u));
1243 }
1244 
1245 __global__ void statevec_multiControlledUnitaryKernel(
1246  Qureg qureg,
1247  long long int ctrlQubitsMask, long long int ctrlFlipMask,
1248  int targetQubit, ArgMatrix2 u
1249 ){
1250  // ----- sizes
1251  long long int sizeBlock, // size of blocks
1252  sizeHalfBlock; // size of blocks halved
1253  // ----- indices
1254  long long int thisBlock, // current block
1255  indexUp,indexLo; // current index and corresponding index in lower half block
1256 
1257  // ----- temp variables
1258  qreal stateRealUp,stateRealLo, // storage for previous state values
1259  stateImagUp,stateImagLo; // (used in updates)
1260  // ----- temp variables
1261  long long int thisTask; // task-based approach to expose the loop with small granularity
1262  long long int numTasks=qureg.numAmpsPerChunk>>1;
1263 
1264 
1265  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1266  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1267 
1268  // ---------------------------------------------------------------- //
1269  // rotate //
1270  // ---------------------------------------------------------------- //
1271 
1273  qreal *stateVecReal = qureg.deviceStateVec.real;
1274  qreal *stateVecImag = qureg.deviceStateVec.imag;
1275 
1276  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1277  if (thisTask>=numTasks) return;
1278 
1279  thisBlock = thisTask / sizeHalfBlock;
1280  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1281  indexLo = indexUp + sizeHalfBlock;
1282 
1283  if (ctrlQubitsMask == (ctrlQubitsMask & (indexUp ^ ctrlFlipMask))) {
1284  // store current state vector values in temp variables
1285  stateRealUp = stateVecReal[indexUp];
1286  stateImagUp = stateVecImag[indexUp];
1287 
1288  stateRealLo = stateVecReal[indexLo];
1289  stateImagLo = stateVecImag[indexLo];
1290 
1291  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
1292  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
1293  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
1294  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
1295  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
1296 
1297  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
1298  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
1299  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
1300  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
1301  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
1302  }
1303 }
1304 
1305 void statevec_multiControlledUnitary(
1306  Qureg qureg,
1307  long long int ctrlQubitsMask, long long int ctrlFlipMask,
1308  int targetQubit, ComplexMatrix2 u
1309 ){
1310  int threadsPerCUDABlock = 128;
1311  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1312  statevec_multiControlledUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
1313  qureg, ctrlQubitsMask, ctrlFlipMask, targetQubit, argifyMatrix2(u));
1314 }
1315 
1316 __global__ void statevec_pauliXKernel(Qureg qureg, int targetQubit){
1317  // ----- sizes
1318  long long int sizeBlock, // size of blocks
1319  sizeHalfBlock; // size of blocks halved
1320  // ----- indices
1321  long long int thisBlock, // current block
1322  indexUp,indexLo; // current index and corresponding index in lower half block
1323 
1324  // ----- temp variables
1325  qreal stateRealUp, // storage for previous state values
1326  stateImagUp; // (used in updates)
1327  // ----- temp variables
1328  long long int thisTask; // task-based approach to expose the loop with small granularity
1329  long long int numTasks=qureg.numAmpsPerChunk>>1;
1330 
1331  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1332  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1333 
1334  // ---------------------------------------------------------------- //
1335  // rotate //
1336  // ---------------------------------------------------------------- //
1337 
1339  qreal *stateVecReal = qureg.deviceStateVec.real;
1340  qreal *stateVecImag = qureg.deviceStateVec.imag;
1341 
1342  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1343  if (thisTask>=numTasks) return;
1344 
1345  thisBlock = thisTask / sizeHalfBlock;
1346  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1347  indexLo = indexUp + sizeHalfBlock;
1348 
1349  // store current state vector values in temp variables
1350  stateRealUp = stateVecReal[indexUp];
1351  stateImagUp = stateVecImag[indexUp];
1352 
1353  stateVecReal[indexUp] = stateVecReal[indexLo];
1354  stateVecImag[indexUp] = stateVecImag[indexLo];
1355 
1356  stateVecReal[indexLo] = stateRealUp;
1357  stateVecImag[indexLo] = stateImagUp;
1358 }
1359 
1360 void statevec_pauliX(Qureg qureg, int targetQubit)
1361 {
1362  int threadsPerCUDABlock, CUDABlocks;
1363  threadsPerCUDABlock = 128;
1364  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1365  statevec_pauliXKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit);
1366 }
1367 
1368 __global__ void statevec_pauliYKernel(Qureg qureg, int targetQubit, int conjFac){
1369 
1370  long long int sizeHalfBlock = 1LL << targetQubit;
1371  long long int sizeBlock = 2LL * sizeHalfBlock;
1372  long long int numTasks = qureg.numAmpsPerChunk >> 1;
1373  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1374  if (thisTask>=numTasks) return;
1375 
1376  long long int thisBlock = thisTask / sizeHalfBlock;
1377  long long int indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1378  long long int indexLo = indexUp + sizeHalfBlock;
1379  qreal stateRealUp, stateImagUp;
1380 
1381  qreal *stateVecReal = qureg.deviceStateVec.real;
1382  qreal *stateVecImag = qureg.deviceStateVec.imag;
1383  stateRealUp = stateVecReal[indexUp];
1384  stateImagUp = stateVecImag[indexUp];
1385 
1386  // update under +-{{0, -i}, {i, 0}}
1387  stateVecReal[indexUp] = conjFac * stateVecImag[indexLo];
1388  stateVecImag[indexUp] = conjFac * -stateVecReal[indexLo];
1389  stateVecReal[indexLo] = conjFac * -stateImagUp;
1390  stateVecImag[indexLo] = conjFac * stateRealUp;
1391 }
1392 
1393 void statevec_pauliY(Qureg qureg, int targetQubit)
1394 {
1395  int threadsPerCUDABlock, CUDABlocks;
1396  threadsPerCUDABlock = 128;
1397  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1398  statevec_pauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, 1);
1399 }
1400 
1401 void statevec_pauliYConj(Qureg qureg, int targetQubit)
1402 {
1403  int threadsPerCUDABlock, CUDABlocks;
1404  threadsPerCUDABlock = 128;
1405  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1406  statevec_pauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, -1);
1407 }
1408 
1409 __global__ void statevec_controlledPauliYKernel(Qureg qureg, int controlQubit, int targetQubit, int conjFac)
1410 {
1411  long long int index;
1412  long long int sizeBlock, sizeHalfBlock;
1413  long long int stateVecSize;
1414  int controlBit;
1415 
1416  qreal stateRealUp, stateImagUp;
1417  long long int thisBlock, indexUp, indexLo;
1418  sizeHalfBlock = 1LL << targetQubit;
1419  sizeBlock = 2LL * sizeHalfBlock;
1420 
1421  stateVecSize = qureg.numAmpsPerChunk;
1422  qreal *stateVecReal = qureg.deviceStateVec.real;
1423  qreal *stateVecImag = qureg.deviceStateVec.imag;
1424 
1425  index = blockIdx.x*blockDim.x + threadIdx.x;
1426  if (index>=(stateVecSize>>1)) return;
1427  thisBlock = index / sizeHalfBlock;
1428  indexUp = thisBlock*sizeBlock + index%sizeHalfBlock;
1429  indexLo = indexUp + sizeHalfBlock;
1430 
1431  controlBit = extractBit(controlQubit, indexUp);
1432  if (controlBit){
1433 
1434  stateRealUp = stateVecReal[indexUp];
1435  stateImagUp = stateVecImag[indexUp];
1436 
1437  // update under +-{{0, -i}, {i, 0}}
1438  stateVecReal[indexUp] = conjFac * stateVecImag[indexLo];
1439  stateVecImag[indexUp] = conjFac * -stateVecReal[indexLo];
1440  stateVecReal[indexLo] = conjFac * -stateImagUp;
1441  stateVecImag[indexLo] = conjFac * stateRealUp;
1442  }
1443 }
1444 
1445 void statevec_controlledPauliY(Qureg qureg, int controlQubit, int targetQubit)
1446 {
1447  int conjFactor = 1;
1448  int threadsPerCUDABlock, CUDABlocks;
1449  threadsPerCUDABlock = 128;
1450  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1451  statevec_controlledPauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, conjFactor);
1452 }
1453 
1454 void statevec_controlledPauliYConj(Qureg qureg, int controlQubit, int targetQubit)
1455 {
1456  int conjFactor = -1;
1457  int threadsPerCUDABlock, CUDABlocks;
1458  threadsPerCUDABlock = 128;
1459  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1460  statevec_controlledPauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, conjFactor);
1461 }
1462 
1463 __global__ void statevec_phaseShiftByTermKernel(Qureg qureg, int targetQubit, qreal cosAngle, qreal sinAngle) {
1464 
1465  long long int sizeBlock, sizeHalfBlock;
1466  long long int thisBlock, indexUp,indexLo;
1467 
1468  qreal stateRealLo, stateImagLo;
1469  long long int thisTask;
1470  long long int numTasks = qureg.numAmpsPerChunk >> 1;
1471 
1472  sizeHalfBlock = 1LL << targetQubit;
1473  sizeBlock = 2LL * sizeHalfBlock;
1474 
1475  qreal *stateVecReal = qureg.deviceStateVec.real;
1476  qreal *stateVecImag = qureg.deviceStateVec.imag;
1477 
1478  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1479  if (thisTask>=numTasks) return;
1480  thisBlock = thisTask / sizeHalfBlock;
1481  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1482  indexLo = indexUp + sizeHalfBlock;
1483 
1484  stateRealLo = stateVecReal[indexLo];
1485  stateImagLo = stateVecImag[indexLo];
1486 
1487  stateVecReal[indexLo] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1488  stateVecImag[indexLo] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1489 }
1490 
1491 void statevec_phaseShiftByTerm(Qureg qureg, int targetQubit, Complex term)
1492 {
1493  qreal cosAngle = term.real;
1494  qreal sinAngle = term.imag;
1495 
1496  int threadsPerCUDABlock, CUDABlocks;
1497  threadsPerCUDABlock = 128;
1498  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1499  statevec_phaseShiftByTermKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, cosAngle, sinAngle);
1500 }
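
// Editorial note (not part of the QuEST source): the kernel above leaves the target-bit-0
// amplitude untouched and multiplies the target-bit-1 amplitude by term = cosAngle + i*sinAngle,
// i.e. it applies the diagonal gate diag(1, term); a phase-shift caller would pass
// term = exp(i*angle).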
1501 
1502 __global__ void statevec_controlledPhaseShiftKernel(Qureg qureg, int idQubit1, int idQubit2, qreal cosAngle, qreal sinAngle)
1503 {
1504  long long int index;
1505  long long int stateVecSize;
1506  int bit1, bit2;
1507  qreal stateRealLo, stateImagLo;
1508 
1509  stateVecSize = qureg.numAmpsPerChunk;
1510  qreal *stateVecReal = qureg.deviceStateVec.real;
1511  qreal *stateVecImag = qureg.deviceStateVec.imag;
1512 
1513  index = blockIdx.x*blockDim.x + threadIdx.x;
1514  if (index>=stateVecSize) return;
1515 
1516  bit1 = extractBit (idQubit1, index);
1517  bit2 = extractBit (idQubit2, index);
1518  if (bit1 && bit2) {
1519  stateRealLo = stateVecReal[index];
1520  stateImagLo = stateVecImag[index];
1521 
1522  stateVecReal[index] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1523  stateVecImag[index] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1524  }
1525 }
1526 
1527 void statevec_controlledPhaseShift(Qureg qureg, int idQubit1, int idQubit2, qreal angle)
1528 {
1529  qreal cosAngle = cos(angle);
1530  qreal sinAngle = sin(angle);
1531 
1532  int threadsPerCUDABlock, CUDABlocks;
1533  threadsPerCUDABlock = 128;
1534  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1535  statevec_controlledPhaseShiftKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, idQubit1, idQubit2, cosAngle, sinAngle);
1536 }
1537 
1538 __global__ void statevec_multiControlledPhaseShiftKernel(Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle) {
1539  qreal stateRealLo, stateImagLo;
1540  long long int index;
1541  long long int stateVecSize;
1542 
1543  stateVecSize = qureg.numAmpsPerChunk;
1544  qreal *stateVecReal = qureg.deviceStateVec.real;
1545  qreal *stateVecImag = qureg.deviceStateVec.imag;
1546 
1547  index = blockIdx.x*blockDim.x + threadIdx.x;
1548  if (index>=stateVecSize) return;
1549 
1550  if (mask == (mask & index) ){
1551  stateRealLo = stateVecReal[index];
1552  stateImagLo = stateVecImag[index];
1553  stateVecReal[index] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1554  stateVecImag[index] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1555  }
1556 }
1557 
1558 void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int numControlQubits, qreal angle)
1559 {
1560  qreal cosAngle = cos(angle);
1561  qreal sinAngle = sin(angle);
1562 
1563  long long int mask = getQubitBitMask(controlQubits, numControlQubits);
1564 
1565  int threadsPerCUDABlock, CUDABlocks;
1566  threadsPerCUDABlock = 128;
1567  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1568  statevec_multiControlledPhaseShiftKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask, cosAngle, sinAngle);
1569 }
1570 
1571 __global__ void statevec_multiRotateZKernel(Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle) {
1572 
1573  long long int stateVecSize = qureg.numAmpsPerChunk;
1574  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
1575  if (index>=stateVecSize) return;
1576 
1577  qreal *stateVecReal = qureg.deviceStateVec.real;
1578  qreal *stateVecImag = qureg.deviceStateVec.imag;
1579 
1580  int fac = getBitMaskParity(mask & index)? -1 : 1;
1581  qreal stateReal = stateVecReal[index];
1582  qreal stateImag = stateVecImag[index];
1583 
1584  stateVecReal[index] = cosAngle*stateReal + fac * sinAngle*stateImag;
1585  stateVecImag[index] = - fac * sinAngle*stateReal + cosAngle*stateImag;
1586 }
1587 
1588 void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle)
1589 {
1590  qreal cosAngle = cos(angle/2.0);
1591  qreal sinAngle = sin(angle/2.0);
1592 
1593  int threadsPerCUDABlock, CUDABlocks;
1594  threadsPerCUDABlock = 128;
1595  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1596  statevec_multiRotateZKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask, cosAngle, sinAngle);
1597 }
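// Note: statevec_multiRotateZ is consistent with applying exp(-i angle/2 * Z x ... x Z) on the
// qubits set in 'mask': an amplitude whose masked bits have even parity is multiplied by
// exp(-i angle/2), one with odd parity by exp(+i angle/2), matching fac = +-1 in the kernel.
// A minimal host-side sketch of driving it (the helper name and the targs/numTargs variables
// are illustrative, not part of the file):
void exampleMultiRotateZ(Qureg qureg, int* targs, int numTargs, qreal angle) {
    long long int targMask = getQubitBitMask(targs, numTargs);
    statevec_multiRotateZ(qureg, targMask, angle);
}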
1598 
1599 __global__ void statevec_multiControlledMultiRotateZKernel(Qureg qureg, long long int ctrlMask, long long int targMask, qreal cosAngle, qreal sinAngle) {
1600 
1601  long long int stateVecSize = qureg.numAmpsPerChunk;
1602  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
1603  if (index>=stateVecSize) return;
1604 
 1605  // amplitudes whose control qubits are not all 1 are unmodified
1606  if (ctrlMask && ((ctrlMask & index) != ctrlMask))
1607  return;
1608 
1609  qreal *stateVecReal = qureg.deviceStateVec.real;
1610  qreal *stateVecImag = qureg.deviceStateVec.imag;
1611 
1612  // avoid warp divergence, setting fac = +- 1
1613  int fac = 1-2*getBitMaskParity(targMask & index);
1614  qreal stateReal = stateVecReal[index];
1615  qreal stateImag = stateVecImag[index];
1616 
1617  stateVecReal[index] = cosAngle*stateReal + fac * sinAngle*stateImag;
1618  stateVecImag[index] = - fac * sinAngle*stateReal + cosAngle*stateImag;
1619 }
1620 
1621 void statevec_multiControlledMultiRotateZ(Qureg qureg, long long int ctrlMask, long long int targMask, qreal angle)
1622 {
1623  qreal cosAngle = cos(angle/2.0);
1624  qreal sinAngle = sin(angle/2.0);
1625 
1626  int threadsPerCUDABlock, CUDABlocks;
1627  threadsPerCUDABlock = 128;
1628  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1629  statevec_multiControlledMultiRotateZKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, ctrlMask, targMask, cosAngle, sinAngle);
1630 }
1631 
 1632 qreal densmatr_calcTotalProb(Qureg qureg) {
 1633 
1634  // computes the trace using Kahan summation
1635  qreal pTotal=0;
1636  qreal y, t, c;
1637  c = 0;
1638 
1639  long long int numCols = 1LL << qureg.numQubitsRepresented;
1640  long long diagIndex;
1641 
1642  copyStateFromGPU(qureg);
1643 
1644  for (int col=0; col< numCols; col++) {
1645  diagIndex = col*(numCols + 1);
1646  y = qureg.stateVec.real[diagIndex] - c;
1647  t = pTotal + y;
1648  c = ( t - pTotal ) - y; // brackets are important
1649  pTotal = t;
1650  }
1651 
1652  return pTotal;
1653 }
1654 
 1655 qreal statevec_calcTotalProb(Qureg qureg){
 1656  /* IJB - implemented using Kahan summation for greater accuracy at a slight floating
1657  point operation overhead. For more details see https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
1658  /* Don't change the bracketing in this routine! */
1659  qreal pTotal=0;
1660  qreal y, t, c;
1661  long long int index;
1662  long long int numAmpsPerRank = qureg.numAmpsPerChunk;
1663 
1664  copyStateFromGPU(qureg);
1665 
1666  c = 0.0;
1667  for (index=0; index<numAmpsPerRank; index++){
1668  /* Perform pTotal+=qureg.stateVec.real[index]*qureg.stateVec.real[index]; by Kahan */
1669  // pTotal+=qureg.stateVec.real[index]*qureg.stateVec.real[index];
1670  y = qureg.stateVec.real[index]*qureg.stateVec.real[index] - c;
1671  t = pTotal + y;
1672  c = ( t - pTotal ) - y;
1673  pTotal = t;
1674 
1675  /* Perform pTotal+=qureg.stateVec.imag[index]*qureg.stateVec.imag[index]; by Kahan */
1676  //pTotal+=qureg.stateVec.imag[index]*qureg.stateVec.imag[index];
1677  y = qureg.stateVec.imag[index]*qureg.stateVec.imag[index] - c;
1678  t = pTotal + y;
1679  c = ( t - pTotal ) - y;
1680  pTotal = t;
1681 
1682 
1683  }
1684  return pTotal;
1685 }
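// Note: both calcTotalProb routines above use the Kahan compensated-summation pattern. A minimal
// standalone sketch of that pattern on a plain array (the helper name is illustrative):
static qreal exampleKahanSum(const qreal* vals, long long int n) {
    qreal total = 0, c = 0;                // c carries the low-order bits lost so far
    for (long long int i = 0; i < n; i++) {
        qreal y = vals[i] - c;             // re-inject the previously lost bits
        qreal t = total + y;               // big + small: low-order bits of y may be lost
        c = (t - total) - y;               // recover exactly what was lost (bracketing matters)
        total = t;
    }
    return total;
}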
1686 
1687 __global__ void statevec_controlledPhaseFlipKernel(Qureg qureg, int idQubit1, int idQubit2)
1688 {
1689  long long int index;
1690  long long int stateVecSize;
1691  int bit1, bit2;
1692 
1693  stateVecSize = qureg.numAmpsPerChunk;
1694  qreal *stateVecReal = qureg.deviceStateVec.real;
1695  qreal *stateVecImag = qureg.deviceStateVec.imag;
1696 
1697  index = blockIdx.x*blockDim.x + threadIdx.x;
1698  if (index>=stateVecSize) return;
1699 
1700  bit1 = extractBit (idQubit1, index);
1701  bit2 = extractBit (idQubit2, index);
1702  if (bit1 && bit2) {
1703  stateVecReal [index] = - stateVecReal [index];
1704  stateVecImag [index] = - stateVecImag [index];
1705  }
1706 }
1707 
1708 void statevec_controlledPhaseFlip(Qureg qureg, int idQubit1, int idQubit2)
1709 {
1710  int threadsPerCUDABlock, CUDABlocks;
1711  threadsPerCUDABlock = 128;
1712  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1713  statevec_controlledPhaseFlipKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, idQubit1, idQubit2);
1714 }
1715 
1716 __global__ void statevec_multiControlledPhaseFlipKernel(Qureg qureg, long long int mask)
1717 {
1718  long long int index;
1719  long long int stateVecSize;
1720 
1721  stateVecSize = qureg.numAmpsPerChunk;
1722  qreal *stateVecReal = qureg.deviceStateVec.real;
1723  qreal *stateVecImag = qureg.deviceStateVec.imag;
1724 
1725  index = blockIdx.x*blockDim.x + threadIdx.x;
1726  if (index>=stateVecSize) return;
1727 
1728  if (mask == (mask & index) ){
1729  stateVecReal [index] = - stateVecReal [index];
1730  stateVecImag [index] = - stateVecImag [index];
1731  }
1732 }
1733 
1734 void statevec_multiControlledPhaseFlip(Qureg qureg, int *controlQubits, int numControlQubits)
1735 {
1736  int threadsPerCUDABlock, CUDABlocks;
1737  long long int mask = getQubitBitMask(controlQubits, numControlQubits);
1738  threadsPerCUDABlock = 128;
1739  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1740  statevec_multiControlledPhaseFlipKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask);
1741 }
1742 
1743 __global__ void statevec_swapQubitAmpsKernel(Qureg qureg, int qb1, int qb2) {
1744 
1745  qreal *reVec = qureg.deviceStateVec.real;
1746  qreal *imVec = qureg.deviceStateVec.imag;
1747 
1748  long long int numTasks = qureg.numAmpsPerChunk >> 2; // each iteration updates 2 amps and skips 2 amps
1749  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1750  if (thisTask>=numTasks) return;
1751 
1752  long long int ind00, ind01, ind10;
1753  qreal re01, re10, im01, im10;
1754 
1755  // determine ind00 of |..0..0..>, |..0..1..> and |..1..0..>
1756  ind00 = insertTwoZeroBits(thisTask, qb1, qb2);
1757  ind01 = flipBit(ind00, qb1);
1758  ind10 = flipBit(ind00, qb2);
1759 
1760  // extract statevec amplitudes
1761  re01 = reVec[ind01]; im01 = imVec[ind01];
1762  re10 = reVec[ind10]; im10 = imVec[ind10];
1763 
1764  // swap 01 and 10 amps
1765  reVec[ind01] = re10; reVec[ind10] = re01;
1766  imVec[ind01] = im10; imVec[ind10] = im01;
1767 }
1768 
1769 void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2)
1770 {
1771  int threadsPerCUDABlock, CUDABlocks;
1772  threadsPerCUDABlock = 128;
1773  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>2)/threadsPerCUDABlock);
1774  statevec_swapQubitAmpsKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, qb1, qb2);
1775 }
1776 
1777 __global__ void statevec_hadamardKernel (Qureg qureg, int targetQubit){
1778  // ----- sizes
1779  long long int sizeBlock, // size of blocks
1780  sizeHalfBlock; // size of blocks halved
1781  // ----- indices
1782  long long int thisBlock, // current block
1783  indexUp,indexLo; // current index and corresponding index in lower half block
1784 
1785  // ----- temp variables
1786  qreal stateRealUp,stateRealLo, // storage for previous state values
1787  stateImagUp,stateImagLo; // (used in updates)
1788  // ----- temp variables
 1789  long long int thisTask; // task-based approach to expose loops with small granularity
1790  long long int numTasks=qureg.numAmpsPerChunk>>1;
1791 
1792  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1793  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1794 
1795  // ---------------------------------------------------------------- //
1796  // rotate //
1797  // ---------------------------------------------------------------- //
1798 
1800  qreal *stateVecReal = qureg.deviceStateVec.real;
1801  qreal *stateVecImag = qureg.deviceStateVec.imag;
1802 
1803  qreal recRoot2 = 1.0/sqrt(2.0);
1804 
1805  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1806  if (thisTask>=numTasks) return;
1807 
1808  thisBlock = thisTask / sizeHalfBlock;
1809  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1810  indexLo = indexUp + sizeHalfBlock;
1811 
1812  // store current state vector values in temp variables
1813  stateRealUp = stateVecReal[indexUp];
1814  stateImagUp = stateVecImag[indexUp];
1815 
1816  stateRealLo = stateVecReal[indexLo];
1817  stateImagLo = stateVecImag[indexLo];
1818 
1819  stateVecReal[indexUp] = recRoot2*(stateRealUp + stateRealLo);
1820  stateVecImag[indexUp] = recRoot2*(stateImagUp + stateImagLo);
1821 
1822  stateVecReal[indexLo] = recRoot2*(stateRealUp - stateRealLo);
1823  stateVecImag[indexLo] = recRoot2*(stateImagUp - stateImagLo);
1824 }
1825 
1826 void statevec_hadamard(Qureg qureg, int targetQubit)
1827 {
1828  int threadsPerCUDABlock, CUDABlocks;
1829  threadsPerCUDABlock = 128;
1830  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1831  statevec_hadamardKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit);
1832 }
1833 
1834 __global__ void statevec_controlledNotKernel(Qureg qureg, int controlQubit, int targetQubit)
1835 {
1836  long long int index;
1837  long long int sizeBlock, // size of blocks
1838  sizeHalfBlock; // size of blocks halved
1839  long long int stateVecSize;
1840  int controlBit;
1841 
1842  // ----- temp variables
1843  qreal stateRealUp, // storage for previous state values
1844  stateImagUp; // (used in updates)
1845  long long int thisBlock, // current block
1846  indexUp,indexLo; // current index and corresponding index in lower half block
1847  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1848  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1849 
1850  stateVecSize = qureg.numAmpsPerChunk;
1851  qreal *stateVecReal = qureg.deviceStateVec.real;
1852  qreal *stateVecImag = qureg.deviceStateVec.imag;
1853 
1854  index = blockIdx.x*blockDim.x + threadIdx.x;
1855  if (index>=(stateVecSize>>1)) return;
1856  thisBlock = index / sizeHalfBlock;
1857  indexUp = thisBlock*sizeBlock + index%sizeHalfBlock;
1858  indexLo = indexUp + sizeHalfBlock;
1859 
1860  controlBit = extractBit(controlQubit, indexUp);
1861  if (controlBit){
1862  stateRealUp = stateVecReal[indexUp];
1863  stateImagUp = stateVecImag[indexUp];
1864 
1865  stateVecReal[indexUp] = stateVecReal[indexLo];
1866  stateVecImag[indexUp] = stateVecImag[indexLo];
1867 
1868  stateVecReal[indexLo] = stateRealUp;
1869  stateVecImag[indexLo] = stateImagUp;
1870  }
1871 }
1872 
1873 void statevec_controlledNot(Qureg qureg, int controlQubit, int targetQubit)
1874 {
1875  int threadsPerCUDABlock, CUDABlocks;
1876  threadsPerCUDABlock = 128;
1877  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1878  statevec_controlledNotKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit);
1879 }
1880 
1881 __global__ void statevec_multiControlledMultiQubitNotKernel(Qureg qureg, int ctrlMask, int targMask) {
1882 
1883  qreal* stateRe = qureg.deviceStateVec.real;
1884  qreal* stateIm = qureg.deviceStateVec.imag;
1885 
 1886  // although each thread swaps/updates two amplitudes, we still invoke one thread per amp
1887  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
1888  if (ampInd >= qureg.numAmpsPerChunk)
1889  return;
1890 
1891  // modify amplitudes only if control qubits are 1 for this state
1892  if (ctrlMask && ((ctrlMask & ampInd) != ctrlMask))
1893  return;
1894 
1895  long long int mateInd = ampInd ^ targMask;
1896 
1897  // if the mate is lower index, another thread is handling it
1898  if (mateInd < ampInd)
1899  return;
1900 
1901  /* it may seem wasteful to spawn more threads than are needed, and abort
1902  * half of them due to the amp pairing above (and potentially abort
1903  * an exponential number due to ctrlMask). however, since we are moving
 1904  * global memory directly in a potentially non-contiguous fashion, this
1905  * method is likely to be memory bandwidth bottlenecked anyway
1906  */
1907 
1908  qreal mateRe = stateRe[mateInd];
1909  qreal mateIm = stateIm[mateInd];
1910 
1911  // swap amp with mate
1912  stateRe[mateInd] = stateRe[ampInd];
1913  stateIm[mateInd] = stateIm[ampInd];
1914  stateRe[ampInd] = mateRe;
1915  stateIm[ampInd] = mateIm;
1916 }
1917 
1918 void statevec_multiControlledMultiQubitNot(Qureg qureg, int ctrlMask, int targMask) {
1919 
1920  int numThreadsPerBlock = 128;
1921  int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock);
1922  statevec_multiControlledMultiQubitNotKernel<<<numBlocks, numThreadsPerBlock>>>(qureg, ctrlMask, targMask);
1923 }
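// Note: the kernel above pairs each basis index with (index XOR targMask) and lets only the
// lower index of each pair perform the swap, so every amplitude is visited exactly once.
// A minimal sketch of that pairing test (the helper name is illustrative):
static __host__ __device__ int exampleIsPairLeader(long long int ampInd, long long int targMask) {
    long long int mateInd = ampInd ^ targMask;
    return mateInd > ampInd;   // e.g. targMask = 0b0110: index 0b0001 leads, its mate is 0b0111
}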
1924 
1925 __device__ __host__ unsigned int log2Int( unsigned int x )
1926 {
1927  unsigned int ans = 0 ;
1928  while( x>>=1 ) ans++;
1929  return ans ;
1930 }
1931 
1932 __device__ void reduceBlock(qreal *arrayIn, qreal *reducedArray, int length){
1933  int i, l, r;
1934  int threadMax, maxDepth;
1935  threadMax = length/2;
1936  maxDepth = log2Int(length/2);
1937 
1938  for (i=0; i<maxDepth+1; i++){
1939  if (threadIdx.x<threadMax){
1940  l = threadIdx.x;
1941  r = l + threadMax;
1942  arrayIn[l] = arrayIn[r] + arrayIn[l];
1943  }
1944  threadMax = threadMax >> 1;
1945  __syncthreads(); // optimise -- use warp shuffle instead
1946  }
1947 
1948  if (threadIdx.x==0) reducedArray[blockIdx.x] = arrayIn[0];
1949 }
1950 
1951 __global__ void copySharedReduceBlock(qreal*arrayIn, qreal *reducedArray, int length){
1952  extern __shared__ qreal tempReductionArray[];
1953  int blockOffset = blockIdx.x*length;
1954  tempReductionArray[threadIdx.x*2] = arrayIn[blockOffset + threadIdx.x*2];
1955  tempReductionArray[threadIdx.x*2+1] = arrayIn[blockOffset + threadIdx.x*2+1];
1956  __syncthreads();
1957  reduceBlock(tempReductionArray, reducedArray, length);
1958 }
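// Note: the reductions below proceed in levels. A first-level kernel (e.g. the
// findProbabilityOfZero kernels) leaves one partial sum per CUDA block in firstLevelReduction;
// each later level runs copySharedReduceBlock to fold those partial sums again, swapping the
// first/second level arrays between passes, until a single value remains. For example, with
// REDUCE_SHARED_SIZE = 512 and 2^20 input values, the levels hold 2048, then 4, then 1 value.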
1959 
 1960 __global__ void densmatr_findProbabilityOfZeroKernel(
 1961  Qureg qureg, int measureQubit, qreal *reducedArray
1962 ) {
1963  // run by each thread
1964  // use of block here refers to contiguous amplitudes where measureQubit = 0,
1965  // (then =1) and NOT the CUDA block, which is the partitioning of CUDA threads
1966 
1967  long long int densityDim = 1LL << qureg.numQubitsRepresented;
1968  long long int numTasks = densityDim >> 1;
1969  long long int sizeHalfBlock = 1LL << (measureQubit);
1970  long long int sizeBlock = 2LL * sizeHalfBlock;
1971 
1972  long long int thisBlock; // which block this thread is processing
1973  long long int thisTask; // which part of the block this thread is processing
1974  long long int basisIndex; // index of this thread's computational basis state
1975  long long int densityIndex; // " " index of |basis><basis| in the flat density matrix
1976 
1977  // array of each thread's collected probability, to be summed
1978  extern __shared__ qreal tempReductionArray[];
1979 
 1980  // figure out which density matrix probability this thread is assigned
1981  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1982  if (thisTask>=numTasks) return;
1983  thisBlock = thisTask / sizeHalfBlock;
1984  basisIndex = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1985  densityIndex = (densityDim + 1) * basisIndex;
1986 
1987  // record the probability in the CUDA-BLOCK-wide array
1988  qreal prob = qureg.deviceStateVec.real[densityIndex]; // im[densityIndex] assumed ~ 0
1989  tempReductionArray[threadIdx.x] = prob;
1990 
1991  // sum the probs collected by this CUDA-BLOCK's threads into a per-CUDA-BLOCK array
1992  __syncthreads();
1993  if (threadIdx.x<blockDim.x/2){
1994  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
1995  }
1996 }
1997 
 1998 __global__ void statevec_findProbabilityOfZeroKernel(
 1999  Qureg qureg, int measureQubit, qreal *reducedArray
2000 ) {
2001  // ----- sizes
2002  long long int sizeBlock, // size of blocks
2003  sizeHalfBlock; // size of blocks halved
2004  // ----- indices
2005  long long int thisBlock, // current block
2006  index; // current index for first half block
2007  // ----- temp variables
 2008  long long int thisTask; // task-based approach to expose loops with small granularity
2009  long long int numTasks=qureg.numAmpsPerChunk>>1;
2010  // (good for shared memory parallelism)
2011 
2012  extern __shared__ qreal tempReductionArray[];
2013 
2014  // ---------------------------------------------------------------- //
2015  // dimensions //
2016  // ---------------------------------------------------------------- //
2017  sizeHalfBlock = 1LL << (measureQubit); // number of state vector elements to sum,
2018  // and then the number to skip
2019  sizeBlock = 2LL * sizeHalfBlock; // size of blocks (pairs of measure and skip entries)
2020 
2021  // ---------------------------------------------------------------- //
2022  // find probability //
2023  // ---------------------------------------------------------------- //
2024 
2025  //
2026  // --- task-based shared-memory parallel implementation
2027  //
2028 
2029  qreal *stateVecReal = qureg.deviceStateVec.real;
2030  qreal *stateVecImag = qureg.deviceStateVec.imag;
2031 
2032  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2033  if (thisTask>=numTasks) return;
2034 
2035  thisBlock = thisTask / sizeHalfBlock;
2036  index = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
2037  qreal realVal, imagVal;
2038  realVal = stateVecReal[index];
2039  imagVal = stateVecImag[index];
2040  tempReductionArray[threadIdx.x] = realVal*realVal + imagVal*imagVal;
2041  __syncthreads();
2042 
2043  if (threadIdx.x<blockDim.x/2){
2044  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2045  }
2046 }
2047 
2048 int getNumReductionLevels(long long int numValuesToReduce, int numReducedPerLevel){
2049  int levels=0;
2050  while (numValuesToReduce){
2051  numValuesToReduce = numValuesToReduce/numReducedPerLevel;
2052  levels++;
2053  }
2054  return levels;
2055 }
2056 
2057 void swapDouble(qreal **a, qreal **b){
2058  qreal *temp;
2059  temp = *a;
2060  *a = *b;
2061  *b = temp;
2062 }
2063 
 2064 qreal densmatr_findProbabilityOfZero(Qureg qureg, int measureQubit)
 2065 {
2066  long long int densityDim = 1LL << qureg.numQubitsRepresented;
2067  long long int numValuesToReduce = densityDim >> 1; // half of the diagonal has measureQubit=0
2068 
2069  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2070  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2071  int firstTime = 1;
2072 
2073  while (numValuesToReduce > 1) {
2074 
2075  // need less than one CUDA-BLOCK to reduce
2076  if (numValuesToReduce < maxReducedPerLevel) {
2077  valuesPerCUDABlock = numValuesToReduce;
2078  numCUDABlocks = 1;
2079  }
2080  // otherwise use only full CUDA-BLOCKS
2081  else {
2082  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2083  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2084  }
2085 
2086  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2087 
2088  // spawn threads to sum the probs in each block
2089  if (firstTime) {
2090  densmatr_findProbabilityOfZeroKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2091  qureg, measureQubit, qureg.firstLevelReduction);
2092  firstTime = 0;
2093 
2094  // sum the block probs
2095  } else {
2096  cudaDeviceSynchronize();
2097  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2098  qureg.firstLevelReduction,
2099  qureg.secondLevelReduction, valuesPerCUDABlock);
2100  cudaDeviceSynchronize();
 2101  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
 2102  }
2103 
2104  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2105  }
2106 
2107  qreal zeroProb;
2108  cudaMemcpy(&zeroProb, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2109  return zeroProb;
2110 }
2111 
 2112 qreal statevec_findProbabilityOfZero(Qureg qureg, int measureQubit)
 2113 {
2114  long long int numValuesToReduce = qureg.numAmpsPerChunk>>1;
2115  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2116  qreal stateProb=0;
2117  int firstTime=1;
2118  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2119 
2120  while(numValuesToReduce>1){
2121  if (numValuesToReduce<maxReducedPerLevel){
2122  // Need less than one CUDA block to reduce values
2123  valuesPerCUDABlock = numValuesToReduce;
2124  numCUDABlocks = 1;
2125  } else {
2126  // Use full CUDA blocks, with block size constrained by shared mem usage
2127  valuesPerCUDABlock = maxReducedPerLevel;
2128  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2129  }
2130  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2131 
2132  if (firstTime){
2133  statevec_findProbabilityOfZeroKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2134  qureg, measureQubit, qureg.firstLevelReduction);
2135  firstTime=0;
2136  } else {
2137  cudaDeviceSynchronize();
2138  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2139  qureg.firstLevelReduction,
2140  qureg.secondLevelReduction, valuesPerCUDABlock);
2141  cudaDeviceSynchronize();
 2142  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
 2143  }
2144  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2145  }
2146  cudaMemcpy(&stateProb, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2147  return stateProb;
2148 }
2149 
2150 qreal statevec_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome)
2151 {
2152  qreal outcomeProb = statevec_findProbabilityOfZero(qureg, measureQubit);
2153  if (outcome==1)
2154  outcomeProb = 1.0 - outcomeProb;
2155  return outcomeProb;
2156 }
2157 
2158 qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome)
2159 {
2160  qreal outcomeProb = densmatr_findProbabilityOfZero(qureg, measureQubit);
2161  if (outcome==1)
2162  outcomeProb = 1.0 - outcomeProb;
2163  return outcomeProb;
2164 }
2165 
2166 // atomicAdd on floats/doubles isn't available on <6 CC devices, so we add it ourselves
2167 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
2168 #else
2169 static __inline__ __device__ double atomicAdd(double* address, double val)
2170 {
2171  unsigned long long int* address_as_ull = (unsigned long long int*) address;
2172  unsigned long long int old = *address_as_ull, assumed;
2173 
2174  do {
2175  assumed = old;
2176  old = atomicCAS(address_as_ull, assumed,
2177  __double_as_longlong(val + __longlong_as_double(assumed)));
2178 
2179  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
2180  } while (assumed != old);
2181 
2182  return __longlong_as_double(old);
2183 }
2184 #endif
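// Note: the fallback above emulates a double-precision atomicAdd with atomicCAS, since hardware
// double atomics only exist from compute capability 6.0. The value is reinterpreted bit-for-bit
// as a 64-bit integer (__double_as_longlong / __longlong_as_double), and the compare-and-swap
// loop simply retries whenever another thread has updated the address in the meantime.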
2185 
 2186 __global__ void statevec_calcProbOfAllOutcomesKernel(
 2187  qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits
2188 ) {
2189  // each thread handles one amplitude (all amplitudes are involved)
2190  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
2191  if (ampInd >= qureg.numAmpsTotal) return;
2192 
2193  qreal prob = (
2194  qureg.deviceStateVec.real[ampInd]*qureg.deviceStateVec.real[ampInd] +
2195  qureg.deviceStateVec.imag[ampInd]*qureg.deviceStateVec.imag[ampInd]);
2196 
2197  // each amplitude contributes to one outcome
2198  long long int outcomeInd = 0;
2199  for (int q=0; q<numQubits; q++)
2200  outcomeInd += extractBit(qubits[q], ampInd) * (1LL << q);
2201 
2202  // each thread atomically writes directly to the global output.
 2203  // this beat block-hierarchical atomic reductions in both global and shared memory!
2204  atomicAdd(&outcomeProbs[outcomeInd], prob);
2205 }
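// Worked example for the outcome-index mapping above: with qubits = {2, 0} and an amplitude
// index whose bit 2 is 1 and bit 0 is 0, the loop yields outcomeInd = 1*(1<<0) + 0*(1<<1) = 1,
// i.e. qubits[0] supplies the least-significant bit of the outcome label.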
2206 
2207 void statevec_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) {
2208 
2209  // copy qubits to GPU memory
2210  int* d_qubits;
2211  size_t mem_qubits = numQubits * sizeof *d_qubits;
2212  cudaMalloc(&d_qubits, mem_qubits);
2213  cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
2214 
2215  // create one thread for every amplitude
2216  int numThreadsPerBlock = 128;
2217  int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock);
2218 
2219  // create global GPU array for outcomeProbs
2220  qreal* d_outcomeProbs;
2221  long long int numOutcomes = (1LL << numQubits);
2222  size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs;
2223  cudaMalloc(&d_outcomeProbs, mem_outcomeProbs);
2224  cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs);
2225 
 2226  // compute the outcome probabilities
2227  statevec_calcProbOfAllOutcomesKernel<<<numBlocks, numThreadsPerBlock>>>(
2228  d_outcomeProbs, qureg, d_qubits, numQubits);
2229 
2230  // copy outcomeProbs from GPU memory
2231  cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost);
2232 
2233  // free GPU memory
2234  cudaFree(d_qubits);
2235  cudaFree(d_outcomeProbs);
2236 }
2237 
 2238 __global__ void densmatr_calcProbOfAllOutcomesKernel(
 2239  qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits
2240 ) {
2241  // each thread handles one diagonal amplitude
2242  long long int diagInd = blockIdx.x*blockDim.x + threadIdx.x;
2243  long long int numDiags = (1LL << qureg.numQubitsRepresented);
2244  if (diagInd >= numDiags) return;
2245 
2246  long long int flatInd = (1 + numDiags)*diagInd;
2247  qreal prob = qureg.deviceStateVec.real[flatInd]; // im[flatInd] assumed ~ 0
2248 
2249  // each diagonal amplitude contributes to one outcome
2250  long long int outcomeInd = 0;
2251  for (int q=0; q<numQubits; q++)
2252  outcomeInd += extractBit(qubits[q], diagInd) * (1LL << q);
2253 
2254  // each thread atomically writes directly to the global output.
 2255  // this beat block-hierarchical atomic reductions in both global and shared memory!
2256  atomicAdd(&outcomeProbs[outcomeInd], prob);
2257 }
2258 
2259 void densmatr_calcProbOfAllOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits) {
2260 
2261  // copy qubits to GPU memory
2262  int* d_qubits;
2263  size_t mem_qubits = numQubits * sizeof *d_qubits;
2264  cudaMalloc(&d_qubits, mem_qubits);
2265  cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
2266 
 2267  // create one thread for every diagonal amplitude
 2268  int numThreadsPerBlock = 128;
 2269  long long int numDiags = (1LL << qureg.numQubitsRepresented);
 2270  int numBlocks = ceil(numDiags / (qreal) numThreadsPerBlock);
2271 
2272  // create global GPU array for outcomeProbs
2273  qreal* d_outcomeProbs;
2274  long long int numOutcomes = (1LL << numQubits);
2275  size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs;
2276  cudaMalloc(&d_outcomeProbs, mem_outcomeProbs);
2277  cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs);
2278 
 2279  // compute the outcome probabilities
2280  densmatr_calcProbOfAllOutcomesKernel<<<numBlocks, numThreadsPerBlock>>>(
2281  d_outcomeProbs, qureg, d_qubits, numQubits);
2282 
2283  // copy outcomeProbs from GPU memory
2284  cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost);
2285 
2286  // free GPU memory
2287  cudaFree(d_qubits);
2288  cudaFree(d_outcomeProbs);
2289 }
2290 
 2292 __global__ void densmatr_calcInnerProductKernel(
 2293  Qureg a, Qureg b, long long int numTermsToSum, qreal* reducedArray
2294 ) {
2295  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2296  if (index >= numTermsToSum) return;
2297 
2298  // Re{ conj(a) b } = Re{ (aRe - i aIm)(bRe + i bIm) } = aRe bRe + aIm bIm
2299  qreal prod = (
2300  a.deviceStateVec.real[index]*b.deviceStateVec.real[index]
2301  + a.deviceStateVec.imag[index]*b.deviceStateVec.imag[index]);
2302 
2303  // array of each thread's collected sum term, to be summed
2304  extern __shared__ qreal tempReductionArray[];
2305  tempReductionArray[threadIdx.x] = prod;
2306  __syncthreads();
2307 
 2308  // the first half of threads perform the block reduction
2309  if (threadIdx.x<blockDim.x/2)
2310  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2311 }
2312 
 2313 qreal densmatr_calcInnerProduct(Qureg a, Qureg b) {
 2314 
 2315  // we're summing one product term for every entry of the density matrices
2316  long long int numValuesToReduce = a.numAmpsTotal;
2317 
2318  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2319  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2320  int firstTime = 1;
2321 
2322  while (numValuesToReduce > 1) {
2323 
2324  // need less than one CUDA-BLOCK to reduce
2325  if (numValuesToReduce < maxReducedPerLevel) {
2326  valuesPerCUDABlock = numValuesToReduce;
2327  numCUDABlocks = 1;
2328  }
2329  // otherwise use only full CUDA-BLOCKS
2330  else {
2331  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2332  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2333  }
2334  // dictates size of reduction array
2335  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2336 
2337  // spawn threads to sum the terms in each block
2338  // arbitrarily store the reduction in the b qureg's array
2339  if (firstTime) {
2340  densmatr_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2341  a, b, a.numAmpsTotal, b.firstLevelReduction);
2342  firstTime = 0;
2343  }
2344  // sum the block terms
2345  else {
2346  cudaDeviceSynchronize();
2347  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
 2348  b.firstLevelReduction,
 2349  b.secondLevelReduction, valuesPerCUDABlock);
2350  cudaDeviceSynchronize();
 2351  swapDouble(&(b.firstLevelReduction), &(b.secondLevelReduction));
 2352  }
2353 
2354  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2355  }
2356 
2357  qreal innerprod;
2358  cudaMemcpy(&innerprod, b.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2359  return innerprod;
2360 }
2361 
 2363 __global__ void statevec_calcInnerProductKernel(
 2364  int getRealComp,
2365  qreal* vecReal1, qreal* vecImag1, qreal* vecReal2, qreal* vecImag2,
2366  long long int numTermsToSum, qreal* reducedArray)
2367 {
2368  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2369  if (index >= numTermsToSum) return;
2370 
2371  // choose whether to calculate the real or imaginary term of the inner product
2372  qreal innerProdTerm;
2373  if (getRealComp)
2374  innerProdTerm = vecReal1[index]*vecReal2[index] + vecImag1[index]*vecImag2[index];
2375  else
2376  innerProdTerm = vecReal1[index]*vecImag2[index] - vecImag1[index]*vecReal2[index];
2377 
2378  // array of each thread's collected sum term, to be summed
2379  extern __shared__ qreal tempReductionArray[];
2380  tempReductionArray[threadIdx.x] = innerProdTerm;
2381  __syncthreads();
2382 
 2383  // the first half of threads perform the block reduction
2384  if (threadIdx.x<blockDim.x/2)
2385  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2386 }
2387 
 2393 Complex statevec_calcInnerProduct(Qureg bra, Qureg ket) {
 2394 
2395  qreal innerProdReal, innerProdImag;
2396 
2397  int getRealComp;
2398  long long int numValuesToReduce;
2399  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2400  int maxReducedPerLevel;
2401  int firstTime;
2402 
2403  // compute real component of inner product
2404  getRealComp = 1;
2405  numValuesToReduce = bra.numAmpsPerChunk;
2406  maxReducedPerLevel = REDUCE_SHARED_SIZE;
2407  firstTime = 1;
2408  while (numValuesToReduce > 1) {
2409  if (numValuesToReduce < maxReducedPerLevel) {
2410  valuesPerCUDABlock = numValuesToReduce;
2411  numCUDABlocks = 1;
2412  }
2413  else {
2414  valuesPerCUDABlock = maxReducedPerLevel;
2415  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2416  }
2417  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2418  if (firstTime) {
2419  statevec_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2420  getRealComp,
2421  bra.deviceStateVec.real, bra.deviceStateVec.imag,
2422  ket.deviceStateVec.real, ket.deviceStateVec.imag,
2423  numValuesToReduce,
2424  bra.firstLevelReduction);
2425  firstTime = 0;
2426  } else {
2427  cudaDeviceSynchronize();
2428  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2429  bra.firstLevelReduction,
2430  bra.secondLevelReduction, valuesPerCUDABlock);
2431  cudaDeviceSynchronize();
 2432  swapDouble(&(bra.firstLevelReduction), &(bra.secondLevelReduction));
 2433  }
2434  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2435  }
2436  cudaMemcpy(&innerProdReal, bra.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2437 
2438  // compute imag component of inner product
2439  getRealComp = 0;
2440  numValuesToReduce = bra.numAmpsPerChunk;
2441  maxReducedPerLevel = REDUCE_SHARED_SIZE;
2442  firstTime = 1;
2443  while (numValuesToReduce > 1) {
2444  if (numValuesToReduce < maxReducedPerLevel) {
2445  valuesPerCUDABlock = numValuesToReduce;
2446  numCUDABlocks = 1;
2447  }
2448  else {
2449  valuesPerCUDABlock = maxReducedPerLevel;
2450  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2451  }
2452  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2453  if (firstTime) {
2454  statevec_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2455  getRealComp,
2456  bra.deviceStateVec.real, bra.deviceStateVec.imag,
2457  ket.deviceStateVec.real, ket.deviceStateVec.imag,
2458  numValuesToReduce,
2459  bra.firstLevelReduction);
2460  firstTime = 0;
2461  } else {
2462  cudaDeviceSynchronize();
2463  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2464  bra.firstLevelReduction,
2465  bra.secondLevelReduction, valuesPerCUDABlock);
2466  cudaDeviceSynchronize();
 2467  swapDouble(&(bra.firstLevelReduction), &(bra.secondLevelReduction));
 2468  }
2469  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2470  }
2471  cudaMemcpy(&innerProdImag, bra.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2472 
2473  // return complex
2474  Complex innerProd;
2475  innerProd.real = innerProdReal;
2476  innerProd.imag = innerProdImag;
2477  return innerProd;
2478 }
2479 
2481 __global__ void densmatr_calcFidelityKernel(Qureg dens, Qureg vec, long long int dim, qreal* reducedArray) {
2482 
2483  // figure out which density matrix row to consider
2484  long long int col;
2485  long long int row = blockIdx.x*blockDim.x + threadIdx.x;
2486  if (row >= dim) return;
2487 
2488  qreal* densReal = dens.deviceStateVec.real;
2489  qreal* densImag = dens.deviceStateVec.imag;
2490  qreal* vecReal = vec.deviceStateVec.real;
2491  qreal* vecImag = vec.deviceStateVec.imag;
2492 
2493  // compute the row-th element of the product dens*vec
2494  qreal prodReal = 0;
2495  qreal prodImag = 0;
2496  for (col=0LL; col < dim; col++) {
2497  qreal densElemReal = densReal[dim*col + row];
2498  qreal densElemImag = densImag[dim*col + row];
2499 
2500  prodReal += densElemReal*vecReal[col] - densElemImag*vecImag[col];
2501  prodImag += densElemReal*vecImag[col] + densElemImag*vecReal[col];
2502  }
2503 
2504  // multiply with row-th elem of (vec^*)
2505  qreal termReal = prodImag*vecImag[row] + prodReal*vecReal[row];
2506 
2507  // imag of every term should be zero, because each is a valid fidelity calc of an eigenstate
2508  //qreal termImag = prodImag*vecReal[row] - prodReal*vecImag[row];
2509 
2510  extern __shared__ qreal tempReductionArray[];
2511  tempReductionArray[threadIdx.x] = termReal;
2512  __syncthreads();
2513 
 2514  // the first half of threads perform the block reduction
2515  if (threadIdx.x<blockDim.x/2)
2516  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2517 }
2518 
 2519 qreal densmatr_calcFidelity(Qureg qureg, Qureg pureState) {
 2520 
 2521  // we're summing one term per row of the density matrix (one per pure-state amplitude)
2522  long long int densityDim = 1LL << qureg.numQubitsRepresented;
2523  long long int numValuesToReduce = densityDim;
2524 
2525  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2526  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2527  int firstTime = 1;
2528 
2529  while (numValuesToReduce > 1) {
2530 
2531  // need less than one CUDA-BLOCK to reduce
2532  if (numValuesToReduce < maxReducedPerLevel) {
2533  valuesPerCUDABlock = numValuesToReduce;
2534  numCUDABlocks = 1;
2535  }
2536  // otherwise use only full CUDA-BLOCKS
2537  else {
2538  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2539  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2540  }
2541  // dictates size of reduction array
2542  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2543 
2544  // spawn threads to sum the probs in each block
2545  // store the reduction in the pureState array
2546  if (firstTime) {
2547  densmatr_calcFidelityKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2548  qureg, pureState, densityDim, pureState.firstLevelReduction);
2549  firstTime = 0;
2550 
2551  // sum the block probs
2552  } else {
2553  cudaDeviceSynchronize();
2554  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2555  pureState.firstLevelReduction,
2556  pureState.secondLevelReduction, valuesPerCUDABlock);
2557  cudaDeviceSynchronize();
2558  swapDouble(&(pureState.firstLevelReduction), &(pureState.secondLevelReduction));
2559  }
2560 
2561  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2562  }
2563 
2564  qreal fidelity;
2565  cudaMemcpy(&fidelity, pureState.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2566  return fidelity;
2567 }
2568 
2570  qreal* aRe, qreal* aIm, qreal* bRe, qreal* bIm,
2571  long long int numAmpsToSum, qreal *reducedArray
2572 ) {
2573  // figure out which density matrix term this thread is assigned
2574  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2575  if (index >= numAmpsToSum) return;
2576 
2577  // compute this thread's sum term
2578  qreal difRe = aRe[index] - bRe[index];
2579  qreal difIm = aIm[index] - bIm[index];
2580  qreal term = difRe*difRe + difIm*difIm;
2581 
2582  // array of each thread's collected term, to be summed
2583  extern __shared__ qreal tempReductionArray[];
2584  tempReductionArray[threadIdx.x] = term;
2585  __syncthreads();
2586 
 2587  // the first half of threads perform the block reduction
2588  if (threadIdx.x<blockDim.x/2)
2589  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2590 }
2591 
 2592 /* computes sqrt(Tr( (a-b) conjTrans(a-b) )) = sqrt( sum of squared abs vals of (a-b) ) */
 2593 qreal densmatr_calcHilbertSchmidtDistance(Qureg a, Qureg b) {
 2594 
2595  // we're summing the square of every term in (a-b)
2596  long long int numValuesToReduce = a.numAmpsPerChunk;
2597 
2598  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2599  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2600  int firstTime = 1;
2601 
2602  while (numValuesToReduce > 1) {
2603 
2604  // need less than one CUDA-BLOCK to reduce
2605  if (numValuesToReduce < maxReducedPerLevel) {
2606  valuesPerCUDABlock = numValuesToReduce;
2607  numCUDABlocks = 1;
2608  }
2609  // otherwise use only full CUDA-BLOCKS
2610  else {
2611  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2612  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2613  }
2614  // dictates size of reduction array
2615  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2616 
2617  // spawn threads to sum the probs in each block (store reduction temp values in a's reduction array)
2618  if (firstTime) {
2619  densmatr_calcHilbertSchmidtDistanceSquaredKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2620  a.deviceStateVec.real, a.deviceStateVec.imag,
2621  b.deviceStateVec.real, b.deviceStateVec.imag,
2622  numValuesToReduce, a.firstLevelReduction);
2623  firstTime = 0;
2624 
2625  // sum the block probs
2626  } else {
2627  cudaDeviceSynchronize();
2628  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
 2629  a.firstLevelReduction,
 2630  a.secondLevelReduction, valuesPerCUDABlock);
2631  cudaDeviceSynchronize();
 2632  swapDouble(&(a.firstLevelReduction), &(a.secondLevelReduction));
 2633  }
2634 
2635  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2636  }
2637 
2638  qreal trace;
2639  cudaMemcpy(&trace, a.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2640 
2641  qreal sqrtTrace = sqrt(trace);
2642  return sqrtTrace;
2643 }
2644 
2645 __global__ void densmatr_calcPurityKernel(qreal* vecReal, qreal* vecImag, long long int numAmpsToSum, qreal *reducedArray) {
2646 
2647  // figure out which density matrix term this thread is assigned
2648  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2649  if (index >= numAmpsToSum) return;
2650 
2651  qreal term = vecReal[index]*vecReal[index] + vecImag[index]*vecImag[index];
2652 
2653  // array of each thread's collected probability, to be summed
2654  extern __shared__ qreal tempReductionArray[];
2655  tempReductionArray[threadIdx.x] = term;
2656  __syncthreads();
2657 
 2658  // the first half of threads perform the block reduction
2659  if (threadIdx.x<blockDim.x/2)
2660  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2661 }
2662 
 2664 qreal densmatr_calcPurity(Qureg qureg) {
 2665 
2666  // we're summing the square of every term in the density matrix
2667  long long int numValuesToReduce = qureg.numAmpsPerChunk;
2668 
2669  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2670  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2671  int firstTime = 1;
2672 
2673  while (numValuesToReduce > 1) {
2674 
2675  // need less than one CUDA-BLOCK to reduce
2676  if (numValuesToReduce < maxReducedPerLevel) {
2677  valuesPerCUDABlock = numValuesToReduce;
2678  numCUDABlocks = 1;
2679  }
2680  // otherwise use only full CUDA-BLOCKS
2681  else {
2682  valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
2683  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2684  }
2685  // dictates size of reduction array
2686  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2687 
2688  // spawn threads to sum the probs in each block
2689  if (firstTime) {
2690  densmatr_calcPurityKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2691  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
2692  numValuesToReduce, qureg.firstLevelReduction);
2693  firstTime = 0;
2694 
2695  // sum the block probs
2696  } else {
2697  cudaDeviceSynchronize();
2698  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2699  qureg.firstLevelReduction,
2700  qureg.secondLevelReduction, valuesPerCUDABlock);
2701  cudaDeviceSynchronize();
 2702  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
 2703  }
2704 
2705  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2706  }
2707 
2708  qreal traceDensSquared;
2709  cudaMemcpy(&traceDensSquared, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2710  return traceDensSquared;
2711 }
2712 
2713 __global__ void statevec_collapseToKnownProbOutcomeKernel(Qureg qureg, int measureQubit, int outcome, qreal totalProbability)
2714 {
2715  // ----- sizes
2716  long long int sizeBlock, // size of blocks
2717  sizeHalfBlock; // size of blocks halved
2718  // ----- indices
2719  long long int thisBlock, // current block
2720  index; // current index for first half block
2721  // ----- measured probability
 2722  qreal renorm; // renormalisation factor, 1/sqrt(outcome probability)
2723  // ----- temp variables
 2724  long long int thisTask; // task-based approach to expose loops with small granularity
2725  // (good for shared memory parallelism)
2726  long long int numTasks=qureg.numAmpsPerChunk>>1;
2727 
2728  // ---------------------------------------------------------------- //
2729  // dimensions //
2730  // ---------------------------------------------------------------- //
2731  sizeHalfBlock = 1LL << (measureQubit); // number of state vector elements to sum,
2732  // and then the number to skip
2733  sizeBlock = 2LL * sizeHalfBlock; // size of blocks (pairs of measure and skip entries)
2734 
2735  // ---------------------------------------------------------------- //
2736  // find probability //
2737  // ---------------------------------------------------------------- //
2738 
2739  //
2740  // --- task-based shared-memory parallel implementation
2741  //
2742  renorm=1/sqrt(totalProbability);
2743  qreal *stateVecReal = qureg.deviceStateVec.real;
2744  qreal *stateVecImag = qureg.deviceStateVec.imag;
2745 
2746  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2747  if (thisTask>=numTasks) return;
2748  thisBlock = thisTask / sizeHalfBlock;
2749  index = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
2750 
2751  if (outcome==0){
2752  stateVecReal[index]=stateVecReal[index]*renorm;
2753  stateVecImag[index]=stateVecImag[index]*renorm;
2754 
2755  stateVecReal[index+sizeHalfBlock]=0;
2756  stateVecImag[index+sizeHalfBlock]=0;
2757  } else if (outcome==1){
2758  stateVecReal[index]=0;
2759  stateVecImag[index]=0;
2760 
2761  stateVecReal[index+sizeHalfBlock]=stateVecReal[index+sizeHalfBlock]*renorm;
2762  stateVecImag[index+sizeHalfBlock]=stateVecImag[index+sizeHalfBlock]*renorm;
2763  }
2764 }
2765 
2766 /*
2767  * outcomeProb must accurately be the probability of that qubit outcome in the state-vector, or
2768  * else the state-vector will lose normalisation
2769  */
2770 void statevec_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)
2771 {
2772  int threadsPerCUDABlock, CUDABlocks;
2773  threadsPerCUDABlock = 128;
2774  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
2775  statevec_collapseToKnownProbOutcomeKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, measureQubit, outcome, outcomeProb);
2776 }
2777 
 2779 __global__ void densmatr_collapseToKnownProbOutcomeKernel(
 2780  qreal outcomeProb, qreal* vecReal, qreal *vecImag, long long int numBasesToVisit,
2781  long long int part1, long long int part2, long long int part3,
2782  long long int rowBit, long long int colBit, long long int desired, long long int undesired)
2783 {
2784  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2785  if (scanInd >= numBasesToVisit) return;
2786 
2787  long long int base = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2788 
2789  // renormalise desired outcome
2790  vecReal[base + desired] /= outcomeProb;
2791  vecImag[base + desired] /= outcomeProb;
2792 
2793  // kill undesired outcome
2794  vecReal[base + undesired] = 0;
2795  vecImag[base + undesired] = 0;
2796 
2797  // kill |..0..><..1..| states
2798  vecReal[base + colBit] = 0;
2799  vecImag[base + colBit] = 0;
2800  vecReal[base + rowBit] = 0;
2801  vecImag[base + rowBit] = 0;
2802 }
2803 
2805 void densmatr_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb) {
2806 
2807  int rowQubit = measureQubit + qureg.numQubitsRepresented;
2808 
 2809  long long int colBit = 1LL << measureQubit;
 2810  long long int rowBit = 1LL << rowQubit;
2811 
2812  long long int numBasesToVisit = qureg.numAmpsPerChunk/4;
2813  long long int part1 = colBit -1;
2814  long long int part2 = (rowBit >> 1) - colBit;
2815  long long int part3 = numBasesToVisit - (rowBit >> 1);
2816 
2817  long long int desired, undesired;
2818  if (outcome == 0) {
2819  desired = 0;
2820  undesired = colBit | rowBit;
2821  } else {
2822  desired = colBit | rowBit;
2823  undesired = 0;
2824  }
2825 
2826  int threadsPerCUDABlock, CUDABlocks;
2827  threadsPerCUDABlock = 128;
2828  CUDABlocks = ceil(numBasesToVisit / (qreal) threadsPerCUDABlock);
2829  densmatr_collapseToKnownProbOutcomeKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2830  outcomeProb, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numBasesToVisit,
2831  part1, part2, part3, rowBit, colBit, desired, undesired);
2832 }
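// Note: the part1/part2/part3 masks used above (and in the kernels below) reproduce the effect
// of insertTwoZeroBits(scanInd, measureQubit, rowQubit): they split the scan index into its low,
// middle and high bit fields, then shift the upper two fields so that a 0 bit appears at both
// the column and the row bit positions. A minimal sketch (the helper name is illustrative):
static __host__ __device__ long long int exampleExpandIndex(
    long long int scanInd, long long int part1, long long int part2, long long int part3
) {
    return (scanInd & part1) + ((scanInd & part2) << 1) + ((scanInd & part3) << 2);
}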
2833 
2834 __global__ void densmatr_mixDensityMatrixKernel(Qureg combineQureg, qreal otherProb, Qureg otherQureg, long long int numAmpsToVisit) {
2835 
2836  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
2837  if (ampInd >= numAmpsToVisit) return;
2838 
2839  combineQureg.deviceStateVec.real[ampInd] *= 1-otherProb;
2840  combineQureg.deviceStateVec.imag[ampInd] *= 1-otherProb;
2841 
2842  combineQureg.deviceStateVec.real[ampInd] += otherProb*otherQureg.deviceStateVec.real[ampInd];
2843  combineQureg.deviceStateVec.imag[ampInd] += otherProb*otherQureg.deviceStateVec.imag[ampInd];
2844 }
2845 
2846 void densmatr_mixDensityMatrix(Qureg combineQureg, qreal otherProb, Qureg otherQureg) {
2847 
2848  long long int numAmpsToVisit = combineQureg.numAmpsPerChunk;
2849 
2850  int threadsPerCUDABlock, CUDABlocks;
2851  threadsPerCUDABlock = 128;
2852  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2853  densmatr_mixDensityMatrixKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2854  combineQureg, otherProb, otherQureg, numAmpsToVisit
2855  );
2856 }
2857 
 2863 __global__ void densmatr_mixDephasingKernel(
 2864  qreal fac, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit,
2865  long long int part1, long long int part2, long long int part3,
2866  long long int colBit, long long int rowBit)
2867 {
2868  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2869  if (scanInd >= numAmpsToVisit) return;
2870 
2871  long long int ampInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2872  vecReal[ampInd + colBit] *= fac;
2873  vecImag[ampInd + colBit] *= fac;
2874  vecReal[ampInd + rowBit] *= fac;
2875  vecImag[ampInd + rowBit] *= fac;
2876 }
2877 
2878 
2879 void densmatr_oneQubitDegradeOffDiagonal(Qureg qureg, int targetQubit, qreal dephFac) {
2880 
2881  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
2882 
2883  int rowQubit = targetQubit + qureg.numQubitsRepresented;
2884  long long int colBit = 1LL << targetQubit;
2885  long long int rowBit = 1LL << rowQubit;
2886 
2887  long long int part1 = colBit - 1;
2888  long long int part2 = (rowBit >> 1) - colBit;
2889  long long int part3 = numAmpsToVisit - (rowBit >> 1);
2890 
2891  int threadsPerCUDABlock, CUDABlocks;
2892  threadsPerCUDABlock = 128;
2893  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2894  densmatr_mixDephasingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2895  dephFac, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2896  part1, part2, part3, colBit, rowBit);
2897 }
2898 
2899 void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase) {
2900 
2901  if (dephase == 0)
2902  return;
2903 
2904  qreal dephFac = 1 - dephase;
2905  densmatr_oneQubitDegradeOffDiagonal(qureg, targetQubit, dephFac);
2906 }
2907 
 2914 __global__ void densmatr_mixTwoQubitDephasingKernel(
 2915  qreal fac, qreal* vecReal, qreal *vecImag, long long int numBackgroundStates, long long int numAmpsToVisit,
2916  long long int part1, long long int part2, long long int part3, long long int part4, long long int part5,
2917  long long int colBit1, long long int rowBit1, long long int colBit2, long long int rowBit2)
2918 {
2919  long long int outerInd = blockIdx.x*blockDim.x + threadIdx.x;
2920  if (outerInd >= numAmpsToVisit) return;
2921 
2922  // sets meta in 1...14 excluding 5, 10, creating bit string DCBA for |..D..C..><..B..A|
2923  int meta = 1 + (outerInd/numBackgroundStates);
2924  if (meta > 4) meta++;
2925  if (meta > 9) meta++;
2926 
2927  long long int shift = rowBit2*((meta>>3)%2) + rowBit1*((meta>>2)%2) + colBit2*((meta>>1)%2) + colBit1*(meta%2);
2928  long long int scanInd = outerInd % numBackgroundStates;
2929  long long int stateInd = (
2930  shift +
2931  (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4));
2932 
2933  vecReal[stateInd] *= fac;
2934  vecImag[stateInd] *= fac;
2935 }
2936 
2937 // @TODO is separating these 12 amplitudes really faster than letting every 16th base modify 12 elems?
2938 void densmatr_mixTwoQubitDephasing(Qureg qureg, int qubit1, int qubit2, qreal dephase) {
2939 
2940  if (dephase == 0)
2941  return;
2942 
2943  // assumes qubit2 > qubit1
2944 
2945  int rowQubit1 = qubit1 + qureg.numQubitsRepresented;
2946  int rowQubit2 = qubit2 + qureg.numQubitsRepresented;
2947 
2948  long long int colBit1 = 1LL << qubit1;
2949  long long int rowBit1 = 1LL << rowQubit1;
2950  long long int colBit2 = 1LL << qubit2;
2951  long long int rowBit2 = 1LL << rowQubit2;
2952 
2953  long long int part1 = colBit1 - 1;
2954  long long int part2 = (colBit2 >> 1) - colBit1;
2955  long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1);
2956  long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2);
2957  long long int part5 = (qureg.numAmpsPerChunk/16) - (rowBit2 >> 3);
2958  qreal dephFac = 1 - dephase;
2959 
2960  // refers to states |a 0 b 0 c><d 0 e 0 f| (target qubits are fixed)
2961  long long int numBackgroundStates = qureg.numAmpsPerChunk/16;
2962 
2963  // 12 of these states experience dephasing
2964  long long int numAmpsToVisit = 12 * numBackgroundStates;
2965 
2966  int threadsPerCUDABlock, CUDABlocks;
2967  threadsPerCUDABlock = 128;
2968  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2969  densmatr_mixTwoQubitDephasingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2970  dephFac, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numBackgroundStates, numAmpsToVisit,
2971  part1, part2, part3, part4, part5, colBit1, rowBit1, colBit2, rowBit2);
2972 }
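// Note: in the two-qubit dephasing kernel above, meta is the 4-bit string
// (rowBit2 rowBit1 colBit2 colBit1) selecting which of the four target bits are set. The values
// 0, 5 (=0b0101), 10 (=0b1010) and 15 (=0b1111) never occur because for them the row bits equal
// the column bits on both targets, i.e. they are the 4 blocks untouched by dephasing; the other
// 12 blocks are damped by dephFac. A minimal sketch of the index-to-meta map used above
// (the helper name is illustrative):
static __host__ __device__ int exampleMetaFromOuterIndex(int k) {   // k in 0..11
    int meta = 1 + k;
    if (meta > 4) meta++;
    if (meta > 9) meta++;
    return meta;   // yields {1,2,3,4,6,7,8,9,11,12,13,14}
}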
2973 
 2975 __global__ void densmatr_mixDepolarisingKernel(
 2976  qreal depolLevel, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit,
2977  long long int part1, long long int part2, long long int part3,
2978  long long int bothBits)
2979 {
2980  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2981  if (scanInd >= numAmpsToVisit) return;
2982 
2983  long long int baseInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2984  long long int targetInd = baseInd + bothBits;
2985 
2986  qreal realAvDepol = depolLevel * 0.5 * (vecReal[baseInd] + vecReal[targetInd]);
2987  qreal imagAvDepol = depolLevel * 0.5 * (vecImag[baseInd] + vecImag[targetInd]);
2988 
2989  vecReal[baseInd] *= 1 - depolLevel;
2990  vecImag[baseInd] *= 1 - depolLevel;
2991  vecReal[targetInd] *= 1 - depolLevel;
2992  vecImag[targetInd] *= 1 - depolLevel;
2993 
2994  vecReal[baseInd] += realAvDepol;
2995  vecImag[baseInd] += imagAvDepol;
2996  vecReal[targetInd] += realAvDepol;
2997  vecImag[targetInd] += imagAvDepol;
2998 }
2999 
 3001 __global__ void densmatr_mixDampingKernel(
 3002  qreal damping, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit,
3003  long long int part1, long long int part2, long long int part3,
3004  long long int bothBits)
3005 {
3006  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
3007  if (scanInd >= numAmpsToVisit) return;
3008 
3009  long long int baseInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
3010  long long int targetInd = baseInd + bothBits;
3011 
3012  qreal realAvDepol = damping * ( vecReal[targetInd]);
3013  qreal imagAvDepol = damping * ( vecImag[targetInd]);
3014 
3015  vecReal[targetInd] *= 1 - damping;
3016  vecImag[targetInd] *= 1 - damping;
3017 
3018  vecReal[baseInd] += realAvDepol;
3019  vecImag[baseInd] += imagAvDepol;
3020 }
3021 
3022 void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depolLevel) {
3023 
3024  if (depolLevel == 0)
3025  return;
3026 
3027  densmatr_mixDephasing(qureg, targetQubit, depolLevel);
3028 
3029  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
3030  int rowQubit = targetQubit + qureg.numQubitsRepresented;
3031 
3032  long long int colBit = 1LL << targetQubit;
3033  long long int rowBit = 1LL << rowQubit;
3034  long long int bothBits = colBit | rowBit;
3035 
3036  long long int part1 = colBit - 1;
3037  long long int part2 = (rowBit >> 1) - colBit;
3038  long long int part3 = numAmpsToVisit - (rowBit >> 1);
3039 
3040  int threadsPerCUDABlock, CUDABlocks;
3041  threadsPerCUDABlock = 128;
3042  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
3043  densmatr_mixDepolarisingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
3044  depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
3045  part1, part2, part3, bothBits);
3046 }
3047 
3048 void densmatr_mixDamping(Qureg qureg, int targetQubit, qreal damping) {
3049 
3050  if (damping == 0)
3051  return;
3052 
3053  qreal dephase = sqrt(1-damping);
3054  densmatr_oneQubitDegradeOffDiagonal(qureg, targetQubit, dephase);
3055 
3056  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
3057  int rowQubit = targetQubit + qureg.numQubitsRepresented;
3058 
3059  long long int colBit = 1LL << targetQubit;
3060  long long int rowBit = 1LL << rowQubit;
3061  long long int bothBits = colBit | rowBit;
3062 
3063  long long int part1 = colBit - 1;
3064  long long int part2 = (rowBit >> 1) - colBit;
3065  long long int part3 = numAmpsToVisit - (rowBit >> 1);
3066 
3067  int threadsPerCUDABlock, CUDABlocks;
3068  threadsPerCUDABlock = 128;
3069  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
3070  densmatr_mixDampingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
3071  damping, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
3072  part1, part2, part3, bothBits);
3073 }
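
Taken together, densmatr_mixDamping implements the single-qubit amplitude-damping channel: a fraction damping of the |1><1| population is moved into |0><0| by the kernel above, while densmatr_oneQubitDegradeOffDiagonal scales the off-diagonal elements by sqrt(1-damping). A minimal host-side sketch (not QuEST code; the 2x2 density matrix below is a made-up example with real entries) reproduces that arithmetic.

// Minimal sketch of the single-qubit amplitude-damping map on the host, matching
// what the GPU kernels do: rho00 += damping*rho11, rho11 *= (1-damping),
// off-diagonals *= sqrt(1-damping).
# include <stdio.h>
# include <math.h>

int main(void) {
    double damping = 0.3;
    // example single-qubit mixed state (real entries for brevity)
    double rho00 = 0.6, rho01 = 0.2, rho11 = 0.4;

    // population transfer and decay, as in densmatr_mixDampingKernel
    double newRho00 = rho00 + damping * rho11;
    double newRho11 = (1 - damping) * rho11;
    // off-diagonal decay, as in densmatr_oneQubitDegradeOffDiagonal
    double newRho01 = sqrt(1 - damping) * rho01;

    printf("rho' = [[%.4f, %.4f], [%.4f, %.4f]]\n", newRho00, newRho01, newRho01, newRho11);
    return 0;
}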
3074 
3075 /** Called once for every 16 amplitudes */
3076 __global__ void densmatr_mixTwoQubitDepolarisingKernel(
3077  qreal depolLevel, qreal* vecReal, qreal *vecImag, long long int numAmpsToVisit,
3078  long long int part1, long long int part2, long long int part3,
3079  long long int part4, long long int part5,
3080  long long int rowCol1, long long int rowCol2)
3081 {
3082  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
3083  if (scanInd >= numAmpsToVisit) return;
3084 
3085  // index of |..0..0..><..0..0|
3086  long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4);
3087  long long int ind01 = ind00 + rowCol1;
3088  long long int ind10 = ind00 + rowCol2;
3089  long long int ind11 = ind00 + rowCol1 + rowCol2;
3090 
3091  qreal realAvDepol = depolLevel * 0.25 * (
3092  vecReal[ind00] + vecReal[ind01] + vecReal[ind10] + vecReal[ind11]);
3093  qreal imagAvDepol = depolLevel * 0.25 * (
3094  vecImag[ind00] + vecImag[ind01] + vecImag[ind10] + vecImag[ind11]);
3095 
3096  qreal retain = 1 - depolLevel;
3097  vecReal[ind00] *= retain; vecImag[ind00] *= retain;
3098  vecReal[ind01] *= retain; vecImag[ind01] *= retain;
3099  vecReal[ind10] *= retain; vecImag[ind10] *= retain;
3100  vecReal[ind11] *= retain; vecImag[ind11] *= retain;
3101 
3102  vecReal[ind00] += realAvDepol; vecImag[ind00] += imagAvDepol;
3103  vecReal[ind01] += realAvDepol; vecImag[ind01] += imagAvDepol;
3104  vecReal[ind10] += realAvDepol; vecImag[ind10] += imagAvDepol;
3105  vecReal[ind11] += realAvDepol; vecImag[ind11] += imagAvDepol;
3106 }
3107 
3108 void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel) {
3109 
3110  if (depolLevel == 0)
3111  return;
3112 
3113  // assumes qubit2 > qubit1
3114 
3115  densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel);
3116 
3117  int rowQubit1 = qubit1 + qureg.numQubitsRepresented;
3118  int rowQubit2 = qubit2 + qureg.numQubitsRepresented;
3119 
3120  long long int colBit1 = 1LL << qubit1;
3121  long long int rowBit1 = 1LL << rowQubit1;
3122  long long int colBit2 = 1LL << qubit2;
3123  long long int rowBit2 = 1LL << rowQubit2;
3124 
3125  long long int rowCol1 = colBit1 | rowBit1;
3126  long long int rowCol2 = colBit2 | rowBit2;
3127 
3128  long long int numAmpsToVisit = qureg.numAmpsPerChunk/16;
3129  long long int part1 = colBit1 - 1;
3130  long long int part2 = (colBit2 >> 1) - colBit1;
3131  long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1);
3132  long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2);
3133  long long int part5 = numAmpsToVisit - (rowBit2 >> 3);
3134 
3135  int threadsPerCUDABlock, CUDABlocks;
3136  threadsPerCUDABlock = 128;
3137  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
3138  densmatr_mixTwoQubitDepolarisingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
3139  depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
3140  part1, part2, part3, part4, part5, rowCol1, rowCol2);
3141 }
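
Per visited group, the two-qubit depolarising kernel pulls each of the four paired diagonal-block amplitudes towards their common average with weight depolLevel, i.e. new = (1 - depolLevel)*old + depolLevel*avg. A tiny arithmetic sketch with made-up numbers (not QuEST code):

// Arithmetic sketch of the per-group update in densmatr_mixTwoQubitDepolarisingKernel.
# include <stdio.h>

int main(void) {
    double depolLevel = 0.4;
    double amps[4] = {0.7, 0.1, 0.1, 0.1};   // the four paired amplitudes
    double avg = 0.25 * (amps[0] + amps[1] + amps[2] + amps[3]);
    for (int k = 0; k < 4; k++)
        printf("amp %d: %.3f -> %.3f\n", k, amps[k], (1 - depolLevel)*amps[k] + depolLevel*avg);
    return 0;
}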
3142 
3143 __global__ void statevec_setWeightedQuregKernel(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) {
3144 
3145  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
3146  long long int numAmpsToVisit = qureg1.numAmpsPerChunk;
3147  if (ampInd >= numAmpsToVisit) return;
3148 
3149  qreal *vecRe1 = qureg1.deviceStateVec.real;
3150  qreal *vecIm1 = qureg1.deviceStateVec.imag;
3151  qreal *vecRe2 = qureg2.deviceStateVec.real;
3152  qreal *vecIm2 = qureg2.deviceStateVec.imag;
3153  qreal *vecReOut = out.deviceStateVec.real;
3154  qreal *vecImOut = out.deviceStateVec.imag;
3155 
3156  qreal facRe1 = fac1.real;
3157  qreal facIm1 = fac1.imag;
3158  qreal facRe2 = fac2.real;
3159  qreal facIm2 = fac2.imag;
3160  qreal facReOut = facOut.real;
3161  qreal facImOut = facOut.imag;
3162 
3163  qreal re1,im1, re2,im2, reOut,imOut;
3164  long long int index = ampInd;
3165 
3166  re1 = vecRe1[index]; im1 = vecIm1[index];
3167  re2 = vecRe2[index]; im2 = vecIm2[index];
3168  reOut = vecReOut[index];
3169  imOut = vecImOut[index];
3170 
3171  vecReOut[index] = (facReOut*reOut - facImOut*imOut) + (facRe1*re1 - facIm1*im1) + (facRe2*re2 - facIm2*im2);
3172  vecImOut[index] = (facReOut*imOut + facImOut*reOut) + (facRe1*im1 + facIm1*re1) + (facRe2*im2 + facIm2*re2);
3173 }
3174 
3175 void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out) {
3176 
3177  long long int numAmpsToVisit = qureg1.numAmpsPerChunk;
3178 
3179  int threadsPerCUDABlock, CUDABlocks;
3180  threadsPerCUDABlock = 128;
3181  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
3182  statevec_setWeightedQuregKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
3183  fac1, qureg1, fac2, qureg2, facOut, out
3184  );
3185 }
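
Each thread of statevec_setWeightedQuregKernel performs one complex-weighted accumulation, out[i] = facOut*out[i] + fac1*qureg1[i] + fac2*qureg2[i]. A single-amplitude host sketch of that update in C99 complex arithmetic (the factors and amplitudes are arbitrary example values, not QuEST code):

// Host-side sketch of the per-amplitude update done by statevec_setWeightedQuregKernel.
# include <stdio.h>
# include <complex.h>

int main(void) {
    double complex fac1 = 0.5, fac2 = 0.5*I, facOut = 1.0;
    double complex a1 = 0.6 + 0.1*I, a2 = 0.3 - 0.2*I, out = 0.1;

    // same expansion as the kernel: (re*c - im*d) + i (re*d + im*c) for each product
    out = facOut*out + fac1*a1 + fac2*a2;
    printf("out = %.4f %+.4fi\n", creal(out), cimag(out));
    return 0;
}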
3186 
3187 __global__ void statevec_applyDiagonalOpKernel(Qureg qureg, DiagonalOp op) {
3188 
3189  // each thread modifies one value; a wasteful and inefficient strategy
3190  long long int numTasks = qureg.numAmpsPerChunk;
3191  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
3192  if (thisTask >= numTasks) return;
3193 
3194  qreal* stateRe = qureg.deviceStateVec.real;
3195  qreal* stateIm = qureg.deviceStateVec.imag;
3196  qreal* opRe = op.deviceOperator.real;
3197  qreal* opIm = op.deviceOperator.imag;
3198 
3199  qreal a = stateRe[thisTask];
3200  qreal b = stateIm[thisTask];
3201  qreal c = opRe[thisTask];
3202  qreal d = opIm[thisTask];
3203 
3204  // (a + b i)(c + d i) = (a c - b d) + i (a d + b c)
3205  stateRe[thisTask] = a*c - b*d;
3206  stateIm[thisTask] = a*d + b*c;
3207 }
3208 
3209 void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op)
3210 {
3211  int threadsPerCUDABlock, CUDABlocks;
3212  threadsPerCUDABlock = 128;
3213  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
3214  statevec_applyDiagonalOpKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, op);
3215 }
3216 
3217 __global__ void densmatr_applyDiagonalOpKernel(Qureg qureg, DiagonalOp op) {
3218 
3219  // each thread modifies one value; a wasteful and inefficient strategy
3220  long long int numTasks = qureg.numAmpsPerChunk;
3221  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
3222  if (thisTask >= numTasks) return;
3223 
3224  qreal* stateRe = qureg.deviceStateVec.real;
3225  qreal* stateIm = qureg.deviceStateVec.imag;
3226  qreal* opRe = op.deviceOperator.real;
3227  qreal* opIm = op.deviceOperator.imag;
3228 
3229  int opDim = (1 << op.numQubits);
3230  qreal a = stateRe[thisTask];
3231  qreal b = stateIm[thisTask];
3232  qreal c = opRe[thisTask % opDim];
3233  qreal d = opIm[thisTask % opDim];
3234 
3235  // (a + b i)(c + d i) = (a c - b d) + i (a d + b c)
3236  stateRe[thisTask] = a*c - b*d;
3237  stateIm[thisTask] = a*d + b*c;
3238 }
3239 
3240 void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op) {
3241 
3242  int threadsPerCUDABlock, CUDABlocks;
3243  threadsPerCUDABlock = 128;
3244  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
3245  densmatr_applyDiagonalOpKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, op);
3246 }
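
The modulo in densmatr_applyDiagonalOpKernel exploits the fact that the diagonal entries repeat with period opDim along the flattened density matrix, so the amplitude at flat index i is scaled by op[i % opDim]; this multiplies the density matrix by the diagonal operator on one side. A host-side sketch of the indexing pattern (not QuEST code; opDim and the entries are made-up):

// Sketch of the modulo trick above for a hypothetical 2-qubit diagonal operator.
# include <stdio.h>

int main(void) {
    const long long int opDim = 4;           // 2^numQubits for a 2-qubit operator
    double op[4] = {1.0, 2.0, 3.0, 4.0};     // example diagonal entries
    for (long long int i = 0; i < opDim*opDim; i++)
        printf("flat index %2lld scaled by op[%lld] = %.1f\n", i, i % opDim, op[i % opDim]);
    return 0;
}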
3247 
3248 /** computes either a real or imag term of |vec_i|^2 op_i */
3249 __global__ void statevec_calcExpecDiagonalOpKernel(
3250  int getRealComp,
3251  qreal* vecReal, qreal* vecImag, qreal* opReal, qreal* opImag,
3252  long long int numTermsToSum, qreal* reducedArray)
3253 {
3254  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
3255  if (index >= numTermsToSum) return;
3256 
3257  qreal vecAbs = vecReal[index]*vecReal[index] + vecImag[index]*vecImag[index];
3258 
3259  // choose whether to calculate the real or imaginary term of the expec term
3260  qreal expecVal;
3261  if (getRealComp)
3262  expecVal = vecAbs * opReal[index];
3263  else
3264  expecVal = vecAbs * opImag[index];
3265 
3266  // array of each thread's collected sum term, to be summed
3267  extern __shared__ qreal tempReductionArray[];
3268  tempReductionArray[threadIdx.x] = expecVal;
3269  __syncthreads();
3270 
3271  // the first half of threads performs the block reduction
3272  if (threadIdx.x<blockDim.x/2)
3273  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
3274 }
3275 
3276 Complex statevec_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) {
3277 
3278  /* @TODO: remove all this reduction boilerplate from QuEST GPU
3279  * (e.g. a func which accepts a pointer to do every-value reduction?)
3280  */
3281 
3282  qreal expecReal, expecImag;
3283 
3284  int getRealComp;
3285  long long int numValuesToReduce;
3286  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
3287  int maxReducedPerLevel;
3288  int firstTime;
3289 
3290  // compute real component of inner product
3291  getRealComp = 1;
3292  numValuesToReduce = qureg.numAmpsPerChunk;
3293  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3294  firstTime = 1;
3295  while (numValuesToReduce > 1) {
3296  if (numValuesToReduce < maxReducedPerLevel) {
3297  valuesPerCUDABlock = numValuesToReduce;
3298  numCUDABlocks = 1;
3299  }
3300  else {
3301  valuesPerCUDABlock = maxReducedPerLevel;
3302  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3303  }
3304  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3305  if (firstTime) {
3306  statevec_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3307  getRealComp,
3308  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3309  op.deviceOperator.real, op.deviceOperator.imag,
3310  numValuesToReduce,
3311  qureg.firstLevelReduction);
3312  firstTime = 0;
3313  } else {
3314  cudaDeviceSynchronize();
3315  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3316  qureg.firstLevelReduction,
3317  qureg.secondLevelReduction, valuesPerCUDABlock);
3318  cudaDeviceSynchronize();
3319  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3320  }
3321  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3322  }
3323  cudaMemcpy(&expecReal, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3324 
3325  // compute imag component of inner product
3326  getRealComp = 0;
3327  numValuesToReduce = qureg.numAmpsPerChunk;
3328  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3329  firstTime = 1;
3330  while (numValuesToReduce > 1) {
3331  if (numValuesToReduce < maxReducedPerLevel) {
3332  valuesPerCUDABlock = numValuesToReduce;
3333  numCUDABlocks = 1;
3334  }
3335  else {
3336  valuesPerCUDABlock = maxReducedPerLevel;
3337  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3338  }
3339  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3340  if (firstTime) {
3341  statevec_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3342  getRealComp,
3343  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3344  op.deviceOperator.real, op.deviceOperator.imag,
3345  numValuesToReduce,
3346  qureg.firstLevelReduction);
3347  firstTime = 0;
3348  } else {
3349  cudaDeviceSynchronize();
3350  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3351  qureg.firstLevelReduction,
3352  qureg.secondLevelReduction, valuesPerCUDABlock);
3353  cudaDeviceSynchronize();
3354  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3355  }
3356  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3357  }
3358  cudaMemcpy(&expecImag, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3359 
3360  // return complex
3361  Complex expecVal;
3362  expecVal.real = expecReal;
3363  expecVal.imag = expecImag;
3364  return expecVal;
3365 }
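
The loop above repeatedly shrinks the number of partial sums by a factor of REDUCE_SHARED_SIZE, ping-ponging between firstLevelReduction and secondLevelReduction via swapDouble, until a single value remains. A host-side sketch of the level arithmetic (not QuEST code; the 20-qubit size is an arbitrary example):

// Sketch of how many reduction passes the loop above performs.
# include <stdio.h>

int main(void) {
    long long int numValuesToReduce = 1LL << 20;   // e.g. a 20-qubit state vector
    int maxReducedPerLevel = 512;                  // REDUCE_SHARED_SIZE
    int level = 0;
    while (numValuesToReduce > 1) {
        int valuesPerBlock = (numValuesToReduce < maxReducedPerLevel)
                ? (int) numValuesToReduce : maxReducedPerLevel;
        long long int numBlocks = (numValuesToReduce + valuesPerBlock - 1) / valuesPerBlock;
        printf("level %d: %lld values reduced by %lld blocks of %d values\n",
               ++level, numValuesToReduce, numBlocks, valuesPerBlock);
        numValuesToReduce = numValuesToReduce / maxReducedPerLevel;
    }
    return 0;
}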
3366 
3367 __global__ void densmatr_calcExpecDiagonalOpKernel(
3368  int getRealComp,
3369  qreal* matReal, qreal* matImag, qreal* opReal, qreal* opImag,
3370  int numQubits, long long int numTermsToSum, qreal* reducedArray)
3371 {
3377  // index will identify one of the 2^Q diagonals to be summed
3378  long long int matInd = blockIdx.x*blockDim.x + threadIdx.x;
3379  if (matInd >= numTermsToSum) return;
3380 
3381  long long int diagSpacing = (1LL << numQubits) + 1LL;
3382  int isDiag = ((matInd % diagSpacing) == 0);
3383 
3384  long long int opInd = matInd / diagSpacing;
3385 
3386  qreal val = 0;
3387  if (isDiag) {
3388 
3389  qreal matRe = matReal[matInd];
3390  qreal matIm = matImag[matInd];
3391  qreal opRe = opReal[opInd];
3392  qreal opIm = opImag[opInd];
3393 
3394  // (matRe + matIm i)(opRe + opIm i) =
3395  // (matRe opRe - matIm opIm) + i (matRe opIm + matIm opRe)
3396  if (getRealComp)
3397  val = matRe * opRe - matIm * opIm;
3398  else
3399  val = matRe * opIm + matIm * opRe;
3400  }
3401 
3402  // array of each thread's collected sum term, to be summed
3403  extern __shared__ qreal tempReductionArray[];
3404  tempReductionArray[threadIdx.x] = val;
3405  __syncthreads();
3406 
3407  // the first half of threads performs the block reduction
3408  if (threadIdx.x<blockDim.x/2)
3409  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
3410 }
3411 
3412 Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op) {
3413 
3414  /* @TODO: remove all this reduction boilerplate from QuEST GPU
3415  * (e.g. a func which accepts a pointer to do every-value reduction?)
3416  */
3417 
3418  qreal expecReal, expecImag;
3419 
3420  int getRealComp;
3421  long long int numValuesToReduce;
3422  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
3423  int maxReducedPerLevel;
3424  int firstTime;
3425 
3426  // compute real component of inner product
3427  getRealComp = 1;
3428  numValuesToReduce = qureg.numAmpsPerChunk;
3429  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3430  firstTime = 1;
3431  while (numValuesToReduce > 1) {
3432  if (numValuesToReduce < maxReducedPerLevel) {
3433  valuesPerCUDABlock = numValuesToReduce;
3434  numCUDABlocks = 1;
3435  }
3436  else {
3437  valuesPerCUDABlock = maxReducedPerLevel;
3438  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3439  }
3440  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3441  if (firstTime) {
3442  densmatr_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3443  getRealComp,
3444  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3445  op.deviceOperator.real, op.deviceOperator.imag,
3446  op.numQubits, numValuesToReduce,
3447  qureg.firstLevelReduction);
3448  firstTime = 0;
3449  } else {
3450  cudaDeviceSynchronize();
3451  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3452  qureg.firstLevelReduction,
3453  qureg.secondLevelReduction, valuesPerCUDABlock);
3454  cudaDeviceSynchronize();
3455  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3456  }
3457  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3458  }
3459  cudaMemcpy(&expecReal, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3460 
3461  // compute imag component of inner product
3462  getRealComp = 0;
3463  numValuesToReduce = qureg.numAmpsPerChunk;
3464  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3465  firstTime = 1;
3466  while (numValuesToReduce > 1) {
3467  if (numValuesToReduce < maxReducedPerLevel) {
3468  valuesPerCUDABlock = numValuesToReduce;
3469  numCUDABlocks = 1;
3470  }
3471  else {
3472  valuesPerCUDABlock = maxReducedPerLevel;
3473  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3474  }
3475  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3476  if (firstTime) {
3477  densmatr_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3478  getRealComp,
3479  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3480  op.deviceOperator.real, op.deviceOperator.imag,
3481  op.numQubits, numValuesToReduce,
3482  qureg.firstLevelReduction);
3483  firstTime = 0;
3484  } else {
3485  cudaDeviceSynchronize();
3486  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3487  qureg.firstLevelReduction,
3488  qureg.secondLevelReduction, valuesPerCUDABlock);
3489  cudaDeviceSynchronize();
3490  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3491  }
3492  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3493  }
3494  cudaMemcpy(&expecImag, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3495 
3496  // return complex
3497  Complex expecVal;
3498  expecVal.real = expecReal;
3499  expecVal.imag = expecImag;
3500  return expecVal;
3501 }
3502 
3503 void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems) {
3504 
3505  // update both RAM and VRAM, for consistency
3506  memcpy(&op.real[startInd], real, numElems * sizeof(qreal));
3507  memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal));
3508 
3509  cudaDeviceSynchronize();
3510  cudaMemcpy(
3511  op.deviceOperator.real + startInd,
3512  real,
3513  numElems * sizeof(*(op.deviceOperator.real)),
3514  cudaMemcpyHostToDevice);
3515  cudaMemcpy(
3516  op.deviceOperator.imag + startInd,
3517  imag,
3518  numElems * sizeof(*(op.deviceOperator.imag)),
3519  cudaMemcpyHostToDevice);
3520 }
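
agnostic_setDiagonalOpElems patches a sub-range of the host copy and then mirrors exactly that range into the persistent device copy, so RAM and VRAM stay consistent. A minimal standalone CUDA sketch of the same pattern (array names and sizes are made up; this is not QuEST code):

// Sketch: patch a sub-range of a host array, then copy only that range to the device.
# include <stdio.h>
# include <string.h>
# include <cuda_runtime.h>

int main(void) {
    const long long int numElems = 8, startInd = 4, numNew = 2;
    double hostArr[8] = {0};
    double *devArr;
    cudaMalloc(&devArr, numElems * sizeof *devArr);
    cudaMemcpy(devArr, hostArr, numElems * sizeof *devArr, cudaMemcpyHostToDevice);

    // new values to write at indices [startInd, startInd+numNew)
    double newVals[2] = {0.5, -0.5};

    // update the host copy, then overwrite only the modified range on the device
    memcpy(&hostArr[startInd], newVals, numNew * sizeof *newVals);
    cudaMemcpy(devArr + startInd, newVals, numNew * sizeof *devArr, cudaMemcpyHostToDevice);

    cudaFree(devArr);
    return 0;
}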
3521 
3522 __global__ void statevec_applyPhaseFuncOverridesKernel(
3523  Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding,
3524  qreal* coeffs, qreal* exponents, int numTerms,
3525  long long int* overrideInds, qreal* overridePhases, int numOverrides,
3526  int conj
3527 ) {
3528  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
3529  if (index>=qureg.numAmpsPerChunk) return;
3530 
3531  // determine global amplitude index (non-distributed, so it's just local index)
3532  long long int globalAmpInd = index;
3533 
3534  // determine phase index of {qubits}
3535  long long int phaseInd = 0LL;
3536  if (encoding == UNSIGNED) {
3537  for (int q=0; q<numQubits; q++)
3538  phaseInd += (1LL << q) * extractBit(qubits[q], globalAmpInd);
3539  }
3540  else if (encoding == TWOS_COMPLEMENT) {
3541  for (int q=0; q<numQubits-1; q++) // use final qubit to indicate sign
3542  phaseInd += (1LL << q) * extractBit(qubits[q], globalAmpInd);
3543  if (extractBit(qubits[numQubits-1], globalAmpInd) == 1)
3544  phaseInd -= (1LL << (numQubits-1));
3545  }
3546 
3547  // determine if this phase index has an overridden value (i < numOverrides)
3548  int i;
3549  for (i=0; i<numOverrides; i++)
3550  if (phaseInd == overrideInds[i])
3551  break;
3552 
3553  // determine phase from {coeffs}, {exponents} (unless overridden)
3554  qreal phase = 0;
3555  if (i < numOverrides)
3556  phase = overridePhases[i];
3557  else
3558  for (int t=0; t<numTerms; t++)
3559  phase += coeffs[t] * pow(phaseInd, exponents[t]);
3560 
3561  // negate phase to conjugate operator
3562  if (conj)
3563  phase *= -1;
3564 
3565  // modify amp to amp * exp(i phase)
3566  qreal c = cos(phase);
3567  qreal s = sin(phase);
3568  qreal re = qureg.deviceStateVec.real[index];
3569  qreal im = qureg.deviceStateVec.imag[index];
3570 
3571  // = {re[amp] cos(phase) - im[amp] sin(phase)} + i {re[amp] sin(phase) + im[amp] cos(phase)}
3572  qureg.deviceStateVec.real[index] = re*c - im*s;
3573  qureg.deviceStateVec.imag[index] = re*s + im*c;
3574 }
3575 
3576 void statevec_applyPhaseFuncOverrides(
3577  Qureg qureg, int* qubits, int numQubits, enum bitEncoding encoding,
3578  qreal* coeffs, qreal* exponents, int numTerms,
3579  long long int* overrideInds, qreal* overridePhases, int numOverrides,
3580  int conj
3581  ) {
3582  // allocate device space for global list of {qubits}, {coeffs}, {exponents}, {overrideInds} and {overridePhases}
3583  int* d_qubits; size_t mem_qubits = numQubits * sizeof *d_qubits;
3584  qreal* d_coeffs; size_t mem_terms = numTerms * sizeof *d_coeffs;
3585  qreal* d_exponents;
3586  long long int* d_overrideInds; size_t mem_inds = numOverrides * sizeof *d_overrideInds;
3587  qreal* d_overridePhases; size_t mem_phas = numOverrides * sizeof *d_overridePhases;
3588  cudaMalloc(&d_qubits, mem_qubits); cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
3589  cudaMalloc(&d_coeffs, mem_terms); cudaMemcpy(d_coeffs, coeffs, mem_terms, cudaMemcpyHostToDevice);
3590  cudaMalloc(&d_exponents, mem_terms); cudaMemcpy(d_exponents, exponents, mem_terms, cudaMemcpyHostToDevice);
3591  cudaMalloc(&d_overrideInds, mem_inds); cudaMemcpy(d_overrideInds, overrideInds, mem_inds, cudaMemcpyHostToDevice);
3592  cudaMalloc(&d_overridePhases,mem_phas); cudaMemcpy(d_overridePhases, overridePhases, mem_phas, cudaMemcpyHostToDevice);
3593 
3594  // call kernel
3595  int threadsPerCUDABlock = 128;
3596  int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock);
3597  statevec_applyPhaseFuncOverridesKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
3598  qureg, d_qubits, numQubits, encoding,
3599  d_coeffs, d_exponents, numTerms,
3600  d_overrideInds, d_overridePhases, numOverrides,
3601  conj);
3602 
3603  // cleanup device memory
3604  cudaFree(d_qubits);
3605  cudaFree(d_coeffs);
3606  cudaFree(d_exponents);
3607  cudaFree(d_overrideInds);
3608  cudaFree(d_overridePhases);
3609 }
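
statevec_applyPhaseFuncOverridesKernel reassembles the bits of the basis-state index found at positions {qubits} into an integer phase index, either as an unsigned value or, for TWOS_COMPLEMENT, treating the register's final qubit as the sign bit. A host-side sketch of that decoding (the qubit list and amplitude index are arbitrary examples, not QuEST code):

// Sketch of the UNSIGNED and TWOS_COMPLEMENT phase-index decodings used above.
# include <stdio.h>

static int getBit(int pos, long long int num) {
    return (int) ((num >> pos) & 1);
}

int main(void) {
    int qubits[3] = {0, 2, 3};        // hypothetical 3-qubit register
    int numQubits = 3;
    long long int ampInd = 12;        // binary 1100: qubits 2 and 3 are 1, qubit 0 is 0

    // UNSIGNED encoding: plain binary value of the register
    long long int unsignedInd = 0;
    for (int q = 0; q < numQubits; q++)
        unsignedInd += (1LL << q) * getBit(qubits[q], ampInd);

    // TWOS_COMPLEMENT encoding: final qubit indicates the sign
    long long int signedInd = 0;
    for (int q = 0; q < numQubits - 1; q++)
        signedInd += (1LL << q) * getBit(qubits[q], ampInd);
    if (getBit(qubits[numQubits-1], ampInd) == 1)
        signedInd -= (1LL << (numQubits-1));

    printf("UNSIGNED: %lld, TWOS_COMPLEMENT: %lld\n", unsignedInd, signedInd);
    return 0;
}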
3610 
3611 __global__ void statevec_applyMultiVarPhaseFuncOverridesKernel(
3612  Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding,
3613  qreal* coeffs, qreal* exponents, int* numTermsPerReg,
3614  long long int* overrideInds, qreal* overridePhases, int numOverrides,
3615  long long int *phaseInds,
3616  int conj
3617 ) {
3618  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
3619  if (index>=qureg.numAmpsPerChunk) return;
3620 
3621  // determine global amplitude index (non-distributed, so it's just local index)
3622  long long int globalAmpInd = index;
3623 
3624  /*
3625  * each thread needs to write to a local:
3626  * long long int phaseInds[numRegs];
3627  * but instead has access to shared array phaseInds, with below stride and offset
3628  */
3629  size_t stride = gridDim.x*blockDim.x;
3630  size_t offset = blockIdx.x*blockDim.x + threadIdx.x;
3631 
3632  // determine phase indices
3633  int flatInd = 0;
3634  if (encoding == UNSIGNED) {
3635  for (int r=0; r<numRegs; r++) {
3636  phaseInds[r*stride+offset] = 0LL;
3637  for (int q=0; q<numQubitsPerReg[r]; q++)
3638  phaseInds[r*stride+offset] += (1LL << q) * extractBit(qubits[flatInd++], globalAmpInd);
3639  }
3640  }
3641  else if (encoding == TWOS_COMPLEMENT) {
3642  for (int r=0; r<numRegs; r++) {
3643  phaseInds[r*stride+offset] = 0LL;
3644  for (int q=0; q<numQubitsPerReg[r]-1; q++)
3645  phaseInds[r*stride+offset] += (1LL << q) * extractBit(qubits[flatInd++], globalAmpInd);
3646  // use final qubit to indicate sign
3647  if (extractBit(qubits[flatInd++], globalAmpInd) == 1)
3648  phaseInds[r*stride+offset] -= (1LL << (numQubitsPerReg[r]-1));
3649  }
3650  }
3651 
3652  // determine if this phase index has an overridden value (i < numOverrides)
3653  int i;
3654  for (i=0; i<numOverrides; i++) {
3655  int found = 1;
3656  for (int r=0; r<numRegs; r++) {
3657  if (phaseInds[r*stride+offset] != overrideInds[i*numRegs+r]) {
3658  found = 0;
3659  break;
3660  }
3661  }
3662  if (found)
3663  break;
3664  }
3665 
3666  // compute the phase (unless overridden)
3667  qreal phase = 0;
3668  if (i < numOverrides)
3669  phase = overridePhases[i];
3670  else {
3671  flatInd = 0;
3672  for (int r=0; r<numRegs; r++) {
3673  for (int t=0; t<numTermsPerReg[r]; t++) {
3674  phase += coeffs[flatInd] * pow(phaseInds[r*stride+offset], exponents[flatInd]);
3675  flatInd++;
3676  }
3677  }
3678  }
3679 
3680  // negate phase to conjugate operator
3681  if (conj)
3682  phase *= -1;
3683 
3684  // modify amp to amp * exp(i phase)
3685  qreal c = cos(phase);
3686  qreal s = sin(phase);
3687  qreal re = qureg.deviceStateVec.real[index];
3688  qreal im = qureg.deviceStateVec.imag[index];
3689 
3690  // = {re[amp] cos(phase) - im[amp] sin(phase)} + i {re[amp] sin(phase) + im[amp] cos(phase)}
3691  qureg.deviceStateVec.real[index] = re*c - im*s;
3692  qureg.deviceStateVec.imag[index] = re*s + im*c;
3693 }
3694 
3695 void statevec_applyMultiVarPhaseFuncOverrides(
3696  Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding,
3697  qreal* coeffs, qreal* exponents, int* numTermsPerReg,
3698  long long int* overrideInds, qreal* overridePhases, int numOverrides,
3699  int conj
3700 ) {
3701  // determine size of arrays, for cloning into GPU memory
3702  size_t mem_numQubitsPerReg = numRegs * sizeof *numQubitsPerReg;
3703  size_t mem_numTermsPerReg = numRegs * sizeof *numTermsPerReg;
3704  size_t mem_overridePhases = numOverrides * sizeof *overridePhases;
3705  size_t mem_overrideInds = numOverrides * numRegs * sizeof *overrideInds;
3706  size_t mem_qubits = 0;
3707  size_t mem_coeffs = 0;
3708  size_t mem_exponents = 0;
3709  for (int r=0; r<numRegs; r++) {
3710  mem_qubits += numQubitsPerReg[r] * sizeof *qubits;
3711  mem_coeffs += numTermsPerReg[r] * sizeof *coeffs;
3712  mem_exponents += numTermsPerReg[r] * sizeof *exponents;
3713  }
3714 
3715  // allocate global GPU memory
3716  int* d_qubits; cudaMalloc(&d_qubits, mem_qubits);
3717  qreal* d_coeffs; cudaMalloc(&d_coeffs, mem_coeffs);
3718  qreal* d_exponents; cudaMalloc(&d_exponents, mem_exponents);
3719  int* d_numQubitsPerReg; cudaMalloc(&d_numQubitsPerReg, mem_numQubitsPerReg);
3720  int* d_numTermsPerReg; cudaMalloc(&d_numTermsPerReg, mem_numTermsPerReg);
3721  long long int* d_overrideInds; cudaMalloc(&d_overrideInds, mem_overrideInds);
3722  qreal* d_overridePhases; cudaMalloc(&d_overridePhases, mem_overridePhases);
3723 
3724  // copy function args into GPU memory
3725  cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
3726  cudaMemcpy(d_coeffs, coeffs, mem_coeffs, cudaMemcpyHostToDevice);
3727  cudaMemcpy(d_exponents, exponents, mem_exponents, cudaMemcpyHostToDevice);
3728  cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice);
3729  cudaMemcpy(d_numTermsPerReg, numTermsPerReg, mem_numTermsPerReg, cudaMemcpyHostToDevice);
3730  cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice);
3731  cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice);
3732 
3733  int threadsPerCUDABlock = 128;
3734  int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock);
3735 
3736  // allocate thread-local working space {phaseInds}
3737  long long int *d_phaseInds;
3738  size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks;
3739  cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds);
3740 
3741  // call kernel
3742  statevec_applyMultiVarPhaseFuncOverridesKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
3743  qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding,
3744  d_coeffs, d_exponents, d_numTermsPerReg,
3745  d_overrideInds, d_overridePhases, numOverrides,
3746  d_phaseInds,
3747  conj);
3748 
3749  // free device memory
3750  cudaFree(d_qubits);
3751  cudaFree(d_coeffs);
3752  cudaFree(d_exponents);
3753  cudaFree(d_numQubitsPerReg);
3754  cudaFree(d_numTermsPerReg);
3755  cudaFree(d_overrideInds);
3756  cudaFree(d_overridePhases);
3757  cudaFree(d_phaseInds);
3758 }
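
Since a CUDA thread cannot declare a runtime-sized local array, the wrapper above allocates a single scratch buffer d_phaseInds holding numRegs entries per thread of the whole grid; thread t stores its index for register r at phaseInds[r*stride + t], with stride equal to the total thread count. A tiny host sketch of that register-major layout (thread and register counts are made-up examples, not QuEST code):

// Sketch of the strided phaseInds layout used by the multi-register kernels above.
# include <stdio.h>

int main(void) {
    int numThreads = 4;   // stands in for gridDim.x * blockDim.x
    int numRegs = 3;
    for (int t = 0; t < numThreads; t++)
        for (int r = 0; r < numRegs; r++)
            printf("thread %d, register %d -> phaseInds[%d]\n", t, r, r*numThreads + t);
    return 0;
}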
3759 
3760 __global__ void statevec_applyParamNamedPhaseFuncOverridesKernel(
3761  Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding,
3762  enum phaseFunc phaseFuncName, qreal* params, int numParams,
3763  long long int* overrideInds, qreal* overridePhases, int numOverrides,
3764  long long int* phaseInds,
3765  int conj
3766 ) {
3767  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
3768  if (index>=qureg.numAmpsPerChunk) return;
3769 
3770  // determine global amplitude index (non-distributed, so it's just local index)
3771  long long int globalAmpInd = index;
3772 
3773  /*
3774  * each thread needs to write to a local:
3775  * long long int phaseInds[numRegs];
3776  * but instead has access to shared array phaseInds, with below stride and offset
3777  */
3778  size_t stride = gridDim.x*blockDim.x;
3779  size_t offset = blockIdx.x*blockDim.x + threadIdx.x;
3780 
3781  // determine phase indices
3782  if (encoding == UNSIGNED) {
3783  int flatInd = 0;
3784  for (int r=0; r<numRegs; r++) {
3785  phaseInds[r*stride+offset] = 0LL;
3786  for (int q=0; q<numQubitsPerReg[r]; q++)
3787  phaseInds[r*stride+offset] += (1LL << q) * extractBit(qubits[flatInd++], globalAmpInd);
3788  }
3789  }
3790  else if (encoding == TWOS_COMPLEMENT) {
3791  int flatInd = 0;
3792  for (int r=0; r<numRegs; r++) {
3793  phaseInds[r*stride+offset] = 0LL;
3794  for (int q=0; q<numQubitsPerReg[r]-1; q++)
3795  phaseInds[r*stride+offset] += (1LL << q) * extractBit(qubits[flatInd++], globalAmpInd);
3796  // use final qubit to indicate sign
3797  if (extractBit(qubits[flatInd++], globalAmpInd) == 1)
3798  phaseInds[r*stride+offset] -= (1LL << (numQubitsPerReg[r]-1));
3799  }
3800  }
3801 
3802  // determine if this phase index has an overridden value (i < numOverrides)
3803  int i;
3804  for (i=0; i<numOverrides; i++) {
3805  int found = 1;
3806  for (int r=0; r<numRegs; r++) {
3807  if (phaseInds[r*stride+offset] != overrideInds[i*numRegs+r]) {
3808  found = 0;
3809  break;
3810  }
3811  }
3812  if (found)
3813  break;
3814  }
3815 
3816  // compute the phase (unless overridden)
3817  qreal phase = 0;
3818  if (i < numOverrides)
3819  phase = overridePhases[i];
3820  else {
3821  // compute norm related phases
3822  if (phaseFuncName == NORM || phaseFuncName == INVERSE_NORM ||
3823  phaseFuncName == SCALED_NORM || phaseFuncName == SCALED_INVERSE_NORM ||
3824  phaseFuncName == SCALED_INVERSE_SHIFTED_NORM) {
3825  qreal norm = 0;
3826  if (phaseFuncName == SCALED_INVERSE_SHIFTED_NORM) {
3827  for (int r=0; r<numRegs; r++) {
3828  qreal dif = phaseInds[r*stride+offset] - params[2+r];
3829  norm += dif*dif;
3830  }
3831  }
3832  else
3833  for (int r=0; r<numRegs; r++)
3834  norm += phaseInds[r*stride+offset]*phaseInds[r*stride+offset];
3835  norm = sqrt(norm);
3836 
3837  if (phaseFuncName == NORM)
3838  phase = norm;
3839  else if (phaseFuncName == INVERSE_NORM)
3840  phase = (norm == 0.)? params[0] : 1/norm; // smallest non-zero norm is 1
3841  else if (phaseFuncName == SCALED_NORM)
3842  phase = params[0] * norm;
3843  else if (phaseFuncName == SCALED_INVERSE_NORM || phaseFuncName == SCALED_INVERSE_SHIFTED_NORM)
3844  phase = (norm <= REAL_EPS)? params[1] : params[0] / norm; // unless shifted closer to zero
3845  }
3846  // compute product related phases
3847  else if (phaseFuncName == PRODUCT || phaseFuncName == INVERSE_PRODUCT ||
3848  phaseFuncName == SCALED_PRODUCT || phaseFuncName == SCALED_INVERSE_PRODUCT) {
3849 
3850  qreal prod = 1;
3851  for (int r=0; r<numRegs; r++)
3852  prod *= phaseInds[r*stride+offset];
3853 
3854  if (phaseFuncName == PRODUCT)
3855  phase = prod;
3856  else if (phaseFuncName == INVERSE_PRODUCT)
3857  phase = (prod == 0.)? params[0] : 1/prod; // smallest non-zero prod is +- 1
3858  else if (phaseFuncName == SCALED_PRODUCT)
3859  phase = params[0] * prod;
3860  else if (phaseFuncName == SCALED_INVERSE_PRODUCT)
3861  phase = (prod == 0.)? params[1] : params[0] / prod;
3862  }
3863  // compute Euclidean distance related phases
3864  else if (phaseFuncName == DISTANCE || phaseFuncName == INVERSE_DISTANCE ||
3865  phaseFuncName == SCALED_DISTANCE || phaseFuncName == SCALED_INVERSE_DISTANCE ||
3866  phaseFuncName == SCALED_INVERSE_SHIFTED_DISTANCE) {
3867 
3868  qreal dist = 0;
3869  if (phaseFuncName == SCALED_INVERSE_SHIFTED_DISTANCE) {
3870  for (int r=0; r<numRegs; r+=2) {
3871  qreal dif = (phaseInds[r*stride+offset] - phaseInds[(r+1)*stride+offset] - params[2+r/2]);
3872  dist += dif*dif;
3873  }
3874  }
3875  else
3876  for (int r=0; r<numRegs; r+=2) {
3877  qreal dif = (phaseInds[(r+1)*stride+offset] - phaseInds[r*stride+offset]);
3878  dist += dif*dif;
3879  }
3880  dist = sqrt(dist);
3881 
3882  if (phaseFuncName == DISTANCE)
3883  phase = dist;
3884  else if (phaseFuncName == INVERSE_DISTANCE)
3885  phase = (dist == 0.)? params[0] : 1/dist; // smallest non-zero dist is 1
3886  else if (phaseFuncName == SCALED_DISTANCE)
3887  phase = params[0] * dist;
3888  else if (phaseFuncName == SCALED_INVERSE_DISTANCE || phaseFuncName == SCALED_INVERSE_SHIFTED_DISTANCE)
3889  phase = (dist <= REAL_EPS)? params[1] : params[0] / dist; // unless shifted closer
3890  }
3891  }
3892 
3893 
3894  // negate phase to conjugate operator
3895  if (conj)
3896  phase *= -1;
3897 
3898  // modify amp to amp * exp(i phase)
3899  qreal c = cos(phase);
3900  qreal s = sin(phase);
3901  qreal re = qureg.deviceStateVec.real[index];
3902  qreal im = qureg.deviceStateVec.imag[index];
3903 
3904  // = {re[amp] cos(phase) - im[amp] sin(phase)} + i {re[amp] sin(phase) + im[amp] cos(phase)}
3905  qureg.deviceStateVec.real[index] = re*c - im*s;
3906  qureg.deviceStateVec.imag[index] = re*s + im*c;
3907 }
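
As a concrete illustration of the parameter conventions used by the kernel above (a sketch only, with made-up register values, not QuEST code): for SCALED_INVERSE_SHIFTED_NORM, params[0] is the scale, params[1] the value substituted when the shifted norm is (near) zero, and params[2..] the per-register shifts; 1e-13 below merely stands in for REAL_EPS.

// Host evaluation of the SCALED_INVERSE_SHIFTED_NORM phase for two registers.
# include <stdio.h>
# include <math.h>

int main(void) {
    double params[4] = {2.0, 100.0, 1.0, -1.0};  // {scale, divergence value, shift_0, shift_1}
    long long int regVals[2] = {4, 2};           // decoded phase indices of the two registers

    double norm = 0;
    for (int r = 0; r < 2; r++) {
        double dif = regVals[r] - params[2+r];
        norm += dif*dif;
    }
    norm = sqrt(norm);

    double phase = (norm <= 1e-13)? params[1] : params[0] / norm;
    printf("phase = %.6f\n", phase);
    return 0;
}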
3908 
3909 void statevec_applyParamNamedPhaseFuncOverrides(
3910  Qureg qureg, int* qubits, int* numQubitsPerReg, int numRegs, enum bitEncoding encoding,
3911  enum phaseFunc phaseFuncName, qreal* params, int numParams,
3912  long long int* overrideInds, qreal* overridePhases, int numOverrides,
3913  int conj
3914 ) {
3915  // determine size of arrays, for cloning into GPU memory
3916  size_t mem_numQubitsPerReg = numRegs * sizeof *numQubitsPerReg;
3917  size_t mem_overridePhases = numOverrides * sizeof *overridePhases;
3918  size_t mem_overrideInds = numOverrides * numRegs * sizeof *overrideInds;
3919  size_t mem_params = numParams * sizeof *params;
3920  size_t mem_qubits = 0;
3921  for (int r=0; r<numRegs; r++)
3922  mem_qubits += numQubitsPerReg[r] * sizeof *qubits;
3923 
3924  // allocate global GPU memory
3925  int* d_qubits; cudaMalloc(&d_qubits, mem_qubits);
3926  int* d_numQubitsPerReg; cudaMalloc(&d_numQubitsPerReg, mem_numQubitsPerReg);
3927  long long int* d_overrideInds; cudaMalloc(&d_overrideInds, mem_overrideInds);
3928  qreal* d_overridePhases; cudaMalloc(&d_overridePhases, mem_overridePhases);
3929  qreal* d_params = NULL; if (numParams > 0) cudaMalloc(&d_params, mem_params);
3930 
3931  // copy function args into GPU memory
3932  cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
3933  cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice);
3934  cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice);
3935  cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice);
3936  if (numParams > 0)
3937  cudaMemcpy(d_params, params, mem_params, cudaMemcpyHostToDevice);
3938 
3939  int threadsPerCUDABlock = 128;
3940  int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock);
3941 
3942  // allocate thread-local working space {phaseInds}
3943  long long int *d_phaseInds;
3944  size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks;
3945  cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds);
3946 
3947  // call kernel
3948  statevec_applyParamNamedPhaseFuncOverridesKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
3949  qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding,
3950  phaseFuncName, d_params, numParams,
3951  d_overrideInds, d_overridePhases, numOverrides,
3952  d_phaseInds,
3953  conj);
3954 
3955  // free device memory
3956  cudaFree(d_qubits);
3957  cudaFree(d_numQubitsPerReg);
3958  cudaFree(d_overrideInds);
3959  cudaFree(d_overridePhases);
3960  cudaFree(d_phaseInds);
3961  if (numParams > 0)
3962  cudaFree(d_params);
3963 }
3964 
3965 void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds) {
3966 
3967  // free existing seed array, if exists
3968  if (env->seeds != NULL)
3969  free(env->seeds);
3970 
3971  // record keys in permanent heap
3972  env->seeds = (unsigned long int*) malloc(numSeeds * sizeof *(env->seeds));
3973  for (int i=0; i<numSeeds; i++)
3974  (env->seeds)[i] = seedArray[i];
3975  env->numSeeds = numSeeds;
3976 
3977  // pass keys to Mersenne Twister seeder
3978  init_by_array(seedArray, numSeeds);
3979 }
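
A minimal usage sketch (not part of this file) showing custom seeds replacing the default keys; createQuESTEnv and destroyQuESTEnv are the environment routines defined earlier in this translation unit, and QuEST.h is assumed to declare seedQuEST:

// Usage sketch: seed the simulator with two custom keys.
# include "QuEST.h"

int main(void) {
    QuESTEnv env = createQuESTEnv();

    unsigned long int seeds[] = {12345UL, 67890UL};
    seedQuEST(&env, seeds, 2);

    // ... create quregs, apply decoherence channels, measure ...

    destroyQuESTEnv(env);
    return 0;
}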
3980 
3981 
3982 
3983 #ifdef __cplusplus
3984 }
3985 #endif
void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil)
Definition: QuEST_gpu.cu:418
@ INVERSE_PRODUCT
Definition: QuEST.h:233
void densmatr_mixDamping(Qureg qureg, int targetQubit, qreal damping)
Definition: QuEST_gpu.cu:3048
__global__ void statevec_multiControlledUnitaryKernel(Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ArgMatrix2 u)
Definition: QuEST_gpu.cu:1245
__global__ void statevec_initPlusStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
Definition: QuEST_gpu.cu:631
void destroyQuESTEnv(QuESTEnv env)
Destroy the QuEST environment.
Definition: QuEST_gpu.cu:489
pauliOpType
Codes for specifying Pauli operators.
Definition: QuEST.h:96
void init_by_array(unsigned long init_key[], int key_length)
Definition: mt19937ar.c:80
__device__ __host__ unsigned int log2Int(unsigned int x)
Definition: QuEST_gpu.cu:1925
__global__ void agnostic_initDiagonalOpFromPauliHamilKernel(DiagonalOp op, enum pauliOpType *pauliCodes, qreal *termCoeffs, int numSumTerms)
Definition: QuEST_gpu.cu:389
__global__ void statevec_setWeightedQuregKernel(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
Definition: QuEST_gpu.cu:3143
__global__ void densmatr_collapseToKnownProbOutcomeKernel(qreal outcomeProb, qreal *vecReal, qreal *vecImag, long long int numBasesToVisit, long long int part1, long long int part2, long long int part3, long long int rowBit, long long int colBit, long long int desired, long long int undesired)
Maps thread ID to a |..0..><..0..| state and then locates |0><1|, |1><0| and |1><1|.
Definition: QuEST_gpu.cu:2779
void statevec_swapQubitAmps(Qureg qureg, int qb1, int qb2)
Definition: QuEST_gpu.cu:1769
void copyStateFromGPU(Qureg qureg)
In GPU mode, this copies the state-vector (or density matrix) from GPU memory (qureg....
Definition: QuEST_gpu.cu:529
__global__ void statevec_controlledUnitaryKernel(Qureg qureg, int controlQubit, int targetQubit, ArgMatrix2 u)
Definition: QuEST_gpu.cu:1179
void statevec_multiRotateZ(Qureg qureg, long long int mask, qreal angle)
Definition: QuEST_gpu.cu:1588
qreal real[4][4]
Definition: QuEST.h:177
void syncQuESTEnv(QuESTEnv env)
Guarantees that all code up to the given point has been executed on all nodes (if running in distribu...
Definition: QuEST_gpu.cu:481
__global__ void densmatr_initPureStateKernel(long long int numPureAmps, qreal *targetVecReal, qreal *targetVecImag, qreal *copyVecReal, qreal *copyVecImag)
Definition: QuEST_gpu.cu:186
@ PAULI_Z
Definition: QuEST.h:96
void statevec_setAmps(Qureg qureg, long long int startInd, qreal *reals, qreal *imags, long long int numAmps)
Definition: QuEST_gpu.cu:153
__global__ void densmatr_mixTwoQubitDepolarisingKernel(qreal depolLevel, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int part4, long long int part5, long long int rowCol1, long long int rowCol2)
Called once for every 16 amplitudes.
Definition: QuEST_gpu.cu:3076
@ DISTANCE
Definition: QuEST.h:234
int rank
Definition: QuEST.h:364
void statevec_multiControlledPhaseShift(Qureg qureg, int *controlQubits, int numControlQubits, qreal angle)
Definition: QuEST_gpu.cu:1558
int GPUExists(void)
Definition: QuEST_gpu.cu:446
__global__ void copySharedReduceBlock(qreal *arrayIn, qreal *reducedArray, int length)
Definition: QuEST_gpu.cu:1951
void swapDouble(qreal **a, qreal **b)
Definition: QuEST_gpu.cu:2057
void seedQuESTDefault(QuESTEnv *env)
Seeds the random number generator with the (master node) current time and process ID.
Definition: QuEST.c:1614
__global__ void statevec_applyPhaseFuncOverridesKernel(Qureg qureg, int *qubits, int numQubits, enum bitEncoding encoding, qreal *coeffs, qreal *exponents, int numTerms, long long int *overrideInds, qreal *overridePhases, int numOverrides, int conj)
Definition: QuEST_gpu.cu:3522
__global__ void statevec_multiControlledMultiQubitUnitaryKernel(Qureg qureg, long long int ctrlMask, int *targs, int numTargs, qreal *uRe, qreal *uIm, long long int *ampInds, qreal *reAmps, qreal *imAmps, long long int numTargAmps)
Definition: QuEST_gpu.cu:980
__global__ void densmatr_initClassicalStateKernel(long long int densityNumElems, qreal *densityReal, qreal *densityImag, long long int densityInd)
Definition: QuEST_gpu.cu:239
int statevec_initStateFromSingleFile(Qureg *qureg, char filename[200], QuESTEnv env)
Definition: QuEST_gpu.cu:727
int numChunks
The number of nodes between which the elements of this operator are split.
Definition: QuEST.h:304
void statevec_pauliY(Qureg qureg, int targetQubit)
Definition: QuEST_gpu.cu:1393
ComplexArray pairStateVec
Temporary storage for a chunk of the state vector received from another process in the MPI version.
Definition: QuEST.h:343
qreal statevec_findProbabilityOfZero(Qureg qureg, int measureQubit)
Definition: QuEST_gpu.cu:2112
void statevec_controlledCompactUnitary(Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
Definition: QuEST_gpu.cu:911
__global__ void densmatr_calcExpecDiagonalOpKernel(int getRealComp, qreal *matReal, qreal *matImag, qreal *opReal, qreal *opImag, int numQubits, long long int numTermsToSum, qreal *reducedArray)
Definition: QuEST_gpu.cu:3367
void agnostic_syncDiagonalOp(DiagonalOp op)
Definition: QuEST_gpu.cu:382
void getEnvironmentString(QuESTEnv env, char str[200])
Sets str to a string containing information about the runtime environment, including whether simulati...
Definition: QuEST_gpu.cu:505
__global__ void statevec_controlledPhaseShiftKernel(Qureg qureg, int idQubit1, int idQubit2, qreal cosAngle, qreal sinAngle)
Definition: QuEST_gpu.cu:1502
#define DEBUG
Definition: QuEST_gpu.cu:20
int numSeeds
Definition: QuEST.h:367
__global__ void statevec_multiRotateZKernel(Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle)
Definition: QuEST_gpu.cu:1571
int numChunks
Number of chunks the state vector is broken up into – the number of MPI processes used.
Definition: QuEST.h:338
@ TWOS_COMPLEMENT
Definition: QuEST.h:269
__global__ void densmatr_applyDiagonalOpKernel(Qureg qureg, DiagonalOp op)
Definition: QuEST_gpu.cu:3217
ComplexArray deviceOperator
A copy of the elements stored persistently on the GPU.
Definition: QuEST.h:312
void statevec_multiControlledTwoQubitUnitary(Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u)
This calls swapQubitAmps only when it would involve a distributed communication; if the qubit chunks ...
Definition: QuEST_gpu.cu:1172
__global__ void densmatr_calcHilbertSchmidtDistanceSquaredKernel(qreal *aRe, qreal *aIm, qreal *bRe, qreal *bIm, long long int numAmpsToSum, qreal *reducedArray)
Definition: QuEST_gpu.cu:2569
void statevec_controlledPhaseFlip(Qureg qureg, int idQubit1, int idQubit2)
Definition: QuEST_gpu.cu:1708
int chunkId
The position of the chunk of the operator held by this process in the full operator.
Definition: QuEST.h:306
ComplexArray deviceStateVec
Storage for wavefunction amplitudes in the GPU version.
Definition: QuEST.h:346
qreal statevec_getRealAmp(Qureg qureg, long long int index)
Definition: QuEST_gpu.cu:569
@ NORM
Definition: QuEST.h:232
void statevec_createQureg(Qureg *qureg, int numQubits, QuESTEnv env)
Definition: QuEST_gpu.cu:275
qreal densmatr_calcInnerProduct(Qureg a, Qureg b)
Definition: QuEST_gpu.cu:2313
__global__ void densmatr_mixDampingKernel(qreal damping, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int bothBits)
Works like mixDephasing but modifies every other element, and elements are averaged in pairs.
Definition: QuEST_gpu.cu:3001
void statevec_hadamard(Qureg qureg, int targetQubit)
Definition: QuEST_gpu.cu:1826
void statevec_applyMultiVarPhaseFuncOverrides(Qureg qureg, int *qubits, int *numQubitsPerReg, int numRegs, enum bitEncoding encoding, qreal *coeffs, qreal *exponents, int *numTermsPerReg, long long int *overrideInds, qreal *overridePhases, int numOverrides, int conj)
Definition: QuEST_gpu.cu:3695
__global__ void statevec_calcExpecDiagonalOpKernel(int getRealComp, qreal *vecReal, qreal *vecImag, qreal *opReal, qreal *opImag, long long int numTermsToSum, qreal *reducedArray)
computes either a real or imag term of |vec_i|^2 op_i
Definition: QuEST_gpu.cu:3249
Represents a 4x4 matrix of complex numbers.
Definition: QuEST.h:175
void densmatr_oneQubitDegradeOffDiagonal(Qureg qureg, int targetQubit, qreal dephFac)
Definition: QuEST_gpu.cu:2879
@ SCALED_INVERSE_DISTANCE
Definition: QuEST.h:234
__global__ void densmatr_mixTwoQubitDephasingKernel(qreal fac, qreal *vecReal, qreal *vecImag, long long int numBackgroundStates, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int part4, long long int part5, long long int colBit1, long long int rowBit1, long long int colBit2, long long int rowBit2)
Called 12 times for every 16 amplitudes in density matrix Each sums from the |..0....
Definition: QuEST_gpu.cu:2914
Information about the environment the program is running in.
Definition: QuEST.h:362
void statevec_initClassicalState(Qureg qureg, long long int stateInd)
Definition: QuEST_gpu.cu:668
@ UNSIGNED
Definition: QuEST.h:269
void densmatr_calcProbOfAllOutcomes(qreal *outcomeProbs, Qureg qureg, int *qubits, int numQubits)
Definition: QuEST_gpu.cu:2259
void statevec_multiControlledMultiQubitUnitary(Qureg qureg, long long int ctrlMask, int *targs, int numTargs, ComplexMatrixN u)
This calls swapQubitAmps only when it would involve a distributed communication; if the qubit chunks ...
Definition: QuEST_gpu.cu:1039
Represents a general 2^N by 2^N matrix of complex numbers.
Definition: QuEST.h:186
@ INVERSE_DISTANCE
Definition: QuEST.h:234
qreal densmatr_calcTotalProb(Qureg qureg)
Definition: QuEST_gpu.cu:1632
void statevec_multiControlledPhaseFlip(Qureg qureg, int *controlQubits, int numControlQubits)
Definition: QuEST_gpu.cu:1734
__global__ void statevec_controlledCompactUnitaryKernel(Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
Definition: QuEST_gpu.cu:852
#define qreal
void statevec_unitary(Qureg qureg, int targetQubit, ComplexMatrix2 u)
Definition: QuEST_gpu.cu:972
void statevec_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)
Definition: QuEST_gpu.cu:2770
void statevec_pauliX(Qureg qureg, int targetQubit)
Definition: QuEST_gpu.cu:1360
void densmatr_initClassicalState(Qureg qureg, long long int stateInd)
Definition: QuEST_gpu.cu:258
__forceinline__ __device__ long long int flipBit(const long long int number, const int bitInd)
Definition: QuEST_gpu.cu:95
__global__ void statevec_unitaryKernel(Qureg qureg, int targetQubit, ArgMatrix2 u)
Definition: QuEST_gpu.cu:919
__global__ void statevec_multiControlledPhaseFlipKernel(Qureg qureg, long long int mask)
Definition: QuEST_gpu.cu:1716
void statevec_initZeroState(Qureg qureg)
Definition: QuEST_gpu.cu:620
__global__ void statevec_findProbabilityOfZeroKernel(Qureg qureg, int measureQubit, qreal *reducedArray)
Definition: QuEST_gpu.cu:1998
__global__ void statevec_applyDiagonalOpKernel(Qureg qureg, DiagonalOp op)
Definition: QuEST_gpu.cu:3187
int numQubitsInStateVec
Number of qubits in the state-vector - this is double the number represented for mixed states.
Definition: QuEST.h:329
__global__ void statevec_initBlankStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
Definition: QuEST_gpu.cu:583
qreal statevec_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome)
Definition: QuEST_gpu.cu:2150
void statevec_multiControlledMultiRotateZ(Qureg qureg, long long int ctrlMask, long long int targMask, qreal angle)
Definition: QuEST_gpu.cu:1621
void statevec_applyPhaseFuncOverrides(Qureg qureg, int *qubits, int numQubits, enum bitEncoding encoding, qreal *coeffs, qreal *exponents, int numTerms, long long int *overrideInds, qreal *overridePhases, int numOverrides, int conj)
Definition: QuEST_gpu.cu:3576
qreal statevec_calcTotalProb(Qureg qureg)
Definition: QuEST_gpu.cu:1655
__global__ void densmatr_calcInnerProductKernel(Qureg a, Qureg b, long long int numTermsToSum, qreal *reducedArray)
computes Tr(conjTrans(a) b) = sum of (a_ij^* b_ij), which is a real number
Definition: QuEST_gpu.cu:2292
int chunkId
The position of the chunk of the state vector held by this process in the full state vector.
Definition: QuEST.h:336
void statevec_phaseShiftByTerm(Qureg qureg, int targetQubit, Complex term)
Definition: QuEST_gpu.cu:1491
qreal imag[2][2]
Definition: QuEST.h:140
int statevec_compareStates(Qureg mq1, Qureg mq2, qreal precision)
Definition: QuEST_gpu.cu:771
__forceinline__ __device__ long long int insertZeroBit(const long long int number, const int index)
Definition: QuEST_gpu.cu:99
qreal * imag
The imaginary values of the 2^numQubits complex elements.
Definition: QuEST.h:310
void densmatr_mixDensityMatrix(Qureg combineQureg, qreal otherProb, Qureg otherQureg)
Definition: QuEST_gpu.cu:2846
qreal densmatr_findProbabilityOfZero(Qureg qureg, int measureQubit)
Definition: QuEST_gpu.cu:2064
phaseFunc
Flags for specifying named phase functions.
Definition: QuEST.h:231
__global__ void statevec_pauliXKernel(Qureg qureg, int targetQubit)
Definition: QuEST_gpu.cu:1316
__forceinline__ __device__ int getBitMaskParity(long long int mask)
Definition: QuEST_gpu.cu:86
long long int numAmpsPerChunk
Number of probability amplitudes held in stateVec by this process In the non-MPI version,...
Definition: QuEST.h:332
void statevec_applyDiagonalOp(Qureg qureg, DiagonalOp op)
Definition: QuEST_gpu.cu:3209
qreal * termCoeffs
The real coefficient of each Pauli product. This is an array of length PauliHamil....
Definition: QuEST.h:283
void densmatr_initPureState(Qureg targetQureg, Qureg copyQureg)
Definition: QuEST_gpu.cu:205
__global__ void statevec_multiControlledMultiQubitNotKernel(Qureg qureg, int ctrlMask, int targMask)
Definition: QuEST_gpu.cu:1881
__global__ void statevec_phaseShiftByTermKernel(Qureg qureg, int targetQubit, qreal cosAngle, qreal sinAngle)
Definition: QuEST_gpu.cu:1463
void statevec_multiControlledMultiQubitNot(Qureg qureg, int ctrlMask, int targMask)
Definition: QuEST_gpu.cu:1918
enum pauliOpType * pauliCodes
The Pauli operators acting on each qubit, flattened over every operator.
Definition: QuEST.h:281
__global__ void statevec_initZeroStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
Definition: QuEST_gpu.cu:604
void copyStateToGPU(Qureg qureg)
In GPU mode, this copies the state-vector (or density matrix) from RAM (qureg.stateVec) to VRAM / GPU...
Definition: QuEST_gpu.cu:519
#define REDUCE_SHARED_SIZE
Definition: QuEST_gpu.cu:19
@ SCALED_PRODUCT
Definition: QuEST.h:233
__global__ void densmatr_calcPurityKernel(qreal *vecReal, qreal *vecImag, long long int numAmpsToSum, qreal *reducedArray)
Definition: QuEST_gpu.cu:2645
DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env)
Definition: QuEST_gpu.cu:338
void statevec_calcProbOfAllOutcomes(qreal *outcomeProbs, Qureg qureg, int *qubits, int numQubits)
Definition: QuEST_gpu.cu:2207
void statevec_applyParamNamedPhaseFuncOverrides(Qureg qureg, int *qubits, int *numQubitsPerReg, int numRegs, enum bitEncoding encoding, enum phaseFunc phaseFuncName, qreal *params, int numParams, long long int *overrideInds, qreal *overridePhases, int numOverrides, int conj)
Definition: QuEST_gpu.cu:3909
qreal statevec_getImagAmp(Qureg qureg, long long int index)
Definition: QuEST_gpu.cu:576
int numRanks
Definition: QuEST.h:365
qreal imag[4][4]
Definition: QuEST.h:178
qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome)
Definition: QuEST_gpu.cu:2158
void statevec_compactUnitary(Qureg qureg, int targetQubit, Complex alpha, Complex beta)
Definition: QuEST_gpu.cu:844
int numQubits
The number of qubits this operator can act on (informing its size)
Definition: QuEST.h:300
int numSumTerms
The number of terms in the weighted sum, or the number of Pauli products.
Definition: QuEST.h:285
long long int getQubitBitMask(int *qubits, int numQubits)
Definition: QuEST_common.c:50
Represents a diagonal complex operator on the full Hilbert state of a Qureg.
Definition: QuEST.h:297
A Pauli Hamiltonian, expressed as a real-weighted sum of pauli products, and which can hence represen...
Definition: QuEST.h:277
__forceinline__ __device__ long long int insertZeroBits(long long int number, int *inds, const int numInds)
Definition: QuEST_gpu.cu:112
Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op)
Definition: QuEST_gpu.cu:3412
__global__ void densmatr_mixDephasingKernel(qreal fac, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int colBit, long long int rowBit)
Called once for every 4 amplitudes in density matrix Works by establishing the |.....
Definition: QuEST_gpu.cu:2863
@ SCALED_INVERSE_SHIFTED_NORM
Definition: QuEST.h:232
int getNumReductionLevels(long long int numValuesToReduce, int numReducedPerLevel)
Definition: QuEST_gpu.cu:2048
__global__ void statevec_multiControlledMultiRotateZKernel(Qureg qureg, long long int ctrlMask, long long int targMask, qreal cosAngle, qreal sinAngle)
Definition: QuEST_gpu.cu:1599
qreal ** real
Definition: QuEST.h:189
__global__ void statevec_hadamardKernel(Qureg qureg, int targetQubit)
Definition: QuEST_gpu.cu:1777
__global__ void statevec_controlledPhaseFlipKernel(Qureg qureg, int idQubit1, int idQubit2)
Definition: QuEST_gpu.cu:1687
qreal * secondLevelReduction
Definition: QuEST.h:348
__global__ void densmatr_initPlusStateKernel(long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag)
Definition: QuEST_gpu.cu:216
__forceinline__ __device__ int extractBit(const int locationOfBitFromRight, const long long int theEncodedNumber)
Definition: QuEST_gpu.cu:82
__global__ void densmatr_mixDepolarisingKernel(qreal depolLevel, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int bothBits)
Works like mixDephasing but modifies every other element, and elements are averaged in pairs.
Definition: QuEST_gpu.cu:2975
void densmatr_mixTwoQubitDepolarising(Qureg qureg, int qubit1, int qubit2, qreal depolLevel)
Definition: QuEST_gpu.cu:3108
unsigned long int * seeds
Definition: QuEST.h:366
__global__ void statevec_calcProbOfAllOutcomesKernel(qreal *outcomeProbs, Qureg qureg, int *qubits, int numQubits)
Definition: QuEST_gpu.cu:2186
void statevec_controlledPhaseShift(Qureg qureg, int idQubit1, int idQubit2, qreal angle)
Definition: QuEST_gpu.cu:1527
qreal densmatr_calcHilbertSchmidtDistance(Qureg a, Qureg b)
Definition: QuEST_gpu.cu:2593
void densmatr_mixTwoQubitDephasing(Qureg qureg, int qubit1, int qubit2, qreal dephase)
Definition: QuEST_gpu.cu:2938
void statevec_initBlankState(Qureg qureg)
Definition: QuEST_gpu.cu:593
@ INVERSE_NORM
Definition: QuEST.h:232
Represents a system of qubits.
Definition: QuEST.h:322
void statevec_controlledPauliY(Qureg qureg, int controlQubit, int targetQubit)
Definition: QuEST_gpu.cu:1445
__global__ void statevec_initDebugStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
Definition: QuEST_gpu.cu:679
__forceinline__ __device__ long long int insertTwoZeroBits(const long long int number, const int bit1, const int bit2)
Definition: QuEST_gpu.cu:106
Complex statevec_calcInnerProduct(Qureg bra, Qureg ket)
Terrible code which unnecessarily individually computes and sums the real and imaginary components of...
Definition: QuEST_gpu.cu:2393
qreal ** imag
Definition: QuEST.h:190
void densmatr_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)
This involves finding |...i...><...j...| states and killing those where i!=j.
Definition: QuEST_gpu.cu:2805
__global__ void statevec_pauliYKernel(Qureg qureg, int targetQubit, int conjFac)
Definition: QuEST_gpu.cu:1368
void statevec_reportStateToScreen(Qureg qureg, QuESTEnv env, int reportRank)
Print the current state vector of probability amplitudes for a set of qubits to standard out.
Definition: QuEST_gpu.cu:543
@ PRODUCT
Definition: QuEST.h:233
__global__ void statevec_initClassicalStateKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, long long int stateInd)
Definition: QuEST_gpu.cu:653
ComplexArray stateVec
Computational state amplitudes - a subset thereof in the MPI version.
Definition: QuEST.h:341
__global__ void statevec_calcInnerProductKernel(int getRealComp, qreal *vecReal1, qreal *vecImag1, qreal *vecReal2, qreal *vecImag2, long long int numTermsToSum, qreal *reducedArray)
computes either a real or imag term in the inner product
Definition: QuEST_gpu.cu:2363
__global__ void densmatr_findProbabilityOfZeroKernel(Qureg qureg, int measureQubit, qreal *reducedArray)
Definition: QuEST_gpu.cu:1960
qreal real[2][2]
Definition: QuEST.h:139
__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel(Qureg qureg, int *qubits, int *numQubitsPerReg, int numRegs, enum bitEncoding encoding, enum phaseFunc phaseFuncName, qreal *params, int numParams, long long int *overrideInds, qreal *overridePhases, int numOverrides, long long int *phaseInds, int conj)
Definition: QuEST_gpu.cu:3760
long long int numElemsPerChunk
The number of the 2^numQubits amplitudes stored on each distributed node.
Definition: QuEST.h:302
int isDensityMatrix
Whether this instance is a density-state representation.
Definition: QuEST.h:325
__global__ void statevec_controlledPauliYKernel(Qureg qureg, int controlQubit, int targetQubit, int conjFac)
Definition: QuEST_gpu.cu:1409
void reportQuESTEnv(QuESTEnv env)
Report information about the QuEST environment.
Definition: QuEST_gpu.cu:493
int numQubits
Definition: QuEST.h:188
void densmatr_mixDephasing(Qureg qureg, int targetQubit, qreal dephase)
Definition: QuEST_gpu.cu:2899
void statevec_controlledUnitary(Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u)
Definition: QuEST_gpu.cu:1237
void densmatr_initPlusState(Qureg qureg)
Definition: QuEST_gpu.cu:226
void statevec_pauliYConj(Qureg qureg, int targetQubit)
Definition: QuEST_gpu.cu:1401
void statevec_destroyQureg(Qureg qureg, QuESTEnv env)
Definition: QuEST_gpu.cu:321
void statevec_controlledNot(Qureg qureg, int controlQubit, int targetQubit)
Definition: QuEST_gpu.cu:1873
int numQubitsRepresented
The number of qubits represented in either the state-vector or density matrix.
Definition: QuEST.h:327
void statevec_cloneQureg(Qureg targetQureg, Qureg copyQureg)
Works for both statevectors and density matrices.
Definition: QuEST_gpu.cu:170
void agnostic_destroyDiagonalOp(DiagonalOp op)
Definition: QuEST_gpu.cu:375
long long int numAmpsTotal
Total number of amplitudes, which are possibly distributed among machines.
Definition: QuEST.h:334
@ SCALED_DISTANCE
Definition: QuEST.h:234
int syncQuESTSuccess(int successCode)
Performs a logical AND on all successCodes held by all processes.
Definition: QuEST_gpu.cu:485
qreal * real
The real values of the 2^numQubits complex elements.
Definition: QuEST.h:308
qreal real
Definition: QuEST.h:105
__device__ void reduceBlock(qreal *arrayIn, qreal *reducedArray, int length)
Definition: QuEST_gpu.cu:1932
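The signature (together with REDUCE_SHARED_SIZE defined at the top of the file) suggests a per-block shared-memory reduction. The following is only a generic sketch of such a tree reduction, assuming length is a power of two and arrayIn lives in shared memory; the actual routine at line 1932 may handle these details differently.

// Hedged sketch of a classic in-block tree reduction; not the verbatim QuEST code.
__device__ void reduceBlockSketch(qreal *arrayIn, qreal *reducedArray, int length) {
    int tid = threadIdx.x;
    for (int stride = length/2; stride > 0; stride >>= 1) {
        if (tid < stride)
            arrayIn[tid] += arrayIn[tid + stride];    // fold upper half onto lower half
        __syncthreads();                              // all threads must reach the barrier
    }
    if (tid == 0)
        reducedArray[blockIdx.x] = arrayIn[0];        // one partial sum per block
}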
qreal imag
Definition: QuEST.h:106
__global__ void statevec_initStateOfSingleQubitKernel(long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, int qubitId, int outcome)
Definition: QuEST_gpu.cu:700
__global__ void statevec_collapseToKnownProbOutcomeKernel(Qureg qureg, int measureQubit, int outcome, qreal totalProbability)
Definition: QuEST_gpu.cu:2713
void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op)
Definition: QuEST_gpu.cu:3240
void statevec_initDebugState(Qureg qureg)
Initialise the state vector of probability amplitudes to an (unphysical) state with each component of...
Definition: QuEST_gpu.cu:689
void statevec_multiControlledUnitary(Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ComplexMatrix2 u)
Definition: QuEST_gpu.cu:1305
qreal densmatr_calcPurity(Qureg qureg)
Computes the trace of the density matrix squared.
Definition: QuEST_gpu.cu:2664
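Because rho is Hermitian, the trace of rho^2 collapses to a plain sum of squared moduli of the stored elements, which is why a single reduction over the real and imaginary amplitude arrays suffices:

\[
  \mathrm{Tr}(\rho^2)
  \;=\; \sum_{i,j} \rho_{ij}\,\rho_{ji}
  \;=\; \sum_{i,j} \rho_{ij}\,\rho_{ij}^{*}
  \;=\; \sum_{i,j} |\rho_{ij}|^{2}.
\]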
__global__ void statevec_multiControlledTwoQubitUnitaryKernel(Qureg qureg, long long int ctrlMask, int q1, int q2, ArgMatrix4 u)
Definition: QuEST_gpu.cu:1096
@ SCALED_INVERSE_SHIFTED_DISTANCE
Definition: QuEST.h:234
void seedQuEST(QuESTEnv *env, unsigned long int *seedArray, int numSeeds)
Seeds the random number generator with a custom array of key(s), overriding the default keys.
Definition: QuEST_gpu.cu:3965
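A minimal usage sketch of the signature documented above; the key values are arbitrary and chosen only for illustration.

// Seed QuEST's RNG with two caller-chosen keys instead of the default keys.
#include "QuEST.h"

int main(void) {
    QuESTEnv env = createQuESTEnv();
    unsigned long int keys[2] = {12345UL, 67890UL};   // arbitrary example keys
    seedQuEST(&env, keys, 2);
    // ... create quregs, apply gates, measure ...
    destroyQuESTEnv(env);
    return 0;
}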
Represents one complex number.
Definition: QuEST.h:103
QuESTEnv createQuESTEnv(void)
Create the QuEST execution environment.
Definition: QuEST_gpu.cu:463
void statevec_initStateOfSingleQubit(Qureg *qureg, int qubitId, int outcome)
Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all...
Definition: QuEST_gpu.cu:718
@ SCALED_NORM
Definition: QuEST.h:232
@ SCALED_INVERSE_PRODUCT
Definition: QuEST.h:233
void statevec_setWeightedQureg(Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
Definition: QuEST_gpu.cu:3175
qreal densmatr_calcFidelity(Qureg qureg, Qureg pureState)
Definition: QuEST_gpu.cu:2519
void statevec_initPlusState(Qureg qureg)
Definition: QuEST_gpu.cu:642
qreal * firstLevelReduction
Storage for reduction of probabilities on GPU.
Definition: QuEST.h:348
__global__ void statevec_compactUnitaryKernel(Qureg qureg, int rotQubit, Complex alpha, Complex beta)
Definition: QuEST_gpu.cu:789
__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel(Qureg qureg, int *qubits, int *numQubitsPerReg, int numRegs, enum bitEncoding encoding, qreal *coeffs, qreal *exponents, int *numTermsPerReg, long long int *overrideInds, qreal *overridePhases, int numOverrides, long long int *phaseInds, int conj)
Definition: QuEST_gpu.cu:3611
__global__ void statevec_controlledNotKernel(Qureg qureg, int controlQubit, int targetQubit)
Definition: QuEST_gpu.cu:1834
__global__ void densmatr_calcProbOfAllOutcomesKernel(qreal *outcomeProbs, Qureg qureg, int *qubits, int numQubits)
Definition: QuEST_gpu.cu:2238
Complex statevec_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op)
Definition: QuEST_gpu.cu:3276
void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal *real, qreal *imag, long long int numElems)
Definition: QuEST_gpu.cu:3503
__global__ void densmatr_mixDensityMatrixKernel(Qureg combineQureg, qreal otherProb, Qureg otherQureg, long long int numAmpsToVisit)
Definition: QuEST_gpu.cu:2834
void densmatr_mixDepolarising(Qureg qureg, int targetQubit, qreal depolLevel)
Definition: QuEST_gpu.cu:3022
bitEncoding
Flags for specifying how the bits in sub-register computational basis states are mapped to indices in...
Definition: QuEST.h:269
__global__ void statevec_swapQubitAmpsKernel(Qureg qureg, int qb1, int qb2)
Definition: QuEST_gpu.cu:1743
Represents a 2x2 matrix of complex numbers.
Definition: QuEST.h:137
void statevec_controlledPauliYConj(Qureg qureg, int controlQubit, int targetQubit)
Definition: QuEST_gpu.cu:1454
__global__ void densmatr_calcFidelityKernel(Qureg dens, Qureg vec, long long int dim, qreal *reducedArray)
Computes one term of (vec^*T) dens * vec.
Definition: QuEST_gpu.cu:2481
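In other words, for a pure state |psi> (vec) and a density matrix rho (dens), the kernel evaluates summands of the quadratic form

\[
  F\bigl(\rho, |\psi\rangle\bigr)
  \;=\; \langle \psi | \rho | \psi \rangle
  \;=\; \sum_{i,j} \psi_i^{*}\, \rho_{ij}\, \psi_j ,
\]

with each (i, j) pair contributing one term to the reduction.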
@ SCALED_INVERSE_NORM
Definition: QuEST.h:232
__global__ void statevec_multiControlledPhaseShiftKernel(Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle)
Definition: QuEST_gpu.cu:1538