QuEST_gpu.cu File Reference
#include "QuEST.h"
#include "QuEST_precision.h"
#include "QuEST_internal.h"
#include "mt19937ar.h"
#include <stdlib.h>
#include <stdio.h>
#include <math.h>


Macros

#define DEBUG   0
 
#define REDUCE_SHARED_SIZE   512
 

Functions

DiagonalOp agnostic_createDiagonalOp (int numQubits, QuESTEnv env)
 
void agnostic_destroyDiagonalOp (DiagonalOp op)
 
void agnostic_initDiagonalOpFromPauliHamil (DiagonalOp op, PauliHamil hamil)
 
__global__ void agnostic_initDiagonalOpFromPauliHamilKernel (DiagonalOp op, enum pauliOpType *pauliCodes, qreal *termCoeffs, int numSumTerms)
 
void agnostic_setDiagonalOpElems (DiagonalOp op, long long int startInd, qreal *real, qreal *imag, long long int numElems)
 
void agnostic_syncDiagonalOp (DiagonalOp op)
 
__global__ void copySharedReduceBlock (qreal *arrayIn, qreal *reducedArray, int length)
 
void copyStateFromGPU (Qureg qureg)
 In GPU mode, this copies the state-vector (or density matrix) from GPU memory (qureg.deviceStateVec) to RAM (qureg.stateVec), where it can be accessed/modified by the user. More...
 
void copyStateToGPU (Qureg qureg)
 In GPU mode, this copies the state-vector (or density matrix) from RAM (qureg.stateVec) to VRAM / GPU-memory (qureg.deviceStateVec), which is the version operated upon by other calls to the API. More...
 
QuESTEnv createQuESTEnv (void)
 Create the QuEST execution environment. More...
 
void densmatr_applyDiagonalOp (Qureg qureg, DiagonalOp op)
 
__global__ void densmatr_applyDiagonalOpKernel (Qureg qureg, DiagonalOp op)
 
Complex densmatr_calcExpecDiagonalOp (Qureg qureg, DiagonalOp op)
 
__global__ void densmatr_calcExpecDiagonalOpKernel (int getRealComp, qreal *matReal, qreal *matImag, qreal *opReal, qreal *opImag, int numQubits, long long int numTermsToSum, qreal *reducedArray)
 
qreal densmatr_calcFidelity (Qureg qureg, Qureg pureState)
 
__global__ void densmatr_calcFidelityKernel (Qureg dens, Qureg vec, long long int dim, qreal *reducedArray)
 Computes one term of (vec^*T) dens * vec. More...
 
qreal densmatr_calcHilbertSchmidtDistance (Qureg a, Qureg b)
 
__global__ void densmatr_calcHilbertSchmidtDistanceSquaredKernel (qreal *aRe, qreal *aIm, qreal *bRe, qreal *bIm, long long int numAmpsToSum, qreal *reducedArray)
 
qreal densmatr_calcInnerProduct (Qureg a, Qureg b)
 
__global__ void densmatr_calcInnerProductKernel (Qureg a, Qureg b, long long int numTermsToSum, qreal *reducedArray)
 Computes Tr(conjTrans(a) b) = sum of (a_ij^* b_ij), which is a real number. More...
 
void densmatr_calcProbOfAllOutcomes (qreal *outcomeProbs, Qureg qureg, int *qubits, int numQubits)
 
__global__ void densmatr_calcProbOfAllOutcomesKernel (qreal *outcomeProbs, Qureg qureg, int *qubits, int numQubits)
 
qreal densmatr_calcProbOfOutcome (Qureg qureg, int measureQubit, int outcome)
 
qreal densmatr_calcPurity (Qureg qureg)
 Computes the trace of the density matrix squared. More...
 
__global__ void densmatr_calcPurityKernel (qreal *vecReal, qreal *vecImag, long long int numAmpsToSum, qreal *reducedArray)
 
qreal densmatr_calcTotalProb (Qureg qureg)
 
void densmatr_collapseToKnownProbOutcome (Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)
 This involves finding |...i...><...j...| states and killing those where i!=j. More...
 
__global__ void densmatr_collapseToKnownProbOutcomeKernel (qreal outcomeProb, qreal *vecReal, qreal *vecImag, long long int numBasesToVisit, long long int part1, long long int part2, long long int part3, long long int rowBit, long long int colBit, long long int desired, long long int undesired)
 Maps thread ID to a |..0..><..0..| state and then locates |0><1|, |1><0| and |1><1|. More...
 
qreal densmatr_findProbabilityOfZero (Qureg qureg, int measureQubit)
 
__global__ void densmatr_findProbabilityOfZeroKernel (Qureg qureg, int measureQubit, qreal *reducedArray)
 
void densmatr_initClassicalState (Qureg qureg, long long int stateInd)
 
__global__ void densmatr_initClassicalStateKernel (long long int densityNumElems, qreal *densityReal, qreal *densityImag, long long int densityInd)
 
void densmatr_initPlusState (Qureg qureg)
 
__global__ void densmatr_initPlusStateKernel (long long int stateVecSize, qreal probFactor, qreal *stateVecReal, qreal *stateVecImag)
 
void densmatr_initPureState (Qureg targetQureg, Qureg copyQureg)
 
__global__ void densmatr_initPureStateKernel (long long int numPureAmps, qreal *targetVecReal, qreal *targetVecImag, qreal *copyVecReal, qreal *copyVecImag)
 
void densmatr_mixDamping (Qureg qureg, int targetQubit, qreal damping)
 
__global__ void densmatr_mixDampingKernel (qreal damping, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int bothBits)
 Works like mixDephasing but modifies every other element, and elements are averaged in pairs. More...
 
void densmatr_mixDensityMatrix (Qureg combineQureg, qreal otherProb, Qureg otherQureg)
 
__global__ void densmatr_mixDensityMatrixKernel (Qureg combineQureg, qreal otherProb, Qureg otherQureg, long long int numAmpsToVisit)
 
void densmatr_mixDephasing (Qureg qureg, int targetQubit, qreal dephase)
 
__global__ void densmatr_mixDephasingKernel (qreal fac, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int colBit, long long int rowBit)
 Called once for every 4 amplitudes in the density matrix. Works by establishing the |..0..><..0..| state (for its given index) then visiting |..1..><..0..| and |..0..><..1..|. More...
 
void densmatr_mixDepolarising (Qureg qureg, int targetQubit, qreal depolLevel)
 
__global__ void densmatr_mixDepolarisingKernel (qreal depolLevel, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int bothBits)
 Works like mixDephasing but modifies every other element, and elements are averaged in pairs. More...
 
void densmatr_mixTwoQubitDephasing (Qureg qureg, int qubit1, int qubit2, qreal dephase)
 
__global__ void densmatr_mixTwoQubitDephasingKernel (qreal fac, qreal *vecReal, qreal *vecImag, long long int numBackgroundStates, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int part4, long long int part5, long long int colBit1, long long int rowBit1, long long int colBit2, long long int rowBit2)
 Called 12 times for every 16 amplitudes in the density matrix. Each sums from the |..0..0..><..0..0..| index to visit either |..0..0..><..0..1..|, |..0..0..><..1..0..|, |..0..0..><..1..1..|, |..0..1..><..0..0..|, and so on up to |..1..1..><..1..0..|. More...
 
void densmatr_mixTwoQubitDepolarising (Qureg qureg, int qubit1, int qubit2, qreal depolLevel)
 
__global__ void densmatr_mixTwoQubitDepolarisingKernel (qreal depolLevel, qreal *vecReal, qreal *vecImag, long long int numAmpsToVisit, long long int part1, long long int part2, long long int part3, long long int part4, long long int part5, long long int rowCol1, long long int rowCol2)
 Called once for every 16 amplitudes. More...
 
void densmatr_oneQubitDegradeOffDiagonal (Qureg qureg, int targetQubit, qreal dephFac)
 
void destroyQuESTEnv (QuESTEnv env)
 Destroy the QuEST environment. More...
 
__forceinline__ __device__ int extractBit (const int locationOfBitFromRight, const long long int theEncodedNumber)
 
__forceinline__ __device__ long long int flipBit (const long long int number, const int bitInd)
 
__forceinline__ __device__ int getBitMaskParity (long long int mask)
 
void getEnvironmentString (QuESTEnv env, char str[200])
 Sets str to a string containing information about the runtime environment, including whether simulation is using CUDA (for GPU), OpenMP (for multithreading) and/or MPI (for distribution). More...
 
int getNumReductionLevels (long long int numValuesToReduce, int numReducedPerLevel)
 
int GPUExists (void)
 
__forceinline__ __device__ long long int insertTwoZeroBits (const long long int number, const int bit1, const int bit2)
 
__forceinline__ __device__ long long int insertZeroBit (const long long int number, const int index)
 
__forceinline__ __device__ long long int insertZeroBits (long long int number, int *inds, const int numInds)
 
__device__ __host__ unsigned int log2Int (unsigned int x)
 
__device__ void reduceBlock (qreal *arrayIn, qreal *reducedArray, int length)
 
void reportQuESTEnv (QuESTEnv env)
 Report information about the QuEST environment. More...
 
void seedQuEST (QuESTEnv *env, unsigned long int *seedArray, int numSeeds)
 Seeds the random number generator with a custom array of key(s), overriding the default keys. More...
 
void statevec_applyDiagonalOp (Qureg qureg, DiagonalOp op)
 
__global__ void statevec_applyDiagonalOpKernel (Qureg qureg, DiagonalOp op)
 
void statevec_applyMultiVarPhaseFuncOverrides (Qureg qureg, int *qubits, int *numQubitsPerReg, int numRegs, enum bitEncoding encoding, qreal *coeffs, qreal *exponents, int *numTermsPerReg, long long int *overrideInds, qreal *overridePhases, int numOverrides, int conj)
 
__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel (Qureg qureg, int *qubits, int *numQubitsPerReg, int numRegs, enum bitEncoding encoding, qreal *coeffs, qreal *exponents, int *numTermsPerReg, long long int *overrideInds, qreal *overridePhases, int numOverrides, long long int *phaseInds, int conj)
 
void statevec_applyParamNamedPhaseFuncOverrides (Qureg qureg, int *qubits, int *numQubitsPerReg, int numRegs, enum bitEncoding encoding, enum phaseFunc phaseFuncName, qreal *params, int numParams, long long int *overrideInds, qreal *overridePhases, int numOverrides, int conj)
 
__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel (Qureg qureg, int *qubits, int *numQubitsPerReg, int numRegs, enum bitEncoding encoding, enum phaseFunc phaseFuncName, qreal *params, int numParams, long long int *overrideInds, qreal *overridePhases, int numOverrides, long long int *phaseInds, int conj)
 
void statevec_applyPhaseFuncOverrides (Qureg qureg, int *qubits, int numQubits, enum bitEncoding encoding, qreal *coeffs, qreal *exponents, int numTerms, long long int *overrideInds, qreal *overridePhases, int numOverrides, int conj)
 
__global__ void statevec_applyPhaseFuncOverridesKernel (Qureg qureg, int *qubits, int numQubits, enum bitEncoding encoding, qreal *coeffs, qreal *exponents, int numTerms, long long int *overrideInds, qreal *overridePhases, int numOverrides, int conj)
 
Complex statevec_calcExpecDiagonalOp (Qureg qureg, DiagonalOp op)
 
__global__ void statevec_calcExpecDiagonalOpKernel (int getRealComp, qreal *vecReal, qreal *vecImag, qreal *opReal, qreal *opImag, long long int numTermsToSum, qreal *reducedArray)
 Computes either a real or imag term of |vec_i|^2 op_i. More...
 
Complex statevec_calcInnerProduct (Qureg bra, Qureg ket)
 Terrible code which unnecessarily individually computes and sums the real and imaginary components of the inner product, so as to not have to worry about keeping the sums separated during reduction. More...
 
__global__ void statevec_calcInnerProductKernel (int getRealComp, qreal *vecReal1, qreal *vecImag1, qreal *vecReal2, qreal *vecImag2, long long int numTermsToSum, qreal *reducedArray)
 Computes either a real or imag term in the inner product. More...
 
void statevec_calcProbOfAllOutcomes (qreal *outcomeProbs, Qureg qureg, int *qubits, int numQubits)
 
__global__ void statevec_calcProbOfAllOutcomesKernel (qreal *outcomeProbs, Qureg qureg, int *qubits, int numQubits)
 
qreal statevec_calcProbOfOutcome (Qureg qureg, int measureQubit, int outcome)
 
qreal statevec_calcTotalProb (Qureg qureg)
 
void statevec_cloneQureg (Qureg targetQureg, Qureg copyQureg)
 Works for both statevectors and density matrices. More...
 
void statevec_collapseToKnownProbOutcome (Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)
 
__global__ void statevec_collapseToKnownProbOutcomeKernel (Qureg qureg, int measureQubit, int outcome, qreal totalProbability)
 
void statevec_compactUnitary (Qureg qureg, int targetQubit, Complex alpha, Complex beta)
 
__global__ void statevec_compactUnitaryKernel (Qureg qureg, int rotQubit, Complex alpha, Complex beta)
 
int statevec_compareStates (Qureg mq1, Qureg mq2, qreal precision)
 
void statevec_controlledCompactUnitary (Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
 
__global__ void statevec_controlledCompactUnitaryKernel (Qureg qureg, int controlQubit, int targetQubit, Complex alpha, Complex beta)
 
void statevec_controlledNot (Qureg qureg, int controlQubit, int targetQubit)
 
__global__ void statevec_controlledNotKernel (Qureg qureg, int controlQubit, int targetQubit)
 
void statevec_controlledPauliY (Qureg qureg, int controlQubit, int targetQubit)
 
void statevec_controlledPauliYConj (Qureg qureg, int controlQubit, int targetQubit)
 
__global__ void statevec_controlledPauliYKernel (Qureg qureg, int controlQubit, int targetQubit, int conjFac)
 
void statevec_controlledPhaseFlip (Qureg qureg, int idQubit1, int idQubit2)
 
__global__ void statevec_controlledPhaseFlipKernel (Qureg qureg, int idQubit1, int idQubit2)
 
void statevec_controlledPhaseShift (Qureg qureg, int idQubit1, int idQubit2, qreal angle)
 
__global__ void statevec_controlledPhaseShiftKernel (Qureg qureg, int idQubit1, int idQubit2, qreal cosAngle, qreal sinAngle)
 
void statevec_controlledUnitary (Qureg qureg, int controlQubit, int targetQubit, ComplexMatrix2 u)
 
__global__ void statevec_controlledUnitaryKernel (Qureg qureg, int controlQubit, int targetQubit, ArgMatrix2 u)
 
void statevec_createQureg (Qureg *qureg, int numQubits, QuESTEnv env)
 
void statevec_destroyQureg (Qureg qureg, QuESTEnv env)
 
qreal statevec_findProbabilityOfZero (Qureg qureg, int measureQubit)
 
__global__ void statevec_findProbabilityOfZeroKernel (Qureg qureg, int measureQubit, qreal *reducedArray)
 
qreal statevec_getImagAmp (Qureg qureg, long long int index)
 
qreal statevec_getRealAmp (Qureg qureg, long long int index)
 
void statevec_hadamard (Qureg qureg, int targetQubit)
 
__global__ void statevec_hadamardKernel (Qureg qureg, int targetQubit)
 
void statevec_initBlankState (Qureg qureg)
 
__global__ void statevec_initBlankStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
 
void statevec_initClassicalState (Qureg qureg, long long int stateInd)
 
__global__ void statevec_initClassicalStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, long long int stateInd)
 
void statevec_initDebugState (Qureg qureg)
 Initialise the state vector of probability amplitudes to an (unphysical) state with each component of each probability amplitude a unique floating point value. More...
 
__global__ void statevec_initDebugStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
 
void statevec_initPlusState (Qureg qureg)
 
__global__ void statevec_initPlusStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
 
int statevec_initStateFromSingleFile (Qureg *qureg, char filename[200], QuESTEnv env)
 
void statevec_initStateOfSingleQubit (Qureg *qureg, int qubitId, int outcome)
 Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all other qubits are in an equal superposition of zero and one. More...
 
__global__ void statevec_initStateOfSingleQubitKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag, int qubitId, int outcome)
 
void statevec_initZeroState (Qureg qureg)
 
__global__ void statevec_initZeroStateKernel (long long int stateVecSize, qreal *stateVecReal, qreal *stateVecImag)
 
void statevec_multiControlledMultiQubitNot (Qureg qureg, int ctrlMask, int targMask)
 
__global__ void statevec_multiControlledMultiQubitNotKernel (Qureg qureg, int ctrlMask, int targMask)
 
void statevec_multiControlledMultiQubitUnitary (Qureg qureg, long long int ctrlMask, int *targs, int numTargs, ComplexMatrixN u)
 This calls swapQubitAmps only when it would involve a distributed communication; if the qubit chunks already fit in the node, it applies the unitary directly. More...
 
__global__ void statevec_multiControlledMultiQubitUnitaryKernel (Qureg qureg, long long int ctrlMask, int *targs, int numTargs, qreal *uRe, qreal *uIm, long long int *ampInds, qreal *reAmps, qreal *imAmps, long long int numTargAmps)
 
void statevec_multiControlledMultiRotateZ (Qureg qureg, long long int ctrlMask, long long int targMask, qreal angle)
 
__global__ void statevec_multiControlledMultiRotateZKernel (Qureg qureg, long long int ctrlMask, long long int targMask, qreal cosAngle, qreal sinAngle)
 
void statevec_multiControlledPhaseFlip (Qureg qureg, int *controlQubits, int numControlQubits)
 
__global__ void statevec_multiControlledPhaseFlipKernel (Qureg qureg, long long int mask)
 
void statevec_multiControlledPhaseShift (Qureg qureg, int *controlQubits, int numControlQubits, qreal angle)
 
__global__ void statevec_multiControlledPhaseShiftKernel (Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle)
 
void statevec_multiControlledTwoQubitUnitary (Qureg qureg, long long int ctrlMask, int q1, int q2, ComplexMatrix4 u)
 This calls swapQubitAmps only when it would involve a distributed communication; if the qubit chunks already fit in the node, it applies the unitary directly. More...
 
__global__ void statevec_multiControlledTwoQubitUnitaryKernel (Qureg qureg, long long int ctrlMask, int q1, int q2, ArgMatrix4 u)
 
void statevec_multiControlledUnitary (Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ComplexMatrix2 u)
 
__global__ void statevec_multiControlledUnitaryKernel (Qureg qureg, long long int ctrlQubitsMask, long long int ctrlFlipMask, int targetQubit, ArgMatrix2 u)
 
void statevec_multiRotateZ (Qureg qureg, long long int mask, qreal angle)
 
__global__ void statevec_multiRotateZKernel (Qureg qureg, long long int mask, qreal cosAngle, qreal sinAngle)
 
void statevec_pauliX (Qureg qureg, int targetQubit)
 
__global__ void statevec_pauliXKernel (Qureg qureg, int targetQubit)
 
void statevec_pauliY (Qureg qureg, int targetQubit)
 
void statevec_pauliYConj (Qureg qureg, int targetQubit)
 
__global__ void statevec_pauliYKernel (Qureg qureg, int targetQubit, int conjFac)
 
void statevec_phaseShiftByTerm (Qureg qureg, int targetQubit, Complex term)
 
__global__ void statevec_phaseShiftByTermKernel (Qureg qureg, int targetQubit, qreal cosAngle, qreal sinAngle)
 
void statevec_reportStateToScreen (Qureg qureg, QuESTEnv env, int reportRank)
 Print the current state vector of probability amplitudes for a set of qubits to standard out. More...
 
void statevec_setAmps (Qureg qureg, long long int startInd, qreal *reals, qreal *imags, long long int numAmps)
 
void statevec_setWeightedQureg (Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
 
__global__ void statevec_setWeightedQuregKernel (Complex fac1, Qureg qureg1, Complex fac2, Qureg qureg2, Complex facOut, Qureg out)
 
void statevec_swapQubitAmps (Qureg qureg, int qb1, int qb2)
 
__global__ void statevec_swapQubitAmpsKernel (Qureg qureg, int qb1, int qb2)
 
void statevec_unitary (Qureg qureg, int targetQubit, ComplexMatrix2 u)
 
__global__ void statevec_unitaryKernel (Qureg qureg, int targetQubit, ArgMatrix2 u)
 
void swapDouble (qreal **a, qreal **b)
 
void syncQuESTEnv (QuESTEnv env)
 Guarantees that all code up to the given point has been executed on all nodes (if running in distributed mode). More...
 
int syncQuESTSuccess (int successCode)
 Performs a logical AND on all successCodes held by all processes. More...
 

Detailed Description

An implementation of the backend in ../QuEST_internal.h for a GPU environment.

Author
Ania Brown
Tyson Jones

Definition in file QuEST_gpu.cu.

Macro Definition Documentation

◆ DEBUG

#define DEBUG   0

Definition at line 20 of file QuEST_gpu.cu.

◆ REDUCE_SHARED_SIZE

#define REDUCE_SHARED_SIZE   512

Definition at line 19 of file QuEST_gpu.cu.

Function Documentation

◆ agnostic_createDiagonalOp()

DiagonalOp agnostic_createDiagonalOp(int numQubits, QuESTEnv env)

Definition at line 338 of file QuEST_gpu.cu.

{
    DiagonalOp op;
    op.numQubits = numQubits;
    op.numElemsPerChunk = (1LL << numQubits) / env.numRanks;
    op.chunkId = env.rank;
    op.numChunks = env.numRanks;

    // allocate CPU memory (initialised to zero)
    op.real = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
    op.imag = (qreal*) calloc(op.numElemsPerChunk, sizeof(qreal));
    // @TODO no handling of rank>1 allocation (no distributed GPU)

    // check CPU memory allocation was successful
    if (!op.real || !op.imag) {
        printf("Could not allocate memory!\n");
        exit(EXIT_FAILURE);
    }

    // allocate GPU memory
    size_t arrSize = op.numElemsPerChunk * sizeof(qreal);
    cudaMalloc(&(op.deviceOperator.real), arrSize);
    cudaMalloc(&(op.deviceOperator.imag), arrSize);

    // check GPU memory allocation was successful
    if (!op.deviceOperator.real || !op.deviceOperator.imag) {
        printf("Could not allocate memory on GPU!\n");
        exit(EXIT_FAILURE);
    }

    // initialise GPU memory to zero
    cudaMemset(op.deviceOperator.real, 0, arrSize);
    cudaMemset(op.deviceOperator.imag, 0, arrSize);

    return op;
}

References DiagonalOp::chunkId, DiagonalOp::deviceOperator, DiagonalOp::imag, DiagonalOp::numChunks, DiagonalOp::numElemsPerChunk, DiagonalOp::numQubits, QuESTEnv::numRanks, qreal, QuESTEnv::rank, and DiagonalOp::real.

Referenced by createDiagonalOp(), and createDiagonalOpFromPauliHamilFile().
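
For orientation, here is a minimal host-side lifecycle sketch, assuming a single-rank QuESTEnv (so numElemsPerChunk equals the full dimension 2^numQubits). It is illustrative only and not taken from the QuEST source:

// hypothetical usage of the diagonal-operator backend functions above
QuESTEnv env = createQuESTEnv();
DiagonalOp op = agnostic_createDiagonalOp(3, env);   // zeroed in both RAM and VRAM

// populate the RAM copies, then push them to GPU memory
for (long long int i = 0; i < (1LL << 3); i++) {
    op.real[i] = (qreal) i;   // some real diagonal elements
    op.imag[i] = 0;
}
agnostic_syncDiagonalOp(op);      // copies op.real/op.imag into op.deviceOperator

agnostic_destroyDiagonalOp(op);   // frees both the RAM and VRAM allocations
destroyQuESTEnv(env);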

◆ agnostic_destroyDiagonalOp()

void agnostic_destroyDiagonalOp(DiagonalOp op)

Definition at line 375 of file QuEST_gpu.cu.

{
    free(op.real);
    free(op.imag);
    cudaFree(op.deviceOperator.real);
    cudaFree(op.deviceOperator.imag);
}

References DiagonalOp::deviceOperator, DiagonalOp::imag, and DiagonalOp::real.

Referenced by destroyDiagonalOp().

◆ agnostic_initDiagonalOpFromPauliHamil()

void agnostic_initDiagonalOpFromPauliHamil(DiagonalOp op, PauliHamil hamil)

Definition at line 418 of file QuEST_gpu.cu.

{
    // copy args into GPU memory
    enum pauliOpType* d_pauliCodes;
    size_t mem_pauliCodes = hamil.numSumTerms * op.numQubits * sizeof *d_pauliCodes;
    cudaMalloc(&d_pauliCodes, mem_pauliCodes);
    cudaMemcpy(d_pauliCodes, hamil.pauliCodes, mem_pauliCodes, cudaMemcpyHostToDevice);

    qreal* d_termCoeffs;
    size_t mem_termCoeffs = hamil.numSumTerms * sizeof *d_termCoeffs;
    cudaMalloc(&d_termCoeffs, mem_termCoeffs);
    cudaMemcpy(d_termCoeffs, hamil.termCoeffs, mem_termCoeffs, cudaMemcpyHostToDevice);

    int numThreadsPerBlock = 128;
    int numBlocks = ceil(op.numElemsPerChunk / (qreal) numThreadsPerBlock);
    agnostic_initDiagonalOpFromPauliHamilKernel<<<numBlocks, numThreadsPerBlock>>>(
        op, d_pauliCodes, d_termCoeffs, hamil.numSumTerms);

    // copy the populated operator into RAM
    cudaDeviceSynchronize();
    size_t mem_elems = op.numElemsPerChunk * sizeof *op.real;
    cudaMemcpy(op.real, op.deviceOperator.real, mem_elems, cudaMemcpyDeviceToHost);
    cudaMemcpy(op.imag, op.deviceOperator.imag, mem_elems, cudaMemcpyDeviceToHost);

    cudaFree(d_pauliCodes);
    cudaFree(d_termCoeffs);
}

References DiagonalOp::numElemsPerChunk, DiagonalOp::numQubits, PauliHamil::numSumTerms, PauliHamil::pauliCodes, qreal, and PauliHamil::termCoeffs.

Referenced by createDiagonalOpFromPauliHamilFile(), and initDiagonalOpFromPauliHamil().

◆ agnostic_initDiagonalOpFromPauliHamilKernel()

__global__ void agnostic_initDiagonalOpFromPauliHamilKernel(DiagonalOp op, enum pauliOpType* pauliCodes, qreal* termCoeffs, int numSumTerms)

Definition at line 389 of file QuEST_gpu.cu.

{
    // each thread processes one diagonal element
    long long int elemInd = blockIdx.x*blockDim.x + threadIdx.x;
    if (elemInd >= op.numElemsPerChunk)
        return;

    qreal elem = 0;

    // elem is (+-) every coefficient, with sign determined by parity
    for (int t=0; t<numSumTerms; t++) {

        // determine the parity of the Z-targeted qubits in the element's corresponding state
        int isOddNumOnes = 0;
        for (int q=0; q<op.numQubits; q++)
            if (pauliCodes[q + t*op.numQubits] == PAULI_Z)
                if (extractBit(q, elemInd))
                    isOddNumOnes = !isOddNumOnes;

        // avoid warp divergence
        int sign = 1 - 2*isOddNumOnes; // (-1 if isOddNumOnes, else +1)
        elem += termCoeffs[t] * sign;
    }

    op.deviceOperator.real[elemInd] = elem;
    op.deviceOperator.imag[elemInd] = 0;
}

References DiagonalOp::deviceOperator, extractBit(), DiagonalOp::numElemsPerChunk, DiagonalOp::numQubits, PAULI_Z, and qreal.
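
For intuition, the parity-sign trick can be isolated into a small standalone helper (an illustrative sketch; zTermSign is not a QuEST function):

// sign of one Z...Z product term on one computational basis state:
// +1 if an even number of Z-targeted bits of elemInd are 1, else -1
static __device__ int zTermSign(long long int elemInd, const enum pauliOpType* codes, int numQubits) {
    int parity = 0;
    for (int q = 0; q < numQubits; q++)
        if (codes[q] == PAULI_Z && ((elemInd >> q) & 1))
            parity ^= 1;
    return 1 - 2*parity;   // maps {0,1} to {+1,-1} without branching
}

For example, a term with a Z code on qubit 0 only, evaluated at elemInd = 5 = 0b101, sees one set bit under a Z code, so its coefficient enters the sum with sign -1.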

◆ agnostic_setDiagonalOpElems()

void agnostic_setDiagonalOpElems(DiagonalOp op, long long int startInd, qreal* real, qreal* imag, long long int numElems)

Definition at line 3503 of file QuEST_gpu.cu.

{
    // update both RAM and VRAM, for consistency
    memcpy(&op.real[startInd], real, numElems * sizeof(qreal));
    memcpy(&op.imag[startInd], imag, numElems * sizeof(qreal));

    cudaDeviceSynchronize();
    cudaMemcpy(
        op.deviceOperator.real + startInd,
        real,
        numElems * sizeof(*(op.deviceOperator.real)),
        cudaMemcpyHostToDevice);
    cudaMemcpy(
        op.deviceOperator.imag + startInd,
        imag,
        numElems * sizeof(*(op.deviceOperator.imag)),
        cudaMemcpyHostToDevice);
}

References DiagonalOp::deviceOperator, DiagonalOp::imag, qreal, and DiagonalOp::real.

Referenced by initDiagonalOp(), and setDiagonalOpElems().

◆ agnostic_syncDiagonalOp()

void agnostic_syncDiagonalOp(DiagonalOp op)

Definition at line 382 of file QuEST_gpu.cu.

{
    cudaDeviceSynchronize();
    size_t mem_elems = op.numElemsPerChunk * sizeof *op.real;
    cudaMemcpy(op.deviceOperator.real, op.real, mem_elems, cudaMemcpyHostToDevice);
    cudaMemcpy(op.deviceOperator.imag, op.imag, mem_elems, cudaMemcpyHostToDevice);
}

References DiagonalOp::deviceOperator, DiagonalOp::imag, DiagonalOp::numElemsPerChunk, and DiagonalOp::real.

Referenced by syncDiagonalOp().

◆ copySharedReduceBlock()

__global__ void copySharedReduceBlock ( qreal arrayIn,
qreal reducedArray,
int  length 
)

Definition at line 1951 of file QuEST_gpu.cu.

1951  {
1952  extern __shared__ qreal tempReductionArray[];
1953  int blockOffset = blockIdx.x*length;
1954  tempReductionArray[threadIdx.x*2] = arrayIn[blockOffset + threadIdx.x*2];
1955  tempReductionArray[threadIdx.x*2+1] = arrayIn[blockOffset + threadIdx.x*2+1];
1956  __syncthreads();
1957  reduceBlock(tempReductionArray, reducedArray, length);
1958 }

References qreal, and reduceBlock().

Referenced by densmatr_calcExpecDiagonalOp(), densmatr_calcFidelity(), densmatr_calcHilbertSchmidtDistance(), densmatr_calcInnerProduct(), densmatr_calcPurity(), densmatr_findProbabilityOfZero(), statevec_calcExpecDiagonalOp(), statevec_calcInnerProduct(), and statevec_findProbabilityOfZero().
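
This kernel performs one level of the multi-level tree reduction that each of the calc* host functions below re-implements (the @TODO in densmatr_calcExpecDiagonalOp laments exactly this repetition). A hedged sketch of that repeated driver pattern, written as a hypothetical gpuSumReduce helper which does not exist in QuEST:

// sketch: reduce q.numAmpsPerChunk per-amplitude terms to a single scalar
qreal gpuSumReduce(Qureg q) {
    long long int numValuesToReduce = q.numAmpsPerChunk;
    int maxReducedPerLevel = REDUCE_SHARED_SIZE;
    int firstTime = 1;
    while (numValuesToReduce > 1) {
        // each block reduces up to REDUCE_SHARED_SIZE values in shared memory
        int valuesPerCUDABlock = (numValuesToReduce < maxReducedPerLevel)?
            numValuesToReduce : maxReducedPerLevel;
        int numCUDABlocks = ceil(numValuesToReduce / (qreal) valuesPerCUDABlock);
        int sharedMemSize = valuesPerCUDABlock * sizeof(qreal);
        if (firstTime) {
            // launch the problem-specific kernel here, writing one partial
            // sum per CUDA block into q.firstLevelReduction (omitted)
            firstTime = 0;
        } else {
            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                q.firstLevelReduction, q.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            // ping-pong the two scratch arrays between reduction levels
            swapDouble(&(q.firstLevelReduction), &(q.secondLevelReduction));
        }
        numValuesToReduce = numValuesToReduce / maxReducedPerLevel;
    }
    qreal sum;
    cudaMemcpy(&sum, q.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
    return sum;
}

Each pass shrinks the data by a factor of REDUCE_SHARED_SIZE (512), so even very large registers need only a few levels.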

◆ densmatr_applyDiagonalOp()

void densmatr_applyDiagonalOp(Qureg qureg, DiagonalOp op)

Definition at line 3240 of file QuEST_gpu.cu.

{
    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
    densmatr_applyDiagonalOpKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, op);
}

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyDiagonalOp().

◆ densmatr_applyDiagonalOpKernel()

__global__ void densmatr_applyDiagonalOpKernel(Qureg qureg, DiagonalOp op)

Definition at line 3217 of file QuEST_gpu.cu.

{
    // each thread modifies one value; a wasteful and inefficient strategy
    long long int numTasks = qureg.numAmpsPerChunk;
    long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask >= numTasks) return;

    qreal* stateRe = qureg.deviceStateVec.real;
    qreal* stateIm = qureg.deviceStateVec.imag;
    qreal* opRe = op.deviceOperator.real;
    qreal* opIm = op.deviceOperator.imag;

    int opDim = (1 << op.numQubits);
    qreal a = stateRe[thisTask];
    qreal b = stateIm[thisTask];
    qreal c = opRe[thisTask % opDim];
    qreal d = opIm[thisTask % opDim];

    // (a + b i)(c + d i) = (a c - b d) + i (a d + b c)
    stateRe[thisTask] = a*c - b*d;
    stateIm[thisTask] = a*d + b*c;
}

References DiagonalOp::deviceOperator, Qureg::deviceStateVec, Qureg::numAmpsPerChunk, DiagonalOp::numQubits, and qreal.

◆ densmatr_calcExpecDiagonalOp()

Complex densmatr_calcExpecDiagonalOp(Qureg qureg, DiagonalOp op)

Definition at line 3412 of file QuEST_gpu.cu.

{
    /* @TODO: remove all this reduction boilerplate from QuEST GPU
     * (e.g. a func which accepts a pointer to do every-value reduction?)
     */

    qreal expecReal, expecImag;

    int getRealComp;
    long long int numValuesToReduce;
    int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
    int maxReducedPerLevel;
    int firstTime;

    // compute real component of inner product
    getRealComp = 1;
    numValuesToReduce = qureg.numAmpsPerChunk;
    maxReducedPerLevel = REDUCE_SHARED_SIZE;
    firstTime = 1;
    while (numValuesToReduce > 1) {
        if (numValuesToReduce < maxReducedPerLevel) {
            valuesPerCUDABlock = numValuesToReduce;
            numCUDABlocks = 1;
        }
        else {
            valuesPerCUDABlock = maxReducedPerLevel;
            numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
        }
        sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
        if (firstTime) {
            densmatr_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
                getRealComp,
                qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
                op.deviceOperator.real, op.deviceOperator.imag,
                op.numQubits, numValuesToReduce,
                qureg.firstLevelReduction);
            firstTime = 0;
        } else {
            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                qureg.firstLevelReduction,
                qureg.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
        }
        numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
    }
    cudaMemcpy(&expecReal, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);

    // compute imag component of inner product
    getRealComp = 0;
    numValuesToReduce = qureg.numAmpsPerChunk;
    maxReducedPerLevel = REDUCE_SHARED_SIZE;
    firstTime = 1;
    while (numValuesToReduce > 1) {
        if (numValuesToReduce < maxReducedPerLevel) {
            valuesPerCUDABlock = numValuesToReduce;
            numCUDABlocks = 1;
        }
        else {
            valuesPerCUDABlock = maxReducedPerLevel;
            numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
        }
        sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
        if (firstTime) {
            densmatr_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
                getRealComp,
                qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
                op.deviceOperator.real, op.deviceOperator.imag,
                op.numQubits, numValuesToReduce,
                qureg.firstLevelReduction);
            firstTime = 0;
        } else {
            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                qureg.firstLevelReduction,
                qureg.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
        }
        numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
    }
    cudaMemcpy(&expecImag, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);

    // return complex
    Complex expecVal;
    expecVal.real = expecReal;
    expecVal.imag = expecImag;
    return expecVal;
}

References copySharedReduceBlock(), DiagonalOp::deviceOperator, Qureg::deviceStateVec, Qureg::firstLevelReduction, Complex::imag, Qureg::numAmpsPerChunk, DiagonalOp::numQubits, qreal, Complex::real, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcExpecDiagonalOp().

◆ densmatr_calcExpecDiagonalOpKernel()

__global__ void densmatr_calcExpecDiagonalOpKernel(int getRealComp, qreal* matReal, qreal* matImag, qreal* opReal, qreal* opImag, int numQubits, long long int numTermsToSum, qreal* reducedArray)

If the thread represents a diagonal element, it computes either a real or imag term of matr_{ii} op_i; otherwise, it writes a 0 to the reduction array.

Definition at line 3367 of file QuEST_gpu.cu.

{
    // index will identify one of the 2^Q diagonals to be summed
    long long int matInd = blockIdx.x*blockDim.x + threadIdx.x;
    if (matInd >= numTermsToSum) return;

    long long int diagSpacing = (1LL << numQubits) + 1LL;
    int isDiag = ((matInd % diagSpacing) == 0);

    long long int opInd = matInd / diagSpacing;

    qreal val = 0;
    if (isDiag) {

        qreal matRe = matReal[matInd];
        qreal matIm = matImag[matInd];
        qreal opRe = opReal[opInd];
        qreal opIm = opImag[opInd];

        // (matRe + matIm i)(opRe + opIm i) =
        // (matRe opRe - matIm opIm) + i (matRe opIm + matIm opRe)
        if (getRealComp)
            val = matRe * opRe - matIm * opIm;
        else
            val = matRe * opIm + matIm * opRe;
    }

    // array of each thread's collected sum term, to be summed
    extern __shared__ qreal tempReductionArray[];
    tempReductionArray[threadIdx.x] = val;
    __syncthreads();

    // every second thread reduces
    if (threadIdx.x<blockDim.x/2)
        reduceBlock(tempReductionArray, reducedArray, blockDim.x);
}

References qreal, and reduceBlock().
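
The diagonal test exploits the fact that, in the flattened 2^Q x 2^Q density matrix, consecutive diagonal elements are exactly 2^Q + 1 flat indices apart. A worked check for numQubits = 2 (illustrative values only):

// for numQubits = 2, the 4x4 density matrix flattens to 16 amplitudes
long long int diagSpacing = (1LL << 2) + 1LL;   // = 5
// flat indices 0, 5, 10, 15 are the diagonal entries rho_00 .. rho_33
int isDiag = (10 % diagSpacing) == 0;           // 1: flat index 10 is rho_22
long long int opInd = 10 / diagSpacing;         // = 2: it pairs with op element 2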

◆ densmatr_calcFidelity()

qreal densmatr_calcFidelity(Qureg qureg, Qureg pureState)

Definition at line 2519 of file QuEST_gpu.cu.

{
    // we compute one fidelity term per density-matrix row, then sum them
    long long int densityDim = 1LL << qureg.numQubitsRepresented;
    long long int numValuesToReduce = densityDim;

    int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
    int maxReducedPerLevel = REDUCE_SHARED_SIZE;
    int firstTime = 1;

    while (numValuesToReduce > 1) {

        // need less than one CUDA-BLOCK to reduce
        if (numValuesToReduce < maxReducedPerLevel) {
            valuesPerCUDABlock = numValuesToReduce;
            numCUDABlocks = 1;
        }
        // otherwise use only full CUDA-BLOCKS
        else {
            valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
            numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
        }
        // dictates size of reduction array
        sharedMemSize = valuesPerCUDABlock*sizeof(qreal);

        // spawn threads to sum the probs in each block
        // store the reduction in the pureState array
        if (firstTime) {
            densmatr_calcFidelityKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
                qureg, pureState, densityDim, pureState.firstLevelReduction);
            firstTime = 0;

        // sum the block probs
        } else {
            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                pureState.firstLevelReduction,
                pureState.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            swapDouble(&(pureState.firstLevelReduction), &(pureState.secondLevelReduction));
        }

        numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
    }

    qreal fidelity;
    cudaMemcpy(&fidelity, pureState.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
    return fidelity;
}

References copySharedReduceBlock(), Qureg::firstLevelReduction, Qureg::numQubitsRepresented, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcFidelity().

◆ densmatr_calcFidelityKernel()

__global__ void densmatr_calcFidelityKernel(Qureg dens, Qureg vec, long long int dim, qreal* reducedArray)

Computes one term of (vec^*T) dens * vec.

Definition at line 2481 of file QuEST_gpu.cu.

{
    // figure out which density matrix row to consider
    long long int col;
    long long int row = blockIdx.x*blockDim.x + threadIdx.x;
    if (row >= dim) return;

    qreal* densReal = dens.deviceStateVec.real;
    qreal* densImag = dens.deviceStateVec.imag;
    qreal* vecReal = vec.deviceStateVec.real;
    qreal* vecImag = vec.deviceStateVec.imag;

    // compute the row-th element of the product dens*vec
    qreal prodReal = 0;
    qreal prodImag = 0;
    for (col=0LL; col < dim; col++) {
        qreal densElemReal = densReal[dim*col + row];
        qreal densElemImag = densImag[dim*col + row];

        prodReal += densElemReal*vecReal[col] - densElemImag*vecImag[col];
        prodImag += densElemReal*vecImag[col] + densElemImag*vecReal[col];
    }

    // multiply with row-th elem of (vec^*)
    qreal termReal = prodImag*vecImag[row] + prodReal*vecReal[row];

    // imag of every term should be zero, because each is a valid fidelity calc of an eigenstate
    //qreal termImag = prodImag*vecReal[row] - prodReal*vecImag[row];

    extern __shared__ qreal tempReductionArray[];
    tempReductionArray[threadIdx.x] = termReal;
    __syncthreads();

    // every second thread reduces
    if (threadIdx.x<blockDim.x/2)
        reduceBlock(tempReductionArray, reducedArray, blockDim.x);
}

References Qureg::deviceStateVec, qreal, and reduceBlock().
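
In symbols, writing psi for the pure state and rho for the density matrix (indexed as dens[dim*col + row] above), each thread of row index r contributes one real term, and the block reduction sums these to the fidelity:

\mathrm{term}_r = \operatorname{Re}\left[ \psi_r^{*} \, (\rho\psi)_r \right],
\qquad
F = \langle \psi | \rho | \psi \rangle
  = \sum_r \psi_r^{*} \sum_c \rho_{rc} \, \psi_c
  = \sum_r \mathrm{term}_r .

The imaginary parts vanish in the total for valid states, which is why the commented-out termImag is never accumulated.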

◆ densmatr_calcHilbertSchmidtDistance()

qreal densmatr_calcHilbertSchmidtDistance(Qureg a, Qureg b)

Definition at line 2593 of file QuEST_gpu.cu.

{
    // we're summing the square of every term in (a-b)
    long long int numValuesToReduce = a.numAmpsPerChunk;

    int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
    int maxReducedPerLevel = REDUCE_SHARED_SIZE;
    int firstTime = 1;

    while (numValuesToReduce > 1) {

        // need less than one CUDA-BLOCK to reduce
        if (numValuesToReduce < maxReducedPerLevel) {
            valuesPerCUDABlock = numValuesToReduce;
            numCUDABlocks = 1;
        }
        // otherwise use only full CUDA-BLOCKS
        else {
            valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
            numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
        }
        // dictates size of reduction array
        sharedMemSize = valuesPerCUDABlock*sizeof(qreal);

        // spawn threads to sum the probs in each block (store reduction temp values in a's reduction array)
        if (firstTime) {
            densmatr_calcHilbertSchmidtDistanceSquaredKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
                a.deviceStateVec.real, a.deviceStateVec.imag,
                b.deviceStateVec.real, b.deviceStateVec.imag,
                numValuesToReduce, a.firstLevelReduction);
            firstTime = 0;

        // sum the block probs
        } else {
            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                a.firstLevelReduction,
                a.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            swapDouble(&(a.firstLevelReduction), &(a.secondLevelReduction));
        }

        numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
    }

    qreal trace;
    cudaMemcpy(&trace, a.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);

    qreal sqrtTrace = sqrt(trace);
    return sqrtTrace;
}

References copySharedReduceBlock(), Qureg::deviceStateVec, Qureg::firstLevelReduction, Qureg::numAmpsPerChunk, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcHilbertSchmidtDistance().

◆ densmatr_calcHilbertSchmidtDistanceSquaredKernel()

__global__ void densmatr_calcHilbertSchmidtDistanceSquaredKernel ( qreal aRe,
qreal aIm,
qreal bRe,
qreal bIm,
long long int  numAmpsToSum,
qreal reducedArray 
)

Definition at line 2569 of file QuEST_gpu.cu.

2572  {
2573  // figure out which density matrix term this thread is assigned
2574  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2575  if (index >= numAmpsToSum) return;
2576 
2577  // compute this thread's sum term
2578  qreal difRe = aRe[index] - bRe[index];
2579  qreal difIm = aIm[index] - bIm[index];
2580  qreal term = difRe*difRe + difIm*difIm;
2581 
2582  // array of each thread's collected term, to be summed
2583  extern __shared__ qreal tempReductionArray[];
2584  tempReductionArray[threadIdx.x] = term;
2585  __syncthreads();
2586 
2587  // every second thread reduces
2588  if (threadIdx.x<blockDim.x/2)
2589  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2590 }

References qreal, and reduceBlock().
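
In symbols, the kernel terms sum to the squared Hilbert-Schmidt distance; the host function above takes the square root of the reduced total:

D_{\mathrm{HS}}(a,b)
  = \sqrt{ \operatorname{Tr}\left[ (a-b)^\dagger (a-b) \right] }
  = \sqrt{ \sum_{i,j} \left| a_{ij} - b_{ij} \right|^2 } .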

◆ densmatr_calcInnerProduct()

qreal densmatr_calcInnerProduct(Qureg a, Qureg b)

Definition at line 2313 of file QuEST_gpu.cu.

{
    // we're summing a_ij^* b_ij over every term of the density matrices
    long long int numValuesToReduce = a.numAmpsTotal;

    int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
    int maxReducedPerLevel = REDUCE_SHARED_SIZE;
    int firstTime = 1;

    while (numValuesToReduce > 1) {

        // need less than one CUDA-BLOCK to reduce
        if (numValuesToReduce < maxReducedPerLevel) {
            valuesPerCUDABlock = numValuesToReduce;
            numCUDABlocks = 1;
        }
        // otherwise use only full CUDA-BLOCKS
        else {
            valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
            numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
        }
        // dictates size of reduction array
        sharedMemSize = valuesPerCUDABlock*sizeof(qreal);

        // spawn threads to sum the terms in each block
        // arbitrarily store the reduction in the b qureg's array
        if (firstTime) {
            densmatr_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
                a, b, a.numAmpsTotal, b.firstLevelReduction);
            firstTime = 0;
        }
        // sum the block terms
        else {
            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                b.firstLevelReduction,
                b.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            swapDouble(&(b.firstLevelReduction), &(b.secondLevelReduction));
        }

        numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
    }

    qreal innerprod;
    cudaMemcpy(&innerprod, b.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
    return innerprod;
}

References copySharedReduceBlock(), Qureg::firstLevelReduction, Qureg::numAmpsTotal, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcDensityInnerProduct().

◆ densmatr_calcInnerProductKernel()

__global__ void densmatr_calcInnerProductKernel(Qureg a, Qureg b, long long int numTermsToSum, qreal* reducedArray)

Computes Tr(conjTrans(a) b) = sum of (a_ij^* b_ij), which is a real number.

Definition at line 2292 of file QuEST_gpu.cu.

{
    long long int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index >= numTermsToSum) return;

    // Re{ conj(a) b } = Re{ (aRe - i aIm)(bRe + i bIm) } = aRe bRe + aIm bIm
    qreal prod = (
        a.deviceStateVec.real[index]*b.deviceStateVec.real[index]
        + a.deviceStateVec.imag[index]*b.deviceStateVec.imag[index]);

    // array of each thread's collected sum term, to be summed
    extern __shared__ qreal tempReductionArray[];
    tempReductionArray[threadIdx.x] = prod;
    __syncthreads();

    // every second thread reduces
    if (threadIdx.x<blockDim.x/2)
        reduceBlock(tempReductionArray, reducedArray, blockDim.x);
}

References Qureg::deviceStateVec, qreal, and reduceBlock().
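
In symbols, for Hermitian a and b the trace is real, so each thread accumulates only the real part of its elementwise product:

\operatorname{Tr}\!\left( a^\dagger b \right) = \sum_{i,j} a_{ij}^{*} \, b_{ij},
\qquad
\operatorname{Re}\!\left[ a_{ij}^{*} b_{ij} \right]
  = \operatorname{Re}(a_{ij})\operatorname{Re}(b_{ij})
  + \operatorname{Im}(a_{ij})\operatorname{Im}(b_{ij}) .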

◆ densmatr_calcProbOfAllOutcomes()

void densmatr_calcProbOfAllOutcomes ( qreal outcomeProbs,
Qureg  qureg,
int *  qubits,
int  numQubits 
)

Definition at line 2259 of file QuEST_gpu.cu.

2259  {
2260 
2261  // copy qubits to GPU memory
2262  int* d_qubits;
2263  size_t mem_qubits = numQubits * sizeof *d_qubits;
2264  cudaMalloc(&d_qubits, mem_qubits);
2265  cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
2266 
2267  // create global array, with per-block subarrays
2268  int numThreadsPerBlock = 128;
2269  int numDiags = (1LL << qureg.numQubitsRepresented);
2270  int numBlocks = ceil(numDiags / (qreal) numThreadsPerBlock);
2271 
2272  // create global GPU array for outcomeProbs
2273  qreal* d_outcomeProbs;
2274  long long int numOutcomes = (1LL << numQubits);
2275  size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs;
2276  cudaMalloc(&d_outcomeProbs, mem_outcomeProbs);
2277  cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs);
2278 
2279  // populate per-block subarrays
2280  densmatr_calcProbOfAllOutcomesKernel<<<numBlocks, numThreadsPerBlock>>>(
2281  d_outcomeProbs, qureg, d_qubits, numQubits);
2282 
2283  // copy outcomeProbs from GPU memory
2284  cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost);
2285 
2286  // free GPU memory
2287  cudaFree(d_qubits);
2288  cudaFree(d_outcomeProbs);
2289 }

References Qureg::numQubitsRepresented, and qreal.

Referenced by calcProbOfAllOutcomes().

◆ densmatr_calcProbOfAllOutcomesKernel()

__global__ void densmatr_calcProbOfAllOutcomesKernel ( qreal outcomeProbs,
Qureg  qureg,
int *  qubits,
int  numQubits 
)

Definition at line 2238 of file QuEST_gpu.cu.

2240  {
2241  // each thread handles one diagonal amplitude
2242  long long int diagInd = blockIdx.x*blockDim.x + threadIdx.x;
2243  long long int numDiags = (1LL << qureg.numQubitsRepresented);
2244  if (diagInd >= numDiags) return;
2245 
2246  long long int flatInd = (1 + numDiags)*diagInd;
2247  qreal prob = qureg.deviceStateVec.real[flatInd]; // im[flatInd] assumed ~ 0
2248 
2249  // each diagonal amplitude contributes to one outcome
2250  long long int outcomeInd = 0;
2251  for (int q=0; q<numQubits; q++)
2252  outcomeInd += extractBit(qubits[q], diagInd) * (1LL << q);
2253 
2254  // each thread atomically writes directly to the global output.
2255  // this beat block-heirarchal atomic reductions in both global and shared memory!
2256  atomicAdd(&outcomeProbs[outcomeInd], prob);
2257 }

References Qureg::deviceStateVec, extractBit(), Qureg::numQubitsRepresented, and qreal.
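
A worked example of the outcome-index mapping (illustrative re-computation, not QuEST code): for qubits = {0, 2} and diagonal basis state diagInd = 0b110, qubit 0 holds 0 and qubit 2 holds 1, so this amplitude's probability is atomically added to outcomeProbs[2]:

long long int diagInd = 0b110;
long long int outcomeInd = 0;
outcomeInd += ((diagInd >> 0) & 1) * (1LL << 0);   // qubit 0 supplies outcome bit 0 (adds 0)
outcomeInd += ((diagInd >> 2) & 1) * (1LL << 1);   // qubit 2 supplies outcome bit 1 (adds 2)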

◆ densmatr_calcProbOfOutcome()

qreal densmatr_calcProbOfOutcome(Qureg qureg, int measureQubit, int outcome)

Definition at line 2158 of file QuEST_gpu.cu.

{
    qreal outcomeProb = densmatr_findProbabilityOfZero(qureg, measureQubit);
    if (outcome==1)
        outcomeProb = 1.0 - outcomeProb;
    return outcomeProb;
}

References densmatr_findProbabilityOfZero(), and qreal.

Referenced by calcProbOfOutcome(), collapseToOutcome(), and densmatr_measureWithStats().

◆ densmatr_calcPurity()

qreal densmatr_calcPurity(Qureg qureg)

Computes the trace of the density matrix squared.

Definition at line 2664 of file QuEST_gpu.cu.

{
    // we're summing the square of every term in the density matrix
    long long int numValuesToReduce = qureg.numAmpsPerChunk;

    int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
    int maxReducedPerLevel = REDUCE_SHARED_SIZE;
    int firstTime = 1;

    while (numValuesToReduce > 1) {

        // need less than one CUDA-BLOCK to reduce
        if (numValuesToReduce < maxReducedPerLevel) {
            valuesPerCUDABlock = numValuesToReduce;
            numCUDABlocks = 1;
        }
        // otherwise use only full CUDA-BLOCKS
        else {
            valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
            numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
        }
        // dictates size of reduction array
        sharedMemSize = valuesPerCUDABlock*sizeof(qreal);

        // spawn threads to sum the probs in each block
        if (firstTime) {
            densmatr_calcPurityKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
                qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
                numValuesToReduce, qureg.firstLevelReduction);
            firstTime = 0;

        // sum the block probs
        } else {
            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                qureg.firstLevelReduction,
                qureg.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
        }

        numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
    }

    qreal traceDensSquared;
    cudaMemcpy(&traceDensSquared, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
    return traceDensSquared;
}

References copySharedReduceBlock(), Qureg::deviceStateVec, Qureg::firstLevelReduction, Qureg::numAmpsPerChunk, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcPurity().

◆ densmatr_calcPurityKernel()

__global__ void densmatr_calcPurityKernel ( qreal vecReal,
qreal vecImag,
long long int  numAmpsToSum,
qreal reducedArray 
)

Definition at line 2645 of file QuEST_gpu.cu.

2645  {
2646 
2647  // figure out which density matrix term this thread is assigned
2648  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2649  if (index >= numAmpsToSum) return;
2650 
2651  qreal term = vecReal[index]*vecReal[index] + vecImag[index]*vecImag[index];
2652 
2653  // array of each thread's collected probability, to be summed
2654  extern __shared__ qreal tempReductionArray[];
2655  tempReductionArray[threadIdx.x] = term;
2656  __syncthreads();
2657 
2658  // every second thread reduces
2659  if (threadIdx.x<blockDim.x/2)
2660  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2661 }

References qreal, and reduceBlock().
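
In symbols, Hermiticity (rho_{ji} = rho_{ij}^*) is what lets the purity reduce to a sum of absolute squares over every element, one term per kernel thread:

\operatorname{Tr}\!\left( \rho^2 \right)
  = \sum_{i,j} \rho_{ij} \, \rho_{ji}
  = \sum_{i,j} \rho_{ij} \, \rho_{ij}^{*}
  = \sum_{i,j} \left| \rho_{ij} \right|^2 .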

◆ densmatr_calcTotalProb()

qreal densmatr_calcTotalProb(Qureg qureg)

Definition at line 1632 of file QuEST_gpu.cu.

{
    // computes the trace using Kahan summation
    qreal pTotal=0;
    qreal y, t, c;
    c = 0;

    long long int numCols = 1LL << qureg.numQubitsRepresented;
    long long diagIndex;

    copyStateFromGPU(qureg);

    for (int col=0; col< numCols; col++) {
        diagIndex = col*(numCols + 1);
        y = qureg.stateVec.real[diagIndex] - c;
        t = pTotal + y;
        c = ( t - pTotal ) - y; // brackets are important
        pTotal = t;
    }

    return pTotal;
}

References copyStateFromGPU(), Qureg::numQubitsRepresented, qreal, and Qureg::stateVec.

Referenced by calcTotalProb(), and statevec_calcExpecPauliProd().

◆ densmatr_collapseToKnownProbOutcome()

void densmatr_collapseToKnownProbOutcome(Qureg qureg, int measureQubit, int outcome, qreal outcomeProb)

This involves finding |...i...><...j...| states and killing those where i!=j.

Renorms (/prob) every | * outcome * >< * outcome * | state, setting all others to zero.

Definition at line 2805 of file QuEST_gpu.cu.

{
    int rowQubit = measureQubit + qureg.numQubitsRepresented;

    int colBit = 1LL << measureQubit;
    int rowBit = 1LL << rowQubit;

    long long int numBasesToVisit = qureg.numAmpsPerChunk/4;
    long long int part1 = colBit -1;
    long long int part2 = (rowBit >> 1) - colBit;
    long long int part3 = numBasesToVisit - (rowBit >> 1);

    long long int desired, undesired;
    if (outcome == 0) {
        desired = 0;
        undesired = colBit | rowBit;
    } else {
        desired = colBit | rowBit;
        undesired = 0;
    }

    int threadsPerCUDABlock, CUDABlocks;
    threadsPerCUDABlock = 128;
    CUDABlocks = ceil(numBasesToVisit / (qreal) threadsPerCUDABlock);
    densmatr_collapseToKnownProbOutcomeKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
        outcomeProb, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numBasesToVisit,
        part1, part2, part3, rowBit, colBit, desired, undesired);
}

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by applyProjector(), collapseToOutcome(), and densmatr_measureWithStats().

◆ densmatr_collapseToKnownProbOutcomeKernel()

__global__ void densmatr_collapseToKnownProbOutcomeKernel(qreal outcomeProb, qreal* vecReal, qreal* vecImag, long long int numBasesToVisit, long long int part1, long long int part2, long long int part3, long long int rowBit, long long int colBit, long long int desired, long long int undesired)

Maps thread ID to a |..0..><..0..| state and then locates |0><1|, |1><0| and |1><1|.

Definition at line 2779 of file QuEST_gpu.cu.

{
    long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
    if (scanInd >= numBasesToVisit) return;

    long long int base = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);

    // renormalise desired outcome
    vecReal[base + desired] /= outcomeProb;
    vecImag[base + desired] /= outcomeProb;

    // kill undesired outcome
    vecReal[base + undesired] = 0;
    vecImag[base + undesired] = 0;

    // kill |..0..><..1..| states
    vecReal[base + colBit] = 0;
    vecImag[base + colBit] = 0;
    vecReal[base + rowBit] = 0;
    vecImag[base + rowBit] = 0;
}
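
The part1/part2/part3 masks implement a two-bit "insert zero bits" step: each thread's scanInd is expanded so that zeros appear at the column and row bit positions of measureQubit, landing on a |..0..><..0..| element. An illustrative evaluation for measureQubit = 1 in a 3-qubit density matrix (so colBit = 2, rowBit = 16, part1 = 1, part2 = 6, part3 = 8; values chosen for demonstration only):

long long int scanInd = 0b1011;                 // an arbitrary thread index < 16
long long int base = (scanInd & 1)              // keep bit 0
                   + ((scanInd & 6) << 1)       // shift bits 1-2 past the column bit
                   + ((scanInd & 8) << 2);      // shift bit 3 past the row bit too
// base = 0b100101 = 37: bits 1 (colBit) and 4 (rowBit) are zero, as required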

◆ densmatr_findProbabilityOfZero()

qreal densmatr_findProbabilityOfZero(Qureg qureg, int measureQubit)

Definition at line 2064 of file QuEST_gpu.cu.

{
    long long int densityDim = 1LL << qureg.numQubitsRepresented;
    long long int numValuesToReduce = densityDim >> 1; // half of the diagonal has measureQubit=0

    int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
    int maxReducedPerLevel = REDUCE_SHARED_SIZE;
    int firstTime = 1;

    while (numValuesToReduce > 1) {

        // need less than one CUDA-BLOCK to reduce
        if (numValuesToReduce < maxReducedPerLevel) {
            valuesPerCUDABlock = numValuesToReduce;
            numCUDABlocks = 1;
        }
        // otherwise use only full CUDA-BLOCKS
        else {
            valuesPerCUDABlock = maxReducedPerLevel; // constrained by shared memory
            numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
        }

        sharedMemSize = valuesPerCUDABlock*sizeof(qreal);

        // spawn threads to sum the probs in each block
        if (firstTime) {
            densmatr_findProbabilityOfZeroKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
                qureg, measureQubit, qureg.firstLevelReduction);
            firstTime = 0;

        // sum the block probs
        } else {
            cudaDeviceSynchronize();
            copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
                qureg.firstLevelReduction,
                qureg.secondLevelReduction, valuesPerCUDABlock);
            cudaDeviceSynchronize();
            swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
        }

        numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
    }

    qreal zeroProb;
    cudaMemcpy(&zeroProb, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
    return zeroProb;
}

References copySharedReduceBlock(), Qureg::firstLevelReduction, Qureg::numQubitsRepresented, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by densmatr_calcProbOfOutcome().

◆ densmatr_findProbabilityOfZeroKernel()

__global__ void densmatr_findProbabilityOfZeroKernel(Qureg qureg, int measureQubit, qreal* reducedArray)

Definition at line 1960 of file QuEST_gpu.cu.

{
    // run by each thread
    // use of block here refers to contiguous amplitudes where measureQubit = 0,
    // (then =1) and NOT the CUDA block, which is the partitioning of CUDA threads

    long long int densityDim = 1LL << qureg.numQubitsRepresented;
    long long int numTasks = densityDim >> 1;
    long long int sizeHalfBlock = 1LL << (measureQubit);
    long long int sizeBlock = 2LL * sizeHalfBlock;

    long long int thisBlock;    // which block this thread is processing
    long long int thisTask;     // which part of the block this thread is processing
    long long int basisIndex;   // index of this thread's computational basis state
    long long int densityIndex; // " " index of |basis><basis| in the flat density matrix

    // array of each thread's collected probability, to be summed
    extern __shared__ qreal tempReductionArray[];

    // figure out which density matrix prob this thread is assigned
    thisTask = blockIdx.x*blockDim.x + threadIdx.x;
    if (thisTask>=numTasks) return;
    thisBlock = thisTask / sizeHalfBlock;
    basisIndex = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
    densityIndex = (densityDim + 1) * basisIndex;

    // record the probability in the CUDA-BLOCK-wide array
    qreal prob = qureg.deviceStateVec.real[densityIndex]; // im[densityIndex] assumed ~ 0
    tempReductionArray[threadIdx.x] = prob;

    // sum the probs collected by this CUDA-BLOCK's threads into a per-CUDA-BLOCK array
    __syncthreads();
    if (threadIdx.x<blockDim.x/2){
        reduceBlock(tempReductionArray, reducedArray, blockDim.x);
    }
}

References Qureg::deviceStateVec, Qureg::numQubitsRepresented, qreal, and reduceBlock().
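
The kernel's index arithmetic can be checked against a serial sketch (a hypothetical host-side helper, assuming row-major flat storage and a non-distributed Qureg):

    // sums the diagonal entries |basis><basis| whose basis state has measureQubit = 0
    qreal sumZeroDiagonal_sketch(qreal* re, long long int densityDim, int measureQubit) {
        long long int sizeHalfBlock = 1LL << measureQubit;
        qreal zeroProb = 0;
        for (long long int task = 0; task < densityDim/2; task++) {
            long long int block = task / sizeHalfBlock;
            long long int basis = block*2*sizeHalfBlock + task%sizeHalfBlock; // bit measureQubit is 0
            zeroProb += re[(densityDim + 1) * basis];  // flat index of the diagonal element
        }
        return zeroProb;
    }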

◆ densmatr_initClassicalState()

void densmatr_initClassicalState ( Qureg  qureg,
long long int  stateInd 
)

Definition at line 258 of file QuEST_gpu.cu.

259 {
260  int threadsPerCUDABlock, CUDABlocks;
261  threadsPerCUDABlock = 128;
262  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
263 
264  // index of the desired state in the flat density matrix
265  long long int densityDim = 1LL << qureg.numQubitsRepresented;
266  long long int densityInd = (densityDim + 1)*stateInd;
267 
268  // identical to pure version
269  densmatr_initClassicalStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
270  qureg.numAmpsPerChunk,
271  qureg.deviceStateVec.real,
272  qureg.deviceStateVec.imag, densityInd);
273 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by initClassicalState().
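
A quick worked example of the diagonal index (illustrative values):

    long long int densityDim = 1LL << 2;             // 2-qubit density matrix has dim 4
    long long int densityInd = (densityDim + 1) * 3; // |3><3| sits at flat index 15 of 16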

◆ densmatr_initClassicalStateKernel()

__global__ void densmatr_initClassicalStateKernel ( long long int  densityNumElems,
qreal densityReal,
qreal densityImag,
long long int  densityInd 
)

Definition at line 239 of file QuEST_gpu.cu.

243 {
244  // initialise the state to all zeros
245  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
246  if (index >= densityNumElems) return;
247 
248  densityReal[index] = 0.0;
249  densityImag[index] = 0.0;
250 
251  if (index==densityInd){
252  // classical state has probability 1
253  densityReal[densityInd] = 1.0;
254  densityImag[densityInd] = 0.0;
255  }
256 }

◆ densmatr_initPlusState()

void densmatr_initPlusState ( Qureg  qureg)

Definition at line 226 of file QuEST_gpu.cu.

227 {
228  qreal probFactor = 1.0/((qreal) (1LL << qureg.numQubitsRepresented));
229  int threadsPerCUDABlock, CUDABlocks;
230  threadsPerCUDABlock = 128;
231  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
232  densmatr_initPlusStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
233  qureg.numAmpsPerChunk,
234  probFactor,
235  qureg.deviceStateVec.real,
236  qureg.deviceStateVec.imag);
237 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by initPlusState().

◆ densmatr_initPlusStateKernel()

__global__ void densmatr_initPlusStateKernel ( long long int  stateVecSize,
qreal  probFactor,
qreal stateVecReal,
qreal stateVecImag 
)

Definition at line 216 of file QuEST_gpu.cu.

216  {
217  long long int index;
218 
219  index = blockIdx.x*blockDim.x + threadIdx.x;
220  if (index>=stateVecSize) return;
221 
222  stateVecReal[index] = probFactor;
223  stateVecImag[index] = 0.0;
224 }

◆ densmatr_initPureState()

void densmatr_initPureState ( Qureg  targetQureg,
Qureg  copyQureg 
)

Definition at line 205 of file QuEST_gpu.cu.

206 {
207  int threadsPerCUDABlock, CUDABlocks;
208  threadsPerCUDABlock = 128;
209  CUDABlocks = ceil((qreal)(copyQureg.numAmpsPerChunk)/threadsPerCUDABlock);
210  densmatr_initPureStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
211  copyQureg.numAmpsPerChunk,
212  targetQureg.deviceStateVec.real, targetQureg.deviceStateVec.imag,
213  copyQureg.deviceStateVec.real, copyQureg.deviceStateVec.imag);
214 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initPureState().

◆ densmatr_initPureStateKernel()

__global__ void densmatr_initPureStateKernel ( long long int  numPureAmps,
qreal targetVecReal,
qreal targetVecImag,
qreal copyVecReal,
qreal copyVecImag 
)

Definition at line 186 of file QuEST_gpu.cu.

190 {
191  // this is a particular index of the pure copyQureg
192  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
193  if (index>=numPureAmps) return;
194 
195  qreal realRow = copyVecReal[index];
196  qreal imagRow = copyVecImag[index];
197  for (long long int col=0; col < numPureAmps; col++) {
198  qreal realCol = copyVecReal[col];
199  qreal imagCol = - copyVecImag[col]; // minus for conjugation
200  targetVecReal[col*numPureAmps + index] = realRow*realCol - imagRow*imagCol;
201  targetVecImag[col*numPureAmps + index] = realRow*imagCol + imagRow*realCol;
202  }
203 }

References qreal.
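
The kernel forms the outer product rho = |psi><psi|; an equivalent serial sketch (a hypothetical helper, assuming the kernel's flat layout col*dim + row):

    void outerProduct_sketch(qreal* tRe, qreal* tIm, const qreal* re, const qreal* im, long long int dim) {
        for (long long int row = 0; row < dim; row++)
            for (long long int col = 0; col < dim; col++) {
                // rho[row][col] = psi[row] * conj(psi[col])
                tRe[col*dim + row] = re[row]*re[col] + im[row]*im[col];
                tIm[col*dim + row] = im[row]*re[col] - re[row]*im[col];
            }
    }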

◆ densmatr_mixDamping()

void densmatr_mixDamping ( Qureg  qureg,
int  targetQubit,
qreal  damping 
)

Definition at line 3048 of file QuEST_gpu.cu.

3048  {
3049 
3050  if (damping == 0)
3051  return;
3052 
3053  qreal dephase = sqrt(1-damping);
3054  densmatr_oneQubitDegradeOffDiagonal(qureg, targetQubit, dephase);
3055 
3056  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
3057  int rowQubit = targetQubit + qureg.numQubitsRepresented;
3058 
3059  long long int colBit = 1LL << targetQubit;
3060  long long int rowBit = 1LL << rowQubit;
3061  long long int bothBits = colBit | rowBit;
3062 
3063  long long int part1 = colBit - 1;
3064  long long int part2 = (rowBit >> 1) - colBit;
3065  long long int part3 = numAmpsToVisit - (rowBit >> 1);
3066 
3067  int threadsPerCUDABlock, CUDABlocks;
3068  threadsPerCUDABlock = 128;
3069  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
3070  densmatr_mixDampingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
3071  damping, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
3072  part1, part2, part3, bothBits);
3073 }

References densmatr_oneQubitDegradeOffDiagonal(), Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by mixDamping().
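
A minimal usage sketch via the public API (assuming a GPU build; all calls below are standard QuEST API):

    QuESTEnv env = createQuESTEnv();
    Qureg rho = createDensityQureg(2, env);
    initPlusState(rho);
    mixDamping(rho, 0, 0.1);   // amplitude-damp qubit 0 with probability 0.1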

◆ densmatr_mixDampingKernel()

__global__ void densmatr_mixDampingKernel ( qreal  damping,
qreal vecReal,
qreal vecImag,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  bothBits 
)

Transfers a damping-weighted portion of each |..1..><..1..| diagonal element into the corresponding |..0..><..0..| element; the accompanying off-diagonal decay is applied separately via densmatr_oneQubitDegradeOffDiagonal().

Definition at line 3001 of file QuEST_gpu.cu.

3005 {
3006  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
3007  if (scanInd >= numAmpsToVisit) return;
3008 
3009  long long int baseInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
3010  long long int targetInd = baseInd + bothBits;
3011 
3012  qreal realAvDepol = damping * ( vecReal[targetInd]);
3013  qreal imagAvDepol = damping * ( vecImag[targetInd]);
3014 
3015  vecReal[targetInd] *= 1 - damping;
3016  vecImag[targetInd] *= 1 - damping;
3017 
3018  vecReal[baseInd] += realAvDepol;
3019  vecImag[baseInd] += imagAvDepol;
3020 }

References qreal.

◆ densmatr_mixDensityMatrix()

void densmatr_mixDensityMatrix ( Qureg  combineQureg,
qreal  otherProb,
Qureg  otherQureg 
)

Definition at line 2846 of file QuEST_gpu.cu.

2846  {
2847 
2848  long long int numAmpsToVisit = combineQureg.numAmpsPerChunk;
2849 
2850  int threadsPerCUDABlock, CUDABlocks;
2851  threadsPerCUDABlock = 128;
2852  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2853  densmatr_mixDensityMatrixKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2854  combineQureg, otherProb, otherQureg, numAmpsToVisit
2855  );
2856 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by mixDensityMatrix().
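
The kernel applies the convex combination rho <- (1-p) rho + p sigma element-wise. A usage sketch (assuming rho and sigma are density-matrix Quregs of equal dimension):

    mixDensityMatrix(rho, 0.3, sigma);   // rho becomes 0.7*rho + 0.3*sigma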

◆ densmatr_mixDensityMatrixKernel()

__global__ void densmatr_mixDensityMatrixKernel ( Qureg  combineQureg,
qreal  otherProb,
Qureg  otherQureg,
long long int  numAmpsToVisit 
)

Definition at line 2834 of file QuEST_gpu.cu.

2834  {
2835 
2836  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
2837  if (ampInd >= numAmpsToVisit) return;
2838 
2839  combineQureg.deviceStateVec.real[ampInd] *= 1-otherProb;
2840  combineQureg.deviceStateVec.imag[ampInd] *= 1-otherProb;
2841 
2842  combineQureg.deviceStateVec.real[ampInd] += otherProb*otherQureg.deviceStateVec.real[ampInd];
2843  combineQureg.deviceStateVec.imag[ampInd] += otherProb*otherQureg.deviceStateVec.imag[ampInd];
2844 }

References Qureg::deviceStateVec.

◆ densmatr_mixDephasing()

void densmatr_mixDephasing ( Qureg  qureg,
int  targetQubit,
qreal  dephase 
)

Definition at line 2899 of file QuEST_gpu.cu.

2899  {
2900 
2901  if (dephase == 0)
2902  return;
2903 
2904  qreal dephFac = 1 - dephase;
2905  densmatr_oneQubitDegradeOffDiagonal(qureg, targetQubit, dephFac);
2906 }

References densmatr_oneQubitDegradeOffDiagonal(), and qreal.

Referenced by densmatr_mixDepolarising(), and mixDephasing().

◆ densmatr_mixDephasingKernel()

__global__ void densmatr_mixDephasingKernel ( qreal  fac,
qreal vecReal,
qreal vecImag,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  colBit,
long long int  rowBit 
)

Called once for every 4 amplitudes in the density matrix. Works by establishing the |..0..><..0..| state (for its given index), then visiting |..1..><..0..| and |..0..><..1..|.

Labels |part1 X pa><rt2 NOT(X) part3|. From the brain of Simon Benjamin.

Definition at line 2863 of file QuEST_gpu.cu.

2867 {
2868  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2869  if (scanInd >= numAmpsToVisit) return;
2870 
2871  long long int ampInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2872  vecReal[ampInd + colBit] *= fac;
2873  vecImag[ampInd + colBit] *= fac;
2874  vecReal[ampInd + rowBit] *= fac;
2875  vecImag[ampInd + rowBit] *= fac;
2876 }

◆ densmatr_mixDepolarising()

void densmatr_mixDepolarising ( Qureg  qureg,
int  targetQubit,
qreal  depolLevel 
)

Definition at line 3022 of file QuEST_gpu.cu.

3022  {
3023 
3024  if (depolLevel == 0)
3025  return;
3026 
3027  densmatr_mixDephasing(qureg, targetQubit, depolLevel);
3028 
3029  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
3030  int rowQubit = targetQubit + qureg.numQubitsRepresented;
3031 
3032  long long int colBit = 1LL << targetQubit;
3033  long long int rowBit = 1LL << rowQubit;
3034  long long int bothBits = colBit | rowBit;
3035 
3036  long long int part1 = colBit - 1;
3037  long long int part2 = (rowBit >> 1) - colBit;
3038  long long int part3 = numAmpsToVisit - (rowBit >> 1);
3039 
3040  int threadsPerCUDABlock, CUDABlocks;
3041  threadsPerCUDABlock = 128;
3042  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
3043  densmatr_mixDepolarisingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
3044  depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
3045  part1, part2, part3, bothBits);
3046 }

References densmatr_mixDephasing(), Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by mixDepolarising().

◆ densmatr_mixDepolarisingKernel()

__global__ void densmatr_mixDepolarisingKernel ( qreal  depolLevel,
qreal vecReal,
qreal vecImag,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  bothBits 
)

Works like mixDephasing but modifies every other element, and elements are averaged in pairs.

Definition at line 2975 of file QuEST_gpu.cu.

2979 {
2980  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
2981  if (scanInd >= numAmpsToVisit) return;
2982 
2983  long long int baseInd = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2);
2984  long long int targetInd = baseInd + bothBits;
2985 
2986  qreal realAvDepol = depolLevel * 0.5 * (vecReal[baseInd] + vecReal[targetInd]);
2987  qreal imagAvDepol = depolLevel * 0.5 * (vecImag[baseInd] + vecImag[targetInd]);
2988 
2989  vecReal[baseInd] *= 1 - depolLevel;
2990  vecImag[baseInd] *= 1 - depolLevel;
2991  vecReal[targetInd] *= 1 - depolLevel;
2992  vecImag[targetInd] *= 1 - depolLevel;
2993 
2994  vecReal[baseInd] += realAvDepol;
2995  vecImag[baseInd] += imagAvDepol;
2996  vecReal[targetInd] += realAvDepol;
2997  vecImag[targetInd] += imagAvDepol;
2998 }

References qreal.

◆ densmatr_mixTwoQubitDephasing()

void densmatr_mixTwoQubitDephasing ( Qureg  qureg,
int  qubit1,
int  qubit2,
qreal  dephase 
)

Definition at line 2938 of file QuEST_gpu.cu.

2938  {
2939 
2940  if (dephase == 0)
2941  return;
2942 
2943  // assumes qubit2 > qubit1
2944 
2945  int rowQubit1 = qubit1 + qureg.numQubitsRepresented;
2946  int rowQubit2 = qubit2 + qureg.numQubitsRepresented;
2947 
2948  long long int colBit1 = 1LL << qubit1;
2949  long long int rowBit1 = 1LL << rowQubit1;
2950  long long int colBit2 = 1LL << qubit2;
2951  long long int rowBit2 = 1LL << rowQubit2;
2952 
2953  long long int part1 = colBit1 - 1;
2954  long long int part2 = (colBit2 >> 1) - colBit1;
2955  long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1);
2956  long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2);
2957  long long int part5 = (qureg.numAmpsPerChunk/16) - (rowBit2 >> 3);
2958  qreal dephFac = 1 - dephase;
2959 
2960  // refers to states |a 0 b 0 c><d 0 e 0 f| (target qubits are fixed)
2961  long long int numBackgroundStates = qureg.numAmpsPerChunk/16;
2962 
2963  // 12 of these states experience dephasing
2964  long long int numAmpsToVisit = 12 * numBackgroundStates;
2965 
2966  int threadsPerCUDABlock, CUDABlocks;
2967  threadsPerCUDABlock = 128;
2968  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2969  densmatr_mixTwoQubitDephasingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2970  dephFac, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numBackgroundStates, numAmpsToVisit,
2971  part1, part2, part3, part4, part5, colBit1, rowBit1, colBit2, rowBit2);
2972 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by densmatr_mixTwoQubitDepolarising(), and mixTwoQubitDephasing().

◆ densmatr_mixTwoQubitDephasingKernel()

__global__ void densmatr_mixTwoQubitDephasingKernel ( qreal  fac,
qreal vecReal,
qreal vecImag,
long long int  numBackgroundStates,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  part4,
long long int  part5,
long long int  colBit1,
long long int  rowBit1,
long long int  colBit2,
long long int  rowBit2 
)

Called 12 times for every 16 amplitudes in the density matrix. Each sums from the |..0..0..><..0..0..| index to visit either |..0..0..><..0..1..|, |..0..0..><..1..0..|, |..0..0..><..1..1..|, |..0..1..><..0..0..| etc., and so on up to |..1..1..><..1..0..|.

Labels |part1 0 part2 0 par><t3 0 part4 0 part5|. From the brain of Simon Benjamin

Definition at line 2914 of file QuEST_gpu.cu.

2918 {
2919  long long int outerInd = blockIdx.x*blockDim.x + threadIdx.x;
2920  if (outerInd >= numAmpsToVisit) return;
2921 
2922  // sets meta in 1...14 excluding 5, 10, creating bit string DCBA for |..D..C..><..B..A|
2923  int meta = 1 + (outerInd/numBackgroundStates);
2924  if (meta > 4) meta++;
2925  if (meta > 9) meta++;
2926 
2927  long long int shift = rowBit2*((meta>>3)%2) + rowBit1*((meta>>2)%2) + colBit2*((meta>>1)%2) + colBit1*(meta%2);
2928  long long int scanInd = outerInd % numBackgroundStates;
2929  long long int stateInd = (
2930  shift +
2931  (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4));
2932 
2933  vecReal[stateInd] *= fac;
2934  vecImag[stateInd] *= fac;
2935 }

◆ densmatr_mixTwoQubitDepolarising()

void densmatr_mixTwoQubitDepolarising ( Qureg  qureg,
int  qubit1,
int  qubit2,
qreal  depolLevel 
)

Definition at line 3108 of file QuEST_gpu.cu.

3108  {
3109 
3110  if (depolLevel == 0)
3111  return;
3112 
3113  // assumes qubit2 > qubit1
3114 
3115  densmatr_mixTwoQubitDephasing(qureg, qubit1, qubit2, depolLevel);
3116 
3117  int rowQubit1 = qubit1 + qureg.numQubitsRepresented;
3118  int rowQubit2 = qubit2 + qureg.numQubitsRepresented;
3119 
3120  long long int colBit1 = 1LL << qubit1;
3121  long long int rowBit1 = 1LL << rowQubit1;
3122  long long int colBit2 = 1LL << qubit2;
3123  long long int rowBit2 = 1LL << rowQubit2;
3124 
3125  long long int rowCol1 = colBit1 | rowBit1;
3126  long long int rowCol2 = colBit2 | rowBit2;
3127 
3128  long long int numAmpsToVisit = qureg.numAmpsPerChunk/16;
3129  long long int part1 = colBit1 - 1;
3130  long long int part2 = (colBit2 >> 1) - colBit1;
3131  long long int part3 = (rowBit1 >> 2) - (colBit2 >> 1);
3132  long long int part4 = (rowBit2 >> 3) - (rowBit1 >> 2);
3133  long long int part5 = numAmpsToVisit - (rowBit2 >> 3);
3134 
3135  int threadsPerCUDABlock, CUDABlocks;
3136  threadsPerCUDABlock = 128;
3137  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
3138  densmatr_mixTwoQubitDepolarisingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
3139  depolLevel, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
3140  part1, part2, part3, part4, part5, rowCol1, rowCol2);
3141 }

References densmatr_mixTwoQubitDephasing(), Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by mixTwoQubitDepolarising().

◆ densmatr_mixTwoQubitDepolarisingKernel()

__global__ void densmatr_mixTwoQubitDepolarisingKernel ( qreal  depolLevel,
qreal vecReal,
qreal vecImag,
long long int  numAmpsToVisit,
long long int  part1,
long long int  part2,
long long int  part3,
long long int  part4,
long long int  part5,
long long int  rowCol1,
long long int  rowCol2 
)

Called once for every 16 amplitudes.

Definition at line 3076 of file QuEST_gpu.cu.

3081 {
3082  long long int scanInd = blockIdx.x*blockDim.x + threadIdx.x;
3083  if (scanInd >= numAmpsToVisit) return;
3084 
3085  // index of |..0..0..><..0..0|
3086  long long int ind00 = (scanInd&part1) + ((scanInd&part2)<<1) + ((scanInd&part3)<<2) + ((scanInd&part4)<<3) + ((scanInd&part5)<<4);
3087  long long int ind01 = ind00 + rowCol1;
3088  long long int ind10 = ind00 + rowCol2;
3089  long long int ind11 = ind00 + rowCol1 + rowCol2;
3090 
3091  qreal realAvDepol = depolLevel * 0.25 * (
3092  vecReal[ind00] + vecReal[ind01] + vecReal[ind10] + vecReal[ind11]);
3093  qreal imagAvDepol = depolLevel * 0.25 * (
3094  vecImag[ind00] + vecImag[ind01] + vecImag[ind10] + vecImag[ind11]);
3095 
3096  qreal retain = 1 - depolLevel;
3097  vecReal[ind00] *= retain; vecImag[ind00] *= retain;
3098  vecReal[ind01] *= retain; vecImag[ind01] *= retain;
3099  vecReal[ind10] *= retain; vecImag[ind10] *= retain;
3100  vecReal[ind11] *= retain; vecImag[ind11] *= retain;
3101 
3102  vecReal[ind00] += realAvDepol; vecImag[ind00] += imagAvDepol;
3103  vecReal[ind01] += realAvDepol; vecImag[ind01] += imagAvDepol;
3104  vecReal[ind10] += realAvDepol; vecImag[ind10] += imagAvDepol;
3105  vecReal[ind11] += realAvDepol; vecImag[ind11] += imagAvDepol;
3106 }

References qreal.

◆ densmatr_oneQubitDegradeOffDiagonal()

void densmatr_oneQubitDegradeOffDiagonal ( Qureg  qureg,
int  targetQubit,
qreal  dephFac 
)

Definition at line 2879 of file QuEST_gpu.cu.

2879  {
2880 
2881  long long int numAmpsToVisit = qureg.numAmpsPerChunk/4;
2882 
2883  int rowQubit = targetQubit + qureg.numQubitsRepresented;
2884  long long int colBit = 1LL << targetQubit;
2885  long long int rowBit = 1LL << rowQubit;
2886 
2887  long long int part1 = colBit - 1;
2888  long long int part2 = (rowBit >> 1) - colBit;
2889  long long int part3 = numAmpsToVisit - (rowBit >> 1);
2890 
2891  int threadsPerCUDABlock, CUDABlocks;
2892  threadsPerCUDABlock = 128;
2893  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
2894  densmatr_mixDephasingKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
2895  dephFac, qureg.deviceStateVec.real, qureg.deviceStateVec.imag, numAmpsToVisit,
2896  part1, part2, part3, colBit, rowBit);
2897 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, Qureg::numQubitsRepresented, and qreal.

Referenced by densmatr_mixDamping(), and densmatr_mixDephasing().

◆ extractBit()

__forceinline__ __device__ int extractBit ( const int  locationOfBitFromRight,
const long long int  theEncodedNumber 
)

Definition at line 82 of file QuEST_gpu.cu.

82  {
83  return (theEncodedNumber & ( 1LL << locationOfBitFromRight )) >> locationOfBitFromRight;
84 }

Referenced by agnostic_initDiagonalOpFromPauliHamil(), agnostic_initDiagonalOpFromPauliHamilKernel(), compressPairVectorForSingleQubitDepolarise(), compressPairVectorForTwoQubitDepolarise(), densmatr_calcProbOfAllOutcomesKernel(), densmatr_calcProbOfAllOutcomesLocal(), densmatr_collapseToKnownProbOutcome(), densmatr_findProbabilityOfZeroLocal(), densmatr_mixDampingDistributed(), densmatr_mixDepolarisingDistributed(), densmatr_mixTwoQubitDepolarisingDistributed(), densmatr_mixTwoQubitDepolarisingQ1LocalQ2DistributedPart3(), getGlobalIndOfOddParityInChunk(), statevec_applyMultiVarPhaseFuncOverrides(), statevec_applyMultiVarPhaseFuncOverridesKernel(), statevec_applyParamNamedPhaseFuncOverrides(), statevec_applyParamNamedPhaseFuncOverridesKernel(), statevec_applyPhaseFuncOverrides(), statevec_applyPhaseFuncOverridesKernel(), statevec_calcProbOfAllOutcomesKernel(), statevec_calcProbOfAllOutcomesLocal(), statevec_controlledCompactUnitaryDistributed(), statevec_controlledCompactUnitaryKernel(), statevec_controlledCompactUnitaryLocal(), statevec_controlledNotDistributed(), statevec_controlledNotKernel(), statevec_controlledNotLocal(), statevec_controlledPauliYDistributed(), statevec_controlledPauliYKernel(), statevec_controlledPauliYLocal(), statevec_controlledPhaseFlip(), statevec_controlledPhaseFlipKernel(), statevec_controlledPhaseShift(), statevec_controlledPhaseShiftKernel(), statevec_controlledUnitaryDistributed(), statevec_controlledUnitaryKernel(), statevec_controlledUnitaryLocal(), statevec_initStateOfSingleQubit(), statevec_initStateOfSingleQubitKernel(), statevec_multiControlledMultiQubitUnitaryKernel(), statevec_multiControlledMultiQubitUnitaryLocal(), and statevec_phaseShiftByTerm().
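
A worked example of the extraction (illustrative values):

    // extractBit(2, 13): 13 = 0b1101, so (13 & (1LL << 2)) >> 2 = 1
    int bit = extractBit(2, 13);   // bit == 1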

◆ flipBit()

__forceinline__ __device__ long long int flipBit ( const long long int  number,
const int  bitInd 
)

Definition at line 95 of file QuEST_gpu.cu.

95  {
96  return (number ^ (1LL << bitInd));
97 }

◆ getBitMaskParity()

__forceinline__ __device__ int getBitMaskParity ( long long int  mask)

Definition at line 86 of file QuEST_gpu.cu.

86  {
87  int parity = 0;
88  while (mask) {
89  parity = !parity;
90  mask = mask & (mask-1);
91  }
92  return parity;
93 }

Referenced by statevec_multiControlledMultiRotateZKernel(), and statevec_multiRotateZKernel().

◆ getNumReductionLevels()

int getNumReductionLevels ( long long int  numValuesToReduce,
int  numReducedPerLevel 
)

Definition at line 2048 of file QuEST_gpu.cu.

2048  {
2049  int levels=0;
2050  while (numValuesToReduce){
2051  numValuesToReduce = numValuesToReduce/numReducedPerLevel;
2052  levels++;
2053  }
2054  return levels;
2055 }

◆ GPUExists()

int GPUExists ( void  )

Definition at line 446 of file QuEST_gpu.cu.

446  {
447  int deviceCount, device;
448  int gpuDeviceCount = 0;
449  struct cudaDeviceProp properties;
450  cudaError_t cudaResultCode = cudaGetDeviceCount(&deviceCount);
451  if (cudaResultCode != cudaSuccess) deviceCount = 0;
452  /* machines with no GPUs can still report one emulation device */
453  for (device = 0; device < deviceCount; ++device) {
454  cudaGetDeviceProperties(&properties, device);
455  if (properties.major != 9999) { /* 9999 means emulation only */
456  ++gpuDeviceCount;
457  }
458  }
459  if (gpuDeviceCount) return 1;
460  else return 0;
461 }

Referenced by createQuESTEnv().

◆ insertTwoZeroBits()

__forceinline__ __device__ long long int insertTwoZeroBits ( const long long int  number,
const int  bit1,
const int  bit2 
)

Definition at line 106 of file QuEST_gpu.cu.

106  {
107  int small = (bit1 < bit2)? bit1 : bit2;
108  int big = (bit1 < bit2)? bit2 : bit1;
109  return insertZeroBit(insertZeroBit(number, small), big);
110 }

References insertZeroBit().

Referenced by statevec_multiControlledTwoQubitUnitaryKernel(), statevec_multiControlledTwoQubitUnitaryLocal(), statevec_swapQubitAmpsKernel(), and statevec_swapQubitAmpsLocal().

◆ insertZeroBit()

__forceinline__ __device__ long long int insertZeroBit ( const long long int  number,
const int  index 
)

Definition at line 99 of file QuEST_gpu.cu.

99  {
100  long long int left, right;
101  left = (number >> index) << index;
102  right = number - left;
103  return (left << 1) ^ right;
104 }

Referenced by insertTwoZeroBits(), insertZeroBits(), and statevec_multiControlledMultiQubitUnitaryLocal().
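
A worked example of the arithmetic (illustrative values):

    // insertZeroBit(11, 2), where 11 = 0b1011:
    //   left  = (11 >> 2) << 2 = 0b1000
    //   right = 11 - 8         = 0b0011
    //   (left << 1) ^ right    = 0b10011 = 19
    // i.e. bits 0-1 are kept, a 0 is forced in at bit 2, and higher bits shift up
    long long int r = insertZeroBit(11, 2);   // r == 19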

◆ insertZeroBits()

__forceinline__ __device__ long long int insertZeroBits ( long long int  number,
int *  inds,
const int  numInds 
)

Definition at line 112 of file QuEST_gpu.cu.

112  {
113  /* inserted bit inds must strictly increase, so that their final indices are correct.
114  * in lieu of sorting (avoided since no C++ variable-size arrays, and since we're already
115  * memory bottle-necked so the overhead eats this slowdown), we find the next-smallest index
116  * at each insert. Recall every element of inds (a positive or zero number) is unique.
117  * This function won't appear in the CPU code, which can use C99 variable-size arrays and
118  * ought to make a sorted array before threading
119  */
120  int curMin = inds[0];
121  int prevMin = -1;
122  for (int n=0; n < numInds; n++) {
123 
124  // find next min
125  for (int t=0; t < numInds; t++)
126  if (inds[t]>prevMin && inds[t]<curMin)
127  curMin = inds[t];
128 
129  number = insertZeroBit(number, curMin);
130 
131  // set curMin to an arbitrary non-visited elem
132  prevMin = curMin;
133  for (int t=0; t < numInds; t++)
134  if (inds[t] > curMin) {
135  curMin = inds[t];
136  break;
137  }
138  }
139  return number;
140 }

References insertZeroBit().

Referenced by statevec_multiControlledMultiQubitUnitaryKernel().

◆ log2Int()

__device__ __host__ unsigned int log2Int ( unsigned int  x)

Definition at line 1925 of file QuEST_gpu.cu.

1926 {
1927  unsigned int ans = 0 ;
1928  while( x>>=1 ) ans++;
1929  return ans ;
1930 }

Referenced by reduceBlock().

◆ reduceBlock()

__device__ void reduceBlock ( qreal arrayIn,
qreal reducedArray,
int  length 
)

Definition at line 1932 of file QuEST_gpu.cu.

1932  {
1933  int i, l, r;
1934  int threadMax, maxDepth;
1935  threadMax = length/2;
1936  maxDepth = log2Int(length/2);
1937 
1938  for (i=0; i<maxDepth+1; i++){
1939  if (threadIdx.x<threadMax){
1940  l = threadIdx.x;
1941  r = l + threadMax;
1942  arrayIn[l] = arrayIn[r] + arrayIn[l];
1943  }
1944  threadMax = threadMax >> 1;
1945  __syncthreads(); // optimise -- use warp shuffle instead
1946  }
1947 
1948  if (threadIdx.x==0) reducedArray[blockIdx.x] = arrayIn[0];
1949 }

References log2Int().

Referenced by copySharedReduceBlock(), densmatr_calcExpecDiagonalOpKernel(), densmatr_calcFidelityKernel(), densmatr_calcHilbertSchmidtDistanceSquaredKernel(), densmatr_calcInnerProductKernel(), densmatr_calcPurityKernel(), densmatr_findProbabilityOfZeroKernel(), statevec_calcExpecDiagonalOpKernel(), statevec_calcInnerProductKernel(), and statevec_findProbabilityOfZeroKernel().
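
A serial sketch of the same folding reduction (hypothetical helper; the kernel instead performs each fold across threads, with __syncthreads() between levels, and assumes a power-of-2 length):

    qreal foldSum_sketch(qreal* arr, int length) {
        // repeatedly add the top half of the array onto the bottom half
        for (int half = length/2; half >= 1; half /= 2)
            for (int l = 0; l < half; l++)
                arr[l] += arr[l + half];
        return arr[0];   // the sum of the original 'length' values
    }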

◆ statevec_applyDiagonalOp()

void statevec_applyDiagonalOp ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 3209 of file QuEST_gpu.cu.

3210 {
3211  int threadsPerCUDABlock, CUDABlocks;
3212  threadsPerCUDABlock = 128;
3213  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
3214  statevec_applyDiagonalOpKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, op);
3215 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyDiagonalOp().
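
A minimal usage sketch via the public API (illustrative; all calls are standard QuEST API):

    DiagonalOp op = createDiagonalOp(qureg.numQubitsRepresented, env);
    // populate op, e.g. via setDiagonalOpElems() or initDiagonalOpFromPauliHamil()
    applyDiagonalOp(qureg, op);
    destroyDiagonalOp(op, env);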

◆ statevec_applyDiagonalOpKernel()

__global__ void statevec_applyDiagonalOpKernel ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 3187 of file QuEST_gpu.cu.

3187  {
3188 
3189  // each thread modifies one value; a wasteful and inefficient strategy
3190  long long int numTasks = qureg.numAmpsPerChunk;
3191  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
3192  if (thisTask >= numTasks) return;
3193 
3194  qreal* stateRe = qureg.deviceStateVec.real;
3195  qreal* stateIm = qureg.deviceStateVec.imag;
3196  qreal* opRe = op.deviceOperator.real;
3197  qreal* opIm = op.deviceOperator.imag;
3198 
3199  qreal a = stateRe[thisTask];
3200  qreal b = stateIm[thisTask];
3201  qreal c = opRe[thisTask];
3202  qreal d = opIm[thisTask];
3203 
3204  // (a + b i)(c + d i) = (a c - b d) + i (a d + b c)
3205  stateRe[thisTask] = a*c - b*d;
3206  stateIm[thisTask] = a*d + b*c;
3207 }

References DiagonalOp::deviceOperator, Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_applyMultiVarPhaseFuncOverrides()

void statevec_applyMultiVarPhaseFuncOverrides ( Qureg  qureg,
int *  qubits,
int *  numQubitsPerReg,
int  numRegs,
enum bitEncoding  encoding,
qreal coeffs,
qreal exponents,
int *  numTermsPerReg,
long long int *  overrideInds,
qreal overridePhases,
int  numOverrides,
int  conj 
)

Definition at line 3695 of file QuEST_gpu.cu.

3700  {
3701  // determine size of arrays, for cloning into GPU memory
3702  size_t mem_numQubitsPerReg = numRegs * sizeof *numQubitsPerReg;
3703  size_t mem_numTermsPerReg = numRegs * sizeof *numTermsPerReg;
3704  size_t mem_overridePhases = numOverrides * sizeof *overridePhases;
3705  size_t mem_overrideInds = numOverrides * numRegs * sizeof *overrideInds;
3706  size_t mem_qubits = 0;
3707  size_t mem_coeffs = 0;
3708  size_t mem_exponents = 0;
3709  for (int r=0; r<numRegs; r++) {
3710  mem_qubits += numQubitsPerReg[r] * sizeof *qubits;
3711  mem_coeffs += numTermsPerReg[r] * sizeof *coeffs;
3712  mem_exponents += numTermsPerReg[r] * sizeof *exponents;
3713  }
3714 
3715  // allocate global GPU memory
3716  int* d_qubits; cudaMalloc(&d_qubits, mem_qubits);
3717  qreal* d_coeffs; cudaMalloc(&d_coeffs, mem_coeffs);
3718  qreal* d_exponents; cudaMalloc(&d_exponents, mem_exponents);
3719  int* d_numQubitsPerReg; cudaMalloc(&d_numQubitsPerReg, mem_numQubitsPerReg);
3720  int* d_numTermsPerReg; cudaMalloc(&d_numTermsPerReg, mem_numTermsPerReg);
3721  long long int* d_overrideInds; cudaMalloc(&d_overrideInds, mem_overrideInds);
3722  qreal* d_overridePhases; cudaMalloc(&d_overridePhases, mem_overridePhases);
3723 
3724  // copy function args into GPU memory
3725  cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
3726  cudaMemcpy(d_coeffs, coeffs, mem_coeffs, cudaMemcpyHostToDevice);
3727  cudaMemcpy(d_exponents, exponents, mem_exponents, cudaMemcpyHostToDevice);
3728  cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice);
3729  cudaMemcpy(d_numTermsPerReg, numTermsPerReg, mem_numTermsPerReg, cudaMemcpyHostToDevice);
3730  cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice);
3731  cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice);
3732 
3733  int threadsPerCUDABlock = 128;
3734  int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock);
3735 
3736  // allocate thread-local working space {phaseInds}
3737  long long int *d_phaseInds;
3738  size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks;
3739  cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds);
3740 
3741  // call kernel
3742  statevec_applyMultiVarPhaseFuncOverridesKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
3743  qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding,
3744  d_coeffs, d_exponents, d_numTermsPerReg,
3745  d_overrideInds, d_overridePhases, numOverrides,
3746  d_phaseInds,
3747  conj);
3748 
3749  // free device memory
3750  cudaFree(d_qubits);
3751  cudaFree(d_coeffs);
3752  cudaFree(d_exponents);
3753  cudaFree(d_numQubitsPerReg);
3754  cudaFree(d_numTermsPerReg);
3755  cudaFree(d_overrideInds);
3756  cudaFree(d_overridePhases);
3757  cudaFree(d_phaseInds);
3758 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyMultiVarPhaseFunc(), and applyMultiVarPhaseFuncOverrides().
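
A usage sketch of the public API this backs (illustrative values; encodes phase(x, y) = x + y over two 2-qubit registers):

    int qubits[] = {0,1, 2,3};           // both sub-registers, flattened
    int numQubitsPerReg[] = {2, 2};
    qreal coeffs[]    = {1, 1};          // one term per register
    qreal exponents[] = {1, 1};
    int numTermsPerReg[] = {1, 1};
    applyMultiVarPhaseFunc(qureg, qubits, numQubitsPerReg, 2, UNSIGNED,
        coeffs, exponents, numTermsPerReg);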

◆ statevec_applyMultiVarPhaseFuncOverridesKernel()

__global__ void statevec_applyMultiVarPhaseFuncOverridesKernel ( Qureg  qureg,
int *  qubits,
int *  numQubitsPerReg,
int  numRegs,
enum bitEncoding  encoding,
qreal coeffs,
qreal exponents,
int *  numTermsPerReg,
long long int *  overrideInds,
qreal overridePhases,
int  numOverrides,
long long int *  phaseInds,
int  conj 
)

Definition at line 3611 of file QuEST_gpu.cu.

3617  {
3618  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
3619  if (index>=qureg.numAmpsPerChunk) return;
3620 
3621  // determine global amplitude index (non-distributed, so it's just local index)
3622  long long int globalAmpInd = index;
3623 
3624  /*
3625  * each thread needs to write to a local:
3626  * long long int phaseInds[numRegs];
3627  * but instead has access to the global-memory array phaseInds, with below stride and offset
3628  */
3629  size_t stride = gridDim.x*blockDim.x;
3630  size_t offset = blockIdx.x*blockDim.x + threadIdx.x;
3631 
3632  // determine phase indices
3633  int flatInd = 0;
3634  if (encoding == UNSIGNED) {
3635  for (int r=0; r<numRegs; r++) {
3636  phaseInds[r*stride+offset] = 0LL;
3637  for (int q=0; q<numQubitsPerReg[r]; q++)
3638  phaseInds[r*stride+offset] += (1LL << q) * extractBit(qubits[flatInd++], globalAmpInd);
3639  }
3640  }
3641  else if (encoding == TWOS_COMPLEMENT) {
3642  for (int r=0; r<numRegs; r++) {
3643  phaseInds[r*stride+offset] = 0LL;
3644  for (int q=0; q<numQubitsPerReg[r]-1; q++)
3645  phaseInds[r*stride+offset] += (1LL << q) * extractBit(qubits[flatInd++], globalAmpInd);
3646  // use final qubit to indicate sign
3647  if (extractBit(qubits[flatInd++], globalAmpInd) == 1)
3648  phaseInds[r*stride+offset] -= (1LL << (numQubitsPerReg[r]-1));
3649  }
3650  }
3651 
3652  // determine if this phase index has an overridden value (i < numOverrides)
3653  int i;
3654  for (i=0; i<numOverrides; i++) {
3655  int found = 1;
3656  for (int r=0; r<numRegs; r++) {
3657  if (phaseInds[r*stride+offset] != overrideInds[i*numRegs+r]) {
3658  found = 0;
3659  break;
3660  }
3661  }
3662  if (found)
3663  break;
3664  }
3665 
3666  // compute the phase (unless overridden)
3667  qreal phase = 0;
3668  if (i < numOverrides)
3669  phase = overridePhases[i];
3670  else {
3671  flatInd = 0;
3672  for (int r=0; r<numRegs; r++) {
3673  for (int t=0; t<numTermsPerReg[r]; t++) {
3674  phase += coeffs[flatInd] * pow(phaseInds[r*stride+offset], exponents[flatInd]);
3675  flatInd++;
3676  }
3677  }
3678  }
3679 
3680  // negate phase to conjugate operator
3681  if (conj)
3682  phase *= -1;
3683 
3684  // modify amp to amp * exp(i phase)
3685  qreal c = cos(phase);
3686  qreal s = sin(phase);
3687  qreal re = qureg.deviceStateVec.real[index];
3688  qreal im = qureg.deviceStateVec.imag[index];
3689 
3690  // = {re[amp] cos(phase) - im[amp] sin(phase)} + i {re[amp] sin(phase) + im[amp] cos(phase)}
3691  qureg.deviceStateVec.real[index] = re*c - im*s;
3692  qureg.deviceStateVec.imag[index] = re*s + im*c;
3693 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, qreal, TWOS_COMPLEMENT, and UNSIGNED.

◆ statevec_applyParamNamedPhaseFuncOverrides()

void statevec_applyParamNamedPhaseFuncOverrides ( Qureg  qureg,
int *  qubits,
int *  numQubitsPerReg,
int  numRegs,
enum bitEncoding  encoding,
enum phaseFunc  phaseFuncName,
qreal params,
int  numParams,
long long int *  overrideInds,
qreal overridePhases,
int  numOverrides,
int  conj 
)

Definition at line 3909 of file QuEST_gpu.cu.

3914  {
3915  // determine size of arrays, for cloning into GPU memory
3916  size_t mem_numQubitsPerReg = numRegs * sizeof *numQubitsPerReg;
3917  size_t mem_overridePhases = numOverrides * sizeof *overridePhases;
3918  size_t mem_overrideInds = numOverrides * numRegs * sizeof *overrideInds;
3919  size_t mem_params = numParams * sizeof *params;
3920  size_t mem_qubits = 0;
3921  for (int r=0; r<numRegs; r++)
3922  mem_qubits += numQubitsPerReg[r] * sizeof *qubits;
3923 
3924  // allocate global GPU memory
3925  int* d_qubits; cudaMalloc(&d_qubits, mem_qubits);
3926  int* d_numQubitsPerReg; cudaMalloc(&d_numQubitsPerReg, mem_numQubitsPerReg);
3927  long long int* d_overrideInds; cudaMalloc(&d_overrideInds, mem_overrideInds);
3928  qreal* d_overridePhases; cudaMalloc(&d_overridePhases, mem_overridePhases);
3929  qreal* d_params = NULL; if (numParams > 0) cudaMalloc(&d_params, mem_params);
3930 
3931  // copy function args into GPU memory
3932  cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
3933  cudaMemcpy(d_numQubitsPerReg, numQubitsPerReg, mem_numQubitsPerReg, cudaMemcpyHostToDevice);
3934  cudaMemcpy(d_overrideInds, overrideInds, mem_overrideInds, cudaMemcpyHostToDevice);
3935  cudaMemcpy(d_overridePhases, overridePhases, mem_overridePhases, cudaMemcpyHostToDevice);
3936  if (numParams > 0)
3937  cudaMemcpy(d_params, params, mem_params, cudaMemcpyHostToDevice);
3938 
3939  int threadsPerCUDABlock = 128;
3940  int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock);
3941 
3942  // allocate thread-local working space {phaseInds}
3943  long long int *d_phaseInds;
3944  size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks;
3945  cudaMalloc(&d_phaseInds, numRegs*gridSize * sizeof *d_phaseInds);
3946 
3947  // call kernel
3948  statevec_applyParamNamedPhaseFuncOverridesKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
3949  qureg, d_qubits, d_numQubitsPerReg, numRegs, encoding,
3950  phaseFuncName, d_params, numParams,
3951  d_overrideInds, d_overridePhases, numOverrides,
3952  d_phaseInds,
3953  conj);
3954 
3955  // free device memory
3956  cudaFree(d_qubits);
3957  cudaFree(d_numQubitsPerReg);
3958  cudaFree(d_overrideInds);
3959  cudaFree(d_overridePhases);
3960  cudaFree(d_phaseInds);
3961  if (numParams > 0)
3962  cudaFree(d_params);
3963 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by agnostic_applyQFT(), applyNamedPhaseFunc(), applyNamedPhaseFuncOverrides(), applyParamNamedPhaseFunc(), and applyParamNamedPhaseFuncOverrides().
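
A usage sketch of one public entry point this backs (illustrative; DISTANCE interprets the registers pairwise, so it needs an even number of them):

    int qubits[] = {0,1, 2,3};           // two 2-qubit sub-registers
    int numQubitsPerReg[] = {2, 2};
    applyNamedPhaseFunc(qureg, qubits, numQubitsPerReg, 2, UNSIGNED, DISTANCE);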

◆ statevec_applyParamNamedPhaseFuncOverridesKernel()

__global__ void statevec_applyParamNamedPhaseFuncOverridesKernel ( Qureg  qureg,
int *  qubits,
int *  numQubitsPerReg,
int  numRegs,
enum bitEncoding  encoding,
enum phaseFunc  phaseFuncName,
qreal params,
int  numParams,
long long int *  overrideInds,
qreal overridePhases,
int  numOverrides,
long long int *  phaseInds,
int  conj 
)

Definition at line 3760 of file QuEST_gpu.cu.

3766  {
3767  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
3768  if (index>=qureg.numAmpsPerChunk) return;
3769 
3770  // determine global amplitude index (non-distributed, so it's just local index)
3771  long long int globalAmpInd = index;
3772 
3773  /*
3774  * each thread needs to write to a local:
3775  * long long int phaseInds[numRegs];
3776  * but instead has access to the global-memory array phaseInds, with below stride and offset
3777  */
3778  size_t stride = gridDim.x*blockDim.x;
3779  size_t offset = blockIdx.x*blockDim.x + threadIdx.x;
3780 
3781  // determine phase indices
3782  if (encoding == UNSIGNED) {
3783  int flatInd = 0;
3784  for (int r=0; r<numRegs; r++) {
3785  phaseInds[r*stride+offset] = 0LL;
3786  for (int q=0; q<numQubitsPerReg[r]; q++)
3787  phaseInds[r*stride+offset] += (1LL << q) * extractBit(qubits[flatInd++], globalAmpInd);
3788  }
3789  }
3790  else if (encoding == TWOS_COMPLEMENT) {
3791  int flatInd = 0;
3792  for (int r=0; r<numRegs; r++) {
3793  phaseInds[r*stride+offset] = 0LL;
3794  for (int q=0; q<numQubitsPerReg[r]-1; q++)
3795  phaseInds[r*stride+offset] += (1LL << q) * extractBit(qubits[flatInd++], globalAmpInd);
3796  // use final qubit to indicate sign
3797  if (extractBit(qubits[flatInd++], globalAmpInd) == 1)
3798  phaseInds[r*stride+offset] -= (1LL << (numQubitsPerReg[r]-1));
3799  }
3800  }
3801 
3802  // determine if this phase index has an overridden value (i < numOverrides)
3803  int i;
3804  for (i=0; i<numOverrides; i++) {
3805  int found = 1;
3806  for (int r=0; r<numRegs; r++) {
3807  if (phaseInds[r*stride+offset] != overrideInds[i*numRegs+r]) {
3808  found = 0;
3809  break;
3810  }
3811  }
3812  if (found)
3813  break;
3814  }
3815 
3816  // compute the phase (unless overridden)
3817  qreal phase = 0;
3818  if (i < numOverrides)
3819  phase = overridePhases[i];
3820  else {
3821  // compute norm related phases
3822  if (phaseFuncName == NORM || phaseFuncName == INVERSE_NORM ||
3823  phaseFuncName == SCALED_NORM || phaseFuncName == SCALED_INVERSE_NORM ||
3824  phaseFuncName == SCALED_INVERSE_SHIFTED_NORM) {
3825  qreal norm = 0;
3826  if (phaseFuncName == SCALED_INVERSE_SHIFTED_NORM) {
3827  for (int r=0; r<numRegs; r++) {
3828  qreal dif = phaseInds[r*stride+offset] - params[2+r];
3829  norm += dif*dif;
3830  }
3831  }
3832  else
3833  for (int r=0; r<numRegs; r++)
3834  norm += phaseInds[r*stride+offset]*phaseInds[r*stride+offset];
3835  norm = sqrt(norm);
3836 
3837  if (phaseFuncName == NORM)
3838  phase = norm;
3839  else if (phaseFuncName == INVERSE_NORM)
3840  phase = (norm == 0.)? params[0] : 1/norm; // smallest non-zero norm is 1
3841  else if (phaseFuncName == SCALED_NORM)
3842  phase = params[0] * norm;
3843  else if (phaseFuncName == SCALED_INVERSE_NORM || phaseFuncName == SCALED_INVERSE_SHIFTED_NORM)
3844  phase = (norm <= REAL_EPS)? params[1] : params[0] / norm; // unless shifted closer to zero
3845  }
3846  // compute product related phases
3847  else if (phaseFuncName == PRODUCT || phaseFuncName == INVERSE_PRODUCT ||
3848  phaseFuncName == SCALED_PRODUCT || phaseFuncName == SCALED_INVERSE_PRODUCT) {
3849 
3850  qreal prod = 1;
3851  for (int r=0; r<numRegs; r++)
3852  prod *= phaseInds[r*stride+offset];
3853 
3854  if (phaseFuncName == PRODUCT)
3855  phase = prod;
3856  else if (phaseFuncName == INVERSE_PRODUCT)
3857  phase = (prod == 0.)? params[0] : 1/prod; // smallest non-zero prod is +- 1
3858  else if (phaseFuncName == SCALED_PRODUCT)
3859  phase = params[0] * prod;
3860  else if (phaseFuncName == SCALED_INVERSE_PRODUCT)
3861  phase = (prod == 0.)? params[1] : params[0] / prod;
3862  }
3863  // compute Euclidean distance related phases
3864  else if (phaseFuncName == DISTANCE || phaseFuncName == INVERSE_DISTANCE ||
3865  phaseFuncName == SCALED_DISTANCE || phaseFuncName == SCALED_INVERSE_DISTANCE ||
3866  phaseFuncName == SCALED_INVERSE_SHIFTED_DISTANCE) {
3867 
3868  qreal dist = 0;
3869  if (phaseFuncName == SCALED_INVERSE_SHIFTED_DISTANCE) {
3870  for (int r=0; r<numRegs; r+=2) {
3871  qreal dif = (phaseInds[r*stride+offset] - phaseInds[(r+1)*stride+offset] - params[2+r/2]);
3872  dist += dif*dif;
3873  }
3874  }
3875  else
3876  for (int r=0; r<numRegs; r+=2) {
3877  qreal dif = (phaseInds[(r+1)*stride+offset] - phaseInds[r*stride+offset]);
3878  dist += dif*dif;
3879  }
3880  dist = sqrt(dist);
3881 
3882  if (phaseFuncName == DISTANCE)
3883  phase = dist;
3884  else if (phaseFuncName == INVERSE_DISTANCE)
3885  phase = (dist == 0.)? params[0] : 1/dist; // smallest non-zero dist is 1
3886  else if (phaseFuncName == SCALED_DISTANCE)
3887  phase = params[0] * dist;
3888  else if (phaseFuncName == SCALED_INVERSE_DISTANCE || phaseFuncName == SCALED_INVERSE_SHIFTED_DISTANCE)
3889  phase = (dist <= REAL_EPS)? params[1] : params[0] / dist; // unless shifted closer
3890  }
3891  }
3892 
3893 
3894  // negate phase to conjugate operator
3895  if (conj)
3896  phase *= -1;
3897 
3898  // modify amp to amp * exp(i phase)
3899  qreal c = cos(phase);
3900  qreal s = sin(phase);
3901  qreal re = qureg.deviceStateVec.real[index];
3902  qreal im = qureg.deviceStateVec.imag[index];
3903 
3904  // = {re[amp] cos(phase) - im[amp] sin(phase)} + i {re[amp] sin(phase) + im[amp] cos(phase)}
3905  qureg.deviceStateVec.real[index] = re*c - im*s;
3906  qureg.deviceStateVec.imag[index] = re*s + im*c;
3907 }

References Qureg::deviceStateVec, DISTANCE, extractBit(), INVERSE_DISTANCE, INVERSE_NORM, INVERSE_PRODUCT, NORM, Qureg::numAmpsPerChunk, PRODUCT, qreal, SCALED_DISTANCE, SCALED_INVERSE_DISTANCE, SCALED_INVERSE_NORM, SCALED_INVERSE_PRODUCT, SCALED_INVERSE_SHIFTED_DISTANCE, SCALED_INVERSE_SHIFTED_NORM, SCALED_NORM, SCALED_PRODUCT, TWOS_COMPLEMENT, and UNSIGNED.

◆ statevec_applyPhaseFuncOverrides()

void statevec_applyPhaseFuncOverrides ( Qureg  qureg,
int *  qubits,
int  numQubits,
enum bitEncoding  encoding,
qreal coeffs,
qreal exponents,
int  numTerms,
long long int *  overrideInds,
qreal overridePhases,
int  numOverrides,
int  conj 
)

Definition at line 3576 of file QuEST_gpu.cu.

3581  {
3582  // allocate device space for global list of {qubits}, {coeffs}, {exponents}, {overrideInds} and {overridePhases}
3583  int* d_qubits; size_t mem_qubits = numQubits * sizeof *d_qubits;
3584  qreal* d_coeffs; size_t mem_terms = numTerms * sizeof *d_coeffs;
3585  qreal* d_exponents;
3586  long long int* d_overrideInds; size_t mem_inds = numOverrides * sizeof *d_overrideInds;
3587  qreal* d_overridePhases; size_t mem_phas = numOverrides * sizeof *d_overridePhases;
3588  cudaMalloc(&d_qubits, mem_qubits); cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
3589  cudaMalloc(&d_coeffs, mem_terms); cudaMemcpy(d_coeffs, coeffs, mem_terms, cudaMemcpyHostToDevice);
3590  cudaMalloc(&d_exponents, mem_terms); cudaMemcpy(d_exponents, exponents, mem_terms, cudaMemcpyHostToDevice);
3591  cudaMalloc(&d_overrideInds, mem_inds); cudaMemcpy(d_overrideInds, overrideInds, mem_inds, cudaMemcpyHostToDevice);
3592  cudaMalloc(&d_overridePhases,mem_phas); cudaMemcpy(d_overridePhases, overridePhases, mem_phas, cudaMemcpyHostToDevice);
3593 
3594  // call kernel
3595  int threadsPerCUDABlock = 128;
3596  int CUDABlocks = ceil((qreal) qureg.numAmpsPerChunk / threadsPerCUDABlock);
3597  statevec_applyPhaseFuncOverridesKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
3598  qureg, d_qubits, numQubits, encoding,
3599  d_coeffs, d_exponents, numTerms,
3600  d_overrideInds, d_overridePhases, numOverrides,
3601  conj);
3602 
3603  // cleanup device memory
3604  cudaFree(d_qubits);
3605  cudaFree(d_coeffs);
3606  cudaFree(d_exponents);
3607  cudaFree(d_overrideInds);
3608  cudaFree(d_overridePhases);
3609 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyPhaseFunc(), and applyPhaseFuncOverrides().
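
A usage sketch of the public API this backs (illustrative; encodes phase(x) = x^2 over a 3-qubit register):

    int qubits[] = {0, 1, 2};
    qreal coeffs[]    = {1};
    qreal exponents[] = {2};
    applyPhaseFunc(qureg, qubits, 3, UNSIGNED, coeffs, exponents, 1);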

◆ statevec_applyPhaseFuncOverridesKernel()

__global__ void statevec_applyPhaseFuncOverridesKernel ( Qureg  qureg,
int *  qubits,
int  numQubits,
enum bitEncoding  encoding,
qreal coeffs,
qreal exponents,
int  numTerms,
long long int *  overrideInds,
qreal overridePhases,
int  numOverrides,
int  conj 
)

Definition at line 3522 of file QuEST_gpu.cu.

3527  {
3528  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
3529  if (index>=qureg.numAmpsPerChunk) return;
3530 
3531  // determine global amplitude index (non-distributed, so it's just local index)
3532  long long int globalAmpInd = index;
3533 
3534  // determine phase index of {qubits}
3535  long long int phaseInd = 0LL;
3536  if (encoding == UNSIGNED) {
3537  for (int q=0; q<numQubits; q++)
3538  phaseInd += (1LL << q) * extractBit(qubits[q], globalAmpInd);
3539  }
3540  else if (encoding == TWOS_COMPLEMENT) {
3541  for (int q=0; q<numQubits-1; q++) // use final qubit to indicate sign
3542  phaseInd += (1LL << q) * extractBit(qubits[q], globalAmpInd);
3543  if (extractBit(qubits[numQubits-1], globalAmpInd) == 1)
3544  phaseInd -= (1LL << (numQubits-1));
3545  }
3546 
3547  // determine if this phase index has an overridden value (i < numOverrides)
3548  int i;
3549  for (i=0; i<numOverrides; i++)
3550  if (phaseInd == overrideInds[i])
3551  break;
3552 
3553  // determine phase from {coeffs}, {exponents} (unless overridden)
3554  qreal phase = 0;
3555  if (i < numOverrides)
3556  phase = overridePhases[i];
3557  else
3558  for (int t=0; t<numTerms; t++)
3559  phase += coeffs[t] * pow(phaseInd, exponents[t]);
3560 
3561  // negate phase to conjugate operator
3562  if (conj)
3563  phase *= -1;
3564 
3565  // modify amp to amp * exp(i phase)
3566  qreal c = cos(phase);
3567  qreal s = sin(phase);
3568  qreal re = qureg.deviceStateVec.real[index];
3569  qreal im = qureg.deviceStateVec.imag[index];
3570 
3571  // = {re[amp] cos(phase) - im[amp] sin(phase)} + i {re[amp] sin(phase) + im[amp] cos(phase)}
3572  qureg.deviceStateVec.real[index] = re*c - im*s;
3573  qureg.deviceStateVec.imag[index] = re*s + im*c;
3574 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, qreal, TWOS_COMPLEMENT, and UNSIGNED.

◆ statevec_calcExpecDiagonalOp()

Complex statevec_calcExpecDiagonalOp ( Qureg  qureg,
DiagonalOp  op 
)

Definition at line 3276 of file QuEST_gpu.cu.

3276  {
3277 
3278  /* @TODO: remove all this reduction boilerplate from QuEST GPU
3279  * (e.g. a func which accepts a pointer to do every-value reduction?)
3280  */
3281 
3282  qreal expecReal, expecImag;
3283 
3284  int getRealComp;
3285  long long int numValuesToReduce;
3286  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
3287  int maxReducedPerLevel;
3288  int firstTime;
3289 
3290  // compute real component of inner product
3291  getRealComp = 1;
3292  numValuesToReduce = qureg.numAmpsPerChunk;
3293  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3294  firstTime = 1;
3295  while (numValuesToReduce > 1) {
3296  if (numValuesToReduce < maxReducedPerLevel) {
3297  valuesPerCUDABlock = numValuesToReduce;
3298  numCUDABlocks = 1;
3299  }
3300  else {
3301  valuesPerCUDABlock = maxReducedPerLevel;
3302  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3303  }
3304  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3305  if (firstTime) {
3306  statevec_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3307  getRealComp,
3308  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3309  op.deviceOperator.real, op.deviceOperator.imag,
3310  numValuesToReduce,
3311  qureg.firstLevelReduction);
3312  firstTime = 0;
3313  } else {
3314  cudaDeviceSynchronize();
3315  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3316  qureg.firstLevelReduction,
3317  qureg.secondLevelReduction, valuesPerCUDABlock);
3318  cudaDeviceSynchronize();
3319  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3320 }
3321  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3322  }
3323  cudaMemcpy(&expecReal, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3324 
3325  // compute imag component of inner product
3326  getRealComp = 0;
3327  numValuesToReduce = qureg.numAmpsPerChunk;
3328  maxReducedPerLevel = REDUCE_SHARED_SIZE;
3329  firstTime = 1;
3330  while (numValuesToReduce > 1) {
3331  if (numValuesToReduce < maxReducedPerLevel) {
3332  valuesPerCUDABlock = numValuesToReduce;
3333  numCUDABlocks = 1;
3334  }
3335  else {
3336  valuesPerCUDABlock = maxReducedPerLevel;
3337  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
3338  }
3339  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
3340  if (firstTime) {
3341  statevec_calcExpecDiagonalOpKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
3342  getRealComp,
3343  qureg.deviceStateVec.real, qureg.deviceStateVec.imag,
3344  op.deviceOperator.real, op.deviceOperator.imag,
3345  numValuesToReduce,
3346  qureg.firstLevelReduction);
3347  firstTime = 0;
3348  } else {
3349  cudaDeviceSynchronize();
3350  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
3351  qureg.firstLevelReduction,
3352  qureg.secondLevelReduction, valuesPerCUDABlock);
3353  cudaDeviceSynchronize();
3354  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
3355 }
3356  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
3357  }
3358  cudaMemcpy(&expecImag, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
3359 
3360  // return complex
3361  Complex expecVal;
3362  expecVal.real = expecReal;
3363  expecVal.imag = expecImag;
3364  return expecVal;
3365 }

References copySharedReduceBlock(), DiagonalOp::deviceOperator, Qureg::deviceStateVec, Qureg::firstLevelReduction, Complex::imag, Qureg::numAmpsPerChunk, qreal, Complex::real, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcExpecDiagonalOp().
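
A minimal usage sketch via the public API (illustrative):

    Complex expec = calcExpecDiagonalOp(qureg, op);   // <psi| op |psi> for a state-vector qureg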

◆ statevec_calcExpecDiagonalOpKernel()

__global__ void statevec_calcExpecDiagonalOpKernel ( int  getRealComp,
qreal vecReal,
qreal vecImag,
qreal opReal,
qreal opImag,
long long int  numTermsToSum,
qreal reducedArray 
)

computes either a real or imag term of |vec_i|^2 op_i

Definition at line 3249 of file QuEST_gpu.cu.

3253 {
3254  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
3255  if (index >= numTermsToSum) return;
3256 
3257  qreal vecAbs = vecReal[index]*vecReal[index] + vecImag[index]*vecImag[index];
3258 
3259  // choose whether to calculate the real or imaginary term of the expec term
3260  qreal expecVal;
3261  if (getRealComp)
3262  expecVal = vecAbs * opReal[index];
3263  else
3264  expecVal = vecAbs * opImag[index];
3265 
3266  // array of each thread's collected sum term, to be summed
3267  extern __shared__ qreal tempReductionArray[];
3268  tempReductionArray[threadIdx.x] = expecVal;
3269  __syncthreads();
3270 
3271  // the first half of threads perform the block reduction
3272  if (threadIdx.x<blockDim.x/2)
3273  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
3274 }

References qreal, and reduceBlock().

◆ statevec_calcInnerProduct()

Complex statevec_calcInnerProduct ( Qureg  bra,
Qureg  ket 
)

Terrible code which unnecessarily individually computes and sums the real and imaginary components of the inner product, so as to not have to worry about keeping the sums separated during reduction.

Truly disgusting, probably doubles runtime, please fix.

Todo:
could even do the kernel twice, storing real in bra.reduc and imag in ket.reduc?

Definition at line 2393 of file QuEST_gpu.cu.

2393  {
2394 
2395  qreal innerProdReal, innerProdImag;
2396 
2397  int getRealComp;
2398  long long int numValuesToReduce;
2399  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2400  int maxReducedPerLevel;
2401  int firstTime;
2402 
2403  // compute real component of inner product
2404  getRealComp = 1;
2405  numValuesToReduce = bra.numAmpsPerChunk;
2406  maxReducedPerLevel = REDUCE_SHARED_SIZE;
2407  firstTime = 1;
2408  while (numValuesToReduce > 1) {
2409  if (numValuesToReduce < maxReducedPerLevel) {
2410  valuesPerCUDABlock = numValuesToReduce;
2411  numCUDABlocks = 1;
2412  }
2413  else {
2414  valuesPerCUDABlock = maxReducedPerLevel;
2415  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2416  }
2417  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2418  if (firstTime) {
2419  statevec_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2420  getRealComp,
2421  bra.deviceStateVec.real, bra.deviceStateVec.imag,
2422  ket.deviceStateVec.real, ket.deviceStateVec.imag,
2423  numValuesToReduce,
2424  bra.firstLevelReduction);
2425  firstTime = 0;
2426  } else {
2427  cudaDeviceSynchronize();
2428  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2429  bra.firstLevelReduction,
2430  bra.secondLevelReduction, valuesPerCUDABlock);
2431  cudaDeviceSynchronize();
2432  swapDouble(&(bra.firstLevelReduction), &(bra.secondLevelReduction));
2433 }
2434  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2435  }
2436  cudaMemcpy(&innerProdReal, bra.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2437 
2438  // compute imag component of inner product
2439  getRealComp = 0;
2440  numValuesToReduce = bra.numAmpsPerChunk;
2441  maxReducedPerLevel = REDUCE_SHARED_SIZE;
2442  firstTime = 1;
2443  while (numValuesToReduce > 1) {
2444  if (numValuesToReduce < maxReducedPerLevel) {
2445  valuesPerCUDABlock = numValuesToReduce;
2446  numCUDABlocks = 1;
2447  }
2448  else {
2449  valuesPerCUDABlock = maxReducedPerLevel;
2450  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2451  }
2452  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2453  if (firstTime) {
2454  statevec_calcInnerProductKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2455  getRealComp,
2456  bra.deviceStateVec.real, bra.deviceStateVec.imag,
2457  ket.deviceStateVec.real, ket.deviceStateVec.imag,
2458  numValuesToReduce,
2459  bra.firstLevelReduction);
2460  firstTime = 0;
2461  } else {
2462  cudaDeviceSynchronize();
2463  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2464  bra.firstLevelReduction,
2465  bra.secondLevelReduction, valuesPerCUDABlock);
2466  cudaDeviceSynchronize();
2467  swapDouble(&(bra.firstLevelReduction), &(bra.secondLevelReduction));
2468 }
2469  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2470  }
2471  cudaMemcpy(&innerProdImag, bra.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2472 
2473  // return complex
2474  Complex innerProd;
2475  innerProd.real = innerProdReal;
2476  innerProd.imag = innerProdImag;
2477  return innerProd;
2478 }

References copySharedReduceBlock(), Qureg::deviceStateVec, Qureg::firstLevelReduction, Complex::imag, Qureg::numAmpsPerChunk, qreal, Complex::real, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by calcInnerProduct(), statevec_calcExpecPauliProd(), and statevec_calcFidelity().
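
A minimal usage sketch via the public API (illustrative):

    Complex prod = calcInnerProduct(bra, ket);                   // <bra|ket>
    qreal overlap = prod.real*prod.real + prod.imag*prod.imag;   // |<bra|ket>|^2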

◆ statevec_calcInnerProductKernel()

__global__ void statevec_calcInnerProductKernel ( int  getRealComp,
qreal *  vecReal1,
qreal *  vecImag1,
qreal *  vecReal2,
qreal *  vecImag2,
long long int  numTermsToSum,
qreal *  reducedArray 
)

computes either the real or the imaginary term of the inner product

Definition at line 2363 of file QuEST_gpu.cu.

2367 {
2368  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
2369  if (index >= numTermsToSum) return;
2370 
2371  // choose whether to calculate the real or imaginary term of the inner product
2372  qreal innerProdTerm;
2373  if (getRealComp)
2374  innerProdTerm = vecReal1[index]*vecReal2[index] + vecImag1[index]*vecImag2[index];
2375  else
2376  innerProdTerm = vecReal1[index]*vecImag2[index] - vecImag1[index]*vecReal2[index];
2377 
2378  // array of each thread's collected sum term, to be summed
2379  extern __shared__ qreal tempReductionArray[];
2380  tempReductionArray[threadIdx.x] = innerProdTerm;
2381  __syncthreads();
2382 
2383  // every second thread reduces
2384  if (threadIdx.x<blockDim.x/2)
2385  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2386 }

References qreal, and reduceBlock().

◆ statevec_calcProbOfAllOutcomes()

void statevec_calcProbOfAllOutcomes ( qreal *  outcomeProbs,
Qureg  qureg,
int *  qubits,
int  numQubits 
)

Definition at line 2207 of file QuEST_gpu.cu.

2207  {
2208 
2209  // copy qubits to GPU memory
2210  int* d_qubits;
2211  size_t mem_qubits = numQubits * sizeof *d_qubits;
2212  cudaMalloc(&d_qubits, mem_qubits);
2213  cudaMemcpy(d_qubits, qubits, mem_qubits, cudaMemcpyHostToDevice);
2214 
2215  // create one thread for every amplitude
2216  int numThreadsPerBlock = 128;
2217  int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock);
2218 
2219  // create global GPU array for outcomeProbs
2220  qreal* d_outcomeProbs;
2221  long long int numOutcomes = (1LL << numQubits);
2222  size_t mem_outcomeProbs = numOutcomes * sizeof *d_outcomeProbs;
2223  cudaMalloc(&d_outcomeProbs, mem_outcomeProbs);
2224  cudaMemset(d_outcomeProbs, 0, mem_outcomeProbs);
2225 
2226  // populate per-block subarrays
2227  statevec_calcProbOfAllOutcomesKernel<<<numBlocks, numThreadsPerBlock>>>(
2228  d_outcomeProbs, qureg, d_qubits, numQubits);
2229 
2230  // copy outcomeProbs from GPU memory
2231  cudaMemcpy(outcomeProbs, d_outcomeProbs, mem_outcomeProbs, cudaMemcpyDeviceToHost);
2232 
2233  // free GPU memory
2234  cudaFree(d_qubits);
2235  cudaFree(d_outcomeProbs);
2236 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by calcProbOfAllOutcomes().
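For orientation, a host-side sketch of calling the public wrapper calcProbOfAllOutcomes (declared in QuEST.h), which validates inputs and dispatches here; qureg is assumed to be an already-initialised state:

    // probabilities of all 2^2 outcomes of measuring qubits {0,1}
    int qubits[] = {0, 1};
    qreal outcomeProbs[1 << 2];    // ordered |00>, |01>, |10>, |11>, qubits[0] least significant
    calcProbOfAllOutcomes(outcomeProbs, qureg, qubits, 2);
    // outcomeProbs[i] now holds the probability that the measured bits encode integer i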

◆ statevec_calcProbOfAllOutcomesKernel()

__global__ void statevec_calcProbOfAllOutcomesKernel ( qreal *  outcomeProbs,
Qureg  qureg,
int *  qubits,
int  numQubits 
)

Definition at line 2186 of file QuEST_gpu.cu.

2188  {
2189  // each thread handles one amplitude (all amplitudes are involved)
2190  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
2191  if (ampInd >= qureg.numAmpsTotal) return;
2192 
2193  qreal prob = (
2194  qureg.deviceStateVec.real[ampInd]*qureg.deviceStateVec.real[ampInd] +
2195  qureg.deviceStateVec.imag[ampInd]*qureg.deviceStateVec.imag[ampInd]);
2196 
2197  // each amplitude contributes to one outcome
2198  long long int outcomeInd = 0;
2199  for (int q=0; q<numQubits; q++)
2200  outcomeInd += extractBit(qubits[q], ampInd) * (1LL << q);
2201 
2202  // each thread atomically writes directly to the global output.
2203  // this beat block-hierarchical atomic reductions in both global and shared memory!
2204  atomicAdd(&outcomeProbs[outcomeInd], prob);
2205 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsTotal, and qreal.
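One portability note on the atomicAdd above: native double-precision atomicAdd exists only on devices of compute capability 6.0 and newer, so when qreal is double an older device needs the standard compare-and-swap emulation from the CUDA C Programming Guide (shown for context only; how QuEST itself guards this is not shown here):

    #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
    // CAS-loop emulation of atomicAdd(double*, double), per the CUDA C Programming Guide
    __device__ double atomicAdd(double* address, double val)
    {
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do {
            assumed = old;
            old = atomicCAS(address_as_ull, assumed,
                            __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);   // retry if another thread updated the value meanwhile
        return __longlong_as_double(old);
    }
    #endif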

◆ statevec_calcProbOfOutcome()

qreal statevec_calcProbOfOutcome ( Qureg  qureg,
int  measureQubit,
int  outcome 
)

Definition at line 2150 of file QuEST_gpu.cu.

2151 {
2152  qreal outcomeProb = statevec_findProbabilityOfZero(qureg, measureQubit);
2153  if (outcome==1)
2154  outcomeProb = 1.0 - outcomeProb;
2155  return outcomeProb;
2156 }

References qreal, and statevec_findProbabilityOfZero().

Referenced by calcProbOfOutcome(), collapseToOutcome(), and statevec_measureWithStats().

◆ statevec_calcTotalProb()

qreal statevec_calcTotalProb ( Qureg  qureg)

Definition at line 1655 of file QuEST_gpu.cu.

1655  {
1656  /* IJB - implemented using Kahan summation for greater accuracy at a slight floating
1657  point operation overhead. For more details see https://en.wikipedia.org/wiki/Kahan_summation_algorithm */
1658  /* Don't change the bracketing in this routine! */
1659  qreal pTotal=0;
1660  qreal y, t, c;
1661  long long int index;
1662  long long int numAmpsPerRank = qureg.numAmpsPerChunk;
1663 
1664  copyStateFromGPU(qureg);
1665 
1666  c = 0.0;
1667  for (index=0; index<numAmpsPerRank; index++){
1668  /* Perform pTotal+=qureg.stateVec.real[index]*qureg.stateVec.real[index]; by Kahan */
1669  // pTotal+=qureg.stateVec.real[index]*qureg.stateVec.real[index];
1670  y = qureg.stateVec.real[index]*qureg.stateVec.real[index] - c;
1671  t = pTotal + y;
1672  c = ( t - pTotal ) - y;
1673  pTotal = t;
1674 
1675  /* Perform pTotal+=qureg.stateVec.imag[index]*qureg.stateVec.imag[index]; by Kahan */
1676  //pTotal+=qureg.stateVec.imag[index]*qureg.stateVec.imag[index];
1677  y = qureg.stateVec.imag[index]*qureg.stateVec.imag[index] - c;
1678  t = pTotal + y;
1679  c = ( t - pTotal ) - y;
1680  pTotal = t;
1681 
1682 
1683  }
1684  return pTotal;
1685 }

References copyStateFromGPU(), Qureg::numAmpsPerChunk, qreal, and Qureg::stateVec.

Referenced by calcTotalProb().
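The compensated additions above follow the textbook Kahan pattern; as a reading aid only, the same arithmetic factored into a helper (a sketch, not part of QuEST's API):

    static void kahanAdd(qreal term, qreal* sum, qreal* comp) {
        qreal y = term - *comp;     // apply the running compensation to the new term
        qreal t = *sum + y;         // accumulate; low-order bits of y may be lost here
        *comp = (t - *sum) - y;     // algebraically zero; numerically, the lost bits
        *sum = t;                   // carry the corrected running total
    }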

◆ statevec_cloneQureg()

void statevec_cloneQureg ( Qureg  targetQureg,
Qureg  copyQureg 
)

works for both statevectors and density matrices

Definition at line 170 of file QuEST_gpu.cu.

170  {
171 
172  // copy copyQureg's GPU statevec to targetQureg's GPU statevec
173  cudaDeviceSynchronize();
174  cudaMemcpy(
175  targetQureg.deviceStateVec.real,
176  copyQureg.deviceStateVec.real,
177  targetQureg.numAmpsPerChunk*sizeof(*(targetQureg.deviceStateVec.real)),
178  cudaMemcpyDeviceToDevice);
179  cudaMemcpy(
180  targetQureg.deviceStateVec.imag,
181  copyQureg.deviceStateVec.imag,
182  targetQureg.numAmpsPerChunk*sizeof(*(targetQureg.deviceStateVec.imag)),
183  cudaMemcpyDeviceToDevice);
184 }

References Qureg::deviceStateVec, and Qureg::numAmpsPerChunk.

Referenced by cloneQureg(), createCloneQureg(), initPureState(), and statevec_calcExpecPauliProd().

◆ statevec_collapseToKnownProbOutcome()

void statevec_collapseToKnownProbOutcome ( Qureg  qureg,
int  measureQubit,
int  outcome,
qreal  outcomeProb 
)

Definition at line 2770 of file QuEST_gpu.cu.

2771 {
2772  int threadsPerCUDABlock, CUDABlocks;
2773  threadsPerCUDABlock = 128;
2774  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
2775  statevec_collapseToKnownProbOutcomeKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, measureQubit, outcome, outcomeProb);
2776 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyProjector(), collapseToOutcome(), and statevec_measureWithStats().

◆ statevec_collapseToKnownProbOutcomeKernel()

__global__ void statevec_collapseToKnownProbOutcomeKernel ( Qureg  qureg,
int  measureQubit,
int  outcome,
qreal  totalProbability 
)

Definition at line 2713 of file QuEST_gpu.cu.

2714 {
2715  // ----- sizes
2716  long long int sizeBlock, // size of blocks
2717  sizeHalfBlock; // size of blocks halved
2718  // ----- indices
2719  long long int thisBlock, // current block
2720  index; // current index for first half block
2721  // ----- measured probability
2722  qreal renorm; // probability (returned) value
2723  // ----- temp variables
2724  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
2725  // (good for shared memory parallelism)
2726  long long int numTasks=qureg.numAmpsPerChunk>>1;
2727 
2728  // ---------------------------------------------------------------- //
2729  // dimensions //
2730  // ---------------------------------------------------------------- //
2731  sizeHalfBlock = 1LL << (measureQubit); // number of state vector elements to sum,
2732  // and then the number to skip
2733  sizeBlock = 2LL * sizeHalfBlock; // size of blocks (pairs of measure and skip entries)
2734 
2735  // ---------------------------------------------------------------- //
2736  // find probability //
2737  // ---------------------------------------------------------------- //
2738 
2739  //
2740  // --- task-based shared-memory parallel implementation
2741  //
2742  renorm=1/sqrt(totalProbability);
2743  qreal *stateVecReal = qureg.deviceStateVec.real;
2744  qreal *stateVecImag = qureg.deviceStateVec.imag;
2745 
2746  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2747  if (thisTask>=numTasks) return;
2748  thisBlock = thisTask / sizeHalfBlock;
2749  index = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
2750 
2751  if (outcome==0){
2752  stateVecReal[index]=stateVecReal[index]*renorm;
2753  stateVecImag[index]=stateVecImag[index]*renorm;
2754 
2755  stateVecReal[index+sizeHalfBlock]=0;
2756  stateVecImag[index+sizeHalfBlock]=0;
2757  } else if (outcome==1){
2758  stateVecReal[index]=0;
2759  stateVecImag[index]=0;
2760 
2761  stateVecReal[index+sizeHalfBlock]=stateVecReal[index+sizeHalfBlock]*renorm;
2762  stateVecImag[index+sizeHalfBlock]=stateVecImag[index+sizeHalfBlock]*renorm;
2763  }
2764 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.
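As a concrete check of the indexing: for measureQubit = 1 we get sizeHalfBlock = 2 and sizeBlock = 4, so tasks 0, 1, 2, 3 map to index = 0, 1, 4, 5 — precisely the amplitudes whose bit 1 is 0 — while index + sizeHalfBlock addresses their bit-1 partners 2, 3, 6, 7. The half matching the measured outcome is rescaled by 1/sqrt(totalProbability) and the other half is zeroed.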

◆ statevec_compactUnitary()

void statevec_compactUnitary ( Qureg  qureg,
int  targetQubit,
Complex  alpha,
Complex  beta 
)

Definition at line 844 of file QuEST_gpu.cu.

845 {
846  int threadsPerCUDABlock, CUDABlocks;
847  threadsPerCUDABlock = 128;
848  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
849  statevec_compactUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, alpha, beta);
850 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by compactUnitary(), statevec_multiRotatePauli(), statevec_rotateAroundAxis(), and statevec_rotateAroundAxisConj().
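From the comments in the kernel below, the compact form encodes the unitary u = {{alpha, -conj(beta)}, {beta, conj(alpha)}}, subject to |alpha|^2 + |beta|^2 = 1. An illustrative call through the public wrapper compactUnitary (declared in QuEST.h), here realising a Y-rotation by an assumed angle theta on an assumed qureg:

    // Ry(theta) = {{cos(theta/2), -sin(theta/2)}, {sin(theta/2), cos(theta/2)}}
    Complex alpha, beta;
    alpha.real = cos(theta/2);  alpha.imag = 0;
    beta.real  = sin(theta/2);  beta.imag  = 0;
    compactUnitary(qureg, targetQubit, alpha, beta);  // valid since |alpha|^2 + |beta|^2 = 1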

◆ statevec_compactUnitaryKernel()

__global__ void statevec_compactUnitaryKernel ( Qureg  qureg,
int  rotQubit,
Complex  alpha,
Complex  beta 
)

fix – not necessary for GPU version

Definition at line 789 of file QuEST_gpu.cu.

789  {
790  // ----- sizes
791  long long int sizeBlock, // size of blocks
792  sizeHalfBlock; // size of blocks halved
793  // ----- indices
794  long long int thisBlock, // current block
795  indexUp,indexLo; // current index and corresponding index in lower half block
796 
797  // ----- temp variables
798  qreal stateRealUp,stateRealLo, // storage for previous state values
799  stateImagUp,stateImagLo; // (used in updates)
800  // ----- temp variables
801  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
802  long long int numTasks=qureg.numAmpsPerChunk>>1;
803 
804  sizeHalfBlock = 1LL << rotQubit; // size of blocks halved
805  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
806 
807  // ---------------------------------------------------------------- //
808  // rotate //
809  // ---------------------------------------------------------------- //
810 
812  qreal *stateVecReal = qureg.deviceStateVec.real;
813  qreal *stateVecImag = qureg.deviceStateVec.imag;
814  qreal alphaImag=alpha.imag, alphaReal=alpha.real;
815  qreal betaImag=beta.imag, betaReal=beta.real;
816 
817  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
818  if (thisTask>=numTasks) return;
819 
820  thisBlock = thisTask / sizeHalfBlock;
821  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
822  indexLo = indexUp + sizeHalfBlock;
823 
824  // store current state vector values in temp variables
825  stateRealUp = stateVecReal[indexUp];
826  stateImagUp = stateVecImag[indexUp];
827 
828  stateRealLo = stateVecReal[indexLo];
829  stateImagLo = stateVecImag[indexLo];
830 
831  // state[indexUp] = alpha * state[indexUp] - conj(beta) * state[indexLo]
832  stateVecReal[indexUp] = alphaReal*stateRealUp - alphaImag*stateImagUp
833  - betaReal*stateRealLo - betaImag*stateImagLo;
834  stateVecImag[indexUp] = alphaReal*stateImagUp + alphaImag*stateRealUp
835  - betaReal*stateImagLo + betaImag*stateRealLo;
836 
837  // state[indexLo] = beta * state[indexUp] + conj(alpha) * state[indexLo]
838  stateVecReal[indexLo] = betaReal*stateRealUp - betaImag*stateImagUp
839  + alphaReal*stateRealLo + alphaImag*stateImagLo;
840  stateVecImag[indexLo] = betaReal*stateImagUp + betaImag*stateRealUp
841  + alphaReal*stateImagLo - alphaImag*stateRealLo;
842 }

References Qureg::deviceStateVec, Complex::imag, Qureg::numAmpsPerChunk, qreal, and Complex::real.

◆ statevec_compareStates()

int statevec_compareStates ( Qureg  mq1,
Qureg  mq2,
qreal  precision 
)

Definition at line 771 of file QuEST_gpu.cu.

771  {
772  qreal diff;
773  int chunkSize = mq1.numAmpsPerChunk;
774 
775  copyStateFromGPU(mq1);
776  copyStateFromGPU(mq2);
777 
778  for (int i=0; i<chunkSize; i++){
779  diff = mq1.stateVec.real[i] - mq2.stateVec.real[i];
780  if (diff<0) diff *= -1;
781  if (diff>precision) return 0;
782  diff = mq1.stateVec.imag[i] - mq2.stateVec.imag[i];
783  if (diff<0) diff *= -1;
784  if (diff>precision) return 0;
785  }
786  return 1;
787 }

References copyStateFromGPU(), Qureg::numAmpsPerChunk, qreal, and Qureg::stateVec.

Referenced by compareStates().

◆ statevec_controlledCompactUnitary()

void statevec_controlledCompactUnitary ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
Complex  alpha,
Complex  beta 
)

Definition at line 911 of file QuEST_gpu.cu.

912 {
913  int threadsPerCUDABlock, CUDABlocks;
914  threadsPerCUDABlock = 128;
915  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
916  statevec_controlledCompactUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, alpha, beta);
917 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledCompactUnitary(), statevec_controlledRotateAroundAxis(), and statevec_controlledRotateAroundAxisConj().

◆ statevec_controlledCompactUnitaryKernel()

__global__ void statevec_controlledCompactUnitaryKernel ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
Complex  alpha,
Complex  beta 
)

fix – not necessary for GPU version

Definition at line 852 of file QuEST_gpu.cu.

852  {
853  // ----- sizes
854  long long int sizeBlock, // size of blocks
855  sizeHalfBlock; // size of blocks halved
856  // ----- indices
857  long long int thisBlock, // current block
858  indexUp,indexLo; // current index and corresponding index in lower half block
859 
860  // ----- temp variables
861  qreal stateRealUp,stateRealLo, // storage for previous state values
862  stateImagUp,stateImagLo; // (used in updates)
863  // ----- temp variables
864  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
865  long long int numTasks=qureg.numAmpsPerChunk>>1;
866  int controlBit;
867 
868  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
869  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
870 
871  // ---------------------------------------------------------------- //
872  // rotate //
873  // ---------------------------------------------------------------- //
874 
876  qreal *stateVecReal = qureg.deviceStateVec.real;
877  qreal *stateVecImag = qureg.deviceStateVec.imag;
878  qreal alphaImag=alpha.imag, alphaReal=alpha.real;
879  qreal betaImag=beta.imag, betaReal=beta.real;
880 
881  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
882  if (thisTask>=numTasks) return;
883 
884  thisBlock = thisTask / sizeHalfBlock;
885  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
886  indexLo = indexUp + sizeHalfBlock;
887 
888  controlBit = extractBit(controlQubit, indexUp);
889  if (controlBit){
890  // store current state vector values in temp variables
891  stateRealUp = stateVecReal[indexUp];
892  stateImagUp = stateVecImag[indexUp];
893 
894  stateRealLo = stateVecReal[indexLo];
895  stateImagLo = stateVecImag[indexLo];
896 
897  // state[indexUp] = alpha * state[indexUp] - conj(beta) * state[indexLo]
898  stateVecReal[indexUp] = alphaReal*stateRealUp - alphaImag*stateImagUp
899  - betaReal*stateRealLo - betaImag*stateImagLo;
900  stateVecImag[indexUp] = alphaReal*stateImagUp + alphaImag*stateRealUp
901  - betaReal*stateImagLo + betaImag*stateRealLo;
902 
903  // state[indexLo] = beta * state[indexUp] + conj(alpha) * state[indexLo]
904  stateVecReal[indexLo] = betaReal*stateRealUp - betaImag*stateImagUp
905  + alphaReal*stateRealLo + alphaImag*stateImagLo;
906  stateVecImag[indexLo] = betaReal*stateImagUp + betaImag*stateRealUp
907  + alphaReal*stateImagLo - alphaImag*stateRealLo;
908  }
909 }

References Qureg::deviceStateVec, extractBit(), Complex::imag, Qureg::numAmpsPerChunk, qreal, and Complex::real.

◆ statevec_controlledNot()

void statevec_controlledNot ( Qureg  qureg,
int  controlQubit,
int  targetQubit 
)

Definition at line 1873 of file QuEST_gpu.cu.

1874 {
1875  int threadsPerCUDABlock, CUDABlocks;
1876  threadsPerCUDABlock = 128;
1877  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1878  statevec_controlledNotKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit);
1879 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledNot().

◆ statevec_controlledNotKernel()

__global__ void statevec_controlledNotKernel ( Qureg  qureg,
int  controlQubit,
int  targetQubit 
)

Definition at line 1834 of file QuEST_gpu.cu.

1835 {
1836  long long int index;
1837  long long int sizeBlock, // size of blocks
1838  sizeHalfBlock; // size of blocks halved
1839  long long int stateVecSize;
1840  int controlBit;
1841 
1842  // ----- temp variables
1843  qreal stateRealUp, // storage for previous state values
1844  stateImagUp; // (used in updates)
1845  long long int thisBlock, // current block
1846  indexUp,indexLo; // current index and corresponding index in lower half block
1847  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1848  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1849 
1850  stateVecSize = qureg.numAmpsPerChunk;
1851  qreal *stateVecReal = qureg.deviceStateVec.real;
1852  qreal *stateVecImag = qureg.deviceStateVec.imag;
1853 
1854  index = blockIdx.x*blockDim.x + threadIdx.x;
1855  if (index>=(stateVecSize>>1)) return;
1856  thisBlock = index / sizeHalfBlock;
1857  indexUp = thisBlock*sizeBlock + index%sizeHalfBlock;
1858  indexLo = indexUp + sizeHalfBlock;
1859 
1860  controlBit = extractBit(controlQubit, indexUp);
1861  if (controlBit){
1862  stateRealUp = stateVecReal[indexUp];
1863  stateImagUp = stateVecImag[indexUp];
1864 
1865  stateVecReal[indexUp] = stateVecReal[indexLo];
1866  stateVecImag[indexUp] = stateVecImag[indexLo];
1867 
1868  stateVecReal[indexLo] = stateRealUp;
1869  stateVecImag[indexLo] = stateImagUp;
1870  }
1871 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_controlledPauliY()

void statevec_controlledPauliY ( Qureg  qureg,
int  controlQubit,
int  targetQubit 
)

Definition at line 1445 of file QuEST_gpu.cu.

1446 {
1447  int conjFactor = 1;
1448  int threadsPerCUDABlock, CUDABlocks;
1449  threadsPerCUDABlock = 128;
1450  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1451  statevec_controlledPauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, conjFactor);
1452 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledPauliY().

◆ statevec_controlledPauliYConj()

void statevec_controlledPauliYConj ( Qureg  qureg,
int  controlQubit,
int  targetQubit 
)

Definition at line 1454 of file QuEST_gpu.cu.

1455 {
1456  int conjFactor = -1;
1457  int threadsPerCUDABlock, CUDABlocks;
1458  threadsPerCUDABlock = 128;
1459  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1460  statevec_controlledPauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, conjFactor);
1461 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledPauliY().

◆ statevec_controlledPauliYKernel()

__global__ void statevec_controlledPauliYKernel ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
int  conjFac 
)

Definition at line 1409 of file QuEST_gpu.cu.

1410 {
1411  long long int index;
1412  long long int sizeBlock, sizeHalfBlock;
1413  long long int stateVecSize;
1414  int controlBit;
1415 
1416  qreal stateRealUp, stateImagUp;
1417  long long int thisBlock, indexUp, indexLo;
1418  sizeHalfBlock = 1LL << targetQubit;
1419  sizeBlock = 2LL * sizeHalfBlock;
1420 
1421  stateVecSize = qureg.numAmpsPerChunk;
1422  qreal *stateVecReal = qureg.deviceStateVec.real;
1423  qreal *stateVecImag = qureg.deviceStateVec.imag;
1424 
1425  index = blockIdx.x*blockDim.x + threadIdx.x;
1426  if (index>=(stateVecSize>>1)) return;
1427  thisBlock = index / sizeHalfBlock;
1428  indexUp = thisBlock*sizeBlock + index%sizeHalfBlock;
1429  indexLo = indexUp + sizeHalfBlock;
1430 
1431  controlBit = extractBit(controlQubit, indexUp);
1432  if (controlBit){
1433 
1434  stateRealUp = stateVecReal[indexUp];
1435  stateImagUp = stateVecImag[indexUp];
1436 
1437  // update under +-{{0, -i}, {i, 0}}
1438  stateVecReal[indexUp] = conjFac * stateVecImag[indexLo];
1439  stateVecImag[indexUp] = conjFac * -stateVecReal[indexLo];
1440  stateVecReal[indexLo] = conjFac * -stateImagUp;
1441  stateVecImag[indexLo] = conjFac * stateRealUp;
1442  }
1443 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_controlledPhaseFlip()

void statevec_controlledPhaseFlip ( Qureg  qureg,
int  idQubit1,
int  idQubit2 
)

Definition at line 1708 of file QuEST_gpu.cu.

1709 {
1710  int threadsPerCUDABlock, CUDABlocks;
1711  threadsPerCUDABlock = 128;
1712  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1713  statevec_controlledPhaseFlipKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, idQubit1, idQubit2);
1714 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledPhaseFlip().

◆ statevec_controlledPhaseFlipKernel()

__global__ void statevec_controlledPhaseFlipKernel ( Qureg  qureg,
int  idQubit1,
int  idQubit2 
)

Definition at line 1687 of file QuEST_gpu.cu.

1688 {
1689  long long int index;
1690  long long int stateVecSize;
1691  int bit1, bit2;
1692 
1693  stateVecSize = qureg.numAmpsPerChunk;
1694  qreal *stateVecReal = qureg.deviceStateVec.real;
1695  qreal *stateVecImag = qureg.deviceStateVec.imag;
1696 
1697  index = blockIdx.x*blockDim.x + threadIdx.x;
1698  if (index>=stateVecSize) return;
1699 
1700  bit1 = extractBit (idQubit1, index);
1701  bit2 = extractBit (idQubit2, index);
1702  if (bit1 && bit2) {
1703  stateVecReal [index] = - stateVecReal [index];
1704  stateVecImag [index] = - stateVecImag [index];
1705  }
1706 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_controlledPhaseShift()

void statevec_controlledPhaseShift ( Qureg  qureg,
int  idQubit1,
int  idQubit2,
qreal  angle 
)

Definition at line 1527 of file QuEST_gpu.cu.

1528 {
1529  qreal cosAngle = cos(angle);
1530  qreal sinAngle = sin(angle);
1531 
1532  int threadsPerCUDABlock, CUDABlocks;
1533  threadsPerCUDABlock = 128;
1534  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1535  statevec_controlledPhaseShiftKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, idQubit1, idQubit2, cosAngle, sinAngle);
1536 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledPhaseShift().

◆ statevec_controlledPhaseShiftKernel()

__global__ void statevec_controlledPhaseShiftKernel ( Qureg  qureg,
int  idQubit1,
int  idQubit2,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1502 of file QuEST_gpu.cu.

1503 {
1504  long long int index;
1505  long long int stateVecSize;
1506  int bit1, bit2;
1507  qreal stateRealLo, stateImagLo;
1508 
1509  stateVecSize = qureg.numAmpsPerChunk;
1510  qreal *stateVecReal = qureg.deviceStateVec.real;
1511  qreal *stateVecImag = qureg.deviceStateVec.imag;
1512 
1513  index = blockIdx.x*blockDim.x + threadIdx.x;
1514  if (index>=stateVecSize) return;
1515 
1516  bit1 = extractBit (idQubit1, index);
1517  bit2 = extractBit (idQubit2, index);
1518  if (bit1 && bit2) {
1519  stateRealLo = stateVecReal[index];
1520  stateImagLo = stateVecImag[index];
1521 
1522  stateVecReal[index] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1523  stateVecImag[index] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1524  }
1525 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_controlledUnitary()

void statevec_controlledUnitary ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
ComplexMatrix2  u 
)

Definition at line 1237 of file QuEST_gpu.cu.

1238 {
1239  int threadsPerCUDABlock, CUDABlocks;
1240  threadsPerCUDABlock = 128;
1241  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1242  statevec_controlledUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, controlQubit, targetQubit, argifyMatrix2(u));
1243 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by controlledUnitary().

◆ statevec_controlledUnitaryKernel()

__global__ void statevec_controlledUnitaryKernel ( Qureg  qureg,
int  controlQubit,
int  targetQubit,
ArgMatrix2  u 
)

fix – not necessary for GPU version

Definition at line 1179 of file QuEST_gpu.cu.

1179  {
1180  // ----- sizes
1181  long long int sizeBlock, // size of blocks
1182  sizeHalfBlock; // size of blocks halved
1183  // ----- indices
1184  long long int thisBlock, // current block
1185  indexUp,indexLo; // current index and corresponding index in lower half block
1186 
1187  // ----- temp variables
1188  qreal stateRealUp,stateRealLo, // storage for previous state values
1189  stateImagUp,stateImagLo; // (used in updates)
1190  // ----- temp variables
1191  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
1192  long long int numTasks=qureg.numAmpsPerChunk>>1;
1193 
1194  int controlBit;
1195 
1196  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1197  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1198 
1199  // ---------------------------------------------------------------- //
1200  // rotate //
1201  // ---------------------------------------------------------------- //
1202 
1204  qreal *stateVecReal = qureg.deviceStateVec.real;
1205  qreal *stateVecImag = qureg.deviceStateVec.imag;
1206 
1207  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1208  if (thisTask>=numTasks) return;
1209 
1210  thisBlock = thisTask / sizeHalfBlock;
1211  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1212  indexLo = indexUp + sizeHalfBlock;
1213 
1214  // store current state vector values in temp variables
1215  stateRealUp = stateVecReal[indexUp];
1216  stateImagUp = stateVecImag[indexUp];
1217 
1218  stateRealLo = stateVecReal[indexLo];
1219  stateImagLo = stateVecImag[indexLo];
1220 
1221  controlBit = extractBit(controlQubit, indexUp);
1222  if (controlBit){
1223  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
1224  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
1225  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
1226  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
1227  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
1228 
1229  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
1230  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
1231  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
1232  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
1233  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
1234  }
1235 }

References Qureg::deviceStateVec, extractBit(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_createQureg()

void statevec_createQureg ( Qureg *  qureg,
int  numQubits,
QuESTEnv  env 
)

Definition at line 275 of file QuEST_gpu.cu.

276 {
277  // allocate CPU memory
278  long long int numAmps = 1L << numQubits;
279  long long int numAmpsPerRank = numAmps/env.numRanks;
280  qureg->stateVec.real = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->stateVec.real)));
281  qureg->stateVec.imag = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->stateVec.imag)));
282  if (env.numRanks>1){
283  qureg->pairStateVec.real = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->pairStateVec.real)));
284  qureg->pairStateVec.imag = (qreal*) malloc(numAmpsPerRank * sizeof(*(qureg->pairStateVec.imag)));
285  }
286 
287  // check cpu memory allocation was successful
288  if ( (!(qureg->stateVec.real) || !(qureg->stateVec.imag))
289  && numAmpsPerRank ) {
290  printf("Could not allocate memory!\n");
291  exit (EXIT_FAILURE);
292  }
293  if ( env.numRanks>1 && (!(qureg->pairStateVec.real) || !(qureg->pairStateVec.imag))
294  && numAmpsPerRank ) {
295  printf("Could not allocate memory!\n");
296  exit (EXIT_FAILURE);
297  }
298 
299  qureg->numQubitsInStateVec = numQubits;
300  qureg->numAmpsPerChunk = numAmpsPerRank;
301  qureg->numAmpsTotal = numAmps;
302  qureg->chunkId = env.rank;
303  qureg->numChunks = env.numRanks;
304  qureg->isDensityMatrix = 0;
305 
306  // allocate GPU memory
307  cudaMalloc(&(qureg->deviceStateVec.real), qureg->numAmpsPerChunk*sizeof(*(qureg->deviceStateVec.real)));
308  cudaMalloc(&(qureg->deviceStateVec.imag), qureg->numAmpsPerChunk*sizeof(*(qureg->deviceStateVec.imag)));
309  cudaMalloc(&(qureg->firstLevelReduction), ceil(qureg->numAmpsPerChunk/(qreal)REDUCE_SHARED_SIZE)*sizeof(qreal));
310  cudaMalloc(&(qureg->secondLevelReduction), ceil(qureg->numAmpsPerChunk/(qreal)(REDUCE_SHARED_SIZE*REDUCE_SHARED_SIZE))*
311  sizeof(qreal));
312 
313  // check gpu memory allocation was successful
314  if (!(qureg->deviceStateVec.real) || !(qureg->deviceStateVec.imag)){
315  printf("Could not allocate memory on GPU!\n");
316  exit (EXIT_FAILURE);
317  }
318 
319 }

References Qureg::chunkId, Qureg::deviceStateVec, Qureg::firstLevelReduction, Qureg::isDensityMatrix, Qureg::numAmpsPerChunk, Qureg::numAmpsTotal, Qureg::numChunks, Qureg::numQubitsInStateVec, QuESTEnv::numRanks, Qureg::pairStateVec, qreal, QuESTEnv::rank, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and Qureg::stateVec.

Referenced by createCloneQureg(), createDensityQureg(), and createQureg().
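For sizing purposes: with numRanks = 1 the routine stores 2^numQubits amplitudes as separate real and imaginary arrays, in both RAM and GPU memory, so each copy occupies 2 × 2^numQubits × sizeof(qreal) bytes. For example, 30 qubits in double precision need 2 × 2^30 × 8 B = 16 GiB of GPU memory, plus the same again in RAM and the comparatively tiny reduction buffers.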

◆ statevec_destroyQureg()

void statevec_destroyQureg ( Qureg  qureg,
QuESTEnv  env 
)

Definition at line 321 of file QuEST_gpu.cu.

322 {
323  // Free CPU memory
324  free(qureg.stateVec.real);
325  free(qureg.stateVec.imag);
326  if (env.numRanks>1){
327  free(qureg.pairStateVec.real);
328  free(qureg.pairStateVec.imag);
329  }
330 
331  // Free GPU memory
332  cudaFree(qureg.deviceStateVec.real);
333  cudaFree(qureg.deviceStateVec.imag);
334  cudaFree(qureg.firstLevelReduction);
335  cudaFree(qureg.secondLevelReduction);
336 }

References Qureg::deviceStateVec, Qureg::firstLevelReduction, QuESTEnv::numRanks, Qureg::pairStateVec, Qureg::secondLevelReduction, and Qureg::stateVec.

Referenced by destroyQureg().

◆ statevec_findProbabilityOfZero()

qreal statevec_findProbabilityOfZero ( Qureg  qureg,
int  measureQubit 
)

Definition at line 2112 of file QuEST_gpu.cu.

2113 {
2114  long long int numValuesToReduce = qureg.numAmpsPerChunk>>1;
2115  int valuesPerCUDABlock, numCUDABlocks, sharedMemSize;
2116  qreal stateProb=0;
2117  int firstTime=1;
2118  int maxReducedPerLevel = REDUCE_SHARED_SIZE;
2119 
2120  while(numValuesToReduce>1){
2121  if (numValuesToReduce<maxReducedPerLevel){
2122  // Need less than one CUDA block to reduce values
2123  valuesPerCUDABlock = numValuesToReduce;
2124  numCUDABlocks = 1;
2125  } else {
2126  // Use full CUDA blocks, with block size constrained by shared mem usage
2127  valuesPerCUDABlock = maxReducedPerLevel;
2128  numCUDABlocks = ceil((qreal)numValuesToReduce/valuesPerCUDABlock);
2129  }
2130  sharedMemSize = valuesPerCUDABlock*sizeof(qreal);
2131 
2132  if (firstTime){
2133  statevec_findProbabilityOfZeroKernel<<<numCUDABlocks, valuesPerCUDABlock, sharedMemSize>>>(
2134  qureg, measureQubit, qureg.firstLevelReduction);
2135  firstTime=0;
2136  } else {
2137  cudaDeviceSynchronize();
2138  copySharedReduceBlock<<<numCUDABlocks, valuesPerCUDABlock/2, sharedMemSize>>>(
2139  qureg.firstLevelReduction,
2140  qureg.secondLevelReduction, valuesPerCUDABlock);
2141  cudaDeviceSynchronize();
2142  swapDouble(&(qureg.firstLevelReduction), &(qureg.secondLevelReduction));
2143  }
2144  numValuesToReduce = numValuesToReduce/maxReducedPerLevel;
2145  }
2146  cudaMemcpy(&stateProb, qureg.firstLevelReduction, sizeof(qreal), cudaMemcpyDeviceToHost);
2147  return stateProb;
2148 }

References copySharedReduceBlock(), Qureg::firstLevelReduction, Qureg::numAmpsPerChunk, qreal, REDUCE_SHARED_SIZE, Qureg::secondLevelReduction, and swapDouble().

Referenced by statevec_calcProbOfOutcome().
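As a concrete trace of the reduction loop: for a 21-qubit register, numValuesToReduce begins at 2^20. The first pass launches 2048 blocks of 512 values each into firstLevelReduction; the second pass reduces those 2048 partial sums to 4; the third reduces the final 4 values in a single block. Integer division then drops numValuesToReduce below 2, the loop exits, and (after the buffer swaps via swapDouble) the probability is read from firstLevelReduction[0].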

◆ statevec_findProbabilityOfZeroKernel()

__global__ void statevec_findProbabilityOfZeroKernel ( Qureg  qureg,
int  measureQubit,
qreal *  reducedArray 
)

Definition at line 1998 of file QuEST_gpu.cu.

2000  {
2001  // ----- sizes
2002  long long int sizeBlock, // size of blocks
2003  sizeHalfBlock; // size of blocks halved
2004  // ----- indices
2005  long long int thisBlock, // current block
2006  index; // current index for first half block
2007  // ----- temp variables
2008  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
2009  long long int numTasks=qureg.numAmpsPerChunk>>1;
2010  // (good for shared memory parallelism)
2011 
2012  extern __shared__ qreal tempReductionArray[];
2013 
2014  // ---------------------------------------------------------------- //
2015  // dimensions //
2016  // ---------------------------------------------------------------- //
2017  sizeHalfBlock = 1LL << (measureQubit); // number of state vector elements to sum,
2018  // and then the number to skip
2019  sizeBlock = 2LL * sizeHalfBlock; // size of blocks (pairs of measure and skip entries)
2020 
2021  // ---------------------------------------------------------------- //
2022  // find probability //
2023  // ---------------------------------------------------------------- //
2024 
2025  //
2026  // --- task-based shared-memory parallel implementation
2027  //
2028 
2029  qreal *stateVecReal = qureg.deviceStateVec.real;
2030  qreal *stateVecImag = qureg.deviceStateVec.imag;
2031 
2032  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
2033  if (thisTask>=numTasks) return;
2034 
2035  thisBlock = thisTask / sizeHalfBlock;
2036  index = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
2037  qreal realVal, imagVal;
2038  realVal = stateVecReal[index];
2039  imagVal = stateVecImag[index];
2040  tempReductionArray[threadIdx.x] = realVal*realVal + imagVal*imagVal;
2041  __syncthreads();
2042 
2043  if (threadIdx.x<blockDim.x/2){
2044  reduceBlock(tempReductionArray, reducedArray, blockDim.x);
2045  }
2046 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, qreal, and reduceBlock().

◆ statevec_getImagAmp()

qreal statevec_getImagAmp ( Qureg  qureg,
long long int  index 
)

Definition at line 576 of file QuEST_gpu.cu.

576  {
577  qreal el=0;
578  cudaMemcpy(&el, &(qureg.deviceStateVec.imag[index]),
579  sizeof(*(qureg.deviceStateVec.imag)), cudaMemcpyDeviceToHost);
580  return el;
581 }

References Qureg::deviceStateVec, and qreal.

Referenced by getAmp(), getDensityAmp(), getImagAmp(), and statevec_getProbAmp().

◆ statevec_getRealAmp()

qreal statevec_getRealAmp ( Qureg  qureg,
long long int  index 
)

Definition at line 569 of file QuEST_gpu.cu.

569  {
570  qreal el=0;
571  cudaMemcpy(&el, &(qureg.deviceStateVec.real[index]),
572  sizeof(*(qureg.deviceStateVec.real)), cudaMemcpyDeviceToHost);
573  return el;
574 }

References Qureg::deviceStateVec, and qreal.

Referenced by getAmp(), getDensityAmp(), getRealAmp(), and statevec_getProbAmp().

◆ statevec_hadamard()

void statevec_hadamard ( Qureg  qureg,
int  targetQubit 
)

Definition at line 1826 of file QuEST_gpu.cu.

1827 {
1828  int threadsPerCUDABlock, CUDABlocks;
1829  threadsPerCUDABlock = 128;
1830  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1831  statevec_hadamardKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit);
1832 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by agnostic_applyQFT(), and hadamard().

◆ statevec_hadamardKernel()

__global__ void statevec_hadamardKernel ( Qureg  qureg,
int  targetQubit 
)

fix – not necessary for GPU version

Definition at line 1777 of file QuEST_gpu.cu.

1777  {
1778  // ----- sizes
1779  long long int sizeBlock, // size of blocks
1780  sizeHalfBlock; // size of blocks halved
1781  // ----- indices
1782  long long int thisBlock, // current block
1783  indexUp,indexLo; // current index and corresponding index in lower half block
1784 
1785  // ----- temp variables
1786  qreal stateRealUp,stateRealLo, // storage for previous state values
1787  stateImagUp,stateImagLo; // (used in updates)
1788  // ----- temp variables
1789  long long int thisTask; // task-based approach to expose loop parallelism with small granularity
1790  long long int numTasks=qureg.numAmpsPerChunk>>1;
1791 
1792  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1793  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1794 
1795  // ---------------------------------------------------------------- //
1796  // rotate //
1797  // ---------------------------------------------------------------- //
1798 
1800  qreal *stateVecReal = qureg.deviceStateVec.real;
1801  qreal *stateVecImag = qureg.deviceStateVec.imag;
1802 
1803  qreal recRoot2 = 1.0/sqrt(2.0);
1804 
1805  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1806  if (thisTask>=numTasks) return;
1807 
1808  thisBlock = thisTask / sizeHalfBlock;
1809  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1810  indexLo = indexUp + sizeHalfBlock;
1811 
1812  // store current state vector values in temp variables
1813  stateRealUp = stateVecReal[indexUp];
1814  stateImagUp = stateVecImag[indexUp];
1815 
1816  stateRealLo = stateVecReal[indexLo];
1817  stateImagLo = stateVecImag[indexLo];
1818 
1819  stateVecReal[indexUp] = recRoot2*(stateRealUp + stateRealLo);
1820  stateVecImag[indexUp] = recRoot2*(stateImagUp + stateImagLo);
1821 
1822  stateVecReal[indexLo] = recRoot2*(stateRealUp - stateRealLo);
1823  stateVecImag[indexLo] = recRoot2*(stateImagUp - stateImagLo);
1824 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_initBlankState()

void statevec_initBlankState ( Qureg  qureg)

Definition at line 593 of file QuEST_gpu.cu.

594 {
595  int threadsPerCUDABlock, CUDABlocks;
596  threadsPerCUDABlock = 128;
597  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
598  statevec_initBlankStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
599  qureg.numAmpsPerChunk,
600  qureg.deviceStateVec.real,
601  qureg.deviceStateVec.imag);
602 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initBlankState(), and statevec_applyPauliSum().

◆ statevec_initBlankStateKernel()

__global__ void statevec_initBlankStateKernel ( long long int  stateVecSize,
qreal *  stateVecReal,
qreal *  stateVecImag 
)

Definition at line 583 of file QuEST_gpu.cu.

583  {
584  long long int index;
585 
586  // initialise the statevector to be all-zeros
587  index = blockIdx.x*blockDim.x + threadIdx.x;
588  if (index>=stateVecSize) return;
589  stateVecReal[index] = 0.0;
590  stateVecImag[index] = 0.0;
591 }

◆ statevec_initClassicalState()

void statevec_initClassicalState ( Qureg  qureg,
long long int  stateInd 
)

Definition at line 668 of file QuEST_gpu.cu.

669 {
670  int threadsPerCUDABlock, CUDABlocks;
671  threadsPerCUDABlock = 128;
672  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
673  statevec_initClassicalStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
674  qureg.numAmpsPerChunk,
675  qureg.deviceStateVec.real,
676  qureg.deviceStateVec.imag, stateInd);
677 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initClassicalState().

◆ statevec_initClassicalStateKernel()

__global__ void statevec_initClassicalStateKernel ( long long int  stateVecSize,
qreal *  stateVecReal,
qreal *  stateVecImag,
long long int  stateInd 
)

Definition at line 653 of file QuEST_gpu.cu.

653  {
654  long long int index;
655 
656  // initialise the state to |stateInd>
657  index = blockIdx.x*blockDim.x + threadIdx.x;
658  if (index>=stateVecSize) return;
659  stateVecReal[index] = 0.0;
660  stateVecImag[index] = 0.0;
661 
662  if (index==stateInd){
663  // classical state has probability 1
664  stateVecReal[stateInd] = 1.0;
665  stateVecImag[stateInd] = 0.0;
666  }
667 }

◆ statevec_initDebugState()

void statevec_initDebugState ( Qureg  qureg)

Initialise the state vector of probability amplitudes to an (unphysical) state in which each component of each probability amplitude is a unique floating-point value.

For debugging purposes.

Parameters
[in,out]  qureg  object representing the set of qubits to be initialised

Definition at line 689 of file QuEST_gpu.cu.

690 {
691  int threadsPerCUDABlock, CUDABlocks;
692  threadsPerCUDABlock = 128;
693  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
694  statevec_initDebugStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
695  qureg.numAmpsPerChunk,
696  qureg.deviceStateVec.real,
697  qureg.deviceStateVec.imag);
698 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initDebugState().

◆ statevec_initDebugStateKernel()

__global__ void statevec_initDebugStateKernel ( long long int  stateVecSize,
qreal *  stateVecReal,
qreal *  stateVecImag 
)

Definition at line 679 of file QuEST_gpu.cu.

679  {
680  long long int index;
681 
682  index = blockIdx.x*blockDim.x + threadIdx.x;
683  if (index>=stateVecSize) return;
684 
685  stateVecReal[index] = (index*2.0)/10.0;
686  stateVecImag[index] = (index*2.0+1.0)/10.0;
687 }

◆ statevec_initPlusState()

void statevec_initPlusState ( Qureg  qureg)

Definition at line 642 of file QuEST_gpu.cu.

643 {
644  int threadsPerCUDABlock, CUDABlocks;
645  threadsPerCUDABlock = 128;
646  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
647  statevec_initPlusStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
648  qureg.numAmpsPerChunk,
649  qureg.deviceStateVec.real,
650  qureg.deviceStateVec.imag);
651 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initPlusState().

◆ statevec_initPlusStateKernel()

__global__ void statevec_initPlusStateKernel ( long long int  stateVecSize,
qreal *  stateVecReal,
qreal *  stateVecImag 
)

Definition at line 631 of file QuEST_gpu.cu.

631  {
632  long long int index;
633 
634  index = blockIdx.x*blockDim.x + threadIdx.x;
635  if (index>=stateVecSize) return;
636 
637  qreal normFactor = 1.0/sqrt((qreal)stateVecSize);
638  stateVecReal[index] = normFactor;
639  stateVecImag[index] = 0.0;
640 }

References qreal.

◆ statevec_initStateFromSingleFile()

int statevec_initStateFromSingleFile ( Qureg *  qureg,
char  filename[200],
QuESTEnv  env 
)

Definition at line 727 of file QuEST_gpu.cu.

727  {
728  long long int chunkSize, stateVecSize;
729  long long int indexInChunk, totalIndex;
730 
731  chunkSize = qureg->numAmpsPerChunk;
732  stateVecSize = chunkSize*qureg->numChunks;
733 
734  qreal *stateVecReal = qureg->stateVec.real;
735  qreal *stateVecImag = qureg->stateVec.imag;
736 
737  FILE *fp;
738  char line[200];
739 
740  fp = fopen(filename, "r");
741  if (fp == NULL)
742  return 0;
743 
744  indexInChunk = 0; totalIndex = 0;
745  while (fgets(line, sizeof(char)*200, fp) != NULL && totalIndex<stateVecSize){
746  if (line[0]!='#'){
747  int chunkId = totalIndex/chunkSize;
748  if (chunkId==qureg->chunkId){
749  # if QuEST_PREC==1
750  sscanf(line, "%f, %f", &(stateVecReal[indexInChunk]),
751  &(stateVecImag[indexInChunk]));
752  # elif QuEST_PREC==2
753  sscanf(line, "%lf, %lf", &(stateVecReal[indexInChunk]),
754  &(stateVecImag[indexInChunk]));
755  # elif QuEST_PREC==4
756  sscanf(line, "%Lf, %Lf", &(stateVecReal[indexInChunk]),
757  &(stateVecImag[indexInChunk]));
758  # endif
759  indexInChunk += 1;
760  }
761  totalIndex += 1;
762  }
763  }
764  fclose(fp);
765  copyStateToGPU(*qureg);
766 
767  // indicate success
768  return 1;
769 }

References Qureg::chunkId, copyStateToGPU(), Qureg::numAmpsPerChunk, Qureg::numChunks, qreal, and Qureg::stateVec.

Referenced by initStateFromSingleFile().
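The sscanf calls above imply the file format: one comma-separated 're, im' amplitude pair per line, in basis-state order, with lines beginning with '#' treated as comments. An illustrative single-qubit |+> state file:

    # 1-qubit plus state: one "re, im" pair per basis state
    0.7071067811865476, 0.0
    0.7071067811865476, 0.0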

◆ statevec_initStateOfSingleQubit()

void statevec_initStateOfSingleQubit ( Qureg *  qureg,
int  qubitId,
int  outcome 
)

Initialise the state vector of probability amplitudes such that one qubit is set to 'outcome' and all other qubits are in an equal superposition of zero and one.

Parameters
[in,out]  qureg    object representing the set of qubits to be initialised
[in]      qubitId  id of the qubit to set to state 'outcome'
[in]      outcome  the outcome to which qubit 'qubitId' is set

Definition at line 718 of file QuEST_gpu.cu.

719 {
720  int threadsPerCUDABlock, CUDABlocks;
721  threadsPerCUDABlock = 128;
722  CUDABlocks = ceil((qreal)(qureg->numAmpsPerChunk)/threadsPerCUDABlock);
723  statevec_initStateOfSingleQubitKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg->numAmpsPerChunk, qureg->deviceStateVec.real, qureg->deviceStateVec.imag, qubitId, outcome);
724 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initStateOfSingleQubit().

◆ statevec_initStateOfSingleQubitKernel()

__global__ void statevec_initStateOfSingleQubitKernel ( long long int  stateVecSize,
qreal *  stateVecReal,
qreal *  stateVecImag,
int  qubitId,
int  outcome 
)

Definition at line 700 of file QuEST_gpu.cu.

700  {
701  long long int index;
702  int bit;
703 
704  index = blockIdx.x*blockDim.x + threadIdx.x;
705  if (index>=stateVecSize) return;
706 
707  qreal normFactor = 1.0/sqrt((qreal)stateVecSize/2);
708  bit = extractBit(qubitId, index);
709  if (bit==outcome) {
710  stateVecReal[index] = normFactor;
711  stateVecImag[index] = 0.0;
712  } else {
713  stateVecReal[index] = 0.0;
714  stateVecImag[index] = 0.0;
715  }
716 }

References extractBit(), and qreal.

◆ statevec_initZeroState()

void statevec_initZeroState ( Qureg  qureg)

Definition at line 620 of file QuEST_gpu.cu.

621 {
622  int threadsPerCUDABlock, CUDABlocks;
623  threadsPerCUDABlock = 128;
624  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
625  statevec_initZeroStateKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
626  qureg.numAmpsPerChunk,
627  qureg.deviceStateVec.real,
628  qureg.deviceStateVec.imag);
629 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

Referenced by initZeroState().

◆ statevec_initZeroStateKernel()

__global__ void statevec_initZeroStateKernel ( long long int  stateVecSize,
qreal *  stateVecReal,
qreal *  stateVecImag 
)

Definition at line 604 of file QuEST_gpu.cu.

604  {
605  long long int index;
606 
607  // initialise the state to |0000..0000>
608  index = blockIdx.x*blockDim.x + threadIdx.x;
609  if (index>=stateVecSize) return;
610  stateVecReal[index] = 0.0;
611  stateVecImag[index] = 0.0;
612 
613  if (index==0){
614  // zero state |0000..0000> has probability 1
615  stateVecReal[0] = 1.0;
616  stateVecImag[0] = 0.0;
617  }
618 }

◆ statevec_multiControlledMultiQubitNot()

void statevec_multiControlledMultiQubitNot ( Qureg  qureg,
int  ctrlMask,
int  targMask 
)

Definition at line 1918 of file QuEST_gpu.cu.

1918  {
1919 
1920  int numThreadsPerBlock = 128;
1921  int numBlocks = ceil(qureg.numAmpsPerChunk / (qreal) numThreadsPerBlock);
1922  statevec_multiControlledMultiQubitNotKernel<<<numBlocks, numThreadsPerBlock>>>(qureg, ctrlMask, targMask);
1923 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by multiControlledMultiQubitNot(), and multiQubitNot().

◆ statevec_multiControlledMultiQubitNotKernel()

__global__ void statevec_multiControlledMultiQubitNotKernel ( Qureg  qureg,
int  ctrlMask,
int  targMask 
)

Definition at line 1881 of file QuEST_gpu.cu.

1881  {
1882 
1883  qreal* stateRe = qureg.deviceStateVec.real;
1884  qreal* stateIm = qureg.deviceStateVec.imag;
1885 
1886  // although each thread swaps/updates two amplitudes, we still invoke one thread per amp
1887  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
1888  if (ampInd >= qureg.numAmpsPerChunk)
1889  return;
1890 
1891  // modify amplitudes only if control qubits are 1 for this state
1892  if (ctrlMask && ((ctrlMask & ampInd) != ctrlMask))
1893  return;
1894 
1895  long long int mateInd = ampInd ^ targMask;
1896 
1897  // if the mate is lower index, another thread is handling it
1898  if (mateInd < ampInd)
1899  return;
1900 
1901  /* it may seem wasteful to spawn more threads than are needed, and abort
1902  * half of them due to the amp pairing above (and potentially abort
1903  * an exponential number due to ctrlMask). however, since we are moving
1904  * global memory directly in a potentially non-contiguous fashion, this
1905  * method is likely to be memory bandwidth bottlenecked anyway
1906  */
1907 
1908  qreal mateRe = stateRe[mateInd];
1909  qreal mateIm = stateIm[mateInd];
1910 
1911  // swap amp with mate
1912  stateRe[mateInd] = stateRe[ampInd];
1913  stateIm[mateInd] = stateIm[ampInd];
1914  stateRe[ampInd] = mateRe;
1915  stateIm[ampInd] = mateIm;
1916 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.
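For example, with targMask = 0b0101 (targets 0 and 2) and ctrlMask = 0, the thread at ampInd = 0b0010 computes mateInd = 0b0010 ^ 0b0101 = 0b0111 and performs the swap for both amplitudes, while the thread launched for ampInd = 0b0111 finds its mate at a lower index and returns immediately.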

◆ statevec_multiControlledMultiQubitUnitary()

void statevec_multiControlledMultiQubitUnitary ( Qureg  qureg,
long long int  ctrlMask,
int *  targs,
int  numTargs,
ComplexMatrixN  u 
)

This calls swapQubitAmps only when it would involve a distributed communication; if the qubit chunks already fit in the node, it applies the unitary directly.

It is already guaranteed here that all target qubits can fit on each node (this is validated in the front-end)

Todo:
refactor so that the 'swap back' isn't performed; instead the qubit locations are updated.

Definition at line 1039 of file QuEST_gpu.cu.

1040 {
1041  int threadsPerCUDABlock = 128;
1042  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>numTargs)/threadsPerCUDABlock);
1043 
1044  // allocate device space for global {targs} (length: numTargs) and populate
1045  int *d_targs;
1046  size_t targMemSize = numTargs * sizeof *d_targs;
1047  cudaMalloc(&d_targs, targMemSize);
1048  cudaMemcpy(d_targs, targs, targMemSize, cudaMemcpyHostToDevice);
1049 
1050  // flatten out the u.real and u.imag lists
1051  int uNumRows = (1 << u.numQubits);
1052  qreal* uReFlat = (qreal*) malloc(uNumRows*uNumRows * sizeof *uReFlat);
1053  qreal* uImFlat = (qreal*) malloc(uNumRows*uNumRows * sizeof *uImFlat);
1054  long long int i = 0;
1055  for (int r=0; r < uNumRows; r++)
1056  for (int c=0; c < uNumRows; c++) {
1057  uReFlat[i] = u.real[r][c];
1058  uImFlat[i] = u.imag[r][c];
1059  i++;
1060  }
1061 
1062  // allocate device space for global u.real and u.imag (flatten by concatenating rows) and populate
1063  qreal* d_uRe;
1064  qreal* d_uIm;
1065  size_t uMemSize = uNumRows*uNumRows * sizeof *d_uRe; // size of each of d_uRe and d_uIm
1066  cudaMalloc(&d_uRe, uMemSize);
1067  cudaMalloc(&d_uIm, uMemSize);
1068  cudaMemcpy(d_uRe, uReFlat, uMemSize, cudaMemcpyHostToDevice);
1069  cudaMemcpy(d_uIm, uImFlat, uMemSize, cudaMemcpyHostToDevice);
1070 
1071  // allocate device Wspace for thread-local {ampInds}, {reAmps}, {imAmps} (length: 1<<numTargs)
1072  long long int *d_ampInds;
1073  qreal *d_reAmps;
1074  qreal *d_imAmps;
1075  size_t gridSize = (size_t) threadsPerCUDABlock * CUDABlocks;
1076  int numTargAmps = uNumRows;
1077  cudaMalloc(&d_ampInds, numTargAmps*gridSize * sizeof *d_ampInds);
1078  cudaMalloc(&d_reAmps, numTargAmps*gridSize * sizeof *d_reAmps);
1079  cudaMalloc(&d_imAmps, numTargAmps*gridSize * sizeof *d_imAmps);
1080 
1081  // call kernel
1082  statevec_multiControlledMultiQubitUnitaryKernel<<<CUDABlocks,threadsPerCUDABlock>>>(
1083  qureg, ctrlMask, d_targs, numTargs, d_uRe, d_uIm, d_ampInds, d_reAmps, d_imAmps, numTargAmps);
1084 
1085  // free kernel memory
1086  free(uReFlat);
1087  free(uImFlat);
1088  cudaFree(d_targs);
1089  cudaFree(d_uRe);
1090  cudaFree(d_uIm);
1091  cudaFree(d_ampInds);
1092  cudaFree(d_reAmps);
1093  cudaFree(d_imAmps);
1094 }

References ComplexMatrixN::imag, Qureg::numAmpsPerChunk, ComplexMatrixN::numQubits, qreal, and ComplexMatrixN::real.

Referenced by applyMultiControlledMatrixN(), densmatr_applyMultiQubitKrausSuperoperator(), densmatr_applyTwoQubitKrausSuperoperator(), multiControlledMultiQubitUnitary(), statevec_controlledMultiQubitUnitary(), and statevec_multiQubitUnitary().

◆ statevec_multiControlledMultiQubitUnitaryKernel()

__global__ void statevec_multiControlledMultiQubitUnitaryKernel ( Qureg  qureg,
long long int  ctrlMask,
int *  targs,
int  numTargs,
qreal *  uRe,
qreal *  uIm,
long long int *  ampInds,
qreal *  reAmps,
qreal *  imAmps,
long long int  numTargAmps 
)

Definition at line 980 of file QuEST_gpu.cu.

983 {
984 
985  // decide the amplitudes this thread will modify
986  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
987  long long int numTasks = qureg.numAmpsPerChunk >> numTargs; // kernel called on every 1 in 2^numTargs amplitudes
988  if (thisTask>=numTasks) return;
989 
990  // find this task's start index (where all targs are 0)
991  long long int ind00 = insertZeroBits(thisTask, targs, numTargs);
992 
993  // this task only modifies amplitudes if control qubits are 1 for this state
994  if (ctrlMask && (ctrlMask&ind00) != ctrlMask)
995  return;
996 
997  qreal *reVec = qureg.deviceStateVec.real;
998  qreal *imVec = qureg.deviceStateVec.imag;
999 
1000  /*
1001  each thread needs:
1002  long long int ampInds[numAmps];
1003  qreal reAmps[numAmps];
1004  qreal imAmps[numAmps];
1005  but instead has access to shared arrays, with below stride and offset
1006  */
1007  size_t stride = gridDim.x*blockDim.x;
1008  size_t offset = blockIdx.x*blockDim.x + threadIdx.x;
1009 
1010  // determine the indices and record values of target amps
1011  long long int ind;
1012  for (int i=0; i < numTargAmps; i++) {
1013 
1014  // get global index of current target qubit assignment
1015  ind = ind00;
1016  for (int t=0; t < numTargs; t++)
1017  if (extractBit(t, i))
1018  ind = flipBit(ind, targs[t]);
1019 
1020  ampInds[i*stride+offset] = ind;
1021  reAmps [i*stride+offset] = reVec[ind];
1022  imAmps [i*stride+offset] = imVec[ind];
1023  }
1024 
1025  // update the amplitudes
1026  for (int r=0; r < numTargAmps; r++) {
1027  ind = ampInds[r*stride+offset];
1028  reVec[ind] = 0;
1029  imVec[ind] = 0;
1030  for (int c=0; c < numTargAmps; c++) {
1031  qreal uReElem = uRe[c + r*numTargAmps];
1032  qreal uImElem = uIm[c + r*numTargAmps];
1033  reVec[ind] += reAmps[c*stride+offset]*uReElem - imAmps[c*stride+offset]*uImElem;
1034  imVec[ind] += reAmps[c*stride+offset]*uImElem + imAmps[c*stride+offset]*uReElem;
1035  }
1036  }
1037 }

References Qureg::deviceStateVec, extractBit(), flipBit(), insertZeroBits(), Qureg::numAmpsPerChunk, and qreal.

◆ statevec_multiControlledMultiRotateZ()

void statevec_multiControlledMultiRotateZ ( Qureg  qureg,
long long int  ctrlMask,
long long int  targMask,
qreal  angle 
)

Definition at line 1621 of file QuEST_gpu.cu.

1622 {
1623  qreal cosAngle = cos(angle/2.0);
1624  qreal sinAngle = sin(angle/2.0);
1625 
1626  int threadsPerCUDABlock, CUDABlocks;
1627  threadsPerCUDABlock = 128;
1628  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1629  statevec_multiControlledMultiRotateZKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, ctrlMask, targMask, cosAngle, sinAngle);
1630 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by multiControlledMultiRotateZ(), and statevec_multiControlledMultiRotatePauli().

◆ statevec_multiControlledMultiRotateZKernel()

__global__ void statevec_multiControlledMultiRotateZKernel ( Qureg  qureg,
long long int  ctrlMask,
long long int  targMask,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1599 of file QuEST_gpu.cu.

1599  {
1600 
1601  long long int stateVecSize = qureg.numAmpsPerChunk;
1602  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
1603  if (index>=stateVecSize) return;
1604 
1605  // amplitudes corresponding to control qubits not all-in-one are unmodified
1606  if (ctrlMask && ((ctrlMask & index) != ctrlMask))
1607  return;
1608 
1609  qreal *stateVecReal = qureg.deviceStateVec.real;
1610  qreal *stateVecImag = qureg.deviceStateVec.imag;
1611 
1612  // avoid warp divergence, setting fac = +- 1
1613  int fac = 1-2*getBitMaskParity(targMask & index);
1614  qreal stateReal = stateVecReal[index];
1615  qreal stateImag = stateVecImag[index];
1616 
1617  stateVecReal[index] = cosAngle*stateReal + fac * sinAngle*stateImag;
1618  stateVecImag[index] = - fac * sinAngle*stateReal + cosAngle*stateImag;
1619 }

References Qureg::deviceStateVec, getBitMaskParity(), Qureg::numAmpsPerChunk, and qreal.
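In closed form, writing p(n) for the parity of the targeted bits of basis index n (computed via getBitMaskParity, with fac = (-1)^{p(n)}), the kernel applies the diagonal action

    \alpha_n \mapsto e^{-i(-1)^{p(n)}\,\theta/2}\,\alpha_n, \qquad p(n) = \bigoplus_{t \in \mathrm{targets}} n_t,

so even-parity amplitudes acquire the phase e^{-i\theta/2} and odd-parity amplitudes e^{+i\theta/2}.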

◆ statevec_multiControlledPhaseFlip()

void statevec_multiControlledPhaseFlip ( Qureg  qureg,
int *  controlQubits,
int  numControlQubits 
)

Definition at line 1734 of file QuEST_gpu.cu.

1735 {
1736  int threadsPerCUDABlock, CUDABlocks;
1737  long long int mask = getQubitBitMask(controlQubits, numControlQubits);
1738  threadsPerCUDABlock = 128;
1739  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1740  statevec_multiControlledPhaseFlipKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask);
1741 }

References getQubitBitMask(), Qureg::numAmpsPerChunk, and qreal.

Referenced by multiControlledPhaseFlip().
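
A minimal usage sketch (assuming a qureg created as in the other examples; illustrative only):

 // Negate every amplitude whose basis state has qubits 0 and 2 both in |1>.
 int controls[] = {0, 2};
 statevec_multiControlledPhaseFlip(qureg, controls, 2);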

◆ statevec_multiControlledPhaseFlipKernel()

__global__ void statevec_multiControlledPhaseFlipKernel ( Qureg  qureg,
long long int  mask 
)

Definition at line 1716 of file QuEST_gpu.cu.

1717 {
1718  long long int index;
1719  long long int stateVecSize;
1720 
1721  stateVecSize = qureg.numAmpsPerChunk;
1722  qreal *stateVecReal = qureg.deviceStateVec.real;
1723  qreal *stateVecImag = qureg.deviceStateVec.imag;
1724 
1725  index = blockIdx.x*blockDim.x + threadIdx.x;
1726  if (index>=stateVecSize) return;
1727 
1728  if (mask == (mask & index) ){
1729  stateVecReal [index] = - stateVecReal [index];
1730  stateVecImag [index] = - stateVecImag [index];
1731  }
1732 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_multiControlledPhaseShift()

void statevec_multiControlledPhaseShift ( Qureg  qureg,
int *  controlQubits,
int  numControlQubits,
qreal  angle 
)

Definition at line 1558 of file QuEST_gpu.cu.

1559 {
1560  qreal cosAngle = cos(angle);
1561  qreal sinAngle = sin(angle);
1562 
1563  long long int mask = getQubitBitMask(controlQubits, numControlQubits);
1564 
1565  int threadsPerCUDABlock, CUDABlocks;
1566  threadsPerCUDABlock = 128;
1567  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1568  statevec_multiControlledPhaseShiftKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask, cosAngle, sinAngle);
1569 }

References getQubitBitMask(), Qureg::numAmpsPerChunk, and qreal.

Referenced by multiControlledPhaseShift().

◆ statevec_multiControlledPhaseShiftKernel()

__global__ void statevec_multiControlledPhaseShiftKernel ( Qureg  qureg,
long long int  mask,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1538 of file QuEST_gpu.cu.

1538  {
1539  qreal stateRealLo, stateImagLo;
1540  long long int index;
1541  long long int stateVecSize;
1542 
1543  stateVecSize = qureg.numAmpsPerChunk;
1544  qreal *stateVecReal = qureg.deviceStateVec.real;
1545  qreal *stateVecImag = qureg.deviceStateVec.imag;
1546 
1547  index = blockIdx.x*blockDim.x + threadIdx.x;
1548  if (index>=stateVecSize) return;
1549 
1550  if (mask == (mask & index) ){
1551  stateRealLo = stateVecReal[index];
1552  stateImagLo = stateVecImag[index];
1553  stateVecReal[index] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1554  stateVecImag[index] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1555  }
1556 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_multiControlledTwoQubitUnitary()

void statevec_multiControlledTwoQubitUnitary ( Qureg  qureg,
long long int  ctrlMask,
int  q1,
int  q2,
ComplexMatrix4  u 
)

This calls swapQubitAmps only when it would involve distributed communication; if the qubit chunks already fit in the node, it applies the unitary directly.

Note the order of q1 and q2 in the call to twoQubitUnitaryLocal is important.

Todo:

refactor so that the 'swap back' isn't performed; instead the qubit locations are updated.

the double swap (q1,q2 to 0,1) may be achievable simultaneously by a bespoke swap routine.

Definition at line 1172 of file QuEST_gpu.cu.

1173 {
1174  int threadsPerCUDABlock = 128;
1175  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>2)/threadsPerCUDABlock); // one kernel eval for every 4 amplitudes
1176  statevec_multiControlledTwoQubitUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, ctrlMask, q1, q2, argifyMatrix4(u));
1177 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by densmatr_applyKrausSuperoperator(), multiControlledTwoQubitUnitary(), statevec_controlledTwoQubitUnitary(), and statevec_twoQubitUnitary().
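
The swap-apply-swap strategy described above can be sketched as follows; the helper names are hypothetical and the test is schematic (the GPU definition above launches the kernel directly, since all amplitudes are device-resident):

 // Hypothetical sketch of the distributed path, not this file's code:
 // if (!qubitsFitInChunk(qureg, q1, q2)) {
 //     statevec_swapQubitAmps(qureg, q1, 0);        // move q1, q2 into
 //     statevec_swapQubitAmps(qureg, q2, 1);        // chunk-local positions
 //     twoQubitUnitaryLocal(qureg, 0, 1, u);        // no communication needed
 //     statevec_swapQubitAmps(qureg, q2, 1);        // swap back (see Todo)
 //     statevec_swapQubitAmps(qureg, q1, 0);
 // }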

◆ statevec_multiControlledTwoQubitUnitaryKernel()

__global__ void statevec_multiControlledTwoQubitUnitaryKernel ( Qureg  qureg,
long long int  ctrlMask,
int  q1,
int  q2,
ArgMatrix4  u 
)

Definition at line 1096 of file QuEST_gpu.cu.

1096  {
1097 
1098  // decide the 4 amplitudes this thread will modify
1099  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1100  long long int numTasks = qureg.numAmpsPerChunk >> 2; // one thread per four amplitudes
1101  if (thisTask>=numTasks) return;
1102 
1103  qreal *reVec = qureg.deviceStateVec.real;
1104  qreal *imVec = qureg.deviceStateVec.imag;
1105 
1106  // find indices of amplitudes to modify (treat q1 as the least significant bit)
1107  long long int ind00, ind01, ind10, ind11;
1108  ind00 = insertTwoZeroBits(thisTask, q1, q2);
1109 
1110  // modify only if control qubits are 1 for this state
1111  if (ctrlMask && (ctrlMask&ind00) != ctrlMask)
1112  return;
1113 
1114  ind01 = flipBit(ind00, q1);
1115  ind10 = flipBit(ind00, q2);
1116  ind11 = flipBit(ind01, q2);
1117 
1118  // extract statevec amplitudes
1119  qreal re00, re01, re10, re11;
1120  qreal im00, im01, im10, im11;
1121  re00 = reVec[ind00]; im00 = imVec[ind00];
1122  re01 = reVec[ind01]; im01 = imVec[ind01];
1123  re10 = reVec[ind10]; im10 = imVec[ind10];
1124  re11 = reVec[ind11]; im11 = imVec[ind11];
1125 
1126  // apply u * {amp00, amp01, amp10, amp11}
1127  reVec[ind00] =
1128  u.r0c0.real*re00 - u.r0c0.imag*im00 +
1129  u.r0c1.real*re01 - u.r0c1.imag*im01 +
1130  u.r0c2.real*re10 - u.r0c2.imag*im10 +
1131  u.r0c3.real*re11 - u.r0c3.imag*im11;
1132  imVec[ind00] =
1133  u.r0c0.imag*re00 + u.r0c0.real*im00 +
1134  u.r0c1.imag*re01 + u.r0c1.real*im01 +
1135  u.r0c2.imag*re10 + u.r0c2.real*im10 +
1136  u.r0c3.imag*re11 + u.r0c3.real*im11;
1137 
1138  reVec[ind01] =
1139  u.r1c0.real*re00 - u.r1c0.imag*im00 +
1140  u.r1c1.real*re01 - u.r1c1.imag*im01 +
1141  u.r1c2.real*re10 - u.r1c2.imag*im10 +
1142  u.r1c3.real*re11 - u.r1c3.imag*im11;
1143  imVec[ind01] =
1144  u.r1c0.imag*re00 + u.r1c0.real*im00 +
1145  u.r1c1.imag*re01 + u.r1c1.real*im01 +
1146  u.r1c2.imag*re10 + u.r1c2.real*im10 +
1147  u.r1c3.imag*re11 + u.r1c3.real*im11;
1148 
1149  reVec[ind10] =
1150  u.r2c0.real*re00 - u.r2c0.imag*im00 +
1151  u.r2c1.real*re01 - u.r2c1.imag*im01 +
1152  u.r2c2.real*re10 - u.r2c2.imag*im10 +
1153  u.r2c3.real*re11 - u.r2c3.imag*im11;
1154  imVec[ind10] =
1155  u.r2c0.imag*re00 + u.r2c0.real*im00 +
1156  u.r2c1.imag*re01 + u.r2c1.real*im01 +
1157  u.r2c2.imag*re10 + u.r2c2.real*im10 +
1158  u.r2c3.imag*re11 + u.r2c3.real*im11;
1159 
1160  reVec[ind11] =
1161  u.r3c0.real*re00 - u.r3c0.imag*im00 +
1162  u.r3c1.real*re01 - u.r3c1.imag*im01 +
1163  u.r3c2.real*re10 - u.r3c2.imag*im10 +
1164  u.r3c3.real*re11 - u.r3c3.imag*im11;
1165  imVec[ind11] =
1166  u.r3c0.imag*re00 + u.r3c0.real*im00 +
1167  u.r3c1.imag*re01 + u.r3c1.real*im01 +
1168  u.r3c2.imag*re10 + u.r3c2.real*im10 +
1169  u.r3c3.imag*re11 + u.r3c3.real*im11;
1170 }

References Qureg::deviceStateVec, flipBit(), insertTwoZeroBits(), Qureg::numAmpsPerChunk, and qreal.
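
A worked index example (illustrative arithmetic only) may clarify the bit manipulation above:

 // thisTask = 3 (0b11), q1 = 0, q2 = 2:
 //   ind00 = insertTwoZeroBits(0b11, 0, 2) = 0b1010
 //   ind01 = flipBit(ind00, q1)            = 0b1011
 //   ind10 = flipBit(ind00, q2)            = 0b1110
 //   ind11 = flipBit(ind01, q2)            = 0b1111
 // i.e. the thread owns the four amplitudes agreeing on all non-target bits.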

◆ statevec_multiControlledUnitary()

void statevec_multiControlledUnitary ( Qureg  qureg,
long long int  ctrlQubitsMask,
long long int  ctrlFlipMask,
int  targetQubit,
ComplexMatrix2  u 
)

Definition at line 1305 of file QuEST_gpu.cu.

1309  {
1310  int threadsPerCUDABlock = 128;
1311  int CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1312  statevec_multiControlledUnitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
1313  qureg, ctrlQubitsMask, ctrlFlipMask, targetQubit, argifyMatrix2(u));
1314 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by multiControlledUnitary(), multiStateControlledUnitary(), and statevec_multiControlledMultiRotatePauli().

◆ statevec_multiControlledUnitaryKernel()

__global__ void statevec_multiControlledUnitaryKernel ( Qureg  qureg,
long long int  ctrlQubitsMask,
long long int  ctrlFlipMask,
int  targetQubit,
ArgMatrix2  u 
)

fix – not necessary for GPU version

Definition at line 1245 of file QuEST_gpu.cu.

1249  {
1250  // ----- sizes
1251  long long int sizeBlock, // size of blocks
1252  sizeHalfBlock; // size of blocks halved
1253  // ----- indices
1254  long long int thisBlock, // current block
1255  indexUp,indexLo; // current index and corresponding index in lower half block
1256 
1257  // ----- temp variables
1258  qreal stateRealUp,stateRealLo, // storage for previous state values
1259  stateImagUp,stateImagLo; // (used in updates)
1260  // ----- temp variables
1261  long long int thisTask; // task-based approach to expose the loop with small granularity
1262  long long int numTasks=qureg.numAmpsPerChunk>>1;
1263 
1264 
1265  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1266  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1267 
1268  // ---------------------------------------------------------------- //
1269  // rotate //
1270  // ---------------------------------------------------------------- //
1271 
1273  qreal *stateVecReal = qureg.deviceStateVec.real;
1274  qreal *stateVecImag = qureg.deviceStateVec.imag;
1275 
1276  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1277  if (thisTask>=numTasks) return;
1278 
1279  thisBlock = thisTask / sizeHalfBlock;
1280  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1281  indexLo = indexUp + sizeHalfBlock;
1282 
1283  if (ctrlQubitsMask == (ctrlQubitsMask & (indexUp ^ ctrlFlipMask))) {
1284  // store current state vector values in temp variables
1285  stateRealUp = stateVecReal[indexUp];
1286  stateImagUp = stateVecImag[indexUp];
1287 
1288  stateRealLo = stateVecReal[indexLo];
1289  stateImagLo = stateVecImag[indexLo];
1290 
1291  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
1292  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
1293  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
1294  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
1295  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
1296 
1297  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
1298  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
1299  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
1300  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
1301  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
1302  }
1303 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.
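
The XOR against ctrlFlipMask lets qubits listed in that mask act as zero-controls. An illustrative reading of the condition at line 1283:

 // ctrlQubitsMask = 0b0110 (controls on qubits 1 and 2),
 // ctrlFlipMask   = 0b0010 (qubit 1 must be 0 rather than 1):
 //   indexUp = 0b0100: 0b0100 ^ 0b0010 = 0b0110 -> mask test passes -> applied
 //   indexUp = 0b0110: 0b0110 ^ 0b0010 = 0b0100 -> mask test fails  -> skipped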

◆ statevec_multiRotateZ()

void statevec_multiRotateZ ( Qureg  qureg,
long long int  mask,
qreal  angle 
)

Definition at line 1588 of file QuEST_gpu.cu.

1589 {
1590  qreal cosAngle = cos(angle/2.0);
1591  qreal sinAngle = sin(angle/2.0);
1592 
1593  int threadsPerCUDABlock, CUDABlocks;
1594  threadsPerCUDABlock = 128;
1595  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk)/threadsPerCUDABlock);
1596  statevec_multiRotateZKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, mask, cosAngle, sinAngle);
1597 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by multiRotateZ(), and statevec_multiRotatePauli().

◆ statevec_multiRotateZKernel()

__global__ void statevec_multiRotateZKernel ( Qureg  qureg,
long long int  mask,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1571 of file QuEST_gpu.cu.

1571  {
1572 
1573  long long int stateVecSize = qureg.numAmpsPerChunk;
1574  long long int index = blockIdx.x*blockDim.x + threadIdx.x;
1575  if (index>=stateVecSize) return;
1576 
1577  qreal *stateVecReal = qureg.deviceStateVec.real;
1578  qreal *stateVecImag = qureg.deviceStateVec.imag;
1579 
1580  int fac = getBitMaskParity(mask & index)? -1 : 1;
1581  qreal stateReal = stateVecReal[index];
1582  qreal stateImag = stateVecImag[index];
1583 
1584  stateVecReal[index] = cosAngle*stateReal + fac * sinAngle*stateImag;
1585  stateVecImag[index] = - fac * sinAngle*stateReal + cosAngle*stateImag;
1586 }

References Qureg::deviceStateVec, getBitMaskParity(), Qureg::numAmpsPerChunk, and qreal.
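
Reading off the arithmetic above (an interpretation, not a claim from the source), each amplitude is scaled by a parity-dependent phase:

 // amp[index] *= cos(angle/2) - i*fac*sin(angle/2) = exp(-i*fac*angle/2),
 // where fac = +1 when an even number of masked qubits are 1 in |index>
 // and fac = -1 otherwise: the diagonal of exp(-i*(angle/2) * Z⊗...⊗Z).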

◆ statevec_pauliX()

void statevec_pauliX ( Qureg  qureg,
int  targetQubit 
)

Definition at line 1360 of file QuEST_gpu.cu.

1361 {
1362  int threadsPerCUDABlock, CUDABlocks;
1363  threadsPerCUDABlock = 128;
1364  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1365  statevec_pauliXKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit);
1366 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by pauliX(), and statevec_applyPauliProd().

◆ statevec_pauliXKernel()

__global__ void statevec_pauliXKernel ( Qureg  qureg,
int  targetQubit 
)

fix – not necessary for GPU version

Definition at line 1316 of file QuEST_gpu.cu.

1316  {
1317  // ----- sizes
1318  long long int sizeBlock, // size of blocks
1319  sizeHalfBlock; // size of blocks halved
1320  // ----- indices
1321  long long int thisBlock, // current block
1322  indexUp,indexLo; // current index and corresponding index in lower half block
1323 
1324  // ----- temp variables
1325  qreal stateRealUp, // storage for previous state values
1326  stateImagUp; // (used in updates)
1327  // ----- temp variables
1328  long long int thisTask; // task-based approach to expose the loop with small granularity
1329  long long int numTasks=qureg.numAmpsPerChunk>>1;
1330 
1331  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
1332  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
1333 
1334  // ---------------------------------------------------------------- //
1335  // rotate //
1336  // ---------------------------------------------------------------- //
1337 
1339  qreal *stateVecReal = qureg.deviceStateVec.real;
1340  qreal *stateVecImag = qureg.deviceStateVec.imag;
1341 
1342  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1343  if (thisTask>=numTasks) return;
1344 
1345  thisBlock = thisTask / sizeHalfBlock;
1346  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1347  indexLo = indexUp + sizeHalfBlock;
1348 
1349  // store current state vector values in temp variables
1350  stateRealUp = stateVecReal[indexUp];
1351  stateImagUp = stateVecImag[indexUp];
1352 
1353  stateVecReal[indexUp] = stateVecReal[indexLo];
1354  stateVecImag[indexUp] = stateVecImag[indexLo];
1355 
1356  stateVecReal[indexLo] = stateRealUp;
1357  stateVecImag[indexLo] = stateImagUp;
1358 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.
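
A worked example of the block indexing (illustrative):

 // targetQubit = 1, so sizeHalfBlock = 2 and sizeBlock = 4. For thisTask = 3:
 //   thisBlock = 3 / 2       = 1
 //   indexUp   = 1*4 + 3%2   = 5 = 0b101   (target bit is 0)
 //   indexLo   = 5 + 2       = 7 = 0b111   (target bit is 1)
 // so each thread swaps a pair of amplitudes differing only in the target bit.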

◆ statevec_pauliY()

void statevec_pauliY ( Qureg  qureg,
int  targetQubit 
)

Definition at line 1393 of file QuEST_gpu.cu.

1394 {
1395  int threadsPerCUDABlock, CUDABlocks;
1396  threadsPerCUDABlock = 128;
1397  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1398  statevec_pauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, 1);
1399 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by pauliY(), and statevec_applyPauliProd().

◆ statevec_pauliYConj()

void statevec_pauliYConj ( Qureg  qureg,
int  targetQubit 
)

Definition at line 1401 of file QuEST_gpu.cu.

1402 {
1403  int threadsPerCUDABlock, CUDABlocks;
1404  threadsPerCUDABlock = 128;
1405  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1406  statevec_pauliYKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, -1);
1407 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by pauliY().

◆ statevec_pauliYKernel()

__global__ void statevec_pauliYKernel ( Qureg  qureg,
int  targetQubit,
int  conjFac 
)

Definition at line 1368 of file QuEST_gpu.cu.

1368  {
1369 
1370  long long int sizeHalfBlock = 1LL << targetQubit;
1371  long long int sizeBlock = 2LL * sizeHalfBlock;
1372  long long int numTasks = qureg.numAmpsPerChunk >> 1;
1373  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1374  if (thisTask>=numTasks) return;
1375 
1376  long long int thisBlock = thisTask / sizeHalfBlock;
1377  long long int indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1378  long long int indexLo = indexUp + sizeHalfBlock;
1379  qreal stateRealUp, stateImagUp;
1380 
1381  qreal *stateVecReal = qureg.deviceStateVec.real;
1382  qreal *stateVecImag = qureg.deviceStateVec.imag;
1383  stateRealUp = stateVecReal[indexUp];
1384  stateImagUp = stateVecImag[indexUp];
1385 
1386  // update under +-{{0, -i}, {i, 0}}
1387  stateVecReal[indexUp] = conjFac * stateVecImag[indexLo];
1388  stateVecImag[indexUp] = conjFac * -stateVecReal[indexLo];
1389  stateVecReal[indexLo] = conjFac * -stateImagUp;
1390  stateVecImag[indexLo] = conjFac * stateRealUp;
1391 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_phaseShiftByTerm()

void statevec_phaseShiftByTerm ( Qureg  qureg,
int  targetQubit,
Complex  term 
)

Definition at line 1491 of file QuEST_gpu.cu.

1492 {
1493  qreal cosAngle = term.real;
1494  qreal sinAngle = term.imag;
1495 
1496  int threadsPerCUDABlock, CUDABlocks;
1497  threadsPerCUDABlock = 128;
1498  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
1499  statevec_phaseShiftByTermKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, cosAngle, sinAngle);
1500 }

References Complex::imag, Qureg::numAmpsPerChunk, qreal, and Complex::real.

Referenced by statevec_pauliZ(), statevec_phaseShift(), statevec_sGate(), statevec_sGateConj(), statevec_tGate(), and statevec_tGateConj().
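
The callers listed above pass term = exp(i*theta) as a Complex, whose parts become cosAngle and sinAngle in the kernel. A hedged usage sketch (assuming an existing qureg; a T-gate corresponds to theta = pi/4):

 Complex term;
 term.real = cos(M_PI/4);   // cosAngle
 term.imag = sin(M_PI/4);   // sinAngle
 statevec_phaseShiftByTerm(qureg, targetQubit, term);   // applies a T-gate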

◆ statevec_phaseShiftByTermKernel()

__global__ void statevec_phaseShiftByTermKernel ( Qureg  qureg,
int  targetQubit,
qreal  cosAngle,
qreal  sinAngle 
)

Definition at line 1463 of file QuEST_gpu.cu.

1463  {
1464 
1465  long long int sizeBlock, sizeHalfBlock;
1466  long long int thisBlock, indexUp,indexLo;
1467 
1468  qreal stateRealLo, stateImagLo;
1469  long long int thisTask;
1470  long long int numTasks = qureg.numAmpsPerChunk >> 1;
1471 
1472  sizeHalfBlock = 1LL << targetQubit;
1473  sizeBlock = 2LL * sizeHalfBlock;
1474 
1475  qreal *stateVecReal = qureg.deviceStateVec.real;
1476  qreal *stateVecImag = qureg.deviceStateVec.imag;
1477 
1478  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1479  if (thisTask>=numTasks) return;
1480  thisBlock = thisTask / sizeHalfBlock;
1481  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
1482  indexLo = indexUp + sizeHalfBlock;
1483 
1484  stateRealLo = stateVecReal[indexLo];
1485  stateImagLo = stateVecImag[indexLo];
1486 
1487  stateVecReal[indexLo] = cosAngle*stateRealLo - sinAngle*stateImagLo;
1488  stateVecImag[indexLo] = sinAngle*stateRealLo + cosAngle*stateImagLo;
1489 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ statevec_reportStateToScreen()

void statevec_reportStateToScreen ( Qureg  qureg,
QuESTEnv  env,
int  reportRank 
)

Print the current state vector of probability amplitudes for a set of qubits to standard output.

For debugging purposes. Each rank prints its output serially. Output is only printed for systems of at most 5 qubits.

Definition at line 543 of file QuEST_gpu.cu.

543  {
544  long long int index;
545  int rank;
546  copyStateFromGPU(qureg);
547  if (qureg.numQubitsInStateVec<=5){
548  for (rank=0; rank<qureg.numChunks; rank++){
549  if (qureg.chunkId==rank){
550  if (reportRank) {
551  printf("Reporting state from rank %d [\n", qureg.chunkId);
552  //printf("\trank, index, real, imag\n");
553  printf("real, imag\n");
554  } else if (rank==0) {
555  printf("Reporting state [\n");
556  printf("real, imag\n");
557  }
558 
559  for(index=0; index<qureg.numAmpsPerChunk; index++){
560  printf(REAL_STRING_FORMAT ", " REAL_STRING_FORMAT "\n", qureg.stateVec.real[index], qureg.stateVec.imag[index]);
561  }
562  if (reportRank || rank==qureg.numChunks-1) printf("]\n");
563  }
564  syncQuESTEnv(env);
565  }
566  }
567 }

References Qureg::chunkId, copyStateFromGPU(), Qureg::numAmpsPerChunk, Qureg::numChunks, Qureg::numQubitsInStateVec, Qureg::stateVec, and syncQuESTEnv().

Referenced by reportStateToScreen().

◆ statevec_setAmps()

void statevec_setAmps ( Qureg  qureg,
long long int  startInd,
qreal reals,
qreal imags,
long long int  numAmps 
)

Definition at line 153 of file QuEST_gpu.cu.

153  {
154 
155  cudaDeviceSynchronize();
156  cudaMemcpy(
157  qureg.deviceStateVec.real + startInd,
158  reals,
159  numAmps * sizeof(*(qureg.deviceStateVec.real)),
160  cudaMemcpyHostToDevice);
161  cudaMemcpy(
162  qureg.deviceStateVec.imag + startInd,
163  imags,
164  numAmps * sizeof(*(qureg.deviceStateVec.imag)),
165  cudaMemcpyHostToDevice);
166 }

References Qureg::deviceStateVec.

Referenced by initStateFromAmps(), setAmps(), and setDensityAmps().
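
A minimal usage sketch (assuming a 2-qubit qureg; qreal resolves to float or double per QuEST_precision.h):

 // Overwrite all four amplitudes with |00>; reals/imags are host pointers
 // copied straight to the device, with no validation performed here.
 qreal re[] = {1, 0, 0, 0};
 qreal im[] = {0, 0, 0, 0};
 statevec_setAmps(qureg, 0, re, im, 4);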

◆ statevec_setWeightedQureg()

void statevec_setWeightedQureg ( Complex  fac1,
Qureg  qureg1,
Complex  fac2,
Qureg  qureg2,
Complex  facOut,
Qureg  out 
)

Definition at line 3175 of file QuEST_gpu.cu.

3175  {
3176 
3177  long long int numAmpsToVisit = qureg1.numAmpsPerChunk;
3178 
3179  int threadsPerCUDABlock, CUDABlocks;
3180  threadsPerCUDABlock = 128;
3181  CUDABlocks = ceil(numAmpsToVisit / (qreal) threadsPerCUDABlock);
3182  statevec_setWeightedQuregKernel<<<CUDABlocks, threadsPerCUDABlock>>>(
3183  fac1, qureg1, fac2, qureg2, facOut, out
3184  );
3185 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by setWeightedQureg(), and statevec_applyPauliSum().
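
The kernel (next entry) computes out = facOut*out + fac1*qureg1 + fac2*qureg2 elementwise. A hedged usage sketch forming an unnormalised average of two registers:

 // out = 0.5*qureg1 + 0.5*qureg2, discarding out's prior contents (facOut = 0).
 Complex half = {0.5, 0};
 Complex zero = {0,   0};
 statevec_setWeightedQureg(half, qureg1, half, qureg2, zero, out);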

◆ statevec_setWeightedQuregKernel()

__global__ void statevec_setWeightedQuregKernel ( Complex  fac1,
Qureg  qureg1,
Complex  fac2,
Qureg  qureg2,
Complex  facOut,
Qureg  out 
)

Definition at line 3143 of file QuEST_gpu.cu.

3143  {
3144 
3145  long long int ampInd = blockIdx.x*blockDim.x + threadIdx.x;
3146  long long int numAmpsToVisit = qureg1.numAmpsPerChunk;
3147  if (ampInd >= numAmpsToVisit) return;
3148 
3149  qreal *vecRe1 = qureg1.deviceStateVec.real;
3150  qreal *vecIm1 = qureg1.deviceStateVec.imag;
3151  qreal *vecRe2 = qureg2.deviceStateVec.real;
3152  qreal *vecIm2 = qureg2.deviceStateVec.imag;
3153  qreal *vecReOut = out.deviceStateVec.real;
3154  qreal *vecImOut = out.deviceStateVec.imag;
3155 
3156  qreal facRe1 = fac1.real;
3157  qreal facIm1 = fac1.imag;
3158  qreal facRe2 = fac2.real;
3159  qreal facIm2 = fac2.imag;
3160  qreal facReOut = facOut.real;
3161  qreal facImOut = facOut.imag;
3162 
3163  qreal re1,im1, re2,im2, reOut,imOut;
3164  long long int index = ampInd;
3165 
3166  re1 = vecRe1[index]; im1 = vecIm1[index];
3167  re2 = vecRe2[index]; im2 = vecIm2[index];
3168  reOut = vecReOut[index];
3169  imOut = vecImOut[index];
3170 
3171  vecReOut[index] = (facReOut*reOut - facImOut*imOut) + (facRe1*re1 - facIm1*im1) + (facRe2*re2 - facIm2*im2);
3172  vecImOut[index] = (facReOut*imOut + facImOut*reOut) + (facRe1*im1 + facIm1*re1) + (facRe2*im2 + facIm2*re2);
3173 }

References Qureg::deviceStateVec, Complex::imag, Qureg::numAmpsPerChunk, qreal, and Complex::real.

◆ statevec_swapQubitAmps()

void statevec_swapQubitAmps ( Qureg  qureg,
int  qb1,
int  qb2 
)

Definition at line 1769 of file QuEST_gpu.cu.

1770 {
1771  int threadsPerCUDABlock, CUDABlocks;
1772  threadsPerCUDABlock = 128;
1773  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>2)/threadsPerCUDABlock);
1774  statevec_swapQubitAmpsKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, qb1, qb2);
1775 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by agnostic_applyQFT(), and swapGate().

◆ statevec_swapQubitAmpsKernel()

__global__ void statevec_swapQubitAmpsKernel ( Qureg  qureg,
int  qb1,
int  qb2 
)

Definition at line 1743 of file QuEST_gpu.cu.

1743  {
1744 
1745  qreal *reVec = qureg.deviceStateVec.real;
1746  qreal *imVec = qureg.deviceStateVec.imag;
1747 
1748  long long int numTasks = qureg.numAmpsPerChunk >> 2; // each iteration updates 2 amps and skips 2 amps
1749  long long int thisTask = blockIdx.x*blockDim.x + threadIdx.x;
1750  if (thisTask>=numTasks) return;
1751 
1752  long long int ind00, ind01, ind10;
1753  qreal re01, re10, im01, im10;
1754 
1755  // determine the indices of |..0..0..>, |..0..1..> and |..1..0..>
1756  ind00 = insertTwoZeroBits(thisTask, qb1, qb2);
1757  ind01 = flipBit(ind00, qb1);
1758  ind10 = flipBit(ind00, qb2);
1759 
1760  // extract statevec amplitudes
1761  re01 = reVec[ind01]; im01 = imVec[ind01];
1762  re10 = reVec[ind10]; im10 = imVec[ind10];
1763 
1764  // swap 01 and 10 amps
1765  reVec[ind01] = re10; reVec[ind10] = re01;
1766  imVec[ind01] = im10; imVec[ind10] = im01;
1767 }

References Qureg::deviceStateVec, flipBit(), insertTwoZeroBits(), Qureg::numAmpsPerChunk, and qreal.
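
A worked index example (illustrative arithmetic only):

 // qb1 = 0, qb2 = 2, thisTask = 1 (0b1):
 //   ind00 = insertTwoZeroBits(0b1, 0, 2) = 0b0010
 //   ind01 = flipBit(ind00, qb1)          = 0b0011   (qb1=1, qb2=0)
 //   ind10 = flipBit(ind00, qb2)          = 0b0110   (qb1=0, qb2=1)
 // The amplitudes at 0b0011 and 0b0110 are exchanged; |..0..0..> and
 // |..1..1..> amplitudes are fixed points of the swap and are never visited.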

◆ statevec_unitary()

void statevec_unitary ( Qureg  qureg,
int  targetQubit,
ComplexMatrix2  u 
)

Definition at line 972 of file QuEST_gpu.cu.

973 {
974  int threadsPerCUDABlock, CUDABlocks;
975  threadsPerCUDABlock = 128;
976  CUDABlocks = ceil((qreal)(qureg.numAmpsPerChunk>>1)/threadsPerCUDABlock);
977  statevec_unitaryKernel<<<CUDABlocks, threadsPerCUDABlock>>>(qureg, targetQubit, argifyMatrix2(u));
978 }

References Qureg::numAmpsPerChunk, and qreal.

Referenced by applyMatrix2(), and unitary().

◆ statevec_unitaryKernel()

__global__ void statevec_unitaryKernel ( Qureg  qureg,
int  targetQubit,
ArgMatrix2  u 
)

fix – not necessary for GPU version

Definition at line 919 of file QuEST_gpu.cu.

919  {
920  // ----- sizes
921  long long int sizeBlock, // size of blocks
922  sizeHalfBlock; // size of blocks halved
923  // ----- indices
924  long long int thisBlock, // current block
925  indexUp,indexLo; // current index and corresponding index in lower half block
926 
927  // ----- temp variables
928  qreal stateRealUp,stateRealLo, // storage for previous state values
929  stateImagUp,stateImagLo; // (used in updates)
930  // ----- temp variables
931  long long int thisTask; // task-based approach to expose the loop with small granularity
932  long long int numTasks=qureg.numAmpsPerChunk>>1;
933 
934  sizeHalfBlock = 1LL << targetQubit; // size of blocks halved
935  sizeBlock = 2LL * sizeHalfBlock; // size of blocks
936 
937  // ---------------------------------------------------------------- //
938  // rotate //
939  // ---------------------------------------------------------------- //
940 
942  qreal *stateVecReal = qureg.deviceStateVec.real;
943  qreal *stateVecImag = qureg.deviceStateVec.imag;
944 
945  thisTask = blockIdx.x*blockDim.x + threadIdx.x;
946  if (thisTask>=numTasks) return;
947 
948  thisBlock = thisTask / sizeHalfBlock;
949  indexUp = thisBlock*sizeBlock + thisTask%sizeHalfBlock;
950  indexLo = indexUp + sizeHalfBlock;
951 
952  // store current state vector values in temp variables
953  stateRealUp = stateVecReal[indexUp];
954  stateImagUp = stateVecImag[indexUp];
955 
956  stateRealLo = stateVecReal[indexLo];
957  stateImagLo = stateVecImag[indexLo];
958 
959  // state[indexUp] = u00 * state[indexUp] + u01 * state[indexLo]
960  stateVecReal[indexUp] = u.r0c0.real*stateRealUp - u.r0c0.imag*stateImagUp
961  + u.r0c1.real*stateRealLo - u.r0c1.imag*stateImagLo;
962  stateVecImag[indexUp] = u.r0c0.real*stateImagUp + u.r0c0.imag*stateRealUp
963  + u.r0c1.real*stateImagLo + u.r0c1.imag*stateRealLo;
964 
965  // state[indexLo] = u10 * state[indexUp] + u11 * state[indexLo]
966  stateVecReal[indexLo] = u.r1c0.real*stateRealUp - u.r1c0.imag*stateImagUp
967  + u.r1c1.real*stateRealLo - u.r1c1.imag*stateImagLo;
968  stateVecImag[indexLo] = u.r1c0.real*stateImagUp + u.r1c0.imag*stateRealUp
969  + u.r1c1.real*stateImagLo + u.r1c1.imag*stateRealLo;
970 }

References Qureg::deviceStateVec, Qureg::numAmpsPerChunk, and qreal.

◆ swapDouble()

void swapDouble ( qreal **  a,
qreal **  b 
)

Definition at line 2057 of file QuEST_gpu.cu.